// pselinv_impl.hpp
/*
   Copyright (c) 2012 The Regents of the University of California,
   through Lawrence Berkeley National Laboratory.

   Authors: Lin Lin and Mathias Jacquelin

   This file is part of PEXSI. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:

   (1) Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
   (2) Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
   (3) Neither the name of the University of California, Lawrence Berkeley
   National Laboratory, U.S. Dept. of Energy nor the names of its contributors may
   be used to endorse or promote products derived from this software without
   specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
   ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
   ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
   ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   You are under no obligation whatsoever to provide any bug fixes, patches, or
   upgrades to the features, functionality or performance of the source code
   ("Enhancements") to anyone; however, if you choose to make your Enhancements
   available either publicly, or directly to Lawrence Berkeley National
   Laboratory, without imposing a separate written license agreement for such
   Enhancements, then you hereby grant the following license: a non-exclusive,
   royalty-free perpetual license to install, use, modify, prepare derivative
   works, incorporate into other computer software, distribute, and sublicense
   such enhancements or derivative works thereof, in binary and source code form.
 */
#ifndef _PEXSI_PSELINV_IMPL_HPP_
#define _PEXSI_PSELINV_IMPL_HPP_

#include <list>
#include <limits>

#include "pexsi/timer.h"


#define MPI_MAX_COMM (1024)
#define BCAST_THRESHOLD 16

#define MOD(a,b) \
  ( ((a)%(b)+(b))%(b))
#ifdef COMMUNICATOR_PROFILE
namespace PEXSI{
  class CommunicatorStat{
    public:
      typedef std::map<std::vector<bool>, std::vector<Int> > bitMaskSet;
      //Communicators for the Bcast variant
      NumVec<Int> countSendToBelow_;
      bitMaskSet maskSendToBelow_;

      NumVec<Int> countRecvFromBelow_;
      bitMaskSet maskRecvFromBelow_;

      NumVec<Int> countSendToRight_;
      bitMaskSet maskSendToRight_;
  };
}
#endif


namespace PEXSI{
  inline GridType::GridType ( MPI_Comm Bcomm, int nprow, int npcol )
  {
    Int info;
    MPI_Initialized( &info );
    if( !info ){
      ErrorHandling( "MPI has not been initialized." );
    }
    MPI_Group comm_group;
    MPI_Comm_group( Bcomm, &comm_group );
    MPI_Comm_create( Bcomm, comm_group, &comm );
    // comm = Bcomm;

    MPI_Comm_rank( comm, &mpirank );
    MPI_Comm_size( comm, &mpisize );
    if( mpisize != nprow * npcol ){
      ErrorHandling( "mpisize != nprow * npcol." );
    }


    numProcRow = nprow;
    numProcCol = npcol;
#ifdef SWAP_ROWS_COLS
    Int myrow = mpirank % npcol;
    Int mycol = mpirank / npcol;
#else
    Int myrow = mpirank / npcol;
    Int mycol = mpirank % npcol;
#endif

    MPI_Comm_split( comm, myrow, mycol, &rowComm );
    MPI_Comm_split( comm, mycol, myrow, &colComm );

    MPI_Group_free( &comm_group );


    return ;
  } 		// -----  end of method GridType::GridType  -----


  inline GridType::~GridType ( )
  {
    // comm, rowComm and colComm were all created in GridType(), so they
    // are freed here. The caller's Bcomm is left untouched.

    MPI_Comm_free( &rowComm );
    MPI_Comm_free( &colComm );
    MPI_Comm_free( &comm );

    return ;
  } 		// -----  end of method GridType::~GridType  -----
}
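// Usage sketch (illustrative only, assuming MPI has been initialized and the
// caller manages the lifetime of the GridType object):
//
//   MPI_Comm world = MPI_COMM_WORLD;
//   int mpisize;  MPI_Comm_size( world, &mpisize );
//   int nprow = 2, npcol = mpisize / 2;    // any factorization with
//   GridType grid( world, nprow, npcol );  // nprow * npcol == mpisize
//
// With the default (no SWAP_ROWS_COLS) mapping, rank r sits at processor-grid
// coordinate (r / npcol, r % npcol); rowComm and colComm connect the
// processes that share a grid row and a grid column, respectively.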


namespace PEXSI{
  template<typename T>
    void PMatrix<T>::deallocate(){

      grid_ = NULL;
      super_ = NULL;
      options_ = NULL;
      optionsFact_ = NULL;

      ColBlockIdx_.clear();
      RowBlockIdx_.clear();
      U_.clear();
      L_.clear();
      workingSet_.clear();

      // Communication variables
      isSendToBelow_.Clear();
      isSendToRight_.Clear();
      isSendToDiagonal_.Clear();
      isSendToCrossDiagonal_.Clear();

      isRecvFromBelow_.Clear();
      isRecvFromAbove_.Clear();
      isRecvFromLeft_.Clear();
      isRecvFromCrossDiagonal_.Clear();


      //Cleanup tree information
#ifdef NEW_BCAST
      for(int i = 0; i < fwdToBelowTree2_.size(); ++i){
        if(fwdToBelowTree2_[i] != NULL){
          delete fwdToBelowTree2_[i];
          fwdToBelowTree2_[i] = NULL;
        }
      }
      for(int i = 0; i < fwdToRightTree2_.size(); ++i){
        if(fwdToRightTree2_[i] != NULL){
          delete fwdToRightTree2_[i];
          fwdToRightTree2_[i] = NULL;
        }
      }
#else
      for(int i = 0; i < fwdToBelowTree_.size(); ++i){
        if(fwdToBelowTree_[i] != NULL){
          delete fwdToBelowTree_[i];
          fwdToBelowTree_[i] = NULL;
        }
      }

      for(int i = 0; i < fwdToRightTree_.size(); ++i){
        if(fwdToRightTree_[i] != NULL){
          delete fwdToRightTree_[i];
          fwdToRightTree_[i] = NULL;
        }
      }
#endif

      for(int i = 0; i < redToLeftTree_.size(); ++i){
        if(redToLeftTree_[i] != NULL){
          delete redToLeftTree_[i];
          redToLeftTree_[i] = NULL;
        }
      }
      for(int i = 0; i < redToAboveTree_.size(); ++i){
        if(redToAboveTree_[i] != NULL){
          delete redToAboveTree_[i];
          redToAboveTree_[i] = NULL;
        }
      }

      //dump comm_profile info
#if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST)
      if(commOFS.is_open()){
        commOFS.close();
      }
#endif


    }

  template<typename T>
    PMatrix<T> & PMatrix<T>::operator = ( const PMatrix<T> & C){
      if(&C != this){
        //If we have some memory allocated, delete it
        deallocate();

        grid_ = C.grid_;
        super_ = C.super_;
        options_ = C.options_;
        optionsFact_ = C.optionsFact_;


        maxTag_ = C.maxTag_;
        limIndex_ = C.limIndex_;

        ColBlockIdx_ = C.ColBlockIdx_;
        RowBlockIdx_ = C.RowBlockIdx_;
        L_ = C.L_;
        U_ = C.U_;

        workingSet_ = C.workingSet_;


        // Communication variables
        isSendToBelow_ = C.isSendToBelow_;
        isSendToRight_ = C.isSendToRight_;
        isSendToDiagonal_ = C.isSendToDiagonal_;
        isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;

        isRecvFromBelow_ = C.isRecvFromBelow_;
        isRecvFromAbove_ = C.isRecvFromAbove_;
        isRecvFromLeft_ = C.isRecvFromLeft_;
        isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;

#ifdef NEW_BCAST
        fwdToBelowTree2_.resize(C.fwdToBelowTree2_.size());
        for(int i = 0; i < C.fwdToBelowTree2_.size(); ++i){
          if(C.fwdToBelowTree2_[i] != NULL){
            fwdToBelowTree2_[i] = C.fwdToBelowTree2_[i]->clone();
          }
        }
        fwdToRightTree2_.resize(C.fwdToRightTree2_.size());
        for(int i = 0; i < C.fwdToRightTree2_.size(); ++i){
          if(C.fwdToRightTree2_[i] != NULL){
            fwdToRightTree2_[i] = C.fwdToRightTree2_[i]->clone();
          }
        }
#else
        fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
        for(int i = 0; i < C.fwdToBelowTree_.size(); ++i){
          if(C.fwdToBelowTree_[i] != NULL){
            fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
          }
        }
        fwdToRightTree_.resize(C.fwdToRightTree_.size());
        for(int i = 0; i < C.fwdToRightTree_.size(); ++i){
          if(C.fwdToRightTree_[i] != NULL){
            fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
          }
        }
#endif

        redToLeftTree_.resize(C.redToLeftTree_.size());
        for(int i = 0; i < C.redToLeftTree_.size(); ++i){
          if(C.redToLeftTree_[i] != NULL){
            redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
          }
        }
        redToAboveTree_.resize(C.redToAboveTree_.size());
        for(int i = 0; i < C.redToAboveTree_.size(); ++i){
          if(C.redToAboveTree_[i] != NULL){
            redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
          }
        }

      }

      return *this;
    }

  template<typename T>
    PMatrix<T>::PMatrix( const PMatrix<T> & C ):PMatrix(){
      //If we have some memory allocated, delete it
      deallocate();

      grid_ = C.grid_;
      super_ = C.super_;
      options_ = C.options_;
      optionsFact_ = C.optionsFact_;


      limIndex_ = C.limIndex_;
      maxTag_ = C.maxTag_;
      ColBlockIdx_ = C.ColBlockIdx_;
      RowBlockIdx_ = C.RowBlockIdx_;
      L_ = C.L_;
      U_ = C.U_;

      workingSet_ = C.workingSet_;


      // Communication variables
      isSendToBelow_ = C.isSendToBelow_;
      isSendToRight_ = C.isSendToRight_;
      isSendToDiagonal_ = C.isSendToDiagonal_;
      isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;

      isRecvFromBelow_ = C.isRecvFromBelow_;
      isRecvFromAbove_ = C.isRecvFromAbove_;
      isRecvFromLeft_ = C.isRecvFromLeft_;
      isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;



#ifdef NEW_BCAST
      fwdToBelowTree2_.resize(C.fwdToBelowTree2_.size());
      for(int i = 0; i < C.fwdToBelowTree2_.size(); ++i){
        if(C.fwdToBelowTree2_[i] != NULL){
          fwdToBelowTree2_[i] = C.fwdToBelowTree2_[i]->clone();
        }
      }
      fwdToRightTree2_.resize(C.fwdToRightTree2_.size());
      for(int i = 0; i < C.fwdToRightTree2_.size(); ++i){
        if(C.fwdToRightTree2_[i] != NULL){
          fwdToRightTree2_[i] = C.fwdToRightTree2_[i]->clone();
        }
      }
#else
      fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
      for(int i = 0; i < C.fwdToBelowTree_.size(); ++i){
        if(C.fwdToBelowTree_[i] != NULL){
          fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
        }
      }
      fwdToRightTree_.resize(C.fwdToRightTree_.size());
      for(int i = 0; i < C.fwdToRightTree_.size(); ++i){
        if(C.fwdToRightTree_[i] != NULL){
          fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
        }
      }
#endif

      redToLeftTree_.resize(C.redToLeftTree_.size());
      for(int i = 0; i < C.redToLeftTree_.size(); ++i){
        if(C.redToLeftTree_[i] != NULL){
          redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
        }
      }
      redToAboveTree_.resize(C.redToAboveTree_.size());
      for(int i = 0; i < C.redToAboveTree_.size(); ++i){
        if(C.redToAboveTree_[i] != NULL){
          redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
        }
      }

    }


  template<typename T>
    PMatrix<T>::PMatrix ( ){
    }

  template<typename T>
    PMatrix<T>::PMatrix (
        const GridType* g,
        const SuperNodeType* s,
        const PEXSI::PSelInvOptions * o,
        const PEXSI::FactorizationOptions * oFact
        ):PMatrix()
    {

      this->Setup( g, s, o, oFact );


      return ;
    } 		// -----  end of method PMatrix::PMatrix  -----

  template<typename T>
    PMatrix<T>::~PMatrix ( )
    {


      deallocate();

      return ;
    } 		// -----  end of method PMatrix::~PMatrix  -----

  template<typename T>
    void PMatrix<T>::Setup(
        const GridType* g,
        const SuperNodeType* s,
        const PEXSI::PSelInvOptions * o,
        const PEXSI::FactorizationOptions * oFact
        )
    {

      grid_ = g;
      super_ = s;
      options_ = o;
      optionsFact_ = oFact;

      //    if( grid_->numProcRow != grid_->numProcCol ){
      //      ErrorHandling( "The current version of SelInv only works for square processor grids." ); }

      L_.clear();
      U_.clear();
      ColBlockIdx_.clear();
      RowBlockIdx_.clear();

      L_.resize( this->NumLocalBlockCol() );
      U_.resize( this->NumLocalBlockRow() );

      ColBlockIdx_.resize( this->NumLocalBlockCol() );
      RowBlockIdx_.resize( this->NumLocalBlockRow() );

      //workingSet_.resize(this->NumSuper());
#if ( _DEBUGlevel_ >= 1 )
      statusOFS << std::endl << "PMatrix is constructed. The grid information: " << std::endl;
      statusOFS << "mpirank = " << MYPROC(grid_) << std::endl;
      statusOFS << "myrow   = " << MYROW(grid_) << std::endl;
      statusOFS << "mycol   = " << MYCOL(grid_) << std::endl;
#endif

      //TODO enable this
      //Get maxTag value and compute the max depth we can use
      //int * pmaxTag;
      int pmaxTag = 4000;   // fallback when the MPI_TAG_UB attribute is unavailable

      void *v = NULL;
      int flag;
      MPI_Comm_get_attr(grid_->comm, MPI_TAG_UB, &v, &flag);
      if (flag) {
        pmaxTag = *(int*)v;
      }


      maxTag_ = pmaxTag;
      limIndex_ = (Int)maxTag_/(Int)SELINV_TAG_COUNT - 1;

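      // Tag budget (illustrative numbers, not from the source): each supernode
      // "round" consumes SELINV_TAG_COUNT distinct MPI tags, so limIndex_ is
      // the number of rounds whose tags fit below maxTag_ before IDX_TO_TAG
      // has to wrap. E.g. with maxTag_ = 32767 and SELINV_TAG_COUNT = 8 this
      // would give limIndex_ = 4094.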

#if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST)
      std::stringstream ss3;
      ss3 << "comm_stat" << MYPROC(this->grid_);
      if(!commOFS.is_open()){
        commOFS.open( ss3.str().c_str());
      }
#endif



      return ;
    } 		// -----  end of method PMatrix::Setup  -----


  template<typename T>
    inline void PMatrix<T>::SelInv_lookup_indexes(
        SuperNodeBufferType & snode,
        std::vector<LBlock<T> > & LcolRecv,
        std::vector<UBlock<T> > & UrowRecv,
        NumMat<T> & AinvBuf,
        NumMat<T> & UBuf )
    {
      TIMER_START(Compute_Sinv_LT_Lookup_Indexes);

      TIMER_START(Build_colptr_rowptr);
      // rowPtr[ib] gives the row index in snode.LUpdateBuf for the first
      // nonzero row in LcolRecv[ib]. The total number of rows in
      // snode.LUpdateBuf is given by rowPtr.back().
      std::vector<Int> rowPtr(LcolRecv.size() + 1);
      // colPtr[jb] gives the column index in UBuf for the first
      // nonzero column in UrowRecv[jb]. The total number of columns in
      // UBuf is given by colPtr.back().
      std::vector<Int> colPtr(UrowRecv.size() + 1);

      rowPtr[0] = 0;
      for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
        rowPtr[ib+1] = rowPtr[ib] + LcolRecv[ib].numRow;
      }
      colPtr[0] = 0;
      for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
        colPtr[jb+1] = colPtr[jb] + UrowRecv[jb].numCol;
      }
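      // Example (illustrative): if LcolRecv holds blocks with numRow = {2, 3},
      // then rowPtr = {0, 2, 5}: block ib starts at row rowPtr[ib] of
      // snode.LUpdateBuf / AinvBuf, which has rowPtr.back() = 5 rows in total.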

      Int numRowAinvBuf = *rowPtr.rbegin();
      Int numColAinvBuf = *colPtr.rbegin();
      TIMER_STOP(Build_colptr_rowptr);

      TIMER_START(Allocate_lookup);
      // Allocate for the computational storage
      AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
      UBuf.Resize( SuperSize( snode.Index, super_ ), numColAinvBuf );
      //    TIMER_START(SetValue_lookup);
      //    SetValue( AinvBuf, ZERO<T>() );
      //    SetValue( snode.LUpdateBuf, ZERO<T>() );
      //    SetValue( UBuf, ZERO<T>() );
      //    TIMER_STOP(SetValue_lookup);
      TIMER_STOP(Allocate_lookup);

      TIMER_START(Fill_UBuf);
      // Fill UBuf first. Make the transpose later in the Gemm phase.
      for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
        UBlock<T>& UB = UrowRecv[jb];
        if( UB.numRow != SuperSize(snode.Index, super_) ){
          ErrorHandling( "The size of UB is not right. Something is seriously wrong." );
        }
        lapack::Lacpy( 'A', UB.numRow, UB.numCol, UB.nzval.Data(),
            UB.numRow, UBuf.VecData( colPtr[jb] ), SuperSize( snode.Index, super_ ) );
      }
      TIMER_STOP(Fill_UBuf);

      // Calculate the relative indices for (isup, jsup)
      // Fill AinvBuf with the information in L or U block.
      TIMER_START(JB_Loop);

#ifdef STDFIND
      //      for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
      //
      //        UBlock& UB = UrowRecv[jb];
      //        Int jsup = UB.blockIdx;
      //        Int SinvColsSta = FirstBlockCol( jsup, super_ );
      //
      //        // Column relative indices
      //        std::vector<Int> relCols( UB.numCol );
      //        for( Int j = 0; j < UB.numCol; j++ ){
      //          relCols[j] = UB.cols[j] - SinvColsSta;
      //        }
      //
      //        for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
      //          LBlock& LB = LcolRecv[ib];
      //          Int isup = LB.blockIdx;
      //          Int SinvRowsSta = FirstBlockCol( isup, super_ );
      //          Scalar* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
      //          Int     ldAinv    = numRowAinvBuf;
      //
      //          // Pin down the corresponding block in the part of Sinv.
      //          if( isup >= jsup ){
      //            std::vector<LBlock>&  LcolSinv = this->L( LBj(jsup, grid_ ) );
      //            bool isBlockFound = false;
      //            TIMER_START(PARSING_ROW_BLOCKIDX);
      //            for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
      //              // Found the (isup, jsup) block in Sinv
      //              if( LcolSinv[ibSinv].blockIdx == isup ){
      //                LBlock& SinvB = LcolSinv[ibSinv];
      //
      //                // Row relative indices
      //                std::vector<Int> relRows( LB.numRow );
      //                Int* rowsLBPtr    = LB.rows.Data();
      //                Int* rowsSinvBPtr = SinvB.rows.Data();
      //
      //                TIMER_START(STDFIND_ROW);
      //                Int * pos  = &rowsSinvBPtr[0];
      //                Int * last = &rowsSinvBPtr[SinvB.numRow];
      //                for( Int i = 0; i < LB.numRow; i++ ){
      //                  //pos = std::find(pos, &rowsSinvBPtr[SinvB.numRow-1], rowsLBPtr[i]);
      //                  pos = std::find(rowsSinvBPtr, last, rowsLBPtr[i]);
      //                  if(pos != last){
      //                    relRows[i] = (Int)(pos - rowsSinvBPtr);
      //                  }
      //                  else{
      //                    std::ostringstream msg;
      //                    msg << "Row " << rowsLBPtr[i] <<
      //                      " in LB cannot find the corresponding row in SinvB" << std::endl
      //                      << "LB.rows    = " << LB.rows << std::endl
      //                      << "SinvB.rows = " << SinvB.rows << std::endl;
      //                    ErrorHandling( msg.str().c_str() );
      //                  }
      //                }
      //                TIMER_STOP(STDFIND_ROW);
      //
      //                TIMER_START(Copy_Sinv_to_Ainv);
      //                // Transfer the values from Sinv to AinvBlock
      //                Scalar* nzvalSinv = SinvB.nzval.Data();
      //                Int     ldSinv    = SinvB.numRow;
      //                for( Int j = 0; j < UB.numCol; j++ ){
      //                  for( Int i = 0; i < LB.numRow; i++ ){
      //                    nzvalAinv[i+j*ldAinv] =
      //                      nzvalSinv[relRows[i] + relCols[j] * ldSinv];
      //                  }
      //                }
      //                TIMER_STOP(Copy_Sinv_to_Ainv);
      //
      //                isBlockFound = true;
      //                break;
      //              }
      //            } // for (ibSinv )
      //            TIMER_STOP(PARSING_ROW_BLOCKIDX);
      //            if( isBlockFound == false ){
      //              std::ostringstream msg;
      //              msg << "Block(" << isup << ", " << jsup
      //                << ") did not find a matching block in Sinv." << std::endl;
      //              ErrorHandling( msg.str().c_str() );
      //            }
      //          } // if (isup, jsup) is in L
      //          else{
      //            // Row relative indices
      //            std::vector<Int> relRows( LB.numRow );
      //            Int SinvRowsSta = FirstBlockCol( isup, super_ );
      //            for( Int i = 0; i < LB.numRow; i++ ){
      //              relRows[i] = LB.rows[i] - SinvRowsSta;
      //            }
      //            std::vector<UBlock>&   UrowSinv = this->U( LBi( isup, grid_ ) );
      //            bool isBlockFound = false;
      //            TIMER_START(PARSING_COL_BLOCKIDX);
      //            for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
      //              // Found the (isup, jsup) block in Sinv
      //              if( UrowSinv[jbSinv].blockIdx == jsup ){
      //                UBlock& SinvB = UrowSinv[jbSinv];
      //
      //                // Column relative indices
      //                std::vector<Int> relCols( UB.numCol );
      //                Int* colsUBPtr    = UB.cols.Data();
      //                Int* colsSinvBPtr = SinvB.cols.Data();
      //                TIMER_START(STDFIND_COL);
      //                Int * pos  = &colsSinvBPtr[0];
      //                Int * last = &colsSinvBPtr[SinvB.numCol];
      //                for( Int j = 0; j < UB.numCol; j++ ){
      //                  //colsUB is sorted
      //                  pos = std::find(colsSinvBPtr, last, colsUBPtr[j]);
      //                  if(pos != last){
      //                    relCols[j] = (Int)(pos - colsSinvBPtr);
      //                  }
      //                  else{
      //                    std::ostringstream msg;
      //                    msg << "Col " << colsUBPtr[j] <<
      //                      " in UB cannot find the corresponding column in SinvB" << std::endl
      //                      << "UB.cols    = " << UB.cols << std::endl
      //                      << "SinvB.cols = " << SinvB.cols << std::endl;
      //                    ErrorHandling( msg.str().c_str() );
      //                  }
      //                }
      //                TIMER_STOP(STDFIND_COL);
      //
      //                TIMER_START(Copy_Sinv_to_Ainv);
      //                // Transfer the values from Sinv to AinvBlock
      //                Scalar* nzvalSinv = SinvB.nzval.Data();
      //                Int     ldSinv    = SinvB.numRow;
      //                for( Int j = 0; j < UB.numCol; j++ ){
      //                  for( Int i = 0; i < LB.numRow; i++ ){
      //                    nzvalAinv[i+j*ldAinv] =
      //                      nzvalSinv[relRows[i] + relCols[j] * ldSinv];
      //                  }
      //                }
      //                TIMER_STOP(Copy_Sinv_to_Ainv);
      //
      //                isBlockFound = true;
      //                break;
      //              }
      //            } // for (jbSinv)
      //            TIMER_STOP(PARSING_COL_BLOCKIDX);
      //            if( isBlockFound == false ){
      //              std::ostringstream msg;
      //              msg << "Block(" << isup << ", " << jsup
      //                << ") did not find a matching block in Sinv." << std::endl;
      //              ErrorHandling( msg.str().c_str() );
      //            }
      //          } // if (isup, jsup) is in U
      //
      //        } // for( ib )
      //      } // for ( jb )
#else
      for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
        for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
          LBlock<T>& LB = LcolRecv[ib];
          UBlock<T>& UB = UrowRecv[jb];
          Int isup = LB.blockIdx;
          Int jsup = UB.blockIdx;
          T* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
          Int     ldAinv    = AinvBuf.m();

          // Pin down the corresponding block in the part of Sinv.
          if( isup >= jsup ){
            std::vector<LBlock<T> >&  LcolSinv = this->L( LBj(jsup, grid_ ) );
            bool isBlockFound = false;
            TIMER_START(PARSING_ROW_BLOCKIDX);
            for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
              // Found the (isup, jsup) block in Sinv
              if( LcolSinv[ibSinv].blockIdx == isup ){
                LBlock<T> & SinvB = LcolSinv[ibSinv];

                // Row relative indices
                std::vector<Int> relRows( LB.numRow );
                Int* rowsLBPtr    = LB.rows.Data();
                Int* rowsSinvBPtr = SinvB.rows.Data();
                for( Int i = 0; i < LB.numRow; i++ ){
                  bool isRowFound = false;
                  for( Int i1 = 0; i1 < SinvB.numRow; i1++ ){
                    if( rowsLBPtr[i] == rowsSinvBPtr[i1] ){
                      isRowFound = true;
                      relRows[i] = i1;
                      break;
                    }
                  }
                  if( isRowFound == false ){
                    std::ostringstream msg;
                    msg << "Row " << rowsLBPtr[i] <<
                      " in LB cannot find the corresponding row in SinvB" << std::endl
                      << "LB.rows    = " << LB.rows << std::endl
                      << "SinvB.rows = " << SinvB.rows << std::endl;
                    ErrorHandling( msg.str().c_str() );
                  }
                }

                // Column relative indices
                std::vector<Int> relCols( UB.numCol );
                Int SinvColsSta = FirstBlockCol( jsup, super_ );
                for( Int j = 0; j < UB.numCol; j++ ){
                  relCols[j] = UB.cols[j] - SinvColsSta;
                }

                // Transfer the values from Sinv to AinvBlock
                T* nzvalSinv = SinvB.nzval.Data();
                Int     ldSinv    = SinvB.numRow;
                for( Int j = 0; j < UB.numCol; j++ ){
                  for( Int i = 0; i < LB.numRow; i++ ){
                    nzvalAinv[i+j*ldAinv] =
                      nzvalSinv[relRows[i] + relCols[j] * ldSinv];
                  }
                }

                isBlockFound = true;
                break;
              }
            } // for (ibSinv )
            TIMER_STOP(PARSING_ROW_BLOCKIDX);
            if( isBlockFound == false ){
              std::ostringstream msg;
              msg << "Block(" << isup << ", " << jsup
                << ") did not find a matching block in Sinv." << std::endl;
              ErrorHandling( msg.str().c_str() );
            }
          } // if (isup, jsup) is in L
          else{
            std::vector<UBlock<T> >&   UrowSinv = this->U( LBi( isup, grid_ ) );
            bool isBlockFound = false;
            TIMER_START(PARSING_COL_BLOCKIDX);
            for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
              // Found the (isup, jsup) block in Sinv
              if( UrowSinv[jbSinv].blockIdx == jsup ){
                UBlock<T> & SinvB = UrowSinv[jbSinv];

                // Row relative indices
                std::vector<Int> relRows( LB.numRow );
                Int SinvRowsSta = FirstBlockCol( isup, super_ );
                for( Int i = 0; i < LB.numRow; i++ ){
                  relRows[i] = LB.rows[i] - SinvRowsSta;
                }

                // Column relative indices
                std::vector<Int> relCols( UB.numCol );
                Int* colsUBPtr    = UB.cols.Data();
                Int* colsSinvBPtr = SinvB.cols.Data();
                for( Int j = 0; j < UB.numCol; j++ ){
                  bool isColFound = false;
                  for( Int j1 = 0; j1 < SinvB.numCol; j1++ ){
                    if( colsUBPtr[j] == colsSinvBPtr[j1] ){
                      isColFound = true;
                      relCols[j] = j1;
                      break;
                    }
                  }
                  if( isColFound == false ){
                    std::ostringstream msg;
                    msg << "Col " << colsUBPtr[j] <<
                      " in UB cannot find the corresponding column in SinvB" << std::endl
                      << "UB.cols    = " << UB.cols << std::endl
                      << "SinvB.cols = " << SinvB.cols << std::endl;
                    ErrorHandling( msg.str().c_str() );
                  }
                }


                // Transfer the values from Sinv to AinvBlock
                T* nzvalSinv = SinvB.nzval.Data();
                Int     ldSinv    = SinvB.numRow;
                for( Int j = 0; j < UB.numCol; j++ ){
                  for( Int i = 0; i < LB.numRow; i++ ){
                    nzvalAinv[i+j*ldAinv] =
                      nzvalSinv[relRows[i] + relCols[j] * ldSinv];
                  }
                }

                isBlockFound = true;
                break;
              }
            } // for (jbSinv)
            TIMER_STOP(PARSING_COL_BLOCKIDX);
            if( isBlockFound == false ){
              std::ostringstream msg;
              msg << "Block(" << isup << ", " << jsup
                << ") did not find a matching block in Sinv." << std::endl;
              ErrorHandling( msg.str().c_str() );
            }
          } // if (isup, jsup) is in U

        } // for( ib )
      } // for ( jb )


#endif
      TIMER_STOP(JB_Loop);


      TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
    } 		// -----  end of method PMatrix::SelInv_lookup_indexes  -----

  template<typename T>
    inline void PMatrix<T>::SendRecvCD_UpdateU(
        std::vector<SuperNodeBufferType > & arrSuperNodes,
        Int stepSuper)
    {

      TIMER_START(Send_CD_Update_U);
      //compute the number of requests
      Int sendCount = 0;
      Int recvCount = 0;
      Int sendOffset[stepSuper];
      Int recvOffset[stepSuper];
      Int recvIdx = 0;
      for (Int supidx=0; supidx<stepSuper; supidx++){
        SuperNodeBufferType & snode = arrSuperNodes[supidx];
        sendOffset[supidx]=sendCount;
        recvOffset[supidx]=recvCount;
        sendCount+= CountSendToCrossDiagonal(snode.Index);
        recvCount+= CountRecvFromCrossDiagonal(snode.Index);
      }
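      // The offsets are exclusive prefix sums of the per-supernode request
      // counts. E.g. (illustrative) with stepSuper = 3 and send counts
      // {2, 0, 1}: sendOffset = {0, 2, 2} and sendCount = 3, so supernode
      // supidx owns request slots [sendOffset[supidx], sendOffset[supidx+1]).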


      std::vector<MPI_Request > arrMpiReqsSendCD(sendCount, MPI_REQUEST_NULL );
      std::vector<MPI_Request > arrMpiReqsSizeSendCD(sendCount, MPI_REQUEST_NULL );

      std::vector<MPI_Request > arrMpiReqsRecvCD(recvCount, MPI_REQUEST_NULL );
      std::vector<MPI_Request > arrMpiReqsSizeRecvCD(recvCount, MPI_REQUEST_NULL );
      std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
      std::vector<int > arrSstrLcolSizeSendCD(sendCount);
      std::vector<std::vector<char> > arrSstrLcolRecvCD(recvCount);
      std::vector<int > arrSstrLcolSizeRecvCD(recvCount);

      for (Int supidx=0; supidx<stepSuper; supidx++){
        SuperNodeBufferType & snode = arrSuperNodes[supidx];

        // Send LUpdateBufReduced to the cross diagonal blocks.
        // NOTE: This assumes a square processor grid.

        TIMER_START(Send_L_CrossDiag);

        if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && isSendToCrossDiagonal_(grid_->numProcCol, snode.Index ) ){

          Int sendIdx = 0;
          for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
            if(isSendToCrossDiagonal_(dstCol,snode.Index) ){
              Int dest = PNUM(PROW(snode.Index,grid_),dstCol,grid_);

              if( MYPROC( grid_ ) != dest ){
                MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSendCD[sendOffset[supidx]+sendIdx];
                MPI_Request & mpiReqSend = arrMpiReqsSendCD[sendOffset[supidx]+sendIdx];


                std::stringstream sstm;
                std::vector<char> & sstrLcolSend = arrSstrLcolSendCD[sendOffset[supidx]+sendIdx];
                Int & sstrSize = arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdx];

                serialize( snode.RowLocalPtr, sstm, NO_MASK );
                serialize( snode.BlockIdxLocal, sstm, NO_MASK );
                serialize( snode.LUpdateBuf, sstm, NO_MASK );

                sstrLcolSend.resize( Size(sstm) );
                sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
                sstrSize = sstrLcolSend.size();

#if ( _DEBUGlevel_ >= 1 )
                assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_)<=maxTag_);
                assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_)<=maxTag_);
#endif

                MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dest, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_), grid_->comm, &mpiReqSizeSend );
                MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dest, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_), grid_->comm, &mpiReqSend );

                PROFILE_COMM(MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_),sizeof(sstrSize));
                PROFILE_COMM(MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_),sstrSize);


                sendIdx++;
              }
            }
          }


        } // sender
        TIMER_STOP(Send_L_CrossDiag);
      }


      //Do Irecv for sizes
      for (Int supidx=0; supidx<stepSuper; supidx++){
        SuperNodeBufferType & snode = arrSuperNodes[supidx];
        //If I'm a receiver
        if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
          Int recvIdx=0;
          for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
            if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
              Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
              if( MYPROC( grid_ ) != src ){
                Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
                MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecvCD[recvOffset[supidx]+recvIdx];

#if ( _DEBUGlevel_ >= 1 )
                assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_)<=maxTag_);
#endif
                MPI_Irecv( &sstrSize, 1, MPI_INT, src, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_), grid_->comm, &mpiReqSizeRecv);
                recvIdx++;
              }
            }
          }
        }//end if I'm a receiver
      }

      //waitall sizes
      mpi::Waitall(arrMpiReqsSizeRecvCD);
      //Allocate content and do Irecv
      for (Int supidx=0; supidx<stepSuper; supidx++){
        SuperNodeBufferType & snode = arrSuperNodes[supidx];
        //If I'm a receiver
        if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
          Int recvIdx=0;
          for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
            if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
              Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
              if( MYPROC( grid_ ) != src ){
                Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
                std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
                MPI_Request & mpiReqRecv = arrMpiReqsRecvCD[recvOffset[supidx]+recvIdx];
                sstrLcolRecv.resize( sstrSize);

#if ( _DEBUGlevel_ >= 1 )
                assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_)<=maxTag_);
#endif
                MPI_Irecv( (void*)&sstrLcolRecv[0], sstrSize, MPI_BYTE, src, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_), grid_->comm, &mpiReqRecv );
                //statusOFS<<"P"<<MYPROC(grid_)<<" received "<<sstrSize<<" bytes of L/U from CD P"<<src<<std::endl;
                recvIdx++;
              }
            }
          }
        }//end if I'm a receiver
      }

      //waitall content
      mpi::Waitall(arrMpiReqsRecvCD);
      //Do the work
      for (Int supidx=0; supidx<stepSuper; supidx++){
        SuperNodeBufferType & snode = arrSuperNodes[supidx];

        // Receive LUpdateBufReduced from the cross diagonal blocks and update U.
        // NOTE: This assumes a square processor grid.
        if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){

#if ( _DEBUGlevel_ >= 1 )
          statusOFS << std::endl << " ["<<snode.Index<<"] "<<  "Update the upper triangular block" << std::endl << std::endl;
          statusOFS << std::endl << " ["<<snode.Index<<"] "<<  "blockIdxLocal:" << snode.BlockIdxLocal << std::endl << std::endl;
          statusOFS << std::endl << " ["<<snode.Index<<"] "<<  "rowLocalPtr:" << snode.RowLocalPtr << std::endl << std::endl;
#endif

          std::vector<UBlock<T> >&  Urow = this->U( LBi( snode.Index, grid_ ) );
          std::vector<bool> isBlockFound(Urow.size(),false);

          recvIdx=0;
          for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
            if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
              Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
              TIMER_START(Recv_L_CrossDiag);

              std::vector<Int> rowLocalPtrRecv;
              std::vector<Int> blockIdxLocalRecv;
              NumMat<T> UUpdateBuf;

              if( MYPROC( grid_ ) != src ){
                std::stringstream sstm;
                Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
                std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
                sstm.write( &sstrLcolRecv[0], sstrSize );

                deserialize( rowLocalPtrRecv, sstm, NO_MASK );
                deserialize( blockIdxLocalRecv, sstm, NO_MASK );
                deserialize( UUpdateBuf, sstm, NO_MASK );

                recvIdx++;

              } // sender is not the same as receiver
              else{
                rowLocalPtrRecv   = snode.RowLocalPtr;
                blockIdxLocalRecv = snode.BlockIdxLocal;
                UUpdateBuf = snode.LUpdateBuf;
              } // sender is the same as receiver



              TIMER_STOP(Recv_L_CrossDiag);

#if ( _DEBUGlevel_ >= 1 )
              statusOFS<<" ["<<snode.Index<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<snode.Index<<") <--- P"<<src<<std::endl;
              statusOFS << std::endl << " ["<<snode.Index<<"] "<<  "rowLocalPtrRecv:" << rowLocalPtrRecv << std::endl << std::endl;
              statusOFS << std::endl << " ["<<snode.Index<<"] "<<  "blockIdxLocalRecv:" << blockIdxLocalRecv << std::endl << std::endl;
#endif


              // Update U
              for( Int ib = 0; ib < blockIdxLocalRecv.size(); ib++ ){
                for( Int jb = 0; jb < Urow.size(); jb++ ){
                  UBlock<T> & UB = Urow[jb];
                  if( UB.blockIdx == blockIdxLocalRecv[ib] ){
                    NumMat<T> Ltmp ( UB.numCol, UB.numRow );
                    lapack::Lacpy( 'A', Ltmp.m(), Ltmp.n(),
                        &UUpdateBuf( rowLocalPtrRecv[ib], 0 ),
                        UUpdateBuf.m(), Ltmp.Data(), Ltmp.m() );
                    isBlockFound[jb] = true;
                    Transpose( Ltmp, UB.nzval );
                    break;
                  }
                }
              }
            }
          }

          for( Int jb = 0; jb < Urow.size(); jb++ ){
            UBlock<T> & UB = Urow[jb];
            if( !isBlockFound[jb] ){
              ErrorHandling( "UBlock cannot find its update. Something is seriously wrong." );
            }
          }
        } // receiver
      }

      TIMER_STOP(Send_CD_Update_U);

      mpi::Waitall(arrMpiReqsSizeSendCD);
      mpi::Waitall(arrMpiReqsSendCD);
    } 		// -----  end of method PMatrix::SendRecvCD_UpdateU  -----

  template<typename T>
    inline void PMatrix<T>::UnpackData(
        SuperNodeBufferType & snode,
        std::vector<LBlock<T> > & LcolRecv,
        std::vector<UBlock<T> > & UrowRecv )
    {
#if ( _DEBUGlevel_ >= 1 )
      statusOFS << std::endl << "["<<snode.Index<<"] "<<  "Unpack the received data for processors participating in the Gemm. " << std::endl << std::endl;
#endif
      // U part
      if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
        std::stringstream sstm;
#ifndef NEW_BCAST
        sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
#else
        //sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
        TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
        sstm.write( (char*)bcastUTree2->GetLocalBuffer(),bcastUTree2->GetMsgSize());
#endif
        std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
        Int numUBlock;
        deserialize( numUBlock, sstm, NO_MASK );
        UrowRecv.resize( numUBlock );
        for( Int jb = 0; jb < numUBlock; jb++ ){
          deserialize( UrowRecv[jb], sstm, mask );
        }
      } // sender is not the same as receiver
      else{
        // U is obtained locally, just make a copy. Include everything
        // (there is no diagonal block)
        // Is it a copy ?  LL: YES. Maybe we should replace the copy by
        // something more efficient especially for mpisize == 1
        UrowRecv.resize(this->U( LBi( snode.Index, grid_ ) ).size());
        std::copy(this->U( LBi( snode.Index, grid_ ) ).begin(),this->U( LBi( snode.Index, grid_ )).end(),UrowRecv.begin());
      } // sender is the same as receiver


      //L part
      if( MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
        std::stringstream sstm;
#ifndef NEW_BCAST
        sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
#else
        //sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
        TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
        sstm.write( (char*)bcastLTree2->GetLocalBuffer(),bcastLTree2->GetMsgSize());
#endif
        std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
        mask[LBlockMask::NZVAL] = 0; // nzval is excluded
        Int numLBlock;
        deserialize( numLBlock, sstm, NO_MASK );
        LcolRecv.resize( numLBlock );
        for( Int ib = 0; ib < numLBlock; ib++ ){
          deserialize( LcolRecv[ib], sstm, mask );
        }
      } // sender is not the same as receiver
      else{
        // L is obtained locally, just make a copy.
        // Do not include the diagonal block
        std::vector<LBlock<T> >& Lcol =  this->L( LBj( snode.Index, grid_ ) );
        Int startIdx = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )?1:0;
        LcolRecv.resize( Lcol.size() - startIdx );
        std::copy(Lcol.begin()+startIdx,Lcol.end(),LcolRecv.begin());
      } // sender is the same as receiver
    } 		// -----  end of method PMatrix::UnpackData  -----

  template<typename T>
    inline void PMatrix<T>::ComputeDiagUpdate(SuperNodeBufferType & snode)
    {

      //---------Computing  Diagonal block: all processors in the column
      //are participating in all pipelined supernodes
      if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
#if ( _DEBUGlevel_ >= 2 )
        statusOFS << std::endl << "["<<snode.Index<<"] "<<  "Updating the diagonal block" << std::endl << std::endl;
#endif
        std::vector<LBlock<T> >&  Lcol = this->L( LBj( snode.Index, grid_ ) );

        //Allocate DiagBuf even if Lcol.size() == 0
        snode.DiagBuf.Resize(SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
        SetValue(snode.DiagBuf, ZERO<T>());

        // Do I own the diagonal block ?
        Int startIb = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
        for( Int ib = startIb; ib < Lcol.size(); ib++ ){

#ifdef GEMM_PROFILE
          gemm_stat.push_back(snode.DiagBuf.m());
          gemm_stat.push_back(snode.DiagBuf.n());
          gemm_stat.push_back(Lcol[ib].numRow);
#endif

          LBlock<T> & LB = Lcol[ib];

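          // Accumulate DiagBuf -= LUpdateBuf(block ib)^T * LB.nzval: the 'T'
          // on the first operand realizes the transpose that was deferred
          // when the update buffer was assembled.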
          blas::Gemm( 'T', 'N', snode.DiagBuf.m(), snode.DiagBuf.n(), LB.numRow,
              MINUS_ONE<T>(), &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ), snode.LUpdateBuf.m(),
              LB.nzval.Data(), LB.nzval.m(), ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
        }

#if ( _DEBUGlevel_ >= 1 )
        statusOFS << std::endl << "["<<snode.Index<<"] "<<  "Updated the diagonal block" << std::endl << std::endl;
#endif
      }
    } 		// -----  end of method PMatrix::ComputeDiagUpdate  -----



  template<typename T>
    void PMatrix<T>::GetEtree(std::vector<Int> & etree_supno )
    {
#ifndef _RELEASE_
      double begin = MPI_Wtime( );  // paired with the timing report at the end of this method
#endif
      Int nsupers = this->NumSuper();

      if( optionsFact_->ColPerm != "PARMETIS" ) {
        /* Use the etree computed from serial symb. fact., and turn it
           into supernodal tree. */
        const SuperNodeType * superNode = this->SuperNode();


        //translate from columns to supernodes etree using supIdx
        etree_supno.resize(this->NumSuper());
        for(Int i = 0; i < superNode->etree.m(); ++i){
          Int curSnode = superNode->superIdx[i];
          Int parentSnode = (superNode->etree[i]>= superNode->etree.m()) ? this->NumSuper() : superNode->superIdx[superNode->etree[i]];
          if( curSnode != parentSnode){
            etree_supno[curSnode] = parentSnode;
          }
        }

      } else { /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */
        /* Compute an "etree" based on struct(L),
           assuming struct(U) = struct(L').   */

        /* find the first block in each supernodal-column of local L-factor */
        std::vector<Int> etree_supno_l( nsupers, nsupers );
        for( Int ksup = 0; ksup < nsupers; ksup++ ){
          if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
            // L part
            std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
            if(Lcol.size()>0){
              Int firstBlk = 0;
              if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
                firstBlk=1;

              for( Int ib = firstBlk; ib < Lcol.size(); ib++ ){
                etree_supno_l[ksup] = std::min(etree_supno_l[ksup] , Lcol[ib].blockIdx);
              }
            }
          }
        }


#if ( _DEBUGlevel_ >= 1 )
        statusOFS << std::endl << " Local supernodal elimination tree is " << etree_supno_l <<std::endl<<std::endl;
#endif
        /* form global e-tree */
        etree_supno.resize( nsupers );
        mpi::Allreduce( (Int*) &etree_supno_l[0],(Int *) &etree_supno[0], nsupers, MPI_MIN, grid_->comm );
        etree_supno[nsupers-1]=nsupers;
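        // Locally, etree_supno_l[ksup] is the smallest block row below the
        // diagonal in supernodal column ksup; taking the MPI_MIN over all
        // processes yields the first subdiagonal block row seen anywhere,
        // which serves as the parent of ksup in the supernodal elimination
        // tree (the root is assigned nsupers above).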
      }

#ifndef _RELEASE_
      double end = MPI_Wtime( );
      statusOFS<<"Building the list took "<<end-begin<<"s"<<std::endl;
#endif
    } 		// -----  end of method PMatrix::GetEtree  -----


  template<typename T>
    inline void PMatrix<T>::GetWorkSet(
        std::vector<Int> & snodeEtree,
        std::vector<std::vector<Int> > & WSet)
    {
      TIMER_START(Compute_WorkSet);
      Int numSuper = this->NumSuper();


      if (options_->maxPipelineDepth!=1){

        Int maxDepth = options_->maxPipelineDepth;
        maxDepth = maxDepth==-1 ? std::numeric_limits<Int>::max() : maxDepth;
#if ( _DEBUGlevel_ >= 1 )
        statusOFS<<"MaxDepth is "<<maxDepth<<std::endl;
#endif

        //find roots in the supernode etree (it must be postordered)
        //initialize the parent we are looking at
        //Int rootParent = snodeEtree[numSuper-2];
        Int rootParent = numSuper;

        //compute the level of each supernode and the total number of levels
        //IntNumVec level(numSuper);
        //level(rootParent)=0;
        IntNumVec level(numSuper+1);
        level(rootParent)=-1;
        Int numLevel = 0;
        for(Int i=rootParent-1; i>=0; i-- ){
          level[i] = level[snodeEtree[i]]+1;
          numLevel = std::max(numLevel, level[i]);
        }
        numLevel++;
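        // Illustrative example: for the postordered supernodal etree
        // snodeEtree = {1, 4, 3, 4, 5} with numSuper = 5 (virtual root 5),
        // the loop above yields level = {2, 1, 2, 1, 0} and numLevel = 3,
        // so the worklists built below are {4}, {3, 1} and {2, 0}.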

        //Compute the number of supernodes at each level
        IntNumVec levelSize(numLevel);
        SetValue(levelSize,I_ZERO);
        //for(Int i=rootParent-1; i>=0; i-- ){ levelSize(level(i))++; }
        for(Int i=rootParent-1; i>=0; i-- ){
          if(level[i]>=0){
            levelSize[level[i]]++;
          }
        }

        //Allocate memory
        WSet.resize(numLevel,std::vector<Int>());
        for(Int i=0; i<numLevel; i++ ){
          WSet[i].reserve(levelSize(i));
        }

        //Fill the worklist based on the level of each supernode
        for(Int i=rootParent-1; i>=0; i-- ){
          //      TreeBcast * bcastLTree = fwdToRightTree_[i];
          //      TreeBcast * bcastUTree = fwdToBelowTree_[i];
          //      TreeReduce<T> * redLTree = redToLeftTree_[i];
          //      TreeReduce<T> * redDTree = redToAboveTree_[i];
          //
          //bool participating = MYROW( grid_ ) == PROW( i, grid_ ) || MYCOL( grid_ ) == PCOL( i, grid_ )
          //     || CountSendToRight(i) > 0
          //     || CountSendToBelow(i) > 0
          //     || CountSendToCrossDiagonal(i) > 0
          //     || CountRecvFromCrossDiagonal(i) >0
          //     || ( isRecvFromLeft_( i ) )
          //     || ( isRecvFromAbove_( i ) )
          //     || isSendToDiagonal_(i)
          //     || (bcastUTree!=NULL)
          //     || (bcastLTree!=NULL)
          //     || (redLTree!=NULL)
          //     || (redDTree!=NULL) ;
          //participating = true;
          //if( participating){
          WSet[level[i]].push_back(i);
          //}

        }

#if 1
        //Constrain the size of each list to be min(MPI_MAX_COMM,options_->maxPipelineDepth)
        Int limit = maxDepth; //(options_->maxPipelineDepth>0)?std::min(MPI_MAX_COMM,options_->maxPipelineDepth):MPI_MAX_COMM;
        Int rank = 0;
        for (Int lidx=0; lidx<WSet.size() ; lidx++){


          //Assign a rank in the order they are processed ?

          bool split = false;
          Int splitIdx = 0;

          Int orank = rank;
          Int maxRank = rank + WSet[lidx].size()-1;
          //      Int maxRank = WSet[lidx].back();
#if ( _DEBUGlevel_ >= 1 )
          statusOFS<< (Int)(rank/limIndex_) << " vs2 "<<(Int)(maxRank/limIndex_)<<std::endl;
#endif
          if( (Int)(rank/limIndex_) != (Int)(maxRank/limIndex_)){
            split = true;
            //splitIdx = maxRank - maxRank%limIndex_ - rank;
            splitIdx = limIndex_-(rank%limIndex_)-1;
          }

          Int splitPoint = std::min((Int)WSet[lidx].size()-1,splitIdx>0?std::min(limit-1,splitIdx):limit-1);
#if ( _DEBUGlevel_ >= 1 )
          if(split){
            statusOFS<<"TEST SPLIT at "<<splitIdx<<" "<<std::endl;
          }
#endif
          split = split || (WSet[lidx].size()>limit);

          rank += splitPoint+1;

          if( split && splitPoint>0)
          {
#if ( _DEBUGlevel_ >= 1 )
            statusOFS<<"SPLITPOINT is "<<splitPoint<<" "<<std::endl;
            if(splitPoint != limit-1){
              statusOFS<<"-----------------------"<<std::endl;
              statusOFS<<lidx<<": "<<orank<<" -- "<<maxRank<<std::endl;
              statusOFS<<" is NOW "<<std::endl;
              statusOFS<<lidx<<": "<<orank<<" -- "<<rank-1<<std::endl;
              statusOFS<<lidx+1<<": "<<rank<<" -- "<<maxRank<<std::endl;
              statusOFS<<"-----------------------"<<std::endl;
            }
#endif
            std::vector<std::vector<Int> >::iterator pos = WSet.begin()+lidx+1;
            WSet.insert(pos,std::vector<Int>());
            WSet[lidx+1].insert(WSet[lidx+1].begin(),WSet[lidx].begin() + splitPoint+1 ,WSet[lidx].end());
            WSet[lidx].erase(WSet[lidx].begin()+splitPoint+1,WSet[lidx].end());

#if ( _DEBUGlevel_ >= 1 )
            if(splitPoint != limit-1){
              assert((orank+WSet[lidx].size()-1)%limIndex_==0);
            }
#endif




            //statusOFS<<"-----------------------"<<std::endl;
            //for(auto it = WSet[lidx].begin();it !=WSet[lidx].end(); it++){statusOFS<<*it<<" ";}statusOFS<<std::endl;
            //for(auto it = WSet[lidx+1].begin();it !=WSet[lidx+1].end(); it++){statusOFS<<*it<<" ";}statusOFS<<std::endl;
            //statusOFS<<"-----------------------"<<std::endl;


            //          //THERE IS A SPECIAL CASE: list longer than limit AND splitIdx in between: we have to check if we don't have to do multiple splits
            //          if(splitPoint<splitIdx){
            //            Int newSplitIdx = splitIdx - splitPoint;
            //
            //            pos = WSet.begin()+lidx+2;
            //            WSet.insert(pos,std::vector<Int>());
            //            WSet[lidx+2].insert(WSet[lidx+2].begin(),WSet[lidx+1].begin() + newSplitIdx+1 ,WSet[lidx+1].end());
            //            WSet[lidx+1].erase(WSet[lidx+1].begin()+newSplitIdx+1,WSet[lidx+1].end());
            //
            //            pos2 = workingRanks_.begin()+lidx+2;
            //            workingRanks_.insert(pos2,std::vector<Int>());
            //            workingRanks_[lidx+2].insert(workingRanks_[lidx+2].begin(),workingRanks_[lidx+1].begin() + newSplitIdx+1 ,workingRanks_[lidx+1].end());
            //
            //statusOFS<<"SECOND SPLITPOINT IS "<<newSplitIdx<<std::endl;
            //statusOFS<<"-----------------------"<<std::endl;
            //statusOFS<<workingRanks_[lidx+1].front()<<" -- "<<workingRanks_[lidx+1].back()<<std::endl;
            //            workingRanks_[lidx+1].erase(workingRanks_[lidx+1].begin()+newSplitIdx+1,workingRanks_[lidx+1].end());
            //statusOFS<<" is NOW "<<std::endl;
            //statusOFS<<workingRanks_[lidx+1].front()<<" -- "<<workingRanks_[lidx+1].back()<<std::endl;
            //statusOFS<<workingRanks_[lidx+2].front()<<" -- "<<workingRanks_[lidx+2].back()<<std::endl;
            //statusOFS<<"-----------------------"<<std::endl;
            //          }

          }


        }

        //filter out non participating ?
        //      for(Int i=rootParent-1; i>=0; i-- ){
        //        ////TODO do I have something to do here ?
        //        //      TreeBcast * bcastLTree = fwdToRightTree_[i];
        //        //      TreeBcast * bcastUTree = fwdToBelowTree_[i];
        //        //      TreeReduce<T> * redLTree = redToLeftTree_[i];
        //        //      TreeReduce<T> * redDTree = redToAboveTree_[i];
        //        //
        //        //bool participating = MYROW( grid_ ) == PROW( i, grid_ ) || MYCOL( grid_ ) == PCOL( i, grid_ )
        //        //     || CountSendToRight(i) > 0
        //        //     || CountSendToBelow(i) > 0
        //        //     || CountSendToCrossDiagonal(i) > 0
        //        //     || CountRecvFromCrossDiagonal(i) >0
        //        //     || ( isRecvFromLeft_( i ) )
        //        //     || ( isRecvFromAbove_( i ) )
        //        //     || isSendToDiagonal_(i)
        //        //     || (bcastUTree!=NULL)
        //        //     || (bcastLTree!=NULL)
        //        //     || (redLTree!=NULL)
        //        //     || (redDTree!=NULL) ;
        //        //participating = true;
        //        //if(! participating){
        //        //  WSet[level[i]].erase(i);
        //        //}
        //      }
#endif



      }
      else{
        for( Int ksup = numSuper - 2; ksup >= 0; ksup-- ){
          WSet.push_back(std::vector<Int>());
          WSet.back().push_back(ksup);
        }

      }




      TIMER_STOP(Compute_WorkSet);
#if ( _DEBUGlevel_ >= 1 )
      for (Int lidx=0; lidx<WSet.size() ; lidx++){
        statusOFS << std::endl << "L"<< lidx << " is: {";
        for (Int supidx=0; supidx<WSet[lidx].size() ; supidx++){
          statusOFS << WSet[lidx][supidx] << " ["<<snodeEtree[WSet[lidx][supidx]]<<"] ";
        }
        statusOFS << " }"<< std::endl;
      }
#endif
    } 		// -----  end of method PMatrix::GetWorkSet  -----

  template<typename T>
    inline void PMatrix<T>::SelInvIntra_P2p(Int lidx,Int & rank)
    {

#if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
      Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
#endif
      Int numSuper = this->NumSuper();
      std::vector<std::vector<Int> > & superList = this->WorkingSet();
      Int numSteps = superList.size();
      Int stepSuper = superList[lidx].size();




      TIMER_START(AllocateBuffer);

      stepSuper = 0;
      for (Int supidx=0; supidx<superList[lidx].size(); supidx++){
        Int snodeIdx = superList[lidx][supidx];
#ifndef NEW_BCAST
        TreeBcast * bcastLTree = fwdToRightTree_[snodeIdx];
        TreeBcast * bcastUTree = fwdToBelowTree_[snodeIdx];
#else
        TreeBcast2<T> * bcastLTree = fwdToRightTree2_[snodeIdx];
        TreeBcast2<T> * bcastUTree = fwdToBelowTree2_[snodeIdx];
#endif
        TreeReduce<T> * redLTree = redToLeftTree_[snodeIdx];
        TreeReduce<T> * redDTree = redToAboveTree_[snodeIdx];
        bool participating = MYROW( grid_ ) == PROW( snodeIdx, grid_ )
          || MYCOL( grid_ ) == PCOL( snodeIdx, grid_ )
          || CountSendToRight(snodeIdx) > 0
          || CountSendToBelow(snodeIdx) > 0
          || CountSendToCrossDiagonal(snodeIdx) > 0
          || CountRecvFromCrossDiagonal(snodeIdx) > 0
          || ( isRecvFromLeft_( snodeIdx ) )
          || ( isRecvFromAbove_( snodeIdx ) )
          || isSendToDiagonal_(snodeIdx)
          || (bcastUTree!=NULL)
          || (bcastLTree!=NULL)
          || (redLTree!=NULL)
          || (redDTree!=NULL) ;
        if(participating){
          stepSuper++;
        }
      }
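      // A process participates in supernode snodeIdx if it owns part of the
      // corresponding block row or column, exchanges Schur-complement or
      // cross-diagonal data for it, or appears in one of its broadcast /
      // reduction trees; only participating supernodes get a buffer below.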



#ifndef NEW_BCAST
      //This is required to send the size and content of U/L
      std::vector<std::vector<MPI_Request> > arrMpireqsSendToBelow;
      arrMpireqsSendToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcRow, MPI_REQUEST_NULL ));
      std::vector<std::vector<MPI_Request> > arrMpireqsSendToRight;
      arrMpireqsSendToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcCol, MPI_REQUEST_NULL ));
#endif

#if 0
      //This is required to reduce L
      std::vector<MPI_Request> arrMpireqsSendToLeft;
      arrMpireqsSendToLeft.resize(stepSuper, MPI_REQUEST_NULL );
      //This is required to reduce D
      std::vector<MPI_Request> arrMpireqsSendToAbove;
      arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
#endif

      //This is required to receive the size and content of U/L
#ifndef NEW_BCAST
      std::vector<MPI_Request> arrMpireqsRecvSizeFromAny;
      arrMpireqsRecvSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
      std::vector<MPI_Request> arrMpireqsRecvContentFromAny;
      arrMpireqsRecvContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
#endif

      //allocate the buffers for this supernode
      std::vector<SuperNodeBufferType> arrSuperNodes(stepSuper);
      Int pos = 0;
      for (Int supidx=0; supidx<superList[lidx].size(); supidx++){
        Int snodeIdx = superList[lidx][supidx];
#ifndef NEW_BCAST
        TreeBcast * bcastLTree = fwdToRightTree_[snodeIdx];
        TreeBcast * bcastUTree = fwdToBelowTree_[snodeIdx];
#else
        TreeBcast2<T> * bcastLTree = fwdToRightTree2_[snodeIdx];
        TreeBcast2<T> * bcastUTree = fwdToBelowTree2_[snodeIdx];
#endif
        TreeReduce<T> * redLTree = redToLeftTree_[snodeIdx];
        TreeReduce<T> * redDTree = redToAboveTree_[snodeIdx];
        bool participating = MYROW( grid_ ) == PROW( snodeIdx, grid_ )
          || MYCOL( grid_ ) == PCOL( snodeIdx, grid_ )
          || CountSendToRight(snodeIdx) > 0
          || CountSendToBelow(snodeIdx) > 0
          || CountSendToCrossDiagonal(snodeIdx) > 0
          || CountRecvFromCrossDiagonal(snodeIdx) > 0
          || ( isRecvFromLeft_( snodeIdx ) )
          || ( isRecvFromAbove_( snodeIdx ) )
          || isSendToDiagonal_(snodeIdx)
          || (bcastUTree!=NULL)
          || (bcastLTree!=NULL)
          || (redLTree!=NULL)
          || (redDTree!=NULL) ;

        if(participating){
          arrSuperNodes[pos].Index = superList[lidx][supidx];
          arrSuperNodes[pos].Rank = rank;


          SuperNodeBufferType & snode = arrSuperNodes[pos];

          pos++;
        }
        rank++;
      }



      int numSentToLeft = 0;
      std::vector<int> reqSentToLeft;


      NumMat<T> AinvBuf, UBuf;

      TIMER_STOP(AllocateBuffer);

#if ( _DEBUGlevel_ >= 1 )
      statusOFS << std::endl << "Communication to the Schur complement." << std::endl << std::endl;
#endif

#ifdef NEW_BCAST
      vector<char> bcastUready(stepSuper,0);
      //    vector<char> bcastUdone(stepSuper,0);
      vector<char> bcastLready(stepSuper,0);
      //    vector<char> bcastLdone(stepSuper,0);
#endif
1598 
1599  {
1600  //Receivers have to resize their buffers
1601  TIMER_START(IRecv_Content_UL);
1602 
1603 
1604 
1605  // Receivers (Content)
1606  for (Int supidx=0; supidx<stepSuper ; supidx++){
1607  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1608 #ifndef NEW_BCAST
1609  MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvContentFromAny[supidx*2];
1610  MPI_Request * mpireqsRecvFromLeft = &arrMpireqsRecvContentFromAny[supidx*2+1];
1611 #endif
1612 
1613  if( isRecvFromAbove_( snode.Index ) &&
1614  MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
1615 
1616 #ifdef NEW_BCAST
1617  TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1618 
1619  if(bcastUTree2 != NULL){
1620  bcastUTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_));
1621  //Initialize the tree
1622  //bcastUTree2->AllocRecvBuffer();
1623  //Post Recv request;
1624  bool done = bcastUTree2->Progress();
1625 
1626 #if ( _DEBUGlevel_ >= 1 )
1627  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress bcast U "<<done<<std::endl;
1628 #endif
1629  }
1630 #else
1631  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1632  if(bcastUTree!=NULL){
1633  Int myRoot = bcastUTree->GetRoot();
1634  snode.SizeSstrUrowRecv = bcastUTree->GetMsgSize();
1635  snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv);
1636  MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
1637  myRoot, IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_),
1638  grid_->colComm, mpireqsRecvFromAbove );
1639 #if ( _DEBUGlevel_ >= 1 )
1640  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving U " << snode.SizeSstrUrowRecv << " BYTES from "<<myRoot<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
1641 #endif
1642  }
1643 #endif
1644  } // if I need to receive from up
1645 
1646  if( isRecvFromLeft_( snode.Index ) &&
1647  MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
1648 #ifdef NEW_BCAST
1649  TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1650 
1651  if(bcastLTree2 != NULL){
1652  bcastLTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_));
1653  //Initialize the tree
1654  //bcastUTree2->AllocRecvBuffer();
1655  //Post Recv request;
1656  bool done = bcastLTree2->Progress();
1657 
1658 #if ( _DEBUGlevel_ >= 1 )
1659  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress bcast L "<<done<<std::endl;
1660 #endif
1661  }
1662 #else
1663 
1664  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1665  if(bcastLTree!=NULL){
1666  Int myRoot = bcastLTree->GetRoot();
1667  snode.SizeSstrLcolRecv = bcastLTree->GetMsgSize();
1668  snode.SstrLcolRecv.resize(snode.SizeSstrLcolRecv);
1669  MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
1670  myRoot, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_),
1671  grid_->rowComm, mpireqsRecvFromLeft );
1672 #if ( _DEBUGlevel_ >= 1 )
1673  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving L " << snode.SizeSstrLcolRecv << " BYTES from "<<myRoot<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
1674 #endif
1675  }
1676 #endif
1677  } // if I need to receive from left
1678  }
1679  TIMER_STOP(IRecv_Content_UL);
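// Message tags are derived from the supernode rank through IDX_TO_TAG so that
// in-flight supernodes use distinct tags within a bounded range (limIndex_).
// A plausible form, shown only as an illustration and not necessarily the
// actual macro, is:
//   tag = (rank % limIndex_) * NUM_TAG_TYPES + msgType;  // NUM_TAG_TYPES is hypothetical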
1680 
1681  // Senders
1682  TIMER_START(ISend_Content_UL);
1683  for (Int supidx=0; supidx<stepSuper; supidx++){
1684  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1685 #ifndef NEW_BCAST
1686  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1687  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1688 #endif
1689 
1690 #if ( _DEBUGlevel_ >= 1 )
1691  statusOFS << std::endl << "["<<snode.Index<<"] "
1692  << "Communication for the U part." << std::endl << std::endl;
1693 #endif
1694  // Communication for the U part.
1695 
1696  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1697  std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, grid_) );
1698  // Pack the data in U
1699  TIMER_START(Serialize_UL);
1700  std::stringstream sstm;
1701 
1702  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1703  // All blocks are to be sent down.
1704  serialize( (Int)Urow.size(), sstm, NO_MASK );
1705  for( Int jb = 0; jb < Urow.size(); jb++ ){
1706  serialize( Urow[jb], sstm, mask );
1707  }
1708  snode.SstrUrowSend.resize( Size( sstm ) );
1709  sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
1710  snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
1711  TIMER_STOP(Serialize_UL);
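// Layout of the serialized U message: the block count followed by the fully
// masked blocks, i.e. [ (Int)Urow.size() | UBlock_0 | UBlock_1 | ... ].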
1712 
1713 #ifdef NEW_BCAST
1714  TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1715  if(bcastUTree2!=NULL){
1716  bcastUTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_));
1717  bcastUTree2->SetLocalBuffer((T*)&snode.SstrUrowSend[0]);
1718  bcastUTree2->SetDataReady(true);
1719  bool done = bcastUTree2->Progress();
1720 #if ( _DEBUGlevel_ >= 1 )
1721  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress bcast U "<<done<<std::endl;
1722 #endif
1723 
1724  //progress should do the send
1725 
1726 #if ( _DEBUGlevel_ >= 1 )
1727  for( Int idxRecv = 0; idxRecv < bcastUTree2->GetDestCount(); ++idxRecv ){
1728  Int iProcRow = bcastUTree2->GetDest(idxRecv);
1729  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending U " << snode.SizeSstrUrowSend << " BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_) << std::endl << std::endl;
1730  }
1731 #endif
1732 
1733  }
1734 #else
1735  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1736  if(bcastUTree!=NULL){
1737  bcastUTree->ForwardMessage((char*)&snode.SstrUrowSend[0], snode.SizeSstrUrowSend,
1738  IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_), &mpireqsSendToBelow[0]);
1739  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1740  Int iProcRow = bcastUTree->GetDest(idxRecv);
1741 #if ( _DEBUGlevel_ >= 1 )
1742  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending U " << snode.SizeSstrUrowSend << " BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_) << std::endl << std::endl;
1743 #endif
1744  }
1745  }
1746 
1747 #endif
1748  } // if I am the sender
1749 
1750 
1751 #if ( _DEBUGlevel_ >= 1 )
1752  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Communication for the L part." << std::endl << std::endl;
1753 #endif
1754 
1755 
1756 
1757  // Communication for the L part.
1758  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1759  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, grid_) );
1760  TIMER_START(Serialize_UL);
1761  // Pack the data in L
1762  std::stringstream sstm;
1763  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1764  mask[LBlockMask::NZVAL] = 0; // nzval is excluded: receivers only need the row structure
1765 
1766  // All blocks except for the diagonal block are to be sent right
1767 
1768  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )
1769  serialize( (Int)Lcol.size() - 1, sstm, NO_MASK );
1770  else
1771  serialize( (Int)Lcol.size(), sstm, NO_MASK );
1772 
1773  for( Int ib = 0; ib < Lcol.size(); ib++ ){
1774  if( Lcol[ib].blockIdx > snode.Index ){
1775 #if ( _DEBUGlevel_ >= 2 )
1776  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Serializing Block index " << Lcol[ib].blockIdx << std::endl;
1777 #endif
1778  serialize( Lcol[ib], sstm, mask );
1779  }
1780  }
1781  snode.SstrLcolSend.resize( Size( sstm ) );
1782  sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
1783  snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
1784  TIMER_STOP(Serialize_UL);
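// Unlike U, only the structure of L travels right: nzval is masked out above,
// and the diagonal block is skipped, hence the Lcol.size()-1 count on the
// owner row.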
1785 
1786 #ifdef NEW_BCAST
1787  TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1788  if(bcastLTree2!=NULL){
1789  bcastLTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_));
1790  bcastLTree2->SetLocalBuffer((T*)&snode.SstrLcolSend[0]);
1791  bcastLTree2->SetDataReady(true);
1792  bool done = bcastLTree2->Progress();
1793 #if ( _DEBUGlevel_ >= 1 )
1794  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress bcast L "<<done<<std::endl;
1795 #endif
1796 
1797  //progress should do the send
1798 
1799 #if ( _DEBUGlevel_ >= 1 )
1800  for( Int idxRecv = 0; idxRecv < bcastLTree2->GetDestCount(); ++idxRecv ){
1801  Int iProcRow = bcastLTree2->GetDest(idxRecv);
1802  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending L " << snode.SizeSstrLcolSend << " BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_) << std::endl << std::endl;
1803  }
1804 #endif
1805 
1806  }
1807 #else
1808 
1809  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1810  if(bcastLTree!=NULL){
1811  bcastLTree->ForwardMessage((char*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend,
1812  IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_), &mpireqsSendToRight[0]);
1813 
1814  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1815  Int iProcCol = bcastLTree->GetDest(idxRecv);
1816 #if ( _DEBUGlevel_ >= 1 )
1817  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending L " << snode.SizeSstrLcolSend << " BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_) << std::endl << std::endl;
1818 #endif
1819  }
1820  }
1821 #endif
1822  } // if I am the sender
1823  } //Senders
1824  TIMER_STOP(ISend_Content_UL);
1825  }
1826 
1827  vector<char> redLdone(stepSuper,0);
1828  for (Int supidx=0; supidx<stepSuper; supidx++){
1829  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1830  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1831 
1832  if(redLTree != NULL){
1833  redLTree->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_REDUCE,limIndex_));
1834  //Initialize the tree
1835  redLTree->AllocRecvBuffers();
1836  //Post All Recv requests;
1837  redLTree->PostFirstRecv();
1838  }
1839  }
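// Allocating the reduction buffers and posting their receives before any GEMM
// starts lets the L reductions overlap with the local computation below.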
1840 
1841 
1842  TIMER_START(Compute_Sinv_LT);
1843  {
1844  Int msgForwarded = 0;
1845  Int msgToFwd = 0;
1846  Int gemmProcessed = 0;
1847  Int gemmToDo = 0;
1848  // Int toRecvGemm = 0;
1849  //copy the list of supernodes we need to process
1850  std::list<Int> readySupidx;
1851  //find local things to do
1852  for(Int supidx = 0;supidx<stepSuper;supidx++){
1853  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1854 
1855 
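// isReady counts the locally available factors: owning the supernode column
// supplies L and owning the supernode row supplies U, so a process in both
// the owner row and owner column starts at isReady==2 and needs no message.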
1856  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1857  gemmToDo++;
1858  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1859  snode.isReady++;
1860  }
1861 
1862  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1863  snode.isReady++;
1864  }
1865 
1866  if(snode.isReady==2){
1867  readySupidx.push_back(supidx);
1868 #if ( _DEBUGlevel_ >= 1 )
1869  statusOFS<<std::endl<<"Locally processing ["<<snode.Index<<"]"<<std::endl;
1870 #endif
1871  }
1872  }
1873  else if( (isRecvFromLeft_( snode.Index ) ) && MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) )
1874  {
1875  //Get the reduction tree
1876  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1877 
1878  if(redLTree != NULL){
1879  TIMER_START(Reduce_Sinv_LT_Isend);
1880  //pass a NULL local buffer so that a zero-byte contribution is sent
1881  redLTree->SetLocalBuffer(NULL);
1882  redLTree->SetDataReady(true);
1883 
1884  bool done = redLTree->Progress();
1885 
1886 #if ( _DEBUGlevel_ >= 1 )
1887  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce L "<<done<<std::endl;
1888 #endif
1889  TIMER_STOP(Reduce_Sinv_LT_Isend);
1890  }
1891  }// else if( isRecvFromLeft_( snode.Index ) && MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) )
1892 
1893  if(MYROW(grid_)!=PROW(snode.Index,grid_)){
1894 #ifdef NEW_BCAST
1895  TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1896  if(bcastUTree2 != NULL){
1897  if(bcastUTree2->GetDestCount()>0){
1898  msgToFwd++;
1899  }
1900  }
1901 #else
1902  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1903  if(bcastUTree != NULL){
1904  if(bcastUTree->GetDestCount()>0){
1905  msgToFwd++;
1906  }
1907  }
1908 #endif
1909  }
1910 
1911  if(MYCOL(grid_)!=PCOL(snode.Index,grid_)){
1912 #ifdef NEW_BCAST
1913  TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1914  if(bcastLTree2 != NULL){
1915  if(bcastLTree2->GetDestCount()>0){
1916  msgToFwd++;
1917  }
1918  }
1919 #else
1920 
1921  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1922  if(bcastLTree != NULL){
1923  if(bcastLTree->GetDestCount()>0){
1924  msgToFwd++;
1925  }
1926  }
1927 #endif
1928  }
1929  }
1930 
1931 #if ( _DEBUGlevel_ >= 1 )
1932  statusOFS<<std::endl<<"gemmToDo ="<<gemmToDo<<std::endl;
1933  statusOFS<<std::endl<<"msgToFwd ="<<msgToFwd<<std::endl;
1934 #endif
1935 
1936 
1937 #if defined (PROFILE)
1938  end_SendULWaitContentFirst=0;
1939  begin_SendULWaitContentFirst=0;
1940 #endif
1941 
1942  while(gemmProcessed<gemmToDo || msgForwarded < msgToFwd)
1943  {
1944  Int reqidx = MPI_UNDEFINED;
1945  Int supidx = -1;
1946 
1947 
1948  //while I don't have anything to do, wait for data to arrive
1949  do{
1950 
1951 
1952 
1953  //then make progress on the remote ones
1954 
1955  TIMER_START(WaitContent_UL);
1956 #if defined(PROFILE)
1957  if(begin_SendULWaitContentFirst==0){
1958  begin_SendULWaitContentFirst=1;
1959  TIMER_START(WaitContent_UL_First);
1960  }
1961 #endif
1962 
1963 
1964 
1965 
1966 
1967 #ifdef NEW_BCAST
1968  //loop through each tree arrays and progress
1969  for (Int supidx2=0; supidx2<stepSuper; supidx2++){
1970  SuperNodeBufferType & snode = arrSuperNodes[supidx2];
1971 
1972  TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1973 
1974  if(bcastUTree2 != NULL /*&& (!bcastUdone[supidx2])*/){
1975  if(!bcastUready[supidx2]){
1976  if(!bcastUTree2->IsRoot()){
1977  bool ready = bcastUTree2->IsDataReceived();
1978 
1979 
1980  if(ready){
1981  msgForwarded++;
1982  //TODO Temp: snode could hold a pointer to its tree and use the tree's local buffer directly, removing the need for this LUpdateBuf.
1983 
1984  //snode.SizeSstrUrowRecv = bcastUTree2->GetMsgSize();
1985  //snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv);
1986  //bcastUTree2->SetLocalBuffer((T*)&snode.SstrUrowRecv[0]);
1987  //assert( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ));
1988 
1989  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1990  snode.isReady++;
1991  //if we received both L and U, the supernode is ready
1992  if(snode.isReady==2){
1993  readySupidx.push_back(supidx2);
1994 #if defined(PROFILE)
1995  if(end_SendULWaitContentFirst==0){
1996  TIMER_STOP(WaitContent_UL_First);
1997  end_SendULWaitContentFirst=1;
1998  }
1999 #endif
2000  }
2001  }
2002 
2003  bcastUready[supidx2]=1;
2004  //#if ( _DEBUGlevel_ >= 1 )
2005  for( Int idxRecv = 0; idxRecv < bcastUTree2->GetDestCount(); ++idxRecv ){
2006  Int iProcRow = bcastUTree2->GetDest(idxRecv);
2007  statusOFS << std::endl << "MATDEBUG ["<<snode.Index<<"] "<< "Forwarded U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
2008  }
2009  //#endif
2010 
2011  }
2012 
2013  }
2014  else{
2015  bcastUready[supidx2]=1;
2016  }
2017  }
2018  //Progress() advances the whole tree; here we only need to know whether the data has been received
2019  bool done = bcastUTree2->Progress();
2020 
2021 #if ( _DEBUGlevel_ >= 1 )
2022  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress bcast U "<<done<<std::endl;
2023 #endif
2024  }
2025 
2026 
2027 
2028 
2029 
2030 
2031 
2032 
2033 
2034  TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
2035 
2036  if(bcastLTree2 != NULL /*&& (!bcastLdone[supidx2])*/){
2037  if(!bcastLready[supidx2]){
2038  if(!bcastLTree2->IsRoot()){
2039  bool ready = bcastLTree2->IsDataReceived();
2040 
2041 
2042  if(ready){
2043  msgForwarded++;
2044  //snode.SizeSstrLcolRecv = bcastLTree2->GetMsgSize();
2045  //snode.SstrLcolRecv.resize( snode.SizeSstrLcolRecv);
2046  //bcastLTree2->SetLocalBuffer((T*)&snode.SstrLcolRecv[0]);
2047  //assert( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ));
2048 
2049  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
2050  snode.isReady++;
2051  //if we received both L and U, the supernode is ready
2052  if(snode.isReady==2){
2053  readySupidx.push_back(supidx2);
2054 #if defined(PROFILE)
2055  if(end_SendULWaitContentFirst==0){
2056  TIMER_STOP(WaitContent_UL_First);
2057  end_SendULWaitContentFirst=1;
2058  }
2059 #endif
2060  }
2061  }
2062 
2063  bcastLready[supidx2]=1;
2064  //#if ( _DEBUGlevel_ >= 1 )
2065  for( Int idxRecv = 0; idxRecv < bcastLTree2->GetDestCount(); ++idxRecv ){
2066  Int iProcRow = bcastLTree2->GetDest(idxRecv);
2067  statusOFS << std::endl << "MATDEBUG ["<<snode.Index<<"] "<< "Forwarded L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
2068  }
2069  //#endif
2070 
2071  }
2072 
2073  }
2074  else{
2075  bcastLready[supidx2]=1;
2076  }
2077  }
2078  //Progress() advances the whole tree; here we only need to know whether the data has been received
2079  bool done = bcastLTree2->Progress();
2080 
2081 #if ( _DEBUGlevel_ >= 1 )
2082  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress bcast L "<<done<<std::endl;
2083 #endif
2084  }
2085 
2086 
2087 
2088 
2089 
2090 
2091 
2092 
2093 
2094 
2095 
2096 
2097 
2098 
2099 
2100  }
2101 #else
2102  // use a std::vector instead of a non-standard variable-length array
2103  std::vector<int> reqIndices(arrMpireqsRecvContentFromAny.size());
2104  int numRecv = 0;
2105  int err = MPI_Waitsome(2*stepSuper, &arrMpireqsRecvContentFromAny[0], &numRecv, &reqIndices[0], MPI_STATUSES_IGNORE);
2106  assert(err==MPI_SUCCESS);
2107 
2108  for(int i =0;i<numRecv;i++){
2109  reqidx = reqIndices[i];
2110  //I've received something
2111  if(reqidx!=MPI_UNDEFINED)
2112  {
2113  //two receive requests were posted per supernode: even indices carry U, odd indices carry L
2114  supidx = reqidx/2;
2115  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2116 
2117  //If it's a U block
2118  if(reqidx%2==0){
2119 
2120  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
2121  if(bcastUTree != NULL){
2122  if(bcastUTree->GetDestCount()>0){
2123 
2124  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
2125 #if ( _DEBUGlevel_ >= 1 )
2126  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
2127  Int iProcRow = bcastUTree->GetDest(idxRecv);
2128  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarding U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
2129  }
2130 #endif
2131 
2132  bcastUTree->ForwardMessage( (char*)&snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv,
2133  IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_), &mpireqsSendToBelow[0] );
2134 #if ( _DEBUGlevel_ >= 1 )
2135  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
2136  Int iProcRow = bcastUTree->GetDest(idxRecv);
2137  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarded U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
2138  }
2139 #endif
2140 
2141  msgForwarded++;
2142  }
2143  }
2144  }
2145  //If it's an L block
2146  else if(reqidx%2==1){
2147  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
2148  if(bcastLTree != NULL){
2149  if(bcastLTree->GetDestCount()>0){
2150 
2151  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
2152 #if ( _DEBUGlevel_ >= 1 )
2153  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
2154  Int iProcCol = bcastLTree->GetDest(idxRecv);
2155  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarding L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcCol<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
2156  }
2157 #endif
2158 
2159  bcastLTree->ForwardMessage( (char*)&snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv,
2160  IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_), &mpireqsSendToRight[0]);
2161 
2162  // for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
2163  // Int iProcCol = bcastLTree->GetDest(idxRecv);
2164  // PROFILE_COMM(MYPROC(this->grid_),PNUM(MYROW(this->grid_),iProcCol,this->grid_),IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT),snode.SizeSstrLcolRecv);
2165  // }
2166 #if ( _DEBUGlevel_ >= 1 )
2167  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
2168  Int iProcCol = bcastLTree->GetDest(idxRecv);
2169  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarded L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcCol<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
2170  }
2171 #endif
2172  msgForwarded++;
2173  }
2174  }
2175  }
2176 
2177 
2178 #if ( _DEBUGlevel_ >= 1 )
2179  statusOFS<<std::endl<<"Received data for ["<<snode.Index<<"] reqidx%2="<<reqidx%2<<" is ready ?"<<snode.isReady<<std::endl;
2180 #endif
2181 
2182  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
2183  snode.isReady++;
2184 
2185  //if we received both L and U, the supernode is ready
2186  if(snode.isReady==2){
2187  readySupidx.push_back(supidx);
2188 
2189 #if defined(PROFILE)
2190  if(end_SendULWaitContentFirst==0){
2191  TIMER_STOP(WaitContent_UL_First);
2192  end_SendULWaitContentFirst=1;
2193  }
2194 #endif
2195  }
2196  }
2197  }
2198 
2199  }//end for waitsome
2200 #endif
2201 
2202  TIMER_STOP(WaitContent_UL);
2203 
2204  } while( (gemmProcessed<gemmToDo && readySupidx.size()==0) || (gemmProcessed==gemmToDo && msgForwarded<msgToFwd) );
2205 
2206  //If I have some work to do
2207  if(readySupidx.size()>0)
2208  {
2209  supidx = readySupidx.back();
2210  readySupidx.pop_back();
2211  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2212 
2213 
2214  // Only the processors that received information participate in the GEMM
2215  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ) ){
2216 
2217  std::vector<LBlock<T> > LcolRecv;
2218  std::vector<UBlock<T> > UrowRecv;
2219  // Save all the data to be updated for { L( isup, snode.Index ) | isup > snode.Index }.
2220  // The size will be updated in the Gemm phase and the reduce phase
2221 
2222  UnpackData(snode, LcolRecv, UrowRecv);
2223 #ifdef NEW_BCAST
2224  //cleanup corresponding tree
2225  TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
2226  TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
2227  if(bcastUTree2->IsDone()){
2228  bcastUTree2->CleanupBuffers();
2229  }
2230  if(bcastLTree2->IsDone()){
2231  bcastLTree2->CleanupBuffers();
2232  }
2233 #endif
2234 
2235  //NumMat<T> AinvBuf, UBuf;
2236  SelInv_lookup_indexes(snode,LcolRecv, UrowRecv,AinvBuf,UBuf);
2237 
2238  snode.LUpdateBuf.Resize( AinvBuf.m(), SuperSize( snode.Index, super_ ) );
2239 
2240 #ifdef GEMM_PROFILE
2241  gemm_stat.push_back(AinvBuf.m());
2242  gemm_stat.push_back(UBuf.m());
2243  gemm_stat.push_back(AinvBuf.n());
2244 #endif
2245 
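// The GEMM below computes LUpdateBuf = -AinvBuf * UBuf^T, i.e.
//   LUpdateBuf(i,j) = - sum_k AinvBuf(i,k) * UBuf(j,k),
// the local contribution, per the selected-inversion recurrence, to the
// blocks L(isup, ksup) owned by this process (isup > ksup).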
2246  TIMER_START(Compute_Sinv_LT_GEMM);
2247  blas::Gemm( 'N', 'T', AinvBuf.m(), UBuf.m(), AinvBuf.n(), MINUS_ONE<T>(),
2248  AinvBuf.Data(), AinvBuf.m(),
2249  UBuf.Data(), UBuf.m(), ZERO<T>(),
2250  snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
2251  TIMER_STOP(Compute_Sinv_LT_GEMM);
2252 
2253 
2254 #if ( _DEBUGlevel_ >= 2 )
2255  statusOFS << std::endl << "["<<snode.Index<<"] "<< "snode.LUpdateBuf: " << snode.LUpdateBuf << std::endl;
2256 #endif
2257  } // if Gemm is to be done locally
2258 
2259  //Get the reduction tree
2260  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
2261  if(redLTree != NULL){
2262  assert( snode.LUpdateBuf.m() != 0 && snode.LUpdateBuf.n() != 0 );
2263  TIMER_START(Reduce_Sinv_LT_Isend);
2264  //send the data
2265  redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
2266  redLTree->SetDataReady(true);
2267 
2268  bool done = redLTree->Progress();
2269 #if ( _DEBUGlevel_ >= 1 )
2270  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce L "<<done<<std::endl;
2271 #endif
2272  TIMER_STOP(Reduce_Sinv_LT_Isend);
2273  }
2274 
2275  gemmProcessed++;
2276 
2277 #if ( _DEBUGlevel_ >= 1 )
2278  statusOFS<<std::endl<<"gemmProcessed ="<<gemmProcessed<<"/"<<gemmToDo<<std::endl;
2279 #endif
2280 
2281  //advance reductions
2282  for (Int supidx=0; supidx<stepSuper; supidx++){
2283  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2284  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
2285  if(redLTree != NULL && !redLdone[supidx]){
2286  bool done = redLTree->Progress();
2287 
2288 #if ( _DEBUGlevel_ >= 1 )
2289  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce L "<<done<<std::endl;
2290 #endif
2291  }
2292  }
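// Poll every pending L reduction after each GEMM so that partial sums keep
// flowing up the reduction trees instead of waiting for the final blocking
// phase.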
2293  }
2294  }
2295 
2296  }
2297  TIMER_STOP(Compute_Sinv_LT);
2298 
2299  //Reduce Sinv L^T to the processors in PCOL(ksup,grid_)
2300 
2301 
2302  TIMER_START(Reduce_Sinv_LT);
2303  //blocking wait for the reduction
2304  bool all_done = false;
2305  while(!all_done)
2306  {
2307  all_done = true;
2308 
2309  for (Int supidx=0; supidx<stepSuper; supidx++){
2310  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2311 
2312  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
2313 
2314 #ifdef NEW_BCAST
2315  if(redLTree != NULL /*&& !redLdone[supidx]*/)
2316 #else
2317  if(redLTree != NULL && !redLdone[supidx])
2318 #endif
2319  {
2320 
2321  //TODO: restore the non-blocking Progress() below instead of the blocking Wait()
2322  //bool done = redLTree->Progress();
2323  redLTree->Wait();
2324  bool done = true;
2325 #if ( _DEBUGlevel_ >= 1 )
2326  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce L "<<done<<std::endl;
2327 #endif
2328  redLdone[supidx]=done?1:0;
2329  if(done){
2330 
2331 #if ( _DEBUGlevel_ >= 1 )
2332  statusOFS<<"["<<snode.Index<<"] "<<" DONE reduce L"<<std::endl;
2333 #endif
2334 
2336  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
2337  //determine the number of rows in LUpdateBufReduced
2338  Int numRowLUpdateBuf;
2339  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
2340  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
2341  snode.RowLocalPtr.resize( Lcol.size() + 1 );
2342  snode.BlockIdxLocal.resize( Lcol.size() );
2343  snode.RowLocalPtr[0] = 0;
2344  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2345  snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib] + Lcol[ib].numRow;
2346  snode.BlockIdxLocal[ib] = Lcol[ib].blockIdx;
2347  }
2348  } // I do not own the diagonal block
2349  else{
2350  snode.RowLocalPtr.resize( Lcol.size() );
2351  snode.BlockIdxLocal.resize( Lcol.size() - 1 );
2352  snode.RowLocalPtr[0] = 0;
2353  for( Int ib = 1; ib < Lcol.size(); ib++ ){
2354  snode.RowLocalPtr[ib] = snode.RowLocalPtr[ib-1] + Lcol[ib].numRow;
2355  snode.BlockIdxLocal[ib-1] = Lcol[ib].blockIdx;
2356  }
2357  } // I own the diagonal block, skip the diagonal block
2358  numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
2359 
2360  if( numRowLUpdateBuf > 0 ){
2361  if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
2362  snode.LUpdateBuf.Resize( numRowLUpdateBuf,SuperSize( snode.Index, super_ ) );
2363  SetValue(snode.LUpdateBuf, ZERO<T>());
2364  }
2365  }
2366 
2367  //copy the buffer from the reduce tree
2368  redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
2369  }
2370 #ifndef NEW_BCAST
2371  redLdone[supidx]=1;
2372 #endif
2373  redLTree->CleanupBuffers();
2374  }
2375 
2376  all_done = all_done && (done || redLdone[supidx]);
2377  }
2378  }
2379  }
2380  TIMER_STOP(Reduce_Sinv_LT);
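// At this point the owner column PCOL(ksup, grid_) holds the fully reduced
// LUpdateBuf of each supernode; it feeds the diagonal update below and is
// copied back into L during the Update_L phase.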
2381 
2382 
2383 
2384  //--------------------- End of reduce of LUpdateBuf-------------------------
2385 
2386  TIMER_START(Update_Diagonal);
2387 
2388  for (Int supidx=0; supidx<stepSuper; supidx++){
2389  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2390 
2391  ComputeDiagUpdate(snode);
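// ComputeDiagUpdate accumulates this process's contribution to the update of
// the diagonal block of the inverse into snode.DiagBuf; the contributions of
// the whole process column are then summed onto the owner row via
// redToAboveTree_.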
2392 
2393  //Get the reduction tree
2394  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
2395 
2396  if(redDTree != NULL){
2397  //send the data
2398  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
2399  if(snode.DiagBuf.Size()==0){
2400  snode.DiagBuf.Resize( SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
2401  SetValue(snode.DiagBuf, ZERO<T>());
2402  }
2403  }
2404 
2405 
2406  redDTree->SetLocalBuffer(snode.DiagBuf.Data());
2407  if(!redDTree->IsAllocated()){
2408  redDTree->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_D_REDUCE,limIndex_));
2409  redDTree->AllocRecvBuffers();
2410  //Post All Recv requests;
2411  redDTree->PostFirstRecv();
2412  }
2413 
2414  redDTree->SetDataReady(true);
2415  bool done = redDTree->Progress();
2416 #if ( _DEBUGlevel_ >= 1 )
2417  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce D "<<done<<std::endl;
2418 #endif
2419  }
2420 
2421  //advance reductions
2422  for (Int supidx=0; supidx<stepSuper; supidx++){
2423  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2424  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
2425  if(redDTree != NULL){
2426 #ifndef NEW_BCAST
2427  if(redDTree->IsAllocated())
2428 #endif
2429  {
2430  bool done = redDTree->Progress();
2431 #if ( _DEBUGlevel_ >= 1 )
2432  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce D "<<done<<std::endl;
2433 #endif
2434  }
2435  }
2436  }
2437  }
2438  TIMER_STOP(Update_Diagonal);
2439 
2440  TIMER_START(Reduce_Diagonal);
2441  //blocking wait for the reduction
2442  {
2443  vector<char> is_done(stepSuper,0);
2444  bool all_done = false;
2445  while(!all_done)
2446  {
2447  all_done = true;
2448 
2449  for (Int supidx=0; supidx<stepSuper; supidx++){
2450  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2451  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
2452 
2453 #ifdef NEW_BCAST
2454  if(redDTree != NULL /*&& !is_done[supidx]*/)
2455 #else
2456  if(redDTree != NULL && !is_done[supidx])
2457 #endif
2458  {
2459 
2460  bool done = redDTree->Progress();
2461 #if ( _DEBUGlevel_ >= 1 )
2462  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce D "<<done<<std::endl;
2463 #endif
2464  is_done[supidx]=done?1:0;
2465  if(done){
2466 
2467 #if ( _DEBUGlevel_ >= 1 )
2468  statusOFS<<"["<<snode.Index<<"] "<<" DONE reduce D"<<std::endl;
2469 #endif
2470  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
2471  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
2472  LBlock<T> & LB = this->L( LBj( snode.Index, grid_ ) )[0];
2473  // Accumulate the reduced diagonal update into LB, then symmetrize it
2474  blas::Axpy( LB.numRow * LB.numCol, ONE<T>(), snode.DiagBuf.Data(), 1, LB.nzval.Data(), 1 );
2475  Symmetrize( LB.nzval );
2476  }
2477  }
2478 #ifndef NEW_BCAST
2479  is_done[supidx]=1;
2480 #endif
2481  redDTree->CleanupBuffers();
2482  }
2483 
2484  all_done = all_done && (done || is_done[supidx]);
2485  }
2486  }
2487  }
2488  }
2489  TIMER_STOP(Reduce_Diagonal);
2490 
2491 
2492 
2493 
2494 
2495 
2496  SendRecvCD_UpdateU(arrSuperNodes, stepSuper);
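// The matrix is symmetric here (see SelInv), so the updated L blocks are
// exchanged across the diagonal to update the corresponding U blocks,
// following the isSendToCrossDiagonal_ / isRecvFromCrossDiagonal_ masks built
// in ConstructCommunicationPattern_P2p.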
2497 
2498 
2499 
2500  TIMER_START(Update_L);
2501  for (Int supidx=0; supidx<stepSuper; supidx++){
2502  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2503 
2504 #if ( _DEBUGlevel_ >= 1 )
2505  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Finish updating the L part by filling LUpdateBufReduced back to L" << std::endl << std::endl;
2506 #endif
2507 
2508  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && snode.LUpdateBuf.m() > 0 ){
2509  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
2510  //Need to skip the diagonal block if present
2511  Int startBlock = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
2512  for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
2513  LBlock<T> & LB = Lcol[ib];
2514  lapack::Lacpy( 'A', LB.numRow, LB.numCol, &snode.LUpdateBuf( snode.RowLocalPtr[ib-startBlock], 0 ),
2515  snode.LUpdateBuf.m(), LB.nzval.Data(), LB.numRow );
2516  }
2517  } // Finish updating L
2518  } // for (snode.Index) : Main loop
2519  TIMER_STOP(Update_L);
2520 
2521 
2522 
2523  TIMER_START(Barrier);
2524 
2525 #ifdef NEW_BCAST
2526  //block on bcastUTree
2527  for (Int supidx=0; supidx<stepSuper; supidx++){
2528  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2529  TreeBcast2<T> * &bcastUTree2 = fwdToBelowTree2_[snode.Index];
2530 
2531  if(bcastUTree2 != NULL){
2532  bcastUTree2->Wait();
2533  bcastUTree2->CleanupBuffers();
2534  // delete bcastUTree2;
2535  // bcastUTree2 = NULL;
2536  }
2537 
2538  TreeBcast2<T> * &bcastLTree2 = fwdToRightTree2_[snode.Index];
2539 
2540  if(bcastLTree2 != NULL){
2541  bcastLTree2->Wait();
2542  bcastLTree2->CleanupBuffers();
2543  // delete bcastLTree2;
2544  // bcastLTree2 = NULL;
2545  }
2546 
2547  }
2548 #endif
2549 
2550  for (Int supidx=0; supidx<stepSuper; supidx++){
2551  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2552 
2553  TreeReduce<T> * &redLTree = redToLeftTree_[snode.Index];
2554 
2555  if(redLTree != NULL){
2556  redLTree->Wait();
2557  redLTree->CleanupBuffers();
2558  // delete redLTree;
2559  // redLTree = NULL;
2560  }
2561  }
2562 
2563  for (Int supidx=0; supidx<stepSuper; supidx++){
2564  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2565  TreeReduce<T> * &redDTree = redToAboveTree_[snode.Index];
2566 
2567  if(redDTree != NULL){
2568  redDTree->Wait();
2569  redDTree->CleanupBuffers();
2570  // delete redDTree;
2571  // redDTree = NULL;
2572  }
2573  }
2574 
2575 #ifndef NEW_BCAST
2576  mpi::Waitall(arrMpireqsRecvContentFromAny);
2577  for (Int supidx=0; supidx<stepSuper; supidx++){
2578  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2579  Int ksup = snode.Index;
2580  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
2581  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
2582 
2583 #if ( _DEBUGlevel_ >= 1 )
2584  statusOFS<<"["<<ksup<<"] mpireqsSendToRight"<<std::endl;
2585 #endif
2586  mpi::Waitall( mpireqsSendToRight );
2587 #if ( _DEBUGlevel_ >= 1 )
2588  statusOFS<<"["<<ksup<<"] mpireqsSendToBelow"<<std::endl;
2589 #endif
2590 
2591  mpi::Waitall( mpireqsSendToBelow );
2592  }
2593 #endif
2594 
2595 #if ( _DEBUGlevel_ >= 1 )
2596  statusOFS<<"barrier done"<<std::endl;
2597 #endif
2598  TIMER_STOP(Barrier);
2599 
2600  }
2601 
2602  template<typename T>
2603  void PMatrix<T>::ConstructCommunicationPattern ( )
2604  {
2605  ConstructCommunicationPattern_P2p();
2606  } // ----- end of method PMatrix::ConstructCommunicationPattern -----
2607 
2608 
2609 
2610  template<typename T>
2611  void PMatrix<T>::ConstructCommunicationPattern_P2p ( )
2612  {
2613 #ifdef COMMUNICATOR_PROFILE
2614  CommunicatorStat stats;
2615  stats.countSendToBelow_.Resize(this->NumSuper());  // numSuper is only declared later in this routine
2616  stats.countSendToRight_.Resize(this->NumSuper());
2617  stats.countRecvFromBelow_.Resize( this->NumSuper() );
2618  SetValue( stats.countSendToBelow_, 0 );
2619  SetValue( stats.countSendToRight_, 0 );
2620  SetValue( stats.countRecvFromBelow_, 0 );
2621 #endif
2622 
2623 
2624 
2625 
2626  Int numSuper = this->NumSuper();
2627 
2628  TIMER_START(Allocate);
2629 
2630 
2631 
2632 #ifdef NEW_BCAST
2633  fwdToBelowTree2_.resize(numSuper, NULL );
2634  fwdToRightTree2_.resize(numSuper, NULL );
2635 #else
2636  fwdToBelowTree_.resize(numSuper, NULL );
2637  fwdToRightTree_.resize(numSuper, NULL );
2638 #endif
2639  redToLeftTree_.resize(numSuper, NULL );
2640  redToAboveTree_.resize(numSuper, NULL );
2641 
2642  isSendToBelow_.Resize(grid_->numProcRow, numSuper);
2643  isSendToRight_.Resize(grid_->numProcCol, numSuper);
2644  isSendToDiagonal_.Resize( numSuper );
2645  SetValue( isSendToBelow_, false );
2646  SetValue( isSendToRight_, false );
2647  SetValue( isSendToDiagonal_, false );
2648 
2649  isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
2650  SetValue( isSendToCrossDiagonal_, false );
2651  isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
2652  SetValue( isRecvFromCrossDiagonal_, false );
2653 
2654  isRecvFromAbove_.Resize( numSuper );
2655  isRecvFromLeft_.Resize( numSuper );
2656  isRecvFromBelow_.Resize( grid_->numProcRow, numSuper );
2657  SetValue( isRecvFromAbove_, false );
2658  SetValue( isRecvFromBelow_, false );
2659  SetValue( isRecvFromLeft_, false );
2660 
2661  TIMER_STOP(Allocate);
2662 
2663  TIMER_START(GetEtree);
2664  std::vector<Int> snodeEtree(this->NumSuper());
2665  GetEtree(snodeEtree);
2666  TIMER_STOP(GetEtree);
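// snodeEtree[k] is the parent of supernode k in the supernodal elimination
// tree; the loops below follow parent links (jsup = snodeEtree[jsup]) to
// enumerate the ancestors of ksup, the supernodes examined for interaction
// with it.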
2667 
2668 
2669 
2670 #if ( _DEBUGlevel_ >= 1 )
2671  statusOFS << std::endl << "Local column communication" << std::endl;
2672 #endif
2673  // localColBlockRowIdx stores the nonzero block indices for each local block column.
2674  // The nonzero block indices including contribution from both L and U.
2675  // Dimension: numLocalBlockCol x numNonzeroBlock
2676  std::vector<std::set<Int> > localColBlockRowIdx;
2677 
2678  localColBlockRowIdx.resize( this->NumLocalBlockCol() );
2679 
2680 
2681  TIMER_START(Column_communication);
2682 
2683 
2684  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2685  // All block columns perform independently
2686  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2687 
2688 
2689  // Communication
2690  std::vector<Int> tAllBlockRowIdx;
2691  std::vector<Int> & colBlockIdx = ColBlockIdx(LBj(ksup, grid_));
2692  TIMER_START(Allgatherv_Column_communication);
2693  if( grid_ -> mpisize != 1 )
2694  mpi::Allgatherv( colBlockIdx, tAllBlockRowIdx, grid_->colComm );
2695  else
2696  tAllBlockRowIdx = colBlockIdx;
2697 
2698  TIMER_STOP(Allgatherv_Column_communication);
2699 
2700  localColBlockRowIdx[LBj( ksup, grid_ )].insert(
2701  tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );
2702 
2703 #if ( _DEBUGlevel_ >= 1 )
2704  statusOFS
2705  << " Column block " << ksup
2706  << " has the following nonzero block rows" << std::endl;
2707  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
2708  si != localColBlockRowIdx[LBj( ksup, grid_ )].end();
2709  si++ ){
2710  statusOFS << *si << " ";
2711  }
2712  statusOFS << std::endl;
2713 #endif
2714 
2715  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2716  } // for(ksup)
2717 
2718 
2719 
2720  TIMER_STOP(Column_communication);
2721 
2722  TIMER_START(Row_communication);
2723 #if ( _DEBUGlevel_ >= 1 )
2724  statusOFS << std::endl << "Local row communication" << std::endl;
2725 #endif
2726  // localRowBlockColIdx stores the nonzero block indices for each local block row.
2727  // The nonzero block indices including contribution from both L and U.
2728  // Dimension: numLocalBlockRow x numNonzeroBlock
2729  std::vector<std::set<Int> > localRowBlockColIdx;
2730 
2731  localRowBlockColIdx.resize( this->NumLocalBlockRow() );
2732 
2733  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2734  // All block columns perform independently
2735  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2736 
2737  // Communication
2738  std::vector<Int> tAllBlockColIdx;
2739  std::vector<Int> & rowBlockIdx = RowBlockIdx(LBi(ksup, grid_));
2740  TIMER_START(Allgatherv_Row_communication);
2741  if( grid_ -> mpisize != 1 )
2742  mpi::Allgatherv( rowBlockIdx, tAllBlockColIdx, grid_->rowComm );
2743  else
2744  tAllBlockColIdx = rowBlockIdx;
2745 
2746  TIMER_STOP(Allgatherv_Row_communication);
2747 
2748  localRowBlockColIdx[LBi( ksup, grid_ )].insert(
2749  tAllBlockColIdx.begin(), tAllBlockColIdx.end() );
2750 
2751 #if ( _DEBUGlevel_ >= 1 )
2752  statusOFS
2753  << " Row block " << ksup
2754  << " has the following nonzero block columns" << std::endl;
2755  for( std::set<Int>::iterator si = localRowBlockColIdx[LBi( ksup, grid_ )].begin();
2756  si != localRowBlockColIdx[LBi( ksup, grid_ )].end();
2757  si++ ){
2758  statusOFS << *si << " ";
2759  }
2760  statusOFS << std::endl;
2761 #endif
2762 
2763  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
2764  } // for(ksup)
2765 
2766 
2767  TIMER_STOP(Row_communication);
2768 
2769  TIMER_START(STB_RFA);
2770  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2771 
2772 #ifdef COMMUNICATOR_PROFILE
2773  std::vector<bool> sTB(grid_->numProcRow,false);
2774 #endif
2775  // Loop over all the supernodes to the right of ksup
2776  Int jsup = snodeEtree[ksup];
2777  while(jsup<numSuper){
2778  Int jsupLocalBlockCol = LBj( jsup, grid_ );
2779  Int jsupProcCol = PCOL( jsup, grid_ );
2780  if( MYCOL( grid_ ) == jsupProcCol ){
2781  // SendToBelow / RecvFromAbove only if (ksup, jsup) is nonzero.
2782  if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2783  for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2784  si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2785  Int isup = *si;
2786  Int isupProcRow = PROW( isup, grid_ );
2787  if( isup > ksup ){
2788  if( MYROW( grid_ ) == isupProcRow ){
2789  isRecvFromAbove_(ksup) = true;
2790  }
2791  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2792  isSendToBelow_( isupProcRow, ksup ) = true;
2793  }
2794  } // if( isup > ksup )
2795  } // for (si)
2796  } // if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 )
2797 #ifdef COMMUNICATOR_PROFILE
2798  sTB[ PROW(ksup,grid_) ] = true;
2799  if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2800  for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2801  si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2802  Int isup = *si;
2803  Int isupProcRow = PROW( isup, grid_ );
2804  if( isup > ksup ){
2805  sTB[isupProcRow] = true;
2806  } // if( isup > ksup )
2807  } // for (si)
2808  }
2809 #endif
2810 
2811 
2812 
2813  } // if( MYCOL( grid_ ) == PCOL( jsup, grid_ ) )
2814  jsup = snodeEtree[jsup];
2815  } // for(jsup)
2816 
2817 #ifdef COMMUNICATOR_PROFILE
2818  Int count= std::count(sTB.begin(), sTB.end(), true);
2819  Int color = sTB[MYROW(grid_)];
2820  if(count>1){
2821  std::vector<Int> & snodeList = stats.maskSendToBelow_[sTB];
2822  snodeList.push_back(ksup);
2823  }
2824  stats.countSendToBelow_(ksup) = count * color;
2825 #endif
2826 
2827 
2828  } // for(ksup)
2829  TIMER_STOP(STB_RFA);
2830 
2831  TIMER_START(STR_RFL_RFB);
2832  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2833 
2834 #ifdef COMMUNICATOR_PROFILE
2835  std::vector<bool> sTR(grid_->numProcCol,false);
2836  std::vector<bool> rFB(grid_->numProcRow,false);
2837 #endif
2838  // Loop over all the supernodes below ksup
2839  Int isup = snodeEtree[ksup];
2840  while(isup<numSuper){
2841  Int isupLocalBlockRow = LBi( isup, grid_ );
2842  Int isupProcRow = PROW( isup, grid_ );
2843  if( MYROW( grid_ ) == isupProcRow ){
2844  // SendToRight / RecvFromLeft only if (isup, ksup) is nonzero.
2845  if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2846  for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
2847  si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
2848  Int jsup = *si;
2849  Int jsupProcCol = PCOL( jsup, grid_ );
2850  if( jsup > ksup ){
2851  if( MYCOL( grid_ ) == jsupProcCol ){
2852  isRecvFromLeft_(ksup) = true;
2853  }
2854  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2855  isSendToRight_( jsupProcCol, ksup ) = true;
2856  }
2857  }
2858  } // for (si)
2859  } // if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 )
2860 
2861 #ifdef COMMUNICATOR_PROFILE
2862  sTR[ PCOL(ksup,grid_) ] = true;
2863  if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2864  for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
2865  si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
2866  Int jsup = *si;
2867  Int jsupProcCol = PCOL( jsup, grid_ );
2868  if( jsup > ksup ){
2869  sTR[ jsupProcCol ] = true;
2870  } // if( jsup > ksup )
2871  } // for (si)
2872  }
2873 #endif
2874 
2875 
2876  } // if( MYROW( grid_ ) == isupProcRow )
2877 
2878  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
2879  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2880  isRecvFromBelow_(isupProcRow,ksup) = true;
2881  }
2882  else if (MYROW(grid_) == isupProcRow){
2883  isSendToDiagonal_(ksup)=true;
2884  }
2885  } // if( MYCOL( grid_ ) == PCOL(ksup, grid_) )
2886 
2887 #ifdef COMMUNICATOR_PROFILE
2888  rFB[ PROW(ksup,grid_) ] = true;
2889  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
2890  rFB[ isupProcRow ] = true;
2891  } // if( MYCOL( grid_ ) == PCOL(ksup, grid_) )
2892 #endif
2893 
2894 
2895  isup = snodeEtree[isup];
2896  } // for (isup)
2897 
2898 #ifdef COMMUNICATOR_PROFILE
2899  Int count= std::count(rFB.begin(), rFB.end(), true);
2900  Int color = rFB[MYROW(grid_)];
2901  if(count>1){
2902  std::vector<Int> & snodeList = stats.maskRecvFromBelow_[rFB];
2903  snodeList.push_back(ksup);
2904  }
2905  stats.countRecvFromBelow_(ksup) = count * color;
2906 
2907  count= std::count(sTR.begin(), sTR.end(), true);
2908  color = sTR[MYCOL(grid_)];
2909  if(count>1){
2910  std::vector<Int> & snodeList = stats.maskSendToRight_[sTR];
2911  snodeList.push_back(ksup);
2912  }
2913  stats.countSendToRight_(ksup) = count * color;
2914 #endif
2915 
2916 
2917  } // for (ksup)
2918 
2919  TIMER_STOP(STR_RFL_RFB);
2920 
2921 #ifdef COMMUNICATOR_PROFILE
2922  {
2923 
2924 
2925  // statusOFS << std::endl << "stats.countSendToBelow:" << stats.countSendToBelow_ << std::endl;
2926  stats.maskSendToBelow_.insert(stats.maskRecvFromBelow_.begin(),stats.maskRecvFromBelow_.end());
2927  statusOFS << std::endl << "stats.maskSendToBelow_:" << stats.maskSendToBelow_.size() <<std::endl;
2928  // bitMaskSet::iterator it;
2929  // for(it = stats.maskSendToBelow_.begin(); it != stats.maskSendToBelow_.end(); it++){
2930  // //print the involved processors
2931  // for(int curi = 0; curi < it->first.size(); curi++){
2932  // statusOFS << it->first[curi] << " ";
2933  // }
2934  //
2935  // statusOFS<< " ( ";
2936  // //print the involved supernode indexes
2937  // for(int curi = 0; curi < it->second.size(); curi++){
2938  // statusOFS<< it->second[curi]<<" ";
2939  // }
2940  //
2941  // statusOFS << ")"<< std::endl;
2942  // }
2943  // statusOFS << std::endl << "stats.countRecvFromBelow:" << stats.countRecvFromBelow_ << std::endl;
2944  // statusOFS << std::endl << "stats.maskRecvFromBelow_:" << stats.maskRecvFromBelow_.size() <<std::endl;
2945  // statusOFS << std::endl << "stats.countSendToRight:" << stats.countSendToRight_ << std::endl;
2946  statusOFS << std::endl << "stats.maskSendToRight_:" << stats.maskSendToRight_.size() <<std::endl;
2947  }
2948 #endif
2949 
2950 
2951  TIMER_START(BUILD_BCAST_TREES);
2952  //Allgather RFL values within column
2953  {
2954  vector<double> SeedRFL(numSuper,0.0);
2955  vector<Int> aggRFL(numSuper);
2956  vector<Int> globalAggRFL(numSuper*grid_->numProcCol);
2957  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2958 #if ( _DEBUGlevel_ >= 1 )
2959  statusOFS<<"1 "<<ksup<<std::endl;
2960 #endif
2961  if(MYCOL(grid_)==PCOL(ksup,grid_)){
2962  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
2963  // All blocks except for the diagonal block are to be sent right
2964 
2965  Int totalSize = 0;
2966  //one integer holding the number of Lblocks
2967  totalSize+=sizeof(Int);
2968  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2969  if( Lcol[ib].blockIdx > ksup ){
2970  //three indices + one IntNumVec
2971  totalSize+=3*sizeof(Int);
2972  totalSize+= sizeof(Int)+Lcol[ib].rows.ByteSize();
2973  }
2974  }
2975 
2976 
2977  aggRFL[ksup]=totalSize;
2978  SeedRFL[ksup]=rand();
2979 
2980  }
2981  else if(isRecvFromLeft_(ksup)){
2982  aggRFL[ksup]=1;
2983  }
2984  else{
2985  aggRFL[ksup]=0;
2986  }
2987  }
2988  //allgather
2989  MPI_Allgather(&aggRFL[0],numSuper*sizeof(Int),MPI_BYTE,
2990  &globalAggRFL[0],numSuper*sizeof(Int),MPI_BYTE,
2991  grid_->rowComm);
2992  MPI_Allreduce(MPI_IN_PLACE,&SeedRFL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
2993 
2994 
2995  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2996 #if ( _DEBUGlevel_ >= 1 )
2997  statusOFS<<"3 "<<ksup<<std::endl;
2998 #endif
2999  if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
3000  Int msgSize = 0;
3001  vector<Int> tree_ranks;
3002  Int countRFL= 0;
3003  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3004  if( iProcCol != PCOL( ksup, grid_ ) ){
3005  Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
3006  if(isRFL>0){
3007  ++countRFL;
3008  }
3009  }
3010  }
3011 
3012  tree_ranks.reserve(countRFL+1);
3013  tree_ranks.push_back(PCOL(ksup,grid_));
3014 
3015  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3016  Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
3017  if(isRFL>0){
3018  if( iProcCol != PCOL( ksup, grid_ ) ){
3019  tree_ranks.push_back(iProcCol);
3020  }
3021  else{
3022  msgSize = isRFL;
3023  }
3024  }
3025  }
3026 
3027 #ifdef NEW_BCAST
3028  TreeBcast2<T> * & BcastLTree2 = fwdToRightTree2_[ksup];
3029  if(BcastLTree2!=NULL){delete BcastLTree2; BcastLTree2=NULL;}
3030  BcastLTree2 = TreeBcast2<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
3031 
3032 #ifdef COMM_PROFILE_BCAST
3033  BcastLTree2->SetGlobalComm(grid_->comm);
3034 #endif
3035 #else
3036  TreeBcast * & BcastLTree = fwdToRightTree_[ksup];
3037  if(BcastLTree!=NULL){delete BcastLTree; BcastLTree=NULL;}
3038  BcastLTree = TreeBcast::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
3039 #ifdef COMM_PROFILE_BCAST
3040  BcastLTree->SetGlobalComm(grid_->comm);
3041 #endif
3042 #endif
3043 
3044  }
3045  }
3046 
3047 
3048 
3049 
3050 
3051  }
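// Recipe shared by all the tree constructions in this routine: the owner
// process is pushed first into tree_ranks (it becomes the root), msgSize is
// taken from the owner's serialized size, and the owner's random seed is
// propagated to every rank through the MPI_MAX allreduce so that all
// participants build an identical tree topology.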
3052  {
3053  vector<double> SeedRFA(numSuper,0.0);
3054  vector<Int> aggRFA(numSuper);
3055  vector<Int> globalAggRFA(numSuper*grid_->numProcRow);
3056  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3057 #if ( _DEBUGlevel_ >= 1 )
3058  statusOFS<<"2 "<<ksup<<std::endl;
3059 #endif
3060  if(MYROW(grid_)==PROW(ksup,grid_)){
3061  std::vector<UBlock<T> >& Urow = this->U( LBi(ksup, grid_) );
3062  // All blocks are to be sent down
3063 
3064  Int totalSize = 0;
3065  //one integer holding the number of Ublocks
3066  totalSize+=sizeof(Int);
3067  for( Int jb = 0; jb < Urow.size(); jb++ ){
3068  if( Urow[jb].blockIdx >= ksup ){
3069  //three indices + one IntNumVec + one NumMat<T>
3070  totalSize+=3*sizeof(Int);
3071  totalSize+= sizeof(Int)+Urow[jb].cols.ByteSize();
3072  totalSize+= 2*sizeof(Int)+Urow[jb].nzval.ByteSize();
3073  }
3074  }
3075  aggRFA[ksup]=totalSize;
3076  SeedRFA[ksup]=rand();
3077  }
3078  else if(isRecvFromAbove_(ksup)){
3079  aggRFA[ksup]=1;
3080  }
3081  else{
3082  aggRFA[ksup]=0;
3083  }
3084  }
3085 
3086 
3087  //allgather
3088  MPI_Allgather(&aggRFA[0],numSuper*sizeof(Int),MPI_BYTE,
3089  &globalAggRFA[0],numSuper*sizeof(Int),MPI_BYTE,
3090  grid_->colComm);
3091  MPI_Allreduce(MPI_IN_PLACE,&SeedRFA[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
3092 
3093  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3094 #if ( _DEBUGlevel_ >= 1 )
3095  statusOFS<<"4 "<<ksup<<std::endl;
3096 #endif
3097  if( isRecvFromAbove_(ksup) || CountSendToBelow(ksup)>0 ){
3098  vector<Int> tree_ranks;
3099  Int msgSize = 0;
3100  Int countRFA= 0;
3101  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3102  if( iProcRow != PROW( ksup, grid_ ) ){
3103  Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
3104  if(isRFA>0){
3105  ++countRFA;
3106  }
3107  }
3108  }
3109 
3110  tree_ranks.reserve(countRFA+1);
3111  tree_ranks.push_back(PROW(ksup,grid_));
3112 
3113  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3114  Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
3115  if(isRFA>0){
3116  if( iProcRow != PROW( ksup, grid_ ) ){
3117  tree_ranks.push_back(iProcRow);
3118  }
3119  else{
3120  msgSize = isRFA;
3121  }
3122  }
3123  }
3124 
3125 #ifdef NEW_BCAST
3126  TreeBcast2<T> * & BcastUTree2 = fwdToBelowTree2_[ksup];
3127  if(BcastUTree2!=NULL){delete BcastUTree2; BcastUTree2=NULL;}
3128  BcastUTree2 = TreeBcast2<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
3129 
3130 #ifdef COMM_PROFILE_BCAST
3131  BcastUTree2->SetGlobalComm(grid_->comm);
3132 #endif
3133 #else
3134  TreeBcast * & BcastUTree = fwdToBelowTree_[ksup];
3135  if(BcastUTree!=NULL){delete BcastUTree; BcastUTree=NULL;}
3136  BcastUTree = TreeBcast::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
3137 #ifdef COMM_PROFILE_BCAST
3138  BcastUTree->SetGlobalComm(grid_->comm);
3139 #endif
3140 #endif
3141  }
3142 
3143  }
3144  }
3145  // Now build the reduction trees in the same way
3146  TIMER_STOP(BUILD_BCAST_TREES);
3147  TIMER_START(BUILD_REDUCE_D_TREE);
3148  {
3149  vector<double> SeedSTD(numSuper,0.0);
3150  vector<Int> aggSTD(numSuper);
3151  vector<Int> globalAggSTD(numSuper*grid_->numProcRow);
3152  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3153  if( MYCOL( grid_ ) == PCOL(ksup, grid_) && MYROW(grid_)==PROW(ksup,grid_)){
3154  Int totalSize = sizeof(T)*SuperSize( ksup, super_ )*SuperSize( ksup, super_ );
3155  aggSTD[ksup]=totalSize;
3156  SeedSTD[ksup]=rand();
3157  }
3158  else if(isSendToDiagonal_(ksup)){
3159  aggSTD[ksup]=1;
3160  }
3161  else{
3162  aggSTD[ksup]=0;
3163  }
3164  }
3165 
3166 
3167  //allgather
3168  MPI_Allgather(&aggSTD[0],numSuper*sizeof(Int),MPI_BYTE,
3169  &globalAggSTD[0],numSuper*sizeof(Int),MPI_BYTE,
3170  grid_->colComm);
3171  MPI_Allreduce(MPI_IN_PLACE,&SeedSTD[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
3172 
3173 
3174  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3175 #if ( _DEBUGlevel_ >= 1 )
3176  statusOFS<<"5 "<<ksup<<std::endl;
3177 #endif
3178  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
3179 
3180  Int amISTD = globalAggSTD[MYROW(grid_)*numSuper + ksup];
3181  // if( MYCOL( grid_ ) == PCOL(ksup, grid_) && MYROW(grid_)==PROW(ksup,grid_)){
3182  // assert(amISTD>0);
3183  // }
3184 
3185  if( amISTD ){
3186  vector<Int> tree_ranks;
3187  Int msgSize = 0;
3188 #if 0
3189  set<Int> set_ranks;
3190  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3191  Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
3192  if(isSTD>0){
3193  if( iProcRow != PROW( ksup, grid_ ) ){
3194  set_ranks.insert(iProcRow);
3195  }
3196  else{
3197  msgSize = isSTD;
3198  }
3199  }
3200  }
3201 
3202 
3203  tree_ranks.push_back(PROW(ksup,grid_));
3204  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
3205 #else
3206  Int countSTD= 0;
3207  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3208  if( iProcRow != PROW( ksup, grid_ ) ){
3209  Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
3210  if(isSTD>0){
3211  ++countSTD;
3212  }
3213  }
3214  }
3215 
3216  tree_ranks.reserve(countSTD+1);
3217  tree_ranks.push_back(PROW(ksup,grid_));
3218 
3219  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3220  Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
3221  if(isSTD>0){
3222  if( iProcRow != PROW( ksup, grid_ ) ){
3223  tree_ranks.push_back(iProcRow);
3224  }
3225  else{
3226  msgSize = isSTD;
3227  }
3228  }
3229  }
3230 #endif
3231  //assert(set_ranks.find(MYROW(grid_))!= set_ranks.end() || MYROW(grid_)==tree_ranks[0]);
3232 
3233  TreeReduce<T> * & redDTree = redToAboveTree_[ksup];
3234 
3235 
3236  if(redDTree!=NULL){delete redDTree; redDTree=NULL;}
3237  redDTree = TreeReduce<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedSTD[ksup]);
3238 #ifdef COMM_PROFILE
3239  redDTree->SetGlobalComm(grid_->comm);
3240 #endif
3241  }
3242  }
3243  }
3244  }
3245  TIMER_STOP(BUILD_REDUCE_D_TREE);
3246 
3247 
3248 
3249  TIMER_START(BUILD_REDUCE_L_TREE);
3250  {
3251  vector<double> SeedRTL(numSuper,0.0);
3252  vector<Int> aggRTL(numSuper);
3253  vector<Int> globalAggRTL(numSuper*grid_->numProcCol);
3254  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3255  if(MYCOL(grid_)==PCOL(ksup,grid_)){
3256  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
3257  // Compute the size of the LUpdateBuf contribution to be reduced to the left
3258 
3259  Int totalSize = 0;
3260 
3261  //determine the number of rows in LUpdateBufReduced
3262  Int numRowLUpdateBuf = 0;
3263  if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
3264  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3265  numRowLUpdateBuf += Lcol[ib].numRow;
3266  }
3267  } // I do not own the diagonal block
3268  else{
3269  for( Int ib = 1; ib < Lcol.size(); ib++ ){
3270  numRowLUpdateBuf += Lcol[ib].numRow;
3271  }
3272  } // I own the diagonal block, skip the diagonal block
3273 
3275  totalSize = numRowLUpdateBuf*SuperSize( ksup, super_ )*sizeof(T);
3276 
3277  aggRTL[ksup]=totalSize;
3278 
3279  SeedRTL[ksup]=rand();
3280  }
3281  else if(isRecvFromLeft_(ksup)){
3282  aggRTL[ksup]=1;
3283  }
3284  else{
3285  aggRTL[ksup]=0;
3286  }
3287  }
3288  //allgather
3289  MPI_Allgather(&aggRTL[0],numSuper*sizeof(Int),MPI_BYTE,
3290  &globalAggRTL[0],numSuper*sizeof(Int),MPI_BYTE,
3291  grid_->rowComm);
3292  MPI_Allreduce(MPI_IN_PLACE,&SeedRTL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
3293 
3294 
3295  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3296 #if ( _DEBUGlevel_ >= 1 )
3297  statusOFS<<"6 "<<ksup<<std::endl;
3298 #endif
3299  if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
3300  vector<Int> tree_ranks;
3301  Int msgSize = 0;
3302 #if 0
3303  set<Int> set_ranks;
3304  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3305  Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
3306  if(isRTL>0){
3307  if( iProcCol != PCOL( ksup, grid_ ) ){
3308  set_ranks.insert(iProcCol);
3309  }
3310  else{
3311  msgSize = isRTL;
3312  }
3313  }
3314  }
3315  tree_ranks.push_back(PCOL(ksup,grid_));
3316  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
3317 #else
3318  Int countRTL= 0;
3319  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3320  if( iProcCol != PCOL( ksup, grid_ ) ){
3321  Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
3322  if(isRTL>0){
3323  ++countRTL;
3324  }
3325  }
3326  }
3327 
3328  tree_ranks.reserve(countRTL+1);
3329  tree_ranks.push_back(PCOL(ksup,grid_));
3330 
3331  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3332  Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
3333  if(isRTL>0){
3334  if( iProcCol != PCOL( ksup, grid_ ) ){
3335  tree_ranks.push_back(iProcCol);
3336  }
3337  else{
3338  msgSize = isRTL;
3339  }
3340  }
3341  }
3342 
3343 #endif
3344 
3345 
3346  TreeReduce<T> * & redLTree = redToLeftTree_[ksup];
3347  if(redLTree!=NULL){delete redLTree; redLTree=NULL;}
3348  redLTree = TreeReduce<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRTL[ksup]);
3349 #ifdef COMM_PROFILE
3350  redLTree->SetGlobalComm(grid_->comm);
3351 #endif
3352  }
3353  }
3354  }
3355  TIMER_STOP(BUILD_REDUCE_L_TREE);
3356 
3357 
3358 
3359 
3360 
3361 
3362 
3363 #if ( _DEBUGlevel_ >= 1 )
3364  statusOFS << std::endl << "isSendToBelow:" << std::endl;
3365  for(int j = 0;j< isSendToBelow_.n();j++){
3366  statusOFS << "["<<j<<"] ";
3367  for(int i =0; i < isSendToBelow_.m();i++){
3368  statusOFS<< isSendToBelow_(i,j) << " ";
3369  }
3370  statusOFS<<std::endl;
3371  }
3372 
3373  statusOFS << std::endl << "isRecvFromAbove:" << std::endl;
3374  for(int j = 0;j< isRecvFromAbove_.m();j++){
3375  statusOFS << "["<<j<<"] "<< isRecvFromAbove_(j)<<std::endl;
3376  }
3377 #endif
3378 #if ( _DEBUGlevel_ >= 1 )
3379  statusOFS << std::endl << "isSendToRight:" << std::endl;
3380  for(int j = 0;j< isSendToRight_.n();j++){
3381  statusOFS << "["<<j<<"] ";
3382  for(int i =0; i < isSendToRight_.m();i++){
3383  statusOFS<< isSendToRight_(i,j) << " ";
3384  }
3385  statusOFS<<std::endl;
3386  }
3387 
3388  statusOFS << std::endl << "isRecvFromLeft:" << std::endl;
3389  for(int j = 0;j< isRecvFromLeft_.m();j++){
3390  statusOFS << "["<<j<<"] "<< isRecvFromLeft_(j)<<std::endl;
3391  }
3392 
3393  statusOFS << std::endl << "isRecvFromBelow:" << std::endl;
3394  for(int j = 0;j< isRecvFromBelow_.n();j++){
3395  statusOFS << "["<<j<<"] ";
3396  for(int i =0; i < isRecvFromBelow_.m();i++){
3397  statusOFS<< isRecvFromBelow_(i,j) << " ";
3398  }
3399  statusOFS<<std::endl;
3400  }
3401 #endif
3402 
3403 
3404 
3405 
3406 
3407 
3408 
3409  TIMER_START(STCD_RFCD);
3410 
3411 
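  // Cross-diagonal exchange pattern: a process owning an off-diagonal
  // block L(isup,ksup) must send its content across the diagonal to the
  // process owning the transposed block U(ksup,isup). The flags set
  // below record, per supernode, who sends and who receives.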
3412  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
3413  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3414  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
3415  si != localColBlockRowIdx[LBj( ksup, grid_ )].end(); si++ ){
3416  Int isup = *si;
3417  Int isupProcRow = PROW( isup, grid_ );
3418  Int isupProcCol = PCOL( isup, grid_ );
3419  if( isup > ksup && MYROW( grid_ ) == isupProcRow ){
3420  isSendToCrossDiagonal_(grid_->numProcCol, ksup ) = true;
3421  isSendToCrossDiagonal_(isupProcCol, ksup ) = true;
3422  }
3423  } // for (si)
3424  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
3425  } // for (ksup)
3426 
3427  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
3428  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3429  for( std::set<Int>::iterator si = localRowBlockColIdx[ LBi(ksup, grid_) ].begin();
3430  si != localRowBlockColIdx[ LBi(ksup, grid_) ].end(); si++ ){
3431  Int jsup = *si;
3432  Int jsupProcCol = PCOL( jsup, grid_ );
3433  Int jsupProcRow = PROW( jsup, grid_ );
3434  if( jsup > ksup && MYCOL(grid_) == jsupProcCol ){
3435  isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) = true;
3436  isRecvFromCrossDiagonal_(jsupProcRow, ksup ) = true;
3437  }
3438  } // for (si)
3439  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
3440  } // for (ksup)
3441 #if ( _DEBUGlevel_ >= 1 )
3442  statusOFS << std::endl << "isSendToCrossDiagonal:" << std::endl;
3443  for(int j =0; j < isSendToCrossDiagonal_.n();j++){
3444  if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
3445  statusOFS << "["<<j<<"] ";
3446  for(int i =0; i < isSendToCrossDiagonal_.m()-1;i++){
3447  if(isSendToCrossDiagonal_(i,j))
3448  {
3449  statusOFS<< PNUM(PROW(j,grid_),i,grid_)<<" ";
3450  }
3451  }
3452  statusOFS<<std::endl;
3453  }
3454  }
3455 
3456  statusOFS << std::endl << "isRecvFromCrossDiagonal:" << std::endl;
3457  for(int j =0; j < isRecvFromCrossDiagonal_.n();j++){
3458  if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
3459  statusOFS << "["<<j<<"] ";
3460  for(int i =0; i < isRecvFromCrossDiagonal_.m()-1;i++){
3461  if(isRecvFromCrossDiagonal_(i,j))
3462  {
3463  statusOFS<< PNUM(i,PCOL(j,grid_),grid_)<<" ";
3464  }
3465  }
3466  statusOFS<<std::endl;
3467  }
3468  }
3469 
3470 
3471 #endif
3472 
3473 
3474  TIMER_STOP(STCD_RFCD);
3475 
3476 
3477  //Build the list of supernodes based on the elimination tree from SuperLU
3478  GetWorkSet(snodeEtree,this->WorkingSet());
3479 
3480  return ;
3481  } // ----- end of method PMatrix::ConstructCommunicationPattern_P2p -----
3482 
3483  template<typename T>
 3484  void PMatrix<T>::SelInv ( )
 3485  {
3486 
3487  if(optionsFact_->Symmetric == 0){
 3488  ErrorHandling( "The matrix is not symmetric; this routine cannot handle it." );
3489  }
3490  SelInv_P2p ( );
3491 
3492 
3493 #ifdef GEMM_PROFILE
3494  statOFS<<"m"<<"\t"<<"n"<<"\t"<<"z"<<std::endl;
3495  for(std::deque<int >::iterator it = gemm_stat.begin(); it!=gemm_stat.end(); it+=3){
3496  statOFS<<*it<<"\t"<<*(it+1)<<"\t"<<*(it+2)<<std::endl;
3497  }
3498 #endif
3499 
3500 #if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST)
3501  //std::cout<<"DUMPING COMM STATS "<<comm_stat.size()<<" "<<std::endl;
3502  commOFS<<HEADER_COMM<<std::endl;
3503  for(std::deque<int>::iterator it = comm_stat.begin(); it!=comm_stat.end(); it+=4){
3504  commOFS<<LINE_COMM(it)<<std::endl;
3505  }
3506 #endif
3507 
3508  //reset the trees
3509  for(int i = 0 ; i< fwdToBelowTree_.size();++i){
3510  if(fwdToBelowTree_[i]!=NULL){
3511  fwdToBelowTree_[i]->Reset();
3512  }
3513  }
3514  for(int i = 0 ; i< fwdToRightTree_.size();++i){
3515  if(fwdToRightTree_[i]!=NULL){
3516  fwdToRightTree_[i]->Reset();
3517  }
3518  }
3519  for(int i = 0 ; i< redToLeftTree_.size();++i){
3520  if(redToLeftTree_[i]!=NULL){
3521  redToLeftTree_[i]->Reset();
3522  }
3523  }
3524  for(int i = 0 ; i< redToAboveTree_.size();++i){
3525  if(redToAboveTree_[i]!=NULL){
3526  redToAboveTree_[i]->Reset();
3527  }
3528  }
3529 
3530 
3531 
3532  } // ----- end of method PMatrix::SelInv -----
3533 
3534 
3535 
3536  template<typename T>
 3537  void PMatrix<T>::SelInv_P2p ( )
 3538  {
3539  TIMER_START(SelInv_P2p);
3540 
3541 
 3542  statusOFS<<"maxTag value: "<<maxTag_<<std::endl;
3543 
3544  Int numSuper = this->NumSuper();
3545 
3546  // Main loop
3547  std::vector<std::vector<Int> > & superList = this->WorkingSet();
3548  Int numSteps = superList.size();
3549 
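  // Each entry of superList is one step of the (roughly) pipelined
  // schedule: the supernodes grouped in a step can be processed
  // together, and SelInvIntra_P2p handles one step at a time.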
3550  Int rank = 0;
3551  for (Int lidx=0; lidx<numSteps ; lidx++){
3552  Int stepSuper = superList[lidx].size();
3553  //statusOFS<<"IN "<<lidx<<"/"<<numSteps<<std::endl;
3554  SelInvIntra_P2p(lidx,rank);
3555 #if ( _DEBUGlevel_ >= 1 )
3556  statusOFS<<"OUT "<<lidx<<"/"<<numSteps<<" "<<limIndex_<<std::endl;
3557 #endif
3558 
3559 #ifdef LIST_BARRIER
3560 #ifndef ALL_BARRIER
3561  if (options_->maxPipelineDepth!=-1)
3562 #endif
3563  {
3564  MPI_Barrier(grid_->comm);
3565  }
3566 #endif
3567 
3568 
3569 
3570  // assert(workingRanks_[lidx].size()==stepSuper);
3571 
 3572  // Periodic barrier (every limIndex_ supernodes) to keep the MPI tags within the valid range
3573  if(lidx>0 && (rank-1)%limIndex_==0){
3574  //#if ( _DEBUGlevel_ >= 1 )
3575  // statusOFS<<rank-1<<" Barrier "<<std::endl;
3576  //#endif
3577  MPI_Barrier(grid_->comm);
3578  }
3579 
3580 
 3581 
3582  }
3583 
3584  MPI_Barrier(grid_->comm);
3585 
3586  TIMER_STOP(SelInv_P2p);
3587 
3588  return ;
3589  } // ----- end of method PMatrix::SelInv_P2p -----
3590 
3591 
3592 
3593  template<typename T>
 3594  void PMatrix<T>::PreSelInv ( )
 3595  {
3596 
3597  Int numSuper = this->NumSuper();
3598 
3599 #if ( _DEBUGlevel_ >= 1 )
3600  statusOFS << std::endl << "L(i,k) <- L(i,k) * L(k,k)^{-1}" << std::endl << std::endl;
3601 #endif
3602  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3603  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3604  // Broadcast the diagonal L block
3605  NumMat<T> nzvalLDiag;
3606  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3607  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3608  nzvalLDiag = Lcol[0].nzval;
3609  if( nzvalLDiag.m() != SuperSize(ksup, super_) ||
3610  nzvalLDiag.n() != SuperSize(ksup, super_) ){
3611  ErrorHandling( "The size of the diagonal block of L is wrong." );
3612  }
3613  } // Owns the diagonal block
3614  else
3615  {
3616  nzvalLDiag.Resize( SuperSize(ksup, super_), SuperSize(ksup, super_) );
3617  }
3618  MPI_Bcast( (void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
3619  MPI_BYTE, PROW( ksup, grid_ ), grid_->colComm );
3620 
3621  // Triangular solve
3622  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3623  LBlock<T> & LB = Lcol[ib];
3624  if( LB.blockIdx > ksup ){
3625 #if ( _DEBUGlevel_ >= 2 )
3626  // Check the correctness of the triangular solve for the first local column
3627  if( LBj( ksup, grid_ ) == 0 ){
3628  statusOFS << "Diag L(" << ksup << ", " << ksup << "): " << nzvalLDiag << std::endl;
3629  statusOFS << "Before solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
3630  }
3631 #endif
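  // Right-sided triangular solve with the unit lower-triangular
  // diagonal factor: LB.nzval <- LB.nzval * L(k,k)^{-1}
  // ('R' = right side, 'L' = lower triangular, 'N' = no transpose,
  // 'U' = unit diagonal).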
3632  blas::Trsm( 'R', 'L', 'N', 'U', LB.numRow, LB.numCol, ONE<T>(),
3633  nzvalLDiag.Data(), LB.numCol, LB.nzval.Data(), LB.numRow );
3634 #if ( _DEBUGlevel_ >= 2 )
3635  // Check the correctness of the triangular solve for the first local column
3636  if( LBj( ksup, grid_ ) == 0 ){
3637  statusOFS << "After solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
3638  }
3639 #endif
3640  }
3641  }
3642  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
3643  } // for (ksup)
3644 
3645 
3646 
3647 
3648 #if ( _DEBUGlevel_ >= 1 )
3649  statusOFS << std::endl << "U(k,i) <- L(i,k)" << std::endl << std::endl;
3650 #endif
3651 
3652  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3653  Int ksupProcRow = PROW( ksup, grid_ );
3654  Int ksupProcCol = PCOL( ksup, grid_ );
3655 
3656  Int sendCount = CountSendToCrossDiagonal(ksup);
3657  Int recvCount = CountRecvFromCrossDiagonal(ksup);
3658 
3659  std::vector<MPI_Request > arrMpiReqsSend(sendCount, MPI_REQUEST_NULL );
3660  std::vector<MPI_Request > arrMpiReqsSizeSend(sendCount, MPI_REQUEST_NULL );
3661  std::vector<std::vector<char> > arrSstrLcolSend(sendCount);
3662  std::vector<Int > arrSstrLcolSizeSend(sendCount);
3663 
3664  std::vector<MPI_Request > arrMpiReqsRecv(recvCount, MPI_REQUEST_NULL );
3665  std::vector<MPI_Request > arrMpiReqsSizeRecv(recvCount, MPI_REQUEST_NULL );
3666  std::vector<std::vector<char> > arrSstrLcolRecv(recvCount);
3667  std::vector<Int > arrSstrLcolSizeRecv(recvCount);
3668 
3669 
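  // Two-phase nonblocking exchange: each sender posts an Isend with the
  // serialized buffer size followed by an Isend with the payload; each
  // receiver posts the matching Irecv for the sizes, waits, resizes its
  // buffers accordingly, and only then posts the Irecv for the payloads.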
3670 
3671  // Sender
3672  if( isSendToCrossDiagonal_(grid_->numProcCol,ksup) ){
3673 #if ( _DEBUGlevel_ >= 1 )
3674  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" should send to "<<CountSendToCrossDiagonal(ksup)<<" processors"<<std::endl;
3675 #endif
3676 
3677  Int sendIdx = 0;
3678  for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
3679  if(isSendToCrossDiagonal_(dstCol,ksup) ){
3680  Int dst = PNUM(PROW(ksup,grid_),dstCol,grid_);
3681  if(MYPROC(grid_)!= dst){
3682  // Pack L data
3683  std::stringstream sstm;
3684  std::vector<char> & sstrLcolSend = arrSstrLcolSend[sendIdx];
3685  Int & sstrSize = arrSstrLcolSizeSend[sendIdx];
3686  MPI_Request & mpiReqSend = arrMpiReqsSend[sendIdx];
3687  MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSend[sendIdx];
3688 
3689  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3690  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
 3691  // All off-diagonal blocks mapped to the destination column are
 3692  // packed and sent: this is effectively a scatter operation.
3693 
3694  Int count = 0;
3695  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3696  for( Int ib = 1; ib < Lcol.size(); ib++ ){
3697  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3698  count++;
3699  }
3700  }
3701  }
3702  else{
3703 
3704  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3705  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3706  count++;
3707  }
3708  }
3709  }
3710 
3711  serialize( (Int)count, sstm, NO_MASK );
3712 
3713  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3714  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3715 #if ( _DEBUGlevel_ >= 1 )
3716  statusOFS<<"["<<ksup<<"] SEND contains "<<Lcol[ib].blockIdx<< " which corresponds to "<<GBj(ib,grid_)<<std::endl;
3717 #endif
3718  serialize( Lcol[ib], sstm, mask );
3719  }
3720  }
3721 
3722  sstrLcolSend.resize( Size(sstm) );
3723  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
3724  sstrSize = sstrLcolSend.size();
3725 
3726 
3727 
 3728  // Send/Recv is possible here due to the one-to-one correspondence
 3729  // in the case of a square processor grid
3730 
3731 #if ( _DEBUGlevel_ >= 1 )
3732  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") ---> LBj("<<ksup<<")="<<LBj(ksup,grid_)<<" ---> P"<<dst<<" ("<<ksupProcRow<<","<<dstCol<<")"<<std::endl;
3733 #endif
3734  MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dst, SELINV_TAG_L_SIZE_CD, grid_->comm, &mpiReqSizeSend );
3735  MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dst, SELINV_TAG_L_CONTENT_CD, grid_->comm, &mpiReqSend );
3736 
3737 
3738  PROFILE_COMM(MYPROC(this->grid_),dst,SELINV_TAG_L_SIZE_CD,sizeof(sstrSize));
3739  PROFILE_COMM(MYPROC(this->grid_),dst,SELINV_TAG_L_CONTENT_CD,sstrSize);
3740 
3741  //mpi::Send( sstm, dst,SELINV_TAG_L_SIZE_CD, SELINV_TAG_L_CONTENT_CD, grid_->comm );
3742 
3743  sendIdx++;
3744  } // if I am a sender
3745  }
3746  }
3747  }
3748 
3749 
3750 
3751 
3752 
3753  // Receiver
3754  if( isRecvFromCrossDiagonal_(grid_->numProcRow,ksup) ){
3755 
3756 
3757 #if ( _DEBUGlevel_ >= 1 )
3758  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" should receive from "<<CountRecvFromCrossDiagonal(ksup)<<" processors"<<std::endl;
3759 #endif
3760 
3761 
3762  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3763  std::vector<bool> isBlockFound(Urow.size(),false);
3764 
3765 
3766  Int recvIdx = 0;
3767  //receive size first
3768  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
3769  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
3770  std::vector<LBlock<T> > LcolRecv;
3771  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
3772  if(MYPROC(grid_)!= src){
3773  MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecv[recvIdx];
3774  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
3775 
 3776  MPI_Irecv( &sstrSize, sizeof(sstrSize), MPI_BYTE, src, SELINV_TAG_L_SIZE_CD, grid_->comm, &mpiReqSizeRecv );
3777 
3778  recvIdx++;
3779  }
3780  }
3781  }
3782 
3783  mpi::Waitall(arrMpiReqsSizeRecv);
3784 
3785 
3786 
3787  //receive content
3788  recvIdx = 0;
3789  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
3790  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
3791  std::vector<LBlock<T> > LcolRecv;
3792  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
3793  if(MYPROC(grid_)!= src){
3794  MPI_Request & mpiReqRecv = arrMpiReqsRecv[recvIdx];
3795  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
3796  std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
3797  sstrLcolRecv.resize(sstrSize);
3798 
3799  MPI_Irecv( &sstrLcolRecv[0], sstrSize, MPI_BYTE, src, SELINV_TAG_L_CONTENT_CD, grid_->comm, &mpiReqRecv );
3800 
3801  recvIdx++;
3802  }
3803  }
3804  }
3805 
3806  mpi::Waitall(arrMpiReqsRecv);
3807 
3808 
3809 
3810  //Process the content
3811  recvIdx = 0;
3812  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
3813  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
3814  std::vector<LBlock<T> > LcolRecv;
3815  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
3816  if(MYPROC(grid_)!= src){
3817 
3818  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
3819  std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
3820  std::stringstream sstm;
3821 
3822 #if ( _DEBUGlevel_ >= 1 )
3823  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<ksup<<") <--- P"<<src<<" ("<<srcRow<<","<<ksupProcCol<<")"<<std::endl;
3824 #endif
3825 
3826 
3827  sstm.write( &sstrLcolRecv[0], sstrSize );
3828 
3829  // Unpack L data.
3830  Int numLBlock;
3831  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3832  deserialize( numLBlock, sstm, NO_MASK );
3833  LcolRecv.resize(numLBlock);
3834  for( Int ib = 0; ib < numLBlock; ib++ ){
3835  deserialize( LcolRecv[ib], sstm, mask );
3836 #if ( _DEBUGlevel_ >= 1 )
3837  statusOFS<<"["<<ksup<<"] RECV contains "<<LcolRecv[ib].blockIdx<< " which corresponds to "<< ib * grid_->numProcRow + srcRow; // <<std::endl;
3838  // statusOFS<<" L is on row "<< srcRow <<" whereas U is on col "<<((ib * grid_->numProcRow + srcRow)/grid_->numProcCol)%grid_->numProcCol <<std::endl;
3839  statusOFS<<" L is on row "<< srcRow <<" whereas U is on col "<< LcolRecv[ib].blockIdx % grid_->numProcCol <<std::endl;
3840 #endif
3841  }
3842 
3843 
3844  recvIdx++;
3845 
3846  } // sender is not the same as receiver
3847  else{
3848  // L is obtained locally, just make a copy. Do not include the diagonal block
3849  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3850  if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
3851  LcolRecv.resize( Lcol.size() );
3852  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3853  LcolRecv[ib] = Lcol[ib];
3854  }
3855  }
3856  else{
3857  LcolRecv.resize( Lcol.size() - 1 );
3858  for( Int ib = 0; ib < Lcol.size() - 1; ib++ ){
3859  LcolRecv[ib] = Lcol[ib+1];
3860  }
3861  }
3862  } // sender is the same as receiver
3863 
3864  // Update U
3865  // Make sure that the size of L and the corresponding U blocks match.
3866  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
3867  LBlock<T> & LB = LcolRecv[ib];
3868  if( LB.blockIdx <= ksup ){
3869  ErrorHandling( "LcolRecv contains the wrong blocks." );
3870  }
3871  for( Int jb = 0; jb < Urow.size(); jb++ ){
3872  UBlock<T> & UB = Urow[jb];
3873  if( LB.blockIdx == UB.blockIdx ){
3874  // Compare size
3875  if( LB.numRow != UB.numCol || LB.numCol != UB.numRow ){
3876  std::ostringstream msg;
3877  msg << "LB(" << LB.blockIdx << ", " << ksup << ") and UB("
3878  << ksup << ", " << UB.blockIdx << ") do not share the same size." << std::endl
3879  << "LB: " << LB.numRow << " x " << LB.numCol << std::endl
3880  << "UB: " << UB.numRow << " x " << UB.numCol << std::endl;
3881  ErrorHandling( msg.str().c_str() );
3882  }
3883 
 3884  // Note that the order of the column indices of the U
 3885  // block may not follow the order of the row indices of the
 3886  // L block, so overwrite the column index information in U.
3887  UB.cols = LB.rows;
3888  Transpose( LB.nzval, UB.nzval );
3889 
3890 #if ( _DEBUGlevel_ >= 1 )
3891  statusOFS<<"["<<ksup<<"] USING "<<LB.blockIdx<< std::endl;
3892 #endif
3893  isBlockFound[jb] = true;
3894  break;
3895  } // if( LB.blockIdx == UB.blockIdx )
3896  } // for (jb)
3897  } // for (ib)
3898  }
3899  }
3900 
3901  for( Int jb = 0; jb < Urow.size(); jb++ ){
3902  UBlock<T> & UB = Urow[jb];
3903  if( !isBlockFound[jb] ){
3904  ErrorHandling( "UBlock cannot find its update. Something is seriously wrong." );
3905  }
3906  }
3907 
3908 
3909 
3910  } // if I am a receiver
3911 
3912 
3913  //Wait until every receiver is done
3914  mpi::Waitall(arrMpiReqsSizeSend);
3915  mpi::Waitall(arrMpiReqsSend);
3916 
3917 
3918  } // for (ksup)
3919 
3920 
3921 #if ( _DEBUGlevel_ >= 1 )
3922  statusOFS << std::endl << "L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl << std::endl;
3923 #endif
3924 
3925  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3926  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
3927  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3928  IntNumVec ipiv( SuperSize( ksup, super_ ) );
 3929  // Note that the pivot vector ipiv follows the FORTRAN
 3930  // convention (1-based), hence the +1
3931  for(Int i = 0; i < SuperSize( ksup, super_ ); i++){
3932  ipiv[i] = i + 1;
3933  }
3934  LBlock<T> & LB = (this->L( LBj( ksup, grid_ ) ))[0];
3935 #if ( _DEBUGlevel_ >= 2 )
3936  // Check the correctness of the matrix inversion for the first local column
3937  statusOFS << "Factorized A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
3938 #endif
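  // The diagonal block currently holds the LU factors of A(k,k); with
  // the identity pivot vector, Getri overwrites it in place with
  // [L(k,k) * U(k,k)]^{-1}.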
3939  lapack::Getri( SuperSize( ksup, super_ ), LB.nzval.Data(),
3940  SuperSize( ksup, super_ ), ipiv.Data() );
3941 
3942  // Symmetrize the diagonal block
3943  Symmetrize( LB.nzval );
3944 
3945 #if ( _DEBUGlevel_ >= 2 )
3946  // Check the correctness of the matrix inversion for the first local column
 3947  statusOFS << "Inverted A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
3948 #endif
3949  } // if I need to inverse the diagonal block
3950  } // for (ksup)
3951 
3952 
3953 
3954 
3955 
3956 
3957  return ;
3958  } // ----- end of method PMatrix::PreSelInv -----
3959 
3960  template<typename T>
 3961  void PMatrix<T>::GetDiagonal ( NumVec<T>& diag )
 3962  {
3963  Int numSuper = this->NumSuper();
3964 
3965  Int numCol = this->NumCol();
3966  const IntNumVec& perm = super_->perm;
3967  const IntNumVec& permInv = super_->permInv;
3968 
3969  const IntNumVec * pPerm_r;
3970  const IntNumVec * pPermInv_r;
3971 
3972  pPerm_r = &super_->perm_r;
3973  pPermInv_r = &super_->permInv_r;
3974 
3975  const IntNumVec& perm_r = *pPerm_r;
3976  const IntNumVec& permInv_r = *pPermInv_r;
3977 
3978 
3979  NumVec<T> diagLocal( numCol );
3980  SetValue( diagLocal, ZERO<T>() );
3981 
3982  diag.Resize( numCol );
3983  SetValue( diag, ZERO<T>() );
3984 
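  // With the row permutation perm_r applied before the symmetric
  // permutation perm, the original diagonal entry (orow, orow) is
  // stored at position (perm[orow], perm[perm_r[orow]]) of the factored
  // matrix, which may fall in either the L part or the U part.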
3985  for( Int orow = 0; orow < numCol; orow++){
3986  //row index in the permuted order
3987  Int row = perm[ orow ];
3988  //col index in the permuted order
3989  Int col = perm[ perm_r[ orow] ];
3990 
3991  Int blockColIdx = BlockIdx( col, super_ );
3992  Int blockRowIdx = BlockIdx( row, super_ );
3993 
3994 
3995  // I own the column
3996  if( MYROW( grid_ ) == PROW( blockRowIdx, grid_ ) &&
3997  MYCOL( grid_ ) == PCOL( blockColIdx, grid_ ) ){
3998  // Search for the nzval
3999  bool isFound = false;
4000 
4001  if( blockColIdx <= blockRowIdx ){
4002  // Data on the L side
4003 
4004  std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
4005 
4006  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4007 #if ( _DEBUGlevel_ >= 1 )
4008  statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
4009 #endif
4010  if( Lcol[ib].blockIdx == blockRowIdx ){
4011  IntNumVec& rows = Lcol[ib].rows;
4012  for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
4013  if( rows[iloc] == row ){
4014  Int jloc = col - FirstBlockCol( blockColIdx, super_ );
4015 
4016  diagLocal[ orow ] = Lcol[ib].nzval( iloc, jloc );
4017 
4018 
4019  isFound = true;
4020  break;
4021  } // found the corresponding row
4022  }
4023  }
4024  if( isFound == true ) break;
4025  } // for (ib)
4026  }
4027  else{
4028  // Data on the U side
4029 
4030  std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
4031 
4032  for( Int jb = 0; jb < Urow.size(); jb++ ){
4033  if( Urow[jb].blockIdx == blockColIdx ){
4034  IntNumVec& cols = Urow[jb].cols;
4035  for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
4036  if( cols[jloc] == col ){
4037  Int iloc = row - FirstBlockRow( blockRowIdx, super_ );
4038 
4039  diagLocal[ orow ] = Urow[jb].nzval( iloc, jloc );
4040 
4041  isFound = true;
4042  break;
4043  } // found the corresponding col
4044  }
4045  }
4046  if( isFound == true ) break;
4047  } // for (jb)
4048  } // if( blockColIdx <= blockRowIdx )
4049 
4050  // Did not find the corresponding row, set the value to zero.
4051  if( isFound == false ){
 4052  statusOFS << "In the permuted order, (" << row << ", " << col <<
4053  ") is not found in PMatrix." << std::endl;
4054  diagLocal[orow] = ZERO<T>();
4055  }
4056  }
4057  }
4058 
 4059  // TODO This doesn't work with row perm
4060  // for( Int ksup = 0; ksup < numSuper; ksup++ ){
4061  // Int numRows = SuperSize(ksup, this->super_);
4062  //
4063  // // I own the diagonal block
4064  // if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4065  // MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4066  // LBlock<T> & LB = this->L( LBj( ksup, grid_ ) )[0];
4067  // for( Int i = 0; i < LB.numRow; i++ ){
4068  // diagLocal( permInv[ LB.rows(i) ] ) = LB.nzval( i, perm[permInv_r[i]] );
4069  // }
4070  // }
4071  // }
4072 
4073  // All processors own diag
4074  mpi::Allreduce( diagLocal.Data(), diag.Data(), numCol, MPI_SUM, grid_->comm );
4075 
4076 
4077  return ;
4078  } // ----- end of method PMatrix::GetDiagonal -----
4079 
4080 
4081 
4082  template<typename T>
 4083  void PMatrix<T>::PMatrixToDistSparseMatrix ( DistSparseMatrix<T>& A )
 4084  {
4085 #if ( _DEBUGlevel_ >= 1 )
4086  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix." << std::endl;
4087 #endif
4088  Int mpirank = grid_->mpirank;
4089  Int mpisize = grid_->mpisize;
4090 
4091  std::vector<Int> rowSend( mpisize );
4092  std::vector<Int> colSend( mpisize );
4093  std::vector<T> valSend( mpisize );
4094  std::vector<Int> sizeSend( mpisize, 0 );
4095  std::vector<Int> displsSend( mpisize, 0 );
4096 
4097  std::vector<Int> rowRecv( mpisize );
4098  std::vector<Int> colRecv( mpisize );
4099  std::vector<T> valRecv( mpisize );
4100  std::vector<Int> sizeRecv( mpisize, 0 );
4101  std::vector<Int> displsRecv( mpisize, 0 );
4102 
4103  Int numSuper = this->NumSuper();
4104  const IntNumVec& perm = super_->perm;
4105  const IntNumVec& permInv = super_->permInv;
4106 
4107  const IntNumVec * pPerm_r;
4108  const IntNumVec * pPermInv_r;
4109 
4110  pPerm_r = &super_->perm_r;
4111  pPermInv_r = &super_->permInv_r;
4112 
4113  const IntNumVec& perm_r = *pPerm_r;
4114  const IntNumVec& permInv_r = *pPermInv_r;
4115 
4116  // The number of local columns in DistSparseMatrix format for the
4117  // processor with rank 0. This number is the same for processors
4118  // with rank ranging from 0 to mpisize - 2, and may or may not differ
4119  // from the number of local columns for processor with rank mpisize -
4120  // 1.
4121  Int numColFirst = this->NumCol() / mpisize;
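  // Illustrative example (values are hypothetical): NumCol = 10 and
  // mpisize = 4 give numColFirst = 2, so ranks 0..2 own columns {0,1},
  // {2,3}, {4,5} and rank 3 owns the remaining four columns {6,...,9}.
  // The clamp min( jcol / numColFirst, mpisize - 1 ) used below routes
  // the trailing columns to the last rank.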
4122 
4123  // Count the size first.
4124  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4125  // L blocks
4126  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4127  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4128  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4129  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4130  Int jcol = permInv[ permInv_r[ j + FirstBlockCol( ksup, super_ ) ] ];
4131  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4132  sizeSend[dest] += Lcol[ib].numRow;
4133  }
4134  }
4135  } // I own the column of ksup
4136 
4137  // U blocks
4138  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4139  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4140  for( Int jb = 0; jb < Urow.size(); jb++ ){
4141  IntNumVec& cols = Urow[jb].cols;
4142  for( Int j = 0; j < cols.m(); j++ ){
4143  Int jcol = permInv[ permInv_r[ cols[j] ] ];
4144  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4145  sizeSend[dest] += Urow[jb].numRow;
4146  }
4147  }
4148  } // I own the row of ksup
4149  } // for (ksup)
4150 
4151  // All-to-all exchange of size information
4152  MPI_Alltoall(
4153  &sizeSend[0], 1, MPI_INT,
4154  &sizeRecv[0], 1, MPI_INT, grid_->comm );
4155 
4156 
4157 
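  // displsSend / displsRecv are exclusive prefix sums of the per-rank
  // counts, so the triplets exchanged with rank p occupy the index
  // range [ displs[p], displs[p] + size[p] ). For example,
  // sizeSend = {3, 5, 2} yields displsSend = {0, 3, 8}.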
4158  // Reserve the space
4159  for( Int ip = 0; ip < mpisize; ip++ ){
4160  if( ip == 0 ){
4161  displsSend[ip] = 0;
4162  }
4163  else{
4164  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4165  }
4166 
4167  if( ip == 0 ){
4168  displsRecv[ip] = 0;
4169  }
4170  else{
4171  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4172  }
4173  }
4174  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4175  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4176 
4177  rowSend.resize( sizeSendTotal );
4178  colSend.resize( sizeSendTotal );
4179  valSend.resize( sizeSendTotal );
4180 
4181  rowRecv.resize( sizeRecvTotal );
4182  colRecv.resize( sizeRecvTotal );
4183  valRecv.resize( sizeRecvTotal );
4184 
4185 #if ( _DEBUGlevel_ >= 1 )
4186  statusOFS << "displsSend = " << displsSend << std::endl;
4187  statusOFS << "displsRecv = " << displsRecv << std::endl;
4188 #endif
4189 
4190  // Put (row, col, val) to the sending buffer
4191  std::vector<Int> cntSize( mpisize, 0 );
4192 
4193  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4194  // L blocks
4195  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4196  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4197  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4198  IntNumVec& rows = Lcol[ib].rows;
4199  NumMat<T>& nzval = Lcol[ib].nzval;
4200  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4201  Int jcol = permInv[ permInv_r[ j + FirstBlockCol( ksup, super_ ) ] ];
4202  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4203  for( Int i = 0; i < rows.m(); i++ ){
4204  rowSend[displsSend[dest] + cntSize[dest]] = permInv[ rows[i] ];
4205  colSend[displsSend[dest] + cntSize[dest]] = jcol;
4206  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4207  cntSize[dest]++;
4208  }
4209  }
4210  }
4211  } // I own the column of ksup
4212 
4213  // U blocks
4214  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4215  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4216  for( Int jb = 0; jb < Urow.size(); jb++ ){
4217  IntNumVec& cols = Urow[jb].cols;
4218  NumMat<T>& nzval = Urow[jb].nzval;
4219  for( Int j = 0; j < cols.m(); j++ ){
4220  Int jcol = permInv[ permInv_r[ cols[j] ] ];
4221  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4222  for( Int i = 0; i < Urow[jb].numRow; i++ ){
4223  rowSend[displsSend[dest] + cntSize[dest]] = permInv[ i + FirstBlockCol( ksup, super_ ) ];
4224  colSend[displsSend[dest] + cntSize[dest]] = jcol;
4225  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4226  cntSize[dest]++;
4227  }
4228  }
4229  }
4230  } // I own the row of ksup
4231  }
4232 
4233  // Check sizes match
4234  for( Int ip = 0; ip < mpisize; ip++ ){
4235  if( cntSize[ip] != sizeSend[ip] )
4236  {
4237  ErrorHandling( "Sizes of the sending information do not match." );
4238  }
4239  }
4240 
4241 
4242  // Alltoallv to exchange information
4243  mpi::Alltoallv(
4244  &rowSend[0], &sizeSend[0], &displsSend[0],
4245  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4246  grid_->comm );
4247  mpi::Alltoallv(
4248  &colSend[0], &sizeSend[0], &displsSend[0],
4249  &colRecv[0], &sizeRecv[0], &displsRecv[0],
4250  grid_->comm );
4251  mpi::Alltoallv(
4252  &valSend[0], &sizeSend[0], &displsSend[0],
4253  &valRecv[0], &sizeRecv[0], &displsRecv[0],
4254  grid_->comm );
4255 
4256 #if ( _DEBUGlevel_ >= 1 )
4257  statusOFS << "Alltoallv communication finished." << std::endl;
4258 #endif
4259 
4260  //#if ( _DEBUGlevel_ >= 1 )
4261  // for( Int ip = 0; ip < mpisize; ip++ ){
4262  // statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
4263  // statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
4264  // statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
4265  // statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
4266  // statusOFS << "valSend[" << ip << "] = " << valSend[ip] << std::endl;
4267  // statusOFS << "valRecv[" << ip << "] = " << valRecv[ip] << std::endl;
4268  // }
4269  //#endif
4270 
4271  // Organize the received message.
4272  Int firstCol = mpirank * numColFirst;
4273  Int numColLocal;
4274  if( mpirank == mpisize-1 )
4275  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
4276  else
4277  numColLocal = numColFirst;
4278 
4279  std::vector<std::vector<Int> > rows( numColLocal );
4280  std::vector<std::vector<T> > vals( numColLocal );
4281 
4282  for( Int ip = 0; ip < mpisize; ip++ ){
4283  Int* rowRecvCur = &rowRecv[displsRecv[ip]];
4284  Int* colRecvCur = &colRecv[displsRecv[ip]];
4285  T* valRecvCur = &valRecv[displsRecv[ip]];
4286  for( Int i = 0; i < sizeRecv[ip]; i++ ){
4287  rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
4288  vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
4289  } // for (i)
4290  } // for (ip)
4291 
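  // Argsort each column: sortIndex[j] holds the permutation that orders
  // rows[j] increasingly, e.g. rows[j] = {7, 2, 9} gives
  // sortIndex[j] = {1, 0, 2}.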
4292  // Sort the rows
4293  std::vector<std::vector<Int> > sortIndex( numColLocal );
4294  for( Int j = 0; j < numColLocal; j++ ){
4295  sortIndex[j].resize( rows[j].size() );
4296  for( Int i = 0; i < sortIndex[j].size(); i++ )
4297  sortIndex[j][i] = i;
4298  std::sort( sortIndex[j].begin(), sortIndex[j].end(),
4299  IndexComp<std::vector<Int>& > ( rows[j] ) );
4300  } // for (j)
4301 
4302  // Form DistSparseMatrix according to the received message
 4303  // NOTE: for indices, DistSparseMatrix follows the FORTRAN
 4304  // convention (1-based) while PMatrix follows the C convention
 4305  // (0-based)
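  // colptrLocal is the 1-based prefix sum of the per-column counts:
  // e.g. column sizes {2, 0, 3} give colptrLocal = {1, 3, 3, 6} and
  // nnzLocal = 5.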
4306  A.size = this->NumCol();
4307  A.nnzLocal = 0;
4308  A.colptrLocal.Resize( numColLocal + 1 );
 4309  // Note that the +1 is important since the index follows the FORTRAN convention
4310  A.colptrLocal(0) = 1;
4311  for( Int j = 0; j < numColLocal; j++ ){
4312  A.nnzLocal += rows[j].size();
4313  A.colptrLocal(j+1) = A.colptrLocal(j) + rows[j].size();
4314  }
4315 
4316  A.comm = grid_->comm;
4317 
4318 #if ( _DEBUGlevel_ >= 1 )
4319  statusOFS << "nnzLocal = " << A.nnzLocal << std::endl;
4320  statusOFS << "nnz = " << A.Nnz() << std::endl;
4321 #endif
4322 
4323 
4324  A.rowindLocal.Resize( A.nnzLocal );
4325  A.nzvalLocal.Resize( A.nnzLocal );
4326 
4327  Int* rowPtr = A.rowindLocal.Data();
4328  T* nzvalPtr = A.nzvalLocal.Data();
4329  for( Int j = 0; j < numColLocal; j++ ){
4330  std::vector<Int>& rowsCur = rows[j];
4331  std::vector<Int>& sortIndexCur = sortIndex[j];
4332  std::vector<T>& valsCur = vals[j];
4333  for( Int i = 0; i < rows[j].size(); i++ ){
 4334  // Note that the +1 is important since the index follows the FORTRAN convention
4335  *(rowPtr++) = rowsCur[sortIndexCur[i]] + 1;
4336  *(nzvalPtr++) = valsCur[sortIndexCur[i]];
4337  }
4338  }
4339 
4340 #if ( _DEBUGlevel_ >= 1 )
4341  statusOFS << "A.colptrLocal[end] = " << A.colptrLocal(numColLocal) << std::endl;
4342  statusOFS << "A.rowindLocal.size() = " << A.rowindLocal.m() << std::endl;
4343  statusOFS << "A.rowindLocal[end] = " << A.rowindLocal(A.nnzLocal-1) << std::endl;
4344  statusOFS << "A.nzvalLocal[end] = " << A.nzvalLocal(A.nnzLocal-1) << std::endl;
4345 #endif
4346 
4347 
4348 
4349  return ;
4350  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix -----
4351 
4352 
4353 
4354  template<typename T>
 4355  void PMatrix<T>::PMatrixToDistSparseMatrix_OLD ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
 4356  {
4357 #if ( _DEBUGlevel_ >= 1 )
 4358  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (old format)." << std::endl;
4359 #endif
4360  Int mpirank = grid_->mpirank;
4361  Int mpisize = grid_->mpisize;
4362 
4363  std::vector<Int> rowSend( mpisize );
4364  std::vector<Int> colSend( mpisize );
4365  std::vector<T> valSend( mpisize );
4366  std::vector<Int> sizeSend( mpisize, 0 );
4367  std::vector<Int> displsSend( mpisize, 0 );
4368 
4369  std::vector<Int> rowRecv( mpisize );
4370  std::vector<Int> colRecv( mpisize );
4371  std::vector<T> valRecv( mpisize );
4372  std::vector<Int> sizeRecv( mpisize, 0 );
4373  std::vector<Int> displsRecv( mpisize, 0 );
4374 
4375  Int numSuper = this->NumSuper();
4376  const IntNumVec& permInv = super_->permInv;
4377 
4378 
4379 
4380  const IntNumVec * pPermInv_r;
4381 
4382  if(optionsFact_->RowPerm=="NOROWPERM"){
4383  pPermInv_r = &super_->permInv;
4384  }
4385  else{
4386  pPermInv_r = &super_->permInv_r;
4387  }
4388 
4389  const IntNumVec& permInv_r = *pPermInv_r;
4390 
4391 
4392 
4393 
4394 
4395 
4396  // The number of local columns in DistSparseMatrix format for the
4397  // processor with rank 0. This number is the same for processors
4398  // with rank ranging from 0 to mpisize - 2, and may or may not differ
4399  // from the number of local columns for processor with rank mpisize -
4400  // 1.
4401  Int numColFirst = this->NumCol() / mpisize;
4402 
4403  // Count the size first.
4404  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4405  // L blocks
4406  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4407  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4408  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4409  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4410  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
4411  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4412  sizeSend[dest] += Lcol[ib].numRow;
4413  }
4414  }
4415  } // I own the column of ksup
4416 
4417  // U blocks
4418  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4419  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4420  for( Int jb = 0; jb < Urow.size(); jb++ ){
4421  IntNumVec& cols = Urow[jb].cols;
4422  for( Int j = 0; j < cols.m(); j++ ){
4423  Int jcol = permInv( cols(j) );
4424  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4425  sizeSend[dest] += Urow[jb].numRow;
4426  }
4427  }
4428  } // I own the row of ksup
4429  } // for (ksup)
4430 
4431  // All-to-all exchange of size information
4432  MPI_Alltoall(
4433  &sizeSend[0], 1, MPI_INT,
4434  &sizeRecv[0], 1, MPI_INT, grid_->comm );
4435 
4436 
4437 
4438  // Reserve the space
4439  for( Int ip = 0; ip < mpisize; ip++ ){
4440  if( ip == 0 ){
4441  displsSend[ip] = 0;
4442  }
4443  else{
4444  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4445  }
4446 
4447  if( ip == 0 ){
4448  displsRecv[ip] = 0;
4449  }
4450  else{
4451  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4452  }
4453  }
4454  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4455  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4456 
4457  rowSend.resize( sizeSendTotal );
4458  colSend.resize( sizeSendTotal );
4459  valSend.resize( sizeSendTotal );
4460 
4461  rowRecv.resize( sizeRecvTotal );
4462  colRecv.resize( sizeRecvTotal );
4463  valRecv.resize( sizeRecvTotal );
4464 
4465 #if ( _DEBUGlevel_ >= 1 )
4466  statusOFS << "displsSend = " << displsSend << std::endl;
4467  statusOFS << "displsRecv = " << displsRecv << std::endl;
4468 #endif
4469 
4470  // Put (row, col, val) to the sending buffer
4471  std::vector<Int> cntSize( mpisize, 0 );
4472 
4473 
4474  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4475  // L blocks
4476  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4477  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4478  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4479  IntNumVec& rows = Lcol[ib].rows;
4480  NumMat<T>& nzval = Lcol[ib].nzval;
4481  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4482  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
4483  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4484  for( Int i = 0; i < rows.m(); i++ ){
4485  rowSend[displsSend[dest] + cntSize[dest]] = permInv_r( rows(i) );
4486  colSend[displsSend[dest] + cntSize[dest]] = jcol;
4487  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4488  cntSize[dest]++;
4489  }
4490  }
4491  }
4492  } // I own the column of ksup
4493 
4494 
4495  // U blocks
4496  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4497  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4498  for( Int jb = 0; jb < Urow.size(); jb++ ){
4499  IntNumVec& cols = Urow[jb].cols;
4500  NumMat<T>& nzval = Urow[jb].nzval;
4501  for( Int j = 0; j < cols.m(); j++ ){
4502  Int jcol = permInv( cols(j) );
4503  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4504  for( Int i = 0; i < Urow[jb].numRow; i++ ){
4505  rowSend[displsSend[dest] + cntSize[dest]] =
4506  permInv_r( i + FirstBlockCol( ksup, super_ ) );
4507  colSend[displsSend[dest] + cntSize[dest]] = jcol;
4508  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4509  cntSize[dest]++;
4510  }
4511  }
4512  }
4513  } // I own the row of ksup
4514  }
4515 
4516 
4517 
4518  // Check sizes match
4519  for( Int ip = 0; ip < mpisize; ip++ ){
4520  if( cntSize[ip] != sizeSend[ip] ){
4521  ErrorHandling( "Sizes of the sending information do not match." );
4522  }
4523  }
4524 
4525  // Alltoallv to exchange information
4526  mpi::Alltoallv(
4527  &rowSend[0], &sizeSend[0], &displsSend[0],
4528  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4529  grid_->comm );
4530  mpi::Alltoallv(
4531  &colSend[0], &sizeSend[0], &displsSend[0],
4532  &colRecv[0], &sizeRecv[0], &displsRecv[0],
4533  grid_->comm );
4534  mpi::Alltoallv(
4535  &valSend[0], &sizeSend[0], &displsSend[0],
4536  &valRecv[0], &sizeRecv[0], &displsRecv[0],
4537  grid_->comm );
4538 
4539 #if ( _DEBUGlevel_ >= 1 )
4540  statusOFS << "Alltoallv communication finished." << std::endl;
4541 #endif
4542 
4543  //#if ( _DEBUGlevel_ >= 1 )
4544  // for( Int ip = 0; ip < mpisize; ip++ ){
4545  // statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
4546  // statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
4547  // statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
4548  // statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
4549  // statusOFS << "valSend[" << ip << "] = " << valSend[ip] << std::endl;
4550  // statusOFS << "valRecv[" << ip << "] = " << valRecv[ip] << std::endl;
4551  // }
4552  //#endif
4553 
4554  // Organize the received message.
4555  Int firstCol = mpirank * numColFirst;
4556  Int numColLocal;
4557  if( mpirank == mpisize-1 )
4558  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
4559  else
4560  numColLocal = numColFirst;
4561 
4562  std::vector<std::vector<Int> > rows( numColLocal );
4563  std::vector<std::vector<T> > vals( numColLocal );
4564 
4565  for( Int ip = 0; ip < mpisize; ip++ ){
4566  Int* rowRecvCur = &rowRecv[displsRecv[ip]];
4567  Int* colRecvCur = &colRecv[displsRecv[ip]];
4568  T* valRecvCur = &valRecv[displsRecv[ip]];
4569  for( Int i = 0; i < sizeRecv[ip]; i++ ){
4570  rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
4571  vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
4572  } // for (i)
4573  } // for (ip)
4574 
4575  // Sort the rows
4576  std::vector<std::vector<Int> > sortIndex( numColLocal );
4577  for( Int j = 0; j < numColLocal; j++ ){
4578  sortIndex[j].resize( rows[j].size() );
4579  for( Int i = 0; i < sortIndex[j].size(); i++ )
4580  sortIndex[j][i] = i;
4581  std::sort( sortIndex[j].begin(), sortIndex[j].end(),
4582  IndexComp<std::vector<Int>& > ( rows[j] ) );
4583  } // for (j)
4584 
4585  // Form DistSparseMatrix according to the received message
 4586  // NOTE: for indices, DistSparseMatrix follows the FORTRAN
 4587  // convention (1-based) while PMatrix follows the C convention
 4588  // (0-based)
4589  if( A.size != this->NumCol() ){
4590  ErrorHandling( "The DistSparseMatrix providing the pattern has a different size from PMatrix." );
4591  }
4592  if( A.colptrLocal.m() != numColLocal + 1 ){
4593  ErrorHandling( "The DistSparseMatrix providing the pattern has a different number of local columns from PMatrix." );
4594  }
4595 
4596  B.size = A.size;
4597  B.nnz = A.nnz;
4598  B.nnzLocal = A.nnzLocal;
4599  B.colptrLocal = A.colptrLocal;
4600  B.rowindLocal = A.rowindLocal;
4601  B.nzvalLocal.Resize( B.nnzLocal );
4602  SetValue( B.nzvalLocal, ZERO<T>() );
4603  // Make sure that the communicator of A and B are the same.
4604  // FIXME Find a better way to compare the communicators
4605  // if( grid_->comm != A.comm ){
4606  //ErrorHandling( "The DistSparseMatrix providing the pattern has a different communicator from PMatrix." );
4607  // }
4608  B.comm = grid_->comm;
4609 
4610  Int* rowPtr = B.rowindLocal.Data();
4611  T* nzvalPtr = B.nzvalLocal.Data();
4612  for( Int j = 0; j < numColLocal; j++ ){
4613  std::vector<Int>& rowsCur = rows[j];
4614  std::vector<Int>& sortIndexCur = sortIndex[j];
4615  std::vector<T>& valsCur = vals[j];
4616  std::vector<Int> rowsCurSorted( rowsCur.size() );
 4617  // Note that the +1 is important since the index follows the FORTRAN convention
4618  for( Int i = 0; i < rowsCurSorted.size(); i++ ){
4619  rowsCurSorted[i] = rowsCur[sortIndexCur[i]] + 1;
4620  }
4621 
4622  // Search and match the indices
4623  std::vector<Int>::iterator it;
4624  for( Int i = B.colptrLocal(j) - 1;
4625  i < B.colptrLocal(j+1) - 1; i++ ){
4626  it = std::lower_bound( rowsCurSorted.begin(), rowsCurSorted.end(),
4627  *(rowPtr++) );
4628  if( it == rowsCurSorted.end() ){
4629  // Did not find the row, set it to zero
4630  *(nzvalPtr++) = ZERO<T>();
4631  }
4632  else{
4633  // Found the row, set it according to the received value
4634  *(nzvalPtr++) = valsCur[ sortIndexCur[it-rowsCurSorted.begin()] ];
4635  }
4636  } // for (i)
4637  } // for (j)
4638 
4639 #if ( _DEBUGlevel_ >= 1 )
4640  statusOFS << "B.colptrLocal[end] = " << B.colptrLocal(numColLocal) << std::endl;
4641  statusOFS << "B.rowindLocal.size() = " << B.rowindLocal.m() << std::endl;
4642  statusOFS << "B.rowindLocal[end] = " << B.rowindLocal(B.nnzLocal-1) << std::endl;
4643  statusOFS << "B.nzvalLocal[end] = " << B.nzvalLocal(B.nnzLocal-1) << std::endl;
4644 #endif
4645 
4646 
4647 
4648  return ;
4649  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix_OLD -----
4650 
4651 
 4652  // A (possibly) more memory-efficient way of converting the PMatrix
 4653  // to a DistSparseMatrix structure.
 4654  //
4655  template<typename T>
 4656  void PMatrix<T>::PMatrixToDistSparseMatrix ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
 4657  {
4658 
4659 #if ( _DEBUGlevel_ >= 1 )
4660  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
4661 #endif
4662  Int mpirank = grid_->mpirank;
4663  Int mpisize = grid_->mpisize;
4664 
4665  std::vector<Int> rowSend( mpisize );
4666  std::vector<Int> colSend( mpisize );
4667  std::vector<T> valSend( mpisize );
4668  std::vector<Int> sizeSend( mpisize, 0 );
4669  std::vector<Int> displsSend( mpisize, 0 );
4670 
4671  std::vector<Int> rowRecv( mpisize );
4672  std::vector<Int> colRecv( mpisize );
4673  std::vector<T> valRecv( mpisize );
4674  std::vector<Int> sizeRecv( mpisize, 0 );
4675  std::vector<Int> displsRecv( mpisize, 0 );
4676 
4677  Int numSuper = this->NumSuper();
4678  const IntNumVec& perm = super_->perm;
4679  const IntNumVec& permInv = super_->permInv;
4680 
4681  const IntNumVec * pPerm_r;
4682  const IntNumVec * pPermInv_r;
4683 
4684  pPerm_r = &super_->perm_r;
4685  pPermInv_r = &super_->permInv_r;
4686 
4687  const IntNumVec& perm_r = *pPerm_r;
4688  const IntNumVec& permInv_r = *pPermInv_r;
4689 
4690 
4691  // Count the sizes from the A matrix first
4692  Int numColFirst = this->NumCol() / mpisize;
4693  Int firstCol = mpirank * numColFirst;
4694  Int numColLocal;
4695  if( mpirank == mpisize-1 )
4696  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
4697  else
4698  numColLocal = numColFirst;
4699 
4700 
4701 
4702 
4703 
4704  Int* rowPtr = A.rowindLocal.Data();
4705  Int* colPtr = A.colptrLocal.Data();
4706 
4707  for( Int j = 0; j < numColLocal; j++ ){
4708  Int ocol = firstCol + j;
4709  Int col = perm[ perm_r[ ocol] ];
4710  Int blockColIdx = BlockIdx( col, super_ );
4711  Int procCol = PCOL( blockColIdx, grid_ );
4712  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4713  Int orow = rowPtr[i]-1;
4714  Int row = perm[ orow ];
4715  Int blockRowIdx = BlockIdx( row, super_ );
4716  Int procRow = PROW( blockRowIdx, grid_ );
4717  Int dest = PNUM( procRow, procCol, grid_ );
4718 #if ( _DEBUGlevel_ >= 1 )
4719  statusOFS << "("<< orow<<", "<<ocol<<") == "<< "("<< row<<", "<<col<<")"<< std::endl;
4720  statusOFS << "BlockIdx = " << blockRowIdx << ", " <<blockColIdx << std::endl;
4721  statusOFS << procRow << ", " << procCol << ", "
4722  << dest << std::endl;
4723 #endif
4724  sizeSend[dest]++;
4725  } // for (i)
4726  } // for (j)
4727 
4728  // All-to-all exchange of size information
4729  MPI_Alltoall(
4730  &sizeSend[0], 1, MPI_INT,
4731  &sizeRecv[0], 1, MPI_INT, grid_->comm );
4732 
4733 #if ( _DEBUGlevel_ >= 1 )
4734  statusOFS << std::endl << "sizeSend: " << sizeSend << std::endl;
4735  statusOFS << std::endl << "sizeRecv: " << sizeRecv << std::endl;
4736 #endif
4737 
4738 
4739 
4740  // Reserve the space
4741  for( Int ip = 0; ip < mpisize; ip++ ){
4742  if( ip == 0 ){
4743  displsSend[ip] = 0;
4744  }
4745  else{
4746  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4747  }
4748 
4749  if( ip == 0 ){
4750  displsRecv[ip] = 0;
4751  }
4752  else{
4753  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4754  }
4755  }
4756 
4757  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4758  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4759 
4760  rowSend.resize( sizeSendTotal );
4761  colSend.resize( sizeSendTotal );
4762  valSend.resize( sizeSendTotal );
4763 
4764  rowRecv.resize( sizeRecvTotal );
4765  colRecv.resize( sizeRecvTotal );
4766  valRecv.resize( sizeRecvTotal );
4767 
4768 #if ( _DEBUGlevel_ >= 1 )
4769  statusOFS << "displsSend = " << displsSend << std::endl;
4770  statusOFS << "displsRecv = " << displsRecv << std::endl;
4771 #endif
4772 
4773  // Put (row, col) to the sending buffer
4774  std::vector<Int> cntSize( mpisize, 0 );
4775 
4776  rowPtr = A.rowindLocal.Data();
4777  colPtr = A.colptrLocal.Data();
4778 
4779  for( Int j = 0; j < numColLocal; j++ ){
4780 
4781  Int ocol = firstCol + j;
4782  Int col = perm[ perm_r[ ocol] ];
4783  Int blockColIdx = BlockIdx( col, super_ );
4784  Int procCol = PCOL( blockColIdx, grid_ );
4785  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4786  Int orow = rowPtr[i]-1;
4787  Int row = perm[ orow ];
4788  Int blockRowIdx = BlockIdx( row, super_ );
4789  Int procRow = PROW( blockRowIdx, grid_ );
4790  Int dest = PNUM( procRow, procCol, grid_ );
4791  rowSend[displsSend[dest] + cntSize[dest]] = row;
4792  colSend[displsSend[dest] + cntSize[dest]] = col;
4793  cntSize[dest]++;
4794  } // for (i)
4795  } // for (j)
4796 
4797 
4798  // Check sizes match
4799  for( Int ip = 0; ip < mpisize; ip++ ){
4800  if( cntSize[ip] != sizeSend[ip] ){
4801  ErrorHandling( "Sizes of the sending information do not match." );
4802  }
4803  }
4804 
4805  // Alltoallv to exchange information
4806  mpi::Alltoallv(
4807  &rowSend[0], &sizeSend[0], &displsSend[0],
4808  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4809  grid_->comm );
4810  mpi::Alltoallv(
4811  &colSend[0], &sizeSend[0], &displsSend[0],
4812  &colRecv[0], &sizeRecv[0], &displsRecv[0],
4813  grid_->comm );
4814 
4815 #if ( _DEBUGlevel_ >= 1 )
4816  statusOFS << "Alltoallv communication of nonzero indices finished." << std::endl;
4817 #endif
4818 
4819 
4820 #if ( _DEBUGlevel_ >= 1 )
4821  for( Int ip = 0; ip < mpisize; ip++ ){
4822  statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
4823  statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
4824  statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
4825  statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
4826  }
4827 
4828 
4829  //DumpLU();
4830 
4831 
4832 
4833 #endif
4834 
4835  // For each (row, col), fill the nonzero values to valRecv locally.
4836  for( Int g = 0; g < sizeRecvTotal; g++ ){
4837  Int row = rowRecv[g];
4838  Int col = colRecv[g];
4839 
4840  Int blockRowIdx = BlockIdx( row, super_ );
4841  Int blockColIdx = BlockIdx( col, super_ );
4842 
4843  // Search for the nzval
4844  bool isFound = false;
4845 
4846  if( blockColIdx <= blockRowIdx ){
4847  // Data on the L side
4848 
4849  std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
4850 
4851  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4852 #if ( _DEBUGlevel_ >= 1 )
4853  statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
4854 #endif
4855  if( Lcol[ib].blockIdx == blockRowIdx ){
4856  IntNumVec& rows = Lcol[ib].rows;
4857  for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
4858  if( rows[iloc] == row ){
4859  Int jloc = col - FirstBlockCol( blockColIdx, super_ );
4860  valRecv[g] = Lcol[ib].nzval( iloc, jloc );
4861  isFound = true;
4862  break;
4863  } // found the corresponding row
4864  }
4865  }
4866  if( isFound == true ) break;
4867  } // for (ib)
4868  }
4869  else{
4870  // Data on the U side
4871 
4872  std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
4873 
4874  for( Int jb = 0; jb < Urow.size(); jb++ ){
4875  if( Urow[jb].blockIdx == blockColIdx ){
4876  IntNumVec& cols = Urow[jb].cols;
4877  for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
4878  if( cols[jloc] == col ){
4879  Int iloc = row - FirstBlockRow( blockRowIdx, super_ );
4880  valRecv[g] = Urow[jb].nzval( iloc, jloc );
4881  isFound = true;
4882  break;
4883  } // found the corresponding col
4884  }
4885  }
4886  if( isFound == true ) break;
4887  } // for (jb)
4888  } // if( blockColIdx <= blockRowIdx )
4889 
4890  // Did not find the corresponding row, set the value to zero.
4891  if( isFound == false ){
 4892  statusOFS << "In the permuted order, (" << row << ", " << col <<
4893  ") is not found in PMatrix." << std::endl;
4894  valRecv[g] = ZERO<T>();
4895  }
4896 
4897  } // for (g)
4898 
4899 
4900  // Feed back valRecv to valSend through Alltoallv. NOTE: for the
4901  // values, the roles of "send" and "recv" are swapped.
4902  mpi::Alltoallv(
4903  &valRecv[0], &sizeRecv[0], &displsRecv[0],
4904  &valSend[0], &sizeSend[0], &displsSend[0],
4905  grid_->comm );
4906 
4907 #if ( _DEBUGlevel_ >= 1 )
4908  statusOFS << "Alltoallv communication of nonzero values finished." << std::endl;
4909 #endif
4910 
4911  // Put the nonzero values from valSend to the matrix B.
4912  B.size = A.size;
4913  B.nnz = A.nnz;
4914  B.nnzLocal = A.nnzLocal;
4915  B.colptrLocal = A.colptrLocal;
4916  B.rowindLocal = A.rowindLocal;
4917  B.nzvalLocal.Resize( B.nnzLocal );
4918  SetValue( B.nzvalLocal, ZERO<T>() );
4919  // Make sure that the communicator of A and B are the same.
4920  // FIXME Find a better way to compare the communicators
4921  // if( grid_->comm != A.comm ){
4922  //ErrorHandling( "The DistSparseMatrix providing the pattern has a different communicator from PMatrix." );
4923  // }
4924  B.comm = grid_->comm;
4925 
4926  for( Int i = 0; i < mpisize; i++ )
4927  cntSize[i] = 0;
4928 
4929  rowPtr = B.rowindLocal.Data();
4930  colPtr = B.colptrLocal.Data();
4931  T* valPtr = B.nzvalLocal.Data();
4932 
4933  for( Int j = 0; j < numColLocal; j++ ){
4934  Int ocol = firstCol + j;
4935  Int col = perm[ perm_r[ ocol] ];
4936  Int blockColIdx = BlockIdx( col, super_ );
4937  Int procCol = PCOL( blockColIdx, grid_ );
4938  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4939  Int orow = rowPtr[i]-1;
4940  Int row = perm[ orow ];
4941  Int blockRowIdx = BlockIdx( row, super_ );
4942  Int procRow = PROW( blockRowIdx, grid_ );
4943  Int dest = PNUM( procRow, procCol, grid_ );
4944  valPtr[i] = valSend[displsSend[dest] + cntSize[dest]];
4945  cntSize[dest]++;
4946  } // for (i)
4947  } // for (j)
4948 
4949  // Check sizes match
4950  for( Int ip = 0; ip < mpisize; ip++ ){
4951  if( cntSize[ip] != sizeSend[ip] ){
4952  ErrorHandling( "Sizes of the sending information do not match." );
4953  }
4954  }
4955 
4956 
4957 
4958  return ;
4959  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix -----
4960 
4961 
4962 
4963 
4964  template<typename T>
 4965  Int PMatrix<T>::NnzLocal ( )
 4966  {
4967  Int numSuper = this->NumSuper();
4968  Int nnzLocal = 0;
4969  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4970  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4971  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4972  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4973  nnzLocal += Lcol[ib].numRow * Lcol[ib].numCol;
4974  }
4975  } // if I own the column of ksup
4976  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4977  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4978  for( Int jb = 0; jb < Urow.size(); jb++ ){
4979  nnzLocal += Urow[jb].numRow * Urow[jb].numCol;
4980  }
4981  } // if I own the row of ksup
4982  }
4983 
4984 
4985  return nnzLocal;
4986  } // ----- end of method PMatrix::NnzLocal -----
4987 
4988 
4989  template<typename T>
4990  LongInt PMatrix<T>::Nnz ( )
4991  {
4992  LongInt nnzLocal = LongInt( this->NnzLocal() );
4993  LongInt nnz;
4994 
4995  MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, grid_->comm );
4996 
4997 
4998  return nnz;
4999  } // ----- end of method PMatrix::Nnz -----
5000 
5001  template< >
5002  inline void PMatrix<Real>::GetNegativeInertia ( Real& inertia )
5003  {
5004  Int numSuper = this->NumSuper();
5005 
5006  Real inertiaLocal = 0.0;
5007  inertia = 0.0;
5008 
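  // Count the negative diagonal entries of the factored diagonal
  // blocks; for a symmetric factorization, by Sylvester's law of
  // inertia this equals the number of negative eigenvalues of the
  // (shifted) matrix that was factorized.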
5009  for( Int ksup = 0; ksup < numSuper; ksup++ ){
5010  // I own the diagonal block
5011  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
5012  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
5013  LBlock<Real> & LB = this->L( LBj( ksup, grid_ ) )[0];
5014  for( Int i = 0; i < LB.numRow; i++ ){
5015  if( LB.nzval(i, i) < 0 )
5016  inertiaLocal++;
5017  }
5018  }
5019  }
5020 
5021  // All processors own diag
5022  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
5023 
5024 
5025  return ;
5026  } // ----- end of method PMatrix::GetNegativeInertia -----
5027 
5028 
5029  template< >
5030  inline void PMatrix<Complex>::GetNegativeInertia ( Real& inertia )
5031  {
5032  Int numSuper = this->NumSuper();
5033 
5034  Real inertiaLocal = 0.0;
5035  inertia = 0.0;
5036 
5037  for( Int ksup = 0; ksup < numSuper; ksup++ ){
5038  // I own the diagonal block
5039  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
5040  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
5041  LBlock<Complex> & LB = this->L( LBj( ksup, grid_ ) )[0];
5042  for( Int i = 0; i < LB.numRow; i++ ){
5043  if( LB.nzval(i, i).real() < 0 )
5044  inertiaLocal++;
5045  }
5046  }
5047  }
5048 
5049  // All processors own diag
5050  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
5051 
5052 
5053  return ;
5054  } // ----- end of method PMatrix::GetNegativeInertia -----
5055 
5056  template<typename T>
5057  inline PMatrix<T> * PMatrix<T>::Create(const GridType * pGridType, const SuperNodeType * pSuper, const PSelInvOptions * pSelInvOpt , const FactorizationOptions * pFactOpt)
5058  {
5059  PMatrix<T> * pMat = NULL;
5060  if(pFactOpt->Symmetric == 0){
5061  pMat = new PMatrixUnsym<T>( pGridType, pSuper, pSelInvOpt, pFactOpt );
5062  }
5063  else{
5064  pMat = new PMatrix<T>( pGridType, pSuper, pSelInvOpt, pFactOpt );
5065  }
5066 
5067  return pMat;
5068  } // ----- end of factory method PMatrix::Create -----
5069 
5070  template<typename T>
5071  inline PMatrix<T> * PMatrix<T>::Create(const FactorizationOptions * pFactOpt)
5072  {
5073  PMatrix<T> * pMat = NULL;
5074  if(pFactOpt->Symmetric == 0){
5075  pMat = new PMatrixUnsym<T>();
5076  }
5077  else{
5078  pMat = new PMatrix<T>();
5079  }
5080 
5081  return pMat;
5082  } // ----- end of factory method PMatrix::Create -----
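  // Minimal usage sketch (hypothetical variable names): the factory
  // returns a PMatrixUnsym<T> when the factorization is marked
  // unsymmetric, and a plain PMatrix<T> otherwise.
  //
  //   PMatrix<Complex>* pMat =
  //     PMatrix<Complex>::Create( &gridType, &superNode, &selInvOpt, &factOpt );
  //   // ... use pMat ...
  //   delete pMat;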
5083 
5084 
5085 
5086  template<typename T>
5087  inline void PMatrix<T>::DumpLU()
5088  {
5089 //#if ( _DEBUGlevel_ >= 1 )
5090  const IntNumVec& perm = super_->perm;
5091  const IntNumVec& permInv = super_->permInv;
5092 
5093  const IntNumVec * pPerm_r;
5094  const IntNumVec * pPermInv_r;
5095 
5096  if(optionsFact_->RowPerm=="NOROWPERM"){
5097  pPerm_r = &super_->perm;
5098  pPermInv_r = &super_->permInv;
5099  }
5100  else{
5101  pPerm_r = &super_->perm_r;
5102  pPermInv_r = &super_->permInv_r;
5103  }
5104 
5105  const IntNumVec& perm_r = *pPerm_r;
5106  const IntNumVec& permInv_r = *pPermInv_r;
5107 
5108  statusOFS << "Col perm is " << perm << std::endl;
5109  statusOFS << "Row perm is " << perm_r << std::endl;
5110 
5111  statusOFS << "Inv Col perm is " << permInv << std::endl;
5112  statusOFS << "Inv Row perm is " << permInv_r << std::endl;
5113 
5114 
5115 
5116 
5117  statusOFS << "Content of L" << std::endl;
5118  // dump L
5119  for( Int j = 0; j < this->L_.size() - 1; ++j ){
5120  std::vector<LBlock<T> >& Lcol = this->L( j );
5121  Int blockColIdx = GBj( j, this->grid_ );
5122  Int fc = FirstBlockCol( blockColIdx, this->super_ );
5123 
5124 
5125  for( Int ib = 0; ib < Lcol.size(); ib++ ){
5126  for( Int ir = 0; ir < Lcol[ib].rows.m(); ++ir ){
5127  Int row = Lcol[ib].rows[ir];
5128  for( Int col = fc; col < fc + Lcol[ib].numCol; col++ ){
5129  Int ocol = permInv_r[permInv[col]];
5130  Int orow = permInv[row];
5131  Int jloc = col - FirstBlockCol( blockColIdx, this->super_ );
5132  Int iloc = ir;
5133  T val = Lcol[ib].nzval( iloc, jloc );
5134  statusOFS << "(" << orow << ", " << ocol << ") == (" << row << ", " << col << ") = " << val << std::endl;
5135  }
5136  }
5137  }
5138  }
5139 
5140  statusOFS << "Content of U" << std::endl;
5141 
5142  // dump U
5143  for( Int i = 0; i < this->U_.size() - 1; ++i ){
5144  std::vector<UBlock<T> >& Urow = this->U( i );
5145  Int blockRowIdx = GBi( i, this->grid_ );
5146  Int fr = FirstBlockRow( blockRowIdx, this->super_ );
5147  for( Int jb = 0; jb < Urow.size(); jb++ ){
5148  for( Int row = fr; row < fr + Urow[jb].numRow; row++ ){
5149  for( Int ic = 0; ic < Urow[jb].cols.m(); ++ic ){
5150  Int col = Urow[jb].cols[ic];
5151  Int ocol = permInv_r[permInv[col]];
5152  Int orow = permInv[row];
5153  Int iloc = row - FirstBlockRow( blockRowIdx, this->super_ );
5154  Int jloc = ic;
5155  T val = Urow[jb].nzval( iloc, jloc );
5156  statusOFS << "(" << orow << ", " << ocol << ") == (" << row << ", " << col << ") = " << val << std::endl;
5157 
5158  }
5159  }
5160 
5161  }
5162  }
5163 //#endif
5164 
5165  }
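 // DumpLU() above maps factored-ordering indices back to the original
 // matrix ordering by composing the stored inverse permutations:
 // orow = permInv[row] and ocol = permInv_r[permInv[col]]. A minimal
 // standalone sketch of the inverse-permutation identity this relies on
 // (illustrative values only; guarded out of compilation):
 #if 0
 #include <vector>
 #include <cassert>

 int main(){
   // perm maps an original index to its permuted position;
   // permInv is its inverse, so permInv[ perm[i] ] == i.
   std::vector<int> perm    = { 2, 0, 1 };
   std::vector<int> permInv = { 1, 2, 0 };
   for( int i = 0; i < 3; ++i )
     assert( permInv[ perm[i] ] == i );
   return 0;
 }
 #endif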
5166 
5167 
5168  template<typename T>
5169  inline void PMatrix<T>::CopyLU( const PMatrix<T> & C){
5170  ColBlockIdx_ = C.ColBlockIdx_;
5171  RowBlockIdx_ = C.RowBlockIdx_;
5172  L_ = C.L_;
5173  U_ = C.U_;
5174  }
5175 
5176 
5177 } // namespace PEXSI
5178 
5179 
5180 
5181 #endif //_PEXSI_PSELINV_IMPL_HPP_