PEXSI
pselinv_impl.hpp
1 /*
2  Copyright (c) 2012 The Regents of the University of California,
3  through Lawrence Berkeley National Laboratory.
4 
5 Authors: Lin Lin and Mathias Jacquelin
6 
7 This file is part of PEXSI. All rights reserved.
8 
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are met:
11 
12 (1) Redistributions of source code must retain the above copyright notice, this
13 list of conditions and the following disclaimer.
14 (2) Redistributions in binary form must reproduce the above copyright notice,
15 this list of conditions and the following disclaimer in the documentation
16 and/or other materials provided with the distribution.
17 (3) Neither the name of the University of California, Lawrence Berkeley
18 National Laboratory, U.S. Dept. of Energy nor the names of its contributors may
19 be used to endorse or promote products derived from this software without
20 specific prior written permission.
21 
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
26 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 You are under no obligation whatsoever to provide any bug fixes, patches, or
34 upgrades to the features, functionality or performance of the source code
35 ("Enhancements") to anyone; however, if you choose to make your Enhancements
36 available either publicly, or directly to Lawrence Berkeley National
37 Laboratory, without imposing a separate written license agreement for such
38 Enhancements, then you hereby grant the following license: a non-exclusive,
39 royalty-free perpetual license to install, use, modify, prepare derivative
40 works, incorporate into other computer software, distribute, and sublicense
41 such enhancements or derivative works thereof, in binary and source code form.
42  */
46 #ifndef _PEXSI_PSELINV_IMPL_HPP_
47 #define _PEXSI_PSELINV_IMPL_HPP_
48 
49 #include <list>
50 
51 #include "pexsi/timer.h"
53 
54 
55 #define MPI_MAX_COMM (1024)
56 #define BCAST_THRESHOLD 16
57 
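// MOD(a,b) returns a nonnegative remainder even for negative a,
// e.g. MOD(-1,4) == 3, whereas the built-in % may yield a negative result.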
58 #define MOD(a,b) \
59  ( ((a)%(b)+(b))%(b))
60 
61 
62 
63  namespace PEXSI{
64  inline GridType::GridType ( MPI_Comm Bcomm, int nprow, int npcol )
65  {
66 #ifndef _RELEASE_
67  PushCallStack("GridType::GridType");
68 #endif
69  Int info;
70  MPI_Initialized( &info );
71  if( !info ){
72 #ifdef USE_ABORT
73  abort();
74 #endif
75  throw std::logic_error( "MPI has not been initialized." );
76  }
77  MPI_Group comm_group;
78  MPI_Comm_group( Bcomm, &comm_group );
79  MPI_Comm_create( Bcomm, comm_group, &comm );
80  // comm = Bcomm;
81 
82  MPI_Comm_rank( comm, &mpirank );
83  MPI_Comm_size( comm, &mpisize );
84  if( mpisize != nprow * npcol ){
85 #ifdef USE_ABORT
86  abort();
87 #endif
88  throw std::logic_error( "mpisize != nprow * npcol." );
89  }
90 
91  numProcRow = nprow;
92  numProcCol = npcol;
93 
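 // Map the 1D MPI rank to 2D process grid coordinates using a row-major
 // ordering: consecutive ranks fill one process row before the next.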
94  Int myrow = mpirank / npcol;
95  Int mycol = mpirank % npcol;
96 
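 // rowComm groups the processes sharing the same process row (same myrow);
 // colComm groups the processes sharing the same process column (same mycol).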
97  MPI_Comm_split( comm, myrow, mycol, &rowComm );
98  MPI_Comm_split( comm, mycol, myrow, &colComm );
99 
100  MPI_Group_free( &comm_group );
101 
102 #ifndef _RELEASE_
103  PopCallStack();
104 #endif
105 
106  return ;
107  } // ----- end of method GridType::GridType -----
108 
109 
110  inline GridType::~GridType ( )
111  {
112 #ifndef _RELEASE_
113  PushCallStack("GridType::~GridType");
114 #endif
115  // Do not free the original communicator Bcomm, which is not created by GridType().
116 
117  MPI_Comm_free( &rowComm );
118  MPI_Comm_free( &colComm );
119  MPI_Comm_free( &comm );
120 
121 #ifndef _RELEASE_
122  PopCallStack();
123 #endif
124  return ;
125  } // ----- end of method GridType::~GridType -----
126  }
127 
128 
129 namespace PEXSI{
130  template<typename T>
131  void PMatrix<T>::deallocate(){
132 
133  grid_ = NULL;
134  super_ = NULL;
135  options_ = NULL;
136  optionsLU_ = NULL;
137 
138  ColBlockIdx_.clear();
139  RowBlockIdx_.clear();
140  U_.clear();
141  L_.clear();
142  workingSet_.clear();
143 
144  // Communication variables
145  isSendToBelow_.Clear();
146  isSendToRight_.Clear();
147  isSendToDiagonal_.Clear();
148  isSendToCrossDiagonal_.Clear();
149 
150  isRecvFromBelow_.Clear();
151  isRecvFromAbove_.Clear();
152  isRecvFromLeft_.Clear();
153  isRecvFromCrossDiagonal_.Clear();
154 
155 
156  //Cleanup tree information
157  for(int i =0;i<fwdToBelowTree_.size();++i){
158  if(fwdToBelowTree_[i]!=NULL){
159  delete fwdToBelowTree_[i];
160  }
161  }
162 
163  for(int i =0;i<fwdToRightTree_.size();++i){
164  if(fwdToRightTree_[i]!=NULL){
165  delete fwdToRightTree_[i];
166  }
167  }
168 
169  for(int i =0;i<redToLeftTree_.size();++i){
170  if(redToLeftTree_[i]!=NULL){
171  delete redToLeftTree_[i];
172  }
173  }
174  for(int i =0;i<redToAboveTree_.size();++i){
175  if(redToAboveTree_[i]!=NULL){
176  delete redToAboveTree_[i];
177  }
178  }
179  }
180 
181  template<typename T>
182  PMatrix<T> & PMatrix<T>::operator = ( const PMatrix<T> & C){
183  if(&C!=this){
184  //If we have some memory allocated, delete it
185  deallocate();
186 
187  grid_ = C.grid_;
188  super_ = C.super_;
189  options_ = C.options_;
190  optionsLU_ = C.optionsLU_;
191 
192 
193  ColBlockIdx_ = C.ColBlockIdx_;
194  RowBlockIdx_ = C.RowBlockIdx_;
195  L_ = C.L_;
196  U_ = C.U_;
197 
198  workingSet_ = C.workingSet_;
199 
200 
201  // Communication variables
202  isSendToBelow_ = C.isSendToBelow_;
203  isSendToRight_ = C.isSendToRight_;
204  isSendToDiagonal_ = C.isSendToDiagonal_;
205  isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;
206 
207  isRecvFromBelow_ = C.isRecvFromBelow_;
208  isRecvFromAbove_ = C.isRecvFromAbove_;
209  isRecvFromLeft_ = C.isRecvFromLeft_;
210  isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;
211 
212  fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
213  for(int i = 0 ; i< C.fwdToBelowTree_.size();++i){
214  if(C.fwdToBelowTree_[i]!=NULL){
215  fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
216  }
217  }
218 
219  fwdToRightTree_.resize(C.fwdToRightTree_.size());
220  for(int i = 0 ; i< C.fwdToRightTree_.size();++i){
221  if(C.fwdToRightTree_[i]!=NULL){
222  fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
223  }
224  }
225 
226  redToLeftTree_.resize(C.redToLeftTree_.size());
227  for(int i = 0 ; i< C.redToLeftTree_.size();++i){
228  if(C.redToLeftTree_[i]!=NULL){
229  redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
230  }
231  }
232  redToAboveTree_.resize(C.redToAboveTree_.size());
233  for(int i = 0 ; i< C.redToAboveTree_.size();++i){
234  if(C.redToAboveTree_[i]!=NULL){
235  redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
236  }
237  }
238 
239  }
240 
241  return *this;
242  }
243 
244  template<typename T>
245  PMatrix<T>::PMatrix( const PMatrix<T> & C){
246  //If we have some memory allocated, delete it
247  deallocate();
248 
249  grid_ = C.grid_;
250  super_ = C.super_;
251  options_ = C.options_;
252  optionsLU_ = C.optionsLU_;
253 
254 
255  ColBlockIdx_ = C.ColBlockIdx_;
256  RowBlockIdx_ = C.RowBlockIdx_;
257  L_ = C.L_;
258  U_ = C.U_;
259 
260  workingSet_ = C.workingSet_;
261 
262 
263  // Communication variables
264  isSendToBelow_ = C.isSendToBelow_;
265  isSendToRight_ = C.isSendToRight_;
266  isSendToDiagonal_ = C.isSendToDiagonal_;
267  isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;
268 
269  isRecvFromBelow_ = C.isRecvFromBelow_;
270  isRecvFromAbove_ = C.isRecvFromAbove_;
271  isRecvFromLeft_ = C.isRecvFromLeft_;
272  isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;
273 
274 
275  fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
276  for(int i = 0 ; i< C.fwdToBelowTree_.size();++i){
277  if(C.fwdToBelowTree_[i]!=NULL){
278  fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
279  }
280  }
281 
282  fwdToRightTree_.resize(C.fwdToRightTree_.size());
283  for(int i = 0 ; i< C.fwdToRightTree_.size();++i){
284  if(C.fwdToRightTree_[i]!=NULL){
285  fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
286  }
287  }
288 
289  redToLeftTree_.resize(C.redToLeftTree_.size());
290  for(int i = 0 ; i< C.redToLeftTree_.size();++i){
291  if(C.redToLeftTree_[i]!=NULL){
292  redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
293  }
294  }
295  redToAboveTree_.resize(C.redToAboveTree_.size());
296  for(int i = 0 ; i< C.redToAboveTree_.size();++i){
297  if(C.redToAboveTree_[i]!=NULL){
298  redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
299  }
300  }
301 
302  }
303 
304  template<typename T>
305  PMatrix<T>::PMatrix (
306  const GridType* g,
307  const SuperNodeType* s,
308  const PEXSI::PSelInvOptions * o,
309  const PEXSI::SuperLUOptions * oLU
310  )
311  {
312 #ifndef _RELEASE_
313  PushCallStack("PMatrix::PMatrix");
314 #endif
315 
316  this->Setup( g, s, o, oLU );
317 
318 #ifndef _RELEASE_
319  PopCallStack();
320 #endif
321  return ;
322  } // ----- end of method PMatrix::PMatrix -----
323 
324  template<typename T>
325  PMatrix<T>::~PMatrix ( )
326  {
327 
328 #ifndef _RELEASE_
329  PushCallStack("PMatrix::~PMatrix");
330 #endif
331 
332  deallocate();
333 
334 #ifndef _RELEASE_
335  PopCallStack();
336 #endif
337  return ;
338  } // ----- end of method PMatrix::~PMatrix -----
339 
340  template<typename T>
341  void PMatrix<T>::Setup(
342  const GridType* g,
343  const SuperNodeType* s,
344  const PEXSI::PSelInvOptions * o,
345  const PEXSI::SuperLUOptions * oLU
346  )
347  {
348 #ifndef _RELEASE_
349  PushCallStack("PMatrix::Setup");
350 #endif
351 
352  grid_ = g;
353  super_ = s;
354  options_ = o;
355  optionsLU_ = oLU;
356 
357  // if( grid_->numProcRow != grid_->numProcCol ){
358  // #ifdef USE_ABORT
359  //abort();
360  //#endif
361  //throw std::runtime_error( "The current version of SelInv only works for square processor grids." ); }
362 
363  L_.clear();
364  U_.clear();
365  ColBlockIdx_.clear();
366  RowBlockIdx_.clear();
367 
368  L_.resize( this->NumLocalBlockCol() );
369  U_.resize( this->NumLocalBlockRow() );
370 
371  ColBlockIdx_.resize( this->NumLocalBlockCol() );
372  RowBlockIdx_.resize( this->NumLocalBlockRow() );
373 
374  //workingSet_.resize(this->NumSuper());
375 #if ( _DEBUGlevel_ >= 1 )
376  statusOFS << std::endl << "PMatrix is constructed. The grid information: " << std::endl;
377  statusOFS << "mpirank = " << MYPROC(grid_) << std::endl;
378  statusOFS << "myrow = " << MYROW(grid_) << std::endl;
379  statusOFS << "mycol = " << MYCOL(grid_) << std::endl;
380 #endif
381 
382 #ifndef _RELEASE_
383  PopCallStack();
384 #endif
385  return ;
386  } // ----- end of method PMatrix::Setup -----
387 
388 
390  template<typename T>
391  inline void PMatrix<T>::SelInv_lookup_indexes(
392  SuperNodeBufferType & snode,
393  std::vector<LBlock<T> > & LcolRecv,
394  std::vector<UBlock<T> > & UrowRecv,
395  NumMat<T> & AinvBuf,
396  NumMat<T> & UBuf )
397  {
398  TIMER_START(Compute_Sinv_LT_Lookup_Indexes);
399 
400  TIMER_START(Build_colptr_rowptr);
401  // rowPtr[ib] gives the row index in snode.LUpdateBuf for the first
402  // nonzero row in LcolRecv[ib]. The total number of rows in
403  // snode.LUpdateBuf is given by the last entry of rowPtr.
404  std::vector<Int> rowPtr(LcolRecv.size() + 1);
405  // colPtr[jb] gives the column index in UBuf for the first
406  // nonzero column in UrowRecv[jb]. The total number of columns in
407  // UBuf is given by the last entry of colPtr.
408  std::vector<Int> colPtr(UrowRecv.size() + 1);
409 
410  rowPtr[0] = 0;
411  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
412  rowPtr[ib+1] = rowPtr[ib] + LcolRecv[ib].numRow;
413  }
414  colPtr[0] = 0;
415  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
416  colPtr[jb+1] = colPtr[jb] + UrowRecv[jb].numCol;
417  }
418 
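 // Example: if the blocks in LcolRecv have 3 and 2 rows, then rowPtr = {0, 3, 5};
 // block ib starts at row rowPtr[ib] of AinvBuf, and AinvBuf has 5 rows in total.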
419  Int numRowAinvBuf = *rowPtr.rbegin();
420  Int numColAinvBuf = *colPtr.rbegin();
421  TIMER_STOP(Build_colptr_rowptr);
422 
423  TIMER_START(Allocate_lookup);
424  // Allocate for the computational storage
425  AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
426  UBuf.Resize( SuperSize( snode.Index, super_ ), numColAinvBuf );
427  // TIMER_START(SetValue_lookup);
428  // SetValue( AinvBuf, ZERO<T>() );
429  //SetValue( snode.LUpdateBuf, ZERO<T>() );
430  // SetValue( UBuf, ZERO<T>() );
431  // TIMER_STOP(SetValue_lookup);
432  TIMER_STOP(Allocate_lookup);
433 
434  TIMER_START(Fill_UBuf);
435  // Fill UBuf first. Make the transpose later in the Gemm phase.
436  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
437  UBlock<T>& UB = UrowRecv[jb];
438  if( UB.numRow != SuperSize(snode.Index, super_) ){
439  throw std::logic_error( "The size of UB is not right. Something is seriously wrong." );
440  }
441  lapack::Lacpy( 'A', UB.numRow, UB.numCol, UB.nzval.Data(),
442  UB.numRow, UBuf.VecData( colPtr[jb] ), SuperSize( snode.Index, super_ ) );
443  }
444  TIMER_STOP(Fill_UBuf);
445 
446  // Calculate the relative indices for (isup, jsup)
447  // Fill AinvBuf with the information in L or U block.
448  TIMER_START(JB_Loop);
449 
450 #ifdef STDFIND
451  // for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
452  //
453  // UBlock& UB = UrowRecv[jb];
454  // Int jsup = UB.blockIdx;
455  // Int SinvColsSta = FirstBlockCol( jsup, super_ );
456  //
457  // // Column relative indices
458  // std::vector<Int> relCols( UB.numCol );
459  // for( Int j = 0; j < UB.numCol; j++ ){
460  // relCols[j] = UB.cols[j] - SinvColsSta;
461  // }
462  //
463  //
464  //
465  //
466  // for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
467  // LBlock& LB = LcolRecv[ib];
468  // Int isup = LB.blockIdx;
469  // Int SinvRowsSta = FirstBlockCol( isup, super_ );
470  // Scalar* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
471  // Int ldAinv = numRowAinvBuf;
472  //
473  // // Pin down the corresponding block in the part of Sinv.
474  // if( isup >= jsup ){
475  // std::vector<LBlock>& LcolSinv = this->L( LBj(jsup, grid_ ) );
476  // bool isBlockFound = false;
477  // TIMER_START(PARSING_ROW_BLOCKIDX);
478  // for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
479  // // Found the (isup, jsup) block in Sinv
480  // if( LcolSinv[ibSinv].blockIdx == isup ){
481  // LBlock& SinvB = LcolSinv[ibSinv];
482  //
483  // // Row relative indices
484  // std::vector<Int> relRows( LB.numRow );
485  // Int* rowsLBPtr = LB.rows.Data();
486  // Int* rowsSinvBPtr = SinvB.rows.Data();
487  //
488  // TIMER_START(STDFIND_ROW);
489  // Int * pos =&rowsSinvBPtr[0];
490  // Int * last =&rowsSinvBPtr[SinvB.numRow];
491  // for( Int i = 0; i < LB.numRow; i++ ){
492  // // pos = std::find(pos, &rowsSinvBPtr[SinvB.numRow-1], rowsLBPtr[i]);
493  // pos = std::find(rowsSinvBPtr, last, rowsLBPtr[i]);
494  // if(pos != last){
495  // relRows[i] = (Int)(pos - rowsSinvBPtr);
496  // }
497  // else{
498  // std::ostringstream msg;
499  // msg << "Row " << rowsLBPtr[i] <<
500  // " in LB cannot find the corresponding row in SinvB" << std::endl
501  // << "LB.rows = " << LB.rows << std::endl
502  // << "SinvB.rows = " << SinvB.rows << std::endl;
503  // throw std::runtime_error( msg.str().c_str() );
504  // }
505  // }
506  // TIMER_STOP(STDFIND_ROW);
507  //
508  // TIMER_START(Copy_Sinv_to_Ainv);
509  // // Transfer the values from Sinv to AinvBlock
510  // Scalar* nzvalSinv = SinvB.nzval.Data();
511  // Int ldSinv = SinvB.numRow;
512  // for( Int j = 0; j < UB.numCol; j++ ){
513  // for( Int i = 0; i < LB.numRow; i++ ){
514  // nzvalAinv[i+j*ldAinv] =
515  // nzvalSinv[relRows[i] + relCols[j] * ldSinv];
516  // }
517  // }
518  // TIMER_STOP(Copy_Sinv_to_Ainv);
519  //
520  // isBlockFound = true;
521  // break;
522  // }
523  // } // for (ibSinv )
524  // TIMER_STOP(PARSING_ROW_BLOCKIDX);
525  // if( isBlockFound == false ){
526  // std::ostringstream msg;
527  // msg << "Block(" << isup << ", " << jsup
528  // << ") did not find a matching block in Sinv." << std::endl;
529  // throw std::runtime_error( msg.str().c_str() );
530  // }
531  // } // if (isup, jsup) is in L
532  // else{
533  // // Row relative indices
534  // std::vector<Int> relRows( LB.numRow );
535  // Int SinvRowsSta = FirstBlockCol( isup, super_ );
536  // for( Int i = 0; i < LB.numRow; i++ ){
537  // relRows[i] = LB.rows[i] - SinvRowsSta;
538  // }
539  // std::vector<UBlock>& UrowSinv = this->U( LBi( isup, grid_ ) );
540  // bool isBlockFound = false;
541  // TIMER_START(PARSING_COL_BLOCKIDX);
542  // for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
543  // // Found the (isup, jsup) block in Sinv
544  // if( UrowSinv[jbSinv].blockIdx == jsup ){
545  // UBlock& SinvB = UrowSinv[jbSinv];
546  //
547  //
548  //
549  // // Column relative indices
550  // std::vector<Int> relCols( UB.numCol );
551  // Int* colsUBPtr = UB.cols.Data();
552  // Int* colsSinvBPtr = SinvB.cols.Data();
553  // TIMER_START(STDFIND_COL);
554  // Int * pos =&colsSinvBPtr[0];
555  // Int * last =&colsSinvBPtr[SinvB.numCol];
556  // for( Int j = 0; j < UB.numCol; j++ ){
557  // //colsUB is sorted
558  // pos = std::find(colsSinvBPtr, last, colsUBPtr[j]);
559  // if(pos !=last){
560  // relCols[j] = (Int)(pos - colsSinvBPtr);
561  // }
562  // else{
563  // std::ostringstream msg;
564  // msg << "Col " << colsUBPtr[j] <<
565  // " in UB cannot find the corresponding row in SinvB" << std::endl
566  // << "UB.cols = " << UB.cols << std::endl
567  // << "UinvB.cols = " << SinvB.cols << std::endl;
568  // throw std::runtime_error( msg.str().c_str() );
569  // }
570  // }
571  // TIMER_STOP(STDFIND_COL);
572  //
573  //
574  // TIMER_START(Copy_Sinv_to_Ainv);
575  // // Transfer the values from Sinv to AinvBlock
576  // Scalar* nzvalSinv = SinvB.nzval.Data();
577  // Int ldSinv = SinvB.numRow;
578  // for( Int j = 0; j < UB.numCol; j++ ){
579  // for( Int i = 0; i < LB.numRow; i++ ){
580  // nzvalAinv[i+j*ldAinv] =
581  // nzvalSinv[relRows[i] + relCols[j] * ldSinv];
582  // }
583  // }
584  // TIMER_STOP(Copy_Sinv_to_Ainv);
585  //
586  // isBlockFound = true;
587  // break;
588  // }
589  // } // for (jbSinv)
590  // TIMER_STOP(PARSING_COL_BLOCKIDX);
591  // if( isBlockFound == false ){
592  // std::ostringstream msg;
593  // msg << "Block(" << isup << ", " << jsup
594  // << ") did not find a matching block in Sinv." << std::endl;
595  // throw std::runtime_error( msg.str().c_str() );
596  // }
597  // } // if (isup, jsup) is in U
598  //
599  // } // for( ib )
600  // } // for ( jb )
601 #else
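 // For every (LB, UB) pair, locate the corresponding block of Sinv, which is
 // stored in L when isup >= jsup and in U otherwise, and copy it into AinvBuf.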
602  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
603  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
604  LBlock<T>& LB = LcolRecv[ib];
605  UBlock<T>& UB = UrowRecv[jb];
606  Int isup = LB.blockIdx;
607  Int jsup = UB.blockIdx;
608  T* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
609  Int ldAinv = AinvBuf.m();
610 
611  // Pin down the corresponding block in the part of Sinv.
612  if( isup >= jsup ){
613  std::vector<LBlock<T> >& LcolSinv = this->L( LBj(jsup, grid_ ) );
614  bool isBlockFound = false;
615  TIMER_START(PARSING_ROW_BLOCKIDX);
616  for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
617  // Found the (isup, jsup) block in Sinv
618  if( LcolSinv[ibSinv].blockIdx == isup ){
619  LBlock<T> & SinvB = LcolSinv[ibSinv];
620 
621  // Row relative indices
622  std::vector<Int> relRows( LB.numRow );
623  Int* rowsLBPtr = LB.rows.Data();
624  Int* rowsSinvBPtr = SinvB.rows.Data();
625  for( Int i = 0; i < LB.numRow; i++ ){
626  bool isRowFound = false;
627  for( Int i1 = 0; i1 < SinvB.numRow; i1++ ){
628  if( rowsLBPtr[i] == rowsSinvBPtr[i1] ){
629  isRowFound = true;
630  relRows[i] = i1;
631  break;
632  }
633  }
634  if( isRowFound == false ){
635  std::ostringstream msg;
636  msg << "Row " << rowsLBPtr[i] <<
637  " in LB cannot find the corresponding row in SinvB" << std::endl
638  << "LB.rows = " << LB.rows << std::endl
639  << "SinvB.rows = " << SinvB.rows << std::endl;
640  throw std::runtime_error( msg.str().c_str() );
641  }
642  }
643 
644  // Column relative indices
645  std::vector<Int> relCols( UB.numCol );
646  Int SinvColsSta = FirstBlockCol( jsup, super_ );
647  for( Int j = 0; j < UB.numCol; j++ ){
648  relCols[j] = UB.cols[j] - SinvColsSta;
649  }
650 
651  // Transfer the values from Sinv to AinvBlock
652  T* nzvalSinv = SinvB.nzval.Data();
653  Int ldSinv = SinvB.numRow;
654  for( Int j = 0; j < UB.numCol; j++ ){
655  for( Int i = 0; i < LB.numRow; i++ ){
656  nzvalAinv[i+j*ldAinv] =
657  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
658  }
659  }
660 
661  isBlockFound = true;
662  break;
663  }
664  } // for (ibSinv )
665  TIMER_STOP(PARSING_ROW_BLOCKIDX);
666  if( isBlockFound == false ){
667  std::ostringstream msg;
668  msg << "Block(" << isup << ", " << jsup
669  << ") did not find a matching block in Sinv." << std::endl;
670  throw std::runtime_error( msg.str().c_str() );
671  }
672  } // if (isup, jsup) is in L
673  else{
674  std::vector<UBlock<T> >& UrowSinv = this->U( LBi( isup, grid_ ) );
675  bool isBlockFound = false;
676  TIMER_START(PARSING_COL_BLOCKIDX);
677  for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
678  // Found the (isup, jsup) block in Sinv
679  if( UrowSinv[jbSinv].blockIdx == jsup ){
680  UBlock<T> & SinvB = UrowSinv[jbSinv];
681 
682  // Row relative indices
683  std::vector<Int> relRows( LB.numRow );
684  Int SinvRowsSta = FirstBlockCol( isup, super_ );
685  for( Int i = 0; i < LB.numRow; i++ ){
686  relRows[i] = LB.rows[i] - SinvRowsSta;
687  }
688 
689  // Column relative indices
690  std::vector<Int> relCols( UB.numCol );
691  Int* colsUBPtr = UB.cols.Data();
692  Int* colsSinvBPtr = SinvB.cols.Data();
693  for( Int j = 0; j < UB.numCol; j++ ){
694  bool isColFound = false;
695  for( Int j1 = 0; j1 < SinvB.numCol; j1++ ){
696  if( colsUBPtr[j] == colsSinvBPtr[j1] ){
697  isColFound = true;
698  relCols[j] = j1;
699  break;
700  }
701  }
702  if( isColFound == false ){
703  std::ostringstream msg;
704  msg << "Col " << colsUBPtr[j] <<
705  " in UB cannot find the corresponding row in SinvB" << std::endl
706  << "UB.cols = " << UB.cols << std::endl
707  << "UinvB.cols = " << SinvB.cols << std::endl;
708  throw std::runtime_error( msg.str().c_str() );
709  }
710  }
711 
712 
713  // Transfer the values from Sinv to AinvBlock
714  T* nzvalSinv = SinvB.nzval.Data();
715  Int ldSinv = SinvB.numRow;
716  for( Int j = 0; j < UB.numCol; j++ ){
717  for( Int i = 0; i < LB.numRow; i++ ){
718  nzvalAinv[i+j*ldAinv] =
719  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
720  }
721  }
722 
723  isBlockFound = true;
724  break;
725  }
726  } // for (jbSinv)
727  TIMER_STOP(PARSING_COL_BLOCKIDX);
728  if( isBlockFound == false ){
729  std::ostringstream msg;
730  msg << "Block(" << isup << ", " << jsup
731  << ") did not find a matching block in Sinv." << std::endl;
732  throw std::runtime_error( msg.str().c_str() );
733  }
734  } // if (isup, jsup) is in U
735 
736  } // for( ib )
737  } // for ( jb )
738 
739 
740 #endif
741  TIMER_STOP(JB_Loop);
742 
743 
744  TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
745  }
746 
747  template<typename T>
748  inline void PMatrix<T>::SendRecvCD_UpdateU(
749  std::vector<SuperNodeBufferType > & arrSuperNodes,
750  Int stepSuper)
751  {
752 
753  TIMER_START(Send_CD_Update_U);
754  //compute the number of requests
755  Int sendCount = 0;
756  Int recvCount = 0;
757  Int sendOffset[stepSuper];
758  Int recvOffset[stepSuper];
759  Int recvIdx=0;
760  for (Int supidx=0; supidx<stepSuper; supidx++){
761  SuperNodeBufferType & snode = arrSuperNodes[supidx];
762  sendOffset[supidx]=sendCount;
763  recvOffset[supidx]=recvCount;
764  sendCount+= CountSendToCrossDiagonal(snode.Index);
765  recvCount+= CountRecvFromCrossDiagonal(snode.Index);
766  }
767 
768 
769  std::vector<MPI_Request > arrMpiReqsSendCD(sendCount, MPI_REQUEST_NULL );
770  std::vector<MPI_Request > arrMpiReqsSizeSendCD(sendCount, MPI_REQUEST_NULL );
771 
772  std::vector<MPI_Request > arrMpiReqsRecvCD(recvCount, MPI_REQUEST_NULL );
773  std::vector<MPI_Request > arrMpiReqsSizeRecvCD(recvCount, MPI_REQUEST_NULL );
774  std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
775  std::vector<int > arrSstrLcolSizeSendCD(sendCount);
776  std::vector<std::vector<char> > arrSstrLcolRecvCD(recvCount);
777  std::vector<int > arrSstrLcolSizeRecvCD(recvCount);
778 
779  for (Int supidx=0; supidx<stepSuper; supidx++){
780  SuperNodeBufferType & snode = arrSuperNodes[supidx];
781 
782  // Send LUpdateBufReduced to the cross-diagonal blocks.
783  // NOTE: This assumes a square processor grid.
784 
785  TIMER_START(Send_L_CrossDiag);
786 
787  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && isSendToCrossDiagonal_(grid_->numProcCol, snode.Index ) ){
788 
789  Int sendIdx = 0;
790  for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
791  if(isSendToCrossDiagonal_(dstCol,snode.Index) ){
792  Int dest = PNUM(PROW(snode.Index,grid_),dstCol,grid_);
793 
794  if( MYPROC( grid_ ) != dest ){
795  MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSendCD[sendOffset[supidx]+sendIdx];
796  MPI_Request & mpiReqSend = arrMpiReqsSendCD[sendOffset[supidx]+sendIdx];
797 
798 
799  std::stringstream sstm;
800  std::vector<char> & sstrLcolSend = arrSstrLcolSendCD[sendOffset[supidx]+sendIdx];
801  Int & sstrSize = arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdx];
802 
803  serialize( snode.RowLocalPtr, sstm, NO_MASK );
804  serialize( snode.BlockIdxLocal, sstm, NO_MASK );
805  serialize( snode.LUpdateBuf, sstm, NO_MASK );
806 
807  sstrLcolSend.resize( Size(sstm) );
808  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
809  sstrSize = sstrLcolSend.size();
810 
811 
812  MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dest, IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD), grid_->comm, &mpiReqSizeSend );
813  MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dest, IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD), grid_->comm, &mpiReqSend );
814 
815  PROFILE_COMM(MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD),sizeof(sstrSize));
816  PROFILE_COMM(MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD),sstrSize);
817 
818 
819  sendIdx++;
820  }
821  }
822  }
823 
824 
825  } // sender
826  TIMER_STOP(Send_L_CrossDiag);
827  }
828 
829 
830  //Do Irecv for sizes
831  for (Int supidx=0; supidx<stepSuper; supidx++){
832  SuperNodeBufferType & snode = arrSuperNodes[supidx];
833  //If I'm a receiver
834  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
835  Int recvIdx=0;
836  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
837  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
838  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
839  if( MYPROC( grid_ ) != src ){
840  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
841  MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecvCD[recvOffset[supidx]+recvIdx];
842  MPI_Irecv( &sstrSize, 1, MPI_INT, src, IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD), grid_->comm, &mpiReqSizeRecv );
843  recvIdx++;
844  }
845  }
846  }
847  }//end if I'm a receiver
848  }
849 
850  //waitall sizes
851  mpi::Waitall(arrMpiReqsSizeRecvCD);
852  //Allocate content and do Irecv
853  for (Int supidx=0; supidx<stepSuper; supidx++){
854  SuperNodeBufferType & snode = arrSuperNodes[supidx];
855  //If I'm a receiver
856  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
857  Int recvIdx=0;
858  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
859  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
860  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
861  if( MYPROC( grid_ ) != src ){
862  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
863  std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
864  MPI_Request & mpiReqRecv = arrMpiReqsRecvCD[recvOffset[supidx]+recvIdx];
865  sstrLcolRecv.resize( sstrSize);
866  MPI_Irecv( (void*)&sstrLcolRecv[0], sstrSize, MPI_BYTE, src, IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD), grid_->comm, &mpiReqRecv );
867  //statusOFS<<"P"<<MYPROC(grid_)<<" received "<<sstrSize<<" bytes of L/U from CD P"<<src<<std::endl;
868  recvIdx++;
869  }
870  }
871  }
872  }//end if I'm a receiver
873  }
874 
875  //waitall content
876  mpi::Waitall(arrMpiReqsRecvCD);
877  //Do the work
878  for (Int supidx=0; supidx<stepSuper; supidx++){
879  SuperNodeBufferType & snode = arrSuperNodes[supidx];
880 
881  // Receive LUpdateBufReduced from the cross-diagonal blocks and update U.
882  // NOTE: This assumes a square processor grid.
883  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
884 
885 #if ( _DEBUGlevel_ >= 1 )
886  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "Update the upper triangular block" << std::endl << std::endl;
887  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "blockIdxLocal:" << snode.BlockIdxLocal << std::endl << std::endl;
888  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "rowLocalPtr:" << snode.RowLocalPtr << std::endl << std::endl;
889 #endif
890 
891  std::vector<UBlock<T> >& Urow = this->U( LBi( snode.Index, grid_ ) );
892  std::vector<bool> isBlockFound(Urow.size(),false);
893 
894  recvIdx=0;
895  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
896  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
897  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
898  TIMER_START(Recv_L_CrossDiag);
899 
900  std::vector<Int> rowLocalPtrRecv;
901  std::vector<Int> blockIdxLocalRecv;
902  NumMat<T> UUpdateBuf;
903 
904  if( MYPROC( grid_ ) != src ){
905  std::stringstream sstm;
906  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
907  std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
908  sstm.write( &sstrLcolRecv[0], sstrSize );
909 
910  deserialize( rowLocalPtrRecv, sstm, NO_MASK );
911  deserialize( blockIdxLocalRecv, sstm, NO_MASK );
912  deserialize( UUpdateBuf, sstm, NO_MASK );
913 
914  recvIdx++;
915 
916  } // sender is not the same as receiver
917  else{
918  rowLocalPtrRecv = snode.RowLocalPtr;
919  blockIdxLocalRecv = snode.BlockIdxLocal;
920  UUpdateBuf = snode.LUpdateBuf;
921  } // sender is the same as receiver
922 
923 
924 
925  TIMER_STOP(Recv_L_CrossDiag);
926 
927 #if ( _DEBUGlevel_ >= 1 )
928  statusOFS<<" ["<<snode.Index<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<snode.Index<<") <--- P"<<src<<std::endl;
929  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "rowLocalPtrRecv:" << rowLocalPtrRecv << std::endl << std::endl;
930  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "blockIdxLocalRecv:" << blockIdxLocalRecv << std::endl << std::endl;
931 #endif
932 
933 
934  // Update U
935  for( Int ib = 0; ib < blockIdxLocalRecv.size(); ib++ ){
936  for( Int jb = 0; jb < Urow.size(); jb++ ){
937  UBlock<T>& UB = Urow[jb];
938  if( UB.blockIdx == blockIdxLocalRecv[ib] ){
939  NumMat<T> Ltmp ( UB.numCol, UB.numRow );
940  lapack::Lacpy( 'A', Ltmp.m(), Ltmp.n(),
941  &UUpdateBuf( rowLocalPtrRecv[ib], 0 ),
942  UUpdateBuf.m(), Ltmp.Data(), Ltmp.m() );
943  isBlockFound[jb] = true;
944  Transpose( Ltmp, UB.nzval );
945  break;
946  }
947  }
948  }
949  }
950  }
951 
952  for( Int jb = 0; jb < Urow.size(); jb++ ){
953  UBlock<T>& UB = Urow[jb];
954  if( !isBlockFound[jb] ){
955 #ifdef USE_ABORT
956  abort();
957 #endif
958  throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
959  }
960  }
961  } // receiver
962  }
963 
964  TIMER_STOP(Send_CD_Update_U);
965 
966  mpi::Waitall(arrMpiReqsSizeSendCD);
967  mpi::Waitall(arrMpiReqsSendCD);
968  };
969 
970  template<typename T>
971  inline void PMatrix<T>::UnpackData(
972  SuperNodeBufferType & snode,
973  std::vector<LBlock<T> > & LcolRecv,
974  std::vector<UBlock<T> > & UrowRecv )
975  {
976 #if ( _DEBUGlevel_ >= 1 )
977  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Unpack the received data for the processors that participate in the Gemm. " << std::endl << std::endl;
978 #endif
979  // U part
980  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
981  std::stringstream sstm;
982  sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
983  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
984  Int numUBlock;
985  deserialize( numUBlock, sstm, NO_MASK );
986  UrowRecv.resize( numUBlock );
987  for( Int jb = 0; jb < numUBlock; jb++ ){
988  deserialize( UrowRecv[jb], sstm, mask );
989  }
990  } // sender is not the same as receiver
991  else{
992  // U is obtained locally, just make a copy. Include everything
993  // (there is no diagonal block)
994  // This makes a copy; it could be replaced by something
995  // more efficient, especially when mpisize == 1.
996  UrowRecv.resize(this->U( LBi( snode.Index, grid_ ) ).size());
997  std::copy(this->U( LBi( snode.Index, grid_ ) ).begin(),this->U( LBi( snode.Index, grid_ )).end(),UrowRecv.begin());
998  } // sender is the same as receiver
999 
1000 
1001  //L part
1002  if( MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
1003  std::stringstream sstm;
1004  sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
1005  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1006  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
1007  Int numLBlock;
1008  deserialize( numLBlock, sstm, NO_MASK );
1009  LcolRecv.resize( numLBlock );
1010  for( Int ib = 0; ib < numLBlock; ib++ ){
1011  deserialize( LcolRecv[ib], sstm, mask );
1012  }
1013  } // sender is not the same as receiver
1014  else{
1015  // L is obtained locally, just make a copy.
1016  // Do not include the diagonal block
1017  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1018  Int startIdx = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )?1:0;
1019  LcolRecv.resize( Lcol.size() - startIdx );
1020  std::copy(Lcol.begin()+startIdx,Lcol.end(),LcolRecv.begin());
1021  } // sender is the same as receiver
1022  }
1023 
1024  template<typename T>
1025  inline void PMatrix<T>::ComputeDiagUpdate(SuperNodeBufferType & snode)
1026  {
1027 
1028  //--------- Compute the diagonal block; all processors in the column participate in all pipelined supernodes
1029  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1030 #if ( _DEBUGlevel_ >= 2 )
1031  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Updating the diagonal block" << std::endl << std::endl;
1032 #endif
1033  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1034 
1035  //Allocate DiagBuf even if Lcol.size() == 0
1036  snode.DiagBuf.Resize(SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
1037  SetValue(snode.DiagBuf, ZERO<T>());
1038 
1039  // Do I own the diagonal block ?
1040  Int startIb = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
1041  for( Int ib = startIb; ib < Lcol.size(); ib++ ){
1042 
1043 #ifdef GEMM_PROFILE
1044  gemm_stat.push_back(snode.DiagBuf.m());
1045  gemm_stat.push_back(snode.DiagBuf.n());
1046  gemm_stat.push_back(Lcol[ib].numRow);
1047 #endif
1048 
1049  LBlock<T> & LB = Lcol[ib];
1050 
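 // DiagBuf -= LUpdateBuf(ib)^T * L(ib): accumulate this block row's
 // contribution to the diagonal block of the selected inverse.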
1051  blas::Gemm( 'T', 'N', snode.DiagBuf.m(), snode.DiagBuf.n(), LB.numRow,
1052  MINUS_ONE<T>(), &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ), snode.LUpdateBuf.m(),
1053  LB.nzval.Data(), LB.nzval.m(), ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
1054  }
1055 
1056 #if ( _DEBUGlevel_ >= 1 )
1057  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Updated the diagonal block" << std::endl << std::endl;
1058 #endif
1059  }
1060  }
1061 
1062 
1063 
1064  template<typename T>
1065  void PMatrix<T>::GetEtree(std::vector<Int> & etree_supno )
1066  {
1067 
1068 #ifndef _RELEASE_
1069  PushCallStack("PMatrix::GetEtree");
1070  double begin = MPI_Wtime( );
1071 #endif
1072  Int nsupers = this->NumSuper();
1073 
1074  if( optionsLU_->ColPerm != "PARMETIS" ) {
1075  /* Use the etree computed from the serial symbolic factorization,
1076  and turn it into a supernodal tree. */
1077  const SuperNodeType * superNode = this->SuperNode();
1078 
1079 
1080  // Translate the column elimination tree into a supernodal etree using superIdx
1081  etree_supno.resize(this->NumSuper());
1082  for(Int i = 0; i < superNode->etree.m(); ++i){
1083  Int curSnode = superNode->superIdx[i];
1084  Int parentSnode = (superNode->etree[i]>= superNode->etree.m()) ?this->NumSuper():superNode->superIdx[superNode->etree[i]];
1085  if( curSnode != parentSnode){
1086  etree_supno[curSnode] = parentSnode;
1087  }
1088  }
1089 
1090  } else { /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */
1091  /* Compute an "etree" based on struct(L),
1092  assuming struct(U) = struct(L'). */
1093 
1094  /* find the first block in each supernodal-column of local L-factor */
1095  std::vector<Int> etree_supno_l( nsupers, nsupers );
1096  for( Int ksup = 0; ksup < nsupers; ksup++ ){
1097  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
1098  // L part
1099  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
1100  if(Lcol.size()>0){
1101  Int firstBlk = 0;
1102  if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
1103  firstBlk=1;
1104 
1105  for( Int ib = firstBlk; ib < Lcol.size(); ib++ ){
1106  etree_supno_l[ksup] = std::min(etree_supno_l[ksup] , Lcol[ib].blockIdx);
1107  }
1108  }
1109  }
1110  }
1111 
1112 
1113 #if ( _DEBUGlevel_ >= 1 )
1114  statusOFS << std::endl << " Local supernodal elimination tree is " << etree_supno_l <<std::endl<<std::endl;
1115 
1116 #endif
1117  /* form global e-tree */
1118  etree_supno.resize( nsupers );
1119  mpi::Allreduce( (Int*) &etree_supno_l[0],(Int *) &etree_supno[0], nsupers, MPI_MIN, grid_->comm );
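 // The elementwise MIN over all processes gives, for each supernode, the
 // row index of its first off-diagonal block, which is taken as its parent
 // in the supernodal etree.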
1120  etree_supno[nsupers-1]=nsupers;
1121  }
1122 
1123 #ifndef _RELEASE_
1124  double end = MPI_Wtime( );
1125  statusOFS<<"Building the list took "<<end-begin<<"s"<<std::endl;
1126 #endif
1127 #ifndef _RELEASE_
1128  PopCallStack();
1129 #endif
1130  } // ----- end of method PMatrix::GetEtree -----
1131 
1132 
1133  template<typename T>
1134  inline void PMatrix<T>::GetWorkSet(
1135  std::vector<Int> & snodeEtree,
1136  std::vector<std::vector<Int> > & WSet)
1137  {
1138  TIMER_START(Compute_WorkSet);
1139  Int numSuper = this->NumSuper();
1140 
1141 
1142  if (options_->maxPipelineDepth!=1){
1143  //find roots in the supernode etree (it must be postordered)
1144  //initialize the parent we are looking at
1145  //Int rootParent = snodeEtree[numSuper-2];
1146  Int rootParent = numSuper;
1147 
1148  //compute the level of each supernode and the total number of levels
1149  //IntNumVec level(numSuper);
1150  //level(rootParent)=0;
1151  IntNumVec level(numSuper+1);
1152  level(rootParent)=-1;
1153  Int numLevel = 0;
1154  for(Int i=rootParent-1; i>=0; i-- ){ level(i) = level(snodeEtree[i])+1; numLevel = std::max(numLevel, level(i)); }
1155  numLevel++;
1156 
1157  //Compute the number of supernodes at each level
1158  IntNumVec levelSize(numLevel);
1159  SetValue(levelSize,I_ZERO);
1160  //for(Int i=rootParent-1; i>=0; i-- ){ levelSize(level(i))++; }
1161  for(Int i=rootParent-1; i>=0; i-- ){ if(level[i]>=0){ levelSize(level(i))++; } }
1162 
1163  //Allocate memory
1164  WSet.resize(numLevel,std::vector<Int>());
1165  for(Int i=0; i<numLevel; i++ ){WSet[i].reserve(levelSize(i));}
1166 
1167  //Fill the worklist based on the level of each supernode
1168  for(Int i=rootParent-1; i>=0; i-- ){
1169  WSet[level(i)].push_back(i);
1170  }
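 // Example: for three supernodes with etree 0 -> 1 -> 2 (postordered, 2 at the
 // top), level = {2, 1, 0} and WSet becomes {{2}, {1}, {0}}: supernodes closer
 // to the root of the etree are placed in earlier lists.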
1171 
1172  //Constrain the size of each list to be min(MPI_MAX_COMM,options_->maxPipelineDepth)
1173  Int limit = (options_->maxPipelineDepth>0)?std::min(MPI_MAX_COMM,options_->maxPipelineDepth):MPI_MAX_COMM;
1174  for (Int lidx=0; lidx<WSet.size() ; lidx++){
1175  if(WSet[lidx].size()>limit)
1176  {
1177  std::vector<std::vector<Int> >::iterator pos = WSet.begin()+lidx+1;
1178  WSet.insert(pos,std::vector<Int>());
1179  WSet[lidx+1].insert(WSet[lidx+1].begin(),WSet[lidx].begin() + limit ,WSet[lidx].end());
1180  WSet[lidx].erase(WSet[lidx].begin()+limit,WSet[lidx].end());
1181  }
1182  }
1183 
1184 
1185 
1186  }
1187  else{
1188  for( Int ksup = numSuper - 2; ksup >= 0; ksup-- ){
1189  WSet.push_back(std::vector<Int>());
1190  WSet.back().push_back(ksup);
1191  }
1192 
1193  }
1194 
1195 
1196 
1197 
1198  TIMER_STOP(Compute_WorkSet);
1199 #if ( _DEBUGlevel_ >= 1 )
1200  for (Int lidx=0; lidx<WSet.size() ; lidx++){
1201  statusOFS << std::endl << "L"<< lidx << " is: {";
1202  for (Int supidx=0; supidx<WSet[lidx].size() ; supidx++){
1203  statusOFS << WSet[lidx][supidx] << " ["<<snodeEtree[WSet[lidx][supidx]]<<"] ";
1204  }
1205  statusOFS << " }"<< std::endl;
1206  }
1207 #endif
1208  }
1209 
1210  template<typename T>
1211  inline void PMatrix<T>::SelInvIntra_P2p(Int lidx)
1212  {
1213 
1214 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1215  Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
1216 #endif
1217  Int numSuper = this->NumSuper();
1218  std::vector<std::vector<Int> > & superList = this->WorkingSet();
1219  Int numSteps = superList.size();
1220  Int stepSuper = superList[lidx].size();
1221 
1222  TIMER_START(AllocateBuffer);
1223 
1224  //This is required to send the size and content of U/L
1225  std::vector<std::vector<MPI_Request> > arrMpireqsSendToBelow;
1226  arrMpireqsSendToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcRow, MPI_REQUEST_NULL ));
1227  std::vector<std::vector<MPI_Request> > arrMpireqsSendToRight;
1228  arrMpireqsSendToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcCol, MPI_REQUEST_NULL ));
1229 
1230  //This is required to reduce L
1231  std::vector<MPI_Request> arrMpireqsSendToLeft;
1232  arrMpireqsSendToLeft.resize(stepSuper, MPI_REQUEST_NULL );
1233  //This is required to reduce D
1234  std::vector<MPI_Request> arrMpireqsSendToAbove;
1235  arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1236 
1237  //This is required to receive the size and content of U/L
1238  std::vector<MPI_Request> arrMpireqsRecvSizeFromAny;
1239  arrMpireqsRecvSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1240  std::vector<MPI_Request> arrMpireqsRecvContentFromAny;
1241  arrMpireqsRecvContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1242 
1243 
1244  // Allocate the buffers for the supernodes processed in this step
1245  std::vector<SuperNodeBufferType> arrSuperNodes(stepSuper);
1246  for (Int supidx=0; supidx<stepSuper; supidx++){
1247  arrSuperNodes[supidx].Index = superList[lidx][supidx];
1248  }
1249 
1250 
1251 
1252  int numSentToLeft = 0;
1253  std::vector<int> reqSentToLeft;
1254 
1255 
1256  NumMat<T> AinvBuf, UBuf;
1257 
1258  TIMER_STOP(AllocateBuffer);
1259 
1260 #ifndef _RELEASE_
1261  PushCallStack("PMatrix::SelInv_P2p::UpdateL");
1262 #endif
1263 #if ( _DEBUGlevel_ >= 1 )
1264  statusOFS << std::endl << "Communication to the Schur complement." << std::endl << std::endl;
1265 #endif
1266 
1267  {
1268  // Senders
1269  //Receivers have to resize their buffers
1270  TIMER_START(IRecv_Content_UL);
1271  // Receivers (Content)
1272  for (Int supidx=0; supidx<stepSuper ; supidx++){
1273  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1274  MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvContentFromAny[supidx*2];
1275  MPI_Request * mpireqsRecvFromLeft = &arrMpireqsRecvContentFromAny[supidx*2+1];
1276 
1277  if( isRecvFromAbove_( snode.Index ) &&
1278  MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
1279 
1280  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1281  if(bcastUTree!=NULL){
1282  Int myRoot = bcastUTree->GetRoot();
1283  snode.SizeSstrUrowRecv = bcastUTree->GetMsgSize();
1284  snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv);
1285  MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
1286  myRoot, IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT),
1287  grid_->colComm, mpireqsRecvFromAbove );
1288 #if ( _DEBUGlevel_ >= 1 )
1289  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving U " << snode.SizeSstrUrowRecv << " BYTES from "<<myRoot<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
1290 #endif
1291  }
1292  } // if I need to receive from up
1293 
1294  if( isRecvFromLeft_( snode.Index ) &&
1295  MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
1296  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1297  if(bcastLTree!=NULL){
1298  Int myRoot = bcastLTree->GetRoot();
1299  snode.SizeSstrLcolRecv = bcastLTree->GetMsgSize();
1300  snode.SstrLcolRecv.resize(snode.SizeSstrLcolRecv);
1301  MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
1302  myRoot, IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT),
1303  grid_->rowComm, mpireqsRecvFromLeft );
1304 #if ( _DEBUGlevel_ >= 1 )
1305  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving L " << snode.SizeSstrLcolRecv << " BYTES from "<<myRoot<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
1306 #endif
1307  }
1308  } // if I need to receive from left
1309  }
1310  TIMER_STOP(IRecv_Content_UL);
1311 
1312  // Senders
1313  TIMER_START(ISend_Content_UL);
1314  for (Int supidx=0; supidx<stepSuper; supidx++){
1315  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1316  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1317  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1318 
1319 #if ( _DEBUGlevel_ >= 1 )
1320  statusOFS << std::endl << "["<<snode.Index<<"] "
1321  << "Communication for the U part." << std::endl << std::endl;
1322 #endif
1323  // Communication for the U part.
1324  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1325  std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, grid_) );
1326  // Pack the data in U
1327  TIMER_START(Serialize_UL);
1328  std::stringstream sstm;
1329 
1330  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1331  // All blocks are to be sent down.
1332  serialize( (Int)Urow.size(), sstm, NO_MASK );
1333  for( Int jb = 0; jb < Urow.size(); jb++ ){
1334  serialize( Urow[jb], sstm, mask );
1335  }
1336  snode.SstrUrowSend.resize( Size( sstm ) );
1337  sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
1338  snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
1339  TIMER_STOP(Serialize_UL);
1340  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1341  if(bcastUTree!=NULL){
1342  bcastUTree->ForwardMessage((char*)&snode.SstrUrowSend[0], snode.SizeSstrUrowSend,
1343  IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT), &mpireqsSendToBelow[0] );
1344  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1345  Int iProcRow = bcastUTree->GetDest(idxRecv);
1346 #if ( _DEBUGlevel_ >= 1 )
1347  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending U " << snode.SizeSstrUrowSend << " BYTES on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT) << std::endl << std::endl;
1348 #endif
1349  }
1350  }
1351  } // if I am the sender
1352 
1353 
1354 #if ( _DEBUGlevel_ >= 1 )
1355  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Communication for the L part." << std::endl << std::endl;
1356 #endif
1357 
1358 
1359 
1360  // Communication for the L part.
1361  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1362  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, grid_) );
1363  TIMER_START(Serialize_UL);
1364  // Pack the data in L
1365  std::stringstream sstm;
1366  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1367  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
1368 
1369  // All blocks except for the diagonal block are to be sent right
1370 
1371  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )
1372  serialize( (Int)Lcol.size() - 1, sstm, NO_MASK );
1373  else
1374  serialize( (Int)Lcol.size(), sstm, NO_MASK );
1375 
1376  for( Int ib = 0; ib < Lcol.size(); ib++ ){
1377  if( Lcol[ib].blockIdx > snode.Index ){
1378 #if ( _DEBUGlevel_ >= 2 )
1379  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Serializing Block index " << Lcol[ib].blockIdx << std::endl;
1380 #endif
1381  serialize( Lcol[ib], sstm, mask );
1382  }
1383  }
1384  snode.SstrLcolSend.resize( Size( sstm ) );
1385  sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
1386  snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
1387  TIMER_STOP(Serialize_UL);
1388 
1389 
1390  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1391  if(bcastLTree!=NULL){
1392  bcastLTree->ForwardMessage((char*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend,
1393  IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT), &mpireqsSendToRight[0] );
1394 
1395  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1396  Int iProcCol = bcastLTree->GetDest(idxRecv);
1397 #if ( _DEBUGlevel_ >= 1 )
1398  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending L " << snode.SizeSstrLcolSend << " BYTES on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT) << std::endl << std::endl;
1399 #endif
1400  }
1401  }
1402  } // if I am the sender
1403  } //Senders
1404  TIMER_STOP(ISend_Content_UL);
1405  }
1406 
1407  vector<char> redLdone(stepSuper,0);
1408  for (Int supidx=0; supidx<stepSuper; supidx++){
1409  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1410  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1411 
1412  if(redLTree != NULL){
1413  redLTree->SetTag(IDX_TO_TAG(snode.Index,SELINV_TAG_L_REDUCE));
1414  //Initialize the tree
1415  redLTree->AllocRecvBuffers();
1416  //Post All Recv requests;
1417  redLTree->PostFirstRecv();
1418  }
1419  }
1420 
1421 
1422  TIMER_START(Compute_Sinv_LT);
1423  {
1424  Int msgForwarded = 0;
1425  Int msgToFwd = 0;
1426  Int gemmProcessed = 0;
1427  Int gemmToDo = 0;
1428  // Int toRecvGemm = 0;
1429  //copy the list of supernodes we need to process
1430  std::list<Int> readySupidx;
1431  //find local things to do
1432  for(Int supidx = 0;supidx<stepSuper;supidx++){
1433  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1434 
1435 
1436  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1437  gemmToDo++;
1438  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1439  snode.isReady++;
1440  }
1441 
1442  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1443  snode.isReady++;
1444  }
1445 
1446  if(snode.isReady==2){
1447  readySupidx.push_back(supidx);
1448 #if ( _DEBUGlevel_ >= 1 )
1449  statusOFS<<std::endl<<"Locally processing ["<<snode.Index<<"]"<<std::endl;
1450 #endif
1451  }
1452  }
1453  else if( (isRecvFromLeft_( snode.Index ) ) && MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) )
1454  {
1455  //Get the reduction tree
1456  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1457 
1458  if(redLTree != NULL){
1459  TIMER_START(Reduce_Sinv_LT_Isend);
1460  // Set the local buffer to NULL so this rank contributes a 0-byte message to the reduction
1461  redLTree->SetLocalBuffer(NULL);
1462  redLTree->SetDataReady(true);
1463  bool done = redLTree->Progress();
1464  TIMER_STOP(Reduce_Sinv_LT_Isend);
1465  }
1466  }// if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ))
1467 
1468  if(MYROW(grid_)!=PROW(snode.Index,grid_)){
1469  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1470  if(bcastUTree != NULL){
1471  if(bcastUTree->GetDestCount()>0){
1472  msgToFwd++;
1473  }
1474  }
1475  }
1476 
1477  if(MYCOL(grid_)!=PCOL(snode.Index,grid_)){
1478  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1479  if(bcastLTree != NULL){
1480  if(bcastLTree->GetDestCount()>0){
1481  msgToFwd++;
1482  }
1483  }
1484  }
1485  }
1486 
1487 #if ( _DEBUGlevel_ >= 1 )
1488  statusOFS<<std::endl<<"gemmToDo ="<<gemmToDo<<std::endl;
1489  statusOFS<<std::endl<<"msgToFwd ="<<msgToFwd<<std::endl;
1490 #endif
1491 
1492 
1493 #if defined (PROFILE)
1494  end_SendULWaitContentFirst=0;
1495  begin_SendULWaitContentFirst=0;
1496 #endif
1497 
1498  while(gemmProcessed<gemmToDo || msgForwarded < msgToFwd)
1499  {
1500  Int reqidx = MPI_UNDEFINED;
1501  Int supidx = -1;
1502 
1503 
1504  //while I don't have anything to do, wait for data to arrive
1505  do{
1506 
1507 
1508  int reqIndices[arrMpireqsRecvContentFromAny.size()];
1509  int numRecv = 0;
1510 
1512  // then process the remote ones
1512 
1513  TIMER_START(WaitContent_UL);
1514 #if defined(PROFILE)
1515  if(begin_SendULWaitContentFirst==0){
1516  begin_SendULWaitContentFirst=1;
1517  TIMER_START(WaitContent_UL_First);
1518  }
1519 #endif
1520 
1521  numRecv = 0;
1522  MPI_Waitsome(2*stepSuper, &arrMpireqsRecvContentFromAny[0], &numRecv, reqIndices, MPI_STATUSES_IGNORE);
1523 
1524  for(int i =0;i<numRecv;i++){
1525  reqidx = reqIndices[i];
1526  //I've received something
1527  if(reqidx!=MPI_UNDEFINED)
1528  {
1529  //this stays true
1530  supidx = reqidx/2;
1531  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1532 
1533  //If it's a U block
1534  if(reqidx%2==0){
1535  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1536  if(bcastUTree != NULL){
1537  if(bcastUTree->GetDestCount()>0){
1538 
1539  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1540 #if ( _DEBUGlevel_ >= 1 )
1541  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1542  Int iProcRow = bcastUTree->GetDest(idxRecv);
1543  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarding U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
1544  }
1545 #endif
1546 
1547  bcastUTree->ForwardMessage( (char*)&snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv,
1548  IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT), &mpireqsSendToBelow[0] );
1549 #if ( _DEBUGlevel_ >= 1 )
1550  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1551  Int iProcRow = bcastUTree->GetDest(idxRecv);
1552  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarded U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
1553  }
1554 #endif
1555  msgForwarded++;
1556  }
1557  }
1558  }
1559  //If it's a L block
1560  else if(reqidx%2==1){
1561  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1562  if(bcastLTree != NULL){
1563  if(bcastLTree->GetDestCount()>0){
1564 
1565  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1566 #if ( _DEBUGlevel_ >= 1 )
1567  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1568  Int iProcCol = bcastLTree->GetDest(idxRecv);
1569  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarding L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcCol<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
1570  }
1571 #endif
1572 
1573  bcastLTree->ForwardMessage( (char*)&snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv,
1574  IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT), &mpireqsSendToRight[0] );
1575 
1576  // for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1577  // Int iProcCol = bcastLTree->GetDest(idxRecv);
1578  // PROFILE_COMM(MYPROC(this->grid_),PNUM(MYROW(this->grid_),iProcCol,this->grid_),IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT),snode.SizeSstrLcolRecv);
1579  // }
1580 #if ( _DEBUGlevel_ >= 1 )
1581  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1582  Int iProcCol = bcastLTree->GetDest(idxRecv);
1583  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarded L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcCol<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
1584  }
1585 #endif
1586  msgForwarded++;
1587  }
1588  }
1589  }
1590 
1591 
1592 #if ( _DEBUGlevel_ >= 1 )
1593  statusOFS<<std::endl<<"Received data for ["<<snode.Index<<"] reqidx%2="<<reqidx%2<<" is ready ?"<<snode.isReady<<std::endl;
1594 #endif
1595 
1596  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1597  snode.isReady++;
1598 
1599  //if we received both L and U, the supernode is ready
1600  if(snode.isReady==2){
1601  readySupidx.push_back(supidx);
1602 
1603 #if defined(PROFILE)
1604  if(end_SendULWaitContentFirst==0){
1605  TIMER_STOP(WaitContent_UL_First);
1606  end_SendULWaitContentFirst=1;
1607  }
1608 #endif
1609  }
1610  }
1611  }
1612 
1613  }//end for waitsome
1614 
1615  TIMER_STOP(WaitContent_UL);
1616 
1617  } while( (gemmProcessed<gemmToDo && readySupidx.size()==0) || (gemmProcessed==gemmToDo && msgForwarded<msgToFwd) );
1618 
1619  //If I have some work to do
1620  if(readySupidx.size()>0)
1621  {
1622  supidx = readySupidx.back();
1623  readySupidx.pop_back();
1624  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1625 
1626 
1627  // Only the processors that received information participate in the Gemm
1628  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ) ){
1629 
1630  std::vector<LBlock<T> > LcolRecv;
1631  std::vector<UBlock<T> > UrowRecv;
1632  // Save all the data to be updated for { L( isup, snode.Index ) | isup > snode.Index }.
1633  // The size will be updated in the Gemm phase and the reduce phase
1634 
1635  UnpackData(snode, LcolRecv, UrowRecv);
1636 
1637  //NumMat<T> AinvBuf, UBuf;
1638  SelInv_lookup_indexes(snode,LcolRecv, UrowRecv,AinvBuf,UBuf);
1639 
1640  snode.LUpdateBuf.Resize( AinvBuf.m(), SuperSize( snode.Index, super_ ) );
1641 
1642 #ifdef GEMM_PROFILE
1643  gemm_stat.push_back(AinvBuf.m());
1644  gemm_stat.push_back(UBuf.m());
1645  gemm_stat.push_back(AinvBuf.n());
1646 #endif
1647 
1648  TIMER_START(Compute_Sinv_LT_GEMM);
1649  blas::Gemm( 'N', 'T', AinvBuf.m(), UBuf.m(), AinvBuf.n(), MINUS_ONE<T>(),
1650  AinvBuf.Data(), AinvBuf.m(),
1651  UBuf.Data(), UBuf.m(), ZERO<T>(),
1652  snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
1653  TIMER_STOP(Compute_Sinv_LT_GEMM);
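  // The call above forms snode.LUpdateBuf = - AinvBuf * UBuf^T, an
  // (AinvBuf.m() x UBuf.m()) product matching the shape given to LUpdateBuf above;
  // this local contribution is then reduced to the owning column via redToLeftTree_.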
1654 
1655 
1656 #if ( _DEBUGlevel_ >= 2 )
1657  statusOFS << std::endl << "["<<snode.Index<<"] "<< "snode.LUpdateBuf: " << snode.LUpdateBuf << std::endl;
1658 #endif
1659  } // if Gemm is to be done locally
1660 
1661  //Get the reduction tree
1662  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1663  if(redLTree != NULL){
1664  TIMER_START(Reduce_Sinv_LT_Isend);
1665  //send the data
1666  redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
1667  redLTree->SetDataReady(true);
1668  bool done = redLTree->Progress();
1669  TIMER_STOP(Reduce_Sinv_LT_Isend);
1670  }
1671  gemmProcessed++;
1672 
1673 #if ( _DEBUGlevel_ >= 1 )
1674  statusOFS<<std::endl<<"gemmProcessed ="<<gemmProcessed<<"/"<<gemmToDo<<std::endl;
1675 #endif
1676 
1677  //advance reductions
1678  for (Int supidx=0; supidx<stepSuper; supidx++){
1679  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1680  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1681  if(redLTree != NULL && !redLdone[supidx]){
1682  bool done = redLTree->Progress();
1683  }
1684  }
1685  }
1686  }
1687 
1688  }
1689  TIMER_STOP(Compute_Sinv_LT);
1690 
1691  //Reduce Sinv L^T to the processors in PCOL(ksup,grid_)
1692 
1693 
1694  TIMER_START(Reduce_Sinv_LT);
1695  //blocking wait for the reduction
1696  bool all_done = false;
1697  while(!all_done)
1698  {
1699  all_done = true;
1700 
1701  for (Int supidx=0; supidx<stepSuper; supidx++){
1702  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1703 
1704  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1705 
1706  if(redLTree != NULL && !redLdone[supidx]){
1707  bool done = redLTree->Progress();
1708  if(done){
1709  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1710  //determine the number of rows in LUpdateBufReduced
1711  Int numRowLUpdateBuf;
1712  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1713  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
1714  snode.RowLocalPtr.resize( Lcol.size() + 1 );
1715  snode.BlockIdxLocal.resize( Lcol.size() );
1716  snode.RowLocalPtr[0] = 0;
1717  for( Int ib = 0; ib < Lcol.size(); ib++ ){
1718  snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib] + Lcol[ib].numRow;
1719  snode.BlockIdxLocal[ib] = Lcol[ib].blockIdx;
1720  }
1721  } // I do not own the diagonal block
1722  else{
1723  snode.RowLocalPtr.resize( Lcol.size() );
1724  snode.BlockIdxLocal.resize( Lcol.size() - 1 );
1725  snode.RowLocalPtr[0] = 0;
1726  for( Int ib = 1; ib < Lcol.size(); ib++ ){
1727  snode.RowLocalPtr[ib] = snode.RowLocalPtr[ib-1] + Lcol[ib].numRow;
1728  snode.BlockIdxLocal[ib-1] = Lcol[ib].blockIdx;
1729  }
1730  } // I own the diagonal block, skip the diagonal block
1731  numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
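  // RowLocalPtr holds the prefix sums of the retained (off-diagonal) block row
  // counts, so its entries give each retained block's row offset inside
  // LUpdateBuf and its last entry is the total number of rows to receive.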
1732 
1733  if( numRowLUpdateBuf > 0 ){
1734  if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
1735  snode.LUpdateBuf.Resize( numRowLUpdateBuf,SuperSize( snode.Index, super_ ) );
1736  }
1737  }
1738 
1739  //copy the buffer from the reduce tree
1740  redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
1741 
1742  }
1743  redLdone[supidx]=1;
1744  redLTree->CleanupBuffers();
1745  }
1746 
1747  all_done = all_done && (done || redLdone[supidx]);
1748  }
1749  }
1750  }
1751  TIMER_STOP(Reduce_Sinv_LT);
1752 
1753 
1754 
1755 #ifndef _RELEASE_
1756  PopCallStack();
1757 #endif
1758  //--------------------- End of reduce of LUpdateBuf-------------------------
1759 #ifndef _RELEASE_
1760  PushCallStack("PMatrix::SelInv_P2p::UpdateD");
1761 #endif
1762 
1763  TIMER_START(Update_Diagonal);
1764 
1765  for (Int supidx=0; supidx<stepSuper; supidx++){
1766  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1767 
1768  ComputeDiagUpdate(snode);
1769 
1770  //Get the reduction tree
1771  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
1772 
1773  if(redDTree != NULL){
1774  //send the data
1775  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1776  if(snode.DiagBuf.Size()==0){
1777  snode.DiagBuf.Resize( SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
1778  SetValue(snode.DiagBuf, ZERO<T>());
1779  }
1780  }
1781 
1782 
1783  redDTree->SetLocalBuffer(snode.DiagBuf.Data());
1784  if(!redDTree->IsAllocated()){
1785  redDTree->SetTag(IDX_TO_TAG(snode.Index,SELINV_TAG_D_REDUCE));
1786  redDTree->AllocRecvBuffers();
1787  //Post All Recv requests;
1788  redDTree->PostFirstRecv();
1789  }
1790 
1791  redDTree->SetDataReady(true);
1792  bool done = redDTree->Progress();
1793  }
1794 
1795  //advance reductions
1796  for (Int supidx=0; supidx<stepSuper; supidx++){
1797  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1798  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
1799  if(redDTree != NULL){
1800  if(redDTree->IsAllocated()){
1801  bool done = redDTree->Progress();
1802  }
1803  }
1804  }
1805  }
1806  TIMER_STOP(Update_Diagonal);
1807 
1808  TIMER_START(Reduce_Diagonal);
1809  //blocking wait for the reduction
1810  {
1811  vector<char> is_done(stepSuper,0);
1812  bool all_done = false;
1813  while(!all_done)
1814  {
1815  all_done = true;
1816 
1817  for (Int supidx=0; supidx<stepSuper; supidx++){
1818  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1819  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
1820 
1821  if(redDTree != NULL){
1822  bool done = redDTree->Progress();
1823  if(done && !is_done[supidx]){
1824  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1825  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1826  LBlock<T> & LB = this->L( LBj( snode.Index, grid_ ) )[0];
1827  // Symmetrize LB
1828  blas::Axpy( LB.numRow * LB.numCol, ONE<T>(), snode.DiagBuf.Data(), 1, LB.nzval.Data(), 1 );
1829  Symmetrize( LB.nzval );
1830  }
1831  }
1832 
1833  is_done[supidx]=1;
1834  redDTree->CleanupBuffers();
1835  }
1836 
1837  all_done = all_done && (done || is_done[supidx]);
1838  }
1839  }
1840  }
1841  }
1842  TIMER_STOP(Reduce_Diagonal);
1843 
1844 #ifndef _RELEASE_
1845  PopCallStack();
1846 #endif
1847 
1848 
1849 #ifndef _RELEASE_
1850  PushCallStack("PMatrix::SelInv_P2p::UpdateU");
1851 #endif
1852 
1853 
1854 
1855  SendRecvCD_UpdateU(arrSuperNodes, stepSuper);
1856 
1857 #ifndef _RELEASE_
1858  PopCallStack();
1859 #endif
1860 
1861 #ifndef _RELEASE_
1862  PushCallStack("PMatrix::SelInv_P2p::UpdateLFinal");
1863 #endif
1864 
1865  TIMER_START(Update_L);
1866  for (Int supidx=0; supidx<stepSuper; supidx++){
1867  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1868 
1869 #if ( _DEBUGlevel_ >= 1 )
1870  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Finish updating the L part by filling LUpdateBufReduced back to L" << std::endl << std::endl;
1871 #endif
1872 
1873  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && snode.LUpdateBuf.m() > 0 ){
1874  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1875  //Need to skip the diagonal block if present
1876  Int startBlock = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
1877  for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
1878  LBlock<T> & LB = Lcol[ib];
1879  lapack::Lacpy( 'A', LB.numRow, LB.numCol, &snode.LUpdateBuf( snode.RowLocalPtr[ib-startBlock], 0 ),
1880  snode.LUpdateBuf.m(), LB.nzval.Data(), LB.numRow );
1881  }
1882  } // Finish updating L
1883  } // for (snode.Index) : Main loop
1884  TIMER_STOP(Update_L);
1885 
1886 #ifndef _RELEASE_
1887  PopCallStack();
1888 #endif
1889 
1890 
1891  TIMER_START(Barrier);
1892 
1893 
1894  for (Int supidx=0; supidx<stepSuper; supidx++){
1895  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1896  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1897 
1898  if(redLTree != NULL){
1899  redLTree->Wait();
1900  redLTree->CleanupBuffers();
1901  }
1902  }
1903 
1904 
1905  for (Int supidx=0; supidx<stepSuper; supidx++){
1906  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1907  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
1908 
1909  if(redDTree != NULL){
1910  redDTree->Wait();
1911  redDTree->CleanupBuffers();
1912  }
1913  }
1914 
1915  mpi::Waitall(arrMpireqsRecvContentFromAny);
1916 
1917  for (Int supidx=0; supidx<stepSuper; supidx++){
1918  Int ksup = superList[lidx][supidx];
1919  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1920  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1921 
1922 #if ( _DEBUGlevel_ >= 1 )
1923  statusOFS<<"["<<ksup<<"] mpireqsSendToRight"<<std::endl;
1924 #endif
1925  mpi::Waitall( mpireqsSendToRight );
1926 #if ( _DEBUGlevel_ >= 1 )
1927  statusOFS<<"["<<ksup<<"] mpireqsSendToBelow"<<std::endl;
1928 #endif
1929  mpi::Waitall( mpireqsSendToBelow );
1930 
1931  }
1932 
1933 #if ( _DEBUGlevel_ >= 1 )
1934  statusOFS<<"barrier done"<<std::endl;
1935 #endif
1936  TIMER_STOP(Barrier);
1937 
1938 
1939 #ifdef LIST_BARRIER
1940 #ifndef ALL_BARRIER
1941  if (options_->maxPipelineDepth!=-1)
1942 #endif
1943  {
1944  MPI_Barrier(grid_->comm);
1945  }
1946 #endif
1947 
1948  }
1949 
1950  template<typename T>
1951  void PMatrix<T>::ConstructCommunicationPattern ( )
1952  {
1953  ConstructCommunicationPattern_P2p();
1954  } // ----- end of method PMatrix::ConstructCommunicationPattern -----
1955 
1956 
1957 
1958  template<typename T>
1959  void PMatrix<T>::ConstructCommunicationPattern_P2p ( )
1960  {
1961 #ifndef _RELEASE_
1962  PushCallStack("PMatrix::ConstructCommunicationPattern_P2p");
1963 #endif
1964 
1965 
1966  Int numSuper = this->NumSuper();
1967 
1968  TIMER_START(Allocate);
1969 
1970 #ifndef _RELEASE_
1971  PushCallStack( "Initialize the communication pattern" );
1972 #endif
1973 
1974 
1975  fwdToBelowTree_.resize(numSuper, NULL );
1976  fwdToRightTree_.resize(numSuper, NULL );
1977  redToLeftTree_.resize(numSuper, NULL );
1978  redToAboveTree_.resize(numSuper, NULL );
1979 
1980  isSendToBelow_.Resize(grid_->numProcRow, numSuper);
1981  isSendToRight_.Resize(grid_->numProcCol, numSuper);
1982  isSendToDiagonal_.Resize( numSuper );
1983  SetValue( isSendToBelow_, false );
1984  SetValue( isSendToRight_, false );
1985  SetValue( isSendToDiagonal_, false );
1986 
1987  isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
1988  SetValue( isSendToCrossDiagonal_, false );
1989  isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
1990  SetValue( isRecvFromCrossDiagonal_, false );
1991 
1992  isRecvFromAbove_.Resize( numSuper );
1993  isRecvFromLeft_.Resize( numSuper );
1994  isRecvFromBelow_.Resize( grid_->numProcRow, numSuper );
1995  SetValue( isRecvFromAbove_, false );
1996  SetValue( isRecvFromBelow_, false );
1997  SetValue( isRecvFromLeft_, false );
1998 #ifndef _RELEASE_
1999  PopCallStack();
2000 #endif
2001 
2002  TIMER_STOP(Allocate);
2003 
2004  TIMER_START(GetEtree);
2005  std::vector<Int> snodeEtree(this->NumSuper());
2006  GetEtree(snodeEtree);
2007  TIMER_STOP(GetEtree);
2008 
2009 
2010 
2011 #ifndef _RELEASE_
2012  PushCallStack( "Local column communication" );
2013 #endif
2014 #if ( _DEBUGlevel_ >= 1 )
2015  statusOFS << std::endl << "Local column communication" << std::endl;
2016 #endif
2017  // localColBlockRowIdx stores the nonzero block indices for each local block column.
2018  // The nonzero block indices include contributions from both L and U.
2019  // Dimension: numLocalBlockCol x numNonzeroBlock
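  // For example (hypothetical pattern): if the local block column of supernode 0
  // has nonzero blocks in block rows {0, 2, 5} coming from L and {0, 3} coming
  // from U, then localColBlockRowIdx[LBj(0,grid_)] = {0, 2, 3, 5} after the
  // allgather below.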
2020  std::vector<std::set<Int> > localColBlockRowIdx;
2021 
2022  localColBlockRowIdx.resize( this->NumLocalBlockCol() );
2023 
2024 
2025  TIMER_START(Column_communication);
2026 
2027 
2028  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2029  // All block columns perform independently
2030  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2031 
2032 
2033  // Communication
2034  std::vector<Int> tAllBlockRowIdx;
2035  std::vector<Int> & colBlockIdx = ColBlockIdx(LBj(ksup, grid_));
2036  TIMER_START(Allgatherv_Column_communication);
2037  if( grid_ -> mpisize != 1 )
2038  mpi::Allgatherv( colBlockIdx, tAllBlockRowIdx, grid_->colComm );
2039  else
2040  tAllBlockRowIdx = colBlockIdx;
2041 
2042  TIMER_STOP(Allgatherv_Column_communication);
2043 
2044  localColBlockRowIdx[LBj( ksup, grid_ )].insert(
2045  tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );
2046 
2047 #if ( _DEBUGlevel_ >= 1 )
2048  statusOFS
2049  << " Column block " << ksup
2050  << " has the following nonzero block rows" << std::endl;
2051  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
2052  si != localColBlockRowIdx[LBj( ksup, grid_ )].end();
2053  si++ ){
2054  statusOFS << *si << " ";
2055  }
2056  statusOFS << std::endl;
2057 #endif
2058 
2059  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2060  } // for(ksup)
2061 
2062 
2063 #ifndef _RELEASE_
2064  PopCallStack();
2065 #endif
2066 
2067  TIMER_STOP(Column_communication);
2068 
2069  TIMER_START(Row_communication);
2070 #ifndef _RELEASE_
2071  PushCallStack( "Local row communication" );
2072 #endif
2073 #if ( _DEBUGlevel_ >= 1 )
2074  statusOFS << std::endl << "Local row communication" << std::endl;
2075 #endif
2076  // localRowBlockColIdx stores the nonzero block indices for each local block row.
2077  // The nonzero block indices include contributions from both L and U.
2078  // Dimension: numLocalBlockRow x numNonzeroBlock
2079  std::vector<std::set<Int> > localRowBlockColIdx;
2080 
2081  localRowBlockColIdx.resize( this->NumLocalBlockRow() );
2082 
2083  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2084  // All block rows perform independently
2085  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2086 
2087  // Communication
2088  std::vector<Int> tAllBlockColIdx;
2089  std::vector<Int> & rowBlockIdx = RowBlockIdx(LBi(ksup, grid_));
2090  TIMER_START(Allgatherv_Row_communication);
2091  if( grid_ -> mpisize != 1 )
2092  mpi::Allgatherv( rowBlockIdx, tAllBlockColIdx, grid_->rowComm );
2093  else
2094  tAllBlockColIdx = rowBlockIdx;
2095 
2096  TIMER_STOP(Allgatherv_Row_communication);
2097 
2098  localRowBlockColIdx[LBi( ksup, grid_ )].insert(
2099  tAllBlockColIdx.begin(), tAllBlockColIdx.end() );
2100 
2101 #if ( _DEBUGlevel_ >= 1 )
2102  statusOFS
2103  << " Row block " << ksup
2104  << " has the following nonzero block columns" << std::endl;
2105  for( std::set<Int>::iterator si = localRowBlockColIdx[LBi( ksup, grid_ )].begin();
2106  si != localRowBlockColIdx[LBi( ksup, grid_ )].end();
2107  si++ ){
2108  statusOFS << *si << " ";
2109  }
2110  statusOFS << std::endl;
2111 #endif
2112 
2113  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
2114  } // for(ksup)
2115 
2116 #ifndef _RELEASE_
2117  PopCallStack();
2118 #endif
2119 
2120  TIMER_STOP(Row_communication);
2121 
2122  TIMER_START(STB_RFA);
2123 #ifndef _RELEASE_
2124  PushCallStack("SendToBelow / RecvFromAbove");
2125 #endif
2126  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2127  // Loop over all the supernodes to the right of ksup
2128  Int jsup = snodeEtree[ksup];
2129  while(jsup<numSuper){
2130  Int jsupLocalBlockCol = LBj( jsup, grid_ );
2131  Int jsupProcCol = PCOL( jsup, grid_ );
2132  if( MYCOL( grid_ ) == jsupProcCol ){
2133  // SendToBelow / RecvFromAbove only if (ksup, jsup) is nonzero.
2134  if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2135  for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2136  si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2137  Int isup = *si;
2138  Int isupProcRow = PROW( isup, grid_ );
2139  if( isup > ksup ){
2140  if( MYROW( grid_ ) == isupProcRow ){
2141  isRecvFromAbove_(ksup) = true;
2142  }
2143  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2144  isSendToBelow_( isupProcRow, ksup ) = true;
2145  }
2146  } // if( isup > ksup )
2147  } // for (si)
2148  } // if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 )
2149  } // if( MYCOL( grid_ ) == PCOL( jsup, grid_ ) )
2150  jsup = snodeEtree[jsup];
2151  } // for(jsup)
2152  } // for(ksup)
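  // snodeEtree[k] is the parent of supernode k in the supernodal elimination
  // tree, so the inner while loop above visits exactly the ancestors of ksup,
  // the only supernodes whose block columns can require the data of ksup.
  // A minimal sketch of the same walk (assuming roots are marked by an index
  // >= numSuper, as the while condition suggests):
  //   for( Int a = snodeEtree[ksup]; a < numSuper; a = snodeEtree[a] ){
  //     /* supernode a is an ancestor of ksup */
  //   }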
2153 #ifndef _RELEASE_
2154  PopCallStack();
2155 #endif
2156  TIMER_STOP(STB_RFA);
2157 
2158  TIMER_START(STR_RFL_RFB);
2159 #ifndef _RELEASE_
2160  PushCallStack("SendToRight / RecvFromLeft");
2161 #endif
2162  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2163  // Loop over all the supernodes below ksup
2164  Int isup = snodeEtree[ksup];
2165  while(isup<numSuper){
2166  Int isupLocalBlockRow = LBi( isup, grid_ );
2167  Int isupProcRow = PROW( isup, grid_ );
2168  if( MYROW( grid_ ) == isupProcRow ){
2169  // SendToRight / RecvFromLeft only if (isup, ksup) is nonzero.
2170  if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2171  for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
2172  si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
2173  Int jsup = *si;
2174  Int jsupProcCol = PCOL( jsup, grid_ );
2175  if( jsup > ksup ){
2176  if( MYCOL( grid_ ) == jsupProcCol ){
2177  isRecvFromLeft_(ksup) = true;
2178  }
2179  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2180  isSendToRight_( jsupProcCol, ksup ) = true;
2181  }
2182  }
2183  } // for (si)
2184  } // if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 )
2185  } // if( MYROW( grid_ ) == isupProcRow )
2186 
2187  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
2188  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2189  isRecvFromBelow_(isupProcRow,ksup) = true;
2190  }
2191  else if (MYROW(grid_) == isupProcRow){
2192  isSendToDiagonal_(ksup)=true;
2193  }
2194  } // if( MYCOL( grid_ ) == PCOL(ksup, grid_) )
2195  isup = snodeEtree[isup];
2196  } // for (isup)
2197  } // for (ksup)
2198 
2199 #ifndef _RELEASE_
2200  PopCallStack();
2201 #endif
2202  TIMER_STOP(STR_RFL_RFB);
2203 
2204 
2205  TIMER_START(BUILD_BCAST_TREES);
2206  //Allgather RFL values within column
2207 
2208  vector<double> SeedRFL(numSuper,0.0);
2209  vector<Int> aggRFL(numSuper);
2210  vector<Int> globalAggRFL(numSuper*grid_->numProcCol);
2211  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2212  if(MYCOL(grid_)==PCOL(ksup,grid_)){
2213  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
2214  // All blocks except for the diagonal block are to be sent right
2215 
2216  Int totalSize = 0;
2217  //one integer holding the number of Lblocks
2218  totalSize+=sizeof(Int);
2219  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2220  if( Lcol[ib].blockIdx > ksup ){
2221  //three indices + one IntNumVec
2222  totalSize+=3*sizeof(Int);
2223  totalSize+= sizeof(Int)+Lcol[ib].rows.ByteSize();
2224  }
2225  }
2226 
2227 
2228  aggRFL[ksup]=totalSize;
2229  SeedRFL[ksup]=rand();
2230 
2231  }
2232  else if(isRecvFromLeft_(ksup)){
2233  aggRFL[ksup]=1;
2234  }
2235  else{
2236  aggRFL[ksup]=0;
2237  }
2238  }
2239  // //allgather
2240  MPI_Allgather(&aggRFL[0],numSuper*sizeof(Int),MPI_BYTE,
2241  &globalAggRFL[0],numSuper*sizeof(Int),MPI_BYTE,
2242  grid_->rowComm);
2243  MPI_Allreduce(MPI_IN_PLACE,&SeedRFL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
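  // After these two collectives every process in the row knows, for each ksup,
  // which column ranks participate in the broadcast (entries > 0), the message
  // size stored by the owning column PCOL(ksup,grid_), and a common random seed,
  // so that all participants build identical broadcast trees below.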
2244 
2245  vector<double> SeedRFA(numSuper,0.0);
2246  vector<Int> aggRFA(numSuper);
2247  vector<Int> globalAggRFA(numSuper*grid_->numProcRow);
2248  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2249  if(MYROW(grid_)==PROW(ksup,grid_)){
2250  std::vector<UBlock<T> >& Urow = this->U( LBi(ksup, grid_) );
2251  // All the U blocks of this block row are to be sent down within the process column
2252 
2253  Int totalSize = 0;
2254  //one integer holding the number of Ublocks
2255  totalSize+=sizeof(Int);
2256  for( Int jb = 0; jb < Urow.size(); jb++ ){
2257  if( Urow[jb].blockIdx >= ksup ){
2258  //three indices + one IntNumVec + one NumMat<T>
2259  totalSize+=3*sizeof(Int);
2260  totalSize+= sizeof(Int)+Urow[jb].cols.ByteSize();
2261  totalSize+= 2*sizeof(Int)+Urow[jb].nzval.ByteSize();
2262  }
2263  }
2264  aggRFA[ksup]=totalSize;
2265  SeedRFA[ksup]=rand();
2266  }
2267  else if(isRecvFromAbove_(ksup)){
2268  aggRFA[ksup]=1;
2269  }
2270  else{
2271  aggRFA[ksup]=0;
2272  }
2273  }
2274 
2275 
2276  //allgather
2277  MPI_Allgather(&aggRFA[0],numSuper*sizeof(Int),MPI_BYTE,
2278  &globalAggRFA[0],numSuper*sizeof(Int),MPI_BYTE,
2279  grid_->colComm);
2280  MPI_Allreduce(MPI_IN_PLACE,&SeedRFA[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
2281 
2282  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2283  set<Int> set_ranks;
2284  Int msgSize = 0;
2285  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
2286  Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
2287  if(isRFA>0){
2288  if( iProcRow != PROW( ksup, grid_ ) ){
2289  set_ranks.insert(iProcRow);
2290  }
2291  else{
2292  msgSize = isRFA;
2293  }
2294  }
2295  }
2296 
2297  if( isRecvFromAbove_(ksup) || CountSendToBelow(ksup)>0 ){
2298  vector<Int> tree_ranks;
2299  tree_ranks.push_back(PROW(ksup,grid_));
2300  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2301  TreeBcast * & BcastUTree = fwdToBelowTree_[ksup];
2302  BcastUTree = TreeBcast::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
2303 #ifdef COMM_PROFILE
2304  BcastUTree->SetGlobalComm(grid_->comm);
2305 #endif
2306  }
2307 
2308  set_ranks.clear();
2309  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
2310  Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
2311  if(isRFL>0){
2312  if( iProcCol != PCOL( ksup, grid_ ) ){
2313  set_ranks.insert(iProcCol);
2314  }
2315  else{
2316  msgSize = isRFL;
2317  }
2318  }
2319  }
2320 
2321  if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
2322  vector<Int> tree_ranks;
2323  tree_ranks.push_back(PCOL(ksup,grid_));
2324  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2325 
2326  TreeBcast * & BcastLTree = fwdToRightTree_[ksup];
2327  BcastLTree = TreeBcast::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
2328 #ifdef COMM_PROFILE
2329  BcastLTree->SetGlobalComm(grid_->comm);
2330 #endif
2331  }
2332  }
2333 
2334  //do the same for the other arrays
2335  TIMER_STOP(BUILD_BCAST_TREES);
2336  TIMER_START(BUILD_REDUCE_D_TREE);
2337  vector<double> SeedSTD(numSuper,0.0);
2338  vector<Int> aggSTD(numSuper);
2339  vector<Int> globalAggSTD(numSuper*grid_->numProcRow);
2340  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2341  if( MYCOL( grid_ ) == PCOL(ksup, grid_) && MYROW(grid_)==PROW(ksup,grid_)){
2342  Int totalSize = sizeof(T)*SuperSize( ksup, super_ )*SuperSize( ksup, super_ );
2343  aggSTD[ksup]=totalSize;
2344  SeedSTD[ksup]=rand();
2345  }
2346  else if(isSendToDiagonal_(ksup)){
2347  aggSTD[ksup]=1;
2348  }
2349  else{
2350  aggSTD[ksup]=0;
2351  }
2352  }
2353 
2354 
2355  //allgather
2356  MPI_Allgather(&aggSTD[0],numSuper*sizeof(Int),MPI_BYTE,
2357  &globalAggSTD[0],numSuper*sizeof(Int),MPI_BYTE,
2358  grid_->colComm);
2359  MPI_Allreduce(MPI_IN_PLACE,&SeedSTD[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
2360 
2361 
2362  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2363  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
2364  set<Int> set_ranks;
2365  Int msgSize = 0;
2366  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
2367  Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
2368  if(isSTD>0){
2369  if( iProcRow != PROW( ksup, grid_ ) ){
2370  set_ranks.insert(iProcRow);
2371  }
2372  else{
2373  msgSize = isSTD;
2374  }
2375  }
2376  }
2377 
2378  Int amISTD = globalAggSTD[MYROW(grid_)*numSuper + ksup];
2379 
2380  // if( MYCOL( grid_ ) == PCOL(ksup, grid_) && MYROW(grid_)==PROW(ksup,grid_)){
2381  // assert(amISTD>0);
2382  // }
2383 
2384  if( amISTD ){
2385  vector<Int> tree_ranks;
2386  tree_ranks.push_back(PROW(ksup,grid_));
2387  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2388 
2389  //assert(set_ranks.find(MYROW(grid_))!= set_ranks.end() || MYROW(grid_)==tree_ranks[0]);
2390 
2391  TreeReduce<T> * & redDTree = redToAboveTree_[ksup];
2392 
2393 
2394  redDTree = TreeReduce<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedSTD[ksup]);
2395 #ifdef COMM_PROFILE
2396  redDTree->SetGlobalComm(grid_->comm);
2397 #endif
2398  }
2399  }
2400  }
2401 
2402  TIMER_STOP(BUILD_REDUCE_D_TREE);
2403 
2404 
2405 
2406  TIMER_START(BUILD_REDUCE_L_TREE);
2407  vector<double> SeedRTL(numSuper,0.0);
2408  vector<Int> aggRTL(numSuper);
2409  vector<Int> globalAggRTL(numSuper*grid_->numProcCol);
2410  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2411  if(MYCOL(grid_)==PCOL(ksup,grid_)){
2412  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
2413  // Size of the local contribution to the reduction of LUpdateBuf to the left
2414 
2415  Int totalSize = 0;
2416 
2417  //determine the number of rows in LUpdateBufReduced
2418  Int numRowLUpdateBuf = 0;
2419  if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
2420  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2421  numRowLUpdateBuf += Lcol[ib].numRow;
2422  }
2423  } // I do not own the diagonal block
2424  else{
2425  for( Int ib = 1; ib < Lcol.size(); ib++ ){
2426  numRowLUpdateBuf += Lcol[ib].numRow;
2427  }
2428  } // I own the diagonal block, skip the diagonal block
2429 
2430  //if(ksup==297){gdb_lock();}
2431  totalSize = numRowLUpdateBuf*SuperSize( ksup, super_ )*sizeof(T);
2432 
2433  aggRTL[ksup]=totalSize;
2434 
2435  SeedRTL[ksup]=rand();
2436  }
2437  else if(isRecvFromLeft_(ksup)){
2438  aggRTL[ksup]=1;
2439  }
2440  else{
2441  aggRTL[ksup]=0;
2442  }
2443  }
2444  // //allgather
2445  MPI_Allgather(&aggRTL[0],numSuper*sizeof(Int),MPI_BYTE,
2446  &globalAggRTL[0],numSuper*sizeof(Int),MPI_BYTE,
2447  grid_->rowComm);
2448  MPI_Allreduce(MPI_IN_PLACE,&SeedRTL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
2449 
2450 
2451  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2452  set<Int> set_ranks;
2453  Int msgSize = 0;
2454  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
2455  Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
2456  if(isRTL>0){
2457  if( iProcCol != PCOL( ksup, grid_ ) ){
2458  set_ranks.insert(iProcCol);
2459  }
2460  else{
2461  msgSize = isRTL;
2462  }
2463  }
2464  }
2465 
2466  if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
2467  vector<Int> tree_ranks;
2468  tree_ranks.push_back(PCOL(ksup,grid_));
2469  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2470 
2471  TreeReduce<T> * & redLTree = redToLeftTree_[ksup];
2472  redLTree = TreeReduce<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRTL[ksup]);
2473 #ifdef COMM_PROFILE
2474  redLTree->SetGlobalComm(grid_->comm);
2475 #endif
2476  }
2477  }
2478 
2479  TIMER_STOP(BUILD_REDUCE_L_TREE);
2480 
2481 
2482 
2483 
2484 
2485 
2486 
2487 #if ( _DEBUGlevel_ >= 1 )
2488  statusOFS << std::endl << "isSendToBelow:" << std::endl;
2489  for(int j = 0;j< isSendToBelow_.n();j++){
2490  statusOFS << "["<<j<<"] ";
2491  for(int i =0; i < isSendToBelow_.m();i++){
2492  statusOFS<< isSendToBelow_(i,j) << " ";
2493  }
2494  statusOFS<<std::endl;
2495  }
2496 
2497  statusOFS << std::endl << "isRecvFromAbove:" << std::endl;
2498  for(int j = 0;j< isRecvFromAbove_.m();j++){
2499  statusOFS << "["<<j<<"] "<< isRecvFromAbove_(j)<<std::endl;
2500  }
2501 #endif
2502 #if ( _DEBUGlevel_ >= 1 )
2503  statusOFS << std::endl << "isSendToRight:" << std::endl;
2504  for(int j = 0;j< isSendToRight_.n();j++){
2505  statusOFS << "["<<j<<"] ";
2506  for(int i =0; i < isSendToRight_.m();i++){
2507  statusOFS<< isSendToRight_(i,j) << " ";
2508  }
2509  statusOFS<<std::endl;
2510  }
2511 
2512  statusOFS << std::endl << "isRecvFromLeft:" << std::endl;
2513  for(int j = 0;j< isRecvFromLeft_.m();j++){
2514  statusOFS << "["<<j<<"] "<< isRecvFromLeft_(j)<<std::endl;
2515  }
2516 
2517  statusOFS << std::endl << "isRecvFromBelow:" << std::endl;
2518  for(int j = 0;j< isRecvFromBelow_.n();j++){
2519  statusOFS << "["<<j<<"] ";
2520  for(int i =0; i < isRecvFromBelow_.m();i++){
2521  statusOFS<< isRecvFromBelow_(i,j) << " ";
2522  }
2523  statusOFS<<std::endl;
2524  }
2525 #endif
2526 
2527 
2528 
2529 
2530 
2531 
2532 
2533  TIMER_START(STCD_RFCD);
2534 
2535 
2536 #ifndef _RELEASE_
2537  PushCallStack("SendToCrossDiagonal / RecvFromCrossDiagonal");
2538 #endif
2539  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2540  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2541  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
2542  si != localColBlockRowIdx[LBj( ksup, grid_ )].end(); si++ ){
2543  Int isup = *si;
2544  Int isupProcRow = PROW( isup, grid_ );
2545  Int isupProcCol = PCOL( isup, grid_ );
2546  if( isup > ksup && MYROW( grid_ ) == isupProcRow ){
2547  isSendToCrossDiagonal_(grid_->numProcCol, ksup ) = true;
2548  isSendToCrossDiagonal_(isupProcCol, ksup ) = true;
2549  }
2550  } // for (si)
2551  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2552  } // for (ksup)
2553 
2554  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2555  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2556  for( std::set<Int>::iterator si = localRowBlockColIdx[ LBi(ksup, grid_) ].begin();
2557  si != localRowBlockColIdx[ LBi(ksup, grid_) ].end(); si++ ){
2558  Int jsup = *si;
2559  Int jsupProcCol = PCOL( jsup, grid_ );
2560  Int jsupProcRow = PROW( jsup, grid_ );
2561  if( jsup > ksup && MYCOL(grid_) == jsupProcCol ){
2562  isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) = true;
2563  isRecvFromCrossDiagonal_(jsupProcRow, ksup ) = true;
2564  }
2565  } // for (si)
2566  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
2567  } // for (ksup)
2568 #if ( _DEBUGlevel_ >= 1 )
2569  statusOFS << std::endl << "isSendToCrossDiagonal:" << std::endl;
2570  for(int j =0; j < isSendToCrossDiagonal_.n();j++){
2571  if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
2572  statusOFS << "["<<j<<"] ";
2573  for(int i =0; i < isSendToCrossDiagonal_.m()-1;i++){
2574  if(isSendToCrossDiagonal_(i,j))
2575  {
2576  statusOFS<< PNUM(PROW(j,grid_),i,grid_)<<" ";
2577  }
2578  }
2579  statusOFS<<std::endl;
2580  }
2581  }
2582 
2583  statusOFS << std::endl << "isRecvFromCrossDiagonal:" << std::endl;
2584  for(int j =0; j < isRecvFromCrossDiagonal_.n();j++){
2585  if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
2586  statusOFS << "["<<j<<"] ";
2587  for(int i =0; i < isRecvFromCrossDiagonal_.m()-1;i++){
2588  if(isRecvFromCrossDiagonal_(i,j))
2589  {
2590  statusOFS<< PNUM(i,PCOL(j,grid_),grid_)<<" ";
2591  }
2592  }
2593  statusOFS<<std::endl;
2594  }
2595  }
2596 
2597 
2598 #endif
2599 
2600 #ifndef _RELEASE_
2601  PopCallStack();
2602 #endif
2603 
2604  TIMER_STOP(STCD_RFCD);
2605 
2606 #ifndef _RELEASE_
2607  PopCallStack();
2608 #endif
2609 
2610  //Build the list of supernodes based on the elimination tree from SuperLU
2611  GetWorkSet(snodeEtree,this->WorkingSet());
2612 
2613  return ;
2614  } // ----- end of method PMatrix::ConstructCommunicationPattern_P2p -----
2615 
2616  template<typename T>
2617  void PMatrix<T>::SelInv ( )
2618  {
2619 
2620  SelInv_P2p ( );
2621 
2622 
2623 #ifdef GEMM_PROFILE
2624  statOFS<<"m"<<"\t"<<"n"<<"\t"<<"z"<<std::endl;
2625  for(auto it = gemm_stat.begin(); it!=gemm_stat.end(); it+=3){
2626  statOFS<<*it<<"\t"<<*(it+1)<<"\t"<<*(it+2)<<std::endl;
2627  }
2628 #endif
2629 
2630 #ifdef COMM_PROFILE
2631  //std::cout<<"DUMPING COMM STATS "<<comm_stat.size()<<" "<<std::endl;
2632  commOFS<<HEADER_COMM<<std::endl;
2633  for(auto it = comm_stat.begin(); it!=comm_stat.end(); it+=4){
2634  commOFS<<LINE_COMM(it)<<std::endl;
2635  }
2636 #endif
2637  } // ----- end of method PMatrix::SelInv -----
2638 
2639 
2640 
2641  template<typename T>
2642  void PMatrix<T>::SelInv_P2p ( )
2643  {
2644  TIMER_START(SelInv_P2p);
2645 
2646 #ifndef _RELEASE_
2647  PushCallStack("PMatrix::SelInv_P2p");
2648 #endif
2649 
2650 
2651  Int numSuper = this->NumSuper();
2652 
2653  // Main loop
2654  std::vector<std::vector<Int> > & superList = this->WorkingSet();
2655  Int numSteps = superList.size();
2656 
2657  for (Int lidx=0; lidx<numSteps ; lidx++){
2658  Int stepSuper = superList[lidx].size();
2659 
2660  SelInvIntra_P2p(lidx);
2661 
2662  // if(lidx==1){ return;};
2663  }
2664 
2665 #ifndef _RELEASE_
2666  PopCallStack();
2667 #endif
2668 
2669  TIMER_STOP(SelInv_P2p);
2670 
2671  return ;
2672  } // ----- end of method PMatrix::SelInv_P2p -----
2673 
2674 
2675 
2676  template<typename T>
2677  void PMatrix<T>::PreSelInv ( )
2678  {
2679 #ifndef _RELEASE_
2680  PushCallStack("PMatrix::PreSelInv");
2681 #endif
2682 
2683  Int numSuper = this->NumSuper();
2684 
2685 #ifndef _RELEASE_
2686  PushCallStack("L(i,k) <- L(i,k) * L(k,k)^{-1}");
2687 #endif
2688 #if ( _DEBUGlevel_ >= 1 )
2689  statusOFS << std::endl << "L(i,k) <- L(i,k) * L(k,k)^{-1}" << std::endl << std::endl;
2690 #endif
2691  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2692  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2693  // Broadcast the diagonal L block
2694  NumMat<T> nzvalLDiag;
2695  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
2696  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2697  nzvalLDiag = Lcol[0].nzval;
2698  if( nzvalLDiag.m() != SuperSize(ksup, super_) ||
2699  nzvalLDiag.n() != SuperSize(ksup, super_) ){
2700 #ifdef USE_ABORT
2701  abort();
2702 #endif
2703  throw std::runtime_error( "The size of the diagonal block of L is wrong." );
2704  }
2705  } // Owns the diagonal block
2706  else
2707  {
2708  nzvalLDiag.Resize( SuperSize(ksup, super_), SuperSize(ksup, super_) );
2709  }
2710  MPI_Bcast( (void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
2711  MPI_BYTE, PROW( ksup, grid_ ), grid_->colComm );
2712 
2713  // Triangular solve
2714  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2715  LBlock<T> & LB = Lcol[ib];
2716  if( LB.blockIdx > ksup ){
2717 #if ( _DEBUGlevel_ >= 2 )
2718  // Check the correctness of the triangular solve for the first local column
2719  if( LBj( ksup, grid_ ) == 0 ){
2720  statusOFS << "Diag L(" << ksup << ", " << ksup << "): " << nzvalLDiag << std::endl;
2721  statusOFS << "Before solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
2722  }
2723 #endif
2724  blas::Trsm( 'R', 'L', 'N', 'U', LB.numRow, LB.numCol, ONE<T>(),
2725  nzvalLDiag.Data(), LB.numCol, LB.nzval.Data(), LB.numRow );
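  // Right-sided solve with the unit lower-triangular diagonal block:
  // LB.nzval <- LB.nzval * L(ksup,ksup)^{-1}, which is the
  // L(i,k) <- L(i,k) * L(k,k)^{-1} step announced above.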
2726 #if ( _DEBUGlevel_ >= 2 )
2727  // Check the correctness of the triangular solve for the first local column
2728  if( LBj( ksup, grid_ ) == 0 ){
2729  statusOFS << "After solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
2730  }
2731 #endif
2732  }
2733  }
2734  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2735  } // for (ksup)
2736 
2737 
2738 #ifndef _RELEASE_
2739  PopCallStack();
2740 #endif
2741 
2742 
2743 #ifndef _RELEASE_
2744  PushCallStack("U(k,i) <- L(i,k)");
2745 #endif
2746 #if ( _DEBUGlevel_ >= 1 )
2747  statusOFS << std::endl << "U(k,i) <- L(i,k)" << std::endl << std::endl;
2748 #endif
2749 
2750  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2751  Int ksupProcRow = PROW( ksup, grid_ );
2752  Int ksupProcCol = PCOL( ksup, grid_ );
2753 
2754  Int sendCount = CountSendToCrossDiagonal(ksup);
2755  Int recvCount = CountRecvFromCrossDiagonal(ksup);
2756 
2757  std::vector<MPI_Request > arrMpiReqsSend(sendCount, MPI_REQUEST_NULL );
2758  std::vector<MPI_Request > arrMpiReqsSizeSend(sendCount, MPI_REQUEST_NULL );
2759  std::vector<std::vector<char> > arrSstrLcolSend(sendCount);
2760  std::vector<Int > arrSstrLcolSizeSend(sendCount);
2761 
2762  std::vector<MPI_Request > arrMpiReqsRecv(recvCount, MPI_REQUEST_NULL );
2763  std::vector<MPI_Request > arrMpiReqsSizeRecv(recvCount, MPI_REQUEST_NULL );
2764  std::vector<std::vector<char> > arrSstrLcolRecv(recvCount);
2765  std::vector<Int > arrSstrLcolSizeRecv(recvCount);
2766 
2767 
2768 
2769  // Sender
2770  if( isSendToCrossDiagonal_(grid_->numProcCol,ksup) ){
2771 #if ( _DEBUGlevel_ >= 1 )
2772  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" should send to "<<CountSendToCrossDiagonal(ksup)<<" processors"<<std::endl;
2773 #endif
2774 
2775  Int sendIdx = 0;
2776  for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
2777  if(isSendToCrossDiagonal_(dstCol,ksup) ){
2778  Int dst = PNUM(PROW(ksup,grid_),dstCol,grid_);
2779  if(MYPROC(grid_)!= dst){
2780  // Pack L data
2781  std::stringstream sstm;
2782  std::vector<char> & sstrLcolSend = arrSstrLcolSend[sendIdx];
2783  Int & sstrSize = arrSstrLcolSizeSend[sendIdx];
2784  MPI_Request & mpiReqSend = arrMpiReqsSend[sendIdx];
2785  MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSend[sendIdx];
2786 
2787  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2788  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
2789  // The blocks below the diagonal are sent to the cross-diagonal processors
2790  //TODO this is really a scatter operation; can we know the destination in advance?
2791 
2792  Int count = 0;
2793  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2794  for( Int ib = 1; ib < Lcol.size(); ib++ ){
2795  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2796  count++;
2797  }
2798  }
2799  }
2800  else{
2801 
2802  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2803  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2804  count++;
2805  }
2806  }
2807  }
2808 
2809  serialize( (Int)count, sstm, NO_MASK );
2810 
2811  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2812  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2813 #if ( _DEBUGlevel_ >= 1 )
2814  statusOFS<<"["<<ksup<<"] SEND contains "<<Lcol[ib].blockIdx<< " which corresponds to "<<GBj(ib,grid_)<<std::endl;
2815 #endif
2816  serialize( Lcol[ib], sstm, mask );
2817  }
2818  }
2819 
2820  sstrLcolSend.resize( Size(sstm) );
2821  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
2822  sstrSize = sstrLcolSend.size();
2823 
2824 
2825 
2826  // Send/Recv is possible here due to the one-to-one correspondence
2827  // in the case of a square processor grid
2828 
2829 #if ( _DEBUGlevel_ >= 1 )
2830  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") ---> LBj("<<ksup<<")="<<LBj(ksup,grid_)<<" ---> P"<<dst<<" ("<<ksupProcRow<<","<<dstCol<<")"<<std::endl;
2831 #endif
2832  MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dst, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeSend );
2833  MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dst, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqSend );
2834 
2835 
2836  PROFILE_COMM(MYPROC(this->grid_),dst,SELINV_TAG_D_SIZE,sizeof(sstrSize));
2837  PROFILE_COMM(MYPROC(this->grid_),dst,SELINV_TAG_D_CONTENT,sstrSize);
2838 
2839  //mpi::Send( sstm, dst,SELINV_TAG_D_SIZE, SELINV_TAG_D_CONTENT, grid_->comm );
2840 
2841  sendIdx++;
2842  } // if I am a sender
2843  }
2844  }
2845  }
2846 
2847 
2848 
2849 
2850 
2851  // Receiver
2852  if( isRecvFromCrossDiagonal_(grid_->numProcRow,ksup) ){
2853 
2854 
2855 #if ( _DEBUGlevel_ >= 1 )
2856  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" should receive from "<<CountRecvFromCrossDiagonal(ksup)<<" processors"<<std::endl;
2857 #endif
2858 
2859 
2860  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
2861  std::vector<bool> isBlockFound(Urow.size(),false);
2862 
2863 
2864  Int recvIdx = 0;
2865  //receive size first
2866  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
2867  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
2868  std::vector<LBlock<T> > LcolRecv;
2869  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
2870  if(MYPROC(grid_)!= src){
2871  MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecv[recvIdx];
2872  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2873 
2874  MPI_Irecv( &sstrSize, 1, MPI_INT, src, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeRecv );
2875 
2876  recvIdx++;
2877  }
2878  }
2879  }
2880 
2881  mpi::Waitall(arrMpiReqsSizeRecv);
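  // The cross-diagonal exchange uses a two-message protocol: the sizes are
  // received first (SELINV_TAG_D_SIZE) so that the buffers can be resized
  // before the serialized L blocks arrive (SELINV_TAG_D_CONTENT).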
2882 
2883 
2884 
2885  //receive content
2886  recvIdx = 0;
2887  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
2888  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
2889  std::vector<LBlock<T> > LcolRecv;
2890  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
2891  if(MYPROC(grid_)!= src){
2892  MPI_Request & mpiReqRecv = arrMpiReqsRecv[recvIdx];
2893  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2894  std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
2895  sstrLcolRecv.resize(sstrSize);
2896 
2897  MPI_Irecv( &sstrLcolRecv[0], sstrSize, MPI_BYTE, src, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqRecv );
2898 
2899  recvIdx++;
2900  }
2901  }
2902  }
2903 
2904  mpi::Waitall(arrMpiReqsRecv);
2905 
2906 
2907 
2908  //Process the content
2909  recvIdx = 0;
2910  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
2911  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
2912  std::vector<LBlock<T> > LcolRecv;
2913  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
2914  if(MYPROC(grid_)!= src){
2915 
2916  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2917  std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
2918  std::stringstream sstm;
2919 
2920 #if ( _DEBUGlevel_ >= 1 )
2921  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<ksup<<") <--- P"<<src<<" ("<<srcRow<<","<<ksupProcCol<<")"<<std::endl;
2922 #endif
2923 
2924 
2925  sstm.write( &sstrLcolRecv[0], sstrSize );
2926 
2927  // Unpack L data.
2928  Int numLBlock;
2929  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2930  deserialize( numLBlock, sstm, NO_MASK );
2931  LcolRecv.resize(numLBlock);
2932  for( Int ib = 0; ib < numLBlock; ib++ ){
2933  deserialize( LcolRecv[ib], sstm, mask );
2934 #if ( _DEBUGlevel_ >= 1 )
2935  statusOFS<<"["<<ksup<<"] RECV contains "<<LcolRecv[ib].blockIdx<< " which corresponds to "<< ib * grid_->numProcRow + srcRow; // <<std::endl;
2936  // statusOFS<<" L is on row "<< srcRow <<" whereas U is on col "<<((ib * grid_->numProcRow + srcRow)/grid_->numProcCol)%grid_->numProcCol <<std::endl;
2937  statusOFS<<" L is on row "<< srcRow <<" whereas U is on col "<< LcolRecv[ib].blockIdx % grid_->numProcCol <<std::endl;
2938 #endif
2939  }
2940 
2941 
2942  recvIdx++;
2943 
2944  } // sender is not the same as receiver
2945  else{
2946  // L is obtained locally, just make a copy. Do not include the diagonal block
2947  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
2948  if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
2949  LcolRecv.resize( Lcol.size() );
2950  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2951  LcolRecv[ib] = Lcol[ib];
2952  }
2953  }
2954  else{
2955  LcolRecv.resize( Lcol.size() - 1 );
2956  for( Int ib = 0; ib < Lcol.size() - 1; ib++ ){
2957  LcolRecv[ib] = Lcol[ib+1];
2958  }
2959  }
2960  } // sender is the same as receiver
2961 
2962  // Update U
2963  // Make sure that the size of L and the corresponding U blocks match.
2964  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
2965  LBlock<T> & LB = LcolRecv[ib];
2966  if( LB.blockIdx <= ksup ){
2967 #ifdef USE_ABORT
2968  abort();
2969 #endif
2970  throw std::logic_error( "LcolRecv contains the wrong blocks." );
2971  }
2972  for( Int jb = 0; jb < Urow.size(); jb++ ){
2973  UBlock<T> & UB = Urow[jb];
2974  if( LB.blockIdx == UB.blockIdx ){
2975  // Compare size
2976  if( LB.numRow != UB.numCol || LB.numCol != UB.numRow ){
2977  std::ostringstream msg;
2978  msg << "LB(" << LB.blockIdx << ", " << ksup << ") and UB("
2979  << ksup << ", " << UB.blockIdx << ") do not share the same size." << std::endl
2980  << "LB: " << LB.numRow << " x " << LB.numCol << std::endl
2981  << "UB: " << UB.numRow << " x " << UB.numCol << std::endl;
2982 #ifdef USE_ABORT
2983  abort();
2984 #endif
2985  throw std::runtime_error( msg.str().c_str() );
2986  }
2987 
2988  // Note that the order of the column indices of the U
2989  // block may not follow the order of the row indices of L,
2990  // so overwrite the index information in U.
2991  UB.cols = LB.rows;
2992  Transpose( LB.nzval, UB.nzval );
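  // The net effect of the cross-diagonal exchange is U(ksup, i) <- L(i, ksup)^T,
  // with UB.cols copied from LB.rows so the (possibly unsorted) index order
  // stays consistent with the transposed values.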
2993 
2994 #if ( _DEBUGlevel_ >= 1 )
2995  statusOFS<<"["<<ksup<<"] USING "<<LB.blockIdx<< std::endl;
2996 #endif
2997  isBlockFound[jb] = true;
2998  break;
2999  } // if( LB.blockIdx == UB.blockIdx )
3000  } // for (jb)
3001  } // for (ib)
3002  }
3003  }
3004 
3005  for( Int jb = 0; jb < Urow.size(); jb++ ){
3006  UBlock<T> & UB = Urow[jb];
3007  if( !isBlockFound[jb] ){
3008 #ifdef USE_ABORT
3009  abort();
3010 #endif
3011  throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
3012  }
3013  }
3014 
3015 
3016 
3017  } // if I am a receiver
3018 
3019 
3020  //Wait until all the outstanding sends have completed
3021  mpi::Waitall(arrMpiReqsSizeSend);
3022  mpi::Waitall(arrMpiReqsSend);
3023 
3024 
3025  } // for (ksup)
3026 
3027 #ifndef _RELEASE_
3028  PopCallStack();
3029 #endif
3030 
3031 #ifndef _RELEASE_
3032  PushCallStack("L(i,i) <- [L(k,k) * U(k,k)]^{-1} ");
3033 #endif
3034 #if ( _DEBUGlevel_ >= 1 )
3035  statusOFS << std::endl << "L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl << std::endl;
3036 #endif
3037 
3038  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3039  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
3040  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3041  IntNumVec ipiv( SuperSize( ksup, super_ ) );
3042  // Note that the pivoting vector ipiv should follow the FORTRAN
3043  // notation (1-based), hence the +1
3044  for(Int i = 0; i < SuperSize( ksup, super_ ); i++){
3045  ipiv[i] = i + 1;
3046  }
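  // With ipiv = {1, 2, ..., n} no row interchanges are applied, i.e. Getri
  // inverts the already factorized diagonal block in place.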
3047  LBlock<T> & LB = (this->L( LBj( ksup, grid_ ) ))[0];
3048 #if ( _DEBUGlevel_ >= 2 )
3049  // Check the correctness of the matrix inversion for the first local column
3050  statusOFS << "Factorized A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
3051 #endif
3052  lapack::Getri( SuperSize( ksup, super_ ), LB.nzval.Data(),
3053  SuperSize( ksup, super_ ), ipiv.Data() );
3054 
3055  // Symmetrize the diagonal block
3056  Symmetrize( LB.nzval );
3057 
3058 #if ( _DEBUGlevel_ >= 2 )
3059  // Check the correctness of the matrix inversion for the first local column
3060  statusOFS << "Inverted A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
3061 #endif
3062  } // if I need to inverse the diagonal block
3063  } // for (ksup)
3064 
3065 
3066 #ifndef _RELEASE_
3067  PopCallStack();
3068 #endif
3069 
3070 
3071 
3072 #ifndef _RELEASE_
3073  PopCallStack();
3074 #endif
3075 
3076  return ;
3077  } // ----- end of method PMatrix::PreSelInv -----
3078 
3079  template<typename T>
3080  void PMatrix<T>::GetDiagonal ( NumVec<T>& diag )
3081  {
3082 #ifndef _RELEASE_
3083  PushCallStack("PMatrix::GetDiagonal");
3084 #endif
3085  Int numSuper = this->NumSuper();
3086 
3087  Int numCol = this->NumCol();
3088  const IntNumVec& perm = super_->perm;
3089  const IntNumVec& permInv = super_->permInv;
3090 
3091  const IntNumVec * pPerm_r;
3092  const IntNumVec * pPermInv_r;
3093 
3094  pPerm_r = &super_->perm_r;
3095  pPermInv_r = &super_->permInv_r;
3096 
3097  const IntNumVec& perm_r = *pPerm_r;
3098  const IntNumVec& permInv_r = *pPermInv_r;
3099 
3100 
3101  NumVec<T> diagLocal( numCol );
3102  SetValue( diagLocal, ZERO<T>() );
3103 
3104  diag.Resize( numCol );
3105  SetValue( diag, ZERO<T>() );
3106 
3107  for( Int orow = 0; orow < numCol; orow++){
3108  //row index in the permuted order
3109  Int row = perm[ orow ];
3110  //col index in the permuted order
3111  Int col = perm[ perm_r[ orow] ];
3112 
3113  Int blockColIdx = BlockIdx( col, super_ );
3114  Int blockRowIdx = BlockIdx( row, super_ );
3115 
3116 
3117  // I own the block containing this (row, col) entry
3118  if( MYROW( grid_ ) == PROW( blockRowIdx, grid_ ) &&
3119  MYCOL( grid_ ) == PCOL( blockColIdx, grid_ ) ){
3120  // Search for the nzval
3121  bool isFound = false;
3122 
3123  if( blockColIdx <= blockRowIdx ){
3124  // Data on the L side
3125 
3126  std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
3127 
3128  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3129 #if ( _DEBUGlevel_ >= 1 )
3130  statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
3131 #endif
3132  if( Lcol[ib].blockIdx == blockRowIdx ){
3133  IntNumVec& rows = Lcol[ib].rows;
3134  for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
3135  if( rows[iloc] == row ){
3136  Int jloc = col - FirstBlockCol( blockColIdx, super_ );
3137 
3138  diagLocal[ orow ] = Lcol[ib].nzval( iloc, jloc );
3139 
3140 
3141  isFound = true;
3142  break;
3143  } // found the corresponding row
3144  }
3145  }
3146  if( isFound == true ) break;
3147  } // for (ib)
3148  }
3149  else{
3150  // Data on the U side
3151 
3152  std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
3153 
3154  for( Int jb = 0; jb < Urow.size(); jb++ ){
3155  if( Urow[jb].blockIdx == blockColIdx ){
3156  IntNumVec& cols = Urow[jb].cols;
3157  for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
3158  if( cols[jloc] == col ){
3159  Int iloc = row - FirstBlockRow( blockRowIdx, super_ );
3160 
3161  diagLocal[ orow ] = Urow[jb].nzval( iloc, jloc );
3162 
3163  isFound = true;
3164  break;
3165  } // found the corresponding col
3166  }
3167  }
3168  if( isFound == true ) break;
3169  } // for (jb)
3170  } // if( blockColIdx <= blockRowIdx )
3171 
3172  // Did not find the corresponding row, set the value to zero.
3173  if( isFound == false ){
3174  statusOFS << "In the permuted order, (" << row << ", " << col <<
3175  ") is not found in PMatrix." << std::endl;
3176  diagLocal[orow] = ZERO<T>();
3177  }
3178  }
3179  }
3180 
3181 // //TODO This doesn't work with row perm
3182 // for( Int ksup = 0; ksup < numSuper; ksup++ ){
3183 // Int numRows = SuperSize(ksup, this->super_);
3184 //
3185 // // I own the diagonal block
3186 // if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
3187 // MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3188 // LBlock<T> & LB = this->L( LBj( ksup, grid_ ) )[0];
3189 // for( Int i = 0; i < LB.numRow; i++ ){
3190 // diagLocal( permInv[ LB.rows(i) ] ) = LB.nzval( i, perm[permInv_r[i]] );
3191 // }
3192 // }
3193 // }
3194 
3195  // All processors own diag
3196  mpi::Allreduce( diagLocal.Data(), diag.Data(), numCol, MPI_SUM, grid_->comm );
3197 
3198 #ifndef _RELEASE_
3199  PopCallStack();
3200 #endif
3201 
3202  return ;
3203  } // ----- end of method PMatrix::GetDiagonal -----
3204 
3205 
3206 
3207  template<typename T>
3208  void PMatrix<T>::PMatrixToDistSparseMatrix ( DistSparseMatrix<T>& A )
3209  {
3210 #ifndef _RELEASE_
3211  PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
3212 #endif
3213 #if ( _DEBUGlevel_ >= 1 )
3214  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix." << std::endl;
3215 #endif
3216  Int mpirank = grid_->mpirank;
3217  Int mpisize = grid_->mpisize;
3218 
3219  std::vector<Int> rowSend( mpisize );
3220  std::vector<Int> colSend( mpisize );
3221  std::vector<T> valSend( mpisize );
3222  std::vector<Int> sizeSend( mpisize, 0 );
3223  std::vector<Int> displsSend( mpisize, 0 );
3224 
3225  std::vector<Int> rowRecv( mpisize );
3226  std::vector<Int> colRecv( mpisize );
3227  std::vector<T> valRecv( mpisize );
3228  std::vector<Int> sizeRecv( mpisize, 0 );
3229  std::vector<Int> displsRecv( mpisize, 0 );
3230 
3231  Int numSuper = this->NumSuper();
3232  const IntNumVec& perm = super_->perm;
3233  const IntNumVec& permInv = super_->permInv;
3234 
3235  const IntNumVec * pPerm_r;
3236  const IntNumVec * pPermInv_r;
3237 
3238  pPerm_r = &super_->perm_r;
3239  pPermInv_r = &super_->permInv_r;
3240 
3241  const IntNumVec& perm_r = *pPerm_r;
3242  const IntNumVec& permInv_r = *pPermInv_r;
3243 
3244  // The number of local columns in DistSparseMatrix format for the
3245  // processor with rank 0. This number is the same for processors
3246  // with rank ranging from 0 to mpisize - 2, and may or may not differ
3247  // from the number of local columns for processor with rank mpisize -
3248  // 1.
3249  Int numColFirst = this->NumCol() / mpisize;
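  // A minimal example (hypothetical sizes): with NumCol() = 10 and mpisize = 4,
  // numColFirst = 2, so ranks 0-2 own 2 columns each and the last rank owns the
  // remaining 10 - 2*3 = 4 columns.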
3250 
3251  // Count the size first.
3252  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3253  // L blocks
3254  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3255  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3256  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3257  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3258  Int jcol = permInv[ permInv_r[ j + FirstBlockCol( ksup, super_ ) ] ];
3259  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3260  sizeSend[dest] += Lcol[ib].numRow;
3261  }
3262  }
3263  } // I own the column of ksup
3264 
3265  // U blocks
3266  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3267  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3268  for( Int jb = 0; jb < Urow.size(); jb++ ){
3269  IntNumVec& cols = Urow[jb].cols;
3270  for( Int j = 0; j < cols.m(); j++ ){
3271  Int jcol = permInv[ permInv_r[ cols[j] ] ];
3272  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3273  sizeSend[dest] += Urow[jb].numRow;
3274  }
3275  }
3276  } // I own the row of ksup
3277  } // for (ksup)
3278 
3279  // All-to-all exchange of size information
3280  MPI_Alltoall(
3281  &sizeSend[0], 1, MPI_INT,
3282  &sizeRecv[0], 1, MPI_INT, grid_->comm );
3283 
3284 
3285 
3286  // Reserve the space
3287  for( Int ip = 0; ip < mpisize; ip++ ){
3288  if( ip == 0 ){
3289  displsSend[ip] = 0;
3290  }
3291  else{
3292  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3293  }
3294 
3295  if( ip == 0 ){
3296  displsRecv[ip] = 0;
3297  }
3298  else{
3299  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3300  }
3301  }
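  // displsSend / displsRecv are the exclusive prefix sums of sizeSend / sizeRecv,
  // i.e. the offsets of each rank's segment inside the packed (row, col, val) buffers.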
3302  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3303  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3304 
3305  rowSend.resize( sizeSendTotal );
3306  colSend.resize( sizeSendTotal );
3307  valSend.resize( sizeSendTotal );
3308 
3309  rowRecv.resize( sizeRecvTotal );
3310  colRecv.resize( sizeRecvTotal );
3311  valRecv.resize( sizeRecvTotal );
3312 
3313 #if ( _DEBUGlevel_ >= 1 )
3314  statusOFS << "displsSend = " << displsSend << std::endl;
3315  statusOFS << "displsRecv = " << displsRecv << std::endl;
3316 #endif
3317 
3318  // Put (row, col, val) to the sending buffer
3319  std::vector<Int> cntSize( mpisize, 0 );
3320 
3321  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3322  // L blocks
3323  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3324  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3325  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3326  IntNumVec& rows = Lcol[ib].rows;
3327  NumMat<T>& nzval = Lcol[ib].nzval;
3328  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3329  Int jcol = permInv[ permInv_r[ j + FirstBlockCol( ksup, super_ ) ] ];
3330  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3331  for( Int i = 0; i < rows.m(); i++ ){
3332  rowSend[displsSend[dest] + cntSize[dest]] = permInv[ rows[i] ];
3333  colSend[displsSend[dest] + cntSize[dest]] = jcol;
3334  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3335  cntSize[dest]++;
3336  }
3337  }
3338  }
3339  } // I own the column of ksup
3340 
3341  // U blocks
3342  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3343  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3344  for( Int jb = 0; jb < Urow.size(); jb++ ){
3345  IntNumVec& cols = Urow[jb].cols;
3346  NumMat<T>& nzval = Urow[jb].nzval;
3347  for( Int j = 0; j < cols.m(); j++ ){
3348  Int jcol = permInv[ permInv_r[ cols[j] ] ];
3349  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3350  for( Int i = 0; i < Urow[jb].numRow; i++ ){
3351  rowSend[displsSend[dest] + cntSize[dest]] = permInv[ i + FirstBlockCol( ksup, super_ ) ];
3352  colSend[displsSend[dest] + cntSize[dest]] = jcol;
3353  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3354  cntSize[dest]++;
3355  }
3356  }
3357  }
3358  } // I own the row of ksup
3359  }
3360 
3361  // Check sizes match
3362  for( Int ip = 0; ip < mpisize; ip++ ){
3363  if( cntSize[ip] != sizeSend[ip] )
3364  {
3365 #ifdef USE_ABORT
3366  abort();
3367 #endif
3368  throw std::runtime_error( "Sizes of the sending information do not match." );
3369  }
3370  }
3371 
3372 
3373  // Alltoallv to exchange information
3374  mpi::Alltoallv(
3375  &rowSend[0], &sizeSend[0], &displsSend[0],
3376  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3377  grid_->comm );
3378  mpi::Alltoallv(
3379  &colSend[0], &sizeSend[0], &displsSend[0],
3380  &colRecv[0], &sizeRecv[0], &displsRecv[0],
3381  grid_->comm );
3382  mpi::Alltoallv(
3383  &valSend[0], &sizeSend[0], &displsSend[0],
3384  &valRecv[0], &sizeRecv[0], &displsRecv[0],
3385  grid_->comm );
3386 
3387 #if ( _DEBUGlevel_ >= 1 )
3388  statusOFS << "Alltoallv communication finished." << std::endl;
3389 #endif
3390 
3391  //#if ( _DEBUGlevel_ >= 1 )
3392  // for( Int ip = 0; ip < mpisize; ip++ ){
3393  // statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
3394  // statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
3395  // statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
3396  // statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
3397  // statusOFS << "valSend[" << ip << "] = " << valSend[ip] << std::endl;
3398  // statusOFS << "valRecv[" << ip << "] = " << valRecv[ip] << std::endl;
3399  // }
3400  //#endif
3401 
3402  // Organize the received message.
3403  Int firstCol = mpirank * numColFirst;
3404  Int numColLocal;
3405  if( mpirank == mpisize-1 )
3406  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3407  else
3408  numColLocal = numColFirst;
3409 
3410  std::vector<std::vector<Int> > rows( numColLocal );
3411  std::vector<std::vector<T> > vals( numColLocal );
3412 
3413  for( Int ip = 0; ip < mpisize; ip++ ){
3414  Int* rowRecvCur = &rowRecv[displsRecv[ip]];
3415  Int* colRecvCur = &colRecv[displsRecv[ip]];
3416  T* valRecvCur = &valRecv[displsRecv[ip]];
3417  for( Int i = 0; i < sizeRecv[ip]; i++ ){
3418  rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
3419  vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
3420  } // for (i)
3421  } // for (ip)
3422 
3423  // Sort the rows
3424  std::vector<std::vector<Int> > sortIndex( numColLocal );
3425  for( Int j = 0; j < numColLocal; j++ ){
3426  sortIndex[j].resize( rows[j].size() );
3427  for( Int i = 0; i < sortIndex[j].size(); i++ )
3428  sortIndex[j][i] = i;
3429  std::sort( sortIndex[j].begin(), sortIndex[j].end(),
3430  IndexComp<std::vector<Int>& > ( rows[j] ) );
3431  } // for (j)
3432 
3433  // Form DistSparseMatrix according to the received message
3434  // NOTE: for indices, DistSparseMatrix follows the FORTRAN
3435  // convention (1 based) while PMatrix follows the C convention (0
3436  // based)
3437  A.size = this->NumCol();
3438  A.nnzLocal = 0;
3439  A.colptrLocal.Resize( numColLocal + 1 );
3440  // Note that the +1 is important since the indices follow the FORTRAN (1-based) convention
3441  A.colptrLocal(0) = 1;
3442  for( Int j = 0; j < numColLocal; j++ ){
3443  A.nnzLocal += rows[j].size();
3444  A.colptrLocal(j+1) = A.colptrLocal(j) + rows[j].size();
3445  }
3446 
3447  A.comm = grid_->comm;
3448 
3449 #if ( _DEBUGlevel_ >= 1 )
3450  statusOFS << "nnzLocal = " << A.nnzLocal << std::endl;
3451  statusOFS << "nnz = " << A.Nnz() << std::endl;
3452 #endif
3453 
3454 
3455  A.rowindLocal.Resize( A.nnzLocal );
3456  A.nzvalLocal.Resize( A.nnzLocal );
3457 
3458  Int* rowPtr = A.rowindLocal.Data();
3459  T* nzvalPtr = A.nzvalLocal.Data();
3460  for( Int j = 0; j < numColLocal; j++ ){
3461  std::vector<Int>& rowsCur = rows[j];
3462  std::vector<Int>& sortIndexCur = sortIndex[j];
3463  std::vector<T>& valsCur = vals[j];
3464  for( Int i = 0; i < rows[j].size(); i++ ){
3465  // Note that the +1 is important since the indices follow the FORTRAN (1-based) convention
3466  *(rowPtr++) = rowsCur[sortIndexCur[i]] + 1;
3467  *(nzvalPtr++) = valsCur[sortIndexCur[i]];
3468  }
3469  }
3470 
3471 #if ( _DEBUGlevel_ >= 1 )
3472  statusOFS << "A.colptrLocal[end] = " << A.colptrLocal(numColLocal) << std::endl;
3473  statusOFS << "A.rowindLocal.size() = " << A.rowindLocal.m() << std::endl;
3474  statusOFS << "A.rowindLocal[end] = " << A.rowindLocal(A.nnzLocal-1) << std::endl;
3475  statusOFS << "A.nzvalLocal[end] = " << A.nzvalLocal(A.nnzLocal-1) << std::endl;
3476 #endif
3477 
3478 
3479 #ifndef _RELEASE_
3480  PopCallStack();
3481 #endif
3482 
3483  return ;
3484  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix -----
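
The routine above is the standard counts-then-displacements pattern for irregular all-to-all exchange: count how many (row, col, val) triplets go to each destination, exchange the counts with MPI_Alltoall, form the displacement arrays as exclusive prefix sums, and exchange the packed buffers with Alltoallv. A minimal, self-contained sketch of this pattern for a plain int payload is given below; it uses the raw MPI-3 API rather than the PEXSI mpi:: wrappers, and the function name ExchangeBuffer is hypothetical.

    #include <mpi.h>
    #include <vector>

    // Exchange a packed buffer whose per-destination element counts are given
    // in sizeSend; the received buffer is returned packed in rank order.
    std::vector<int> ExchangeBuffer( const std::vector<int>& sendBuf,
                                     const std::vector<int>& sizeSend,
                                     MPI_Comm comm )
    {
      int mpisize;
      MPI_Comm_size( comm, &mpisize );

      // Exchange the per-destination counts.
      std::vector<int> sizeRecv( mpisize, 0 );
      MPI_Alltoall( sizeSend.data(), 1, MPI_INT,
                    sizeRecv.data(), 1, MPI_INT, comm );

      // Displacements are exclusive prefix sums of the counts.
      std::vector<int> displsSend( mpisize, 0 ), displsRecv( mpisize, 0 );
      for( int ip = 1; ip < mpisize; ip++ ){
        displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
        displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
      }

      // Exchange the payload.
      std::vector<int> recvBuf( displsRecv[mpisize-1] + sizeRecv[mpisize-1] );
      MPI_Alltoallv( sendBuf.data(), sizeSend.data(), displsSend.data(), MPI_INT,
                     recvBuf.data(), sizeRecv.data(), displsRecv.data(), MPI_INT,
                     comm );
      return recvBuf;
    }
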
3485 
3486 
3487 
3488  template<typename T>
3489  void PMatrix<T>::PMatrixToDistSparseMatrix_OLD ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
3490  {
3491 #ifndef _RELEASE_
3492  PushCallStack("PMatrix::PMatrixToDistSparseMatrix_OLD");
3493 #endif
3494 #if ( _DEBUGlevel_ >= 1 )
3495  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
3496 #endif
3497  Int mpirank = grid_->mpirank;
3498  Int mpisize = grid_->mpisize;
3499 
3500  std::vector<Int> rowSend( mpisize );
3501  std::vector<Int> colSend( mpisize );
3502  std::vector<T> valSend( mpisize );
3503  std::vector<Int> sizeSend( mpisize, 0 );
3504  std::vector<Int> displsSend( mpisize, 0 );
3505 
3506  std::vector<Int> rowRecv( mpisize );
3507  std::vector<Int> colRecv( mpisize );
3508  std::vector<T> valRecv( mpisize );
3509  std::vector<Int> sizeRecv( mpisize, 0 );
3510  std::vector<Int> displsRecv( mpisize, 0 );
3511 
3512  Int numSuper = this->NumSuper();
3513  const IntNumVec& permInv = super_->permInv;
3514 
3515 
3516 
3517  const IntNumVec * pPermInv_r;
3518 
3519  if(optionsLU_->RowPerm=="NOROWPERM"){
3520  pPermInv_r = &super_->permInv;
3521  }
3522  else{
3523  pPermInv_r = &super_->permInv_r;
3524  }
3525 
3526  const IntNumVec& permInv_r = *pPermInv_r;
3527 
3528 
3529 
3530 
3531 
3532 
3533  // The number of local columns in the DistSparseMatrix format owned by
3534  // each of the processors with rank 0 to mpisize - 2. The last processor
3535  // (rank mpisize - 1) owns the remaining columns, and its count may or
3536  // may not be the same.
3537  //
3538  Int numColFirst = this->NumCol() / mpisize;
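  // (Illustrative note, assuming the 1-D column distribution described
  // above.) Processor ip owns the column range
  //   [ ip*numColFirst, (ip+1)*numColFirst )        for ip <  mpisize - 1,
  //   [ (mpisize-1)*numColFirst, NumCol() )         for ip == mpisize - 1.
  // For example, NumCol() = 10 and mpisize = 4 give numColFirst = 2, so
  // ranks 0, 1, 2 own two columns each and rank 3 owns the remaining four.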
3539 
3540  // Count the size first.
3541  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3542  // L blocks
3543  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3544  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3545  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3546  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3547  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
3548  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3549  sizeSend[dest] += Lcol[ib].numRow;
3550  }
3551  }
3552  } // I own the column of ksup
3553 
3554  // U blocks
3555  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3556  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3557  for( Int jb = 0; jb < Urow.size(); jb++ ){
3558  IntNumVec& cols = Urow[jb].cols;
3559  for( Int j = 0; j < cols.m(); j++ ){
3560  Int jcol = permInv( cols(j) );
3561  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3562  sizeSend[dest] += Urow[jb].numRow;
3563  }
3564  }
3565  } // I own the row of ksup
3566  } // for (ksup)
3567 
3568  // All-to-all exchange of size information
3569  MPI_Alltoall(
3570  &sizeSend[0], 1, MPI_INT,
3571  &sizeRecv[0], 1, MPI_INT, grid_->comm );
3572 
3573 
3574 
3575  // Reserve the space
3576  for( Int ip = 0; ip < mpisize; ip++ ){
3577  if( ip == 0 ){
3578  displsSend[ip] = 0;
3579  }
3580  else{
3581  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3582  }
3583 
3584  if( ip == 0 ){
3585  displsRecv[ip] = 0;
3586  }
3587  else{
3588  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3589  }
3590  }
3591  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3592  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3593 
3594  rowSend.resize( sizeSendTotal );
3595  colSend.resize( sizeSendTotal );
3596  valSend.resize( sizeSendTotal );
3597 
3598  rowRecv.resize( sizeRecvTotal );
3599  colRecv.resize( sizeRecvTotal );
3600  valRecv.resize( sizeRecvTotal );
3601 
3602 #if ( _DEBUGlevel_ >= 1 )
3603  statusOFS << "displsSend = " << displsSend << std::endl;
3604  statusOFS << "displsRecv = " << displsRecv << std::endl;
3605 #endif
3606 
3607  // Put (row, col, val) to the sending buffer
3608  std::vector<Int> cntSize( mpisize, 0 );
3609 
3610 
3611  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3612  // L blocks
3613  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3614  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3615  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3616  IntNumVec& rows = Lcol[ib].rows;
3617  NumMat<T>& nzval = Lcol[ib].nzval;
3618  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3619  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
3620  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3621  for( Int i = 0; i < rows.m(); i++ ){
3622  rowSend[displsSend[dest] + cntSize[dest]] = permInv_r( rows(i) );
3623  colSend[displsSend[dest] + cntSize[dest]] = jcol;
3624  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3625  cntSize[dest]++;
3626  }
3627  }
3628  }
3629  } // I own the column of ksup
3630 
3631 
3632  // U blocks
3633  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3634  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3635  for( Int jb = 0; jb < Urow.size(); jb++ ){
3636  IntNumVec& cols = Urow[jb].cols;
3637  NumMat<T>& nzval = Urow[jb].nzval;
3638  for( Int j = 0; j < cols.m(); j++ ){
3639  Int jcol = permInv( cols(j) );
3640  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3641  for( Int i = 0; i < Urow[jb].numRow; i++ ){
3642  rowSend[displsSend[dest] + cntSize[dest]] =
3643  permInv_r( i + FirstBlockCol( ksup, super_ ) );
3644  colSend[displsSend[dest] + cntSize[dest]] = jcol;
3645  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3646  cntSize[dest]++;
3647  }
3648  }
3649  }
3650  } // I own the row of ksup
3651  }
3652 
3653 
3654 
3655  // Check sizes match
3656  for( Int ip = 0; ip < mpisize; ip++ ){
3657  if( cntSize[ip] != sizeSend[ip] ){
3658 #ifdef USE_ABORT
3659  abort();
3660 #endif
3661  throw std::runtime_error( "Sizes of the sending information do not match." );
3662  }
3663  }
3664 
3665  // Alltoallv to exchange information
3666  mpi::Alltoallv(
3667  &rowSend[0], &sizeSend[0], &displsSend[0],
3668  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3669  grid_->comm );
3670  mpi::Alltoallv(
3671  &colSend[0], &sizeSend[0], &displsSend[0],
3672  &colRecv[0], &sizeRecv[0], &displsRecv[0],
3673  grid_->comm );
3674  mpi::Alltoallv(
3675  &valSend[0], &sizeSend[0], &displsSend[0],
3676  &valRecv[0], &sizeRecv[0], &displsRecv[0],
3677  grid_->comm );
3678 
3679 #if ( _DEBUGlevel_ >= 1 )
3680  statusOFS << "Alltoallv communication finished." << std::endl;
3681 #endif
3682 
3683  //#if ( _DEBUGlevel_ >= 1 )
3684  // for( Int ip = 0; ip < mpisize; ip++ ){
3685  // statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
3686  // statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
3687  // statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
3688  // statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
3689  // statusOFS << "valSend[" << ip << "] = " << valSend[ip] << std::endl;
3690  // statusOFS << "valRecv[" << ip << "] = " << valRecv[ip] << std::endl;
3691  // }
3692  //#endif
3693 
3694  // Organize the received message.
3695  Int firstCol = mpirank * numColFirst;
3696  Int numColLocal;
3697  if( mpirank == mpisize-1 )
3698  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3699  else
3700  numColLocal = numColFirst;
3701 
3702  std::vector<std::vector<Int> > rows( numColLocal );
3703  std::vector<std::vector<T> > vals( numColLocal );
3704 
3705  for( Int ip = 0; ip < mpisize; ip++ ){
3706  Int* rowRecvCur = &rowRecv[displsRecv[ip]];
3707  Int* colRecvCur = &colRecv[displsRecv[ip]];
3708  T* valRecvCur = &valRecv[displsRecv[ip]];
3709  for( Int i = 0; i < sizeRecv[ip]; i++ ){
3710  rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
3711  vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
3712  } // for (i)
3713  } // for (ip)
3714 
3715  // Sort the rows
3716  std::vector<std::vector<Int> > sortIndex( numColLocal );
3717  for( Int j = 0; j < numColLocal; j++ ){
3718  sortIndex[j].resize( rows[j].size() );
3719  for( Int i = 0; i < sortIndex[j].size(); i++ )
3720  sortIndex[j][i] = i;
3721  std::sort( sortIndex[j].begin(), sortIndex[j].end(),
3722  IndexComp<std::vector<Int>& > ( rows[j] ) );
3723  } // for (j)
3724 
3725  // Form DistSparseMatrix according to the received message
3726  // NOTE: for indices, DistSparseMatrix follows the FORTRAN
3727  // convention (1 based) while PMatrix follows the C convention (0
3728  // based)
3729  if( A.size != this->NumCol() ){
3730 #ifdef USE_ABORT
3731  abort();
3732 #endif
3733  throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different size from PMatrix." );
3734  }
3735  if( A.colptrLocal.m() != numColLocal + 1 ){
3736 #ifdef USE_ABORT
3737  abort();
3738 #endif
3739  throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different number of local columns from PMatrix." );
3740  }
3741 
3742  B.size = A.size;
3743  B.nnz = A.nnz;
3744  B.nnzLocal = A.nnzLocal;
3745  B.colptrLocal = A.colptrLocal;
3746  B.rowindLocal = A.rowindLocal;
3747  B.nzvalLocal.Resize( B.nnzLocal );
3748  SetValue( B.nzvalLocal, ZERO<T>() );
3749  // Make sure that the communicator of A and B are the same.
3750  // FIXME Find a better way to compare the communicators
3751  // if( grid_->comm != A.comm ){
3752  // #ifdef USE_ABORT
3753  // abort();
3754  //#endif
3755  //throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different communicator from PMatrix." );
3756  // }
3757  B.comm = grid_->comm;
3758 
3759  Int* rowPtr = B.rowindLocal.Data();
3760  T* nzvalPtr = B.nzvalLocal.Data();
3761  for( Int j = 0; j < numColLocal; j++ ){
3762  std::vector<Int>& rowsCur = rows[j];
3763  std::vector<Int>& sortIndexCur = sortIndex[j];
3764  std::vector<T>& valsCur = vals[j];
3765  std::vector<Int> rowsCurSorted( rowsCur.size() );
3766  // Note that the +1 is important since the indices follow the FORTRAN (1-based) convention
3767  for( Int i = 0; i < rowsCurSorted.size(); i++ ){
3768  rowsCurSorted[i] = rowsCur[sortIndexCur[i]] + 1;
3769  }
3770 
3771  // Search and match the indices
3772  std::vector<Int>::iterator it;
3773  for( Int i = B.colptrLocal(j) - 1;
3774  i < B.colptrLocal(j+1) - 1; i++ ){
3775  it = std::lower_bound( rowsCurSorted.begin(), rowsCurSorted.end(),
3776  *(rowPtr++) );
3777  if( it == rowsCurSorted.end() ){
3778  // Did not find the row, set it to zero
3779  *(nzvalPtr++) = ZERO<T>();
3780  }
3781  else{
3782  // Found the row, set it according to the received value
3783  *(nzvalPtr++) = valsCur[ sortIndexCur[it-rowsCurSorted.begin()] ];
3784  }
3785  } // for (i)
3786  } // for (j)
3787 
3788 #if ( _DEBUGlevel_ >= 1 )
3789  statusOFS << "B.colptrLocal[end] = " << B.colptrLocal(numColLocal) << std::endl;
3790  statusOFS << "B.rowindLocal.size() = " << B.rowindLocal.m() << std::endl;
3791  statusOFS << "B.rowindLocal[end] = " << B.rowindLocal(B.nnzLocal-1) << std::endl;
3792  statusOFS << "B.nzvalLocal[end] = " << B.nzvalLocal(B.nnzLocal-1) << std::endl;
3793 #endif
3794 
3795 
3796 #ifndef _RELEASE_
3797  PopCallStack();
3798 #endif
3799 
3800  return ;
3801  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix_OLD -----
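
PMatrixToDistSparseMatrix_OLD reorders each received column indirectly: it sorts an index permutation (sortIndex, via IndexComp) by row index so that the row and value arrays can be permuted consistently, and then matches the sorted rows against the sparsity pattern of A with std::lower_bound. A minimal standalone sketch of the indirect ("argsort") step is shown below; the name SortColumnByRow and the double payload are illustrative only.

    #include <algorithm>
    #include <numeric>
    #include <vector>

    // Sort one column's (row, value) pairs by row index: sort a permutation
    // of positions and apply it to both arrays, as done per column above.
    void SortColumnByRow( std::vector<int>& rows, std::vector<double>& vals )
    {
      std::vector<int> idx( rows.size() );
      std::iota( idx.begin(), idx.end(), 0 );
      std::sort( idx.begin(), idx.end(),
          [&rows]( int a, int b ){ return rows[a] < rows[b]; } );

      std::vector<int>    sortedRows( rows.size() );
      std::vector<double> sortedVals( vals.size() );
      for( std::size_t i = 0; i < idx.size(); i++ ){
        sortedRows[i] = rows[ idx[i] ];
        sortedVals[i] = vals[ idx[i] ];
      }
      rows.swap( sortedRows );
      vals.swap( sortedVals );
    }
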
3802 
3803 
3804  // A possibly more memory-efficient way of converting the PMatrix to a
3805  // DistSparseMatrix structure.
3806  //
3807  template<typename T>
3808  void PMatrix<T>::PMatrixToDistSparseMatrix ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
3809  {
3810 
3811 #ifndef _RELEASE_
3812  PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
3813 #endif
3814 #if ( _DEBUGlevel_ >= 1 )
3815  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
3816 #endif
3817  Int mpirank = grid_->mpirank;
3818  Int mpisize = grid_->mpisize;
3819 
3820  std::vector<Int> rowSend( mpisize );
3821  std::vector<Int> colSend( mpisize );
3822  std::vector<T> valSend( mpisize );
3823  std::vector<Int> sizeSend( mpisize, 0 );
3824  std::vector<Int> displsSend( mpisize, 0 );
3825 
3826  std::vector<Int> rowRecv( mpisize );
3827  std::vector<Int> colRecv( mpisize );
3828  std::vector<T> valRecv( mpisize );
3829  std::vector<Int> sizeRecv( mpisize, 0 );
3830  std::vector<Int> displsRecv( mpisize, 0 );
3831 
3832  Int numSuper = this->NumSuper();
3833  const IntNumVec& perm = super_->perm;
3834  const IntNumVec& permInv = super_->permInv;
3835 
3836  const IntNumVec * pPerm_r;
3837  const IntNumVec * pPermInv_r;
3838 
3839  pPerm_r = &super_->perm_r;
3840  pPermInv_r = &super_->permInv_r;
3841 
3842  const IntNumVec& perm_r = *pPerm_r;
3843  const IntNumVec& permInv_r = *pPermInv_r;
3844 
3845 
3846  // Count the sizes from the A matrix first
3847  Int numColFirst = this->NumCol() / mpisize;
3848  Int firstCol = mpirank * numColFirst;
3849  Int numColLocal;
3850  if( mpirank == mpisize-1 )
3851  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3852  else
3853  numColLocal = numColFirst;
3854 
3855 
3856 
3857 
3858 
3859  Int* rowPtr = A.rowindLocal.Data();
3860  Int* colPtr = A.colptrLocal.Data();
3861 
3862  for( Int j = 0; j < numColLocal; j++ ){
3863  Int ocol = firstCol + j;
3864  Int col = perm[ perm_r[ ocol] ];
3865  Int blockColIdx = BlockIdx( col, super_ );
3866  Int procCol = PCOL( blockColIdx, grid_ );
3867  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
3868  Int orow = rowPtr[i]-1;
3869  Int row = perm[ orow ];
3870  Int blockRowIdx = BlockIdx( row, super_ );
3871  Int procRow = PROW( blockRowIdx, grid_ );
3872  Int dest = PNUM( procRow, procCol, grid_ );
3873 #if ( _DEBUGlevel_ >= 1 )
3874  statusOFS << "("<< orow<<", "<<ocol<<") == "<< "("<< row<<", "<<col<<")"<< std::endl;
3875  statusOFS << "BlockIdx = " << blockRowIdx << ", " <<blockColIdx << std::endl;
3876  statusOFS << procRow << ", " << procCol << ", "
3877  << dest << std::endl;
3878 #endif
3879  sizeSend[dest]++;
3880  } // for (i)
3881  } // for (j)
3882 
3883  // All-to-all exchange of size information
3884  MPI_Alltoall(
3885  &sizeSend[0], 1, MPI_INT,
3886  &sizeRecv[0], 1, MPI_INT, grid_->comm );
3887 
3888 #if ( _DEBUGlevel_ >= 1 )
3889  statusOFS << std::endl << "sizeSend: " << sizeSend << std::endl;
3890  statusOFS << std::endl << "sizeRecv: " << sizeRecv << std::endl;
3891 #endif
3892 
3893 
3894 
3895  // Reserve the space
3896  for( Int ip = 0; ip < mpisize; ip++ ){
3897  if( ip == 0 ){
3898  displsSend[ip] = 0;
3899  }
3900  else{
3901  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3902  }
3903 
3904  if( ip == 0 ){
3905  displsRecv[ip] = 0;
3906  }
3907  else{
3908  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3909  }
3910  }
3911 
3912  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3913  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3914 
3915  rowSend.resize( sizeSendTotal );
3916  colSend.resize( sizeSendTotal );
3917  valSend.resize( sizeSendTotal );
3918 
3919  rowRecv.resize( sizeRecvTotal );
3920  colRecv.resize( sizeRecvTotal );
3921  valRecv.resize( sizeRecvTotal );
3922 
3923 #if ( _DEBUGlevel_ >= 1 )
3924  statusOFS << "displsSend = " << displsSend << std::endl;
3925  statusOFS << "displsRecv = " << displsRecv << std::endl;
3926 #endif
3927 
3928  // Put (row, col) to the sending buffer
3929  std::vector<Int> cntSize( mpisize, 0 );
3930 
3931  rowPtr = A.rowindLocal.Data();
3932  colPtr = A.colptrLocal.Data();
3933 
3934  for( Int j = 0; j < numColLocal; j++ ){
3935 
3936  Int ocol = firstCol + j;
3937  Int col = perm[ perm_r[ ocol] ];
3938  Int blockColIdx = BlockIdx( col, super_ );
3939  Int procCol = PCOL( blockColIdx, grid_ );
3940  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
3941  Int orow = rowPtr[i]-1;
3942  Int row = perm[ orow ];
3943  Int blockRowIdx = BlockIdx( row, super_ );
3944  Int procRow = PROW( blockRowIdx, grid_ );
3945  Int dest = PNUM( procRow, procCol, grid_ );
3946  rowSend[displsSend[dest] + cntSize[dest]] = row;
3947  colSend[displsSend[dest] + cntSize[dest]] = col;
3948  cntSize[dest]++;
3949  } // for (i)
3950  } // for (j)
3951 
3952 
3953  // Check sizes match
3954  for( Int ip = 0; ip < mpisize; ip++ ){
3955  if( cntSize[ip] != sizeSend[ip] ){
3956 #ifdef USE_ABORT
3957  abort();
3958 #endif
3959  throw std::runtime_error( "Sizes of the sending information do not match." );
3960  }
3961  }
3962 
3963  // Alltoallv to exchange information
3964  mpi::Alltoallv(
3965  &rowSend[0], &sizeSend[0], &displsSend[0],
3966  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3967  grid_->comm );
3968  mpi::Alltoallv(
3969  &colSend[0], &sizeSend[0], &displsSend[0],
3970  &colRecv[0], &sizeRecv[0], &displsRecv[0],
3971  grid_->comm );
3972 
3973 #if ( _DEBUGlevel_ >= 1 )
3974  statusOFS << "Alltoallv communication of nonzero indices finished." << std::endl;
3975 #endif
3976 
3977 
3978 #if ( _DEBUGlevel_ >= 1 )
3979  for( Int ip = 0; ip < mpisize; ip++ ){
3980  statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
3981  statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
3982  statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
3983  statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
3984  }
3985 
3986 
3987  DumpLU();
3988 
3989 
3990 
3991 #endif
3992 
3993  // For each (row, col), fill the nonzero values to valRecv locally.
3994  for( Int g = 0; g < sizeRecvTotal; g++ ){
3995  Int row = rowRecv[g];
3996  Int col = colRecv[g];
3997 
3998  Int blockRowIdx = BlockIdx( row, super_ );
3999  Int blockColIdx = BlockIdx( col, super_ );
4000 
4001  // Search for the nzval
4002  bool isFound = false;
4003 
4004  if( blockColIdx <= blockRowIdx ){
4005  // Data on the L side
4006 
4007  std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
4008 
4009  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4010 #if ( _DEBUGlevel_ >= 1 )
4011  statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
4012 #endif
4013  if( Lcol[ib].blockIdx == blockRowIdx ){
4014  IntNumVec& rows = Lcol[ib].rows;
4015  for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
4016  if( rows[iloc] == row ){
4017  Int jloc = col - FirstBlockCol( blockColIdx, super_ );
4018  valRecv[g] = Lcol[ib].nzval( iloc, jloc );
4019  isFound = true;
4020  break;
4021  } // found the corresponding row
4022  }
4023  }
4024  if( isFound == true ) break;
4025  } // for (ib)
4026  }
4027  else{
4028  // Data on the U side
4029 
4030  std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
4031 
4032  for( Int jb = 0; jb < Urow.size(); jb++ ){
4033  if( Urow[jb].blockIdx == blockColIdx ){
4034  IntNumVec& cols = Urow[jb].cols;
4035  for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
4036  if( cols[jloc] == col ){
4037  Int iloc = row - FirstBlockRow( blockRowIdx, super_ );
4038  valRecv[g] = Urow[jb].nzval( iloc, jloc );
4039  isFound = true;
4040  break;
4041  } // found the corresponding col
4042  }
4043  }
4044  if( isFound == true ) break;
4045  } // for (jb)
4046  } // if( blockColIdx <= blockRowIdx )
4047 
4048  // Did not find the corresponding row, set the value to zero.
4049  if( isFound == false ){
4050  statusOFS << "In the permuted order, (" << row << ", " << col <<
4051  ") is not found in PMatrix." << std::endl;
4052  valRecv[g] = ZERO<T>();
4053  }
4054 
4055  } // for (g)
4056 
4057 
4058  // Feed back valRecv to valSend through Alltoallv. NOTE: for the
4059  // values, the roles of "send" and "recv" are swapped.
4060  mpi::Alltoallv(
4061  &valRecv[0], &sizeRecv[0], &displsRecv[0],
4062  &valSend[0], &sizeSend[0], &displsSend[0],
4063  grid_->comm );
4064 
4065 #if ( _DEBUGlevel_ >= 1 )
4066  statusOFS << "Alltoallv communication of nonzero values finished." << std::endl;
4067 #endif
4068 
4069  // Put the nonzero values from valSend to the matrix B.
4070  B.size = A.size;
4071  B.nnz = A.nnz;
4072  B.nnzLocal = A.nnzLocal;
4073  B.colptrLocal = A.colptrLocal;
4074  B.rowindLocal = A.rowindLocal;
4075  B.nzvalLocal.Resize( B.nnzLocal );
4076  SetValue( B.nzvalLocal, ZERO<T>() );
4077  // Make sure that the communicator of A and B are the same.
4078  // FIXME Find a better way to compare the communicators
4079  // if( grid_->comm != A.comm ){
4080  // #ifdef USE_ABORT
4081  //abort();
4082  //#endif
4083  //throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different communicator from PMatrix." );
4084  // }
4085  B.comm = grid_->comm;
4086 
4087  for( Int i = 0; i < mpisize; i++ )
4088  cntSize[i] = 0;
4089 
4090  rowPtr = B.rowindLocal.Data();
4091  colPtr = B.colptrLocal.Data();
4092  T* valPtr = B.nzvalLocal.Data();
4093 
4094  for( Int j = 0; j < numColLocal; j++ ){
4095  Int ocol = firstCol + j;
4096  Int col = perm[ perm_r[ ocol] ];
4097  Int blockColIdx = BlockIdx( col, super_ );
4098  Int procCol = PCOL( blockColIdx, grid_ );
4099  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4100  Int orow = rowPtr[i]-1;
4101  Int row = perm[ orow ];
4102  Int blockRowIdx = BlockIdx( row, super_ );
4103  Int procRow = PROW( blockRowIdx, grid_ );
4104  Int dest = PNUM( procRow, procCol, grid_ );
4105  valPtr[i] = valSend[displsSend[dest] + cntSize[dest]];
4106  cntSize[dest]++;
4107  } // for (i)
4108  } // for (j)
4109 
4110  // Check sizes match
4111  for( Int ip = 0; ip < mpisize; ip++ ){
4112  if( cntSize[ip] != sizeSend[ip] ){
4113 #ifdef USE_ABORT
4114  abort();
4115 #endif
4116  throw std::runtime_error( "Sizes of the sending information do not match." );
4117  }
4118  }
4119 
4120 
4121 #ifndef _RELEASE_
4122  PopCallStack();
4123 #endif
4124 
4125  return ;
4126  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix -----
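
The routine above starts from the sparsity pattern of A rather than from the L and U blocks: each nonzero (row, col) of A is permuted, its supernodes are located with BlockIdx, and the owner on the 2-D grid follows from PROW, PCOL and PNUM; the owner fills in the value from its local block, and a second Alltoallv with the send/receive roles swapped ships the values back. The sketch below illustrates only the owner computation, under the simplifying (and generally untrue) assumption of a uniform supernode size supSize; the final rank linearization is one possibility, with the actual mapping defined by PNUM.

    // Map a permuted (row, col) entry to its owning process on an
    // nprow x npcol grid with supernodes distributed cyclically.
    inline int OwnerOfEntry( int row, int col, int supSize, int nprow, int npcol )
    {
      int blockRowIdx = row / supSize;        // supernode containing the row
      int blockColIdx = col / supSize;        // supernode containing the column
      int procRow     = blockRowIdx % nprow;  // cyclic over process rows
      int procCol     = blockColIdx % npcol;  // cyclic over process columns
      return procRow * npcol + procCol;       // one possible rank linearization
    }
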
4127 
4128 
4129 
4130 
4131  template<typename T>
4132  Int PMatrix<T>::NnzLocal ( )
4133  {
4134 #ifndef _RELEASE_
4135  PushCallStack("PMatrix::NnzLocal");
4136 #endif
4137  Int numSuper = this->NumSuper();
4138  Int nnzLocal = 0;
4139  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4140  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4141  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4142  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4143  nnzLocal += Lcol[ib].numRow * Lcol[ib].numCol;
4144  }
4145  } // if I own the column of ksup
4146  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4147  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4148  for( Int jb = 0; jb < Urow.size(); jb++ ){
4149  nnzLocal += Urow[jb].numRow * Urow[jb].numCol;
4150  }
4151  } // if I own the row of ksup
4152  }
4153 
4154 #ifndef _RELEASE_
4155  PopCallStack();
4156 #endif
4157 
4158  return nnzLocal;
4159  } // ----- end of method PMatrix::NnzLocal -----
4160 
4161 
4162  template<typename T>
4163  LongInt PMatrix<T>::Nnz ( )
4164  {
4165 #ifndef _RELEASE_
4166  PushCallStack("PMatrix::Nnz");
4167 #endif
4168  LongInt nnzLocal = LongInt( this->NnzLocal() );
4169  LongInt nnz;
4170 
4171  MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, grid_->comm );
4172 
4173 #ifndef _RELEASE_
4174  PopCallStack();
4175 #endif
4176 
4177  return nnz;
4178  } // ----- end of method PMatrix::Nnz -----
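
NnzLocal sums numRow * numCol over the locally stored L and U blocks, and Nnz promotes the local count to LongInt before the MPI_Allreduce so that the global total cannot overflow a 32-bit integer on large problems. A minimal standalone equivalent of the reduction with plain MPI (the helper name GlobalNnz is hypothetical):

    #include <mpi.h>

    // Reduce a 64-bit local nonzero count to the global total.
    long long GlobalNnz( long long nnzLocal, MPI_Comm comm )
    {
      long long nnz = 0;
      MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, comm );
      return nnz;
    }
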
4179 
4180  template< >
4181  inline void PMatrix<Real>::GetNegativeInertia ( Real& inertia )
4182  {
4183 #ifndef _RELEASE_
4184  PushCallStack("PMatrix::GetNegativeInertia");
4185 #endif
4186  Int numSuper = this->NumSuper();
4187 
4188  Real inertiaLocal = 0.0;
4189  inertia = 0.0;
4190 
4191  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4192  // I own the diagonal block
4193  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4194  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4195  LBlock<Real> & LB = this->L( LBj( ksup, grid_ ) )[0];
4196  for( Int i = 0; i < LB.numRow; i++ ){
4197  if( LB.nzval(i, i) < 0 )
4198  inertiaLocal++;
4199  }
4200  }
4201  }
4202 
4203  // Sum the local counts over all processors
4204  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
4205 
4206 #ifndef _RELEASE_
4207  PopCallStack();
4208 #endif
4209 
4210  return ;
4211  } // ----- end of method PMatrix::GetNegativeInertia -----
4212 
4213 
4214  template< >
4215  inline void PMatrix<Complex>::GetNegativeInertia ( Real& inertia )
4216  {
4217 #ifndef _RELEASE_
4218  PushCallStack("PMatrix::GetNegativeInertia");
4219 #endif
4220  Int numSuper = this->NumSuper();
4221 
4222  Real inertiaLocal = 0.0;
4223  inertia = 0.0;
4224 
4225  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4226  // I own the diagonal block
4227  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4228  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4229  LBlock<Complex> & LB = this->L( LBj( ksup, grid_ ) )[0];
4230  for( Int i = 0; i < LB.numRow; i++ ){
4231  if( LB.nzval(i, i).real() < 0 )
4232  inertiaLocal++;
4233  }
4234  }
4235  }
4236 
4237  // Sum the local counts over all processors
4238  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
4239 
4240 #ifndef _RELEASE_
4241  PopCallStack();
4242 #endif
4243 
4244  return ;
4245  } // ----- end of method PMatrix::GetNegativeInertia -----
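
Both specializations count the negative diagonal entries of the diagonal supernodal blocks they own (taking the real part in the complex case) and sum the partial counts with an Allreduce; this count of negative pivots is what PEXSI uses to estimate the number of eigenvalues below a given shift. The sketch below shows the same local-count-plus-reduction structure on a plain array of diagonal entries; diagLocal and NegativeInertia are hypothetical names.

    #include <mpi.h>
    #include <vector>

    // Count locally owned negative diagonal entries and sum over all
    // processes; the count is accumulated in a double, as in the code above.
    double NegativeInertia( const std::vector<double>& diagLocal, MPI_Comm comm )
    {
      double inertiaLocal = 0.0;
      for( std::size_t i = 0; i < diagLocal.size(); i++ ){
        if( diagLocal[i] < 0.0 ) inertiaLocal += 1.0;
      }
      double inertia = 0.0;
      MPI_Allreduce( &inertiaLocal, &inertia, 1, MPI_DOUBLE, MPI_SUM, comm );
      return inertia;
    }
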
4246 
4247  template<typename T>
4248  inline PMatrix<T> * PMatrix<T>::Create(const GridType * pGridType, const SuperNodeType * pSuper, const PSelInvOptions * pSelInvOpt , const SuperLUOptions * pLuOpt)
4249  {
4250  PMatrix<T> * pMat = NULL;
4251  if(pLuOpt->Symmetric == 0){
4252  pMat = new PMatrixUnsym<T>( pGridType, pSuper, pSelInvOpt, pLuOpt );
4253  }
4254  else{
4255  pMat = new PMatrix<T>( pGridType, pSuper, pSelInvOpt, pLuOpt );
4256  }
4257 
4258  return pMat;
4259  } // ----- end of factory method PMatrix::Create -----
4260 
4261  template<typename T>
4262  inline PMatrix<T> * PMatrix<T>::Create(const SuperLUOptions * pLuOpt)
4263  {
4264  PMatrix<T> * pMat = NULL;
4265  if(pLuOpt->Symmetric == 0){
4266  pMat = new PMatrixUnsym<T>();
4267  }
4268  else{
4269  pMat = new PMatrix<T>();
4270  }
4271 
4272  return pMat;
4273  } // ----- end of factory method PMatrix::Create -----
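
Both Create overloads dispatch on the Symmetric flag of the SuperLU options: a PMatrixUnsym<T> is returned for unsymmetric problems and a plain PMatrix<T> otherwise, and the caller owns the returned pointer. A usage sketch with hypothetical grid, supernode and option objects:

    // gridPtr, superPtr, selInvOpt and luOpt are assumed to be set up elsewhere.
    luOpt.Symmetric = 0;   // request the unsymmetric implementation
    PMatrix<Complex>* PMloc =
        PMatrix<Complex>::Create( gridPtr, superPtr, &selInvOpt, &luOpt );
    // ... ConstructCommunicationPattern, PreSelInv, SelInv, etc. ...
    delete PMloc;          // the caller owns the object returned by Create
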
4274 } // namespace PEXSI
4275 
4276 
4277 
4278 #endif //_PEXSI_PSELINV_IMPL_HPP_