// -*- Mode: C++; Modified: "Fri Dec 17 17:24:07 1999 by dbs"; -*- 
// file: LoadBalance.cpp

// Purpose:
//  This file contains a global function to compute an assignment
//  of boxes to processors for an AMR hierarchy.  The assignment
//  is made so as to balance the computation and communication
//  workload on each processor (i.e. make it as even as possible).
//
//  ********************************************************
//  CAVEAT: this version ignores the communication workload.
//          It balances ONLY the compute workload.
//  ********************************************************
//
// Usage:
//  The boxes in the AMR hierarchy are stored as a Vector of
//  Vectors of Boxes.  The first (outer) vector indexes over
//  the levels and the second (inner) vector indexes over the boxes
//  on each level.
//  The computational workload is represented as a real number for
//  each box in the hierarchy, stored as a Vector of Vectors of Reals
//  in the same across-level/in-level ordering as the boxes.
//  The communication workload is represented as a real number for
//  each connection between pairs of boxes, which specifies the cost of 
//  communication between the two boxes if they are on different 
//  processors.  No allowance is made for varying the communication 
//  cost depending on the distance between processors (ie. all processors
//  are equally far apart) and it assumes that the effective cost of
//  communicating between two boxes on the same processor is zero.
//  The resulting assignment is represented as an integer for 
//  each box in the hierarchy, which gives the processor number (starting
//  at zero) on which each box should reside.  Represented as a 
//  Vector of Vectors of ints in the same ordering as the boxes and
//  loads.
//  The other output argument is a measure of how close to perfect the
//  final load balance is.  The \var{effRatio} output is defined
//  as the smallest load divided by the largest load.  A perfect load
//  balance has efficiency == 1.
//  It is important to note that it is the sum of the computation cost 
//  and communication cost that is balanced, so the values in the two
//  variables must be in the same units.  It doesn't matter what the
//  units are.  
//
// Interface Specifications:
//  There are several interfaces to the LoadBalance function that allow
//  for different ways to represent the boxes and loads to be balanced.
//
//  This version is the full-functionality version: it takes a hierarchy
//  of boxes (represented as a Vector<Vector<Box>>) and returns processor
//  assignments for all levels of the hierarchy.
//  
//  int LoadBalance(
//    Vector<Vector<int>>  procAssignments  //output: processor number for each
//                                          //   box indexed by [level][box].
//    Real                 effRatio         //output: efficiency ratio
//    Vector<Vector<Box>>  Grids            //input: meshes to balance, indexed
//                                          //   by [level][box]
//    Vector<Vector<long>> ComputeLoads     //input: computational cost of each
//                                          //   box, indexed by [level][box]
//    [...something...]    CommunicateLoads //input: communication cost
//                                          //   between each pair of 
//                                          //   neighboring boxes
//    Vector<int>          RefRatios        //input: refinement ratio
//    )                                     //   for each level
//
//  This version takes the boxes from an existing BoxLayout and modifies
//  it with the processor assignments.
//
//  int LoadBalance(
//    Vector<BoxLayout>    Grids            //in-out: input boxes to balance,
//                                          //   output processor numbers
//    Real                 effRatio         //same as above
//    Vector<Vector<long>> ComputeLoads     //   "
//    [...something...]    CommunicateLoads //   "
//    Vector<int>          RefRatios        //   "
//    )
//
//  This version balances only a single level grid and implicitly assumes
//  the load is proportional to the number of points in each box.
//
//  int LoadBalance(
//    Vector<int>          procAssignments  //output: processor number for each
//                                              box in a single level
//    Vector<Box>          boxes            //input: boxes to balance
//    )
//
//  Return value: integer status_code (see "Exceptions:" below)
//    =0  means successful completion
//    <0  means a fatal exception occurred; the output is undefined
//    >0  means a non-fatal exception occurred, but the output is defined
//
//
// Method:
//  This subroutine uses the Kernighan-Lin algorithm for
//  solving knapsack problems.  It operates one level at a time.
//  The algorithm is:
//   0. loop over levels
//   1.   sort loads by decreasing value
//   2.   loop over loads
//   3.     assign load to processor with lowest total load
//   4.   loop until done
//   5.     select the processor with the worst load balance
//   6.     loop over loads il in this processor ip
//   7.       loop over loads jl in other processors jp
//   8.         if (swapping il and jl improves load balance more 
//                       than any previous swap) then
//   9.             save jp, il,jl and the improvement_in_balance
//  10.     if any improvement_in_balance, then swap ip,il with jp,jl
//  11.   repeat (4) until no swap improves the load balance
//
//   Complexity is: O( NP * NL/NP * NL * nr ) + O( NP^2)
//     where: NP=#processors, NL=#loads(ie. boxes),
//            nr is the # of repetitions of step 4 (may be O(NP))
//            and the O(NP^2) term is due to Step 11.
//
// Implementation Notes:
//  There are several places in this implementation where the
//  cost could be reduced significantly with more complex coding.
//  In particular, most of the searches are done by brute force,
//  resulting in O(N) or O(N^2) cost where a better implementation
//  using sorted lists would cost only O(log N) or O(N log N).
//  This isn't a big deal for a prototype, but for a problem with
//  1000s of boxes this will be significant.
//
// Exceptions:
//  -1011 input vectors (\var{Grids} and \var{ComputeLoads}) are not
//        the same size
//  -1012 one of the vector elements of the input vectors (\var{Grids} 
//        and \var{ComputeLoads}) are not the same size
//  -1013 input vectors (\var{Grids} and \var{RefRatios}) are not
//        the same size
//
// References:
// B. Kernighan and S. Lin, "An effective heuristic procedure for partitioning
// graphs", The Bell System Technical Journal, pp. 291--308, Feb 1970
//
// Modification History
//  19Nov99 <dbs> initial design and coding

#ifndef NDEBUG
#include <iostream>
using std::cout;
#endif
#include <algorithm>
#include "DataIterator.H"
#include "Misc.H"
#include "SPMD.H"
#include "LoadBalance.H"
#include "LayoutIterator.H"

// local prototypes
// (both helpers are defined in the utility section at the end of this file)

// Returns the index of the smallest entry of a non-empty Vector; the first
// such index is returned when the minimum occurs more than once.
int
min_element( const Vector<long>& Vect ) ;
// Stores in imin/imax the indices of the smallest/largest entries of a
// non-empty Vector (first occurrence of each).
void
min_max_elements( int& imin ,int& imax ,const Vector<long>& Vect ) ;

// Local helper record used while balancing one level: the compute load of
// a box together with the position of that box in the level's Grids[]
// vector.  Loads compare by their load value only, which is the ordering
// std::sort() uses when the per-level load list is sorted.
class Load
{
  public:
    // A default-constructed Load carries zero load and refers to grid 0.
    Load()
      : load( 0 )
      , grid_index( 0 )
    {
    }

    // Orders by load; grid_index takes no part in the comparison.
    bool operator < ( const Load& rhs ) const
    {
      return rhs.load > load ;
    }

    long load ;      // compute load of this box
    int grid_index ; // index of this box in Grids[]
};


// Code:

///
// This version takes a Vector<BoxLayout>, builds the matching
// Vector<Vector<Box> > (one inner vector per level, boxes in layout-iterator
// order) and passes it to the full version.  On success the processor
// numbers computed by the full version are written back into the layouts
// with BoxLayout::setProcID(), visiting the boxes in the same order.
//
// Return value: the status code of the full version.  If that code is
// negative the layouts in Grids are left untouched.
///
int
LoadBalance( Vector<BoxLayout>&    Grids            //in-out: input grids to balance
//                                                  //   and output proc. numbers
            ,Real&                 effRatio         //output: ratio of min load
//                                                  //   to max load
            ,const Vector<Vector<long> >&  ComputeLoads     //input: computational cost
//                                                  //   of each box; indexed by [Level][Box]
//## ,const [...something...]&     CommunicateLoads //input: communication cost
//                                                  //   between each pair of
//                                                  //   neighboring boxes
            ,const Vector<int>&           RefRatios        //input: refinement ratio
//                                                  //   for each level
            ,int nProc                              //input: number of processors
             )
{
  // One (initially empty) box list per level.
  Vector<Vector<Box> > boxes( Grids.size() ) ;

  for( int i = 0 ; i < Grids.size() ; ++i ) {
    LayoutIterator j = Grids[i].layoutIterator() ;
    for( j.reset() ; j.ok() ; ++j ) {
      // The inner vectors start out empty, so the boxes have to be
      // appended; writing through boxes[i][p] would index past the end.
      boxes[i].push_back( Grids[i][j()] ) ;
    }
  }

  Vector<Vector<int> > procIDs( Grids.size() ) ;
  int status = LoadBalance( procIDs ,effRatio
                           ,boxes ,ComputeLoads ,RefRatios, nProc ) ;

  if( status < 0 ) return status ;  //LoadBalance() failed

  // Copy the assignments back, walking each layout in the same order
  // that was used to build boxes[] above.
  for( int i = 0 ; i < Grids.size() ; ++i ) {
    LayoutIterator j = Grids[i].layoutIterator() ;
    int p ;
    for( j.reset() ,p = 0 ; j.ok() ; ++j ,++p ) {
      Grids[i].setProcID( j() ,procIDs[i][p] ) ;
    }
  }

  return status ;
}


///
// This version balances a single level given as a Vector<Box>.
// The compute load of each box is taken to be its number of points
// (Box::numPts()), a one-level hierarchy with refinement ratio 1 is
// built around the boxes, and the full version is called.
//
//   procs  output: processor number for each box, in the order of boxes.
//                  Only written when the full version does not report a
//                  fatal error.
//   boxes  input:  boxes to balance
//
// Return value: the status code of the full version (<0 is fatal).
//
// NOTE(review): nProc is not passed to the full version here, so this
// relies on the declaration in LoadBalance.H supplying a default for
// it -- confirm against the header.
///
int LoadBalance(Vector<int>& procs, const Vector<Box>& boxes)
{

  Vector<Vector<Box> > layouts(1, boxes);
  Vector<Vector<long> > computeLoads(1, Vector<long>(boxes.size()));
  Vector<int> refRatios(1,1);
  Vector<Vector<int> > assignments(1,Vector<int>(boxes.size(), -1));
  Real effRatio = 1.0;  // filled in by the full version; not returned to the caller

  for(int index = 0; index < boxes.size(); ++index)
    {
      computeLoads[0][index] = layouts[0][index].numPts();
    }

  int ret = LoadBalance(assignments, effRatio, layouts, computeLoads, refRatios);

  // Positive codes are documented as non-fatal with defined output, so
  // only a negative (fatal) code suppresses the result.  This matches the
  // status check made by the Vector<BoxLayout> version.
  if(ret >= 0)
    procs = assignments[0];

  return ret;
}


///
// This version does the real work.
//
// Each level is balanced independently of the others:
//   1. the boxes of the level are sorted by compute load;
//   2. working from the largest load down, each box is handed to the
//      processor whose running total load is currently the lowest;
//   3. the processor whose total is furthest from the (truncated) average
//      load is selected, every exchange of one of its boxes with one box
//      of any other processor is examined, and the exchange that most
//      reduces the summed deviation of the two processors from the
//      average is carried out.  Step 3 repeats until no exchange gives
//      an improvement.
//
// Arguments:
//   procAssignments  output: processor number (0..nProc-1) for each box,
//                            indexed by [level][box]
//   effRatio         output: the smallest value, over all levels, of
//                            (min processor load)/(max processor load);
//                            1.0 when nProc is 1
//   Grids            input:  boxes to balance, indexed by [level][box];
//                            only the sizes of these vectors are used
//   ComputeLoads     input:  compute cost of each box, same indexing
//   RefRatios        input:  refinement ratio per level; only its length
//                            is checked
//   nProc            input:  number of processors to assign to
//
// Return value:
//       0  success
//   -1011  Grids and ComputeLoads have a different number of levels
//   -1012  some level has a different number of boxes and loads
//   -1013  Grids and RefRatios have a different number of levels
//   -1014  nProc is less than 1
///
int
LoadBalance(Vector<Vector<int> >& procAssignments  //output: processor number
//                                                  //   for each box
            ,Real&                 effRatio         //output: ratio of min load
//                                                  //   to max load
            ,const Vector<Vector<Box> >&  Grids            //input: meshes to balance
            ,const Vector<Vector<long> >& ComputeLoads     //input: computational cost
//                                                  //   of each box
//## ,const [...something...]&     CommunicateLoads //input: communication cost
//                                                  //   between each pair of
//                                                  //   neighboring boxes
            ,const Vector<int>&           RefRatios        //input: refinement ratio
                                                    //   for each level
            ,int nProc                            // number of procs to assign to
             )
{
  // local variables
  int status = 0 ; // return code

  // Validate inputs
  if( Grids.size() != ComputeLoads.size() ) { return -1011 ; }
  if( Grids.size() != RefRatios.size() ) { return -1013 ; }
  for( int lvl=0 ; lvl<Grids.size() ; ++lvl )
    {
      if( Grids[lvl].size() != ComputeLoads[lvl].size() ) { return -1012 ; }
    }
  // Without at least one processor there is nothing to assign to, and the
  // code below would index empty per-processor vectors and divide by nProc.
  if( nProc < 1 ) { return -1014 ; }

  // set the number of elements in the output vector to the number
  // of levels and the number of elements in each element to the
  // number of boxes on each level; elements added by the resize
  // are given the value zero
  procAssignments.resize( Grids.size() ) ;
  for( int lvl=0 ; lvl<Grids.size() ; ++lvl )
    {
      procAssignments[lvl].resize( Grids[lvl].size(),0 ) ;
    }

  // check for special case of all loads on 1 processor
  if( nProc == 1 )
    {
      // every box goes to processor 0 (written explicitly so that entries
      // already present in procAssignments on entry are overwritten too)
      for( int lvl=0 ; lvl<Grids.size() ; ++lvl )
        {
          for( int i=0 ; i<Grids[lvl].size() ; ++i )
            {
              procAssignments[lvl][i] = 0 ;
            }
        }
      effRatio = 1.0 ;
      status = 0 ;
    }
  else
    {
      // general case: loads on more than one processor
      effRatio = 1.0 ;

      // balance each level separately
      for( int lvl=0 ; lvl<Grids.size() ; ++lvl )
        {
          // first, build the load structure and sort by compute_load
          Vector<Load> loads( Grids[lvl].size() ) ;
          for( int i=0 ; i<Grids[lvl].size() ; ++i )
            {
              loads[i].load = ComputeLoads[lvl][i] ;
              loads[i].grid_index = i ;
            }
          std::sort( loads.begin() ,loads.end() ) ;

          // do the initial assignments by sequentially
          // `handing out' the loads from largest to smallest
          Vector<long> total_loads( nProc,0 ) ; //total load per processor
          Vector<Vector<Load> > proc_loads( nProc ) ; //loads per processor
          int iproc_minload = 0 ; //processor with lowest load
          // loads are sorted in increasing order, so work backwards through the vector
          for( int i=loads.size()-1 ; i>=0 ; --i )
            {
              // put the next load on the processor with the lowest total load
              proc_loads[iproc_minload].push_back( loads[i] ) ;
              total_loads[iproc_minload] += loads[i].load ;

              // recompute which processor has the lowest load
              //[NOTE: this would be faster if the loads were sorted]
              iproc_minload = min_element( total_loads ) ;
            }
          // compute average load per processor, truncated to an integer
          long avg_load = 0 ;
          for( int i=0 ; i<total_loads.size() ; ++i ) avg_load += total_loads[i] ;
          avg_load /= nProc ;

          // optimize the assignments by swapping a load off the
          // processor with the worst load onto another processor
          // such that the load balance is improved
          int iter_count = 0, swap_count = 0 ; //only reported by the debug output below
          int iproc_maxload = 0 ;
          long max_change ; //largest change in load balance
          //box and processor indices corresponding to max_change; ibmax, jbmax
          //and jpmax are only meaningful once a swap with max_change > 0 is found
          int ibmax = -1 ,jbmax = -1 ,ipmax = -1 ,jpmax = -1 ;

          while( 1 )
            {
              max_change = 0 ;

              // find the processor that has the largest deviation from perfect load balance
              min_max_elements( iproc_minload ,iproc_maxload ,total_loads ) ;
              if( iproc_minload == iproc_maxload )
                {
                  // load balance is perfect
                  // (this won't happen except in test cases)
                  break ;
                }
              assert( total_loads[iproc_minload] <= avg_load &&
                      avg_load <= total_loads[iproc_maxload] ) ;
              if( avg_load - total_loads[iproc_minload] > total_loads[iproc_maxload] - avg_load )
                ipmax = iproc_minload ;
              else
                ipmax = iproc_maxload ;

              // deviation from perfect load balance for this proc
              long devib = total_loads[ipmax] - avg_load ;

              // search all the other processors for the swap that has the maximum
              // reduction in the total deviation from the perfect load balance
              for( int j=0 ; j<proc_loads.size() ; ++j )
                {
                  if( j != ipmax )
                    {
                      long devjb = total_loads[j] - avg_load ;

                      // loop over all boxes on both processors
                      for( int ibox=0 ; ibox<proc_loads[ipmax].size() ; ++ibox )
                        {
                          for( int jbox=0 ; jbox<proc_loads[j].size() ; ++jbox )
                            {
                              iter_count++ ;
                              // how much bigger is the ibox load than the jbox load?
                              long diff = proc_loads[ipmax][ibox].load
                                        - proc_loads[  j  ][jbox].load ;
                              // change in total deviation from swapping boxes
                              long change = Abs( devib ) + Abs( devjb )
                                          - Abs( devib - diff ) - Abs( devjb + diff ) ;
                              // remember this pair of boxes if the change is better
                              //[NOTE: max_change starts at 0, so this is always an improvement]
                              if( change > max_change )
                                {
                                  max_change = change ;
                                  ibmax = ibox ; jbmax = jbox ; jpmax = j ;
                                }
                            }
                        }
                    }
                }
              // if there is a swap that improves load balance, take it; else stop
              if( max_change > 0 )
                {
                  // adjust the total loads on each processor
                  long load_diff = proc_loads[ipmax][ibmax].load
                                 - proc_loads[jpmax][jbmax].load ;
                  assert( load_diff != 0 ) ;
                  total_loads[ipmax] -= load_diff ;
                  total_loads[jpmax] += load_diff ;
                  // swap the loads
                  Load tmp = proc_loads[ipmax][ibmax] ;
                  proc_loads[ipmax][ibmax] = proc_loads[jpmax][jbmax] ;
                  proc_loads[jpmax][jbmax] = tmp ;

                  swap_count++ ;
                }
              else
                {
                  break ;
                }
            }

          // Done with this level.

          // Compute the final efficiency ratio and save it if appropriate.
          // A level whose largest processor load is zero (e.g. a level with
          // no boxes) is left at 1.0 instead of dividing by zero.
          min_max_elements( iproc_minload ,iproc_maxload ,total_loads ) ;
          Real eff_ratio = 1.0 ; // efficiency ratio on this level
          if( total_loads[iproc_maxload] != 0 )
            {
              eff_ratio = static_cast<Real>( total_loads[iproc_minload] )
                        / static_cast<Real>( total_loads[iproc_maxload] ) ;
            }
          if( eff_ratio < effRatio ) effRatio = eff_ratio ;

          // Assign boxes to processors for this level.
          for( int ip=0 ; ip<proc_loads.size() ; ++ip )
            {
              for( int jb=0 ; jb<proc_loads[ip].size() ; ++jb )
                {
                  procAssignments[lvl][proc_loads[ip][jb].grid_index] = ip ;
                }
            }

#ifndef NDEBUG
//           if( iter_count > 0 )
//             {
//               cout << "    debug: LoadBalance: level " << lvl << " used "
//                    << iter_count << " iterations and "
//                    << swap_count << " swaps to get efficiency ratio "
//                    << eff_ratio << std::endl ;
//             }
#endif
        }

      // Done with all levels.
      // We could try to permute processors assignments between levels to
      // reduce communication, but it is probably not worth the effort
      // since it probably would have O(N^4) cost (N==#boxes).
    }

  return status;
}


////////////////////////////////////////////////////////////////
//                utility functions                           //
////////////////////////////////////////////////////////////////

//
// Return the index of the smallest value in a non-empty (long) Vector.
// If the smallest value occurs more than once, the first index wins.
//
int
min_element( const Vector<long>& Vect )
{
  assert( Vect.size() > 0 );

  int best = 0 ;  // index of the smallest value seen so far
  int k = 1 ;
  while( k < Vect.size() )
    {
      if( Vect[best] > Vect[k] )
        {
          best = k ;
        }
      ++k ;
    }
  return best ;
}


//
// Find the indices of the smallest and the largest value in a non-empty
// (long) Vector and store them in imin and imax respectively.  For
// repeated values the first index is the one reported.
//
void
min_max_elements( int& imin ,int& imax ,const Vector<long>& Vect )
{
  assert( Vect.size() > 0 );

  // start with element 0 as both candidates, then scan the rest
  imin = 0 ;
  imax = 0 ;
  int k = 1 ;
  while( k < Vect.size() )
    {
      if( Vect[imin] > Vect[k] )
        {
          imin = k ;
        }
      if( Vect[imax] < Vect[k] )
        {
          imax = k ;
        }
      ++k ;
    }
}
