/* _______              __
  / ___/ /  ___  __ _  / /  ___
 / /__/ _ \/ _ \/  ' \/ _ \/ _ \
 \___/_//_/\___/_/_/_/_.__/\___/ 
*/
/*

** This software is copyright (C) by the Lawrence Berkeley National
** Laboratory.  Permission is granted to reproduce this software for
** non-commercial purposes provided that this notice is left intact.
**  
** It is acknowledged that the U.S. Government has rights to this
** software under Contract DE-AC03-765F00098 between the U.S.  Department
** of Energy and the University of California.
**  
** This software is provided as a professional and academic contribution
** for joint exchange. Thus it is experimental, is provided ``as is'',
** with no warranties of any kind whatsoever, no support, no promise of
** updates, or printed documentation. By using this software, you
** acknowledge that the Lawrence Berkeley National Laboratory and Regents
** of the University of California shall have no liability with respect
** to the infringement of other copyrights by any part of this software.
*/

#include <cstdlib>
#include <algorithm>
#include "parstream.H"

using std::sort;


// Default constructor.  Starts with no send/receive buffers (null
// pointers, zero capacity); no layout or data is set up here.
template<class T> inline
LevelData<T>::LevelData()
  : m_sendbuffer(NULL),
    m_sendcapacity(0),
    m_recbuffer(NULL),
    m_reccapacity(0)
{
#ifdef MPI
  // no non-blocking sends have been posted yet
  numSends = 0;
#endif
}


// Construct and define in one step.
//   dp        - disjoint layout the data lives on; must be closed
//   comps     - number of components; must be positive
//   ghost     - ghost vector stored in m_ghost and handed to allocateGhostVector
//   a_factory - factory handed to allocateGhostVector
// The validation mirrors define(const DisjointBoxLayout&, int, ...):
// an unclosed layout or a non-positive component count is reported
// through MayDay::Error.
template<class T> inline
LevelData<T>::LevelData(const DisjointBoxLayout& dp, int comps, const IntVect& ghost,
						const DataFactory<T>& a_factory)
  : m_disjointBoxLayout(dp), m_ghost(ghost), m_sendbuffer(NULL),
	m_sendcapacity(0),  m_recbuffer(NULL),m_reccapacity(0)
{ 
#ifdef MPI
  numSends = 0;
#endif
  m_boxLayout = dp; 
  m_comps = comps;
  m_isdefined = true;

  if(!dp.isClosed())
    {
      MayDay::Error("non-disjoint DisjointBoxLayout: LevelData<T>::LevelData(const DisjointBoxLayout& dp, int comps)");
    }
  // Same guard as define(): Interval(0, comps-1) below is meaningless
  // for a non-positive component count.
  if(comps<=0)
    {
      MayDay::Error("LevelData<T>::LevelData(const DisjointBoxLayout& dp, int comps)  comps<=0");
    }

  Interval interval(0, comps-1);
  allocateGhostVector(a_factory, ghost);
  setVector(*this, interval, interval);
}

// LevelData deliberately prevents users from invoking the 'define'
// methods that take a general BoxLayout.  As a consequence those
// functions cannot be reused from here either, so their logic is
// re-implemented below.

// (Re)define this object on a disjoint layout.
//   dp        - disjoint layout the data lives on; must be closed
//   comps     - number of components; must be positive
//   ghost     - ghost vector stored in m_ghost and handed to allocateGhostVector
//   a_factory - factory handed to allocateGhostVector
// An unclosed layout or a non-positive component count is reported
// through MayDay::Error.
template<class T> inline
void LevelData<T>::define(const DisjointBoxLayout& dp, int comps, const IntVect& ghost,
						  const DataFactory<T> & a_factory)
{
  m_isdefined = true;
  if(!dp.isClosed())
    {
      MayDay::Error("non-disjoint DisjointBoxLayout: LevelData<T>::define(const DisjointBoxLayout& dp,....)");
    }
  if(comps<=0)
    {
      // message names this function (it previously named a constructor
      // taking a BoxLayout, which made the failure hard to locate)
      MayDay::Error("LevelData<T>::define(const DisjointBoxLayout& dp, int comps,....)  comps<=0");
    }
  m_comps = comps;
  m_boxLayout = dp;

  m_disjointBoxLayout = dp;
  m_ghost = ghost;

  Interval interval(0, comps-1);
  allocateGhostVector(a_factory, ghost);
  setVector(*this, interval, interval);
}

// Define this object from another LevelData: takes da's disjoint
// layout, component count and ghost vector, allocates through
// a_factory, then calls setVector with da over all components.
// Defining an object from itself is a no-op.
template<class T> inline
void LevelData<T>::define(const LevelData<T>& da,  const DataFactory<T> & a_factory)
{
  // Check for self-definition first: the object must not be flagged as
  // defined when nothing has been set up or allocated for it.
  if(this == &da) return;
  m_isdefined = true;
  m_disjointBoxLayout = da.m_disjointBoxLayout;
  m_boxLayout  = da.m_disjointBoxLayout;
  m_comps     = da.m_comps;
  m_ghost     = da.m_ghost;
	
  Interval srcAnddest(0, m_comps-1);
	
  allocateGhostVector(a_factory, m_ghost);
  setVector(da, srcAnddest, srcAnddest);
}


// Define this object from a component sub-range of another LevelData.
//   da        - source; supplies the disjoint layout and ghost vector
//   comps     - components of da to take; this object ends up with
//               comps.size() components numbered from 0
//   a_factory - factory handed to allocateGhostVector
// Calling this with da == *this is reported through MayDay::Error.
template<class T> inline
void LevelData<T>::define(const LevelData<T>& da, const Interval& comps,
						   const DataFactory<T>& a_factory)
{
  m_isdefined = true;
  if(this == &da){
    MayDay::Error(" LevelData<T>::define(const LevelData<T>& da, const Interval& comps) called with 'this'");
  }
  assert(comps.size()>0);
  // The requested range must lie inside the SOURCE's components.  The
  // bound has to come from da: this object's own m_comps is not set
  // until below and may hold a stale or meaningless value here.
  // Intervals are built as (0, n-1) for n components in this file, so
  // the last valid index is da.m_comps-1.
  assert(comps.end()<da.m_comps);
  assert(comps.begin()>=0);

  m_disjointBoxLayout = da.m_disjointBoxLayout;
  m_boxLayout  = da.m_disjointBoxLayout;

  m_comps = comps.size();

  m_ghost = da.m_ghost;

  Interval dest(0, m_comps-1);

  allocateGhostVector(a_factory, m_ghost);

  setVector(da, comps, dest);

}

// Copy components srcComps of this object into components destComps
// of a BoxLayoutData.  When both share the same layout the copy is
// done box-by-box with no Copier; otherwise a Copier from this
// object's disjoint layout to dest's layout is built and used.
template<class T> inline
void LevelData<T>::copyTo(const Interval& srcComps, 
			  BoxLayoutData<T>& dest,
			  const Interval& destComps) const
{
  // copying an object onto itself: nothing to do
  const BoxLayoutData<T>* const self = this;
  if(self == &dest)
    {
      return;
    }

  if(!(boxLayout() == dest.boxLayout()))
    {
      // different layouts: go through a Copier
      Copier copier(m_disjointBoxLayout, dest.boxLayout());
      copyTo(srcComps, dest, destComps, copier);
      return;
    }

  // identical layouts: direct copy on every box, no Copier needed
  for(DataIterator dit(dataIterator()); dit.ok(); ++dit)
    {
      dest[dit()].copy(box(dit()), destComps, (*this)[dit()], srcComps);
    }
}

// Copy components srcComps of this object into components destComps
// of another LevelData.  Passing this object as dest is reported
// through MayDay::Error.  The direct box-by-box path is taken only
// when the layouts match AND dest has a zero ghost vector; otherwise
// a Copier that includes dest's ghost vector is built and used.
template<class T> inline
void LevelData<T>::copyTo(const Interval& srcComps, 
			  LevelData<T>& dest,
			  const Interval& destComps) const
{
  if(this == &dest)
    {
      MayDay::Error("src == dest in copyTo function. Perhaps you want exchange ?");
    }

  const bool directCopy =
    (boxLayout() == dest.boxLayout()) && (dest.ghostVect() == IntVect::Zero);

  if(!directCopy)
    {
      Copier copier(m_disjointBoxLayout, dest.getBoxes(), dest.m_ghost);
      copyTo(srcComps, dest, destComps, copier);
      return;
    }

  // same layout, no ghost cells in dest: direct copy on every box
  for(DataIterator dit(dataIterator()); dit.ok(); ++dit)
    {
      dest[dit()].copy(box(dit()), destComps, (*this)[dit()], srcComps);
    }
}


// Copy into a BoxLayoutData using a caller-supplied Copier; simply
// forwards to makeItSo with this object as the source.
template<class T> inline
void LevelData<T>::copyTo(const Interval& srcComps, 
			  BoxLayoutData<T>& dest,
			  const Interval& destComps,
			  const Copier& copier) const
{
  this->makeItSo(srcComps, *this, dest, destComps, copier);
}

// Copy into another LevelData using a caller-supplied Copier; simply
// forwards to makeItSo with this object as the source.
template<class T> inline
void LevelData<T>::copyTo(const Interval& srcComps, 
			  LevelData<T>& dest,
			  const Interval& destComps,
			  const Copier& copier) const
{
  this->makeItSo(srcComps, *this, dest, destComps, copier);
}

// Exchange components comps of this object with itself.  A Copier
// from this object's disjoint layout onto itself, with the ghost
// vector m_ghost, is rebuilt on every call.  (A direct algorithm
// borrowed from Copier::define could replace this later; the present
// form is the easy-to-debug one.)
template<class T> inline 
void LevelData<T>::exchange(const Interval& comps)
{
  Copier ghostCopier(m_disjointBoxLayout, m_disjointBoxLayout, m_ghost);
  exchange(comps, ghostCopier);
}

// Exchange with a caller-supplied Copier: this object is both source
// and destination of makeItSo, over the same components.
template<class T> inline
void LevelData<T>::exchange(const Interval& comps,
			    const Copier& copier)
{
  this->makeItSo(comps, *this, *this, comps, copier);
}

// Carry out the data motion described by a_copier: components
// a_srcComps of a_src go to components a_destComps of a_dest.
// The sequence is: finish earlier sends, set up buffers and message
// queues, pack outgoing data, post receives, post sends, do the local
// copies, then unpack what was received.
template<class T> inline
void LevelData<T>::makeItSo(const Interval&   a_srcComps, 
			    const LevelData<T>& a_src,
			    BoxLayoutData<T>& a_dest,
			    const Interval&   a_destComps,
			    const Copier&     a_copier) const
{
  // The communication helpers called below are no-ops in a
  // uniprocessor (non-MPI) build.

  // wait for sends left over from a possible previous operation
  completePendingSends();

  // size the buffers and build the 'fromMe' and 'toMe' queues
  allocateBuffers(a_src, a_srcComps, a_dest, a_destComps, a_copier);

  writeSendDataFromMeIntoBuffers(a_src, a_srcComps);

  // both postings are non-blocking
  postReceivesToMe();
  postSendsFromMe();

  // Design note: computation that could overlap the non-blocking
  // communication would belong here.
  //
  // One idea is to return at this point an object encapsulating the
  // argument list above (a "ChomboMessaging" object).  The user keeps
  // a reference to it, computes for as long as possible without the
  // communicated data, then calls its "finalize()" function, which
  // runs the remainder of this routine.  It is an open question
  // whether there is enough computation to overlap to justify the
  // effort, and which machines have asynchronous messaging good
  // enough to make it worthwhile.
  //
  // The other idea is a finer decomposition: use the ChomboMessaging
  // object in the DataIterator construction, so that the DataIterator
  // hands out T objects as their messaging completes.  That keeps
  // almost all Chombo code unchanged but would be very tricky to
  // implement and might gain little; it should not be tried until
  // Chombo is heavily instrumented for performance measurement.  In
  // that design unpackReceivesToMe() would become a complicated
  // process interwoven with a DataIterator.

  // local (same-process) copies
  for(CopyIterator localIt(a_copier, CopyIterator::LOCAL); localIt.ok(); ++localIt)
    {
      const MotionItem& motion = localIt();
      a_dest[motion.toIndex].copy(motion.region,
                                  a_destComps,
                                  a_src[motion.fromIndex],
                                  a_srcComps);
    }

  // no-op in uniprocessor mode
  unpackReceivesToMe(a_dest, a_destComps);
}

// Disabled overload: a LevelData may not be defined from a general
// BoxLayout.  Any call is reported through MayDay::Error.
template<class T> inline
void LevelData<T>::define(const BoxLayout& dp,
                          int comps,
                          const DataFactory<T>& a_factory)
{
  MayDay::Error("LevelData<T>::define called with BoxLayout input");
}

// Disabled overload: a LevelData may not be defined from a general
// BoxLayout.  Any call is reported through MayDay::Error.
template<class T> inline
void LevelData<T>::define(const BoxLayout& dp)
{
  MayDay::Error("LevelData<T>::define called with BoxLayout input");
}

// Disabled overload: a LevelData may not be defined from a
// BoxLayoutData.  Any call is reported through MayDay::Error.
template<class T> inline
void LevelData<T>::define(const BoxLayoutData<T>& da,
                          const DataFactory<T>& a_factory )
{
  MayDay::Error("LevelData<T>::define called with BoxLayout input");
}

// Disabled overload: a LevelData may not be defined from a component
// range of a BoxLayoutData.  Any call is reported through
// MayDay::Error.
template<class T> inline
void LevelData<T>::define(const BoxLayoutData<T>& da,
                          const Interval& comps,
                          const DataFactory<T>& a_factory)
{
  MayDay::Error("LevelData<T>::define called with BoxLayout input");
}

// Destructor: waits for any outstanding sends before releasing the
// send and receive buffers (free() of a null pointer is harmless).
template<class T> inline
LevelData<T>::~LevelData()
{
  completePendingSends();

  free(m_recbuffer);
  free(m_sendbuffer);
}

#ifndef MPI
// uniprocessor version of all these nullop functions.
// Uniprocessor build: no messages are ever sent, nothing to wait for.
template<class T> inline
void LevelData<T>::completePendingSends() const
{
  // intentionally empty
}

// Uniprocessor build: no message buffers are required.
template<class T> inline
void LevelData<T>::allocateBuffers(const LevelData<T>& a_src,
                                   const Interval& a_srcComps,
                                   const BoxLayoutData<T>& a_dest,
                                   const Interval& a_destComps,
                                   const Copier&   a_copier) const
{
  // intentionally empty
}

// Uniprocessor build: there is no outgoing data to pack.
template<class T> inline
void LevelData<T>::writeSendDataFromMeIntoBuffers(const LevelData<T>& a_src,
                                                  const Interval&     a_srcComps) const
{
  // intentionally empty
}

// Uniprocessor build: no sends to post.
template<class T> inline
void LevelData<T>::postSendsFromMe() const
{
  // intentionally empty
}

// Uniprocessor build: no receives to post.
template<class T> inline
void LevelData<T>::postReceivesToMe() const
{
  // intentionally empty
}

// Uniprocessor build: nothing was received, nothing to unpack.
template<class T> inline
void LevelData<T>::unpackReceivesToMe(BoxLayoutData<T>& a_dest,
                                      const Interval&   a_destComps) const
{
  // intentionally empty
}

#else

// MPI versions of the above codes.

// MPI build: block until every send posted by postSendsFromMe() has
// completed, then release the request/status arrays and reset the
// send count.  Does nothing but reset the count when no sends are
// outstanding.
template<class T> inline
void LevelData<T>::completePendingSends() const
{
  if(numSends > 0)
    {
      const int waitResult = MPI_Waitall(numSends, m_sendRequests, m_sendStatus);
      if(waitResult != MPI_SUCCESS)
        {
          // no recovery strategy exists for failed messaging here
        }

      delete[] m_sendStatus;
      delete[] m_sendRequests;
    }

  numSends = 0;
}

// MPI build: prepare one communication round described by a_copier.
//
//  1. Rebuild the m_fromMe queue (items this process sends) and the
//     m_toMe queue (items this process receives), both sorted with
//     bufEntry's ordering so entries for the same process are
//     adjacent.
//  2. Determine the byte size of every entry.  Outgoing sizes come
//     from the source data.  Incoming sizes are computed locally when
//     T::preAllocatable() is true; otherwise they are obtained from
//     the senders in a first message pass.
//  3. Grow m_sendbuffer / m_recbuffer when the required size exceeds
//     the current capacity (buffers are never shrunk).
//  4. Give every queue entry its slice of the corresponding buffer.
//
// a_dest is not used here.
template<class T> inline
void LevelData<T>::allocateBuffers(const LevelData<T>& a_src, 
				   const Interval& a_srcComps,
				   const BoxLayoutData<T>& a_dest,
				   const Interval& a_destComps,
				   const Copier&   a_copier) const
{
  m_fromMe.resize(0);
  m_toMe.resize(0);
  size_t sendBufferSize = 0;
  size_t recBufferSize  = 0;
  // two versions of code here.  one for preAllocatable T, one not.

  T dummy;
  for(CopyIterator it(a_copier, CopyIterator::FROM); it.ok(); ++it)
    {
      const MotionItem& item = it();
      bufEntry b;
      b.item = &item;
      b.size = a_src[item.fromIndex].size(item.region, a_srcComps);
      sendBufferSize+=b.size;
      b.procID = item.procID;
      m_fromMe.push_back(b);
    }
  sort(m_fromMe.begin(), m_fromMe.end());
  for(CopyIterator it(a_copier, CopyIterator::TO); it.ok(); ++it)
    {
      const MotionItem& item = it();
      bufEntry b;
      b.item = &item;
      if (T::preAllocatable())
        {
          b.size = dummy.size(item.region, a_destComps);
          recBufferSize+=b.size;
        }
      b.procID = item.procID;
      m_toMe.push_back(b);
    }
  sort(m_toMe.begin(), m_toMe.end());
  
  if(!T::preAllocatable()) // here is the first pass of the "two-pass" communications
    {
      // In the non-preallocatable case the values of m_toMe[*].size
      // have to be messaged over from the sending processes.
      //
      // This pass owns its request/status arrays.  It must not borrow
      // m_sendRequests: that array is released by
      // completePendingSends() and only re-created later by
      // postSendsFromMe(), so it is not valid storage at this point.
      //
      // NOTE(review): the sizes travel as one MPI_INTEGER each, which
      // presumes bufEntry::size is int-sized - confirm against the
      // bufEntry declaration.
      const int numSizeSends = m_fromMe.size();
      const int numSizeRecvs = m_toMe.size();
      MPI_Request* sizeSendRequests = NULL;
      MPI_Status*  sizeSendStatus   = NULL;

      if(numSizeSends > 0)
        {
          sizeSendRequests = new MPI_Request[numSizeSends];
          sizeSendStatus   = new MPI_Status[numSizeSends];

          // the tag is the running index of the message among those
          // going to the same process
          int lastProc = -1;
          int messageIndex = 0;
          for(int i=0; i<numSizeSends; ++i)
            {
              bufEntry& b = m_fromMe[i];
              if(b.procID == lastProc) messageIndex++;
              else                     messageIndex = 0;
              lastProc = b.procID;
              MPI_Isend(&(b.size), 1, MPI_INTEGER, b.procID, 
                        messageIndex, Chombo_MPI::comm, sizeSendRequests+i);
            }
        }

      if(numSizeRecvs > 0)
        {
          MPI_Request* sizeRecvRequests = new MPI_Request[numSizeRecvs];
          MPI_Status*  sizeRecvStatus   = new MPI_Status[numSizeRecvs];
          int lastProc = -1;
          int messageIndex = 0;
          for(int i=0; i<numSizeRecvs; ++i)
            {
              bufEntry& b = m_toMe[i];
              if(b.procID == lastProc) messageIndex++;
              else                     messageIndex = 0;
              lastProc = b.procID;
              MPI_Irecv(&(b.size), 1, MPI_INTEGER, b.procID, 
                        messageIndex, Chombo_MPI::comm, sizeRecvRequests+i);
            }

          int result = MPI_Waitall(numSizeRecvs, sizeRecvRequests, sizeRecvStatus);
          if(result != MPI_SUCCESS){
            MayDay::Error("First pass of two-phase communication failed");
          }
          delete[] sizeRecvRequests;
          delete[] sizeRecvStatus;

          // The incoming sizes are known only now; without this sum
          // the receive buffer would never be sized for them.
          for(int i=0; i<numSizeRecvs; ++i)
            {
              recBufferSize += m_toMe[i].size;
            }
        }

      if(numSizeSends > 0)
        {
          // The size sends read directly from m_fromMe[*].size, which
          // postSendsFromMe() later modifies while coalescing, so they
          // must be complete before this function returns.
          int result = MPI_Waitall(numSizeSends, sizeSendRequests, sizeSendStatus);
          if(result != MPI_SUCCESS){
            MayDay::Error("First pass of two-phase communication failed");
          }
          delete[] sizeSendRequests;
          delete[] sizeSendStatus;
        }
    }


  // allocate send and receive buffer space.

  if(sendBufferSize > m_sendcapacity)
    {
      free(m_sendbuffer);
      m_sendbuffer = malloc(sendBufferSize);
      if(m_sendbuffer == NULL)
        {
          MayDay::Error("Out of memory in LevelData::allocatebuffers");
        }
      m_sendcapacity = sendBufferSize;
    }

  if(recBufferSize > m_reccapacity)
    {
      free(m_recbuffer);
      m_recbuffer = malloc(recBufferSize);
      if(m_recbuffer == NULL)
        {
          MayDay::Error("Out of memory in LevelData::allocatebuffers");
        }
      m_reccapacity = recBufferSize;
    }
  
  // kept from the original implementation; m_toMe was already sorted above
  sort(m_toMe.begin(), m_toMe.end());

  // hand every entry its consecutive slice of the send buffer
  char* nextFree = (char*)m_sendbuffer;
  for(unsigned int i=0; i<m_fromMe.size(); ++i)
    {
      m_fromMe[i].bufPtr = nextFree;
      nextFree += m_fromMe[i].size;
    }

  // ... and of the receive buffer
  nextFree = (char*)m_recbuffer;
  for(unsigned int i=0; i<m_toMe.size(); ++i)
    {
      m_toMe[i].bufPtr = nextFree;
      nextFree += m_toMe[i].size;
    }
  
  // since fromMe and toMe are sorted based on procID, messages can now be grouped
  // together on a per-processor basis.
 
}



// MPI build: pack the outgoing data.  For every entry of m_fromMe the
// source object at the item's fromIndex writes the item's region,
// components a_srcComps, into the entry's slice of the send buffer
// via linearOut.
template<class T> inline
void LevelData<T>::writeSendDataFromMeIntoBuffers(const LevelData<T>& a_src, 
						  const Interval&     a_srcComps) const
{
  for(unsigned int n=0; n<m_fromMe.size(); ++n)
    {
      const bufEntry&   outgoing = m_fromMe[n];
      const MotionItem& motion   = *(outgoing.item);
      a_src[motion.fromIndex].linearOut(outgoing.bufPtr, motion.region, a_srcComps);
    }
}

// MPI build: post the non-blocking sends for the current round.
//
// m_fromMe was sorted in allocateBuffers(), so entries bound for the
// same process are adjacent and occupy consecutive slices of the send
// buffer.  Each such run is coalesced into a single message: the
// first entry of the run takes the total size and the remaining
// entries get size 0.  numSends ends up as the number of messages
// posted; completePendingSends() later waits on them and releases the
// request/status arrays.
template<class T> inline
void LevelData<T>::postSendsFromMe() const
{

  // now we get the magic of message coalescence
  // fromMe has already been sorted in the allocateBuffers() step.
  
  numSends = m_fromMe.size();
  if(numSends > 1){
    for(unsigned int i=m_fromMe.size()-1; i>0; --i)
      {
        if(m_fromMe[i].procID == m_fromMe[i-1].procID) 
          {
            numSends--;
            m_fromMe[i-1].size+=m_fromMe[i].size;
            m_fromMe[i].size = 0;
          }
      }
  }

  // Nothing to send: do not allocate.  completePendingSends() frees
  // the arrays only when numSends > 0, so zero-length arrays created
  // here would never be released.
  if(numSends <= 0) return;

  m_sendRequests = new MPI_Request[numSends];
  m_sendStatus = new MPI_Status[numSends];

  unsigned int next=0;
  for(int i=0; i<numSends; ++i)
    {
      const bufEntry& entry = m_fromMe[next];
      MPI_Isend(entry.bufPtr, entry.size, MPI_BYTE, entry.procID, 
                0, Chombo_MPI::comm, m_sendRequests+i);
      ++next;
      // Skip the rest of this process's run.  The run is recognised by
      // procID rather than by size == 0, so a genuinely zero-byte
      // entry for another process is not skipped by mistake (which
      // could walk 'next' past the end of the queue).
      while(next < m_fromMe.size() && m_fromMe[next].procID == entry.procID) ++next;
    }

}

// MPI build: post the non-blocking receives for the current round.
//
// m_toMe was sorted in allocateBuffers(), so entries coming from the
// same process are adjacent and occupy consecutive slices of the
// receive buffer.  Each such run is coalesced into a single message:
// the first entry of the run takes the total size and the remaining
// entries get size 0.  numReceives ends up as the number of messages
// posted; unpackReceivesToMe() later waits on them and releases the
// request/status arrays.
template<class T> inline
void LevelData<T>::postReceivesToMe() const
{
  numReceives = m_toMe.size();

  if(numReceives > 1){
    for(unsigned int i=m_toMe.size()-1; i>0; --i)
      {
        if(m_toMe[i].procID == m_toMe[i-1].procID) 
          {
            numReceives--;
            m_toMe[i-1].size+=m_toMe[i].size;
            m_toMe[i].size = 0;
          }
      }
  }

  // Nothing to receive: do not allocate.  unpackReceivesToMe() frees
  // the arrays only when numReceives > 0, so zero-length arrays
  // created here would never be released.
  if(numReceives <= 0) return;

  m_receiveRequests = new MPI_Request[numReceives];
  m_receiveStatus = new MPI_Status[numReceives];

  unsigned int next=0;
  for(int i=0; i<numReceives; ++i)
    {
      const bufEntry& entry = m_toMe[next];
      MPI_Irecv(entry.bufPtr, entry.size, MPI_BYTE, entry.procID, 
                0, Chombo_MPI::comm, m_receiveRequests+i);
      ++next;
      // Skip the rest of this process's run.  The run is recognised by
      // procID rather than by size == 0, so a genuinely zero-byte
      // entry from another process is not skipped by mistake (which
      // could walk 'next' past the end of the queue).
      while(next < m_toMe.size() && m_toMe[next].procID == entry.procID) ++next;
    }
  
}


// MPI build: finish the current round.  Waits for the outstanding
// sends and receives, then lets every destination object read its
// region, components a_destComps, from its slice of the receive
// buffer via linearIn.  The receive request/status arrays are
// released and the receive count reset.
template<class T> inline
void LevelData<T>::unpackReceivesToMe(BoxLayoutData<T>& a_dest, 
				      const Interval&   a_destComps) const
{
  // If asynchronous communication worked on all machines this wait
  // would not have to happen here.
  completePendingSends();

  if(numReceives > 0)
    {
      const int waitResult = MPI_Waitall(numReceives, m_receiveRequests, m_receiveStatus);
      if(waitResult != MPI_SUCCESS)
        {
          // no recovery strategy exists for failed messaging here
        }

      for(unsigned int n=0; n<m_toMe.size(); ++n)
        {
          const bufEntry&   incoming = m_toMe[n];
          const MotionItem& motion   = *(incoming.item);
          a_dest[motion.toIndex].linearIn(incoming.bufPtr, motion.region, a_destComps);
        }

      delete[] m_receiveStatus;
      delete[] m_receiveRequests;
    }

  numReceives = 0;
}

#endif







