/*
	Generic Master Worker Application Template
*/

extern "C" {
#include "gmwat.h"
}
#include <boost/thread/thread.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/thread/condition.hpp>
#include <boost/tokenizer.hpp>
#include <queue>
#include <vector>
#include <map>
#include <set>
#include <string>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include "version.h"


#ifdef VERBOSE_DATA
#include <string.h>

/**
 * Dump a byte buffer to stdout as one line of decimal values, wrapped in
 * ANSI colour escapes.
 *
 * Output format: "(<tag>) [<size>] v0 v1 ... vN." where each value is the
 * byte printed with "%d" (so negative values appear when plain char is
 * signed).
 *
 * @param tag   label printed in parentheses before the values
 * @param p     first byte to print
 * @param size  number of bytes to print; when <= 0 only the header is printed
 */
void printData(char *tag, char *p, int size) {
    const int count = size > 0 ? size : 0;

    // Worst case per byte is " -128" (5 characters), not 4. The fixed slack
    // covers the colour escapes, the parentheses, the "[size]" field and the
    // terminating NUL.
    const size_t capacity = strlen(tag) + 64 + (size_t)count * 5 + 16;

    char *buf = (char *)malloc(capacity);
    if (!buf) return; // nothing sensible to print without a buffer

    char *sp = buf;
    sp += sprintf(sp, "\x1b[1;33;40m(%s) [%d]", tag, size);
    for (int i = 0; i < count; i++) {
        sp += sprintf(sp, " %d", p[i]);
    }
    sprintf(sp, ".\x1b[0m");

    puts(buf);
    free(buf);
}
#else
#define printData(t, x,y)
#endif

// TRACE1(x): expands to x when METRIC_TRACE is defined; otherwise the wrapped
// code is removed at preprocessing time.
#ifdef METRIC_TRACE
#define TRACE1(x) x
#else
#define TRACE1(x) 
#endif

// TRACE2(x): same mechanism as TRACE1, gated by THREADED_TRACE instead.
#ifdef THREADED_TRACE
#define TRACE2(x) x
#else
#define TRACE2(x) 
#endif


/*int printfc(const char *fmt, ...) {
    va_list ap;
    int n;

    va_start(ap,fmt);

    n = printf(fmt, ap);

    va_end(ap);
    return n;
}*/

// Console verbosity setting, 0 by default.
// NOTE(review): presumably holds/combines the VERBOSE_* values below - confirm
// against the code that reads it.
static int verbose=0;
#define VERBOSE_INFO        1
#define VERBOSE_PROGRESS    2
// Role of this process; defaults to 'W'.
// NOTE(review): presumably 'W' = worker, with another letter for the master -
// confirm where it is assigned.
static char role='W';
// Statistics selection, 0 by default. The STATISTIC_* values are distinct
// powers of two, so they can be OR-ed together.
// NOTE(review): assumed to be used as a bit mask - confirm.
static int statistic=0;
#define STATISTIC_WU_TIME        1
#define STATISTIC_ITERATION_TIME        2
#define STATISTIC_API_TIME 4
#define STATISTIC_PROGRESS_COMPLETE 8
#define STATISTIC_MASTER_COLLECTOR 16

//static char *localStatistics = NULL;

// Execution mode of the application; the global defaults to PARALLEL_EXECUTION.
enum _executionMode {
    PARALLEL_EXECUTION = 1,
    SERIAL_EXECUTION = 2
} executionMode = PARALLEL_EXECUTION;



#ifdef PARALLEL_CODE
    #include <mpi.h>
#endif

#include <math.h>
#include <signal.h>

// DEBUGPRN(x): expands to x only when _PRINT_DEBUG is defined, so debug
// printf calls wrapped in it vanish from normal builds.
#ifdef _PRINT_DEBUG
    #define DEBUGPRN(x) x
#else
    #define DEBUGPRN(x)
#endif

// _DEBUGPRN(x): always expands to x, regardless of _PRINT_DEBUG.
#define _DEBUGPRN(x) x

// Boolean constants for the int flags in this file. TRUE is -1, not 1, so
// flags must be tested for zero/non-zero rather than compared with 1.
#define TRUE -1
#define FALSE 0

// Base MPI tags. The channel handlers add the channel tag to these:
// TAG_INDEX+tag for the WorkIndex header message, TAG_PAYLOAD+tag for the
// data chunks that follow it.
#define TAG_INDEX   1000
#define TAG_PAYLOAD 2000

#ifdef WIN32
    #include <windows.h>
    #define getpid() GetCurrentProcessId()
#endif


/** Offset in seconds subtracted from every sampleTime() reading; 0 until setBaseTime() is called. */
double __baseTime=0.0;

/**
 * Read the realtime clock and return it relative to __baseTime.
 *
 * @return seconds (with nanosecond resolution from clock_gettime) since the
 *         base time; always 0.0 in the serial WIN32 build, which has no
 *         timer implementation.
 */
inline double sampleTime() {
#if defined(WIN32) && !defined(PARALLEL_CODE)
    // No clock source is implemented for this configuration.
    return 0.0;
#else
    // CLOCK_REALTIME is used for parallel and serial builds alike; the
    // MPI_Wtime() variant that once lived here is disabled.
    struct timespec tv;
    clock_gettime(CLOCK_REALTIME, &tv);
    return((tv.tv_sec+tv.tv_nsec/1000000000.0)-__baseTime);
#endif
}

/**
 * Make "now" the zero point for subsequent sampleTime() calls.
 *
 * sampleTime() already subtracts the current base, so the reading is added
 * back to the base instead of replacing it. This keeps the function correct
 * when it is called more than once.
 */
void setBaseTime() {
    __baseTime += sampleTime();
}


/** Guards every access to callStack. */
boost::mutex callStackMutex;
/** State tags pushed by stateEnter() and popped by stateExit(), oldest first. */
std::deque<std::string> callStack;

/**
 * Render the current state stack as one string.
 *
 * Tags are joined oldest-first with ':'. A separator is only written once
 * the result is non-empty, so leading empty tags add nothing.
 *
 * The result is built in a std::string, so any number and length of tags is
 * safe.
 *
 * @return the joined tags; empty when the stack is empty
 */
std::string getCallStack( void ) {
    boost::mutex::scoped_lock lock(callStackMutex);
    std::string joined;

    for (std::deque<std::string>::const_iterator it=callStack.begin(); it!=callStack.end(); ++it) {
        if (!joined.empty()) joined += ":";
        joined += *it;
    }
    return joined;
}

/**
 * Record entry into a named state by appending its tag to the shared stack.
 *
 * @param tag  name of the state being entered
 */
void stateEnter(std::string tag) {
    boost::mutex::scoped_lock guard(callStackMutex);
    callStack.insert(callStack.end(), tag);
}

/**
 * Record exit from the most recently entered state by removing the newest
 * tag from the shared stack.
 *
 * An unmatched call (empty stack) is ignored, because pop_back() on an empty
 * deque is undefined behaviour.
 */
void stateExit() {
    boost::mutex::scoped_lock lock(callStackMutex);
    if (!callStack.empty()) callStack.pop_back();
}





// One range entry: a pair of ints.
// NOTE(review): presumably (first, last) bounds of an index range - confirm
// against the code that fills it.
typedef std::pair<int, int> range_item_t;
// A list of range entries.
typedef std::vector<range_item_t> range_t;

/**
 * Identifier of a work unit.
 *
 * The channel handlers transmit this struct as raw bytes
 * (sizeof(struct WorkIndex), MPI_BYTE), so its layout - two ints, gi then
 * wn - must stay as it is. A gi of -1 marks a signal message, in which case
 * wn carries the CommSignal value.
 */
struct WorkIndex {
    WorkIndex(int g, int n) : gi(g), wn(n) {
    }
    /** Default index: gi 0, wn -1. */
    WorkIndex() : gi(0), wn(-1) {
    }
    int gi; // grain size
    int wn; // work number
};

/**
 * Strict weak ordering for WorkIndex: orders by gi first, then by wn.
 * Returns true if a<b.
 *
 * operator() is const so the functor can be used as the comparator of
 * standard associative containers, which invoke it through a const object.
 */
struct lessWorkIndex {
    inline bool operator()(const struct WorkIndex &a, const struct WorkIndex &b) const {
        if (a.gi != b.gi) return a.gi < b.gi;
        return a.wn < b.wn;
    }
};

/**
 * Timing record for one work unit: an iteration number plus accumulated
 * time / start-time pairs for execution, input and output, and a mark value.
 * A fresh record has iteration -1 and every time field at 0.
 */
struct gmwat_stat_item {
    int iteration;
    double execTime, execStart;
    double inputTime, inputStart;
    double outputTime, outputStart;
    double mark;

    gmwat_stat_item() {
        iteration = -1;
        execTime = execStart = 0;
        inputTime = inputStart = 0;
        outputTime = outputStart = 0;
        mark = 0;
    }
};

// Statistics table: one gmwat_stat_item per WorkIndex, ordered by lessWorkIndex.
typedef std::map<struct WorkIndex, struct gmwat_stat_item, lessWorkIndex> gmwat_stat_t;

/**
 * A work unit: its index plus a pointer to the payload.
 *
 * A new Work has no payload and index (-1, 0). The destructor asserts that
 * the payload pointer has been cleared, so whoever attached the data must
 * detach it before the Work is destroyed.
 */
struct Work {
    struct WorkData *data;
    struct WorkIndex index;

    Work() : data(NULL), index(-1,0) {}

    ~Work() {
        assert(data == NULL);
    }
};

/**
 * Peer address for MPI traffic: a rank inside a communicator.
 * Defaults to rank 0 of MPI_COMM_WORLD.
 */
struct MPIAddress {
    int rank;
    MPI_Comm comm;

    MPIAddress() : rank(0), comm(MPI_COMM_WORLD) {}

    MPIAddress(int r, MPI_Comm c) : rank(r), comm(c) {}
};


/**
 * Represents an output or input command: the work unit to transfer and the
 * peer address it goes to or comes from. The Work pointer is stored as
 * given; this struct neither copies nor deletes it.
 **/
struct MPIEvent {
    struct MPIAddress addr;
    Work *work;

    MPIEvent(struct MPIAddress a, Work *w)
        : addr(a),
          work(w) {}
};


/**
 * Minimal task interface: anything with a run() method that can be driven
 * by a thread.
 */
class Runnable
{
public:
    virtual ~Runnable() {}

    /** Body of the task; implemented by subclasses. */
    virtual void run() = 0;
};

struct ThreadWrapper {
    Runnable *_target;
    ThreadWrapper(Runnable *target): _target(target) {
    }
    void operator()() {
        _target->run();
    }
};

// Abstract hand-off point for Work objects.
// NOTE(review): no implementation is visible in this part of the file;
// presumably give() passes a work unit to the processor and take() retrieves
// one, with blockOnGet selecting whether take() may block - confirm against
// the implementing class.
class ProcessorInterface {
public:
    virtual void give(Work *work)  = 0;
    virtual Work *take() = 0;
    virtual Work *take(int blockOnGet) = 0;
    virtual ~ProcessorInterface() {}
};


/**
 * Control signals exchanged between processes.
 *
 * A signal travels as a Work whose index has gi == -1 and wn set to the
 * enumerator value, so the numeric values are part of the wire format and
 * must not be renumbered. (The "CSUnkonwn" spelling is kept because it is
 * the public enumerator name.)
 */
enum CommSignal {
    CSUnkonwn = 0,
    CSShouldFinish = 1,
    CSShouldWaitToSend = 2,
    CSShouldNotWaitToSend = 3,
    CSFinalizationStart = 4,
    CSIterationStart = 5,
    CSIterationEnd = 6
};

/**
 * Name of a signal, for log messages.
 *
 * @param cs  signal value
 * @return the enumerator's name, or "not named" for any other value. The
 *         pointer refers to a string literal: it must not be modified or
 *         freed. The return type stays `char *` for existing callers.
 */
char *getSignalName(enum CommSignal cs) {
    const char *name = "not named";

    switch(cs) {
        case CSUnkonwn:             name = "CSUnkonwn"; break;
        case CSShouldFinish:        name = "CSShouldFinish"; break;
        case CSShouldWaitToSend:    name = "CSShouldWaitToSend"; break;
        case CSShouldNotWaitToSend: name = "CSShouldNotWaitToSend"; break;
        case CSFinalizationStart:   name = "CSFinalizationStart"; break;
        case CSIterationStart:      name = "CSIterationStart"; break;
        case CSIterationEnd:        name = "CSIterationEnd"; break;
    }

    // The implicit literal -> char* conversion is ill-formed since C++11, so
    // the const is dropped explicitly here.
    return const_cast<char *>(name);
}
class ControllerInterface;

// Interface of the component that moves Work objects and signals over MPI.
// Comments below describe what the threaded MPIHandler in this file does;
// other implementations may differ.
class MPIHandlerInterface {
public:
    virtual ~MPIHandlerInterface() {
    }
    // Queue a work unit for sending to `rank` on `comm`. `channel` is a
    // channel label (0 = default channel). Note the default argument on a
    // virtual function: the value used depends on the static type of the
    // object expression at the call site.
    virtual void give(Work *work, int rank, MPI_Comm comm, int channel=0) = 0;
    // Send a control signal to `rank` on `comm`.
    virtual void giveSignal(enum CommSignal sig, int rank, MPI_Comm comm, int channel=0) = 0;
    // Ask the handler to stop.
    virtual void shouldFinish() = 0;
    virtual void setWaitDataOnSend(int val) = 0;
    // Number of queued outgoing work units not yet reported as sent.
    virtual int getOutQueueLength() = 0;
    virtual int getWaitDataOnSend() = 0;
    // Controller that receives the input/output notifications.
    virtual void setController(ControllerInterface *controller) = 0;
    virtual void setBlockOnComm() = 0;
    // Configure the extra channels; `channels` lists their labels.
    virtual void configChannels(std::vector<int> channels) = 0;
};

// Interface of the application-side controller that the MPI handler calls
// back into. The "called ..." notes below describe the threaded MPIHandler
// in this file.
class ControllerInterface : virtual public Runnable {
protected:
    // Handler used for communication; NULL until setMpiHandler() is called.
    MPIHandlerInterface *_mpiHandler;
public:
    ControllerInterface() : _mpiHandler(NULL) {
    }
    virtual ~ControllerInterface() {
    }
    // Store the handler pointer; ownership is not taken.
    void setMpiHandler(MPIHandlerInterface *mpiHandler) {
        _mpiHandler = mpiHandler;
    }
    // Called when the header of an incoming (non-signal) work unit has been
    // received, right after getWork().
    virtual void notifyInput(WorkIndex &workIndex, int rank, MPI_Comm comm) = 0;
    // Called after the header of an outgoing work unit has been sent.
    virtual void notifyOutput(WorkIndex &workIndex, int rank, MPI_Comm comm) = 0;
    // Called with a fully received (non-signal) work unit.
    virtual void give(Work *work, int rank, MPI_Comm comm) = 0;
    // Called with a fully sent (non-signal) work unit.
    virtual void report(Work *work, int rank, MPI_Comm comm) = 0;
    // Called for signal messages (index.gi == -1), both after one has been
    // received and after one has been sent.
    virtual void reportSignal(enum CommSignal sig, int rank, MPI_Comm comm) = 0;
    // Called after each completed send.
    virtual void processSend() = 0;
    virtual const std::string getName() = 0;
    // Called to obtain the Work that the payload of incoming unit (gi, wn)
    // is received into.
    virtual Work *getWork(int gi, int wn, int rank, MPI_Comm comm) = 0;
    virtual void setChannelMap(std::map<int,int> channels) = 0;
    virtual double getEndTime() = 0;
};


#ifdef PARALLEL_CODE

#ifdef SYNCHRONOUS_THREADED

// Callback interface the per-channel handler threads use to report progress
// (the name is spelled "Lister" in the code).
class ChannelLister {
public:
    virtual ~ChannelLister() {}
    // Called by ChannelInputHandler once a WorkIndex header has arrived from
    // `rank`. The returned Work is dereferenced unconditionally, so it must
    // not be NULL; if its data is non-NULL the payload chunks are received
    // into data->chunks[i].
    virtual Work *inputStart(WorkIndex &workIndex, int rank, MPI_Comm comm) = 0;
    // Called by ChannelOutputHandler after the header has been sent and
    // before the payload chunks are sent.
    virtual void outputStart(WorkIndex &workIndex, int rank, MPI_Comm comm) = 0;
    // Called after all payload chunks of `work` have been received.
    virtual void inputFinish(Work *work, int rank, MPI_Comm comm) = 0;
    // Called after all payload chunks of `work` have been sent.
    virtual void outputFinish(Work *work, int rank, MPI_Comm comm) = 0;
    // Mutex intended to serialise MPI calls; the channel handlers reference
    // it only in disabled (commented-out) code.
    virtual boost::mutex *getSerialMutex() = 0;
};

class ChannelInputHandler : virtual public  Runnable {
private:
    ChannelLister *_listener;
    int _channelTag, _rank;
    MPI_Comm _comm;
    int _shouldFinish;
    MPI_Request _request;
    ThreadWrapper *_threadWrapper;
    boost::thread *_thread;

public:
    ChannelInputHandler(ChannelLister *listener, int channelTag=MPI_ANY_TAG, int rank=MPI_ANY_SOURCE, MPI_Comm comm=MPI_COMM_WORLD) : 
        _listener(listener), _channelTag(channelTag), _rank(rank), _comm(comm), /*_onRead(FALSE),*/ _shouldFinish(FALSE), _thread(NULL) {
        DEBUGPRN(printf("(%d) ChannelInputHandler[%d] created\n", getpid(), _channelTag);)
        }

    ~ChannelInputHandler() {
        if(_thread) {
            _thread->join();
            delete _thread;
            delete _threadWrapper;
        }
    }

    inline int poolWait(MPI_Request *request, MPI_Status *status) {
        int error;
        /*
	int flag;
	do {
            {
                boost::mutex::scoped_lock mpilock(*_listener->getSerialMutex());
                error = MPI_Test(request, &flag, status);
            }
            //error = MPI_Test(request, &flag, status);
            assert(error==MPI_SUCCESS);
            pthread_yield();
        } while(!_shouldFinish && !flag);*/

        error = MPI_Wait(request, status);
        assert(error==MPI_SUCCESS);
        return error;
    }

    virtual void run() {
        DEBUGPRN(printf("(%d) ChannelInputHandler[%d].run() enter\n", getpid(), _channelTag);)

        struct WorkIndex theInWorkIndex; //phase 1
        MPI_Status status;
        int currentTag;
        int currentRank;
        Work *theInWork;
        int cancelled, error;
        MPI_Request localRequest;

        while(!_shouldFinish) {
            {
                currentTag = _channelTag;

                //{
                //    boost::mutex::scoped_lock mpilock(*_listener->getSerialMutex());
                    error = MPI_Irecv(&theInWorkIndex, sizeof(struct WorkIndex), MPI_BYTE, MPI_ANY_SOURCE, TAG_INDEX+currentTag, _comm, &_request);
                //}
                assert(error==MPI_SUCCESS);
            }

            error = poolWait(&_request, &status);
            assert(error==MPI_SUCCESS);

            error = MPI_Test_cancelled(&status, &cancelled);
            assert(error==MPI_SUCCESS);

            if(cancelled || _shouldFinish) {
                DEBUGPRN(printf("(%d) ChannelInputHandler[%d].run(){ MPI_Wait cancelled\n", getpid(), _channelTag);)
                continue;
            }
            currentRank = status.MPI_SOURCE;
            DEBUGPRN(printf("(%d) ChannelInputHandler[%d].run(){ (%d,%d) header recv rank:%d\n", getpid(), _channelTag, theInWorkIndex.gi, theInWorkIndex.wn, currentRank);)
            
            // get the work from listener
            {
                theInWork = _listener->inputStart(theInWorkIndex, currentRank, _comm);
                if(theInWork->data) for(int i=0; i<theInWork->data->count; i++) {
                    DEBUGPRN(printf("(%d) ChannelInputHandler[%d].run(){ (%d,%d) chunk %d pre-recv (%d bytes)\n", getpid(), _channelTag, theInWorkIndex.gi, theInWorkIndex.wn, i, theInWork->data->chunks[i].size);)

                    //{
                    //    boost::mutex::scoped_lock mpilock(*_listener->getSerialMutex());
                        error = MPI_Irecv(theInWork->data->chunks[i].data, theInWork->data->chunks[i].size, MPI_BYTE, currentRank, TAG_PAYLOAD+currentTag, _comm, &localRequest);
                    //}
                    assert(error==MPI_SUCCESS);

                    error = poolWait(&localRequest, &status);
                    assert(error==MPI_SUCCESS);

                    DEBUGPRN(printf("(%d) ChannelInputHandler[%d].run(){ (%d,%d) chunk %d recv\n", getpid(), _channelTag, theInWorkIndex.gi, theInWorkIndex.wn, i);)
                }

                DEBUGPRN(printf("(%d) ChannelInputHandler[%d].run(){ WN (%d,%d) recv complete\n", getpid(), _channelTag, theInWorkIndex.gi, theInWorkIndex.wn);)

                _listener->inputFinish(theInWork, currentRank, _comm);               
            }
        }
        DEBUGPRN(printf("(%d) ChannelInputHandler[%d].run() exit\n", getpid(), _channelTag);)
    }

    virtual void shouldFinish() {
        DEBUGPRN(printf("(%d) ChannelInputHandler[%d].shouldFinish() enter\n", getpid(), _channelTag);)

        _shouldFinish = TRUE;
        if(_request!=MPI_REQUEST_NULL) { 
            MPI_Cancel(&_request);
            DEBUGPRN(printf("(%d) ChannelInputHandler[%d].shouldFinish(){ MPI_Cancel executed\n", getpid(), _channelTag);)
        }

        DEBUGPRN(printf("(%d) ChannelInputHandler[%d].shouldFinish() exit\n", getpid(), _channelTag);)
    }

    virtual void configChannel(int rank=MPI_ANY_SOURCE, MPI_Comm comm=MPI_COMM_WORLD) {
        _rank = rank;
        _comm = comm;
        /*{
            boost::mutex::scoped_lock lock(syncMutex);
            if(_onRead) { MPI_Cancel(&_request); }
        }*/
        if(_request!=MPI_REQUEST_NULL) { MPI_Cancel(&_request); }
    }
    boost::thread *start() {
        DEBUGPRN(printf("(%d) ChannelInputHandler[%d].start() enter\n", getpid(), _channelTag);)

        assert(_thread==NULL);
         _threadWrapper = new ThreadWrapper(this);
         _thread = new boost::thread(*_threadWrapper);
         
         DEBUGPRN(printf("(%d) ChannelInputHandler[%d].start() exit\n", getpid(), _channelTag);)

         return _thread;
    }
    boost::thread *getThread() { return _thread; }
};

class ChannelOutputHandler : virtual public  Runnable {
private:
    ChannelLister *_listener;
    // an output queue
    std::queue<MPIEvent *> _outEvents;
    int _channelTag, _rank;
    MPI_Comm _comm;
    int _shouldFinish;

    boost::condition outEventsEmpty;
    boost::mutex syncMutex;
    ThreadWrapper *_threadWrapper;
    boost::thread *_thread;
public:
    ChannelOutputHandler(ChannelLister *listener, int channelTag, int rank=MPI_ANY_SOURCE, MPI_Comm comm=MPI_COMM_WORLD) : 
        _listener(listener), _channelTag(channelTag), _rank(rank), _comm(comm), _shouldFinish(FALSE), _thread(NULL) {
        DEBUGPRN(printf("(%d) ChannelOutputHandler[%d] created\n", getpid(), _channelTag);)
        }

    ~ChannelOutputHandler() {
        if(_thread) {
            //delete _thread; // is destroyed by group
            delete _threadWrapper;
        }
    }

    inline int poolWait(MPI_Request *request, MPI_Status *status) {
        int error;
        /*
	int flag;
	do {
            pthread_yield();
            {
                boost::mutex::scoped_lock mpilock(*_listener->getSerialMutex());
                error = MPI_Test(request, &flag, status);
            }            
        } while(!_shouldFinish && !flag);*/
        error = MPI_Wait(request, status);
        assert(error==MPI_SUCCESS);
        return error;
    }

    virtual void run() {
        DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].run() enter\n", getpid(), _channelTag);)

        MPIEvent *outEvent;
        MPI_Status status;
        MPI_Request localRequest;
        int error;

        while(!_shouldFinish || _outEvents.size()>0) {
            //get an event
            outEvent = NULL;
            DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].run(){ pre-critical secction (_shouldFinish:%d, _outEvents.sze():%d)\n", getpid(), _channelTag, _shouldFinish, _outEvents.size());)
            {
                boost::mutex::scoped_lock lock(syncMutex);
                while(!_shouldFinish && _outEvents.size()==0) {
                    DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].run(){ wait (_shouldFinish:%d, _outEvents.sze():%d)\n", getpid(), _channelTag, _shouldFinish, _outEvents.size());)
                    outEventsEmpty.wait(lock);
                    DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].run(){ wakeup (_shouldFinish:%d, _outEvents.sze():%d)\n", getpid(), _channelTag, _shouldFinish, _outEvents.size());)
                }
                if(_outEvents.size()>0) {
                    outEvent = _outEvents.front();
                    _outEvents.pop();
                } else {
                    // if(_shouldFinish)
                    break;
                }
            }
            assert(outEvent);

            DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].run(){ header (%d,%d) pre-send\n", getpid(), _channelTag, outEvent->work->index.gi, outEvent->work->index.wn);)
            //{
            //    boost::mutex::scoped_lock mpilock(*_listener->getSerialMutex());
                error = MPI_Issend(&(outEvent->work->index), sizeof(struct WorkIndex), MPI_BYTE, outEvent->addr.rank, TAG_INDEX+_channelTag, outEvent->addr.comm, &localRequest);
            //}
            assert(error==MPI_SUCCESS);

            error = poolWait(&localRequest, &status);
            assert(error==MPI_SUCCESS);

            DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].run(){ header (%d,%d) send\n", getpid(), _channelTag, outEvent->work->index.gi, outEvent->work->index.wn);)

            _listener->outputStart(outEvent->work->index, outEvent->addr.rank, outEvent->addr.comm);

            if(outEvent->work->data) {
                //assert(outEvent->work->data->count < 5);

                for(int i=0; i<outEvent->work->data->count; i++) {
                    DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].run(){ (%d,%d) chunk %d pre-send (%d bytes)\n", getpid(), _channelTag, outEvent->work->index.gi, outEvent->work->index.wn, i, outEvent->work->data->chunks[i].size);)

                    //{
                    //    boost::mutex::scoped_lock mpilock(*_listener->getSerialMutex());
                        error = MPI_Isend(outEvent->work->data->chunks[i].data, outEvent->work->data->chunks[i].size, MPI_BYTE, outEvent->addr.rank, TAG_PAYLOAD+_channelTag, outEvent->addr.comm, &localRequest);
                    //}
                    assert(error==MPI_SUCCESS);

                    error = poolWait(&localRequest, &status);
                    assert(error==MPI_SUCCESS);

                    DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].run(){ (%d,%d) chunk %d send\n", getpid(), _channelTag, outEvent->work->index.gi, outEvent->work->index.wn, i);)
                }
            }

            DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].run(){ WU (%d,%d) send complete\n", getpid(), _channelTag, outEvent->work->index.gi, outEvent->work->index.wn);)

            _listener->outputFinish(outEvent->work, outEvent->addr.rank, outEvent->addr.comm);

            delete outEvent;
        }
        DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].run() exit\n", getpid(), _channelTag);)
    }

    virtual void shouldFinish() {
        DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].shouldFinish() enter\n", getpid(), _channelTag);)

        _shouldFinish = TRUE;

        {
                boost::mutex::scoped_lock lock(syncMutex);
                outEventsEmpty.notify_all();
        }

        DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].shouldFinish() exit\n", getpid(), _channelTag);)
    }

    virtual void configChannel(int rank=MPI_ANY_SOURCE, MPI_Comm comm=MPI_COMM_WORLD) {
        _rank = rank;
        _comm = comm;
    }

    void output(Work *work, int rank, MPI_Comm comm) {
        DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].output((%d,%d), %d, %d) enter\n", getpid(), _channelTag, work->index.gi, work->index.wn, rank, (int)comm);)

        boost::mutex::scoped_lock lock(syncMutex);

        MPIEvent *aEvent = new MPIEvent(MPIAddress(rank,comm), work);
        _outEvents.push(aEvent);

        outEventsEmpty.notify_all();

        DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].output() exit\n", getpid(), _channelTag);)
    }
    boost::thread *start() {
        DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].start() enter\n", getpid(), _channelTag);)

        assert(_thread==NULL);
         _threadWrapper = new ThreadWrapper(this);
         _thread = new boost::thread(*_threadWrapper);

         DEBUGPRN(printf("(%d) ChannelOutputHandler[%d].start() exit\n", getpid(), _channelTag);)

         return _thread;
    }
    void join() {
        _thread->join();
    }
    boost::thread *getThread() { return _thread; }
};


/**
 * Threaded MPI handler (SYNCHRONOUS_THREADED build).
 *
 * Owns one ChannelInputHandler / ChannelOutputHandler pair per channel.
 * Position 0 is the default channel; configChannels() adds positions
 * 1..n for the caller's channel labels. As ChannelLister it turns the
 * handlers' progress callbacks into ControllerInterface calls.
 *
 * run() starts every handler thread, blocks until shouldFinish() has been
 * called, then joins the threads.
 *
 * NOTE(review): the channel handlers are never deleted (destructor is a
 * "todo"). Their threads are also owned by a boost::thread_group, so
 * deleting ChannelInputHandler objects (which delete their own thread)
 * needs the ownership sorted out first.
 */
class MPIHandler : virtual public  Runnable, public ChannelLister, virtual public MPIHandlerInterface {
private:
    std::map<int, int> _channelMap;   // channel label -> position in the handler vectors
    std::vector<int> _channelTags;    // labels by position; [0] is always 0

    ControllerInterface *_controller;
    int _waitDataToSend; // not really used
    MPI_Comm _comm;
    std::vector<ChannelInputHandler *> _inputChannels;
    std::vector<ChannelOutputHandler *> _outputChannels;
    int _queueLen;                    // work units given but not yet reported sent

    boost::mutex syncMutex;           // guards _queueLen and _finishRequested
    boost::condition finishCondition;

    boost::thread_group _threads;

    int _started;
    int _finishRequested;             // set by shouldFinish(); predicate for finishCondition

    boost::mutex serialMPIMutex;

public:
    /** Creates the default channel (position 0, tag 0). */
    MPIHandler() : _controller(NULL), _waitDataToSend(TRUE), _comm(MPI_COMM_WORLD), _queueLen(0), _started(FALSE), _finishRequested(FALSE) {
        _inputChannels.push_back(new ChannelInputHandler(this, 0));
        _outputChannels.push_back(new ChannelOutputHandler(this, 0));
        _channelTags.insert(_channelTags.begin(), 0);
    }
    virtual ~MPIHandler() {
        // todo
    }

    inline virtual boost::mutex *getSerialMutex() {
        return &serialMPIMutex;
    }

    /**
     * Queue `work` for sending to `rank` on `comm`.
     *
     * @param channel  channel label; 0 or an unknown label selects the
     *                 default channel
     */
    virtual void give(Work *work, int rank, MPI_Comm comm, int channel=0) {
        // label to position; looked up with find() so an unknown label does
        // not insert into the map
        int position = 0;
        if(channel) {
            std::map<int, int>::const_iterator entry = _channelMap.find(channel);
            if(entry != _channelMap.end()) position = entry->second;
        }

        {
            boost::mutex::scoped_lock lock(syncMutex);
            _queueLen++;
        }

        _outputChannels[position]->output(work, rank, comm);
    }

    /**
     * Send a control signal: a data-less Work with index (-1, sig). The Work
     * is deleted in outputFinish() once sent.
     *
     * NOTE(review): `channel` is accepted but not forwarded, so signals
     * always travel on the default channel - confirm this is intended.
     */
    virtual void giveSignal(enum CommSignal sig, int rank, MPI_Comm comm, int channel=0) {
        Work *work = new Work();
        work->index.gi = -1;
        work->index.wn = (int)sig;
        give(work, rank, comm);
    }

    /** Stop every channel handler and release run(). */
    virtual void shouldFinish() {
        DEBUGPRN(printf("(%d) MPIHandler.shouldFinish() enter\n", getpid());)

        for(int i=0; i<(int)_inputChannels.size(); i++) _inputChannels[i]->shouldFinish();
        for(int i=0; i<(int)_outputChannels.size(); i++) _outputChannels[i]->shouldFinish();

        {
            // Record the request under the lock so run() sees it even when
            // it has not started waiting yet.
            boost::mutex::scoped_lock lock(syncMutex);
            _finishRequested = TRUE;
            finishCondition.notify_all();
        }

        DEBUGPRN(printf("(%d) MPIHandler.shouldFinish() exit\n", getpid());)
    }
    virtual void setWaitDataOnSend(int val) { _waitDataToSend=val; }
    virtual int getWaitDataOnSend() { return _waitDataToSend; }

    /** @return number of work units given but not yet reported as sent. */
    virtual int getOutQueueLength() {
        return _queueLen;
    }
    
    virtual void setController(ControllerInterface *controller) { _controller = controller; }

    virtual void setBlockOnComm() {} //not used

    /**
     * (Re)configure the extra channels.
     *
     * @param channels  labels of the extra channels; label channels[i] is
     *                  served by position i+1. Handlers are created or shut
     *                  down so that exactly channels.size()+1 pairs exist.
     */
    virtual void configChannels(std::vector<int> channels) {
        DEBUGPRN(printf("(%d) MPIHandler.configChannels(%d) enter\n", getpid(), (int)channels.size());)

        // 0 is the default channel
        
        if(channels.size()+1>_inputChannels.size()) {
            // grow: create the missing handler pairs
            for(int i=_channelTags.size(); i<(int)channels.size()+1; i++) {
                ChannelInputHandler *cih = new ChannelInputHandler(this, i);
                ChannelOutputHandler *coh = new ChannelOutputHandler(this, i);

                _inputChannels.push_back(cih);
                _outputChannels.push_back(coh);

                if(_started) {
                    _threads.add_thread(cih->start());
                    _threads.add_thread(coh->start());
                }
            }
        } else if(channels.size()+1<_inputChannels.size()) { 
            // shrink: move the surplus threads into a temporary group, stop
            // them and wait for both the input and the output thread
            boost::thread_group opthreads;
            for(int i=channels.size()+1; i<(int)_channelTags.size(); i++) {
                _threads.remove_thread(_inputChannels[i]->getThread());
                _threads.remove_thread(_outputChannels[i]->getThread());

                opthreads.add_thread(_inputChannels[i]->getThread());
                opthreads.add_thread(_outputChannels[i]->getThread());

                _inputChannels[i]->shouldFinish();
                _outputChannels[i]->shouldFinish();
            }

            opthreads.join_all();

            _inputChannels.resize(channels.size()+1);
            _outputChannels.resize(channels.size()+1);
        }

        // Rebuild the label map from scratch so labels from an earlier
        // configuration cannot point at positions that no longer exist.
        _channelMap.clear();
        for(int i=0; i<(int)channels.size(); i++) {
            _channelMap[channels[i]]=i+1;
        }
        _channelTags = channels;
        _channelTags.insert(_channelTags.begin(), 0);

        DEBUGPRN(printf("(%d) MPIHandler.configChannels() exit\n", getpid());)
    }

    /** Start all channel threads, wait for shouldFinish(), then join them. */
    virtual void run() {
        DEBUGPRN(printf("(%d) MPIHandler.run() enter\n", getpid());)

        for (int i=0; i<(int)_inputChannels.size(); i++) {
            _threads.add_thread(_inputChannels[i]->start());
            _threads.add_thread(_outputChannels[i]->start());
        }
        _started = TRUE;
        {
            // Wait on a predicate: tolerates spurious wake-ups and a
            // shouldFinish() that arrived before this point.
            boost::mutex::scoped_lock lock(syncMutex);
            while(!_finishRequested) finishCondition.wait(lock);
        }

        _threads.join_all();

        DEBUGPRN(printf("(%d) MPIHandler.run() exit\n", getpid());)
    }

    /**
     * Header received: provide the Work to receive into. Signals (gi == -1)
     * get a fresh data-less Work; anything else is requested from the
     * controller, which is then notified of the input.
     */
    virtual Work *inputStart(WorkIndex &workIndex, int rank, MPI_Comm comm) { 
        DEBUGPRN(printf("(%d) MPIHandler.inputStart((gi:%d,wn:%d), rank:%d, comm:%d) enter\n", getpid(), workIndex.gi, workIndex.wn, rank, (int)comm);)

        Work *newWork;

        if(workIndex.gi==-1) { 
            newWork = new Work();
            newWork->index = workIndex;
        } else {
            newWork = _controller->getWork(workIndex.gi, workIndex.wn, rank, comm);
            _controller->notifyInput(workIndex,rank,comm);
        }

        DEBUGPRN(printf("(%d) MPIHandler.inputStart() exit\n", getpid());)

        return newWork;
    }

    /** Header sent: notify the controller. */
    virtual void outputStart(WorkIndex &workIndex, int rank, MPI_Comm comm) { 
        DEBUGPRN(printf("(%d) MPIHandler.outputStart((gi:%d,wn:%d), rank:%d, comm:%d) enter\n", getpid(), workIndex.gi, workIndex.wn, rank, (int)comm);)

        _controller->notifyOutput(workIndex,rank,comm);

        DEBUGPRN(printf("(%d) MPIHandler.outputStart() exit\n", getpid());)
    }

    /**
     * Receive complete: signals are reported to the controller and their
     * Work deleted; real work units are given to the controller.
     */
    virtual void inputFinish(Work *work, int rank, MPI_Comm comm) { 
        DEBUGPRN(printf("(%d) MPIHandler.inputFinish((gi:%d,wn:%d), rank:%d, comm:%d) enter\n", getpid(), work->index.gi, work->index.wn, rank, (int)comm);)

        if(work->index.gi == -1) {
            //signals
            _controller->reportSignal((CommSignal)work->index.wn, rank, comm);
            delete work;
        } else {
            _controller->give(work, rank, comm);
        }

        DEBUGPRN(printf("(%d) MPIHandler.inputFinish() exit\n", getpid());)
    }

    /**
     * Send complete: signals are reported to the controller and their Work
     * deleted; real work units are reported as sent. The pending-send
     * counter is decremented and the controller's processSend() is called.
     */
    virtual void outputFinish(Work *work, int rank, MPI_Comm comm)  { 
        DEBUGPRN(printf("(%d) MPIHandler.outputFinish((gi:%d,wn:%d), rank:%d, comm:%d) enter\n", getpid(), work->index.gi, work->index.wn, rank, (int)comm);)

        if(work->index.gi == -1) {
            //signals
            _controller->reportSignal((CommSignal)work->index.wn, rank, comm);
            delete work;
        } else {
            _controller->report(work, rank, comm);
        }

        {
            boost::mutex::scoped_lock lock(syncMutex);
            _queueLen--;
        }

        _controller->processSend();

        DEBUGPRN(printf("(%d) MPIHandler.outputFinish() exit\n", getpid());)
    }
};
#else
class MPIHandler : virtual public  Runnable, virtual public MPIHandlerInterface {
private:
    
    boost::mutex outEventsMutex;
    boost::condition outEventsEmpty;
    
    boost::mutex inEventsMutex;

    ControllerInterface *_controller;
    int _waitDataToSend;

    struct channel {
        std::queue<MPIEvent *> outEvents;
        std::queue<MPIEvent *> inEvents;

        MPIEvent *theOutEvent;
        MPIEvent *theInEvent; // phase 2
        struct WorkIndex theInWorkIndex; //phase 1
        int ichunk, ochunk;
    };

    std::vector<struct channel> _channels;

    MPI_Comm _comm; // default receive communicator
    int _shouldFinish, _commPending;
    int _outputPending;

    int _canBlockOnComm;

    boost::mutex serialMPIMutex;

    int _channelNumber;

    std::map<int, int> _channelMap;
    std::vector<int> channelTags;

public:
    MPIHandler(int channelNumber=1) : _controller(NULL), _waitDataToSend(TRUE), _comm(MPI_COMM_WORLD), _shouldFinish(FALSE), _commPending(0), _canBlockOnComm(0), _channelNumber(channelNumber) {
        _channels.resize(_channelNumber);
    }

    /**
     * Destructor. Releases nothing explicitly: MPIEvent objects still referenced by the
     * channel queues are not deleted here.
     */
    virtual ~MPIHandler() {}

    /**
     * Configures the extra channels of the handler.
     *
     * Position 0 is always the default channel (tag offset 0). Each entry of `channels`
     * is a channel label: entry i is stored at position i+1 and its value is also used
     * as the tag offset of that channel.
     *
     * @param channels labels of the additional channels
     */
    virtual void configChannels(std::vector<int> channels) {
        const int extraChannels = (int)channels.size();

        _channelNumber = extraChannels+1; // 0 is the default channel
        _channels.resize(_channelNumber);

        // Forget labels from an earlier configuration: a stale label could map to a
        // position beyond the resized _channels, which give()/giveSignal() would index.
        _channelMap.clear();
        for(int i=0; i<extraChannels; i++) {
            _channelMap[channels[i]]=i+1;
        }

        channelTags = channels;
        channelTags.insert(channelTags.begin(), 0);
    }
    /**
     * Attaches the controller that receives the handler's notifications.
     * Also switches _waitDataToSend off.
     *
     * @param controller controller to notify; run() asserts that one has been set
     */
    virtual void setController(ControllerInterface *controller) {
        _waitDataToSend = FALSE;
        _controller = controller;
    }

    /**
     * Queues a work for sending to `rank` on `comm`.
     *
     * @param work    work to send; must not be NULL
     * @param rank    destination rank
     * @param comm    destination communicator
     * @param channel channel label; 0 selects the default channel, any other label is
     *                translated to a position through _channelMap
     */
    virtual void give(Work *work, int rank, MPI_Comm comm, int channel=0) {
        assert(work);
        // label to position
        const int slot = channel ? _channelMap[channel] : 0;

        boost::mutex::scoped_lock guard(outEventsMutex);
        _channels[slot].outEvents.push(new MPIEvent(MPIAddress(rank,comm), work));
        outEventsEmpty.notify_all();
        // one pending operation for the index message plus one per payload chunk
        _commPending += 1 + work->data->size;
        DEBUGPRN(printf("[%s] MPI Handler work %d accepted for sent (rank: %d, comm: %d)\n", _controller->getName().c_str(), work->index.wn, rank, (int)comm);)
    }

    virtual void giveSignal(enum CommSignal sig, int rank, MPI_Comm comm, int channel=0) {
        if(channel) {
            channel = _channelMap[channel]; // label to position
        }
        boost::mutex::scoped_lock lock(outEventsMutex);
        Work *work = new Work();
        work->index.gi = -1;
        work->index.wn = (int)sig;
        MPIEvent *aEvent = new MPIEvent(MPIAddress(rank,comm), work);
        _channels[channel].outEvents.push(aEvent);
        outEventsEmpty.notify_all();
        _commPending++;
        DEBUGPRN(printf("[%s] MPI Handler sig %d accepted for sent (rank: %d, comm: %d)\n", _controller->getName().c_str(), work->index.wn, rank, (int)comm);)
    }

    /**
     * Sets _canBlockOnComm. run() only reads the flag when built with MPI_POOLING and
     * MPI_MIXED, where it selects a blocking MPI_Waitany; run() clears it again whenever
     * it delivers an event to the controller.
     */
    void setBlockOnComm() {
        _canBlockOnComm = 1;
    }

    void run() {
        int csel=0;

        assert(_controller);
        enum RequestMean {
            ReqNotSet=-1,
            ReqUndefined = MPI_UNDEFINED,
            ReqInputPhase1=0, ReqInputPhase2=1, ReqOutputPhase1=2, ReqOutputPhase2=3
        };
        MPI_Request requests[4*_channelNumber]; // = { MPI_REQUEST_NULL, MPI_REQUEST_NULL, MPI_REQUEST_NULL, MPI_REQUEST_NULL};
        int reqCheck[4*_channelNumber]; // = { FALSE, FALSE, FALSE, FALSE};

        for(int i=0; i<4*_channelNumber; i++) {
            requests[i]=MPI_REQUEST_NULL;
            reqCheck[i]=FALSE;
        }

        DEBUGPRN(printf("[%s] MPIHadler Running (%d channels)\n", _controller->getName().c_str(), _channelNumber);)

        _outputPending = FALSE;

        for(csel=0; csel<_channelNumber; csel++) {
            MPI_Irecv(&(_channels[csel].theInWorkIndex), sizeof(struct WorkIndex), MPI_BYTE, MPI_ANY_SOURCE, TAG_INDEX+channelTags[csel], _comm, &(requests[csel*4+ReqInputPhase1]));       
            reqCheck[csel*4+ReqInputPhase1] = TRUE;
        }
        csel=0;

        int li=0;
        do {
    #ifndef MPI_POOLING
            _controller->processSend();
    #endif

            //output check
            for(csel=0; csel<_channelNumber; csel++) {            
                if (requests[csel*4+ReqOutputPhase1] == MPI_REQUEST_NULL && !_outputPending && !_shouldFinish) {
                    {
                        boost::mutex::scoped_lock lock(outEventsMutex);
        #ifndef MPI_POOLING
                        while (outEvents.empty() && _waitDataToSend && 
                               (requests[csel*4+ReqOutputPhase2] == MPI_REQUEST_NULL &&
                                requests[csel*4+ReqInputPhase2] == MPI_REQUEST_NULL) && inEvents.empty()) {
                            DEBUGPRN(printf("[%s] MPI Handler out queue empty, wait enter.\n", _controller->getName().c_str());)
                            outEventsEmpty.wait(lock);
                            DEBUGPRN(printf("[%s] MPI Handler out queue empty, wait release.\n", _controller->getName().c_str());)
                        }
        #endif
                        if (!_channels[csel].outEvents.empty()) {
                            MPIEvent *outEvent = _channels[csel].outEvents.front();
                            if (requests[csel*4+ReqOutputPhase2] == MPI_REQUEST_NULL) {
                                //takes only if payload has been sent
                                _channels[csel].outEvents.pop();
                                _channels[csel].theOutEvent = outEvent;                            
                            } else {
                                _outputPending = TRUE;
                            }
                            DEBUGPRN(printf("[%s] MPI Handler out queue (_outputPending=%d).\n", _controller->getName().c_str(),_outputPending);)
    
                            DEBUGPRN(printf("[%s] MPI Handler send %s %d to rank %d comm %d.\n", _controller->getName().c_str(), 
                                            (outEvent->work->index.gi<0)?"signal":"work index",
                                            outEvent->work->index.wn, outEvent->addr.rank, (int)outEvent->addr.comm);)
    
                            { // SERIALIZE MPI CALLS
                                boost::mutex::scoped_lock mpilock(serialMPIMutex);
                                //assert(outEvent->addr.comm);
                                MPI_Isend(&(outEvent->work->index), sizeof(struct WorkIndex), MPI_BYTE, 
                                          outEvent->addr.rank, TAG_INDEX+channelTags[csel], outEvent->addr.comm, &(requests[csel*4+ReqOutputPhase1]));
                                reqCheck[csel*4+ReqOutputPhase1] = TRUE;
                            }                        
                        }
        #ifndef MPI_POOLING
                        else {
                            DEBUGPRN(printf("[%s] MPI Handler set not to wait out queue (wait=%d).\n", _controller->getName().c_str(), _waitDataToSend);)
                        }
        #endif
                    }
                }
            }

            if(_shouldFinish) {
                int outPending=FALSE;
                for(csel=0; csel<_channelNumber; csel++) {
                    if ((requests[csel*4+ReqOutputPhase1] == MPI_REQUEST_NULL) && (requests[csel*4+ReqOutputPhase2] == MPI_REQUEST_NULL) && 
                        (requests[csel*4+ReqInputPhase2] == MPI_REQUEST_NULL) && (requests[csel*4+ReqInputPhase1] != MPI_REQUEST_NULL)) {
                        //cancel receive
                        boost::mutex::scoped_lock mpilock(serialMPIMutex);
                        MPI_Cancel(&requests[csel*4+ReqInputPhase1]);
                        reqCheck[csel*4+ReqInputPhase1] = FALSE;                        
                    } else {
                        outPending = TRUE;
                    }
                }
                if(!outPending) {
                    if (_commPending) {
                        DEBUGPRN(printf("[%s] MPIHandler commPending=%d INVALID\n", _controller->getName().c_str(), _commPending);)
                        break;
                    }
                    break; // break the big looop
                }
            }

            /*compending verification
            if (_commPending && _shouldFinish && (requests[csel*4+ReqOutputPhase1] == MPI_REQUEST_NULL) && (requests[csel*4+ReqOutputPhase2] == MPI_REQUEST_NULL) && 
                (requests[csel*4+ReqInputPhase2] == MPI_REQUEST_NULL) && (requests[csel*4+ReqInputPhase1] == MPI_REQUEST_NULL)) {
                DEBUGPRN(printf("[%s] MPIHandler commPending=%d INVALID\n", _controller->getName().c_str(), _commPending);)
                break;
            }*/


            RequestMean index=ReqNotSet;
            MPI_Status wstatus;

    #ifdef MPI_POOLING

   #ifdef MPI_MIXED
            if(_canBlockOnComm) {
                boost::mutex::scoped_lock mpilock(serialMPIMutex);
                MPI_Waitany(4, requests, (int*)&index, &wstatus);
                if(index!=MPI_UNDEFINED) {
                    reqCheck[(int)index] = FALSE;
                }
            } else 
   #endif
            {            
                int compflag=FALSE, once=FALSE, ei;
                for (int i=0; i<4*_channelNumber; i++) {
                    ei=(li+i)%(4*_channelNumber);
                    if (reqCheck[ei]) {
                        { // SERIALIZE MPI CALLS
                            boost::mutex::scoped_lock mpilock(serialMPIMutex);
                            MPI_Test(&requests[ei], &compflag, &wstatus);
                        }
                        boost::thread::yield();
                        if (compflag) {
                            index = (RequestMean)ei;
                            reqCheck[ei] = FALSE;
                            li=(ei+1)%(4*_channelNumber);
                            break;
                        } else {
                            once=TRUE;
                        }
                    }
                }
                /*int compflag=FALSE;
                MPI_Testany(4, requests, (int*)&index, &compflag, &wstatus);
                */
                if (!compflag) {
                    boost::thread::yield();
                    //usleep(1000);
                    //assert(index==ReqUndefined);
                    assert(once);
                    continue;
                }
            }

            
            { // index normalization
                csel = index / 4;
                index = (RequestMean)(index % 4);
            }
            

            DEBUGPRN(printf("[%s] MPI Handler MPI_Test [0:%d][1:%d][2:%d][3:%d]. (cp: %d) (index %d)\n", _controller->getName().c_str(), reqCheck[0], reqCheck[1], reqCheck[2], reqCheck[3], _commPending, index);)
    #else
            DEBUGPRN(printf("[%s] MPI Handler MPI_Waitany enter [0:%p][1:%p][2:%p][3:%p]. (cp: %d)\n", _controller->getName().c_str(), requests[0], requests[1], requests[2], requests[3], _commPending);)

            //DEBUGPRN(printf("[%s] MPI Handler ->", _controller->getName().c_str());for(int i=0;i<4;i++) printf("[%d:%x]", i, requests[i]); printf("\n");)

            MPI_Waitany(4, requests, (int*)&index, &wstatus);
            if (index==ReqNotSet || index==ReqUndefined) {
                MPI_Waitany(4, requests, (int*)&index, &wstatus);
            }
            //DEBUGPRN(printf("[%s] MPI Handler <-", _controller->getName().c_str());for(int i=0;i<4;i++) printf("[%d:%x]", i, requests[i]); printf("\n");)
            DEBUGPRN(printf("[%s] MPI Handler MPI_Waitany exit [0:%p][1:%p][2:%p][3:%p].\n", _controller->getName().c_str(), requests[0], requests[1], requests[2], requests[3]);)
    #endif
            switch (index) {
                case ReqInputPhase1: {
                        DEBUGPRN(printf("[%s] MPI Handler on ReqInputPhase1.\n", _controller->getName().c_str());)

                        if (_channels[csel].theInWorkIndex.gi==-1) {
                            DEBUGPRN(printf("[%s] MPI Handler sig %d received.\n", _controller->getName().c_str(), _channels[csel].theInWorkIndex.wn);)
                            _controller->reportSignal((CommSignal)_channels[csel].theInWorkIndex.wn, wstatus.MPI_SOURCE, _comm);
                            _canBlockOnComm = 0;
                        } else {
                            Work *newWork = _controller->getWork(_channels[csel].theInWorkIndex.gi, _channels[csel].theInWorkIndex.wn, wstatus.MPI_SOURCE, _comm);
                            assert(newWork);
                            MPIEvent *aEvent = new MPIEvent(MPIAddress(wstatus.MPI_SOURCE, _comm), newWork );
                            aEvent->work->index = _channels[csel].theInWorkIndex;
                            aEvent->addr = MPIAddress(wstatus.MPI_SOURCE, _comm);

                            DEBUGPRN(printf("[%s] MPI Handler ReqInputPhase1 index %d rank %d.\n", _controller->getName().c_str(), _channels[csel].theInWorkIndex.wn, wstatus.MPI_SOURCE);)

                            if (requests[csel*4+ReqInputPhase2] != MPI_REQUEST_NULL) {
                                boost::mutex::scoped_lock inLock(inEventsMutex);
                                _channels[csel].inEvents.push(aEvent);
                            } else {
                                _channels[csel].theInEvent = aEvent;
                                _channels[csel].ichunk = 0;

                                assert(_channels[csel].theInEvent->work->data->chunks[_channels[csel].ichunk].size);

                                DEBUGPRN(printf("[%s] MPI Handler MPI_Irecv(%d, chunk: %d[%d], %d, %d).\n", _controller->getName().c_str(), 
                                                theInEvent->work->index.wn, ichunk, theInEvent->work->data->chunks[ichunk].size,
                                                theInEvent->addr.rank, (int)theInEvent->addr.comm);)

                                { // SERIALIZE MPI CALLS
                                    boost::mutex::scoped_lock mpilock(serialMPIMutex);
                                    MPI_Irecv(_channels[csel].theInEvent->work->data->chunks[_channels[csel].ichunk].data, 
                                              _channels[csel].theInEvent->work->data->chunks[_channels[csel].ichunk].size, MPI_BYTE, 
                                              _channels[csel].theInEvent->addr.rank, TAG_PAYLOAD+channelTags[csel], _channels[csel].theInEvent->addr.comm, 
                                              &(requests[csel*4+ReqInputPhase2]));
                                    reqCheck[csel*4+ReqInputPhase2] = TRUE;                                    
                                }

                                //notify only once
                                _controller->notifyInput(aEvent->work->index, aEvent->addr.rank, aEvent->addr.comm);
                            }

                            {
                                boost::mutex::scoped_lock lock(outEventsMutex);
                                _commPending += aEvent->work->data->size;
                            }

                            _canBlockOnComm = 0;

                        }
                        if (!_shouldFinish) {
                            { // SERIALIZE MPI CALLS
                                boost::mutex::scoped_lock mpilock(serialMPIMutex);
                                MPI_Irecv(&(_channels[csel].theInWorkIndex), sizeof(struct WorkIndex), MPI_BYTE, MPI_ANY_SOURCE, TAG_INDEX+channelTags[csel], _comm, 
                                          &(requests[csel*4+ReqInputPhase1]));
                                reqCheck[csel*4+ReqInputPhase1] = TRUE;
                            }

                            assert(requests[csel*4+ReqInputPhase1]);
                        }
                        break;
                    }
                case ReqInputPhase2: {
                        struct {
                            WorkIndex index;
                            MPIAddress addr;
                        } postNotification;

                        DEBUGPRN(printf("[%s] MPI Handler on ReqInputPhase2. (ichunk %d size %d)\n", _controller->getName().c_str(), ichunk, theInEvent->work->data->size);)
                        int shouldNotify;
                        int notifyinput = FALSE;
                        MPIEvent *compEvent = _channels[csel].theInEvent;
                        { //reduces monitor
                            boost::mutex::scoped_lock inLock(inEventsMutex);

                            //print last received data
                            printData("MPI_Irecv", (char *)theInEvent->work->data->chunks[ichunk].data, theInEvent->work->data->chunks[ichunk].size);

                            _channels[csel].ichunk++;

                            if (_channels[csel].ichunk == _channels[csel].theInEvent->work->data->size) {
                                if (!_channels[csel].inEvents.empty()) {
                                    _channels[csel].theInEvent = _channels[csel].inEvents.front();
                                    _channels[csel].inEvents.pop();
                                    _channels[csel].ichunk=0;
                                } else {
                                    _channels[csel].theInEvent = NULL;
                                }
                                shouldNotify = TRUE;
                            } else {
                                shouldNotify = FALSE;
                            }
                            if (_channels[csel].theInEvent) {
                                DEBUGPRN(printf("[%s] MPI Handler MPI_Irecv(%d, chunk: %d[%d], %d, %d).\n", _controller->getName().c_str(), 
                                                theInEvent->work->index.wn, ichunk, theInEvent->work->data->chunks[ichunk].size,
                                                theInEvent->addr.rank, (int)theInEvent->addr.comm);) 

                                { // SERIALIZE MPI CALLS
                                    boost::mutex::scoped_lock mpilock(serialMPIMutex);
                                    MPI_Irecv(_channels[csel].theInEvent->work->data->chunks[_channels[csel].ichunk].data, 
                                              _channels[csel].theInEvent->work->data->chunks[_channels[csel].ichunk].size, MPI_BYTE, 
                                              _channels[csel].theInEvent->addr.rank, TAG_PAYLOAD+channelTags[csel], _channels[csel].theInEvent->addr.comm, 
                                              &(requests[csel*4+ReqInputPhase2]));
                                    reqCheck[csel*4+ReqInputPhase2] = TRUE;
                                }
                                if(_channels[csel].ichunk==0) {
                                    //notify only once
                                    notifyinput = TRUE;
                                    postNotification.index = _channels[csel].theInEvent->work->index;
                                    postNotification.addr = _channels[csel].theInEvent->addr;
                                }
                            }
                        }
                        {
                            boost::mutex::scoped_lock outLock(outEventsMutex);
                            _commPending--;
                        }
                        if (shouldNotify) {
                            _controller->give(compEvent->work, compEvent->addr.rank, compEvent->addr.comm);
                            _canBlockOnComm = 0;
                            delete compEvent;
                        } else {
                            DEBUGPRN(printf("[%s] MPI Handler on ReqInputPhase2. (ichunk %d size %d) (not notified)\n", _controller->getName().c_str(), 
                                            ichunk, theInEvent->work->data->size);)
                        }

                        if(notifyinput) {
                            //due event ordering
                            _controller->notifyInput(postNotification.index, postNotification.addr.rank, postNotification.addr.comm);
                        }


                        break;
                    }
                case ReqOutputPhase1: {
                        {
                            boost::mutex::scoped_lock lock(outEventsMutex);
                            _commPending--;
                        }                    
                        DEBUGPRN(printf("[%s] MPI Handler on ReqOutputPhase1 (_outputPending=%d).\n", _controller->getName().c_str(), _outputPending);)

                        MPIEvent *outEvent = NULL;

                        if (_outputPending) outEvent = _channels[csel].outEvents.front();
                        else outEvent = _channels[csel].theOutEvent;

                        assert(outEvent);
                        if (outEvent->work->index.gi==-1) {
                            DEBUGPRN(printf("[%s] MPI Handler sig %d sent (rank %d) (_outputPending=%d).\n", _controller->getName().c_str(), outEvent->work->index.wn, outEvent->addr.rank, _outputPending);)
                            if (_outputPending) {
                                boost::mutex::scoped_lock lock(outEventsMutex);                            
                                _channels[csel].outEvents.pop();
                                _outputPending = FALSE;
                            }
                            _controller->reportSignal((CommSignal)outEvent->work->index.wn, outEvent->addr.rank, outEvent->addr.comm);
                            _canBlockOnComm = 0;

                            delete outEvent->work;
                            delete outEvent;
                            outEvent = NULL;
                        } else {
                            if (requests[csel*4+ReqOutputPhase2] == MPI_REQUEST_NULL) {
                                if (_outputPending) {
                                    boost::mutex::scoped_lock lock(outEventsMutex);                            
                                    _channels[csel].theOutEvent = outEvent;
                                    _channels[csel].outEvents.pop();
                                }
                                _channels[csel].ochunk = 0;

                                DEBUGPRN(printf("[%s] MPI Handler MPI_Isend(%d, chunk: %d[%d], %d, %d).\n", _controller->getName().c_str(), 
                                                theOutEvent->work->index.wn, ochunk, theOutEvent->work->data->chunks[_channels[csel].ochunk].size,
                                                theOutEvent->addr.rank, (int)theOutEvent->addr.comm);)

                                { // SERIALIZE MPI CALLS
                                    boost::mutex::scoped_lock mpilock(serialMPIMutex);
                                    //assert(_channels[csel].theOutEvent->addr.comm);
                                    MPI_Isend(_channels[csel].theOutEvent->work->data->chunks[_channels[csel].ochunk].data, 
                                              _channels[csel].theOutEvent->work->data->chunks[_channels[csel].ochunk].size, MPI_BYTE, 
                                              _channels[csel].theOutEvent->addr.rank, TAG_PAYLOAD+channelTags[csel], _channels[csel].theOutEvent->addr.comm, 
                                              &(requests[csel*4+ReqOutputPhase2]));
                                    reqCheck[csel*4+ReqOutputPhase2] = TRUE;
                                    printData("MPI_Isend", (char *)theOutEvent->work->data->chunks[ochunk].data, theOutEvent->work->data->chunks[ochunk].size);
                                }

                                _controller->notifyOutput(_channels[csel].theOutEvent->work->index, _channels[csel].theOutEvent->addr.rank, _channels[csel].theOutEvent->addr.comm);
                            }
                        }
                        break;
                    }
                case ReqOutputPhase2: {
                        int shouldNotify;
                        int notifyOutput = FALSE;
                        {
                            boost::mutex::scoped_lock lock(outEventsMutex);
                            _commPending--;
                        }                    
                        DEBUGPRN(printf("[%s] MPI Handler on ReqOutputPhase2.\n", _controller->getName().c_str());)
                        //process out, notify controller, recv if pending
                        MPIEvent *compEvent = _channels[csel].theOutEvent;

                        _channels[csel].ochunk++;
                        if (_channels[csel].ochunk == _channels[csel].theOutEvent->work->data->size) {
                            if (_outputPending && _channels[csel].theOutEvent->work->index.gi!=-1) {
                                assert(!_channels[csel].outEvents.empty());
                                boost::mutex::scoped_lock lock(outEventsMutex);
                                _channels[csel].theOutEvent = _channels[csel].outEvents.front();
                                _channels[csel].outEvents.pop();
                                _channels[csel].ochunk = 0;
                                _outputPending=FALSE;
                            } else {
                                _channels[csel].theOutEvent = NULL;
                            }
                            shouldNotify = TRUE;
                        } else {
                            shouldNotify = FALSE;
                        }
                        if (_channels[csel].theOutEvent && _channels[csel].theOutEvent->work->index.gi>=0) {
                            DEBUGPRN(printf("[%s] MPI Handler MPI_Isend(%d, chunk: %d[%d], %d, %d).\n", _controller->getName().c_str(), 
                                            theOutEvent->work->index.wn, _channels[csel].ochunk, theOutEvent->work->data->chunks[ochunk].size,
                                            theOutEvent->addr.rank, (int)theOutEvent->addr.comm);) 

                            { // SERIALIZE MPI CALLS
                                boost::mutex::scoped_lock mpilock(serialMPIMutex);
                                //assert(_channels[csel].theOutEvent->addr.comm);
                                MPI_Isend(_channels[csel].theOutEvent->work->data->chunks[_channels[csel].ochunk].data, 
                                          _channels[csel].theOutEvent->work->data->chunks[_channels[csel].ochunk].size, 
                                          MPI_BYTE, _channels[csel].theOutEvent->addr.rank, TAG_PAYLOAD+channelTags[csel], _channels[csel].theOutEvent->addr.comm, 
                                          &(requests[csel*4+ReqOutputPhase2]));
                                reqCheck[csel*4+ReqOutputPhase2] = TRUE;
                                //printData("MPI_Isend", (char *)theOutEvent->work->data->chunks[_channels[csel].ochunk].data, theOutEvent->work->data->chunks[_channels[csel].ochunk].size);
                            }
                            if(_channels[csel].ochunk==0) {
                                notifyOutput = TRUE;
                            }
                        }
                        if (shouldNotify) {
                            _controller->report(compEvent->work, compEvent->addr.rank, compEvent->addr.comm);
                            _canBlockOnComm = 0;
                            // delete compEvent; //delete delayed
                        }
                        if(notifyOutput) {
                            _controller->notifyOutput(_channels[csel].theOutEvent->work->index, _channels[csel].theOutEvent->addr.rank, _channels[csel].theOutEvent->addr.comm);
                        }

                        if (shouldNotify) {
                            delete compEvent;
                        }
                        break;
                    }
                default: {
                        DEBUGPRN(printf("[%s] MPIHandler MPI_WaitAny index value MPI_UNDEFINED\n", _controller->getName().c_str());)
                        printf("[%s] MPIHandler MPI_WaitAny index value MPI_UNDEFINED\n", _controller->getName().c_str());
                    }
            }
            DEBUGPRN(printf("[%s] MPIHandler commPending=%d\n", _controller->getName().c_str(), _commPending);)

            if (_commPending<0) printf("[%s] MPIHadler negative commPending\n", _controller->getName().c_str());
            assert(_commPending>=0);
        } while (!_shouldFinish || (_shouldFinish && _commPending!=0));

        DEBUGPRN(printf("[%s] MPIHadler loop finished\n", _controller->getName().c_str());)

        for(csel=0; csel<_channelNumber; csel++) {
            if (requests[csel*4+ReqInputPhase1] != MPI_REQUEST_NULL) {
                DEBUGPRN(printf("[%s] MPIHadler cancelling income request.\n", _controller->getName().c_str());)
                //cancel pending receives
                // SERIALIZE MPI CALLS
                boost::mutex::scoped_lock mpilock(serialMPIMutex);
                MPI_Cancel(&requests[csel*4+ReqInputPhase1]);
                reqCheck[csel*4+ReqInputPhase1] = FALSE;
            }
        }
        DEBUGPRN(printf("[%s] MPIHadler Finished\n", _controller->getName().c_str());)
    }

    /**
     * Requests termination of run(): sets _shouldFinish under outEventsMutex and wakes
     * every thread waiting on outEventsEmpty.
     */
    virtual void shouldFinish() {
        DEBUGPRN(printf("[%s] MPI Handler on shouldFinish(). (E)\n", _controller->getName().c_str());)
        boost::mutex::scoped_lock guard(outEventsMutex);
        _shouldFinish = TRUE;
        outEventsEmpty.notify_all();
        DEBUGPRN(printf("[%s] MPI Handler on shouldFinish(). (X)\n", _controller->getName().c_str());)
    }
    /**
     * Updates _waitDataToSend under outEventsMutex and wakes every thread waiting on
     * outEventsEmpty so that the new value is seen.
     *
     * @param val new value of the flag (TRUE/FALSE)
     */
    virtual void setWaitDataOnSend(int val) {
        DEBUGPRN(printf("[%s] MPI Handler on setWaitDataOnSend(%d) (from %s).\n", _controller->getName().c_str(), val, getCallStack().c_str());)
        boost::mutex::scoped_lock guard(outEventsMutex);
        _waitDataToSend = val;
        outEventsEmpty.notify_all();
    }
    /**
     * @return the current value of _waitDataToSend (read without taking outEventsMutex)
     */
    virtual int getWaitDataOnSend() {
        const int current = _waitDataToSend;
        return current;
    }
    /**
     * Counts the events queued for sending on all channels.
     * The queues are read while holding outEventsMutex.
     *
     * @return total number of entries in the outEvents queues of every channel
     */
    virtual int getOutQueueLength() {
        boost::mutex::scoped_lock lock(outEventsMutex);
        int ql=0;
        for(int i=0; i<_channelNumber; i++) {
            // was _channels[0]: the default channel was counted _channelNumber times
            // and the other channels were ignored
            ql+=(int)_channels[i].outEvents.size();
        }
        DEBUGPRN(printf("[%s] MPI getOutQueueLength() = %d\n", _controller->getName().c_str(), ql);)

        return ql;
    }
};

#endif


#endif




/**
 * Represents a processor following the producer/consumer scheme
 **/
class ComputeProcessor : virtual public ProcessorInterface, virtual public  Runnable {
private:
    std::queue<Work *> inQueue;
    boost::mutex inQueueMutex;
    boost::condition inQueueEmpty;

    std::queue<Work *> outQueue;
    boost::mutex outQueueMutex;
    boost::condition outQueueEmpty;

    int _shouldFinish;
    int _hasPending;

    double _startTime, _endTime, _busyTime, _procStart;
protected:
    inline int hasPending() {
        return _hasPending;
    }
    virtual Work *process(Work *aWork) {
        DEBUGPRN(printf("Processor process(%d) on  pid %d\n", aWork->index.wn, getpid());)
        return aWork;
    }
    virtual void selfGive(Work *outWork) {
        DEBUGPRN(printf("Processor on pid %d selfGive() enter\n", getpid());)
        boost::mutex::scoped_lock lock(outQueueMutex);
        outQueue.push(outWork);
        outQueueEmpty.notify_all();
        DEBUGPRN(printf("Processor on pid %d selfGive() exit\n", getpid());)
    }
    virtual Work *selfTake(int waitForData=FALSE) {
        DEBUGPRN(printf("Processor on pid %d selfTake() enter\n", getpid());)
        boost::mutex::scoped_lock lock(inQueueMutex);
        while (inQueue.empty() && !_shouldFinish && waitForData) inQueueEmpty.wait(lock);
        if (inQueue.empty()) return NULL;
        Work *inWork = inQueue.front();
        inQueue.pop();
        DEBUGPRN(printf("Processor on pid %d selfTake() exit\n", getpid());)
        _hasPending = !inQueue.empty();
        return inWork;
    }
    void shouldFinish() {
        boost::mutex::scoped_lock lock(inQueueMutex);
        _shouldFinish = TRUE;
        _endTime = sampleTime();
        inQueueEmpty.notify_all();
    }
public:
    inline double getEfficiency() {
        return _busyTime/(_endTime-_startTime);
    }
    inline double getBusyTime() {
        return _busyTime;
    }
    void run() {
        DEBUGPRN(printf("Processor Running on pid %d\n", getpid());)
        _startTime = sampleTime();
        _busyTime = 0;
        while (!_shouldFinish) {
            Work *inWork = selfTake(TRUE);
            if (!inWork) {             
                break;
            }
            _procStart = sampleTime();
            Work *res = process(inWork);
            _busyTime += sampleTime() - _procStart;
            selfGive(res);
        }
        DEBUGPRN(printf("Processor Finished on pid %d\n", getpid());)
    }
    virtual void give(Work *work) {
        DEBUGPRN(printf("Processor on pid %d give() enter\n", getpid());)
        boost::mutex::scoped_lock lock(inQueueMutex);
        inQueue.push(work);
        inQueueEmpty.notify_all();
        DEBUGPRN(printf("Processor on pid %d give() exit\n", getpid());)
        _hasPending = TRUE;
    }

    virtual Work *take() {
        return take(TRUE);
    }
    virtual Work *take(int waitForData) {
        printf("Processor on pid %d take() enter\n", getpid());
        boost::mutex::scoped_lock lock(outQueueMutex);
        if (outQueue.empty() && !waitForData) return NULL;
        while (outQueue.empty()) outQueueEmpty.wait(lock);
        Work *response = inQueue.front();
        inQueue.pop();
        printf("Processor on pid %d take() exit\n", getpid());
        return response;
    }
    ComputeProcessor() : _shouldFinish(FALSE), _hasPending(FALSE) {
    }
    virtual ~ComputeProcessor() {
    }
    void setFinalization() {
        boost::mutex::scoped_lock lock(outQueueMutex);
        _shouldFinish = TRUE;
        outQueueEmpty.notify_all();
    }
};

// MAKE_HASH(x): reinterprets the storage starting at the address of x as a
// single 64-bit integer and yields that value (INT64 on WIN32, int64_t
// elsewhere). x must be an lvalue.
// NOTE(review): presumably used on objects that are exactly 8 bytes wide
// (e.g. two packed ints) — confirm at the call sites.
#ifdef WIN32
    #define MAKE_HASH(x) (*(INT64 *)(&(x)))
#else
    #define MAKE_HASH(x) (*(int64_t *)(&(x)))
#endif

/**
 * Bookkeeping record the master keeps for one worker.
 */
struct Worker {
    /** Values stored in commState. */
    enum WorkerState {
        WSCommBound = 1,
        WSCompBound = 2,
        WSTerminate = 3
    };

    int index;                  // value passed as first constructor argument
    struct MPIAddress mpiAddr;  // MPI rank/communicator of the worker
    int queue;                  // incremented each time a work is scheduled to this worker
    int wp;
    double lastSchedule;        // sampleTime() of the most recent schedule
    WorkerState commState;

    int scIndex;                // subcluster index
    int isActive;

    /**
     * @param i  worker index
     * @param a  MPI address of the worker
     * @param sc subcluster index
     */
    Worker(int i, struct MPIAddress a, int sc)
        : index(i),
          mpiAddr(a),
          queue(0),
          wp(0),
          lastSchedule(0),
          commState(WSCommBound),
          scIndex(sc),
          isActive(TRUE)
    {
    }
};



/**
 * Statistics record for a single work unit. WorkStorage::createStats()
 * allocates one of these as the payload of a Work whose grain index is -2.
 */
struct worker_wu_stat {
    struct MPIAddress mpiaddr;      // NOTE(review): presumably the reporting worker's address — confirm
    struct WorkIndex index;         // index of the work unit the statistics refer to
    struct gmwat_stat_item stats;
};

/**
 * Aggregate statistics record. WorkStorage::createStats() allocates one of
 * these as the payload of a stats Work whose grain index is not -2 (i.e. -3).
 */
struct worker_wu_stat_total {
    struct MPIAddress mpiaddr;  // NOTE(review): presumably the reporting worker's address — confirm
    double busyTime;
    double efficiency;
};


class WorkStorage {
private:
    std::map<int, std::queue<Work * > > reuseInput, reuseOutput;
    boost::mutex inMutex, outMutex;
public:
    WorkStorage() {
    }
    virtual ~WorkStorage() {
        for (std::map<int, std::queue<Work *> >::const_iterator it = reuseInput.begin(); it!=reuseInput.end(); it++) {
            int gi = (*it).first;           
            while (!reuseInput[gi].empty()) {
                user_freeWorkData(reuseInput[gi].front()->data);
                reuseInput[gi].front()->data=NULL;
                delete reuseInput[gi].front();
                reuseInput[gi].pop();
            }
        }
        for (std::map<int, std::queue<Work *> >::const_iterator it = reuseOutput.begin(); it!=reuseOutput.end(); it++) {
            int gi = (*it).first;           
            while (!reuseOutput[gi].empty()) {
                user_freeWorkData(reuseOutput[gi].front()->data);
                reuseOutput[gi].front()->data=NULL;
                delete reuseOutput[gi].front();
                reuseOutput[gi].pop();
            }
        }    
    }

    Work *takeRequest(int gi, int wn, WorkData *top=NULL) {
        if(gi == -2 || gi == -3) {
            return createStats(gi, wn);
        } else {
            DEBUGPRN(printf("takeRequest(%d,%d,%p)\n", gi, wn, top);)
    
            boost::mutex::scoped_lock lock(inMutex);
            Work *ret;
            if (reuseInput[gi].empty()) {
                ret = new Work();           
            } else {
                ret = reuseInput[gi].front();            
                reuseInput[gi].pop();            
                assert(ret->index.gi==gi);
            }
            ret->index.gi = gi;
            ret->index.wn = wn;
            ret->data = user_buildWorkData(gi, wn, top, ret->data, TRUE);
            return ret;
        }
    }

    Work *takeResponse(int gi, int wn, WorkData *top=NULL) {
        if(gi == -2 || gi == -3) {
            return createStats(gi, wn);
        } else {
            DEBUGPRN(printf("takeResponse(%d,%d,%p)\n", gi, wn, top);)
    
            boost::mutex::scoped_lock lock(outMutex);
            Work *ret;
            if (reuseOutput[gi].empty()) {
                ret = new Work();           
            } else {
                ret = reuseOutput[gi].front();            
                reuseOutput[gi].pop();
                assert(ret->index.gi==gi);
            }
            ret->index.gi = gi;
            ret->index.wn = wn;        
            ret->data = user_buildWorkData(gi, wn, top, ret->data, FALSE);
            return ret;
        }
    }

    void giveRequest(Work *work) {
        if(work->index.gi==-2 || work->index.gi==-3) {
            deleteStats(work);
        } else {
            DEBUGPRN(printf("giveRequest(%d,%d,%p)\n", work->index.gi, work->index.wn, work->data);)
    
            boost::mutex::scoped_lock lock(inMutex);
            reuseInput[work->index.gi].push(work);
        }
    }

    void giveResponse(Work *work) {        
        if(work->index.gi==-2 || work->index.gi==-3) {
            deleteStats(work);
        } else {
            DEBUGPRN(printf("giveResponse(%d,%d,%p\n", work->index.gi, work->index.wn, work->data);)
            boost::mutex::scoped_lock lock(outMutex);
            reuseOutput[work->index.gi].push(work);
        }
    }

    void deleteStats(Work *work) {
        DEBUGPRN(printf("%d deleteStats(%d,%d) %p\n", getpid(), work->index.gi, work->index.wn, work);)
        if(work->index.gi==-2) {
            delete (struct worker_wu_stat*)work->data->chunks->data;
        } else {
            delete (struct worker_wu_stat_total*)work->data->chunks->data;
        }            
        delete work->data->chunks;
        delete work->data;
        work->data = NULL;
        delete work;
    }

    Work *createStats(int gi, int wn) {
        Work *ret = new Work();
        ret->index.gi = gi;
        ret->index.wn = wn;        
        ret->data = new struct WorkData;
        ret->data->count =1;
        ret->data->chunks = new struct DataChunk;
        ret->data->chunks->ident=0;
        ret->data->chunks->refs=0;
        if(gi==-2) {
            ret->data->chunks->size = sizeof(struct worker_wu_stat);
            ret->data->chunks->data = new struct worker_wu_stat;
        } else {
            ret->data->chunks->size = sizeof(struct worker_wu_stat_total);
            ret->data->chunks->data = new struct worker_wu_stat_total;
        }
        DEBUGPRN(printf("%d createStats(%d,%d) %p\n", getpid(), gi, wn, ret);)
        return ret;
    }
};


/**
 * Scheduling record for one work grain: its index, the grain it was divided
 * from, completion counters, the assigned worker and the request/response
 * Work objects attached to it.
 */
struct WorkSched {
    // Every scalar member is initialised, including delta, which used to be
    // left indeterminate by this constructor.
    WorkSched() : topIndex(-1,-1), completeResponse(0), completeRequest(0), delta(0),
    state(0), subindex(0), workerIndex(-1), theWork(NULL), theResponse(NULL), status(0) {
    }
    struct WorkIndex index; // grain index 
    struct WorkIndex topIndex; // index of the parent grain; wn < 0 means "no parent"
    int completeResponse, completeRequest, delta;
    int state;
    int subindex; // division part
    int workerIndex; // assigned worker (-1 while unassigned)
    Work *theWork, *theResponse;
    int status;
    #define WORKSHED_STATUS_PENDING_GIVE 2
};

/**
 * Per-grain-class scheduling data: the division factor applied when a parent
 * grain is broken down into this class, the weight one grain of this class
 * adds to the scheduled-work counter, and the queue of grains of this class
 * still waiting to be scheduled.
 */
struct WorkClass {
    //int ps, rs;
    //double nx;
    //double nxd;
    // topdiv and delta start at 0 instead of holding indeterminate values
    // until the owner fills them in.
    WorkClass() : topdiv(0), delta(0) /*, wc(0), wu(0)*/ {
    }
    int topdiv; // number of sub-grains created per parent grain
    int delta;  // amount added to the scheduled counter per grain of this class
    std::queue<struct WorkSched *> pending;
    //int index;
    //int wc; // completed itens
    //int wu;
};

/**
 * Describes one sub-cluster: a contiguous slice of the master's worker table.
 */
struct SubCluster {
    //std::vector<struct vmconfig> vm; /* maps nw metadata*/
    //std::vector<struct Worker *> pvm;
    //int fNetIn, fNetOut;
    int numWorkers;        // number of workers in the slice
    int numWorkersConfig;
    int workersOffset;     // index of the slice's first worker

    int full;              // set to TRUE when no worker of the slice can take more work

    //std::string wan;
    //std::string lan;
    //std::string ctx;

    //int , uBusy, stage;

    //int currentRound;	

    /** Creates an empty, not-full sub-cluster. */
    SubCluster()
        : numWorkers(0),
          numWorkersConfig(0),
          workersOffset(0),
          full(FALSE)
    {
    }
};



/**
 * Communication timing record; MasterProcessor keeps one per WorkIndex.
 * NOTE(review): start* presumably hold sampleTime() stamps and *Time the
 * measured durations — confirm where the fields are filled in.
 */
struct CommStats {
    double startSend;
    double sendTime;
    double startRecv;
    double recvTime;
    // (leftover commented-out constructor; it belongs to AsyncMessage)
    //AsyncMessage(Work *w, int r, int c) : work(w), rank(r), comm(c) {
    //}
};

/**
 * Bundles a Work pointer with the MPI rank and communicator it is associated
 * with.
 */
struct AsyncMessage {
    Work *work;
    int rank;
    MPI_Comm comm;

    /**
     * @param w work item
     * @param r MPI rank
     * @param c MPI communicator
     */
    AsyncMessage(Work *w, int r, MPI_Comm c)
        : work(w),
          rank(r),
          comm(c)
    {
    }
};



// File-scope grain-class indices used by MasterProcessor: _gi and _bgi index
// its workClasses vector in getWorkIndex()/getBaseGrain().
// NOTE(review): _maxgi is not referenced in this part of the file — confirm its use.
int _gi, _bgi, _maxgi; 

/*
 * Handles work processing logic: chooses workers, breaks grains down into
 * smaller ones, hands work to the MPI handler and tracks the master /
 * submaster state machines.
 */ 
class MasterProcessor : /*public  ComputeProcessor, */public ControllerInterface, protected WorkStorage {
private:
    int _rank; //myrank
    MPI_Comm _comm;

    int _iter;
    /*
        _ws - work queued to transmission (inc on schedule and dec on report)
        _wp - any work pending (inc in schedule and dec on give), 
        _wn - next big work number (inc on big grain creation), 
        _wgs - any work grain schedule count
    */
    int _wu, _wc, _ws, _wp, _wn, _wgs; 

    std::map<MPI_Comm, std::map<int, int> > windex;
    // Worker table, indexed by the workerIndex values used by the scheduler.
    std::vector<Worker> workers;

    std::queue<AsyncMessage> _asyncGive;

    // Communication timing records keyed by work index.
    typedef std::map< struct WorkIndex, struct CommStats, lessWorkIndex > commStats_t;
    commStats_t commStats, smCommStats;

public:
    //int _gi, _bgi; 
private:
    std::string _name;
    // Reset to 0 whenever a signal is sent to every worker; the wait loops
    // compare it against workers.size().
    int _sigLoop;
    CommSignal _currSignal;
    // Master state machine; changed only through setState().
    enum MasterState {
        MSInitialization = 2,
        MSSteadyDist = 3,
        MSSteady =4,
        MSSteadyFin =5,
        MSFinalization = 6,
        MSTermination = 8,
        MSFinished = 9
    } _state;

    // Submaster state machine.
    enum SubMasterState {
        SMSSteady=2,
        SMSFinalization=3,
        SMSTermination=4
    } _smstate;

    // iterationMutex is held while schedule() updates the scheduling counters;
    // iterationEnd is the condition the wait*() helpers and getBaseGrain()
    // block on.
    boost::mutex iterationMutex;
    boost::condition iterationEnd;

    // Held while checkState()/startFinalization()/startTermination()/
    // broadcastSignal() inspect or change the state.
    boost::mutex stateMutex;

    // Held while schedule()/getWorkIndex() access the workload map.
    boost::mutex workloadAcess;

    // Worker selection policy used by getActiveWorker().
    enum {
        MIN_FREE,
        ROUND_BASED,
        LRU_BASED
    } schedType;

    // Sub-clusters; schedule() asks each one in turn for a free worker.
    std::vector<struct SubCluster> dvm;
    // Scheduling records of the grains handled so far, keyed by work index.
    typedef std::map<struct WorkIndex, struct WorkSched, lessWorkIndex> workload_t;
    workload_t workload;
    // Grain classes, indexed by grain index (gi).
    std::vector<struct WorkClass *> workClasses;

    // Grains waiting to be taken by getBaseGrain() on a submaster.
    std::queue<struct WorkSched *> pendingWork;

    // divGrains counts the WorkSched entries currently waiting in pending
    // queues (incremented on push, decremented on pop).
    int defaultPreferedClass, divGrains; 

    // -1 on the top-level master (getBaseGrain() then creates the grains
    // itself); >= 0 on a submaster.
    int _masterRank;
    MPI_Comm _masterComm;
    int _inRecv;

    // Accumulated time spent in user_readWorkData().
    double _serialTime;
    double _startTime, _endTime;

    int _statCount; // loop stat receive
    double _progressSample, _progressTime;

    // Looked up by worker rank in schedule() to obtain the channel passed to
    // the MPI handler.
    std::map<int,int> _channelMap;

    /**
     * Maps a SubMasterState value to its enumerator name for trace output;
     * values outside the enumeration yield "UNKNOWN".
     */
    inline static char *getSubStateName(enum SubMasterState ms) {
        if (ms == SMSSteady)       return "SMSSteady";
        if (ms == SMSFinalization) return "SMSFinalization";
        if (ms == SMSTermination)  return "SMSTermination";
        return "UNKNOWN";
    }

    /**
     * Maps a MasterState value to its enumerator name for trace output;
     * values outside the enumeration yield "UNKNOWN".
     */
    inline static char *getStateName(enum MasterState ms) {
        if (ms == MSInitialization) return "MSInitialization";
        if (ms == MSSteadyDist)     return "MSSteadyDist";
        if (ms == MSSteady)         return "MSSteady";
        if (ms == MSSteadyFin)      return "MSSteadyFin";
        if (ms == MSFinalization)   return "MSFinalization";
        if (ms == MSTermination)    return "MSTermination";
        if (ms == MSFinished)       return "MSFinished";
        return "UNKNOWN";
    }


    /**
     * Enters the termination state: sends CSShouldFinish to every worker,
     * marks each worker as WSTerminate and switches _state to MSTermination.
     * Must not be called when the master is already in MSTermination.
     */
    void startTermination() {
        boost::mutex::scoped_lock lock(stateMutex);
        // Check the precondition only once stateMutex is held (as
        // startFinalization() does) so _state is not read unsynchronised.
        assert(_state!=MSTermination);

        DEBUGPRN(printf("%s started termination cycle (E)\n", _name.c_str());)
        // Restart the signal count for this round of signals.
        _sigLoop = 0;
        DEBUGPRN(printf("%s >>> STATE - TERMINATION\n", _name.c_str());)
        for (int i=0; i<(int)workers.size(); i++) {
            _mpiHandler->giveSignal(CSShouldFinish, workers[i].mpiAddr.rank, workers[i].mpiAddr.comm);
            workers[i].commState = Worker::WSTerminate;
        }
        setState(MSTermination);
        DEBUGPRN(printf("%s started termination cycle (X)\n", _name.c_str());)
    }


    /**
     * Sends a signal to every worker and blocks until _sigLoop has reached
     * the number of workers.
     *
     * @param sig signal handed to the MPI handler for each worker
     */
    void broadcastSignal(enum CommSignal sig) {
        { // transmission stage
            boost::mutex::scoped_lock lock(stateMutex);
            _sigLoop = 0;
            for (int i=0; i<(int)workers.size(); i++) {
                _mpiHandler->giveSignal(sig, workers[i].mpiAddr.rank, workers[i].mpiAddr.comm);
            }
        }
        { // wait delivery
            boost::mutex::scoped_lock lock(iterationMutex);
            while (_sigLoop<(int)workers.size()) {
                // workers.size() is a size_t: cast it so it matches the %d conversion.
                DEBUGPRN(printf("%s wait for broadcastSignal end phase (E) _sigloop: %d workersSize: %d _smstate %s\n", _name.c_str(), _sigLoop, (int)workers.size(), getSubStateName(_smstate));)
                iterationEnd.wait(lock);
                DEBUGPRN(printf("%s wait for broadcastSignal end phase (X)\n", _name.c_str());)
            }
        }
    }

    void startFinalization() {
        DEBUGPRN(stateEnter("startFinalization()");)
        boost::mutex::scoped_lock lock(stateMutex);

        assert(_state!=MSFinalization);
        DEBUGPRN(printf("%s started finalization cicle (E)\n", _name.c_str());)
        _sigLoop = 0;
        DEBUGPRN(printf("%s >>> STATE - FINALIZATION\n", _name.c_str());)
        for (int i=0; i<(int)workers.size(); i++) {
            _mpiHandler->giveSignal(CSFinalizationStart, workers[i].mpiAddr.rank, workers[i].mpiAddr.comm);
            workers[i].commState = Worker::WSCommBound;
        }
        setState(MSFinalization);
        DEBUGPRN(printf("%s started finalization cicle (X)\n", _name.c_str());)
        DEBUGPRN(stateExit();)
    }

    /**
     * Final shutdown step: asks the MPI handler to finish, clears its
     * wait-data-on-send flag, wakes every thread blocked on iterationEnd and
     * moves the master to MSFinished.
     * Note: stateMutex is not taken around setState() here.
     */
    void finalize() {
        DEBUGPRN(stateEnter("finalize()");)
        _mpiHandler->shouldFinish();
        _mpiHandler->setWaitDataOnSend(FALSE);
        iterationEnd.notify_all();
        setState(MSFinished);
        DEBUGPRN(stateExit();)
    }

    void schedule(int workerIndex=-1, Work *lastWork=NULL) {
        DEBUGPRN(printf("%s schedule idx=%d work %p\n", _name.c_str(), workerIndex, lastWork);)
        //assert(workClasses[_gi]->ws < workClasses[_gi]->_wu);

        for (int i=0; i<(int)dvm.size(); i++)
            if (!dvm[i].full && (workerIndex = getActiveWorker(i))>=0) break;

        if (workerIndex==-1) {
            checkState();
            DEBUGPRN(printf("%s queue FULL (should not happen)\n", _name.c_str());)
            assert(_state==MSSteady);
            if (lastWork) giveRequest(lastWork);
            return;
        }

        struct WorkIndex workIndex = getWorkIndex();

        if(_smstate==SMSFinalization && workIndex.wn<0) {
            DEBUGPRN(printf("%s schedule canceled _ws=%d (gi: %d), _state=%d \n", _name.c_str(), _ws, _gi, _state);) 
            return;
        }

        DEBUGPRN(printf("%s schedule _ws=%d (gi: %d), _state=%d workIndex (%d,%d)\n", _name.c_str(), _ws, _gi, _state, workIndex.gi, workIndex.wn);) 

        assert(workIndex.wn>=0);
      
        {
            boost::mutex::scoped_lock lock(workloadAcess);
            workload[workIndex].workerIndex = workerIndex;
        }

        {
            boost::mutex::scoped_lock lock(iterationMutex);
            workers[workerIndex].queue++;
            workers[workerIndex].lastSchedule = sampleTime();

            _ws += workClasses[workIndex.gi]->delta;
            _wp++;
            _wgs++;
        }

#ifdef _PRINT_DEBUG
        {
            boost::mutex::scoped_lock lock(workloadAcess);
            dumpWorkload();
        }            
#endif


        _mpiHandler->give(workload[workIndex].theWork, workers[workerIndex].mpiAddr.rank, workers[workerIndex].mpiAddr.comm, _channelMap[workers[workerIndex].mpiAddr.rank]);

        DEBUGPRN(printf("%s schedule _ws=%d (gi: %d), _state=%d workIndex (%d,%d)\n", _name.c_str(), _ws, _gi, _state, workIndex.gi, workIndex.wn);)       
    }


    /**
     * Chooses the worker that should receive the next grain.
     *
     * LRU_BASED: inside sub-cluster clusterIndex, prefers an active worker
     * with an empty queue; failing that, the active worker with fewer than two
     * queued grains whose last schedule is the oldest. When none qualifies the
     * sub-cluster is marked full and -1 is returned.
     *
     * Any other policy: scans the whole worker table (clusterIndex is not
     * used) for the worker with fewer than two queued grains whose last
     * schedule is the oldest.
     *
     * @param clusterIndex index into dvm
     * @return index into workers, or -1 when no worker can take more work
     */
    int getActiveWorker(int clusterIndex) {
        int workerIndex=-1;

        switch (schedType) {
            case LRU_BASED: {
                    const int first = dvm[clusterIndex].workersOffset;
                    const int last = first + dvm[clusterIndex].numWorkers;

                    for (workerIndex=first; workerIndex<last; workerIndex++)
                        if (workers[workerIndex].isActive && workers[workerIndex].queue==0) break;
                    if (workerIndex==last) {
                        //search must older asignation
                        int lastShedWorker = -1;
                        double lastSched=sampleTime();
                        for (workerIndex=first; workerIndex<last; workerIndex++) {
                            if (workers[workerIndex].isActive && workers[workerIndex].lastSchedule<lastSched && workers[workerIndex].queue<2) {
                                lastSched = workers[workerIndex].lastSchedule;
                                lastShedWorker = workerIndex;
                            }
                        }
                        if (lastShedWorker==-1) {
                            dvm[clusterIndex].full = TRUE;
                            // Report "no worker": the loop above left
                            // workerIndex one past the end of the slice, which
                            // used to be returned as if it were a valid index.
                            workerIndex = -1;
                            break;
                        } else {
                            workerIndex = lastShedWorker;
                            DEBUGPRN(printf("EVT_WORK_GENERATION schedule reason LRU\n");)
                        }
                    } else {
                        DEBUGPRN(printf("EVT_WORK_GENERATION schedule reason QUEUE=0\n");)
                    }                   
                    break;
                }
            default: {
                    double ls = sampleTime();
                    for (int i=0; i<(int)workers.size(); i++) {
                        //printf("\tMaster.%d query schedule index %d queue %d ls %f\n", getpid(), i, workers[i].queue, workers[i].lastSchedule);
                        if (workers[i].lastSchedule<ls && workers[i].queue<2) {
                            ls = workers[i].lastSchedule;
                            workerIndex = i;
                        }
                    }
                }
        }
        if(workerIndex>=0) {
            DEBUGPRN(printf("\tMaster.%d query schedule index %d mpiaddr %d,%d queue %d ls %f\n", getpid(), workerIndex, workers[workerIndex].mpiAddr.rank, (int)workers[workerIndex].mpiAddr.comm, workers[workerIndex].queue, workers[workerIndex].lastSchedule);)
        }
        return workerIndex;
    }

    /**
     * Switches the master state machine to ms. Asserts that the new state
     * differs from the current one. Takes no lock itself.
     */
    void setState(enum MasterState ms) {
        DEBUGPRN(printf("%s setState(%s) _state=%s\n", _name.c_str(), getStateName(ms), getStateName(_state));)
        assert(_state!=ms);
        _state = ms;
    }

    /**
     * Re-evaluates the master state machine from the current counters and,
     * unless MPI_POOLING is defined, updates the MPI handler's
     * wait-data-on-send flag to match. Runs entirely under stateMutex.
     */
    inline void checkState() {
        DEBUGPRN(stateEnter("checkState()");)
        boost::mutex::scoped_lock lock(stateMutex);        
        DEBUGPRN(printf("%s checkSstate() _state=%s (E) _wc: %d _ws: %d _wu: %d \n", _name.c_str(), getStateName(_state), _wc, _ws, _wu);)

        if (_state==MSInitialization) {
            // Initialization ends once one grain has been scheduled per worker.
            if (_wgs==(int)workers.size()) {
                setState(MSSteadyDist);
            }
#ifndef MPI_POOLING
            // A submaster (_masterRank>=0) that has scheduled everything it has
            // and holds no divided grains clears the flag; otherwise it is set.
            if (_masterRank>=0 && _ws==_wu && divGrains==0) {
                _mpiHandler->setWaitDataOnSend(FALSE);
            } else {
                _mpiHandler->setWaitDataOnSend(TRUE);
            }
#endif
        }
        if (_state==MSSteady || _state==MSSteadyDist) {
            // MSSteady when at least two works are pending per worker, or when
            // this is a submaster with nothing left to schedule (_ws==_wu);
            // MSSteadyDist otherwise.
            if (_wp>=((int)workers.size()*2) || (_masterRank>=0 && _ws==_wu)) {
                if (_state==MSSteadyDist) {
#ifndef MPI_POOLING
                    _mpiHandler->setWaitDataOnSend(FALSE);
#endif
                    setState(MSSteady);
                }
            } else {
                if (_state==MSSteady) {
#ifndef MPI_POOLING
                    _mpiHandler->setWaitDataOnSend(TRUE);
#endif
                    setState(MSSteadyDist);
                }
            }
        }
#ifndef MPI_POOLING
        if (_state==MSFinalization) {
            if (_wc==_wu && !_mpiHandler->getWaitDataOnSend()) {
                if (_sigLoop==(int)workers.size()&&_smstate==SMSFinalization) {
                    // all signals received... should wait for termination signal
                    _mpiHandler->setWaitDataOnSend(FALSE);
                } else {
                    // all results received, mpihandle shoud wait for finalization signals
                    _mpiHandler->setWaitDataOnSend(TRUE);
                }
            } else if (_wc<_wu && _mpiHandler->getWaitDataOnSend()) {
                // results pending
                _mpiHandler->setWaitDataOnSend(FALSE);
            }
        }
        if (_state==MSTermination && !_mpiHandler->getWaitDataOnSend() && _sigLoop==(int)workers.size()) {
            // wait for notification
            //printf("DEBUG wait for notification\n");
            _mpiHandler->setWaitDataOnSend(TRUE);
        }
#endif
        DEBUGPRN(printf("%s checkState() _state=%s (X)\n", _name.c_str(), getStateName(_state));)

        DEBUGPRN(stateExit();)
    }

    /**
     * Produces the next base grain to schedule.
     *
     * On the top-level master (_masterRank == -1) a new WorkSched of class
     * _bgi is created, numbered from _wn, and its data is read with
     * user_readWorkData() (the time spent there is added to _serialTime).
     *
     * On a submaster the call blocks on iterationEnd while there are no
     * divided grains and the submaster is steady or receiving; it then takes
     * the oldest entry of pendingWork, if any.
     *
     * @return a heap-allocated WorkSched owned by the caller, or NULL when a
     *         submaster has no grain available
     */
    WorkSched *getBaseGrain() {
        WorkSched *schedWork=NULL;

        DEBUGPRN(printf("%s getBaseGrain() _bgi=%d divGrains=%d\n", _name.c_str(), _bgi, divGrains);)

        if (_masterRank==-1) {
            DEBUGPRN(printf("%s getBaseGrain() _bgi=%d (X)\n", _name.c_str(), _bgi);)            
            schedWork = new struct WorkSched;
            schedWork->completeResponse = workClasses[_bgi]->delta;
            schedWork->index.gi = _bgi;
            schedWork->index.wn = _wn++;
            schedWork->subindex = schedWork->index.wn;
            schedWork->theWork = takeRequest(schedWork->index.gi, schedWork->index.wn);

            double tmark = sampleTime();
            user_readWorkData(schedWork->index.gi, schedWork->index.wn, schedWork->theWork->data);            
            _serialTime += sampleTime()-tmark;

        } else {
            boost::mutex::scoped_lock lock(iterationMutex);
            //wait while has pending no work and state is steady or is receiving work
            while (divGrains==0 && (_smstate==SMSSteady || _inRecv)) {
                DEBUGPRN(printf("%s getBaseGrain() no divGrains, wait (E)\n", _name.c_str());)
                iterationEnd.wait(lock);
                DEBUGPRN(printf("%s getBaseGrain() no divGrains, wait (X)\n", _name.c_str());)
            }
            if (divGrains>0) {
                assert(!pendingWork.empty());
                schedWork = pendingWork.front();
                pendingWork.pop();
                // The same entry must head its class queue; remove it there too.
                assert(schedWork==workClasses[schedWork->index.gi]->pending.front());
                workClasses[schedWork->index.gi]->pending.pop();
                divGrains--;
            }
        }

        // schedWork is NULL when a submaster leaves the wait without divided
        // grains, so the trace must not dereference it unconditionally.
        DEBUGPRN(if (schedWork) printf("%s getBaseGrain() result (%d,%d) -> %p\n", _name.c_str(), schedWork->index.gi, schedWork->index.wn, schedWork->theWork); else printf("%s getBaseGrain() result NULL\n", _name.c_str());)

        return schedWork;
    }

    /**
     * Chooses the next grain to schedule and registers it in the workload
     * map. Runs entirely under workloadAcess.
     *
     * Selection order:
     *   1. a pending grain of any class above _gi;
     *   2. when _gi == _bgi and work remains (or this is a submaster), a base
     *      grain from getBaseGrain();
     *   3. otherwise, if class _gi has nothing pending, larger grains are
     *      broken down class by class until class _gi has pending entries,
     *      and the first of them is taken.
     *
     * @return the index of the grain that was registered, or an index with
     *         wn < 0 when nothing could be scheduled: (-1,-1) or (-1,-2) when
     *         getBaseGrain() returned NULL, (0,-3) when no grain was found
     */
    struct WorkIndex getWorkIndex() {
        WorkSched *schedWork = NULL;
        struct WorkIndex workIndex;

        boost::mutex::scoped_lock lock(workloadAcess);

        //search if there is pending sub divisions
        for (int sc = _gi+1; sc<(int)workClasses.size(); sc++) {
            if (!workClasses[sc]->pending.empty()) {
                DEBUGPRN(printf("Master.%d case 0 (_bgi %d _gi %d)\n", getpid(), _bgi, _gi);)
                schedWork = workClasses[sc]->pending.front();
                workClasses[sc]->pending.pop();
                divGrains--;
                break;
            }
        }

        // lsc: highest class index <= _bgi that has pending grains, or _bgi
        // itself when none has.
        int lsc = _bgi;
        while (lsc>=0 && workClasses[lsc]->pending.empty()) lsc--;
        if (lsc<0) {
            lsc = _bgi;
        } else {
            DEBUGPRN(printf("%s have bigger pending grains(lsc %d _bgi %d)\n", _name.c_str(), lsc, _bgi);)
        }


        DEBUGPRN(printf("Master.%d break (lsc %d _bgi %d _gi %d)\n", getpid(), lsc, _bgi, _gi);)

        if (schedWork == NULL && _gi==_bgi && (_ws < _wu || _masterRank>=0)) {
            //traditional distribution
            schedWork = getBaseGrain();
            if (!schedWork) return WorkIndex(-1,-1);
            DEBUGPRN(printf("Master.%d _gi==_bgi case 1 (lsc %d _bgi %d _gi %d)\n", getpid(), lsc, _bgi, _gi);)
        } else {
            DEBUGPRN(printf("Master.%d _gi==_bgi case 2 (lsc %d _bgi %d _gi %d) (_ws: %d, _wn:%d _wu:%d)\n", getpid(), lsc, _bgi, _gi, _ws, _wn, _wu);)
            if (workClasses[_gi]->pending.empty()) {
                // search upper free grain
                DEBUGPRN(printf("Master.%d _gi==_bgi case 3 (lsc %d _bgi %d _gi %d)\n", getpid(), lsc, _bgi, _gi);)
                int sc=_gi;
                int effgi = _gi;
                // Walk down to the nearest class (not below lsc) with pending grains.
                while (sc>lsc && workClasses[sc]->pending.empty()) sc--;
                if (_wn<_wu) {
                    // Break grains down one class at a time until class effgi is reached.
                    while (sc<effgi) {
                        if (sc==lsc) {
                            //DEBUGPRN(printf("Master.%d _gi==_bgi case 4 (lsc %d _bgi %d _gi %d)\n", getpid(), lsc, _bgi, _gi);)

                            if (sc!=_bgi) dumpWorkClass();
                            assert(sc==_bgi);

                            struct WorkSched *work = getBaseGrain();
                            if (!work) return WorkIndex(-1,-2);

                            workIndex = work->index;
                            if(workIndex.gi>=_gi) {
                                //will forward, not break
                                DEBUGPRN(printf("%s will forward (not break down) grain %d,%d\n", _name.c_str(), workIndex.gi, workIndex.wn);)
                                schedWork = work;
                                break;
                            }
                            // Register the base grain as the parent of the pieces created below.
                            workload[workIndex].index = workIndex;
                            workload[workIndex].subindex = work->subindex;
                            workload[workIndex].theWork = work->theWork;

                            delete work;

                            /*char wuname[256];
                            sprintf(wuname, "%d", workload[workIndex].subindex);
                            workload[workIndex].name = wuname;*/
                        } else {
                            //DEBUGPRN(printf("Master.%d _gi==_bgi case 5 (lsc %d _bgi %d _gi %d)\n", getpid(), lsc, _bgi, _gi);)
                            // has pending, use it
                            struct WorkSched *work = workClasses[sc]->pending.front();
                            workClasses[sc]->pending.pop();
                            divGrains--;

                            // the top should be the base for index and the complete at this moment, the index                            
                            workIndex = work->index;

                            workload[workIndex].index = workIndex;
                            workload[workIndex].topIndex = work->topIndex;
                            workload[workIndex].subindex = work->subindex;
                            workload[workIndex].state = -1;                            

                            if(!work->theWork) {
                                //on submaster it may be already allocated
                                //DEBUGPRN(printf("Master.%d _gi==_bgi case 6 (lsc %d _bgi %d _gi %d)\n", getpid(), lsc, _bgi, _gi);)
                                if(workload[workIndex].topIndex.wn>=0) {
                                    DEBUGPRN(printf("Master.%d _gi==_bgi case 7 (lsc %d _bgi %d _gi %d)\n", getpid(), lsc, _bgi, _gi);)
                                    workload[workIndex].theWork = takeRequest(sc, workload[workIndex].index.wn, 
                                                                              workload[workload[workIndex].topIndex].theWork->data);
                                } else {
                                    DEBUGPRN(printf("Master.%d _gi==_bgi case 8 (lsc %d _bgi %d _gi %d)\n", getpid(), lsc, _bgi, _gi);)
                                    workload[workIndex].theWork = takeRequest(sc, workload[workIndex].index.wn);
                                }
                            } else {
                                // Note: _DEBUGPRN is the always-enabled variant, so this trace prints in every build.
                                _DEBUGPRN(printf("Master.%d _gi==_bgi case 9 (lsc %d _bgi %d _gi %d)\n", getpid(), lsc, _bgi, _gi);)
                                workload[workIndex].theWork =  work->theWork;
                            }


                            /*workload[workIndex].theResponse = takeRequest(sc, workload[workIndex].index.wn, 
                                workload[workload[workIndex].topIndex].theResponse->data);*/

                            /*char wuname[256];
                            sprintf(wuname, "%s", workload[workload[workIndex].topLevelIndex].name.c_str(), workload[workIndex].subindex);
                            workload[workIndex].name = wuname;*/

                            delete work;
                        }

                        sc++;

						assert(workIndex.wn>=0);
                        //breakdown
                        // Create topdiv children of the grain in the next class; their Work
                        // objects are allocated later, when they are actually scheduled.
                        for (int i=0; i<workClasses[sc]->topdiv; i++) {
                            //DEBUGPRN(printf("Master.%d _gi==_bgi case 10(%d) (lsc %d _bgi %d _gi %d)\n", getpid(), i, lsc, _bgi, _gi);)

                            struct WorkSched *work = new struct WorkSched;
                            work->subindex = i+1;
                            work->topIndex = workIndex;

                            work->index = WorkIndex(sc, work->topIndex.wn*workClasses[sc]->topdiv + i);
                            work->workerIndex = -1;

							/*printf("%s PACKED (%p) top %d -> %d %d <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n", _name.c_str(), 
								work, work->index.wn, work->topIndex.wn, workIndex.wn);*/

                            workClasses[sc]->pending.push(work);
                            divGrains++;
                        }
                    } // else no more units to breakdown
                } else {
                    //DEBUGPRN(printf("Master.%d _gi==_bgi case 11 (lsc %d _bgi %d _gi %d) (_ws: %d, _wn:%d _wu:%d)\n", getpid(), lsc, _bgi, _gi, _ws, _wn, _wu);)
                }
            }

            // check if there is pending grains for work class or upper grains to process
            if (schedWork == NULL && !workClasses[_gi]->pending.empty()) {
                // if exists schedule					
                schedWork = workClasses[_gi]->pending.front();
                workClasses[_gi]->pending.pop();
				DEBUGPRN(printf("%s POPED (%p) top %d wn %d data %p <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n", _name.c_str(), schedWork, schedWork->topIndex.wn, schedWork->index.wn, schedWork->theWork);)

                if(!schedWork->theWork) {
                    // system may work with grains top grains with gi>0
                    if(schedWork->topIndex.wn>=0) {
                        schedWork->theWork = takeRequest(schedWork->index.gi, schedWork->index.wn, workload[schedWork->topIndex].theWork->data);
                    } else {
                        schedWork->theWork = takeRequest(schedWork->index.gi, schedWork->index.wn);
                    }
                }

                divGrains--;
            } else if(schedWork) {
                //DEBUGPRN(printf("Master.%d case 12 (lsc %d _bgi %d _gi %d) (_ws: %d, _wn:%d _wu:%d) schedWork:%x\n", getpid(), lsc, _bgi, _gi, _ws, _wn, _wu, schedWork);)                
                //DEBUGPRN(printf("Master.%d case 13 (gi:%d wn%d)\n", getpid(), schedWork->index.gi, schedWork->index.wn);)
                if(!schedWork->theWork) {
                    // system may work with grains top grains with gi>0
                    if(schedWork->topIndex.wn>=0) {
                        schedWork->theWork = takeRequest(schedWork->index.gi, schedWork->index.wn, workload[schedWork->topIndex].theWork->data);
                    } else {
                        schedWork->theWork = takeRequest(schedWork->index.gi, schedWork->index.wn);
                    }
                }
            }
        }

        if (schedWork) {
            // Copy the chosen grain into the workload map and release the temporary record.
            struct WorkIndex workIndex = schedWork->index;

            if(schedWork->theWork==NULL) {
                //DEBUGPRN(printf("%s FATAL ERROR : schedWork->theWork==NULL for grain %d,%d\n", _name.c_str(), workIndex.gi, workIndex.wn);)
               dumpWorkload();
            }

            assert(schedWork->theWork!=NULL);

            workload[workIndex].index = schedWork->index;
            workload[workIndex].topIndex = schedWork->topIndex;
            workload[workIndex].subindex = schedWork->subindex;
            workload[workIndex].state = 0;
            workload[workIndex].completeRequest = workload[workIndex].completeResponse = workClasses[workIndex.gi]->delta;
            workload[workIndex].theWork = schedWork->theWork;


            delete schedWork;

            /*if(clusterIndex==0) {
                dvm[0].uBusy = 1;
            } else {
                inet[dvm[clusterIndex].wan].uBusy = 1;
            }*/
            /*if(statType & 256) {               fprintf(shedLog, "%f wu %s to worker %d.%d.%d reason %s\n", 
                    simtime(),
                    workload[workIndex].name.c_str(), 
                    workers[workerIndex].scIdx, workers[workerIndex].wgIdx, workers[workerIndex].idx, 
                    ScheduleReasonsStrings[workload[workIndex].schedReason]);
            }*/

#ifdef _PRINT_DEBUG
            dumpWorkload();
#endif

            return workIndex;
        }
        // Nothing could be scheduled.
        return WorkIndex(0,-3);
    }

    /**
     * Debug helper: prints the _wu/_ws/_wc counters followed by one line per
     * work class (its topdiv, delta and the size of its pending queue).
     */
    void dumpWorkClass() {
        //dump workClasses
        printf("%s WorkClasses Dump (wu: %d ws: %d wc: %d):\n", _name.c_str(), _wu, _ws, _wc);
        for (int i=0; i<(int)workClasses.size(); i++) {
            // size() yields size_t; cast so the argument really matches the %d conversion
            printf("\tworkClasses[%d] {topDiv: %d delta: %d pending: %d}\n", i, 
                   workClasses[i]->topdiv, workClasses[i]->delta, (int)workClasses[i]->pending.size());
        }
    }

    void dumpWorkload() {
        printf("%s Workload Dump:\n", _name.c_str());
        for (workload_t::const_iterator it = workload.begin(); it!=workload.end(); it++) {
            int rank = -1;
            if((*it).second.workerIndex>=0) {
                rank = workers[(*it).second.workerIndex].mpiAddr.rank;
            }
            printf("\tworkload[%d,%d] {topIndex: %d, complete: %d/%d, subindex: %d, workerIndex:%d(%d) request:%p response:%p}\n", 
                   (*it).first.gi, (*it).first.wn, (*it).second.topIndex.wn, (*it).second.completeResponse, workClasses[(*it).first.gi]->delta, 
                   (*it).second.subindex, (*it).second.workerIndex, rank,  (*it).second.theWork, (*it).second.theResponse);
        }
    }

    /**
     * Blocks the caller on iterationEnd until the completed counter _wc has
     * caught up with _wu. The predicate is re-evaluated under iterationMutex
     * after every wakeup.
     */
    void waitForPendingWork() {
        DEBUGPRN(printf("%s waitForPendingWork() (E) _wc: %d _wu: %d\n", _name.c_str(), _wc, _wu);)
        boost::mutex::scoped_lock guard(iterationMutex);
        for (;;) {
            if (_wc>=_wu) {
                break;
            }
            DEBUGPRN(printf("%s wait for pendingWork phase (E) _wc: %d _wu: %d\n", _name.c_str(), _wc, _wu);)
            iterationEnd.wait(guard);
            DEBUGPRN(printf("%s wait for pendingWork phase (X)\n", _name.c_str());)
        }
        DEBUGPRN(printf("%s waitForPendingWork() (X)\n", _name.c_str());)
    }

    /**
     * Blocks on iterationEnd until every worker has been counted in _sigLoop
     * and, when this processor has a master (_masterRank>=0), until _smstate
     * has left SMSFinalization. Expects all work to be complete (_wc==_wu).
     */
    void waitForFinalization() {
        boost::mutex::scoped_lock lock(iterationMutex);
        assert(_wc==_wu);
        while (_sigLoop<(int)workers.size() || (_masterRank>=0 && _smstate==SMSFinalization)) {
            // workers.size() is size_t; cast it so it matches the %d conversion in the trace
            DEBUGPRN(printf("%s wait for finalization phase (E) _sigloop: %d workersSize: %d _smstate %s\n", _name.c_str(), _sigLoop, (int)workers.size(), getSubStateName(_smstate));)
            iterationEnd.wait(lock);
            DEBUGPRN(printf("%s wait for finalization phase (X)\n", _name.c_str());)
        }
    }
    /**
     * Blocks on iterationEnd until _sigLoop has reached the number of workers.
     */
    void waitForTermination() {
        boost::mutex::scoped_lock guard(iterationMutex);
        for (;;) {
            if (_sigLoop>=(int)workers.size()) {
                break;
            }
            DEBUGPRN(printf("%s wait for termination phase (E) _sigloop=%d\n", _name.c_str(), _sigLoop);)
            iterationEnd.wait(guard);
            DEBUGPRN(printf("%s wait for termination phase (X)\n", _name.c_str());)
        }
    }
    /**
     * Blocks on iterationEnd until _statCount has reached the number of
     * workers, i.e. one final statistics record per worker has been counted.
     */
    void waitForStatistics() {
        DEBUGPRN(printf("%s waitForStatistics() (E) _wc: %d _wu: %d\n", _name.c_str(), _wc, _wu);)
        boost::mutex::scoped_lock guard(iterationMutex);
        for (;;) {
            if (_statCount>=(int)workers.size()) {
                break;
            }
            DEBUGPRN(printf("%s wait for waitForStatistics phase (E) _statCount: %d\n", _name.c_str(), _statCount);)
            iterationEnd.wait(guard);
            DEBUGPRN(printf("%s wait for waitForStatistics phase (X)\n", _name.c_str());)
        }
        DEBUGPRN(printf("%s waitForStatistics() (X)\n", _name.c_str());)
    }

    /*########################################################################################################################*/

    public:

    /** Returns the value of _endTime, which run() samples once termination has completed. */
    virtual double getEndTime() {
        return _endTime;
    }

    /**
     * Replaces the channel map with a copy of `channels`. The map is looked
     * up by rank (e.g. _channelMap[_masterRank]) when forwarding to the master.
     */
    virtual void setChannelMap(std::map<int,int> channels)
    {
        _channelMap = channels;
    }

    /** Returns the difference between the end and start timestamps recorded by run(). */
    double getExecTime() {
        return (_endTime - _startTime);
    }

    /** Returns a copy of the display name built by the constructor. */
    virtual const std::string getName()
    {
        return _name;
    }

    /** Send hook: simply re-evaluates the processor state via checkState(). */
    virtual void processSend()
    {
        checkState();
    }

    /**
     * Handles a control signal received from (rank, comm).
     *
     * Signals from this processor's own master (rank/comm equal to
     * _masterRank/_masterComm):
     *  - CSFinalizationStart: _smstate becomes SMSFinalization, waiters are woken;
     *  - CSShouldFinish:      _smstate becomes SMSTermination, waiters are woken;
     *  - CSIterationStart:    _wc and _ws are reduced by _wu and _wu is reset to 0.
     * Any signal from another peer increments _sigLoop; waiters are woken once
     * it equals the number of workers.
     *
     * All shared counters/state are modified under iterationMutex.
     */
    virtual void reportSignal(enum CommSignal sig, int rank, MPI_Comm comm) {
        DEBUGPRN(printf("%s reportSignal(%s, %d, %d) _sigLoop:%d\n", _name.c_str(), getSignalName(sig), rank, (int)comm,  _sigLoop);)

        if (rank==_masterRank && comm==_masterComm) {
            if (sig==CSFinalizationStart) {
                boost::mutex::scoped_lock lock(iterationMutex); // protect _signal access
                _smstate = SMSFinalization;
                iterationEnd.notify_all();
            } else if (sig==CSShouldFinish) {
                boost::mutex::scoped_lock lock(iterationMutex); // protect _signal access
                _smstate = SMSTermination;
                iterationEnd.notify_all();
            } else if(sig==CSIterationStart) {
                // _wc/_wu are updated by give() and read by waitForPendingWork()
                // under iterationMutex, so the reset must hold the same lock.
                boost::mutex::scoped_lock lock(iterationMutex);
                _wc -= _wu;
                _ws -= _wu;
                _wu=0;
            }
        } else {
            boost::mutex::scoped_lock lock(iterationMutex); // protect _signal access
            _sigLoop++;
            if (_sigLoop==(int)workers.size()) {
                iterationEnd.notify_all();
            }
        }
    }

    /**
     * Builds a master processor, or a submaster when masterRank>=0.
     *
     * @param iter       number of iterations run() will execute
     * @param wu         number of top-level work units; scaled by the delta of
     *                   work class 0 to obtain _wu (must be 0 for a submaster)
     * @param commsize   number of workers to register (ranks 0..commsize-1 on comm)
     * @param comm       communicator used to reach the workers
     * @param rank       rank of this processor (used for naming/statistics)
     * @param bgi        requested base grain index, clamped to [0, gi]
     * @param gi         requested grain index, clamped to [0, max grain index]
     * @param masterComm communicator of this processor's master
     * @param masterRank rank of this processor's master, -1 for the top master
     *
     * Work classes are discovered by calling user_canBreakDown(i) for i=1,2,...
     * until it returns 0; class 0 is always present.
     */
    MasterProcessor(int iter, int wu, int commsize, MPI_Comm comm=MPI_COMM_WORLD, int rank=0, int bgi=0, int gi=0, MPI_Comm masterComm=MPI_COMM_WORLD, int masterRank=-1) 
        :_rank(rank), _comm(comm), _iter(iter), _wu(wu), _wc(0), _ws(0), _wp(0), _wn(0), _wgs(0), /*_gi(gi), _bgi(bgi),*/ _sigLoop(-1), 
        _state(MSInitialization), _smstate(SMSSteady), schedType(LRU_BASED), defaultPreferedClass(0), divGrains(0), 
        _masterRank(masterRank), _masterComm(masterComm), _inRecv(0), _serialTime(0.0), _statCount(0), _progressSample(0.0), _progressTime(0.0)
         {
             _gi = gi;
             _bgi = bgi;

        // register one worker per rank of the communicator and index it by (comm, rank)
        for (int i=0; i<commsize; i++) {
            int workerIndex = (int)workers.size();
            Worker w(workerIndex, MPIAddress(i, _comm), 0);
            workers.push_back(w);
            windex[workers[workerIndex].mpiAddr.comm][workers[workerIndex].mpiAddr.rank] = workerIndex;            
        }

        assert(commsize>0);

        { // local name configuration        
            char name[128];        
            if(_masterRank<0) {
                sprintf(name, "\x1b[1mMaster.%d(%d)\x1b[0m", _rank, getpid());
            } else {
                sprintf(name, "Submaster.%d(%d)", _rank, getpid());
                assert(_wu==0);
            }
            _name=name;
        }

        SubCluster sc;
        sc.numWorkers = sc.numWorkersConfig = (int)workers.size();
        dvm.push_back(sc);

        for (int i=0; i<(int)workers.size(); i++) {
            if(verbose & VERBOSE_INFO) printf("%s workers[%d] rank %d comm %d\n", _name.c_str(), i, workers[i].mpiAddr.rank, (int)workers[i].mpiAddr.comm);
        }

        // class 0 always exists; further classes come from the user callback
        WorkClass *wcls = new WorkClass;
        wcls->topdiv = 0;
        workClasses.push_back(wcls);

        {
            int topdiv, i=1;
            while ((topdiv=user_canBreakDown(i++))!=0) {
                wcls = new WorkClass();
                wcls->topdiv = topdiv;
                workClasses.push_back(wcls);
            }
        }

        //generate index deltas
        workClasses[workClasses.size()-1]->delta = 1;   
        for (int i=(int)workClasses.size()-2; i>=0; i--) {
            workClasses[i]->delta = workClasses[i+1]->delta*workClasses[i+1]->topdiv;       
        }

        _wu = wu*workClasses[0]->delta;

#ifdef _PRINT_DEBUG
        dumpWorkClass();
#endif

        // kept as int so it can be compared with _gi and printed with %d
        const int maxGi = (int)workClasses.size()-1;
        _maxgi = maxGi;

        if (_gi>maxGi) {
            printf("%s GMWAT Warning: Cannot set grain size to %d, using %d instead.\n", _name.c_str(), _gi, maxGi);
            _gi = maxGi;
        } else if (_gi<0) {
            printf("%s GMWAT Warning: Cannot set grain size to %d, using 0 instead.\n", _name.c_str(), _gi);
            _gi = 0;
        }

        // The base grain is validated independently of (and after) _gi, so a
        // clamped _gi can never leave _bgi negative or above it.
        if (_bgi<0) {
            printf("%s GMWAT Warning: Cannot set base grain size to %d, using 0 instead.\n", _name.c_str(), _bgi);
            _bgi = 0;
        } else if (_bgi>_gi) {
            printf("%s GMWAT Warning: Cannot set base grain size to %d, using %d instead.\n", _name.c_str(), _bgi, _gi);
            _bgi = _gi;
        }
        if(verbose & VERBOSE_INFO) printf("%s GMWAT Info: Max gi %d, gi %d bgi %d (wu %d)\n", _name.c_str(), maxGi, _gi, _bgi, _wu);
    }

    /**
     * Re-addresses an already registered worker.
     *
     * @param workerIndex index into workers; must be a valid index
     * @param rank        new MPI rank of the worker
     * @param comm        communicator on which that rank is valid
     *
     * The (comm, rank) -> workerIndex lookup table is updated for the new
     * address. The entry for the previous address is left in place.
     */
    void configWorker(int workerIndex, int rank, MPI_Comm comm) {
        assert(workerIndex>=0 && workerIndex<(int)workers.size());

        //int sz = windex.erase(MAKE_HASH(workers[workerIndex].mpiAddr));
        //assert(sz==1);
        // use the caller-supplied communicator: give() resolves workers through
        // windex[comm][rank] with the communicator the message arrived on
        workers[workerIndex].mpiAddr = MPIAddress(rank, comm);
        windex[workers[workerIndex].mpiAddr.comm][workers[workerIndex].mpiAddr.rank] = workerIndex;
    }

    /**
     * Releases the WorkClass objects allocated by the constructor and, when
     * STATISTIC_API_TIME is enabled, prints the accumulated serial time (top
     * master only) and the per-work-unit communication timings.
     */
    virtual ~MasterProcessor() {
        const int classCount = (int)workClasses.size();
        for (int c=0; c<classCount; ++c) {
            delete workClasses[c];
        }

        if(!(statistic & STATISTIC_API_TIME)) {
            return;
        }

        const bool topMaster = (_masterRank==-1);
        if(topMaster) {
            printf("STAT T %s %d %.6f\n", _name.c_str(), _rank, _serialTime);
        }

        // worker-side channel timings, tagged by the role of this processor
        const char *roleTag = topMaster ? "MC" : "SC";
        for(commStats_t::const_iterator st = commStats.begin(); st!=commStats.end(); ++st) {
            printf("STAT %s %d %d %.6f %.6f %.6f %.6f\n", roleTag,
                   st->first.gi, st->first.wn, 
                   st->second.startSend, st->second.sendTime,
                   st->second.startRecv, st->second.recvTime);
        }

        //only submaster should have data
        for(commStats_t::const_iterator st = smCommStats.begin(); st!=smCommStats.end(); ++st) {
            printf("STAT SC %d %d %.6f %.6f %.6f %.6f\n",
                   st->first.gi, st->first.wn, 
                   st->second.startSend, st->second.sendTime,
                   st->second.startRecv, st->second.recvTime);
        }
    }

    /**
     * Hands a received Work object (sent by rank on comm) to this processor.
     *
     * Negative grain indexes carry statistics records rather than work:
     *  - on the top master (_masterRank==-1) gi==-2 is printed as a
     *    per-work-unit "STAT MW" line, any other negative gi as a "STAT MT"
     *    totals line; gi==-3 additionally increments _statCount and wakes
     *    waiters. The record is then returned through giveResponse().
     *  - on a submaster the record is forwarded unchanged to the master.
     *
     * For regular work (gi>=0):
     *  - when built with ASYNC_GIVE this function only queues the message in
     *    _asyncGive, records the receive time and wakes the main loop; the
     *    processing below then lives in giveAsync(), which run() calls.
     *  - otherwise the processing below is part of give() itself.
     *
     * Processing of regular work:
     *  - work coming from this processor's master is wrapped in a WorkSched,
     *    queued as pending in its work class and in pendingWork, and _wu is
     *    increased by the class delta;
     *  - work coming from a worker is a response: it is stored in the
     *    workload entry, completed grains are merged upwards through
     *    topIndex with user_updateWorkData(), and a completed top grain is
     *    written with user_writeWorkData() (top master) or forwarded to the
     *    master (submaster). Finally _wc is increased by the class delta.
     */
    virtual void give(Work *work, int rank, MPI_Comm comm) {
        TRACE1(printf("[trace] %s give (gi:%d/wn:%d) to (rank:%d/comm:%d) at %f\n", _name.c_str(), work->index.gi, work->index.wn, rank, (int)comm, sampleTime());)

        // negative grain index: statistics record, not a work unit
        if(work->index.gi<0) {
            if(_masterRank==-1) {
                //deal with data
                if(work->index.gi==-2) {
                    struct worker_wu_stat *stats = (struct worker_wu_stat*)work->data->chunks->data;
                    printf("STAT MW %d %d %d %d %.6f %.6f %.6f %.6f %.6f %.6f\n",
                           stats->mpiaddr.rank, (int)stats->mpiaddr.comm,
                           stats->index.gi, stats->index.wn,
                           stats->stats.execStart, stats->stats.execTime, 
                           stats->stats.inputStart, stats->stats.inputTime, 
                           stats->stats.outputStart, stats->stats.outputTime);

                } else {
                    struct worker_wu_stat_total *stats_grp = (struct worker_wu_stat_total*)work->data->chunks->data;
                    printf("STAT MT %d %d %.6f %.6f\n",
                           stats_grp->mpiaddr.rank, (int)stats_grp->mpiaddr.comm,
                           stats_grp->busyTime, stats_grp->efficiency);
                }

                
                //release main thread
                if(work->index.gi==-3){
                    boost::mutex::scoped_lock lock(iterationMutex);
                    _statCount++;
                    iterationEnd.notify_all();
                }

                giveResponse(work);
            } else {
                _mpiHandler->give(work, _masterRank, _masterComm, _channelMap[_masterRank]); //fast forward
                // _statCount is modified on report
            }
            return;
        }
#ifdef ASYNC_GIVE
        // asynchronous mode: only enqueue here; run() dequeues and calls giveAsync()
        {
            boost::mutex::scoped_lock lock(iterationMutex);
            _asyncGive.push(AsyncMessage(work, rank, comm));
            if(_masterRank==rank) {
                smCommStats[work->index].recvTime = sampleTime() - smCommStats[work->index].startRecv;
            } else {
                commStats[work->index].recvTime = sampleTime() - commStats[work->index].startRecv;
            }
            iterationEnd.notify_all();        
        }
    }

    /**
     * Processes one message previously queued by give() (ASYNC_GIVE builds
     * only). Same parameters as give().
     */
    virtual void giveAsync(Work *work, int rank, MPI_Comm comm) {
#endif
        //printf("%s give() rank %d comm %d _masterRank %d _masterComm %d\n", _name.c_str(), rank, _masterRank, comm, _masterComm);
        
        // the receive announced through notifyInput() is now finished
        {       
            boost::mutex::scoped_lock lock(iterationMutex);
            _inRecv--;
            assert(_inRecv>=0);

#ifndef ASYNC_GIVE
            if(_masterRank==rank) {
                smCommStats[work->index].recvTime = sampleTime() - smCommStats[work->index].startRecv;
            } else {
                commStats[work->index].recvTime = sampleTime() - commStats[work->index].startRecv;
            }
#endif
        }

        if (rank==_masterRank && comm==_masterComm) {
            // submaster work
            boost::mutex::scoped_lock lock(workloadAcess);

            struct WorkSched *selWork = new struct WorkSched;

            selWork->subindex = work->index.wn;
            selWork->index = work->index;
            selWork->theWork = work;
            {       
                // iterationMutex is taken while workloadAcess is still held
                boost::mutex::scoped_lock lock(iterationMutex);
                workClasses[work->index.gi]->pending.push(selWork);                
                _wu += workClasses[work->index.gi]->delta;

                pendingWork.push(selWork);

                divGrains++;
                iterationEnd.notify_all();

                if (verbose & VERBOSE_PROGRESS) printf("%s received request wn = %d, gi = %d from %d _wc %d _wu %d\n", _name.c_str(), work->index.wn, work->index.gi, rank, _wc, _wu);
            }
        } else {

            {
                boost::mutex::scoped_lock lock(workloadAcess);

                if (workload.find(work->index)==workload.end()) {
                    // should not occur, only function give() should delete work units
                    printf("ERROR: work wn:%d gi:%d not found on workload\n", work->index.wn, work->index.gi);
                    dumpWorkload();
                    assert(0);
                }

                workload[work->index].theResponse = work;

                // The request object is still attached, i.e. report() has not
                // released it yet: flag the entry and let report() call give() again.
                if(workload[work->index].theWork) {
                    //postulates event
                    printf("Give call postulated %d,%d from %d,%d\n", work->index.gi, work->index.wn, rank, (int)comm);
                    workload[work->index].status |= WORKSHED_STATUS_PENDING_GIVE;
                    return;
                }                   
            }

            {       
                boost::mutex::scoped_lock lock(iterationMutex);

                MPIAddress addr(rank, comm);
                int workerIndex = windex[addr.comm][addr.rank];
                workers[workerIndex].queue--;
                workers[workerIndex].wp++;

                _wp--;                
                // updates wc only at end
                if (verbose & VERBOSE_PROGRESS) printf("%s received response wn = %d, gi = %d from %d (index %d) _wc=%d (mpiaddr %d,%d)\n", _name.c_str(), work->index.wn, work->index.gi, rank, workerIndex, (_wc+workClasses[work->index.gi]->delta), addr.rank, (int)addr.comm);
            }

            // saved because the entries (and the response object) may be released below
            int gi = work->index.gi;

            {
                boost::mutex::scoped_lock lock(workloadAcess);

                // walk up the grain hierarchy while the current grain is complete
                struct WorkIndex selWork = work->index;
                while (workload[selWork].topIndex.wn>=0 && workload[selWork].completeResponse==workClasses[selWork.gi]->delta) {
                    //updates topdiv
                    workload[workload[selWork].topIndex].completeResponse += workClasses[selWork.gi]->delta;

                    //theWork may be null at this point
                    //parent update
                    double tmark = sampleTime();
                    user_updateWorkData(selWork.gi, selWork.wn, 
                                    workload[workload[selWork].topIndex].theResponse->data, workload[selWork].theResponse->data);
                    

                    //delete current
                    user_freeWorkData(workload[selWork].theResponse->data);
                    _serialTime += sampleTime()-tmark;


                    workload[selWork].theResponse->data = NULL;
                    delete workload[selWork].theResponse;

                    if (workload[selWork].theWork) { // inverse report->give sequence
                        printf("ERROR freeing a postulated call %d,%d\n", work->index.gi, work->index.wn);
                        exit(0);
                        /*user_freeWorkData(workload[selWork].theWork->data);
                        workload[selWork].theWork->data=NULL;
                        delete workload[selWork].theWork;
                        workload[selWork].theWork = NULL;*/
                    }

                    struct WorkIndex newSelWork = workload[selWork].topIndex;
                    int sz = workload.erase(selWork);
                    assert(sz==1);

                    //changes current to topdiv
                    selWork = newSelWork;
                }
                // a grain without parent (topIndex.wn<0) that is complete is a finished top-level unit
                if (workload[selWork].topIndex.wn<0 &&  workload[selWork].completeResponse==workClasses[selWork.gi]->delta) {
                    Work *request = workload[selWork].theWork;
                    Work *response = workload[selWork].theResponse;

                    int sz = workload.erase(selWork);
                    assert(sz==1);

                    if (request) {
                        printf("ERROR freeing a base postulated call %d,%d\n", work->index.gi, work->index.wn);
                        exit(0);
                        // NOTE(review): unreachable after exit(0)
                        giveRequest(request);
                    }
                    assert(_wc<=_wu);
                    if (_masterRank==-1) {
                        double tmark = sampleTime();
                        user_writeWorkData(response->index.gi, response->index.wn, response->data);
                        _serialTime += sampleTime()-tmark;

                        giveResponse(response);
                    } else {
                        _mpiHandler->give(response, _masterRank, _masterComm, _channelMap[_masterRank]);
                    }
                }

#ifdef _PRINT_DEBUG
                dumpWorkload();
#endif
            }
            { // wakeup
                boost::mutex::scoped_lock lock(iterationMutex);
                _wc += workClasses[gi]->delta;
                //checkState(); //checked in main thread
                iterationEnd.notify_all();
		
                // progress line: printed when progress advanced by at least 0.25 points or all work is complete
                if (statistic & STATISTIC_PROGRESS_COMPLETE) {
                    double ps, st;
                    ps = (((1.0)*_wc)/_wu)*100.0;
                    st = sampleTime();
                    if(ps - _progressSample>=.25 || _wu==_wc) {
                        printf("STAT P %.6f %.4f%% %.6f %d\n", st, ps, ((ps-_progressSample)/(st-_progressTime)), _gi);
                        _progressSample = ps;
                        _progressTime = st;
                    }                    
                }
            }
        }
    }
    /**
     * Notification that sending `work` to (rank, comm) has finished.
     *
     * - Destination is this processor's master: the object is a response (or
     *   a statistics record; gi==-3 increments _statCount) and is returned
     *   through giveResponse().
     * - Destination is a worker: the request buffers that are no longer
     *   needed are released, walking up through topIndex; a top-level request
     *   is returned through giveRequest(). If give() already received the
     *   response for this grain and deferred it (WORKSHED_STATUS_PENDING_GIVE),
     *   give() is invoked again for that response.
     *
     * Finally the send duration is recorded (gi>=0 only) and waiters on
     * iterationEnd are woken.
     */
    virtual void report(Work *work, int rank, MPI_Comm comm) {
        TRACE1(printf("[trace] %s report (gi:%d/wn:%d) to (rank:%d/comm:%d) at %f\n", _name.c_str(), work->index.gi, work->index.wn, rank, (int)comm, sampleTime());)

        // Snapshot the index first: `work` may be deleted or handed back through
        // giveRequest()/giveResponse() below and must not be dereferenced after that.
        const struct WorkIndex workIndex = work->index;

        if (rank==_masterRank && comm==_masterComm) {
            if(workIndex.gi==-3) {
                boost::mutex::scoped_lock lock(iterationMutex);
                _statCount++;
                iterationEnd.notify_all();
              }
            //submaster to master response communication report
            giveResponse(work);
        } else {

            if (verbose & VERBOSE_PROGRESS) printf("%s sent work wn = %d, gi = %d to %d\n", _name.c_str(), workIndex.wn, workIndex.gi, rank);

            // deferred give() detected for this grain, and the response it was called with
            bool pendingGive = false;
            Work *pendingResponse = NULL;

            {
                boost::mutex::scoped_lock lock(workloadAcess);
                if (workload.find(workIndex)==workload.end()) {
                    // should not occur
                    printf("ERROR: work wn:%d gi:%d not found on workload\n", workIndex.wn, workIndex.gi);
                    dumpWorkload();
                    assert(0);
                }

                struct WorkIndex selWork = workIndex;
                while (workload[selWork].topIndex.wn>=0 && workload[selWork].completeRequest==workClasses[selWork.gi]->delta) {
                    //updates topdiv
                    workload[workload[selWork].topIndex].completeRequest += workClasses[selWork.gi]->delta;

                    //delete current
                    double tmark = sampleTime();
                    user_freeWorkData(workload[selWork].theWork->data);
                    _serialTime += sampleTime()-tmark;

                    workload[selWork].theWork->data = NULL;
                    delete workload[selWork].theWork;
                    workload[selWork].theWork = NULL;

                    //changes current to topdiv
                    selWork = workload[selWork].topIndex;
                }
                if (workload[selWork].topIndex.wn<0 && workload[selWork].completeRequest==workClasses[selWork.gi]->delta) {
                    Work *request = workload[selWork].theWork;
                    workload[selWork].theWork = NULL;
                    giveRequest(request);
                }

                // Read the deferred-give state while the workload table is still
                // locked; give() itself takes workloadAcess, so it is called below,
                // after the lock has been released.
                if(workload[workIndex].status & WORKSHED_STATUS_PENDING_GIVE) {
                    pendingGive = true;
                    pendingResponse = workload[workIndex].theResponse;
                }
            }

            if(pendingGive) {
                //inverted report->give sequence
                give(pendingResponse, rank, comm);
            }
        }
        { // wakeup
            boost::mutex::scoped_lock lock(iterationMutex);

            if(workIndex.gi>=0) {
                if(_masterRank==rank) {
                    smCommStats[workIndex].sendTime = sampleTime() - smCommStats[workIndex].startSend;
                } else {
                    commStats[workIndex].sendTime = sampleTime() - commStats[workIndex].startSend;
                }
            }
            

            iterationEnd.notify_all();
        }        
    }
    /**
     * Main loop of the (sub)master.
     *
     * For each of the _iter iterations:
     *  1. the counters _wc/_wn/_ws are reset, the state is re-evaluated and,
     *     on the top master, user_iteration_start() is called;
     *  2. CSIterationStart is broadcast and the top master schedules once;
     *  3. the schedule loop runs until no work is outstanding (see the break
     *     condition below), either scheduling more work or sleeping on
     *     iterationEnd (or, with ASYNC_GIVE, processing one queued give);
     *  4. waitForPendingWork(), broadcast of CSIterationEnd, and on the top
     *     master user_iteration_end().
     * Afterwards it runs the finalization and termination handshakes, samples
     * _endTime, optionally waits for worker statistics and calls finalize().
     */
    void run() {
        int iteration =1;
        double it_time; // only written and read on the top master (_masterRank<0)

        _startTime = sampleTime();

        if(verbose & VERBOSE_INFO) printf("%s runing\n", _name.c_str());

        for(;iteration<=_iter; iteration++) {
            _wc = _wn = _ws = 0;

            DEBUGPRN(printf("\n\n%s Iteration %d start -------------------------------------<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n", _name.c_str(), iteration);)

            checkState();
            if (_masterRank<0) {
                it_time = sampleTime();
                user_iteration_start(iteration);                
            }

            broadcastSignal(CSIterationStart);

            if (_masterRank<0) { schedule();}

#ifdef ASYNC_GIVE
            // holds the message dequeued inside the locked section; processed after the lock is released
            AsyncMessage agive(NULL,0,0);
#endif
            for(;;)  {
                DEBUGPRN(printf("%s schedule loop wc: %d wp: %d ws: %d gi: %d (state: %s)\n", _name.c_str(), _wc, _wp, _ws, _gi, getStateName(_state));)

                checkState();

                // schedule only while _wp is below twice the worker count, the outgoing
                // queue is empty and there is still something left to schedule
                //if (_wp<((int)workers.size()*2) && (_ws<_wu || divGrains>0)) {
                if (_wp<((int)workers.size()*2) && _mpiHandler->getOutQueueLength()==0 && (_ws<_wu || divGrains>0)) {
                    DEBUGPRN(printf("%s Pre-Schedule wc %d wp %d ws %d wn %d divGrains %d (state: %s)\n", _name.c_str(), _wc, _wp, _ws, _wn, divGrains, getStateName(_state));)
                    schedule();               
                } else {
                    boost::mutex::scoped_lock lock(iterationMutex);

                   // leave the loop once all work is complete, nothing is being divided or
                   // received and (for a submaster) the master has left the steady state
                   if(!(_wc<_wu || divGrains>0 || _inRecv>0 || (_masterRank>=0 && _smstate==SMSSteady))) {
                       break;
                   }

#ifdef ASYNC_GIVE
                   if(_asyncGive.size()==0)  {
#endif
                       //if has more something to sched latter
                       DEBUGPRN(printf("%s schedule loop wait for event (E) wc: %d wp: %d ws: %d (state: %s)\n", _name.c_str(), _wc, _wp, _ws, getStateName(_state));)
                       _mpiHandler->setBlockOnComm();
                       iterationEnd.wait(lock);
                       DEBUGPRN(printf("%s schedule loop wait for event (X) wc: %d wp: %d ws: %d (state: %s)\n", _name.c_str(), _wc, _wp, _ws, getStateName(_state));)
#ifdef ASYNC_GIVE
                   } else {
                       agive = _asyncGive.front();                       
                        _asyncGive.pop();
                   }
#endif
                }

#ifdef ASYNC_GIVE
                if(agive.work) {
                    this->giveAsync(agive.work, agive.rank, agive.comm);
                    agive.work=NULL;
                }
#endif
            }
            if (verbose & VERBOSE_PROGRESS) printf("%s finished schedule loop\n", _name.c_str());

            waitForPendingWork();

            broadcastSignal(CSIterationEnd);

            {
                boost::mutex::scoped_lock lock(workloadAcess);
                assert(workload.size()==0);
            }

            if (_masterRank<0) {               
                user_iteration_end(iteration);
                if(statistic & STATISTIC_ITERATION_TIME) {
                    printf("%s iteration %d time %f\n", _name.c_str(), iteration, sampleTime()-it_time);
                }
            }
        }

        startFinalization();

        waitForFinalization();        

        if (verbose & VERBOSE_PROGRESS) printf("%s finished finalization, starting termination\n", _name.c_str());

        // NOTE(review): read without holding workloadAcess, unlike the check inside the loop
        assert(workload.size()==0);

        startTermination();

        waitForTermination();

        _endTime = sampleTime();

        if(statistic & STATISTIC_MASTER_COLLECTOR) {
            waitForStatistics();
        }


        finalize();
        if(verbose & VERBOSE_INFO) printf("Master.%d finalized\n", getpid());
    }

    /**
     * Prints the complete processor state to stdout: the global counters,
     * one line per worker, and then the work class and workload tables.
     */
    void dump() {
        printf("%s STATE DUMP (wp:%d, wc:%d, ws: %d state %s) \n", _name.c_str(), _wp, _wc, _ws, getStateName(_state));

        const int workerCount = (int)workers.size();
        int w = 0;
        while (w<workerCount) {
            printf("\tworkers[%d] rank %d comm %d queue %d cs %d wp %d\n", w, workers[w].mpiAddr.rank, (int)workers[w].mpiAddr.comm, 
                   workers[w].queue, workers[w].commState, workers[w].wp);
            ++w;
        }

        printf("%s STATE DUMP END\n", _name.c_str());
        dumpWorkClass();
        dumpWorkload();
    }

    /**
     * Called by the communication handler to build grain buffers.
     *
     * Negative grain indexes and anything addressed by this processor's master
     * get a request buffer from takeRequest(). Otherwise the response buffer
     * stored in the workload entry is returned; when none exists yet it is
     * created with takeResponse() (based on the parent grain's buffer unless
     * the grain is at the base grain index or has no parent) and stored.
     */
    virtual Work *getWork(int gi, int wn, int rank, MPI_Comm comm) {
        struct WorkIndex workIndex(gi,wn), topIndex;

        const bool fromMaster = (_masterRank==rank && _masterComm==comm);
        if (gi<0 || fromMaster) {
            return takeRequest(gi, wn);
        }

        Work *cached = NULL;
        { // keep the critical section short; the lock must not be held across the recursive call
            boost::mutex::scoped_lock guard(workloadAcess);
            cached = workload[workIndex].theResponse;
            topIndex = workload[workIndex].topIndex;
        }
        if (cached!=NULL) {
            return cached;
        }

        Work *fresh = NULL;
        if (workIndex.gi==_bgi || topIndex.wn<0) {
            fresh = takeResponse(gi, wn);
        } else {
            // the parent's buffer is resolved (and created if needed) first
            Work *parent = getWork(topIndex.gi, topIndex.wn, rank, comm);
            fresh = takeResponse(gi, wn, parent->data);
        }

        { // publish the new buffer in the workload entry
            boost::mutex::scoped_lock guard(workloadAcess);
            workload[workIndex].theResponse = fresh;
        }
        return fresh;
    }

    /**
     * Notification that reception of the grain workIndex from (rank, comm)
     * has started. Negative grain indexes are ignored. Otherwise the count of
     * receptions in progress (_inRecv) is incremented and the receive start
     * time is stored in the statistics table for that peer.
     */
    virtual void notifyInput(WorkIndex &workIndex, int rank, MPI_Comm comm) {
        TRACE1(printf("[trace] %s notifyInput (gi:%d/wn:%d) to (rank:%d/comm:%d) at %f\n", _name.c_str(), workIndex.gi, workIndex.wn, rank, (int)comm, sampleTime());)

        if(workIndex.gi>=0) {
            DEBUGPRN(printf("%s notify([%d,%d] rank %d comm %d (E)\n", _name.c_str(), workIndex.gi, workIndex.wn, rank, (int)comm);)
            {
                boost::mutex::scoped_lock guard(iterationMutex);
                ++_inRecv;
                // traffic with the master rank is tracked separately from worker traffic
                commStats_t &table = (_masterRank==rank) ? smCommStats : commStats;
                table[workIndex].startRecv = sampleTime();
            }
            DEBUGPRN(printf("%s notify([%d,%d] rank %d comm %d (X)\n", _name.c_str(), workIndex.gi, workIndex.wn, rank, (int)comm);)
        }
    }
    /**
     * Notification that sending of the grain workIndex to (rank, comm) has
     * started. Negative grain indexes are ignored. Otherwise the send start
     * time is stored in the statistics table for that peer.
     */
    virtual void notifyOutput(WorkIndex &workIndex, int rank, MPI_Comm comm) {
        TRACE1(printf("[trace] %s notifyOutput (gi:%d/wn:%d) to (rank:%d/comm:%d) at %f\n", _name.c_str(), workIndex.gi, workIndex.wn, rank, (int)comm, sampleTime());)

        if(workIndex.gi>=0) {
            // stats
            boost::mutex::scoped_lock guard(iterationMutex);
            // traffic with the master rank is tracked separately from worker traffic
            commStats_t &table = (_masterRank==rank) ? smCommStats : commStats;
            table[workIndex].startSend = sampleTime();
        }
    }
};


/**
 * Handles data processing logic
 */
class WorkerProcessor : public  ComputeProcessor, public ControllerInterface, protected WorkStorage {
private:
    int _masterRank;
    MPI_Comm _comm;
    std::string _name;
    int _rank;
    int _queue;
    boost::mutex iterationMutex;
    enum WorkerState {
        WSInitialization=1,
        WSSteady=2,
        WSFinalization=3
    } _state;
    int _wn;
    int _iteration;
    double _endTime;

    boost::mutex stateMutex;

    std::map< int, gmwat_stat_t > stats;

    std::map<int,int> _channelMap;

    inline void checkState() {
        boost::mutex::scoped_lock lock(stateMutex);

        if (_wn==2 && _state==WSInitialization) _state=WSSteady;

        switch (_state) {
            case WSSteady:
                if (_queue==2) {
                    if (_mpiHandler->getWaitDataOnSend()!=TRUE) {
                        _mpiHandler->setWaitDataOnSend(TRUE); 
                    }
                } else {
                    if (_mpiHandler->getWaitDataOnSend()!=FALSE) {
                        _mpiHandler->setWaitDataOnSend(FALSE);
                    }
                }
                break;
            case WSInitialization:
            case WSFinalization:
                if (_queue>0) {
                    if (_mpiHandler->getWaitDataOnSend()!=TRUE) {
                        _mpiHandler->setWaitDataOnSend(TRUE); 
                    }
                } else {
                    if (_mpiHandler->getWaitDataOnSend()!=FALSE) {
                        _mpiHandler->setWaitDataOnSend(FALSE);
                    }
                }
                break;
        }        
    }

    void printStats() {
        for(int i=1; i<=_iteration; i++) {
            for(gmwat_stat_t::const_iterator st = stats[i].begin(); st!=stats[i].end(); st++) {
                printf("STAT U %s %d %d %d %d %.6f %.6f %.6f %.6f %.6f %.6f\n", _name.c_str(), _rank, i, 
                       (*st).first.gi, (*st).first.wn, 
                       (*st).second.execStart, (*st).second.execTime,
                       (*st).second.inputStart, (*st).second.inputTime, (*st).second.outputStart, (*st).second.outputTime);
            }
        }
        printf("STAT W %s %d %.6f %.6f\n", _name.c_str(), _rank, getBusyTime(), getEfficiency());
    }
    void sendStats() {
        DEBUGPRN(printf("[%s] sendStats() (E)\n", _name.c_str());)
        int ident=0;
        for(int i=1; i<=_iteration; i++) {
            for(gmwat_stat_t::const_iterator st = stats[i].begin(); st!=stats[i].end(); st++) {
                Work *work = takeResponse(-2, ident++);
                struct worker_wu_stat *stat = (struct worker_wu_stat *)work->data->chunks->data;
                stat->index = (*st).first;
                stat->stats = (*st).second;
                stat->mpiaddr.rank = _rank;
                stat->mpiaddr.comm = _comm;
                _mpiHandler->give(work, _masterRank, _comm, _channelMap[_masterRank]);
            }
        }
        Work *work = takeResponse(-3, 0);
        struct worker_wu_stat_total *statgrp = (struct worker_wu_stat_total *)work->data->chunks->data;
        statgrp->busyTime = getBusyTime();
        statgrp->efficiency = getEfficiency();
        statgrp->mpiaddr.rank = _rank;
        statgrp->mpiaddr.comm = _comm;
        _mpiHandler->give(work, _masterRank, _comm, _channelMap[_masterRank]);        
        DEBUGPRN(printf("[%s] sendStats() (X)\n", _name.c_str());)
    }

public:
    WorkerProcessor(MPI_Comm comm, int rank) : _masterRank(0), _comm(comm), _rank(rank), _queue(0), _state(WSInitialization), _wn(0), _iteration(0), _endTime(0) {
        char name[128];
        sprintf(name, "worker.%d(%d)", _rank, getpid());
        _name=name;
    }

    ~WorkerProcessor() {
        if((statistic & STATISTIC_WU_TIME) && !(statistic & STATISTIC_MASTER_COLLECTOR)) {
            printStats();
        }
    }

    virtual double getEndTime() { return _endTime; }

    virtual void setChannelMap(std::map<int,int> channels) {
        _channelMap = channels;
    }

    virtual const std::string getName() {
        return _name;
    }

    virtual void processSend() {
        DEBUGPRN(printf("[%s] processSend() queue=%d, _state=%d(E)\n", _name.c_str(), _queue, _state);)
        checkState();
        if(_queue==0 && _state!=WSFinalization) _mpiHandler->setBlockOnComm();
        DEBUGPRN(printf("[%s] processSend() (X)\n", _name.c_str());)
    }
    virtual void reportSignal(CommSignal sig, int rank, MPI_Comm comm) {
        // nothing .... for now
        DEBUGPRN(printf("[%s] reportSignal(%s)\n", _name.c_str(), getSignalName(sig));)
        switch (sig) {
            case CSShouldFinish:
                DEBUGPRN(printf("[%s] marked to finish\n", _name.c_str());)
                assert(_queue==0);
                shouldFinish();
                if(statistic & STATISTIC_MASTER_COLLECTOR) {
                    sendStats();
                } else {
            	    _mpiHandler->shouldFinish();
                }                
                break;
            case CSFinalizationStart:
                {
                    boost::mutex::scoped_lock lock(stateMutex);
                    _state=WSFinalization;                    
                }
                break;
            case CSIterationStart:
                //increment iteration number??
                _iteration++;
                break;
            case CSIterationEnd:
                //
                break;
            default:
                fprintf(stderr, "[%s] unhandled signal %d\n", _name.c_str(), sig);
                exit(-1);
        }
    }
    virtual void give(Work *work, int rank, MPI_Comm comm) {
        TRACE1(printf("[trace] %s give (gi:%d/wn:%d) to (rank:%d/comm:%d) at %f\n", _name.c_str(), work->index.gi, work->index.wn, rank, (int)comm, sampleTime());)
        DEBUGPRN(printf("[%s] received work wn = %d, gi = %d\n", _name.c_str(), work->index.wn, work->index.gi);)

        stats[_iteration][work->index].inputTime = sampleTime()-stats[_iteration][work->index].inputStart;

        _masterRank = rank;
        _comm = comm;
        ComputeProcessor::give(work); // fixme ... why this??!?!?!
        {
            boost::mutex::scoped_lock lock(stateMutex);
            _queue++;
            _wn++;
        }
        checkState();
    }
    virtual void report(Work *work, int rank, MPI_Comm comm) {
        TRACE1(printf("[trace] %s report (gi:%d/wn:%d) to (rank:%d/comm:%d) at %f\n", _name.c_str(), work->index.gi, work->index.wn, rank, (int)comm, sampleTime());)
        DEBUGPRN(printf("[%s] sent response wn = %d, gi = %d\n", _name.c_str(), work->index.wn, work->index.gi);)

        if(work->index.gi==-3 && statistic & STATISTIC_MASTER_COLLECTOR) {
            _mpiHandler->shouldFinish();
        }

        if(work->index.gi>=0) {
            stats[_iteration][work->index].outputTime = sampleTime()-stats[_iteration][work->index].outputStart;
        }
        


        //delete work;
        giveResponse(work);
    }

    virtual Work *process(Work *work) {
        Work *response = takeResponse(work->index.gi, work->index.wn);
        DEBUGPRN(printf("[%s] process start work wn = %d, gi = %d\n", _name.c_str(), work->index.wn, work->index.gi);)
        stats[_iteration][work->index].execStart = sampleTime();
        user_processWorkData(work->index.gi, work->index.wn, work->data, response->data);
        stats[_iteration][work->index].execTime = sampleTime()-stats[_iteration][work->index].execStart;        
        DEBUGPRN(printf("[%s] process end work wn = %d, gi = %d\n", _name.c_str(), work->index.wn, work->index.gi);)
        giveRequest(work);
        return response;
    }

    virtual void selfGive(Work *work) {
        DEBUGPRN(printf("[%s] selfGive wn = %d, gi = %d (E)\n", _name.c_str(), work->index.wn, work->index.gi);)
        boost::mutex::scoped_lock lock(iterationMutex);
        _queue--;        
        _mpiHandler->give(work, _masterRank, _comm, _channelMap[_masterRank]);
        if (_queue==0) {
            _mpiHandler->setWaitDataOnSend(FALSE);
        }
        DEBUGPRN(printf("[%s] selfGive wn = %d, gi = %d (X)\n", _name.c_str(), work->index.wn, work->index.gi);)
    }

    virtual Work *getWork(int gi, int wn, int rank, MPI_Comm comm) {
        return takeRequest(gi, wn);
    }

    virtual void notifyInput(WorkIndex &workIndex, int rank, MPI_Comm comm) {
        TRACE1(printf("[trace] %s notifyInput (gi:%d/wn:%d) to (rank:%d/comm:%d) at %f\n", _name.c_str(), workIndex.gi, workIndex.wn, rank, (int)comm, sampleTime());)
        if(workIndex.gi>=0) {
            stats[_iteration][workIndex].inputStart = sampleTime();
        }
    }
    virtual void notifyOutput(WorkIndex &workIndex, int rank, MPI_Comm comm) {
        TRACE1(printf("[trace] %s notifyOutput (gi:%d/wn:%d) to (rank:%d/comm:%d) at %f\n", _name.c_str(), workIndex.gi, workIndex.wn, rank, (int)comm, sampleTime());)
        if(workIndex.gi>=0) {
            stats[_iteration][workIndex].outputStart = sampleTime();
        }
    }
    
};



/**
 * Handles dada processing logic
 */
class CMgrProcessor : public ControllerInterface, protected WorkStorage {
private:
    int _from, _to;
    MPI_Comm _comm;
    std::string _name;
    int _rank;
    boost::mutex iterationMutex;
    int _iteration;

    boost::mutex stateMutex;

    std::map< int, gmwat_stat_t > stats;

    std::map<int,int> _channelMap;

    double _endTime;

    void printStats() {
        /*for(int i=1; i<=_iteration; i++) {
            for(gmwat_stat_t::const_iterator st = stats[i].begin(); st!=stats[i].end(); st++) {
                printf("STAT U %s %d %d %d %d %.6f %.6f %.6f %.6f %.6f %.6f\n", _name.c_str(), _rank, i, 
                       (*st).first.gi, (*st).first.wn, 
                       (*st).second.execStart, (*st).second.execTime,
                       (*st).second.inputStart, (*st).second.inputTime, (*st).second.outputStart, (*st).second.outputTime);
            }
        }
        printf("STAT W %s %d %.6f %.6f\n", _name.c_str(), _rank, getBusyTime(), getEfficiency());*/
    }
    void sendStats() {
        /*DEBUGPRN(printf("[%s] sendStats() (E)\n", _name.c_str());)
        int ident=0;
        for(int i=1; i<=_iteration; i++) {
            for(gmwat_stat_t::const_iterator st = stats[i].begin(); st!=stats[i].end(); st++) {
                Work *work = takeResponse(-2, ident++);
                struct worker_wu_stat *stat = (struct worker_wu_stat *)work->data->chunks->data;
                stat->index = (*st).first;
                stat->stats = (*st).second;
                stat->mpiaddr.rank = _rank;
                stat->mpiaddr.comm = _comm;
                _mpiHandler->give(work, _masterRank, _comm);
            }
        }
        Work *work = takeResponse(-3, 0);
        struct worker_wu_stat_total *statgrp = (struct worker_wu_stat_total *)work->data->chunks->data;
        statgrp->busyTime = getBusyTime();
        statgrp->efficiency = getEfficiency();
        statgrp->mpiaddr.rank = _rank;
        statgrp->mpiaddr.comm = _comm;
        _mpiHandler->give(work, _masterRank, _comm);        
        DEBUGPRN(printf("[%s] sendStats() (X)\n", _name.c_str());)*/
    }

public:
    CMgrProcessor(MPI_Comm comm, int rank, int from, int to) : _from(from), _to(to), _comm(comm), _rank(rank), _iteration(0) {
        char name[128];
        sprintf(name, "cmgr.%d(%d)", _rank, getpid());
        _name=name;
    }

    ~CMgrProcessor() {
        if((statistic & STATISTIC_WU_TIME) && !(statistic & STATISTIC_MASTER_COLLECTOR)) {
            printStats();
        }
    }

    virtual double getEndTime() { return _endTime; }

    virtual void setChannelMap(std::map<int,int> channels) {
        _channelMap = channels;
    }

    virtual const std::string getName() {
        return _name;
    }

    virtual void processSend() {
        DEBUGPRN(printf("[%s] processSend() _state=%d(E)\n", _name.c_str());)
        _mpiHandler->setBlockOnComm();
        DEBUGPRN(printf("[%s] processSend() (X)\n", _name.c_str());)
    }
    virtual void reportSignal(CommSignal sig, int rank, MPI_Comm comm) {
        // nothing .... for now
        DEBUGPRN(printf("[%s] reportSignal(%s)\n", _name.c_str(), getSignalName(sig));)

        if(rank==_from) {
            _mpiHandler->giveSignal(sig, _to, _comm);
        } else {
            //response action
            switch (sig) {
                case CSShouldFinish:
                    DEBUGPRN(printf("[%s] marked to finish\n", _name.c_str());)
                    if(statistic & STATISTIC_MASTER_COLLECTOR) {
                        sendStats();
                    } else {
                        _mpiHandler->shouldFinish();
                    }                
                    break;
                case CSFinalizationStart:
                case CSIterationEnd:
                    // nothing
                    break;
                case CSIterationStart:
                    _iteration++;
                    break;
                default:
                    fprintf(stderr, "[%s] unhandled signal %d\n", _name.c_str(), sig);
                    exit(-1);
            }
        }
    }
    virtual void give(Work *work, int rank, MPI_Comm comm) {
        DEBUGPRN(printf("[%s] received work wn = %d, gi = %d\n", _name.c_str(), work->index.wn, work->index.gi);)

        stats[_iteration][work->index].inputTime = sampleTime()-stats[_iteration][work->index].inputStart;

        if(rank==_from) {
            _mpiHandler->give(work, _to, _comm, _channelMap[_to]);
        } else {
            _mpiHandler->give(work, _from, _comm, _channelMap[_from]);
        }
        
    }
    virtual void report(Work *work, int rank, MPI_Comm comunicator) {
        DEBUGPRN(printf("[%s] sent response wn = %d, gi = %d\n", _name.c_str(), work->index.wn, work->index.gi);)

        if(work->index.gi==-3 && statistic & STATISTIC_MASTER_COLLECTOR) {
            _mpiHandler->shouldFinish();
        }

        if(work->index.gi>=0) {
            stats[_iteration][work->index].outputTime = sampleTime()-stats[_iteration][work->index].outputStart;
        }
        
        //delete work;
        if(rank==_from) {
            giveResponse(work);
        } else { //if(rank==_to) 
            giveRequest(work);
        }
        
    }

    virtual Work *getWork(int gi, int wn, int rank, MPI_Comm comm) {
        if(rank==_from) {
            return takeRequest(gi, wn);
        } else { //if(rank==_to) 
            return takeResponse(gi, wn);
        }        
    }

    virtual void notifyInput(WorkIndex &workIndex, int rank, MPI_Comm comm) {
        if(workIndex.gi>=0) {
            //stats[_iteration][workIndex].inputStart = sampleTime();
        }
    }
    virtual void notifyOutput(WorkIndex &workIndex, int rank, MPI_Comm comm) {
        if(workIndex.gi>=0) {
            //stats[_iteration][workIndex].outputStart = sampleTime();
        }
    }
    virtual void run() {
        //nothing here
    }
};

class SerialHandler : virtual public MPIHandlerInterface {
    int _dummyWaitDataOnSend;
    ControllerInterface *_controller;
public:
    SerialHandler() : _dummyWaitDataOnSend(TRUE), _controller(NULL) {
    }

    virtual void configChannels(std::vector<int> channels) {}
    virtual void give(Work *work, int rank, MPI_Comm comm, int channel=0) {        
        Work *response = _controller->getWork(work->index.gi, work->index.wn, rank, comm);
        user_processWorkData(work->index.gi, work->index.wn, work->data, response->data);
        _controller->notifyOutput(work->index, rank, comm);
        _controller->report(work, rank, comm);
        _controller->notifyInput(response->index, rank, comm);
        _controller->give(response, rank, comm);
    }
    virtual void giveSignal(enum CommSignal sig, int rank, MPI_Comm comm, int channel=0) {
        _controller->reportSignal(sig,rank,comm);
    }
    virtual void shouldFinish() {
    }
    virtual void setWaitDataOnSend(int val) {
        _dummyWaitDataOnSend=val;
    }
    virtual int getOutQueueLength() {
        return 0;
    }
    virtual int getWaitDataOnSend() {
        return _dummyWaitDataOnSend;
    }

    virtual void setController(ControllerInterface *controller) {
        _controller = controller;
        _dummyWaitDataOnSend = TRUE;
    }
    virtual void notify(WorkIndex &workIndex, int rank, MPI_Comm comm) {
    }
    virtual void setBlockOnComm() {
    }
};



ControllerInterface *controller = NULL;
void signal_handler(int sig) {

    if (controller) {
        ((MasterProcessor*)controller)->dump();
    } else {
        printf("Error: controller is null, unable to dump.\n");
    }
    exit(0);
    //signal(SIGINT, SIGDFL);
}

/** Plain record describing one communication-manager link. */
struct CommunicationManager {
    // cm processes
    int from;
    int to;
    // endpoints
    int source;
    int target;
};

/** Plain record describing one channel: an identifier and the two values it connects. */
struct MpiChannel {
    int ident;
    int from;
    int to;
};


/**
 * Controller used by the "mpitest" mode: sends 1000 dummy work units to a
 * peer rank, keeps two sends in flight, waits until 1000 units have been
 * received and then asks the MPI handler to finish. Every callback is logged
 * to stdout.
 */
class MPITester : public ControllerInterface, virtual public Runnable {
private:
    int _mode; // 0, 1
    std::string _name;              // "MPITester[<mode>]"
    char *buf[2][2];                // payload buffers, 1 MiB each
    int ii, io, index, packs;       // receive slot, send slot, next wn to send, units received
    MPIHandlerInterface *_mpi;
    WorkData *wd[2][2];             // wd[0][*] used for receiving, wd[1][*] for sending
    boost::mutex sync;              // protects _sendCredits
    boost::condition condSend;      // signalled by report()
    boost::mutex syncFinish;        // protects packs
    boost::condition condFinish;    // signalled when packs reaches 1000
    int _sendCredits;               // completed sends not yet consumed by run()

public:
    /**
     * @param mode 0 or 1; selects the peer rank (mode 0 sends to rank 1, any other mode to rank 0)
     * @param mpi  handler used for sending and for the final shouldFinish()
     */
    MPITester(int mode, MPIHandlerInterface *mpi) : _mode(mode), _mpi(mpi), _sendCredits(0) {
        char name[32];
        sprintf(name, "MPITester[%d]", _mode);
        _name = name;   // previously the formatted name was discarded
        for(int i=0; i<2; i++)
            for(int j=0; j<2; j++) {
                buf[i][j] = (char *)calloc(1024*1024, 1);
                wd[i][j] = (struct WorkData *)calloc(1, sizeof(struct WorkData));
                wd[i][j]->count = 4;
                wd[i][j]->chunks = (struct DataChunk *)calloc(wd[i][j]->count, sizeof(struct DataChunk));
                // all chunks of one WorkData share the same buffer
                for(int s=0; s<wd[i][j]->count; s++) {
                    wd[i][j]->chunks[s].size = 1024*1024;
                    wd[i][j]->chunks[s].data = buf[i][j];
                }
            }
        packs=index=ii=io=0;
    }

    ~MPITester() {
        for(int i=0; i<2; i++) {
            for(int j=0; j<2; j++) {
                free(buf[i][j]);
                for(int s=0; s<wd[i][j]->count; s++) {
                    wd[i][j]->chunks[s].data = NULL;
                }
                free(wd[i][j]->chunks);
                free(wd[i][j]);
            }
        }
    }

    virtual void notifyInput(WorkIndex &workIndex, int rank, MPI_Comm comm) {
        printf("%d MPITester[%d].notifyInput([%d,%d], %d @ %p)\n", getpid(), _mode, workIndex.gi, workIndex.wn, rank, comm);
    }

    virtual void notifyOutput(WorkIndex &workIndex, int rank, MPI_Comm comm) {
        printf("%d MPITester[%d].notifyOutput([%d,%d], %d @ %p)\n", getpid(), _mode, workIndex.gi, workIndex.wn, rank, comm);
    }

    /** Counts a received unit and wakes run() once 1000 have arrived. */
    virtual void give(Work *work, int rank, MPI_Comm comm) {
        printf("%d MPITester[%d].give([%d,%d], %d @ %p)\n", getpid(), _mode, work->index.gi, work->index.wn, rank, comm);
        work->data = NULL;  // the WorkData belongs to wd[][], not to the Work object
        delete work;
        {
            boost::mutex::scoped_lock lock(syncFinish);
            packs++;
            if(packs==1000) {
                condFinish.notify_all();
            }
        }

    }

    /** Called after a unit has been sent: releases it and grants run() one more send. */
    virtual void report(Work *work, int rank, MPI_Comm comm) {
        printf("%d MPITester[%d].report([%d,%d], %d @ %p)\n", getpid(), _mode, work->index.gi, work->index.wn, rank, comm);
        work->data = NULL;  // the WorkData belongs to wd[][], not to the Work object
        delete work;
        {
            boost::mutex::scoped_lock lock(sync);
            // counted so that a notification arriving before run() waits is not lost
            _sendCredits++;
            condSend.notify_all();
        }
    }

    virtual void reportSignal(enum CommSignal sig, int rank, MPI_Comm comm) {
        printf("%d MPITester[%d].reportSignal(sig:%d, %d @ %p)\n", getpid(), _mode, sig, rank, comm);
    }

    virtual void processSend() {
        printf("%d MPITester[%d].processSend()\n", getpid(), _mode);
    }

    virtual const std::string getName() {
        return _name;
    }

    /** Supplies a Work object for reception, alternating between the two receive buffers. */
    virtual Work *getWork(int gi, int wn, int rank, MPI_Comm comm) {
        printf("%d MPITester[%d].getWork([%d,%d], %d @ %p)\n", getpid(), _mode, gi, wn, rank, comm);
        Work *w = new struct Work;
        w->index.gi = gi;
        w->index.wn = wn;
        w->data = wd[0][ii++];
        ii%=2;
        return w;
    }

    virtual void setChannelMap(std::map<int, int> channels) {
        // size() returns an unsigned size type; convert explicitly for "%d"
        printf("%d MPITester[%d].setChannelMap(%d)\n", getpid(), _mode, (int)channels.size());
    }

    virtual double getEndTime() {
        return sampleTime();
    }

    /**
     * Sends 1000 units with two in flight, then waits for 1000 received
     * units and tells the MPI handler to finish.
     */
    virtual void run() {
        localGive();
        localGive();
        while(index<1000) {
            {
                boost::mutex::scoped_lock lock(sync);
                // loop on the counter: immune to spurious and early wake-ups
                while(_sendCredits==0) {
                    condSend.wait(lock);
                }
                _sendCredits--;
            }
            localGive();
        }
        printf("%d MPITester[%d].run() process completed\n", getpid(), _mode);
        {
            boost::mutex::scoped_lock lock(syncFinish);
            while(packs<1000) {
                condFinish.wait(lock);
            }
        }
        _mpi->shouldFinish();
        printf("%d MPITester[%d].run() thread exit\n", getpid(), _mode);
    }

private:
    /** Sends the next unit (gi 0, increasing wn) to the peer, alternating between the two send buffers. */
    void localGive() {
        Work *w = new struct Work;
        w->index.gi = 0;
        w->index.wn = index++;
        w->data = wd[1][io++];
        io%=2;
        _mpi->give(w, (_mode)?0:1, MPI_COMM_WORLD);
    }
};


int main(int argc, char **argv, char **env) {
#ifdef PARALLEL_CODE
    int rank, commsize, master_rank=0, gmaster_rank=-1, len, range[2];
    MPI_Comm comm;
    const char *cluster_config;
    std::vector<range_item_t> ranges;
    std::map<std::string, std::string> cmdConf;
    int param_gi=0, param_bgi=0, param_wu, param_iter;
    MPIHandlerInterface *mpiHandler;
    char execMode[512];
    double t1, t2;
    int stdoutRedir;
    int isMPIInitialized = false;
    std::vector<struct CommunicationManager> cmgrs;
    std::vector<struct MpiChannel> channels;

    setBaseTime();

   // signal(SIGINT, signal_handler);

    setvbuf(stdout, (char *)NULL, _IOLBF, 0);

    //mpich fix up
    {
    	int isSerial=false;
    	for(int i=1; i<argc; i++) {
    		if(strcmp(argv[i], "--gmwat-serial")==0) {
    			isSerial=true;
    			break;
    		}
    	}
    	if(!isSerial) {
    		//MPI_Init(&argc, &argv);

            int provider;
            int error = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provider);
            assert(error==MPI_SUCCESS);
#ifdef SYNCHRONOUS_THREADED
            if(provider!=MPI_THREAD_MULTIPLE) {    
                printf("MPI_THREAD_MULTIPLE required but unavailable\n");
                MPI_Finalize();
                exit(0);
            }
#endif
    		isMPIInitialized = true;
    	}
    }

    { //filter configuration parameters
        char *prefix="--gmwat-", *buf;
        int psz = strlen(prefix);
        for(int i=1; i<argc; i++) {
            //ifparams starts with --gmwat- push in cmdConf
            if((int)strlen(argv[i])>=psz && strncmp(argv[i], prefix, psz)==0){
                buf = (char *)malloc(strlen(argv[i])+1);
                strcpy(buf, argv[i]);
                char *key=buf+8;
                char *value=key;
                while(*value&&*value!='=') value++;
                if(*value) { *value=0; value++; }
                cmdConf[key] = value;
                free(buf);
    
                //printf("line config '%s' = '%s'\n", key, value);
                // remove from original parameter line
                for(int k=i; k<argc-1; k++) {
                    argv[k]=argv[k+1];
                }
                argc--;
                i--; // reprocess same position next time 
            }
        }
    }
    {
        if(cmdConf.find("version")!=cmdConf.end()) {
            printf("GMWAT version: %d copiled at %s %s\n", GMWAT_VERSION, __DATE__, __TIME__);
            return 0;
        }
    }

    {
        if(cmdConf.find("stdout")!=cmdConf.end()) {
            stdoutRedir = creat(cmdConf["stdout"].c_str(), 0600);
            if(stdoutRedir != -1) {
                //printf("GMWAT: output redirect to '%s'.", cmdConf["stdout"].c_str());
                close(1);
                dup(stdoutRedir);
                close(stdoutRedir);
            } else {
                printf("GMWAT: error in file open '%s'. Unable to redirect output.", cmdConf["stdout"].c_str());
            }
        }
    }

    { // verbose configuration
        const char *verbose_config = NULL;
        if(cmdConf.find("verbose")!=cmdConf.end()) {
            verbose_config = cmdConf["verbose"].c_str();
        } else {
            verbose_config = getenv("GMWAT_VERBOSE");
        }
        if (verbose_config) {
            verbose = atoi(verbose_config);
            if(verbose & VERBOSE_INFO) printf("GMWAT: verbose level set to %d.\n", verbose);
        }
    }
    { // verbose configuration
        const char *stats_config = NULL;
        if(cmdConf.find("stats")!=cmdConf.end()) {
            stats_config = cmdConf["stats"].c_str();
        } else {
            stats_config = getenv("GMWAT_STATS");
        }
        if (stats_config) {
            statistic = atoi(stats_config);
            if(verbose & VERBOSE_INFO) printf("GMWAT: statistic filter set to %d.\n", statistic);
        }
    }

    if(cmdConf.find("mpitest")!=cmdConf.end()) {
        char pname[MPI_MAX_PROCESSOR_NAME];
        MPI_Get_processor_name(pname, &len);

        comm = MPI_COMM_WORLD;

        MPI_Comm_size(comm, &commsize);
        MPI_Comm_rank(comm, &rank);

        if(commsize%2) {
            printf("[process %d] process number must be even.\n", getpid());
            MPI_Finalize();
            exit(0);
        }

        mpiHandler = new MPIHandler();

        controller = new MPITester(rank%2, mpiHandler);

        mpiHandler->setController(controller);
        controller->setMpiHandler(mpiHandler);

        ThreadWrapper twmpi(dynamic_cast<MPIHandler *>(mpiHandler));
        ThreadWrapper twctrl(controller);        

        boost::thread mpiThread(twmpi);
        boost::thread controllerThread(twctrl);

        mpiThread.join();
        controllerThread.join();

        t2 = controller->getEndTime();

        DEBUGPRN(printf("[process %d] going to finalize ...\n", getpid());)

        MPI_Finalize(); 
        exit(0);
    }

    if(cmdConf.find("serial")!=cmdConf.end()) {

        executionMode=SERIAL_EXECUTION;

       if(isMPIInitialized) {
		MPI_Finalize();
       } 

        { // param_gi and param_bgi configuration
            char senv[128];
            const char *gi_config=NULL, *bgi_config;

            memset(senv, 0L, 128);

            if(cmdConf.find("gi")!=cmdConf.end()) {
                gi_config = cmdConf["gi"].c_str();
            } else {
                gi_config = getenv("GMWAT_GI");
            }
            if (gi_config) {               
                param_gi=atoi(gi_config);
            }

            if(cmdConf.find("bgi")!=cmdConf.end()) {
                bgi_config=cmdConf[senv].c_str();
            } else {
                bgi_config = getenv("GMWAT_BGI");
            }
            if (bgi_config) { 
                param_bgi=atoi(bgi_config);
            }
        }

        t1 = sampleTime();

        role='M';

        user_config(argc,argv,env,&param_wu, &param_iter, role);

        mpiHandler = new SerialHandler();
        controller = new MasterProcessor(param_iter, param_wu, 1, 0, 0, param_bgi, param_gi);

        strcpy(execMode, "Serial 1p");


        mpiHandler->setController(controller);
        controller->setMpiHandler(mpiHandler);

        ((Runnable *)controller)->run();

        // t2 = sampleTime();
        t2 = controller->getEndTime();

        user_finalize();

    } else {
        //MPI_Init(&argc, &argv);
    
        char pname[MPI_MAX_PROCESSOR_NAME];
        MPI_Get_processor_name(pname, &len);
    
        comm = MPI_COMM_WORLD;
    
        MPI_Comm_size(comm, &commsize);
        MPI_Comm_rank(comm, &rank);

        sprintf(execMode, "'Parallel %dp'", commsize);

        if(commsize<=1) {
            printf("ERROR: Parallel version requires more than one processor.\n");
            exit(-1);
        }
        //printf("rank %d on %s has %d parameters\n", rank, pname, argc);


        { // param_gi and param_bgi configuration
            char senv[128];
            const char *gi_config=NULL, *bgi_config;

            memset(senv, 0L, 128);

            sprintf(senv, "%d-gi", rank);
            if(cmdConf.find(senv)!=cmdConf.end()) {
                gi_config = cmdConf[senv].c_str();
            } else {
                sprintf(senv, "GMWAT_%d_GI", rank);
                gi_config = getenv(senv);
            }

            if (!gi_config) { 
                if(cmdConf.find("gi")!=cmdConf.end()) {
                    gi_config = cmdConf["gi"].c_str();
                } else {
                    gi_config = getenv("GMWAT_GI");
                }
            }
            if (gi_config) {
                param_gi=atoi(gi_config);
            }
            sprintf(senv, "%d-bgi", rank);
            if(cmdConf.find(senv)!=cmdConf.end()) {
                bgi_config=cmdConf[senv].c_str();
            } else {
                sprintf(senv, "GMWAT_%d_BGI", rank);
                bgi_config = getenv(senv);
            }

            if (!bgi_config) { 
                if(cmdConf.find("bgi")!=cmdConf.end()) {
                    bgi_config=cmdConf["bgi"].c_str();
                } else {
                    bgi_config = getenv("GMWAT_BGI");
                }
            }
            if (bgi_config) {
                param_bgi=atoi(bgi_config);
            }
        }
    
        range[0] = 0;
        range[1] = commsize-1;
    
    
        { // global mastar  configuration
            const char *gmaster_config = NULL;
            if(cmdConf.find("master")!=cmdConf.end()) {
                gmaster_config = cmdConf["master"].c_str();
            } else {
                gmaster_config = getenv("GMWAT_MASTER");
            }
            if (gmaster_config) {
                gmaster_rank=atoi(gmaster_config);
                if (gmaster_rank==rank) role = 'M';
            }
        }
        { // cluster configuration
            cluster_config = NULL;
            if(cmdConf.find("clusters")!=cmdConf.end()) {
                cluster_config = cmdConf["clusters"].c_str();
            } else {
                cluster_config = getenv("GMWAT_CLUSTERS");
            }
        }

        // Parse the cluster layout string, a ';'-separated list of clusters,
        // each written as "<master rank>,<first worker>-<last worker>"
        // (for example "0,1-5;6,7-8").
        //
        // For every cluster:
        //   * the first master rank seen becomes the global master
        //     (gmaster_rank) unless one was already chosen;
        //   * if this process is still a plain worker ('W') and its rank is
        //     the cluster's master rank, it becomes 'M' when no global master
        //     was chosen yet, or 'S' otherwise;
        //   * if this rank lies inside the worker range, master_rank is set to
        //     the cluster's master;
        //   * if this rank is the cluster's master, the worker range is
        //     appended to `ranges`.
        //
        // Any malformed cluster prints GMWAT_CLUSTER_ERROR_MESSAGE and exits.
        #define GMWAT_CLUSTER_ERROR_MESSAGE "ERROR: Invalid GMWAT_CLUSTERS string format token %s, range not found, should be like '0,1-5;5,6-7'\n"
        if (cluster_config) {
            std::string srole(cluster_config);

            if((verbose & VERBOSE_INFO) && rank==0) printf("GMWAT: using cluster config '%s' on %d processors.\n", cluster_config, commsize);

            // NOTE(review): execMode's capacity is declared outside this
            // section; confirm it can hold an arbitrarily long cluster string.
            strcat(execMode, " (");
            strcat(execMode, cluster_config);
            strcat(execMode, ")");

            typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
            const boost::char_separator<char> scSep(";");
            const boost::char_separator<char> partSep(",");
            const boost::char_separator<char> rangeSep("-");
            tokenizer scTok(srole, scSep);

            for (tokenizer::const_iterator scIt=scTok.begin(); scIt!=scTok.end(); ++scIt) {
                // A tokenizer only keeps iterators into the string it is given,
                // so every level works on its own named copy.
                const std::string cluster = *scIt;
                tokenizer partTok(cluster, partSep);
                tokenizer::const_iterator partIt = partTok.begin();

                // Field 1: rank of this cluster's master.
                if (partIt==partTok.end()) {
                    fprintf(stderr, GMWAT_CLUSTER_ERROR_MESSAGE, "<EOL>");
                    exit(-1);
                }
                const int smrank = atoi((*partIt).c_str());
                if (role=='W' && smrank==rank) {
                    role = (gmaster_rank<0) ? 'M' : 'S';
                }

                if (gmaster_rank<0) {
                    gmaster_rank = smrank;
                }

                // Field 2: worker range "first-last".
                ++partIt;
                if (partIt==partTok.end()) {
                    fprintf(stderr, GMWAT_CLUSTER_ERROR_MESSAGE, "<EOL>");
                    exit(-1);
                }

                // Copy the token before advancing partIt: the range tokenizer
                // must not refer to storage owned by the iterator itself.
                const std::string rangeSpec = *partIt;
                tokenizer rangeTok(rangeSpec, rangeSep);

                // No third field is allowed.
                ++partIt;
                if (partIt!=partTok.end()) {
                    fprintf(stderr, GMWAT_CLUSTER_ERROR_MESSAGE, (*partIt).c_str());
                    exit(-1);
                }

                // Exactly two range bounds are required; compare against the
                // range tokenizer's own end iterator.
                tokenizer::const_iterator rangeIt = rangeTok.begin();
                for (int i=0; i<2; i++) {
                    if (rangeIt==rangeTok.end()) {
                        fprintf(stderr, GMWAT_CLUSTER_ERROR_MESSAGE, "<token end>");
                        exit(-1);
                    }
                    range[i] = atoi((*rangeIt).c_str());
                    ++rangeIt;
                }
                if (rangeIt!=rangeTok.end()) {
                    fprintf(stderr, GMWAT_CLUSTER_ERROR_MESSAGE, (*rangeIt).c_str());
                    exit(-1);
                }

                if (range[0]<=rank && range[1]>=rank) {
                    master_rank = smrank;
                }
                if (smrank == rank) {
                    ranges.push_back(range_item_t(range[0], range[1]));
                }
            }
        }
    
        // No global master was selected above: fall back to rank 0 and
        // promote that process to the master role.
        if (gmaster_rank < 0) {
            gmaster_rank = 0;
            if (rank == 0) {
                role = 'M';
            }
        }
    
        { // communication managers configuration
            // The description comes from the "cmgrs" entry of cmdConf or,
            // failing that, from the GMWAT_CMGRS environment variable. It is a
            // ','-separated list of entries written as
            // "<source>:<from>-<to>:<target>", e.g. "0:1-2:3,4:6-7:8".
            //
            // One element is appended to `cmgrs` per entry. A process whose
            // rank equals either bound of an entry's "<from>-<to>" pair takes
            // role 'C'. Any malformed entry prints GMWAT_CM_ERROR_MESSAGE and
            // exits.
            const char *cmgrs_config = NULL;
            if(cmdConf.find("cmgrs")!=cmdConf.end()) {
                cmgrs_config = cmdConf["cmgrs"].c_str();
            } else {
                cmgrs_config = getenv("GMWAT_CMGRS");
            }
            if(cmgrs_config) {
                // 0:1-2:3,4:6-7:8
                std::string param(cmgrs_config);
                typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
                const boost::char_separator<char> scSep(",");
                const boost::char_separator<char> partSep(":");
                const boost::char_separator<char> rangeSep("-");
                tokenizer scTok(param, scSep);

                #define GMWAT_CM_ERROR_MESSAGE "Error parsing comunication manager format."

                for (tokenizer::const_iterator scIt=scTok.begin(); scIt!=scTok.end(); ++scIt) {
                    // A tokenizer only keeps iterators into the string it is
                    // given, so each level works on its own named copy.
                    const std::string entry = *scIt;
                    tokenizer partTok(entry, partSep);
                    tokenizer::const_iterator partIt = partTok.begin();

                    cmgrs.resize(cmgrs.size()+1);

                    // Field 1: source.
                    if (partIt==partTok.end()) {
                        fprintf(stderr, GMWAT_CM_ERROR_MESSAGE);
                        exit(-1);
                    }

                    cmgrs.back().source = atoi((*partIt).c_str());

                    // Field 2: "<from>-<to>" pair.
                    ++partIt;
                    if (partIt==partTok.end()) { // check if the from-to pair is missing
                        fprintf(stderr, GMWAT_CM_ERROR_MESSAGE);
                        exit(-1);
                    }

                    const std::string rangeSpec = *partIt;
                    tokenizer rangeTok(rangeSpec, rangeSep);

                    // Exactly two bounds are required; compare against the
                    // range tokenizer's own end iterator.
                    tokenizer::const_iterator rangeIt = rangeTok.begin();
                    for (int i=0; i<2; i++) {
                        if (rangeIt==rangeTok.end()) {
                            fprintf(stderr, GMWAT_CM_ERROR_MESSAGE);
                            exit(-1);
                        }
                        const int ep = atoi((*rangeIt).c_str());
                        if(i==0) cmgrs.back().from = ep;
                        else cmgrs.back().to = ep;
                        if(rank==ep) {
                            role='C';
                        }
                        ++rangeIt;
                    }
                    if (rangeIt!=rangeTok.end()) {
                        fprintf(stderr, GMWAT_CM_ERROR_MESSAGE); //, (*rangeIt).c_str());
                        exit(-1);
                    }

                    // Field 3: target.
                    ++partIt;
                    if (partIt==partTok.end()) { // check if target is missing
                        fprintf(stderr, GMWAT_CM_ERROR_MESSAGE);
                        exit(-1);
                    }

                    cmgrs.back().target = atoi((*partIt).c_str());

                    ++partIt;

                    if (partIt!=partTok.end()) { // check for an extra field
                        fprintf(stderr, GMWAT_CM_ERROR_MESSAGE);
                        exit(-1);
                    }
                }
            }
            /*for(int i=0; i<cmgrs.size(); i++) {
                printf("rank:%d %d %d:%d-%d:%d role:%c\n", rank, i, cmgrs[i].source, cmgrs[i].from, cmgrs[i].to, cmgrs[i].target, role);
            }*/
        }

        // pchannels: peer rank -> channel identifier, for every channel that
        //            has this rank as one of its two endpoints.
        // cmaps:     the distinct channel identifiers this rank takes part in,
        //            in ascending order.
        std::map<int,int> pchannels;
        std::vector<int> cmaps;

        { // channels configuration
            // The description comes from the "channels" entry of cmdConf or,
            // failing that, from the GMWAT_CHANNELS environment variable. It is
            // a ';'-separated list of entries written as "<ident>,<from>-<to>",
            // e.g. "0,1-2;2,7-8". The parsed identifier is stored incremented
            // by one. Any malformed entry prints GMWAT_CHANNEL_ERROR_MESSAGE
            // and exits.
            const char *channels_config = NULL;
            if(cmdConf.find("channels")!=cmdConf.end()) {
                channels_config = cmdConf["channels"].c_str();
            } else {
                channels_config = getenv("GMWAT_CHANNELS");
            }
            if(channels_config) {
                // 0,1-2;2,7-8
                std::string param(channels_config);
                typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
                const boost::char_separator<char> scSep(";");
                const boost::char_separator<char> partSep(",");
                const boost::char_separator<char> rangeSep("-");
                tokenizer scTok(param, scSep);

                #define GMWAT_CHANNEL_ERROR_MESSAGE "Error parsing channels format."

                for (tokenizer::const_iterator scIt=scTok.begin(); scIt!=scTok.end(); ++scIt) {
                    // A tokenizer only keeps iterators into the string it is
                    // given, so each level works on its own named copy.
                    const std::string entry = *scIt;
                    tokenizer partTok(entry, partSep);
                    tokenizer::const_iterator partIt = partTok.begin();

                    channels.resize(channels.size()+1);

                    // Field 1: channel identifier.
                    if (partIt==partTok.end()) {
                        fprintf(stderr, GMWAT_CHANNEL_ERROR_MESSAGE);
                        exit(-1);
                    }

                    channels.back().ident = (atoi((*partIt).c_str())+1);

                    // Field 2: "<from>-<to>" endpoint pair.
                    ++partIt;
                    if (partIt==partTok.end()) { // check if the endpoint pair is missing
                        fprintf(stderr, GMWAT_CHANNEL_ERROR_MESSAGE);
                        exit(-1);
                    }

                    const std::string rangeSpec = *partIt;
                    tokenizer rangeTok(rangeSpec, rangeSep);

                    // Exactly two endpoints are required; compare against the
                    // range tokenizer's own end iterator.
                    // NOTE(review): an endpoint rank is given role 'C' here, the
                    // same role the communication-manager parsing assigns; a
                    // 'C' rank with no matching cmgrs entry trips the assert
                    // further down -- confirm this is intended.
                    tokenizer::const_iterator rangeIt = rangeTok.begin();
                    for (int i=0; i<2; i++) {
                        if (rangeIt==rangeTok.end()) {
                            fprintf(stderr, GMWAT_CHANNEL_ERROR_MESSAGE);
                            exit(-1);
                        }
                        const int ep = atoi((*rangeIt).c_str());
                        if(i==0) channels.back().from = ep;
                        else channels.back().to = ep;
                        if(rank==ep) {
                            role='C';
                        }
                        ++rangeIt;
                    }
                    if (rangeIt!=rangeTok.end()) {
                        fprintf(stderr, GMWAT_CHANNEL_ERROR_MESSAGE); //, (*rangeIt).c_str());
                        exit(-1);
                    }

                    ++partIt;

                    if (partIt!=partTok.end()) { // check for an extra field
                        fprintf(stderr, GMWAT_CHANNEL_ERROR_MESSAGE);
                        exit(-1);
                    }
                }
            }

            // Keep only the channels that touch this rank: remember which
            // identifier leads to which peer and collect the distinct
            // identifiers.
            std::set<int> idents;
            for(int i=0; i<(int)channels.size(); i++) {
                //printf("channels rank:%d %d %d:%d-%d role:%c\n", rank, i, channels[i].ident, channels[i].from, channels[i].to, role);
                if(channels[i].from == rank || channels[i].to == rank) {
                    const int peer = (channels[i].from==rank) ? channels[i].to : channels[i].from;
                    pchannels[peer] = channels[i].ident;
                    idents.insert(channels[i].ident);
                }
            }
            cmaps.assign(idents.begin(), idents.end());
        }
       
    
        //ControllerInterface *controller = NULL;

        // Start of the timed section reported by the final summary line.
        t1 = sampleTime();

        user_config(argc,argv,env,&param_wu,&param_iter, role);

        // Build the controller that matches this process' role:
        //   'M' / 'S' -> MasterProcessor driving the workers in `ranges`
        //   'C'       -> CMgrProcessor wired according to its cmgrs entry
        //   otherwise -> WorkerProcessor
        if (role=='M' || role=='S') {
            // A master with no explicit range takes ranks 1..commsize-1.
            if (ranges.size()==0) {
                const int lastRank = commsize-1;
                ranges.push_back(range_item_t(1, lastRank));
                if(verbose & VERBOSE_INFO) printf("GMWAT auto range workers (1 to %d)\n", lastRank);
            }

            // Count the workers over all ranges, leaving this rank out of
            // every range that contains it.
            int workerCount = 0;
            for (range_t::const_iterator it=ranges.begin(); it!=ranges.end(); ++it) {
                const int first = it->first;
                const int last = it->second;
                workerCount += last-first+1;
                if (first<=rank && rank<=last) {
                    --workerCount;
                }
            }

            // The 'S' variant additionally receives comm and gmaster_rank.
            MasterProcessor *master = (role=='M')
                ? new MasterProcessor(param_iter, param_wu, workerCount, comm, rank, param_bgi, param_gi)
                : new MasterProcessor(1, 0, workerCount, comm, rank, param_bgi, param_gi, comm, gmaster_rank);

            if(verbose & VERBOSE_INFO) printf("GMWAT %c master has %d workers\n", role, workerCount);

            // Register every rank of every range except this one, handing out
            // consecutive worker indices.
            int nextIndex = 0;
            for (range_t::const_iterator it=ranges.begin(); it!=ranges.end(); ++it) {
                if(verbose & VERBOSE_INFO) printf("GMWAT processing range %d, %d\n", it->first, it->second);
                for (int workerRank=it->first; workerRank<=it->second; ++workerRank) {
                    if (workerRank!=rank) {
                        if(verbose & VERBOSE_INFO) printf("GMWAT %c +W configWorker(%d, %d, %d)\n", role, nextIndex, workerRank, (int)comm);
                        master->configWorker(nextIndex, workerRank, comm);
                        ++nextIndex;
                    } else {
                        if(verbose & VERBOSE_INFO) printf("GMWAT %c +W configWorker(%d, %d, %d) SKIPED\n", role, nextIndex, workerRank, (int)comm);
                    }
                }
            }
            controller = master;
        } else if (role=='C') {
            // Use the first cmgrs entry that names this rank as `from` or `to`.
            int cmFrom = -1;
            int cmTo = -1;
            for (int idx=0; idx<(int)cmgrs.size(); ++idx) {
                if (cmgrs[idx].from==rank) {
                    cmFrom = cmgrs[idx].source;
                    cmTo = cmgrs[idx].to;
                    break;
                }
                if (cmgrs[idx].to==rank) {
                    cmFrom = cmgrs[idx].from;
                    cmTo = cmgrs[idx].target;
                    break;
                }
            }
            if (cmFrom==-1) {
                fprintf(stderr, "rank %d has role %c and is not mapped\n", rank, role); fflush(stderr);
            }
            assert(cmFrom>-1);
            controller = new CMgrProcessor(comm, rank, cmFrom, cmTo);
        } else {
            controller = new WorkerProcessor(comm, rank);
        }

        // Report the role this rank ended up with (informational output only).
        if(verbose & VERBOSE_INFO) printf("GMWAT ROLE: rank %d in %s is '%c'\n", rank, pname, role); 

        mpiHandler = new MPIHandler();


        // Give the handler the channel identifiers this rank takes part in and
        // the controller the peer-rank -> identifier map built from the
        // channels configuration.
        mpiHandler->configChannels(cmaps);
        controller->setChannelMap(pchannels);


        // Cross-link handler and controller so each can reach the other.
        mpiHandler->setController(controller);
        controller->setMpiHandler(mpiHandler);

        // Wrap both objects so that boost::thread can run them.
        // NOTE(review): ThreadWrapper is defined elsewhere in the file;
        // presumably it forwards to the wrapped object's run() -- confirm.
        ThreadWrapper twmpi(dynamic_cast<MPIHandler *>(mpiHandler));
        ThreadWrapper twctrl(controller);        

        // Start one thread per wrapper.
        boost::thread mpiThread(twmpi);
        boost::thread controllerThread(twctrl);

        // Block until both threads have finished.
        mpiThread.join();
        controllerThread.join();

        // End of the timed section, as recorded by the controller itself.
        t2 = controller->getEndTime();

        user_finalize();

        DEBUGPRN(printf("[process %d] going to finalize ...\n", getpid());)

        MPI_Finalize();        
    }

    delete mpiHandler;

    switch(role) {
        case 'M':
        case 'S':
            delete controller;
            break;
        case 'W':
            delete ((WorkerProcessor *)controller); // FIXME 
            break;
        case 'C':
            delete ((CMgrProcessor *)controller); // FIXME 
            break;
    }

    // Only the global master prints the final summary line: work-unit count,
    // elapsed time (t2-t1), execution mode, library version and the command
    // line that was used.
    if (role=='M') {
        // Rebuild the command line as "argv[0] argv[1] ...". A std::string is
        // used instead of a fixed char[1024] filled with strncpy/strncat:
        // strncpy does not null-terminate when the source fills the buffer,
        // which made the following strlen() read out of bounds for a very
        // long argv[0]. Long command lines are also no longer truncated.
        std::string params(argv[0]);
        for (int i=1; i<argc; i++) {
            params += ' ';
            params += argv[i];
        }
        printf("\x1b[1;37;46mwu: %d tot: %f mode: %s version: %d params: %s\x1b[0m\n", param_wu, t2-t1, execMode, GMWAT_VERSION, params.c_str());
    }
#else
    // Serial build: no MPI. A single MasterProcessor is run in the calling
    // thread, attached to a SerialHandler.

    // Initialised so the summary below never prints an indeterminate value
    // should user_config() leave it untouched.
    int wu = 0;

    // GMWAT_VERBOSE, when set, selects the verbosity bit mask.
    if (char *vl = getenv("GMWAT_VERBOSE")) {
        verbose = atoi(vl);
        printf("GMWAT: verbose level set to %d.\n", verbose);
    }

    user_config(argc,argv,env,&wu, role);
    controller = new MasterProcessor(wu, 1, 0, 0);

    SerialHandler serialHandler;

    // Cross-link handler and controller so each can reach the other.
    serialHandler.setController(controller);
    controller->setMpiHandler(&serialHandler);

    double t1 = sampleTime();
    ((Runnable *)controller)->run();
    //double t2 = sampleTime();
    // The end time is the one recorded by the controller itself.
    double t2 = controller->getEndTime();

    delete controller;

    {
        // Rebuild the command line as "argv[0] argv[1] ...". A std::string is
        // used instead of a fixed char[1024] filled with strncpy/strncat:
        // strncpy does not null-terminate when the source fills the buffer,
        // which made the following strlen() read out of bounds for a very
        // long argv[0]. Long command lines are also no longer truncated.
        std::string params(argv[0]);
        for (int i=1; i<argc; i++) {
            params += ' ';
            params += argv[i];
        }
        printf("\x1b[1;37;46mwu: %d tot: %f params: %s\x1b[0m\n", wu, t2-t1, params.c_str());
    }

    user_finalize();
#endif
    return 0;
}
