File: murasaki_mpi.cc

package info (click to toggle)
murasaki 1.68.6-12
links: PTS, VCS
area: main
in suites: bullseye
size: 15,676 kB
sloc: cpp: 16,010; perl: 8,365; makefile: 186; sh: 31
file content (1394 lines) | stat: -rw-r--r-- 51,027 bytes
/*
Copyright (C) 2006-2008 Keio University
(Kris Popendorf) <comp@bio.keio.ac.jp> (2006)

This file is part of Murasaki.

Murasaki is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Murasaki is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Murasaki.  If not, see <http://www.gnu.org/licenses/>.
*/

//////////////
// murasaki project
// murasaki_mpi.cc
// provides MPI-specific functions
//////////////

#ifdef MURASAKI_MPI

#include <mpi.h>
#include <iostream>
#include <fstream>
#include <unistd.h> //for getpid
#include "murasaki.h"
#include "sequence.h"
#include "dinkymath.h"
#include "timing.h"
#include "murasaki_mpi.h"


//defined here for sanity's sake
// Plain record bundling a request type code with a hash key/value pair.
// The meaning of `type` values is not defined in this part of the file.
struct mpi_req_t{
  int type;
  HashKey key;
  HashVal val;
};
typedef struct mpi_req_t mpi_req;

//globals
// World rank of this process and total number of processes. Both are assigned
// outside this section (presumably from MPI_Comm_rank/MPI_Comm_size -- TODO confirm).
int mpi_id;
int mpi_procs;
bool mpi_capable=false;
// Host bookkeeping, filled in by mpi_init():
//  mpi_hostnames[rank]         -> host name broadcast by that rank
//  mpi_hostIds[rank]           -> dense host index (in order of first appearance)
//  mpi_hostPopulation[name]    -> number of ranks reporting that host name
//  mpi_hostLeader_byName[name] -> first rank seen on that host name
vector<string> mpi_hostnames;
vector<int> mpi_hostIds;
map<string,int> mpi_hostPopulation;
map<string,int> mpi_hostLeader_byName;
vector<int> mpi_hostLeader;//lowest id on same host (each rank is its own leader when shared memory is not in use)
bool mpi_isHostLeader,mpi_usingShm; // both set in mpi_init()
int mpi_sysv_projid; // rank 0's pid, broadcast in mpi_init() when SysV shared memory is enabled
word mpi_totalMemory,mpi_totalStorage;
vector<double> mpi_storeShare;
vector<pair<word,word> > mpi_storeOwn;
word mpi_myStoreOffset; // subtracted from incoming keys in mpi_storage_store()
map<word,int> mpi_storeBrk; // start of hash region -> id
map<word,int> mpi_storeBrkStop; // stop+1 of hash region -> id
vector<double> mpi_hashShare;
vector<pair<word,word> > mpi_hashOwn;
map<word,int> mpi_hashBrk; //start of sequence -> rank
map<word,Location> mpi_hashPoints; //maps each hashBrk to an actual Location
// Per-rank statistics counters; sized to mpi_procs in mpi_types_init().
vector<long> mpi_hashCount,mpi_storeCount,mpi_extractCount,mpi_extractLocCount,mpi_mergeCount,mpi_anchorSendCount,mpi_anchorRecvCount;
vector<double> mpi_workTime;

word mpi_total_hash_size;
vector<word> mpi_worldMemory; // per-rank memory budget, exchanged in mpi_initJobs()
vector<MPI_Job> mpi_jobs; // world rank -> assigned job
vector<int> mpi_jobCount(JOB_MAX,0); // number of ranks assigned to each job
vector<int> mpi_hasherIds; // hasher index -> world rank
vector<int> mpi_worldId2jobId; // world rank -> rank within that rank's mpi_job_comm
int mpi_myHasherId=-1; // stays -1 unless this rank is assigned JOB_HASHER
vector<int> mpi_assemblerIds; //storage nodes map to assemblers, assembler nodes map to their index in mpi_hasherIds (will be handy for merging assembler -> assembler)
int mpi_finalAssembler; //someone still has to write out 

// Display names for jobs; presumably indexed by MPI_Job value -- TODO confirm against the enum.
const char *MPI_jobNames[]={"Hasher","Storage","--MAX--"};

// Communicators created by splitting MPI_COMM_WORLD: by leader/non-leader flag,
// by host index (both in mpi_init) and by job (in mpi_initJobs), plus this
// process's host index and rank within each of them.
MPI_Comm mpi_leaders_comm,mpi_localhost_comm,mpi_job_comm;
int mpi_myHostId,mpi_myLeaderRank,mpi_myLocalRank,mpi_myJobRank;


// Struct datatype describing MPI_HashMessage; built and committed in mpi_types_init().
MPI_Datatype MPI_HashMessage_type;

// Orders world ranks by ascending mpi_worldMemory; equal memory falls back to
// ascending rank so the ordering is total.
struct lt_mpi_worldMemory : binary_function<int,int,bool> {
  inline bool operator()(const int &a, const int &b) const {
    if(mpi_worldMemory[a]!=mpi_worldMemory[b])
      return mpi_worldMemory[a]<mpi_worldMemory[b];
    return a<b;
  }
};

// Orders world ranks by descending mpi_worldMemory; equal memory falls back to
// descending rank so the ordering is total.
struct gt_mpi_worldMemory : binary_function<int,int,bool> {
  inline bool operator()(const int &a, const int &b) const {
    if(mpi_worldMemory[a]!=mpi_worldMemory[b])
      return mpi_worldMemory[a]>mpi_worldMemory[b];
    return a>b;
  }
};

void mpi_types_init(){
  MPI_HashMessage msg;
  MPI_Datatype type[3]={MPI_UNSIGNED_LONG,MPI_INT,MPI_LONG};
  int blocklen[3]={1,1,1};
  MPI_Aint disp[3]; //displacement
  MPI_Aint base;
  MPI_Get_address(&msg.key,disp);
  MPI_Get_address(&msg.seqno,disp+1);
  MPI_Get_address(&msg.pos,disp+1);
  base=disp[0];
  for(int i=0;i<3;i++)disp[i]-=base;
  MPI_Type_create_struct(3,blocklen,disp,type,&MPI_HashMessage_type);
  MPI_Type_commit(&MPI_HashMessage_type); //make sure we can send with it

  mpi_hashCount.resize(mpi_procs,0);
  mpi_storeCount.resize(mpi_procs,0);
  mpi_extractLocCount.resize(mpi_procs,0);
  mpi_extractCount.resize(mpi_procs,0);
  mpi_mergeCount.resize(mpi_procs,0);
  mpi_workTime.resize(mpi_procs,0);
  mpi_anchorSendCount.resize(mpi_procs,0);
  mpi_anchorRecvCount.resize(mpi_procs,0);  
}

// Exchanges host names between all ranks, derives host indices / host leaders,
// and creates the leader and per-host communicators. When SysV shared memory
// is enabled, rank 0's pid is broadcast as the shared project id.
// Must be called collectively by every rank in MPI_COMM_WORLD.
void mpi_init(){
  mpi_usingShm=(opt.use_shm_mmap || opt.use_shm_sysv);
  
  cout << "Synchronizing MPI nodes..."<<endl;
  //synchronize machine names
  mpi_hostnames.resize(mpi_procs);
  mpi_hostLeader.resize(mpi_procs);
  mpi_hostIds.reserve(mpi_procs);
  int hostId=-1;
  for(int i=0;i<mpi_procs;i++){
    char buf[81];
    if(mpi_id==i){
      // Bounded copy: never write past buf, and always NUL-terminate.
      // strncpy also zero-pads, so no uninitialized bytes are broadcast.
      strncpy(buf,nodename,sizeof(buf)-1);
      buf[sizeof(buf)-1]='\0';
    }
    MPI_Bcast(buf,81,MPI_CHAR,i,MPI_COMM_WORLD);
    buf[sizeof(buf)-1]='\0'; //defensive: received data is treated as a C string below
    string sbuf(buf);
    mpi_hostnames[i]=sbuf;
    mpi_hostPopulation[sbuf]++;
    
    // insert() only succeeds for the first rank seen on each host name
    pair<map<string,int>::iterator, bool> res(mpi_hostLeader_byName.insert(pair<string,int>(sbuf,i)));
    if(res.second)//new host!
      hostId++;
    mpi_hostIds.push_back(hostId);
    if(mpi_id==i){
      mpi_myHostId=hostId;
      if(res.second || !mpi_usingShm || (opt.use_shm_mmap && !opt.mmap_writePerHost && mpi_id==0)){ //new entry
        mpi_isHostLeader=true;
      }else{
        mpi_isHostLeader=false;
      }
    }
    // without shared memory every rank acts as its own leader
    mpi_hostLeader[i]=(mpi_usingShm ? mpi_hostLeader_byName[sbuf]:i);
  }

  //especially if we're using shm we have to create new comm objects per host and for all leaders
  if(opt.verbose)cout << "Setting up leader/drone channel (my color: "<<mpi_isHostLeader<<")"<<endl;
  MPI_Comm_split(MPI_COMM_WORLD,mpi_isHostLeader,0,&mpi_leaders_comm); //well actually non-leaders get one too, but it only goes to fellow drones.
  MPI_Comm_rank(mpi_leaders_comm,&mpi_myLeaderRank);
  
  if(opt.verbose)cout << "Setting up localhost channel (my color: "<<mpi_myHostId<<")"<<endl;
  MPI_Comm_split(MPI_COMM_WORLD,mpi_myHostId,0,&mpi_localhost_comm);
  MPI_Comm_rank(mpi_localhost_comm,&mpi_myLocalRank);
  //wow, that was easy.
  
#ifdef USE_SHM_SYSV    
  if(opt.use_shm_sysv){
    mpi_sysv_projid=getpid();
    MPI_Bcast(&mpi_sysv_projid,1,MPI_INT,0,MPI_COMM_WORLD); //everybody use node 0's pid as projid (sadly, not guaranteed to be clusterwide unique, but a clusterwide unique number id is a tall order to fill...)
  }
#endif
}

// Assigns a job (hasher or storage) to every rank, decides which hasher acts
// as assembler for each rank, creates the per-job communicator and shares each
// rank's job-communicator rank with everyone.
// Must be called collectively by every rank in MPI_COMM_WORLD, after mpi_init().
void mpi_initJobs(){
  //synchronize machine statistics
  mpi_worldMemory.resize(mpi_procs,0);
  if(!mpi_usingShm)
    totalSequenceMemory*=mpi_hostPopulation[mpi_hostnames[mpi_id]];
  else
    MPI_Bcast(&totalSequenceMemory,1,MPI_UNSIGNED_LONG,0,mpi_localhost_comm); //first make sure everyone on this host knows how much memory is getting spent on sequence storage.
  mpi_worldMemory[mpi_id]=(opt.targetMemory-totalSequenceMemory)/mpi_hostPopulation[mpi_hostnames[mpi_id]];
  mpi_totalMemory=0;
  vector<int> node_memoryRank(mpi_procs);
  for(int i=0;i<mpi_procs;i++){
    MPI_Bcast(&mpi_worldMemory[i],1,MPI_UNSIGNED_LONG,i,MPI_COMM_WORLD);
    mpi_totalMemory+=mpi_worldMemory[i];
    node_memoryRank[i]=i;
  }

  //automatically assign jobs based on who has the most ram
  cout << "Assigning jobs"<<endl;
  mpi_jobs.resize(mpi_procs);
  if(opt.mpi_bigFirst)
    sort(node_memoryRank.begin(),node_memoryRank.end(),gt_mpi_worldMemory());
  else
    sort(node_memoryRank.begin(),node_memoryRank.end(),lt_mpi_worldMemory());
  map<string,int> jobsAssigned; //keep track of how many storage jobs have been assigned to each node
  int maxJobs=0;
    
  // number of hashers: explicit option, otherwise a quarter of the ranks
  int mid=opt.mpi_hashers ? opt.mpi_hashers:mpi_procs/4;
  if(mid<=0)
    mid=1; //need at least 1 to do final assembly!
  mpi_hasherIds.resize(mid,-1);
  int hasherId=mid; //hasher indices are handed out from mid-1 down to 0

  vector<vector<int> > hashersOn(mpi_hostnames.size()); //for later use
  mpi_assemblerIds.resize(mpi_procs,-1);
  for(int i=0;i<mpi_procs;i++){
    int node=-1;
    // the first `mid` picks become hashers, everything after that is storage
    MPI_Job job=i<mid ? JOB_HASHER:JOB_STORAGE;
    do{ //until we find one...
      vector<int>::iterator picki=node_memoryRank.begin();
      for(int pick=0;pick<(int)node_memoryRank.size();pick++,++picki)
        if(!opt.mpi_hostbalance || jobsAssigned[mpi_hostnames[node_memoryRank[pick]]]<=maxJobs){
          node=node_memoryRank[pick];
          mpi_jobs[node]=job;
          jobsAssigned[mpi_hostnames[node]]++;
          node_memoryRank.erase(picki);
          goto NodePicked;
        }
      //couldn't successfully find one, so, up maxJobs and try again
      maxJobs++;
    }while(node<0);
  NodePicked:
    if(job==JOB_HASHER){
      hasherId--;
      assert(hasherId>=0);
      if(node==mpi_id)
        mpi_myHasherId=hasherId;
      mpi_hasherIds[hasherId]=node;
      hashersOn[mpi_hostIds[node]].push_back(node);
    }
    mpi_jobCount[job]++;
  }

  //pick final assemblers
  if(!opt.mpi_distMerge && opt.mpi_distCollect){ //can only use 1 out of n hashers as  merger, so pick carefully
    multimap<int,int> assemblingFor; // client count -> server id
    vector<int> assemblingCount(mpi_procs);
    
    // pass 1: a storage rank prefers the least-loaded hasher on its own host
    for(int i=0;i<mpi_procs;i++){
      if(mpi_jobs[i]==JOB_HASHER)
        continue;
      
      assemblingFor.clear();
      for(vector<int>::iterator hi=hashersOn[mpi_hostIds[i]].begin();
          hi!=hashersOn[mpi_hostIds[i]].end();
          ++hi)
        assemblingFor.insert(pair<int,int>(assemblingCount[*hi],*hi));
      
      if(!assemblingFor.empty()){
        int hi=assemblingFor.begin()->second; //most unused on this node
        mpi_assemblerIds[i]=hi;
        assemblingCount[hi]++;
      }
    }
    
    // Drop whatever the last per-host pass left behind: those entries carry
    // counts from before that pass's increment and would otherwise let a
    // hasher be chosen below on a stale (too low) client count.
    assemblingFor.clear();

    //for the remaining nodes to quickly find the least used node
    for(vector<int>::iterator hi=mpi_hasherIds.begin();hi!=mpi_hasherIds.end();++hi){
      assemblingFor.insert(pair<int,int>(assemblingCount[*hi],*hi));
    }
    
    // pass 2: everyone still unassigned takes the globally least-loaded hasher
    for(int i=0;i<mpi_procs;i++){
      if(mpi_assemblerIds[i]!=-1)
        continue;
      multimap<int,int>::iterator hi=assemblingFor.begin();
      mpi_assemblerIds[i]=hi->second;
      assemblingCount[hi->second]++;
      assemblingFor.erase(hi);
      // re-insert under the updated count so the map stays ordered by load
      assemblingFor.insert(pair<int,int>(assemblingCount[mpi_assemblerIds[i]],mpi_assemblerIds[i]));
    }
  }else{
    //just to make this mpi_assemblerIds data consistent, set it anyway...
    for(int i=0;i<mpi_procs;i++){
      if(mpi_jobs[i]==JOB_STORAGE)
        mpi_assemblerIds[i]=mpi_hasherIds[0];
    }
  }

  //the "final assembler" is the one that actually writes out the anchor file
  mpi_finalAssembler=mpi_hasherIds[0]; 
  
  //set up global "0-n -> location" coordinate map
  //  mpi_fillHashPoints();

  //now that we have jobs, also init job channel
  if(opt.verbose)cout << "Setting up job channel (my color: "<<mpi_jobs[mpi_id]<<")"<<endl;
  // the split key is the hasher index, so hasher 0 (the final assembler) gets job rank 0
  MPI_Comm_split(MPI_COMM_WORLD,(int)mpi_jobs[mpi_id],mpi_myHasherId,&mpi_job_comm);
  MPI_Comm_rank(mpi_job_comm,&mpi_myJobRank);
  if(mpi_id==mpi_finalAssembler)
    assert(mpi_myJobRank==0);  
  cout << "Final assembler is "<<mpi_finalAssembler<<endl;

  //just for sanity's sake, make everyone aware of others jobid's
  mpi_worldId2jobId.resize(mpi_procs);
  mpi_worldId2jobId[mpi_id]=mpi_myJobRank;
  for(uint i=0;i<(uint)mpi_procs;i++)
    MPI_Bcast(&mpi_worldId2jobId[i],1,MPI_INT,i,MPI_COMM_WORLD);
}

void mpi_fillHashPoints(){
  word count=0;
  for(unsigned seqi=0;seqi<seqs.size();seqi++){
    BitSequence *a=seqs[seqi]->fwd;
  fillRegionBrks_start: 
    if((a==seqs[seqi]->fwd && !opt.skipFwd) ||
       (a==seqs[seqi]->rev && !opt.skipRev))
      for(unsigned j=0;j<a->matchRegions.size();j++){
	SeqPos start=a->matchRegions[j].first+1;
	if(a==seqs[seqi]->rev) //flip
	  start=0-start;
	mpi_hashPoints[count]=Location(seqi,start);
	//	cout << "Defining global "<<count<<"~"<<(count+(a->matchRegions[j].second-a->matchRegions[j].first))<<" as a "<<(a->matchRegions[j].second-a->matchRegions[j].first+1)<<" long region at: "<<Location(seqi,start)<<endl;
	count+=a->matchRegions[j].second-a->matchRegions[j].first+1; //this needs to be the number of bases -in that region-
      }
    if(a==seqs[seqi]->fwd){
      a=seqs[seqi]->rev;
      goto fillRegionBrks_start;
    }
  }
  assert(count==globalCounted);
}

// Starts at global count 0: first sequence, forward strand, first match region.
GlobalHashPointIterator::GlobalHashPointIterator()
  : _count(0),
    _seqi(0),
    _bs(seqs[_seqi]->fwd),
    _regi(0)
{
}

// Starts at the beginning and immediately seeks forward to the region that
// contains global position x (seek() throws if there is none).
GlobalHashPointIterator::GlobalHashPointIterator(word x)
  : _count(0),
    _seqi(0),
    _bs(seqs[_seqi]->fwd),
    _regi(0)
{
  seek(x);
}

// Advances until the current region covers global position x or the last
// region is reached. Throws MurasakiException when even the last region ends
// before x.
void GlobalHashPointIterator::seek(word x){
  for(;;){
    const bool stillShort=(_count+span()-1<x);
    if(!stillShort || atLast())
      break;
    ++*this;
  }
  const bool exhausted=atLast() && _count+span()-1<x;
  if(exhausted)
    throw MurasakiException(string("Never found target GlobalHashPoint ")+dstring((long)x));
}

// Human-readable dump of the iterator state: sequence index and strand,
// region index out of region count, current region span and running count.
string GlobalHashPointIterator::debugString() const {
  const char *strandName=(_bs==seqs[_seqi]->fwd) ? "fwd":"rev";
  ostringstream out;
  out << "seqi=" << _seqi << strandName;
  out << " regi=" << _regi << "/" << _bs->matchRegions.size();
  out << " span=" << (_bs->matchRegions[_regi].second-_bs->matchRegions[_regi].first+1);
  out << " count=" << _count;
  return out.str();
}

// Global count accumulated over all regions before the current one.
word GlobalHashPointIterator::counted() const
{
  return _count;
}

// Length of the current match region (second - first + 1).
word GlobalHashPointIterator::span() const {
  const SeqPosPairArray::value_type &region=_bs->matchRegions[_regi];
  return region.second-region.first+1;
}

// Returns (global count at the start of the current region, Location of that
// start). The position is 1-based and negated on the reverse strand.
pair<word,Location> GlobalHashPointIterator::operator*() const {
  const bool onReverse=(_bs==seqs[_seqi]->rev);
  SeqPos begin=_bs->matchRegions[_regi].first+1;
  if(onReverse){ //flip
    begin=0-begin;
  }
  Location where(_seqi,begin);
  return pair<word,Location>(_count,where);
}

// Steps to the next match region: next region of the same strand, else the
// first region of the reverse strand, else the forward strand of the next
// sequence. The running count grows by the span just left behind.
// Must not be called when atLast() is true.
GlobalHashPointIterator& GlobalHashPointIterator::operator++(){
  assert(!atLast());
  _count+=span();
  const bool moreRegionsHere=(_regi+1<_bs->matchRegions.size());
  if(moreRegionsHere){
    ++_regi;
  }else{
    _regi=0;
    if(_bs==seqs[_seqi]->fwd){
      //forward strand exhausted: continue on the reverse strand
      _bs=seqs[_seqi]->rev;
    }else{
      //both strands exhausted: move on to the next sequence
      ++_seqi;
      _bs=seqs[_seqi]->fwd;
    }
  }
  return *this;
}

// True when positioned on the final region: last sequence, not on its forward
// strand, and on that strand's last match region.
bool GlobalHashPointIterator::atLast() const {
  if(_seqi+1<seqs.size())
    return false;
  if(_bs==seqs[_seqi]->fwd)
    return false;
  return _regi+1>=_bs->matchRegions.size();
}

bool GlobalHashPointIterator::sanityCheck() const {
  GlobalHashPointIterator test(*this);
  while(!test.atLast())
    ++test;
  cout << "Sanity check: counted="<<test.counted()<<"+span="<<test.span()<<"->"<<(test.counted()+test.span())<<"=="<<globalCounted<<endl;
  assert(test.counted()+test.span()==globalCounted);
  return test.counted()+test.span()==globalCounted;
}

// Hasher job main loop.
// Hashes every window position in this rank's slice of the global position
// space [mpi_hashOwn[mpi_id].first, mpi_hashOwn[mpi_id].second) using pattern
// `pat`, and sends each (hash, location) message to the storage rank whose
// hash range contains it (looked up through mpi_storeBrkStop). Messages are
// batched into MessageBlocks of mpi_bufferSize entries and sent with
// synchronous-mode non-blocking sends; a zero-length block tells a storage
// rank that this hasher is finished.
void mpi_hasher_client_mode(BitSequence *pat){
  writeOut(opt.status_record,"MPI>Hasher client starting.",true);  

  //prepare message buffers
  int peakBufferBlocks=0;
  // one queue of blocks per world rank; only storage ranks get an initial block
  vector<list<MessageBlock> > sendBuffers(mpi_procs);
  for(int i=0;i<mpi_procs;i++)
    if(mpi_jobs[i]==JOB_STORAGE)
      sendBuffers[i].push_back(MessageBlock());
  
  word globalStart=mpi_hashOwn[mpi_id].first,globalEnd=mpi_hashOwn[mpi_id].second;
  word globalPos=globalStart;
  ticker.reset(mpi_hashOwn[mpi_id].second-mpi_hashOwn[mpi_id].first);
  cout << "My global region is "<<globalPos<<" to "<<globalEnd<<endl;
  Timer hashStart;
  // Each pass of this loop locates the region containing globalPos and then
  // hashes forward through the remaining regions of that strand.
  for(word globalInc=0;globalPos<globalEnd;globalInc++){
    GlobalHashPointIterator locIte(globalPos);
    //locIte.seek(globalPos);
    // how far into the located region globalPos lies
    SeqPos offset=globalPos-locIte.counted();
    // only the very first pass may start in the middle of a region
    assert(globalInc==0 || !offset);
    assert(locIte.sanityCheck());
    assert(locIte.counted()<=globalPos);
    assert(locIte.counted()+locIte.span()>globalPos);

    Location loc=(*locIte).second;
    assert(globalPos>=(*locIte).first);
    BitSequence *bitseq=loc.bitSeq();
    Sequence *s=loc.seq();
    // reverse-strand positions are reported as negative numbers
    int sign=bitseq==s->fwd ? 1:-1;
    //    cout << "Global increment: "<<globalInc<<" pos: "<<globalPos<<" end: "<<globalEnd<<" offset="<<offset<<" region start: "<<loc<<endl;

    SeqPosPairArray::iterator region=loc.localRegionIte(),
      endRegion=bitseq->matchRegions.end();
    for(;globalPos<globalEnd && region!=endRegion;++region,offset=0){ //then do all the regions in that sequence
#ifndef NDEBUG
      //      word initialGlobalPos=globalPos;
#endif
      SeqPos start=region->first+offset+1; //location style coords (start at 1), but always positive
      // last start position at which the whole pattern still fits in the region
      SeqPos stop=region->second-pat->length()+2;
      Window win(bitseq,start-1,pat); //bitseq coords
      word hash;
      SeqPos p;
      for(p=start;p<=stop;p+=opt.hashSkip){
	hash=win.hash();
	mpi_hashCount[mpi_id]++;
	Location here(s,sign<0 ? 0-p:p);
	//select target storage node
	// mpi_storeBrkStop is keyed by stop+1, so the first key greater than
	// hash belongs to the owning storage rank
	map<word,int>::iterator ite=mpi_storeBrkStop.upper_bound(hash);
	assert(ite!=mpi_storeBrkStop.end());
	assert(hash<ite->first); //must be within target's hash range!
	int target=(*ite).second;
	assert(target<mpi_procs);
	assert(target>=0);
	assert(mpi_jobs[target]==JOB_STORAGE);
	assert(mpi_storeOwn[target].first<=hash);
	assert(mpi_storeOwn[target].second>=hash);
	// append to the block currently being filled for that target
	sendBuffers[target].back().messages[sendBuffers[target].back().used++].set(hash,here);

	//	cout << "Hashed global "<<globalPos<<"~"<<start<<" "<<sign<<":("<<here<<win.prettyString()<<") to "<<hash<<" for "<<target<<endl;
	
	//see if we just filled up a buffer. if so, start it sending and add another one
	if(sendBuffers[target].back().used>=mpi_bufferSize){
	  assert(sendBuffers[target].back().used<=mpi_bufferSize);
#ifndef NDEBUG
	  //make sure everything in that buffer is sane
	  for(int mi=0;mi<sendBuffers[target].back().used;mi++){
	    assert(sendBuffers[target].back().messages[mi].key>=mpi_storeOwn[target].first);
	    assert(sendBuffers[target].back().messages[mi].key<=mpi_storeOwn[target].second);
	  }
	  //	  assert(cout << "Sending "<<sendBuffers[target].back().used<<" "<<sizeof(MPI_HashMessage)<<" byte messages to "<<target<<endl);
	  assert(sendBuffers[target].back().used<=mpi_bufferSize);
#endif
	  // the block is sent as raw bytes, not with MPI_HashMessage_type
	  MPI_Issend(sendBuffers[target].back().messages,sendBuffers[target].back().used*sizeof(MPI_HashMessage),MPI_BYTE,target,0,MPI_COMM_WORLD,&(sendBuffers[target].back().request));
	  sendBuffers[target].back().stored=1; //tag as sent.

	  // Reap completed sends for this target. When opt.mpi_maxBuffers is
	  // set and the queue is longer than that, block on sends until the
	  // queue is short enough again.
	  do {
	    //erase any finished buffers on this target...
	    MPI_Status status;
	    int send_done;
	    for(list<MessageBlock>::iterator bite=sendBuffers[target].begin();
		bite!=sendBuffers[target].end();){
	      // step the loop iterator first so erasing `here` cannot invalidate it
	      list<MessageBlock>::iterator here=bite;
	      ++bite;
	      send_done=0;
	      if(here->stored){
		if(opt.mpi_maxBuffers && sendBuffers[target].size()>opt.mpi_maxBuffers){ //if currently overly full...
		  MPI_Wait(&(here->request),&status);
		  send_done=1;
		}else{
		  MPI_Test(&(here->request),&send_done,&status);
		}
		if(send_done){
		  sendBuffers[target].erase(here);
		}
	      }
	      else{break;} //these are guaranteed to be sent in order, so anything after here also hasn't been sent.
	    }
	  }while(opt.mpi_maxBuffers && sendBuffers[target].size()>opt.mpi_maxBuffers);
	  
	  sendBuffers[target].push_back(MessageBlock()); //add a fresh buffer
	  peakBufferBlocks=max<int>(peakBufferBlocks,sendBuffers[target].size());
	}
	
	//      NextHash:
	ticker.tick(globalPos-globalStart);
	
	// NOTE(review): with hashSkip>1 the window slides by hashSkip*2 while p and
	// globalPos advance by hashSkip; presumably slide() takes a different unit
	// than positions -- confirm against Window::slide.
	if(opt.hashSkip>1)
	  win.slide(opt.hashSkip*2);
	else
	  win.slide();
	globalPos+=opt.hashSkip;
	if(globalPos>=globalEnd){
	  goto DoneHashing;
	}
      }
      //without any hashSkip p ends a pattern length away from the region end, so need to add that in.
      //with pattern skips, it might go over, in which case subtraction is correct.
      globalPos+=(region->second-p+2); 
#ifndef NDEBUG
      //      SeqPos regionLength=(region->second-region->first)+1-initialOffset;
      //      assert(globalPos==regionLength+initialGlobalPos);
#endif
    }
  }
 DoneHashing:
  cout << endl << "Adding 'finish' notifications to the send queue..."<<endl;
  for(int target=0;target<mpi_procs;target++){
    if(mpi_jobs[target]==JOB_STORAGE){
      // a partly filled last block still has to be sent, so the empty "done"
      // block is queued behind it
      if(sendBuffers[target].back().used)
	sendBuffers[target].push_back(MessageBlock()); //empty is a party invitation. if it was never used, that will work just fine as an invitation.
    }
  }

  cout << "Delivering remaining hash messages..."<<endl;
  // start sends for every block that has not been posted yet (including the
  // zero-length finish notifications)
  for(int target=0;target<mpi_procs;target++){
    if(sendBuffers[target].size()){
      for(list<MessageBlock>::iterator bite=sendBuffers[target].begin();
	  bite!=sendBuffers[target].end();++bite){
	if(!bite->stored){
	  assert(bite->used<=mpi_bufferSize);
#ifndef NDEBUG
	  //make sure everything in that buffer is sane
	  for(int mi=0;mi<bite->used;mi++){
	    assert(bite->messages[mi].key>=mpi_storeOwn[target].first);
	    assert(bite->messages[mi].key<=mpi_storeOwn[target].second);
	  }
	  //	  assert(cout << "Sending "<<bite->used<<" messages to "<<target<<endl);
#endif
	  MPI_Issend(bite->messages,bite->used*sizeof(MPI_HashMessage),MPI_BYTE,target,0,MPI_COMM_WORLD,&(bite->request));
	  bite->stored=1; //tag as sent.
	}
      }   
    }
  }

  Timer hashDone;
  cout << "Hashing computations done in: "<<elapsed(hashStart,hashDone)<<endl;
  mpi_workTime[mpi_id]+=diff(hashDone,hashStart);
  cout << "Waiting for final messages to finish sending..."<<endl;
  // block until every posted send has completed, dropping blocks as they finish;
  // the loop ends on the first sweep that finds all queues empty
  bool stillSending=true;
  while(stillSending){
    MPI_Status status;
    int send_done;
    stillSending=false;
    for(int target=0;target<mpi_procs;target++){
      if(sendBuffers[target].size()){
	stillSending=true;
	for(list<MessageBlock>::iterator bite=sendBuffers[target].begin();
	    bite!=sendBuffers[target].end();){
	  list<MessageBlock>::iterator here=bite;
	  ++bite;
	  send_done=0;
	  if(here->stored){
	    MPI_Wait(&(here->request),&status);
	    sendBuffers[target].erase(here);
	  }
	}
      }
    }
  }
  Timer hashSendComplete;
  cout << "Delivery of remaining messages took: "<<elapsed(hashDone,hashSendComplete)<<endl;
  cout << "Overall hasher time: "<<elapsed(hashStart,hashSendComplete)<<endl;
  cout << "I'm still alive!"<<endl;
  cout << "Max message blocks in a queue at any one time: "<<peakBufferBlocks<<endl;
}

void mpi_storage_client_mode(){
  cout << "Initializing hash storage mode."<<endl;
  Timer storageStart;
  word peakBufferBlocks=0;
  int nodesLeft=mpi_jobCount[JOB_HASHER];
  typedef pair<int, list<MessageBlock> > ReceiverQueue;
  list<ReceiverQueue > receiverQueues; //local storage after pulling off the mpi stack
  word packets=0;
  vector<int> doneNodes(mpi_procs,0);
  for(int node=0;node<mpi_procs;node++)
    if(mpi_jobs[node]==JOB_HASHER){
      receiverQueues.push_back(ReceiverQueue(node,list<MessageBlock>(1))); //start with an empty block
      MessageBlock &block=receiverQueues.back().second.front();
      MPI_Irecv(block.messages,mpi_bufferSize*sizeof(MPI_HashMessage),MPI_BYTE,receiverQueues.back().first,0,MPI_COMM_WORLD,&block.request); //start receiving
    }

  list<ReceiverQueue>::iterator qi(receiverQueues.begin()),qinext=qi;
  while(nodesLeft){ //until we get a "done" message from everyone.
    qi=qinext;
    ++qinext;
    if(qinext==receiverQueues.end())
      qinext=receiverQueues.begin();

    list<MessageBlock> &toStore=qi->second;
    MPI_Request &recv_req=toStore.back().request;

    if(toStore.empty() && doneNodes[qi->first]){
      cout << "Finished storing all data from "<<qi->first <<" early. Removing from queue."<<endl;
      receiverQueues.erase(qi);
      if(receiverQueues.empty())
	break;
      continue; //completely done with this one
    }

    peakBufferBlocks=max<int>(peakBufferBlocks,toStore.size());
    int recv_done=0;
    MPI_Status recv_status;
    //store more crap in the hash
    do {
      if(toStore.front().used){ //if we have crap..
	mpi_storage_store(toStore);
      }
    }while(opt.mpi_maxBuffers && toStore.size()>opt.mpi_maxBuffers && toStore.front().used);

    if(doneNodes[qi->first]) //no more messages to receive here
      continue;

    //see if we got a message...
    MPI_Test(&recv_req,&recv_done,&recv_status);
    if(!recv_done){//no message yet
      continue;
    }
    //got a message
    packets++;

    MPI_Get_count(&recv_status,MPI_BYTE,&(toStore.back().used));
#ifndef NDEBUG
    if(recv_status.MPI_SOURCE!=qi->first){
      cout << "Bogon packet! Supposed to be received from "<<qi->first<<" but came from "<<recv_status.MPI_SOURCE<<endl;
      cout << "Contains: "<<toStore.back().used<<" bytes"<<endl;
    }
#endif
    assert(recv_status.MPI_SOURCE==qi->first); //better have received this from the node we thought was sending!

    assert((toStore.back().used%sizeof(MPI_HashMessage))==0);
    toStore.back().used/=sizeof(MPI_HashMessage);
    if(!toStore.back().used){ //got an empty (ie: done!)
      cout << "Node "<<recv_status.MPI_SOURCE<<" reported being done. "<<(nodesLeft-1)<<" left."<<endl;
      assert(!doneNodes[recv_status.MPI_SOURCE]);
      doneNodes[recv_status.MPI_SOURCE]=1;
      nodesLeft--;
      //this final message is technically useless, so just erase it now
      toStore.pop_back();
      if(!nodesLeft)
	break; //completely quick this message receiving schnick
    }else{
#ifndef NDEBUG      
      //scan everything we received to make sure it's all sane
      for(int mi=0;mi<toStore.back().used;mi++){
	MPI_HashMessage &msg=toStore.back().messages[mi];
	if(!(msg.key>=mpi_myStoreOffset)){
	  cerr << "Fresh off the wire from "<<recv_status.MPI_SOURCE<<", already borken: m#"<<mi<<" ("<<mi*sizeof(MPI_HashMessage)<<"): "<<msg.key
	       <<"<"<<mpi_myStoreOffset<<" wtf??? used="<<toStore.back().used<<" stored="<<toStore.back().stored<<" loc: "<<(long)msg.seqno<<","<<(long)msg.pos<<endl;
	  for(int nmi=max(mi-5,0);nmi<min(toStore.back().used,mi+5);nmi++){
	    MPI_HashMessage &nmsg=toStore.back().messages[nmi];
	    cerr << "Nearby: #"<<nmi<<": key="<<nmsg.key<<" loc="<<(long)nmsg.seqno<<","<<(long)nmsg.pos<<endl;
	  }
	}
	assert(msg.key>=mpi_myStoreOffset);
	assert(msg.key<mpi_myStoreOffset+hash_size);
      }      
#endif
      //      cout << "Adding another cake..."<<endl;
      toStore.push_back(MessageBlock()); //add another buffer to receive in while we process this one...
      MPI_Irecv(toStore.back().messages,mpi_bufferSize*sizeof(MPI_HashMessage),MPI_BYTE,qi->first,0,MPI_COMM_WORLD,&toStore.back().request); //start receiving
    }
  }
  Timer receiveDone;
  cout << "Finished receiving data in: "<<elapsed(storageStart,receiveDone)<<endl;

  size_t remaining=0;
  for(qi=receiverQueues.begin();qi!=receiverQueues.end();++qi)
    for(list<MessageBlock>::iterator mi=qi->second.begin();mi!=qi->second.end();++mi)
      remaining+=mi->used-mi->stored;
  if(remaining){
    cout << "Finishing up storage of final "<<remaining<<(remaining==1 ? " block":" blocks")<<endl;
    ticker.reset(remaining);
    for(qi=receiverQueues.begin();qi!=receiverQueues.end();++qi){
      list<MessageBlock> &toStore=qi->second;
      //final mad dash to finish
      while(!toStore.empty()){
	mpi_storage_store(toStore);
	ticker.tick();
      }
    }
    cout << endl;
  }

  Timer storageDone;
  mpi_workTime[mpi_id]+=diff(storageDone,storageStart);
  cout << "Most message blocks in queue at any one time: "<<peakBufferBlocks<<endl;
  cout << "Exclusive storage time: "<<elapsed(receiveDone,storageDone)<<endl;
  cout << "Total storage time: "<<elapsed(storageStart,storageDone)<<endl;
}

// Stores the next unstored message of the first block in `toStore` into the
// local hash (mfh), after rebasing its key by mpi_myStoreOffset, and bumps
// this rank's store counter. When the first block has been fully consumed it
// is removed from the list.
// Precondition: toStore is non-empty and its first block has stored < used.
void mpi_storage_store(list<MessageBlock> &toStore){
  MessageBlock &block=toStore.front();
  MPI_HashMessage &msg=block.messages[block.stored++];
  mpi_storeCount[mpi_id]++;
#ifndef NDEBUG
  if(!(msg.key>=mpi_myStoreOffset)){
    // report the real `stored` counter (it was printing `used` twice before)
    cerr << "Oh noes! "<<msg.key<<"<"<<mpi_myStoreOffset<<" wtf??? used="<<block.used<<" stored="<<block.stored<<" loc: "<<(long)msg.seqno<<","<<(long)msg.pos<<endl;
    if(block.stored>=2){
      // stored was already incremented, so the previous message sits at stored-2
      MPI_HashMessage &prior=block.messages[block.stored-2];
      cerr << "prior: key="<<prior.key<<" used="<<block.used<<" stored="<<block.stored<<" loc: "<<(long)prior.seqno<<","<<(long)prior.pos<<endl;
    }
    assert(msg.key>=mpi_myStoreOffset); //err, why are we receiving someone else's messagse??
  }
#endif
  msg.key-=mpi_myStoreOffset; //subtract to match my personal offset
  assert(msg.key<hash_size);
  mfh->add(msg.key,Location(msg.seqno,msg.pos));
  if(block.stored>=block.used){//we're done with that buffer
    toStore.pop_front();
  }
}

// Hasher-side collection loop for location sets sent by storage ranks.
// Each incoming set arrives as a header message (tag 0, any source) followed
// by a body of Location records (tag 1, from the header's sender). A
// zero-length header message means that sender is done. Completed sets are
// queued in `todo` and turned into anchors by mpi_procAnchorBlock(), either
// when the queue reaches opt.mpi_maxBuffers or during idle iterations if
// opt.mpi_anchorInSpareTime is set; whatever is left is processed at the end.
void mpi_merge_client_mode(){
  list<AnchorBlock* > todo; //things with a header & body
  list<AnchorBlock* > pending; //things with just a header
  int nodesLeft=mpi_jobCount[JOB_STORAGE];
  vector<int> doneNodes(mpi_procs,0);//just to make sure
  word packets=0;
  // progress is logged every mergeProgressDelay processed blocks (0 disables it)
  word mergeProgressDelay=opt.anchorProgressCheck;
  word mergeProgressCheck=mergeProgressDelay;

  if(!opt.mpi_distMerge){
    // without distributed merge only the storage ranks assigned to this
    // assembler will report in
    nodesLeft=0;
    for(uint i=0;i<(uint)mpi_procs;i++)
      if(mpi_assemblerIds[i]==mpi_id && mpi_jobs[i]==JOB_STORAGE){
	nodesLeft++;
      }
  }
  cout << "Hasher "<<mpi_myHasherId<<": Collecting locations into "<<(opt.mpi_distCollect ? "intermediate":"final")<<" anchors from ";
  if(opt.mpi_distMerge)
    cout << "all";
  else
    cout <<nodesLeft;
  cout <<" nodes."<<endl;

  Timer anchorStoreStart;
  bool spareTime;

  AnchorBlock *incoming=0; //waiting for header
  MPI_Status status;
  while(nodesLeft || !pending.empty()){
    // any communication progress during this iteration clears spareTime
    spareTime=opt.mpi_anchorInSpareTime;
    if(nodesLeft){
      if(!incoming){ //start a new receive
	incoming=new AnchorBlock();
	MPI_Irecv(&incoming->header,sizeof(MPI_AnchorHeader),MPI_BYTE,MPI_ANY_SOURCE,0,MPI_COMM_WORLD,&incoming->headerReq);
	spareTime=false;
      }
      MPI_Test(&incoming->headerReq,&incoming->headerStored,&status);
      if(incoming->headerStored){ //is it done?
	incoming->owner=status.MPI_SOURCE;
	int payloadSize;
	MPI_Get_count(&status,MPI_BYTE,&payloadSize);
	if(!payloadSize){ //done msg
	  cout << "Node "<<incoming->owner<<" announced being done. "<<(nodesLeft-1)<<" left."<<endl;
	  assert(incoming->owner>=0);
	  assert(incoming->owner<mpi_procs);
	  assert(mpi_jobs[incoming->owner]==JOB_STORAGE);
	  assert(!doneNodes[incoming->owner]);
	  doneNodes[incoming->owner]=1;
	  nodesLeft--;
	  delete incoming;
	  incoming=0;
	}else{
	  // real header: size the body buffer and post the matching body receive;
	  // ownership of the block moves to `pending`
	  incoming->locs.resize(incoming->header.count);
	  pending.push_back(incoming);
	  MPI_Irecv(&incoming->locs.front(),sizeof(Location)*incoming->header.count,MPI_BYTE,incoming->owner,1,MPI_COMM_WORLD,&incoming->bodyReq); //note: tag 1
	  incoming=0;
	  spareTime=false;
	}
      }
    }

    //    word checked=1; //for debug only

    //check status of pending receives
    for(list<AnchorBlock*>::iterator i=pending.begin();i!=pending.end();){
      MPI_Test(&(*i)->bodyReq,&(*i)->bodyStored,&status);
      if((*i)->bodyStored){
	int check_count;
	MPI_Get_count(&status,MPI_BYTE,&check_count);
	check_count/=sizeof(Location);
	assert((word)check_count==(*i)->header.count);
	// advance before erasing so the loop iterator stays valid
	list<AnchorBlock*>::iterator temp=i;
	todo.push_back(*i);
	++i;
	pending.erase(temp);
	packets++;
	spareTime=false;
      }else{
	++i;
      }
    }
    
    //process a loc set if we have it.
    // keeps processing while the backlog is at or above opt.mpi_maxBuffers;
    // otherwise handles at most one block, and only in spare time
    do {
      if(((opt.mpi_maxBuffers && todo.size()>=opt.mpi_maxBuffers) || spareTime) && !todo.empty()){
	mpi_procAnchorBlock(todo);
	if(anchorProgress_fh && mergeProgressDelay && !(--mergeProgressCheck)){
	  SystemInfo infoNow;
	  *anchorProgress_fh<<anchors->count()<<"\t"<<infoNow.freeMemory<<endl;
	  mergeProgressCheck=mergeProgressDelay;
	}
      }
    }while(opt.mpi_maxBuffers && todo.size()>=opt.mpi_maxBuffers);
  }
  Timer anchorReceiveDone;
  cout << "Collected "<<packets<<" packets."<<endl;
  cout << "All location data received in: "<<elapsed(anchorStoreStart,anchorReceiveDone)<<endl;

  //finish any remaining todos
  cout << "Remaining location sets to process: "<<todo.size()<<endl;
  ticker.reset(todo.size());
  while(!todo.empty()){
    mpi_procAnchorBlock(todo);
    if(anchorProgress_fh && mergeProgressDelay && !(--mergeProgressCheck)){
      SystemInfo infoNow;
      *anchorProgress_fh<<anchors->count()<<"\t"<<infoNow.freeMemory<<endl;
      mergeProgressCheck=mergeProgressDelay;
    }
    ticker.tick();
  }
  cout << endl;
  Timer anchorStoreDone;
  mpi_workTime[mpi_id]+=diff(anchorStoreDone,anchorStoreStart);
  cout << "Exclusive anchor generation completed in: "<<elapsed(anchorReceiveDone,anchorStoreDone)<<endl;
  cout << "Total anchors storage time: "<<elapsed(anchorStoreStart,anchorStoreDone)<<endl;
  cout << "Total "<<(opt.mpi_distCollect ? "intermediate":"final")<<" anchors: "<<anchors->count()<<endl;
}

// Consume the block at the head of `todo`: split its flat location array into
// one sub-list per sequence, then either record it as a repeat (when the
// merge filter trips) or hand it to procLocs for anchor construction.
// The block is deleted and removed from `todo` in every case.
void mpi_procAnchorBlock(list<AnchorBlock*> &todo){
  AnchorBlock* const current=todo.front();
  LocList perSeq(seq_count);
  //set up variables to pretend we're a non-mpi run
  activeHash=current->header.hashKey;

  //demux the flat location array into the per-sequence lists
  for(size_t n=0;n<current->locs.size();++n){
    Location &loc=current->locs[n];
    perSeq[loc.seqno].push_back(loc);
  }

  //the running product of per-sequence hit counts is compared against the
  //merge filter after every factor, stopping at the first excess
  bool overFilter=false;
  if(opt.mergeFilter){
    word product=1;
    for(int s=0;s<seq_count && !overFilter;s++){
      product*=perSeq[s].size();
      if(product>opt.mergeFilter)
        overFilter=true;
    }
  }

  if(overFilter){
    if(repeats)
      repeats->add(perSeq);
  }else{
    list<Location> start; //localize this
    procLocs(perSeq,start,opt.rifts);  //and go
  }

  delete current;   //free that block
  todo.pop_front(); //and drop it from the queue
  mpi_mergeCount[mpi_id]++;
}

// Extractor side of anchor generation.
//
// Walks every slot of the local hash table (mfh), pulls out the matching
// location sets, packs each acceptable set into a heap-allocated AnchorBlock
// and ships it to a receiver with two synchronous non-blocking sends
// (header on tag 0, location body on tag 1, both on MPI_COMM_WORLD).
// Once every block has been delivered, a zero-length tag-0 message is sent to
// each receiver to signal completion.
//
// Receiver selection:
//   * opt.mpi_maxBuffers != 0: each receiver starts with that many credits;
//     the receiver with the most remaining credits is chosen, and credits are
//     returned by mpi_extract_checkSentQueue when a send completes.
//   * otherwise: plain round robin over `receivers`.
//
// `pat` is not used by this function.
void mpi_extract_client_mode(BitSequence *pat){
  cout << "Preparing receiver scheduler..."<<endl;
  Timer anchorSendStart;

  vector<int> receivers; //list of receivers to choose from
  if(opt.mpi_distMerge){
    receivers.reserve(mpi_hasherIds.size());
    //initial order is shifted per extractor
    for(int i=mpi_myJobRank%mpi_hasherIds.size();i<(int)mpi_hasherIds.size();i++)
      receivers.push_back(mpi_hasherIds[i]);
    for(int i=0;i<mpi_myJobRank%(int)mpi_hasherIds.size();i++)
      receivers.push_back(mpi_hasherIds[i]);
    assert(receivers.size()==mpi_hasherIds.size());
  }else{
    receivers.reserve(1);
    receivers.push_back(mpi_assemblerIds[mpi_id]);
  }

  vector<multimap<int,int>::iterator> bufferItes; //per receiver: iterator to its entry in `buffers`
  multimap<int,int> buffers; // free buffer count -> receiver index (only used if we have a max_buffer limit)
  int prevChoice=0; //last receiver index picked by the round-robin scheduler
  vector<list<AnchorBlock*> > sent(receivers.size()); //need a separate queue for each target

  if(opt.mpi_maxBuffers){
    bufferItes.reserve(receivers.size());
    for(uint i=0;i<receivers.size();i++){
      bufferItes.push_back(buffers.insert(pair<int,int>(opt.mpi_maxBuffers,i)));
    }
  }

  cout << "Extracting anchors from hash-space."<<endl;
  ticker.reset(hash_size);
  for(word base=0;base<(word)hash_size;base++,ticker.tick()){
    if(mfh->emptyAt(base))
      continue;

    list<LocList> fulllist;
    mfh->getMatchingSets(base,fulllist);

    for(list<LocList>::iterator seti=fulllist.begin();seti!=fulllist.end();++seti){
      LocList &loclist=*seti;
      //if we'll be culling repeats, but not storing them, we can eliminate
      //the repeats here before sending
      if(opt.mergeFilter && !opt.repeatMap){
        word combinations=1;
        for(int s=0;s<seq_count;s++){
          if(loclist[s].empty())
            continue; //an empty sequence must not zero the product
          combinations*=loclist[s].size();
        }
        if(combinations>opt.mergeFilter){
          continue; //abort here before even creating the AnchorBlock pointer.
        }
      }

      AnchorBlock* assemble=new AnchorBlock(base); //dump matches in here
      bool gotAllSeqs=true;
      word skips=opt.rifts; //how many sequences may be missing from this set

      for(int s=0;s<seq_count;s++){
        if(loclist[s].empty()){
          if(skips>0)
            skips--;
          else{
            gotAllSeqs=false;
            break;
          }
        }
        //(re)mux all the locsublists
        for(LocSubList::iterator site=loclist[s].begin();site!=loclist[s].end();++site)
          assemble->locs.push_back(*site);
      }

      if(gotAllSeqs){
        int receiver;
        if(opt.mpi_maxBuffers){
          //make sure we have a buffer available; poll the outstanding sends
          //until some receiver gets a credit back
          while(buffers.empty() || (*(buffers.rbegin())).first<1){
            mpi_extract_checkSentQueue(sent,bufferItes,buffers);
          }
          multimap<int,int>::iterator ite=buffers.end();
          --ite; //can't erase reverse iterators, so find end manually...
          pair<int,int> choice(*ite);
          buffers.erase(ite); //pop
          assert(choice.first>0); //must have receive buffers available.
          receiver=choice.second;
          if(choice.first>1) //any buffers left on that host?
            //put it back with one credit fewer; it probably belongs near the
            //front now, so use begin() as the insertion hint
            bufferItes[receiver]=buffers.insert(buffers.begin(),pair<int,int>(choice.first-1,choice.second));
          else
            bufferItes[receiver]=buffers.end();
        }else{
          //round robin. The wrapped index has to be stored back, otherwise
          //the counter runs past the end once and every later block is sent
          //to receiver 0.
          int choice=prevChoice+1;
          if((uint)choice>=receivers.size())
            choice=0;
          prevChoice=choice;
          receiver=choice;
        }

        int receiverId=receivers[receiver];
        assemble->header.count=assemble->locs.size();
        assemble->header.hashKey=base+mpi_myStoreOffset;
        assert(assemble->header.count>=(unsigned)seq_count-opt.rifts); //must have at least that many...
        mpi_extractLocCount[mpi_id]+=assemble->header.count;
        MPI_Issend(&assemble->header,sizeof(MPI_AnchorHeader),MPI_BYTE,receiverId,0,MPI_COMM_WORLD,&(assemble->headerReq));
        //note: body is sent with tag 1 so we can differentiate it from a new header.
        MPI_Issend(&assemble->locs.front(),sizeof(Location)*assemble->header.count,MPI_BYTE,receiverId,1,MPI_COMM_WORLD,&(assemble->bodyReq));
        sent[receiver].push_back(assemble); //tag as sent; freed by mpi_extract_checkSentQueue
        mpi_extractCount[mpi_id]++;
      }
      else {
        delete assemble; //too many sequences missing; nothing was sent
      }
    }
  }
  cout << endl; //for ticker
  Timer anchorSendProcDone;
  mpi_workTime[mpi_id]+=diff(anchorSendProcDone,anchorSendStart);
  cout << "Anchor extraction completed in: "<<elapsed(anchorSendStart,anchorSendProcDone)<<endl;

  while(mpi_extract_checkSentQueue(sent,bufferItes,buffers)) //finish sending
    ;

  Timer anchorSendAlmostDone;
  cout << "Signaling receiver that we're done (time so far: "<<elapsed(anchorSendStart,anchorSendAlmostDone)<<")"<<endl;
  int done=0;
  for(int receiver=0;receiver<(int)receivers.size();receiver++)
    MPI_Ssend(&done,0,MPI_BYTE,receivers[receiver],0,MPI_COMM_WORLD); //zero-length message: we're done.
  Timer anchorSendDone;
  cout << "Anchor extraction and delivery completed in: "<<elapsed(anchorSendStart,anchorSendDone)<<endl;
}

// Poll the outstanding anchor-block sends and release the ones that finished.
//
// sent       - one queue of in-flight blocks per receiver index
// bufferItes - per receiver, iterator to its entry in `buffers`
//              (buffers.end() when the receiver currently has no credits)
// buffers    - free buffer count -> receiver index; only touched when
//              opt.mpi_maxBuffers is set
//
// A block is deleted and dequeued once both its header and body sends test
// as complete; with opt.mpi_maxBuffers set, the receiver gets one credit back.
//
// Returns 1 if any block was found queued when this call examined it
// (including blocks released by this very call), 0 if every queue was empty.
word mpi_extract_checkSentQueue(vector<list<AnchorBlock*> > &sent,vector<multimap<int,int>::iterator> &bufferItes,multimap<int,int> &buffers){
  MPI_Status status;
  int bodyStored=0;
  bool packetsLeft=false;

  for(uint receiver=0;receiver<sent.size();receiver++)
    for(list<AnchorBlock* >::iterator ite=sent[receiver].begin();
        ite!=sent[receiver].end();){
      //plain assignment: ++ on a bool is ill-formed since C++17
      packetsLeft=true;
      AnchorBlock& block=**ite;
      if(!block.headerStored)
        MPI_Test(&block.headerReq,&block.headerStored,&status);
      if(block.headerStored){
        MPI_Test(&block.bodyReq,&bodyStored,&status);
        if(bodyStored){
          //advance first: erasing invalidates the iterator to this node
          list<AnchorBlock* >::iterator temp(ite);
          ++ite;
          delete *temp;
          sent[receiver].erase(temp);

          if(opt.mpi_maxBuffers){
            //give ourselves another buffer for this receiver
            multimap<int,int>::iterator mite(bufferItes[receiver]);
            int count=0;
            if(mite!=buffers.end()){
              count=(*mite).first;
              buffers.erase(mite);
            }
            bufferItes[receiver]=buffers.insert(pair<int,int>(count+1,receiver));
          }
        }else{
          ++ite;
        }
      }else{
        break; //packages are received in FIFO order
      }
    }
  return packetsLeft;
}

// "nocake" mode: hash every loaded sequence locally, forward strand first and
// then reverse strand, honouring opt.skipFwd / opt.skipRev.
void mpi_hashAndStore_client_mode(BitSequence *pat){
  cout << "There is no cake. (Launching cakeless hash+storage mode)."<<endl;
  unsigned idx=0;
  while(idx<seqs.size()){
    Sequence* const seq=seqs[idx];
    if(!opt.skipFwd){
      cout << "Hashing "+seq->name+" forwards\n";
      mpi_hashSeqLocal(seq->fwd,pat,1,seq);
    }
    if(!opt.skipRev){
      cout << "Hashing "+seq->name+" backwards\n";
      mpi_hashSeqLocal(seq->rev,pat,-1,seq);
    }
    ++idx;
  }
}

// Hash one strand of a sequence and store only the hits whose key falls into
// this node's slice of the hash space.
//
// bitseq - strand to scan (its matchRegions are walked in order)
// pat    - seed pattern; its length bounds the last window position
// sign   - multiplied into the position stored in each Location
// s      - owning sequence, recorded in each stored Location
void mpi_hashSeqLocal(BitSequence *bitseq,BitSequence *pat,int sign,Sequence *s){
  ticker.reset(bitseq->length()-1);
  for(SeqPosPairArray::iterator span=bitseq->matchRegions.begin();
      span!=bitseq->matchRegions.end();span++){
    SeqPos lastPos=span->second-pat->length()+2;
    SeqPos firstPos=span->first+1;
    Window win(bitseq,span->first,pat);
    word hash=win.hash();
    for(SeqPos p=firstPos;p<=lastPos;p+=opt.hashSkip){
      //rebase onto the local table; keys below the offset wrap around to
      //large values when word is unsigned and fail the range test below
      hash=win.hash()-mpi_myStoreOffset;
      if(hash<hash_size){ //in my region!
        mpi_storeCount[mpi_id]++;
        Location here(s,p*sign);
        bool rejected=false;
        //the filters only apply to slots that already hold something
        if(!mfh->emptyAt(hash)){
          if(opt.hashfilter && mfh->sizeAt(hash)>(unsigned)opt.hashfilter)
            rejected=true;
          else if(opt.seedfilter && mfh->sizeAt(hash,here)>(unsigned)opt.seedfilter)
            rejected=true;
        }
        if(!rejected)
          mfh->add(hash,here);
      }
      ticker.tick(p-1);

      //NOTE(review): the factor 2 presumably converts bases to bits - confirm against Window::slide
      if(opt.hashSkip>1)
        win.slide(opt.hashSkip*2);
      else
        win.slide();
    }
  }
  cout << endl; //for ticker
}

// Sender side of the distributed anchor merge.
//
// Streams every anchor in anchors->used[0] to job rank `mergeTarget` over
// mpi_job_comm using synchronous sends:
//   tag 0: seq_count UsedInt intervals (one per sequence)
//   tag 1: member count as one MPI_INT      (only with opt.retainMembers)
//   tag 2: the serialized members           (only with opt.retainMembers)
// A zero-length tag-0 message terminates the anchor stream.
//
// If `repeats` is set, each repeat cluster is then re-packed into an
// AnchorBlock and sent as header (tag 0) + locations (tag 1), again followed
// by a zero-length tag-0 terminator.
void mpi_anchorMergeClient(int mergeTarget){
  assert(anchors); //i'd better have data to send... (must precede the first dereference)
  cout << "Hasher "<<mpi_myJobRank<<": Sending "<<(anchors->used[0].size())<<" anchors to hasher "<<mergeTarget<<"..."<<endl;
  Timer anchorSendStart;

  if(!anchors->used[0].empty()){
    ticker.reset(anchors->used[0].size());
    //send anchor data
    for(usedItree::iterator i=anchors->used[0].begin();
        i!=anchors->used[0].end();
        ++i){
      //anchors have 2 parts we (might) need to send: intervals (the spans of
      //the anchor, critical) and members (what hashvals went into making
      //this anchor; only needed if opt.retainMembers is on)
      vector<UsedInt> intervals;
      assert((*i)->spaces.size()==(uint)seq_count);
      intervals.reserve(seq_count);
      for(vector<MapIte>::iterator j=(*i)->spaces.begin();
          j!=(*i)->spaces.end();
          ++j)
        intervals.push_back(j->key());
      MPI_Ssend(&intervals.front(),sizeof(UsedInt)*seq_count,MPI_BYTE,mergeTarget,0,mpi_job_comm);
      mpi_anchorSendCount[mpi_id]++;
      if(opt.retainMembers){
        //send member count
        int memberCount=(*i)->members.size();
        MPI_Ssend(&memberCount,1,MPI_INT,mergeTarget,1,mpi_job_comm);
        //serialize and send members
        vector<pair<word,SeqPos> > members;
        members.reserve(memberCount);
        for(HashCount::iterator j=(*i)->members.begin();j!=(*i)->members.end();++j)
          members.push_back(*j);

        //front() on an empty vector is undefined; a zero-byte send never
        //reads its buffer, so any valid address will do in that case
        void *memberBuf=members.empty() ? static_cast<void*>(&memberCount)
                                        : static_cast<void*>(&members.front());
        MPI_Ssend(memberBuf,sizeof(pair<word,SeqPos>)*memberCount,MPI_BYTE,mergeTarget,2,mpi_job_comm);
      }
      ticker.tick();
    }
    cout << endl;
  }
  Timer anchorSendDone;
  cout << "Anchor merge completed in: "<<elapsed(anchorSendStart,anchorSendDone)<<endl;
  //zero-length message on tag 0: no more anchors from this node
  MPI_Ssend(&anchorSendDone,0,MPI_BYTE,mergeTarget,0,mpi_job_comm);
  mpi_workTime[mpi_id]+=diff(anchorSendDone,anchorSendStart);

  if(repeats){
    cout << "Waiting for others to finish before sending repeats"<<endl;
    Timer repeatSendStart;
    cout << "Sending repeat data for "<<(repeats->clusters.size())<<" repeats."<<endl;
    //send meta-data (repeats)
    ticker.reset(repeats->clusters.size());
    for(list<LocList>::iterator ri=repeats->clusters.begin();
        ri!=repeats->clusters.end();++ri){
      AnchorBlock assemble; //basically have to reassemble the original block
      for(int si=0;si<seq_count;si++)
        for(LocSubList::iterator j=(*ri)[si].begin();
            j!=(*ri)[si].end();j++)
          assemble.locs.push_back(*j);
      assemble.header.count=assemble.locs.size();
      MPI_Ssend(&assemble.header,sizeof(MPI_AnchorHeader),MPI_BYTE,mergeTarget,0,mpi_job_comm);
      MPI_Ssend(&assemble.locs.front(),sizeof(Location)*assemble.header.count,MPI_BYTE,mergeTarget,1,mpi_job_comm);
      ticker.tick();
    }
    //zero-length message on tag 0: no more repeats from this node
    MPI_Ssend(&repeatSendStart,0,MPI_BYTE,mergeTarget,0,mpi_job_comm);
    Timer repeatSendDone;
    cout << "Repeat data send completed in: "<<elapsed(repeatSendStart,repeatSendDone)<<endl;
    mpi_workTime[mpi_id]+=diff(repeatSendDone,repeatSendStart);
  }
}

// Receiver side of the distributed anchor merge.
//
// For each rank in `senders` (ranks in mpi_job_comm) a non-blocking receive is
// kept posted and polled with MPI_Test. Every sender has its own small state
// machine, stored in senderStates:
//    0  waiting for the interval array     (tag 0)
//    1  waiting for the member count       (tag 1, only with opt.retainMembers)
//    2  waiting for the member array       (tag 2, only with opt.retainMembers)
//    3  a complete anchor is buffered and gets inserted into `anchors`
//   -1  sender has sent its zero-length "done" message
// After all senders are done, and if `repeats` is set, the same polling scheme
// collects repeat blocks (header on tag 0, locations on tag 1) and adds them
// to `repeats`.
void mpi_anchorMergeServer(const vector<int> &senders){
  //per-sender receive state; index is the position in `senders`, not the rank
  vector<MPI_Request> activeReq(senders.size());
  vector<vector<UsedInt> > intervals(senders.size());
  vector<int> memberCount(senders.size());
  vector<vector<pair<word,SeqPos> > > members(senders.size());
  vector<int> senderStates(senders.size(),0);

  cout << "Hasher "<<mpi_myJobRank<<": Merging distributed anchor data (for "<<senders.size()<<" "<<(senders.size()==1 ? "node: ":"nodes: ");
  copy(senders.begin(),senders.end(),ostream_iterator<int>(cout, " "));
  cout << ")..."<<endl;
  Timer anchorMergeStart;

  MPI_Status status;

  for(uint sender=0;sender<senders.size();sender++){ //start initial receives
    intervals[sender].resize(seq_count);
    MPI_Irecv(&intervals[sender].front(),sizeof(UsedInt)*seq_count,MPI_BYTE,senders[sender],0,mpi_job_comm,&activeReq[sender]);
  }
  uint nodesLeft=senders.size();

  int payloadSize;
  int stored;
  //busy-poll every still-active sender until all have reported done
  while(nodesLeft){
    for(uint sender=0;sender<senders.size();sender++){
      if(senderStates[sender]<0)continue;
      stored=0;
      MPI_Test(&activeReq[sender],&stored,&status);
      if(stored){
	//first switch: interpret the message that just arrived
	switch(senderStates[sender]){
	case -1: break; //already done
	case 0: //receiving intervals
	  MPI_Get_count(&status,MPI_BYTE,&payloadSize);
	  if(payloadSize<1){//reported done
	    nodesLeft--;
	    senderStates[sender]=-1;
	    break;
	  }
	  mpi_anchorRecvCount[mpi_id]++;
	  assert((uint)payloadSize==sizeof(UsedInt)*seq_count);
	  if(opt.retainMembers){
	    senderStates[sender]++;
	    break;
	  }else{
	    senderStates[sender]=3; //skip to full
	  }
	  break;
	case 1: //receiving member count
	  MPI_Get_count(&status,MPI_INT,&payloadSize);
	  assert(payloadSize==1);
	  //size the buffer for the member array that follows on tag 2
	  members[sender].resize(memberCount[sender]);
	  senderStates[sender]++;
	  break;
	case 2: //receiving members
	  MPI_Get_count(&status,MPI_BYTE,&payloadSize);
	  assert((uint)payloadSize==(sizeof pair<word,SeqPos>(0,0))*memberCount[sender]);
	  senderStates[sender]++;
	  break;
	default:
	  throw MurasakiException("Invalid state in anchorMergeServer (recv switch)");
	}
	
	if(senderStates[sender]==3){ //full set
	  //without opt.retainMembers, members[sender] is never resized here and stays empty
	  IntervalSet a(intervals[sender].begin(),intervals[sender].end(),
			members[sender].begin(),members[sender].end());
	  anchors->insert(a);
	  senderStates[sender]=0; //time to start next
	}
	
	//second switch: post the receive that matches the new state
	switch(senderStates[sender]){ //start next receive
	case -1:break; //done
	case 0:
	  MPI_Irecv(&intervals[sender].front(),sizeof(UsedInt)*seq_count,MPI_BYTE,senders[sender],0,mpi_job_comm,&activeReq[sender]);
	  break;
	case 1:
	  MPI_Irecv(&memberCount[sender],1,MPI_INT,senders[sender],1,mpi_job_comm,&activeReq[sender]);
	  break;
	case 2:
	  //NOTE(review): front() is called on an empty vector when memberCount[sender]==0 - confirm senders never report zero members
	  MPI_Irecv(&members[sender].front(),sizeof(pair<word,SeqPos>)*memberCount[sender],MPI_BYTE,senders[sender],2,mpi_job_comm,&activeReq[sender]);
	  break;
	default:
	  throw MurasakiException("Invalid state in anchorMergeServer (start new receive switch)");
	}
      }
    }
  }
  Timer anchorMergeDone;
  cout << "Anchor merge completed in: "<<elapsed(anchorMergeStart,anchorMergeDone)<<endl;
  if(opt.verbose)
    cout << "Anchors at this point: "<<anchors->used[0].size()<<endl;

  mpi_workTime[mpi_id]+=diff(anchorMergeDone,anchorMergeStart);
  if(repeats){
    //second phase: collect repeat blocks. States here: 0 = waiting for a
    //header, 1 = waiting for the location body, -1 = sender done.
    Timer repeatRecvStart;
    cout << "Receiving repeat data."<<endl;
    vector<AnchorBlock> assemble(senders.size());
    for(uint sender=0;sender<senders.size();sender++){
      senderStates[sender]=0;
      MPI_Irecv(&assemble[sender].header,sizeof(MPI_AnchorHeader),MPI_BYTE,senders[sender],0,mpi_job_comm,&activeReq[sender]);
    }
    
    nodesLeft=senders.size();
    while(nodesLeft){
      for(uint sender=0;sender<senders.size();sender++){
	if(senderStates[sender]<0)continue;
	stored=0;
	MPI_Test(&activeReq[sender],&stored,&status);

	if(stored){
	  //an empty message counts as "done" whichever state the sender is in
	  MPI_Get_count(&status,MPI_BYTE,&payloadSize);
	  if(payloadSize<1){//reported done
	    cout << "Node "<<senders[sender]<<" (sender "<<sender<<") reported done."<<endl;
	    nodesLeft--;
	    senderStates[sender]=-1;
	    //no switch encloses this break, so it leaves the for loop; the
	    //enclosing while restarts the scan at sender 0
	    break;
	  }

	  switch(senderStates[sender]){
	  case 0: //received header
	    //header.count tells how many Locations the body will carry
	    assemble[sender].locs.resize(assemble[sender].header.count);
	    MPI_Irecv(&assemble[sender].locs.front(),sizeof(Location)*assemble[sender].header.count,MPI_BYTE,senders[sender],1,mpi_job_comm,&activeReq[sender]);
	    senderStates[sender]++;
	    break;
	  case 1:
	    //have to demux into loclist then add
	    {//simply need to scope loclist
	      LocList loclist(seq_count);
	      for(vector<Location>::iterator i=assemble[sender].locs.begin();
		  i!=assemble[sender].locs.end();
		  ++i)
		loclist[i->seqno].push_back(*i);
	      repeats->add(loclist);

	      //start next recv of header
	      senderStates[sender]=0;
	      MPI_Irecv(&assemble[sender].header,sizeof(MPI_AnchorHeader),MPI_BYTE,senders[sender],0,mpi_job_comm,&activeReq[sender]);
	    }
	    break;
	  default:throw MurasakiException("Invalid state in anchorMergeServer (repeat recv switch)");
	  }
	}
      }
    }
    Timer repeatRecvDone;
    //NOTE(review): the message below says "send" although this is the receiving side
    cout << "Repeat data send completed in: "<<elapsed(repeatRecvStart,repeatRecvDone)<<endl;
    mpi_workTime[mpi_id]+=diff(repeatRecvDone,repeatRecvStart);
  }
}

// Broadcast the global sequence statistics from world rank 0 to every rank.
// Only valid on an MPI-capable run with opt.mpi_fileDistro enabled.
void mpi_syncSeqCounts(){
  assert(mpi_capable);
  assert(opt.mpi_fileDistro);
  //these can also be hostleader->drones, but for now, 0->world will do
  const int root=0;
  void* const scalars[]={&longestSeq,&totalSeqLength,&totalHashLength,&globalCounted};
  const unsigned scalarCount=sizeof(scalars)/sizeof(scalars[0]);
  for(unsigned k=0;k<scalarCount;++k)
    MPI_Bcast(scalars[k],1,MPI_UNSIGNED_LONG,root,MPI_COMM_WORLD);
  //the per-base counters travel as one 4-element array
  MPI_Bcast(globalBaseCount,4,MPI_UNSIGNED_LONG,root,MPI_COMM_WORLD);
}

// Default constructor: starts with both counters at zero and allocates a
// message array of mpi_bufferSize entries, released in the destructor.
MessageBlock::MessageBlock() :
  used(0),stored(0),
  messages(new MPI_HashMessage[mpi_bufferSize])
{
  assert(messages);
}

// Copy constructor: does NOT copy any messages. It allocates a fresh, empty
// array and asserts that the source block is empty as well.
MessageBlock::MessageBlock(const MessageBlock &a) :
  used(0),stored(0),
  messages(new MPI_HashMessage[mpi_bufferSize])
{ assert(!a.used); assert(!a.stored);} //only empty blocks may be copied
// Mark the block as empty again; the message array itself is kept for reuse.
void MessageBlock::reset(){
  stored=0;
  used=0;
}
// Release the message array allocated by the constructors.
MessageBlock::~MessageBlock(){
  delete[] messages; //delete[] on a null pointer is a no-op
}

// Default constructor: header key 0, nothing stored yet, owner -1.
AnchorBlock::AnchorBlock() :
  header(0),headerStored(0),bodyStored(0),owner(-1) {}
// Construct an empty block whose header carries hash key `key`.
AnchorBlock::AnchorBlock(word key) :
  header(key),headerStored(0),bodyStored(0),owner(-1) {}
// Copy constructor: carries over only the hash key and the owner, and asserts
// that the source has neither header nor body stored.
AnchorBlock::AnchorBlock(const AnchorBlock &a) :
  header(a.header.hashKey),headerStored(0),bodyStored(0),owner(a.owner) { assert(!a.headerStored); assert(!a.bodyStored);} //only empty blocks may be copied

// Build a message from a hash key and the sequence number / position of `l`.
MPI_HashMessage::MPI_HashMessage(const word& _key,const Location& l) :
  key(_key),seqno(l.seqno),pos(l.pos) {}
void MPI_HashMessage::set(const word& _key,const Location& l){
  key=_key;
  seqno=l.seqno;
  pos=l.pos;
}
MPI_HashMessage::MPI_HashMessage(){} //deliberately leaves key, seqno and pos uninitialized

// Header constructors: both set only hashKey (to `k`, or to 0 by default).
// No other member is initialized here; the senders in this file assign
// header.count before transmitting a header.
MPI_AnchorHeader::MPI_AnchorHeader(word k) : hashKey(k) {}
MPI_AnchorHeader::MPI_AnchorHeader() : hashKey(0) {}

void mpi_write_histogram(){
  if(opt.retainMembers){
    if(!dfCount)
      dfCount=anchors->makeDfCount(); //dfCount might still be null
    if(dfCount){
      cout << "Writing histogram..."<<endl;
      ofstream histo_fh(opt.hashHisto_record.c_str());
      vector<word> histo;
      for(word i=0;i<mpi_total_hash_size;++i)
	if(dfCount[i]){
	  if(histo.size()<=dfCount[i])
	    histo.resize(dfCount[i]+1,0);
	  histo[dfCount[i]]++;
	}
      for(word i=0;i<histo.size();i++)
      	histo_fh << i << "\t" << histo[i] << endl;
    }else{
      cout << "Histogram not possible in MPI without df table"<<endl;
    }
  }
}

#endif