Dear all,

I have a few questions regarding possible performance enhancements for the PETSc solver I included in my project.

It's a particle-in-cell plasma simulation written in C++, where Poisson's equation needs to be solved repeatedly on every timestep. The simulation domain is discretized using finite differences, so the solver needs to be able to efficiently solve the linear system A x = b again and again with a changing right-hand side b. The solution x of the previous timestep is generally a good initial guess for the next solve.

I wrote a class PETScSolver that holds all PETSc objects and necessary information about domain size and decomposition. To solve the linear system, two arrays, 'phi' and 'charge', are passed to a member function solve(), where they are copied to PETSc vectors, and KSPSolve() is called. After convergence, the solution is then transferred again to the phi array so that other program parts can use it.

The matrix is created using DMDA. An array 'bound' is used to determine whether a node is either a Dirichlet BC or holds a charge.

I attached three files, petscsolver.h, petscsolver.cpp and main.cpp, that contain a shortened version of the solver class and a set-up to initialize and run a simple problem.

Is there anything I can change to generally make the program run faster?
And, since I'm rather inexperienced with KSP methods, how do I choose the PC and KSP efficiently? Just by testing every combination?
Would multigrid be a viable option as a pure solver (-ksp_type preonly)?

Thanks,
Michael
/*
   Solves Poisson's equation with Dirichlet BC on a cubical grid of arbitrary size
   Processors: n
*/

#define MAIN_CPP

#include <mpi.h>

#include <cmath>
#include <iostream>
#include <queue>
#include <stack>
#include <vector>

#include "petscsolver.h"

/* constants that would normally be imported from input card */
const    int   g_Ni =  100; //number of nodes per dimension (cubic grid: g_Ni^3 nodes total)
const double      h = 1E-4; //distance between grid points
const double reltol = 1E-5; //relative tolerance (rtol) passed to KSPSetTolerances

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);
  
  int size,rank;
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);

	if (rank==0) {
	  for (int i=0; i<argc; i++)
	    std::cout<<argv[i]<<' ';
	  std::cout<<'\n'<<std::flush;
  }

  /* Acquire the most evenly split factorization for domain decomposition */
  std::stack<int> primes;
	std::priority_queue<int> factors;
	int product = size;

	for (int i = 2; i <= product; i++){
		while (product%i==0) {
			primes.push(i);
			product /= i;
		}
	}

	while (primes.size()<3) {
		primes.push(1);
	}

	for (int i=0; i<3; i++) {
		factors.push(primes.top());
		primes.pop();
	}

	while (!primes.empty()) {
		int i = factors.top();
		factors.pop();
		i *= primes.top();
		primes.pop();
		factors.push(i);
	}

  int dims[3]; //process partition in each direction, as passed to DMDACreate3d
  for (int i = 0; i < 3; i++) {
    dims[i] = factors.top();
	  factors.pop();
  }

  if (rank==0) std::cout<<'\n'<<dims[0]<<'x'<<dims[1]<<'x'<<dims[2]<<" subdomains\n\n"<<std::flush; 


  /* Get subdomain position of own rank in global grid */
  /* Note that this is also the way PETSc partitions the global grid, otherwise this wouldn't work */
	int rankpos[3];
	rankpos[0] = rank % dims[0];
	rankpos[1] = (rank / dims[0]) % dims[1];
	rankpos[2] = (rank / (dims[0]*dims[1])) % dims[2];

	int l_Ni[dims[0]], l_Nj[dims[1]], l_Nk[dims[2]]; //number of local nodes along the x,y and z axis for each cell,
	                                                 //as passed to DMDACreate3d

	for (int i=0; i<dims[0]; i++)	l_Ni[i]=0;
	for (int i=0; i<dims[1]; i++)	l_Nj[i]=0;
	for (int i=0; i<dims[2]; i++)	l_Nk[i]=0;

	int j=0;
	do l_Ni[j%dims[0]]++; while (++j<g_Ni);

	int imin = 0;
	for (int i=0; i<rankpos[0]; i++)
		imin += l_Ni[i];
	int imax = imin+l_Ni[rankpos[0]]-1;

	j=0;
	do l_Nj[j%dims[1]]++; while (++j<g_Ni);

	int jmin = 0;
	for (int i=0; i<rankpos[1]; i++)
		jmin += l_Nj[i];
	int jmax = jmin+l_Nj[rankpos[1]]-1;

	j=0;
	do l_Nk[j%dims[2]]++; while (++j<g_Ni);

	int kmin = 0;
	for (int i=0; i<rankpos[2]; i++)
		kmin += l_Nk[i];
	int kmax = kmin+l_Nk[rankpos[2]]-1;

	int Ni = l_Ni[rankpos[0]];
	int Nj = l_Nj[rankpos[1]];
	int Nk = l_Nk[rankpos[2]];
  int Nall = Ni*Nj*Nk;
  
  MPI_Barrier(MPI_COMM_WORLD);
  std::cout<<"Rank "<<rank<<": Ni="<<Ni<<", Nj = "<<Nj<<", Nk = "<<Nk<<'\n'<<std::flush;
  MPI_Barrier(MPI_COMM_WORLD);
  if (rank==0) std::cout<<std::endl<<std::flush; 
  
  MPI_Barrier(MPI_COMM_WORLD);
  std::cout<<"Rank "<<rank<<": i="<<imin<<".."<<imax<<", j="<<jmin<<".."<<jmax<<", k="<<kmin<<".."<<kmax<<'\n'<<std::flush;
  MPI_Barrier(MPI_COMM_WORLD);
  if (rank==0) std::cout<<std::endl<<std::flush; 

  /*
     Assemble local arrays that need to be passed to the solver.
     Note that these can't be PETSc vectors yet, as they are normally accessed from other part of the program as well.
     The triplet (i,j,k) in the local grid is addressed via element i+j*Ni+k*Ni*Nj.
  */
  int *bound; //used to determine whether a node is a Dirichlet BC (1) or not (0)
  bound = new int[Nall];
  double *phi, *charge; //hold electric potential and charge
  phi = new double[Nall];
  charge = new double[Nall];

  /* 
     Define a hollow cylinder of constant potential
     Charge is set to 0, but can be varied
  */
  int center = (int) ceil(((double) g_Ni)/2. - 1.);
  for (int k=kmin; k<=kmax; k++) {
    for (int j=jmin; j<=jmax; j++) {
      for (int i=imin; i<=imax; i++) {
        int idx = i-imin+(j-jmin)*Ni+(k-kmin)*Ni*Nj;
	      
	      int R = (int) ceil(((double) g_Ni)/2-1.);

	      if ((i-center)*(i-center)+(j-center)*(j-center)>=R*R || k==0 || k==g_Ni-1) {
		      phi[idx] = 100.;
		      charge[idx] = 0.;
		      bound[idx] = 1;
	      } else {
		      phi[idx] = 0.;
		      charge[idx] = 0.;
		      bound[idx] = 0;
	      }
      }
    }
  }
  
  
  /* Create and initialize solver */
	PETScSolver *petscsolver;
	petscsolver = new PETScSolver(argc,argv,g_Ni,g_Ni,g_Ni,dims,l_Ni,l_Nj,l_Nk,Nall,h);
	petscsolver->init(bound); //initializes matrix
	
	/* Solve */
	petscsolver->initsolve(phi,charge,reltol);
	
	/*
	   Solve without setting options.
	   Note that this is normally repeatedly called with altered charge array and phi array of previous timestep as initial guess
	*/
	petscsolver->solve(0,phi,charge,reltol);
	petscsolver->solve(1,phi,charge,reltol);
	
	
	delete petscsolver;

  delete[] bound;
  delete[] phi;
  delete[] charge;


	MPI_Barrier(MPI_COMM_WORLD);

	if (rank == 0) std::cout<<".. done"<<std::endl;

	MPI::Finalize();

	return 0;
}
#ifndef PETSCSOLVER_H
#define	PETSCSOLVER_H

#include <petscksp.h>
#include <petscdm.h>
#include <petscdmda.h>

#include <mpi.h>
#include <iostream>

/* Matrix-assembly callback registered via KSPSetComputeOperators();
   the void* context is the local Dirichlet-flag array ('bound'). */
extern PetscErrorCode ComputeMatrix(KSP,Mat,Mat,void*);

/*
   Owns the PETSc objects (KSP, DMDA, solution/RHS vectors) needed to solve
   Poisson's equation repeatedly on a DMDA-distributed structured grid.
   The constructor calls PetscInitialize() and the destructor PetscFinalize(),
   so exactly one instance should live between MPI_Init() and MPI_Finalize().
*/
class PETScSolver
{
  int g_Ni,g_Nj,g_Nk,g_Nall; //global grid sizes; NOTE(review): never assigned in the visible constructor -- confirm they are set elsewhere if read
  int Nall;                  //number of locally owned nodes
  
  double ih;                 //1/h, inverse grid spacing
  double iepsilon0;          //1/epsilon_0, inverse vacuum permittivity
  
  int *bound;                //borrowed Dirichlet-flag array (1 = Dirichlet node), set via init(); not owned

	int rank;                  //this process' rank in MPI_COMM_WORLD (used for logging)

  KSP ksp;                   //Krylov solver context
  DM da;                     //3d structured-grid (DMDA) manager
  PC pc;                     //NOTE(review): never used in the visible code
  Vec x, b;                  //solution and right-hand-side vectors
  
public:
  PETScSolver(int,char**,int,int,int,int*,int*,int*,int*,int,double);
  ~PETScSolver();

	/* Must be called before initsolve()/solve(); stores the bound array and
	   registers ComputeMatrix() as the operator callback */
	void init(int*);
	
  /* First solve: also applies KSP options and tolerances; aborts on divergence */
  void initsolve(double*,double*,double reltol=1E-5);
  /* Subsequent solves; the int argument is the timestep (logging only) */
  void solve(int,double*,double*,double reltol=1E-5);
};


#endif
#include "petscsolver.h"

/*
   Initializes PETSc, creates the KSP and the 3d DMDA matching the caller's
   domain decomposition, and allocates the solution/RHS vectors.
   argc/argv           forwarded to PetscInitialize() so PETSc options work
   _g_Ni/_g_Nj/_g_Nk   global node counts per dimension
   dims                number of subdomains per dimension
   l_Ni/l_Nj/l_Nk      nodes per subdomain along each axis (as for DMDACreate3d)
   _Nall               number of locally owned nodes
   h                   grid spacing
*/
PETScSolver::PETScSolver(int argc,char **argv,int _g_Ni,int _g_Nj,int _g_Nk,int *dims,int *l_Ni,int *l_Nj,int *l_Nk,int _Nall,double h)
{
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);

  /* The parameters previously shadowed the identically named members,
     leaving g_Ni/g_Nj/g_Nk/g_Nall uninitialized; set them explicitly. */
  g_Ni = _g_Ni;
  g_Nj = _g_Nj;
  g_Nk = _g_Nk;
  g_Nall = _g_Ni*_g_Nj*_g_Nk; //presumably the total number of global nodes

  Nall = _Nall;
  ih = 1./h;                      //inverse grid spacing
  iepsilon0 = 1./8.854187817E-12; //inverse vacuum permittivity
  bound = NULL;                   //provided later via init()

  PetscInitialize(&argc,&argv,(char*)0,(char*)0);

  KSPCreate(PETSC_COMM_WORLD,&ksp);
  /* DMDA_STENCIL_STAR suffices for the 7-point stencil assembled in
     ComputeMatrix() and preallocates fewer off-diagonal entries than
     DMDA_STENCIL_BOX (box includes the 20 corner/edge neighbours). */
  DMDACreate3d(PETSC_COMM_WORLD,DM_BOUNDARY_GHOSTED,DM_BOUNDARY_GHOSTED,DM_BOUNDARY_GHOSTED,
      DMDA_STENCIL_STAR,_g_Ni,_g_Nj,_g_Nk,dims[0],dims[1],dims[2],1,1,l_Ni,l_Nj,l_Nk,&da);
  KSPSetDM(ksp,da);

  DMCreateGlobalVector(da,&b);
  VecDuplicate(b,&x);
  VecSet(x,0); //start from a zero guess until the caller supplies phi
  VecSet(b,0);
}

/* Destroys all PETSc objects and finalizes PETSc.
   Because PetscFinalize() runs here, the solver must be destroyed before
   MPI_Finalize() is called by the surrounding program. */
PETScSolver::~PETScSolver()
{
	VecDestroy(&x);
	VecDestroy(&b);
	DMDestroy(&da);
	KSPDestroy(&ksp);
	PetscFinalize();
}

/* Needs to be called before initsolve() or solve().
   Stores the Dirichlet-flag array (1 = Dirichlet node, 0 = charge node) and
   registers the matrix-assembly callback with it as context.
   _bound is borrowed, not copied: the caller must keep it alive for the
   lifetime of this solver. */
void PETScSolver::init(int *_bound)
{
  bound = _bound;
	KSPSetComputeOperators(ksp,ComputeMatrix,(void*)bound);
}

/*
   First solve: configures the KSP, then performs one solve.
   phi     in: initial guess and Dirichlet values; out: solution
   charge  charge at non-Dirichlet nodes (scaled by 1/(h*eps0) into the RHS)
   reltol  relative tolerance passed to KSPSetTolerances()
   Exits the program with code 101 if the solve does not converge.
*/
void PETScSolver::initsolve(double *phi,double *charge,double reltol)
{
  KSPSetInitialGuessNonzero(ksp,PETSC_TRUE); //phi of the previous timestep serves as guess

  /* Set the hard-coded tolerance BEFORE KSPSetFromOptions(): PETSc
     convention is that options are applied last, so command-line settings
     such as -ksp_rtol can override the default (the original order
     silently clobbered them). */
  KSPSetTolerances(ksp,reltol,PETSC_DEFAULT,PETSC_DEFAULT,PETSC_DEFAULT);
  KSPSetFromOptions(ksp);

  /* Copy the initial guess into the PETSc solution vector */
  PetscScalar *xarray;
  VecGetArray(x,&xarray);
  for (int i=0; i<Nall; i++)
    xarray[i] = phi[i];
  VecRestoreArray(x, &xarray);

  /* Right-hand side: scaled charge at interior nodes, prescribed potential
     at Dirichlet nodes (their matrix rows are identity) */
  PetscScalar *barray;
  VecGetArray(b,&barray);
  for (int i=0; i<Nall; i++) {
    if (bound[i]==0)
      barray[i] = charge[i]*ih*iepsilon0;
    else
      barray[i] = phi[i];
  }
  VecRestoreArray(b,&barray);

  KSPSolve(ksp,b,x);

  int its;
  KSPGetIterationNumber(ksp,&its);
  if (rank==0) std::cout<<"Initsolve: "<<its<<" iterations\n";

  KSPConvergedReason reason;
  KSPGetConvergedReason(ksp,&reason);
  if (reason<0) {
    if (rank==0) std::cerr<<"Initsolve: no convergence\n";
    MPI_Finalize();
    exit(101);
  }

  /* x was passed to KSPSolve() above, so it already holds the solution; the
     redundant KSPGetSolution() call of the original has been removed. */
  VecGetArray(x,&xarray);
  for (int i=0; i<Nall; i++)
    phi[i] = xarray[i];
  VecRestoreArray(x,&xarray);
}

void PETScSolver::solve(int timestep,double *phi,double *charge,double reltol)
{
  KSPSetTolerances(ksp,reltol,PETSC_DEFAULT,PETSC_DEFAULT,PETSC_DEFAULT);

  PetscScalar *barray;
  VecGetArray(b,&barray);
  for (int i=0; i<Nall; i++) {
    if (bound[i]==0)
      barray[i] = charge[i]*ih*iepsilon0;
    else
      barray[i] = phi[i];
  }
  VecRestoreArray(b,&barray);

  KSPSolve(ksp,b,x);

  int its;
  KSPGetIterationNumber(ksp,&its);
  if (rank==0) std::cout<<"Timestep "<<timestep<<": "<<its<<" iterations\n";
  
  KSPGetSolution(ksp,&x);
  PetscScalar *xarray;
  VecGetArray(x,&xarray);
  for (int i=0; i<Nall; i++)
    phi[i] = xarray[i];
  VecRestoreArray(x,&xarray);
}

/*
   Matrix-assembly callback registered via KSPSetComputeOperators().
   Assembles the 7-point finite-difference Laplacian into the preconditioning
   matrix B: rows flagged Dirichlet in the ctx array get a single 1.0 on the
   diagonal, all other rows get 6 on the diagonal and -1 for each of the six
   axis neighbours.
   ctx is the local 'bound' array, indexed i-fastest over the sub-box
   returned by DMDAGetCorners().
*/
PetscErrorCode ComputeMatrix(KSP ksp,Mat A,Mat B,void *ctx)
{
	DM             da;
	PetscErrorCode ierr;
	PetscInt       i,j,k,mx,my,mz,xm,ym,zm,xs,ys,zs;
	PetscScalar    v[7];
	MatStencil     row,col[7];
	int *bound = (int *)ctx; //fixed: this statement was missing its semicolon

	PetscFunctionBeginUser;
	ierr    = KSPGetDM(ksp,&da);CHKERRQ(ierr);
	ierr    = DMDAGetInfo(da,0,&mx,&my,&mz,0,0,0,0,0,0,0,0,0);CHKERRQ(ierr);
	ierr    = DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm);CHKERRQ(ierr);

	for (k=zs; k<zs+zm; k++) {
		for (j=ys; j<ys+ym; j++) {
			for (i=xs; i<xs+xm; i++) {
				row.i = i; row.j = j; row.k = k;
				if (bound[(i-xs)+(j-ys)*xm+(k-zs)*xm*ym]==1) { //Dirichlet node: identity row
					v[0] = 1.0;
					ierr = MatSetValuesStencil(B,1,&row,1,&row,v,INSERT_VALUES);CHKERRQ(ierr);
				} else {                                       //interior node: 7-point stencil
					v[0] = -1; col[0].i = i; col[0].j = j; col[0].k = k-1;
					v[1] = -1; col[1].i = i; col[1].j = j-1; col[1].k = k;
					v[2] = -1; col[2].i = i-1; col[2].j = j; col[2].k = k;
					v[3] =  6; col[3].i = row.i; col[3].j = row.j; col[3].k = row.k;
					v[4] = -1; col[4].i = i+1; col[4].j = j; col[4].k = k;
					v[5] = -1; col[5].i = i; col[5].j = j+1; col[5].k = k;
					v[6] = -1; col[6].i = i; col[6].j = j; col[6].k = k+1;
					ierr = MatSetValuesStencil(B,1,&row,7,col,v,INSERT_VALUES);CHKERRQ(ierr);
				}
			}
		}
	}

	ierr   = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
	ierr   = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
	PetscFunctionReturn(0);
}

Reply via email to