Dear all,
I have a few questions regarding possible performance enhancements for
the PETSc solver I included in my project.
It's a particle-in-cell plasma simulation written in C++, where
Poisson's equation needs to be solved repeatedly on every timestep.
The simulation domain is discretized using finite differences, so the
solver needs to be able to efficiently solve the linear system
A x = b successively with changing b. The solution x of the previous
timestep is generally a good initial guess for the solution.
I wrote a class PETScSolver that holds all PETSc objects and necessary
information about domain size and decomposition. To solve the linear
system, two arrays, 'phi' and 'charge', are passed to a member function
solve(), where they are copied to PETSc vectors, and KSPSolve() is
called. After convergence, the solution is then transferred again to the
phi array so that other program parts can use it.
The matrix is created using DMDA. An array 'bound' is used to determine
whether a node is either a Dirichlet BC or holds a charge.
I attached three files, petscsolver.h, petscsolver.cpp and main.cpp,
that contain a shortened version of the solver class and a set-up to
initialize and run a simple problem.
Is there anything I can change to generally make the program run faster?
And, since I'm rather inexperienced with KSP methods, how do I
efficiently choose PC and KSP? Just by testing every combination?
Would multigrid be a viable option as a pure solver (-ksp_type preonly)?
Thanks,
Michael
/*
Solves Poisson's equation with Dirichlet BC on a cubical grid of arbitrary size
Processors: n
*/
#define MAIN_CPP
#include <mpi.h>

#include <cmath>
#include <functional>
#include <iostream>
#include <queue>
#include <stack>
#include <vector>

#include "petscsolver.h"
/* constants that would normally be imported from input card */
const int g_Ni = 100; //number of nodes per dimension
const double h = 1E-4; //distance between grid points
const double reltol = 1E-5; //rtol in KSPSetTolerances
int main(int argc, char **argv)
{
MPI_Init(&argc,&argv);
int size,rank;
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
if (rank==0) {
for (int i=0; i<argc; i++)
std::cout<<argv[i]<<' ';
std::cout<<'\n'<<std::flush;
}
/* Acquire the most evenly split factorization for domain decomposition */
std::stack<int> primes;
std::priority_queue<int> factors;
int product = size;
for (int i = 2; i <= product; i++){
while (product%i==0) {
primes.push(i);
product /= i;
}
}
while (primes.size()<3) {
primes.push(1);
}
for (int i=0; i<3; i++) {
factors.push(primes.top());
primes.pop();
}
while (!primes.empty()) {
int i = factors.top();
factors.pop();
i *= primes.top();
primes.pop();
factors.push(i);
}
int dims[3]; //process partition in each direction, as passed to DMDACreate3d
for (int i = 0; i < 3; i++) {
dims[i] = factors.top();
factors.pop();
}
if (rank==0) std::cout<<'\n'<<dims[0]<<'x'<<dims[1]<<'x'<<dims[2]<<" subdomains\n\n"<<std::flush;
/* Get subdomain position of own rank in global grid */
/* Note that this is also the way PETSc partitions the global grid, otherwise this wouldn't work */
int rankpos[3];
rankpos[0] = rank % dims[0];
rankpos[1] = (rank / dims[0]) % dims[1];
rankpos[2] = (rank / (dims[0]*dims[1])) % dims[2];
int l_Ni[dims[0]], l_Nj[dims[1]], l_Nk[dims[2]]; //number of local nodes along the x,y and z axis for each cell,
//as passed to DMDACreate3d
for (int i=0; i<dims[0]; i++) l_Ni[i]=0;
for (int i=0; i<dims[1]; i++) l_Nj[i]=0;
for (int i=0; i<dims[2]; i++) l_Nk[i]=0;
int j=0;
do l_Ni[j%dims[0]]++; while (++j<g_Ni);
int imin = 0;
for (int i=0; i<rankpos[0]; i++)
imin += l_Ni[i];
int imax = imin+l_Ni[rankpos[0]]-1;
j=0;
do l_Nj[j%dims[1]]++; while (++j<g_Ni);
int jmin = 0;
for (int i=0; i<rankpos[1]; i++)
jmin += l_Nj[i];
int jmax = jmin+l_Nj[rankpos[1]]-1;
j=0;
do l_Nk[j%dims[2]]++; while (++j<g_Ni);
int kmin = 0;
for (int i=0; i<rankpos[2]; i++)
kmin += l_Nk[i];
int kmax = kmin+l_Nk[rankpos[2]]-1;
int Ni = l_Ni[rankpos[0]];
int Nj = l_Nj[rankpos[1]];
int Nk = l_Nk[rankpos[2]];
int Nall = Ni*Nj*Nk;
MPI_Barrier(MPI_COMM_WORLD);
std::cout<<"Rank "<<rank<<": Ni="<<Ni<<", Nj = "<<Nj<<", Nk = "<<Nk<<'\n'<<std::flush;
MPI_Barrier(MPI_COMM_WORLD);
if (rank==0) std::cout<<std::endl<<std::flush;
MPI_Barrier(MPI_COMM_WORLD);
std::cout<<"Rank "<<rank<<": i="<<imin<<".."<<imax<<", j="<<jmin<<".."<<jmax<<", k="<<kmin<<".."<<kmax<<'\n'<<std::flush;
MPI_Barrier(MPI_COMM_WORLD);
if (rank==0) std::cout<<std::endl<<std::flush;
/*
Assemble local arrays that need to be passed to the solver.
Note that these can't be PETSc vectors yet, as they are normally accessed from other part of the program as well.
The triplet (i,j,k) in the local grid is addressed via element i+j*Ni+k*Ni*Nj.
*/
int *bound; //used to determine whether a node is a Dirichlet BC (1) or not (0)
bound = new int[Nall];
double *phi, *charge; //hold electric potential and charge
phi = new double[Nall];
charge = new double[Nall];
/*
Define a hollow cylinder of constant potential
Charge is set to 0, but can be varied
*/
int center = (int) ceil(((double) g_Ni)/2. - 1.);
for (int k=kmin; k<=kmax; k++) {
for (int j=jmin; j<=jmax; j++) {
for (int i=imin; i<=imax; i++) {
int idx = i-imin+(j-jmin)*Ni+(k-kmin)*Ni*Nj;
int R = (int) ceil(((double) g_Ni)/2-1.);
if ((i-center)*(i-center)+(j-center)*(j-center)>=R*R || k==0 || k==g_Ni-1) {
phi[idx] = 100.;
charge[idx] = 0.;
bound[idx] = 1;
} else {
phi[idx] = 0.;
charge[idx] = 0.;
bound[idx] = 0;
}
}
}
}
/* Create and initialize solver */
PETScSolver *petscsolver;
petscsolver = new PETScSolver(argc,argv,g_Ni,g_Ni,g_Ni,dims,l_Ni,l_Nj,l_Nk,Nall,h);
petscsolver->init(bound); //initializes matrix
/* Solve */
petscsolver->initsolve(phi,charge,reltol);
/*
Solve without setting options.
Note that this is normally repeatedly called with altered charge array and phi array of previous timestep as initial guess
*/
petscsolver->solve(0,phi,charge,reltol);
petscsolver->solve(1,phi,charge,reltol);
delete petscsolver;
delete[] bound;
delete[] phi;
delete[] charge;
MPI_Barrier(MPI_COMM_WORLD);
if (rank == 0) std::cout<<".. done"<<std::endl;
MPI::Finalize();
return 0;
}
#ifndef PETSCSOLVER_H
#define PETSCSOLVER_H
#include <petscksp.h>
#include <petscdm.h>
#include <petscdmda.h>
#include <mpi.h>
#include <iostream>
/* Matrix-assembly callback registered via KSPSetComputeOperators;
   ctx is the caller's 'bound' array (1 = Dirichlet node, 0 = charge node). */
extern PetscErrorCode ComputeMatrix(KSP,Mat,Mat,void*);
/*
 * Owns the PETSc objects (KSP, DMDA, solution/RHS vectors) needed to solve
 * Poisson's equation A x = b repeatedly on a structured, DMDA-distributed
 * grid with a changing right-hand side.
 *
 * Lifecycle: construct -> init(bound) -> initsolve(...) once -> solve(...)
 * every timestep -> destroy.  The constructor calls PetscInitialize and the
 * destructor PetscFinalize, so exactly one instance should exist at a time
 * and it must be destroyed before MPI_Finalize.
 */
class PETScSolver
{
int g_Ni,g_Nj,g_Nk,g_Nall; // global grid sizes -- NOTE(review): never assigned in the visible code; confirm whether they are actually used
int Nall; // number of local (on-rank) grid nodes; length of the phi/charge/bound arrays
double ih; // 1/h, inverse grid spacing
double iepsilon0; // 1/epsilon_0, inverse vacuum permittivity
int *bound; // non-owning pointer to the caller's boundary-flag array (set in init())
int rank; // this process's rank in MPI_COMM_WORLD (used to gate console output)
KSP ksp; // Krylov solver context
DM da; // distributed structured-grid manager
PC pc; // NOTE(review): declared but never used in the visible code
Vec x, b; // solution and right-hand-side vectors (global DMDA layout)
public:
// argc/argv are forwarded to PetscInitialize so PETSc command-line options work
PETScSolver(int,char**,int,int,int,int*,int*,int*,int*,int,double);
~PETScSolver();
void init(int*); // must be called before initsolve()/solve(); keeps the bound pointer
void initsolve(double*,double*,double reltol=1E-5); // first solve: configures KSP, seeds initial guess
void solve(int,double*,double*,double reltol=1E-5); // per-timestep solve; reuses previous x as guess
};
#endif
#include "petscsolver.h"
/*
 * Initializes PETSc, creates the KSP and the 3-D DMDA matching the caller's
 * domain decomposition, and allocates the global solution/RHS vectors.
 *
 * g_Ni/g_Nj/g_Nk: global grid sizes; dims: process grid; l_Ni/l_Nj/l_Nk:
 * nodes per process slab along each axis; _Nall: local node count;
 * h: grid spacing.
 */
PETScSolver::PETScSolver(int argc,char **argv,int g_Ni,int g_Nj,int g_Nk,int *dims,int *l_Ni,int *l_Nj,int *l_Nk,int _Nall,double h)
{
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    Nall = _Nall;
    ih = 1./h;                          // inverse grid spacing
    iepsilon0 = 1./8.854187817E-12;     // inverse vacuum permittivity
    bound = NULL;                       // supplied later via init()
    /* MPI_Init was already called by main(), so PETSc attaches to the
       existing MPI rather than initializing it itself. */
    PetscInitialize(&argc,&argv,(char*)0,(char*)0);
    KSPCreate(PETSC_COMM_WORLD,&ksp);
    /*
     * DMDA_STENCIL_STAR instead of DMDA_STENCIL_BOX: ComputeMatrix couples
     * only the six face neighbours (7-point star), so the corner/edge ghost
     * values a box stencil communicates are never used.  STAR shrinks the
     * ghost-exchange volume and the matrix preallocation.
     */
    DMDACreate3d(PETSC_COMM_WORLD,DM_BOUNDARY_GHOSTED,DM_BOUNDARY_GHOSTED,DM_BOUNDARY_GHOSTED,
                 DMDA_STENCIL_STAR,g_Ni,g_Nj,g_Nk,dims[0],dims[1],dims[2],1,1,l_Ni,l_Nj,l_Nk,&da);
    KSPSetDM(ksp,da);                   // KSP builds its operator from the DMDA
    DMCreateGlobalVector(da,&b);
    VecDuplicate(b,&x);
    VecSet(x,0);
    VecSet(b,0);
}
/*
 * Releases all PETSc objects and shuts PETSc down.
 * PetscFinalize must run after the object destructions below and before
 * MPI_Finalize in the caller (PETSc did not initialize MPI here).
 */
PETScSolver::~PETScSolver()
{
VecDestroy(&x);
VecDestroy(&b);
DMDestroy(&da);
KSPDestroy(&ksp);
PetscFinalize();
}
/* Needs to be called before initsolve() or solve() */
/*
 * Stores the caller's boundary-flag array (1 = Dirichlet node, 0 = charge
 * node) and registers ComputeMatrix as the operator-assembly callback.
 * The pointer is kept, not copied: _bound must outlive this object.
 */
void PETScSolver::init(int *_bound)
{
bound = _bound;
KSPSetComputeOperators(ksp,ComputeMatrix,(void*)bound);
}
void PETScSolver::initsolve(double *phi,double *charge,double reltol)
{
KSPSetInitialGuessNonzero(ksp,PETSC_TRUE);
KSPSetFromOptions(ksp);
KSPSetTolerances(ksp,reltol,PETSC_DEFAULT,PETSC_DEFAULT,PETSC_DEFAULT);
PetscScalar *xarray;
VecGetArray(x,&xarray);
for (int i=0; i<Nall; i++)
xarray[i] = phi[i];
VecRestoreArray(x, &xarray);
PetscScalar *barray;
VecGetArray(b,&barray);
for (int i=0; i<Nall; i++) {
if (bound[i]==0)
barray[i] = charge[i]*ih*iepsilon0;
else
barray[i] = phi[i];
}
VecRestoreArray(b,&barray);
KSPSolve(ksp,b,x);
int its;
KSPGetIterationNumber(ksp,&its);
if (rank==0) std::cout<<"Initsolve: "<<its<<" iterations\n";
KSPConvergedReason reason;
KSPGetConvergedReason(ksp,&reason);
if (reason<0) {
if (rank==0) std::cerr<<"Initsolve: no convergence\n";
MPI_Finalize();
exit(101);
}
KSPGetSolution(ksp,&x);
VecGetArray(x,&xarray);
for (int i=0; i<Nall; i++)
phi[i] = xarray[i];
VecRestoreArray(x,&xarray);
}
void PETScSolver::solve(int timestep,double *phi,double *charge,double reltol)
{
KSPSetTolerances(ksp,reltol,PETSC_DEFAULT,PETSC_DEFAULT,PETSC_DEFAULT);
PetscScalar *barray;
VecGetArray(b,&barray);
for (int i=0; i<Nall; i++) {
if (bound[i]==0)
barray[i] = charge[i]*ih*iepsilon0;
else
barray[i] = phi[i];
}
VecRestoreArray(b,&barray);
KSPSolve(ksp,b,x);
int its;
KSPGetIterationNumber(ksp,&its);
if (rank==0) std::cout<<"Timestep "<<timestep<<": "<<its<<" iterations\n";
KSPGetSolution(ksp,&x);
PetscScalar *xarray;
VecGetArray(x,&xarray);
for (int i=0; i<Nall; i++)
phi[i] = xarray[i];
VecRestoreArray(x,&xarray);
}
/*
 * KSP operator callback: assembles the 7-point finite-difference Laplacian.
 * Dirichlet rows get a 1 on the diagonal (the RHS carries the prescribed
 * potential); all other rows get the standard (-1,...,6,...,-1) star stencil.
 * ctx is the caller's 'bound' array in the local i + j*xm + k*xm*ym layout,
 * which matches the i + j*Ni + k*Ni*Nj layout used when it was filled.
 */
PetscErrorCode ComputeMatrix(KSP ksp,Mat A,Mat B,void *ctx)
{
    DM             da;
    PetscErrorCode ierr;
    PetscInt       i,j,k,mx,my,mz,xm,ym,zm,xs,ys,zs;
    PetscScalar    v[7];
    MatStencil     row,col[7];
    int *bound = (int *)ctx;   /* fixed: the original line was missing the ';' */
    PetscFunctionBeginUser;
    ierr = KSPGetDM(ksp,&da);CHKERRQ(ierr);
    ierr = DMDAGetInfo(da,0,&mx,&my,&mz,0,0,0,0,0,0,0,0,0);CHKERRQ(ierr);
    /* Owned (non-ghosted) corner and extent of this rank's subgrid */
    ierr = DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm);CHKERRQ(ierr);
    for (k=zs; k<zs+zm; k++) {
        for (j=ys; j<ys+ym; j++) {
            for (i=xs; i<xs+xm; i++) {
                row.i = i; row.j = j; row.k = k;
                if (bound[(i-xs)+(j-ys)*xm+(k-zs)*xm*ym]==1) { //Dirichlet nodes
                    v[0] = 1.0;
                    ierr = MatSetValuesStencil(B,1,&row,1,&row,v,INSERT_VALUES);CHKERRQ(ierr);
                } else { //others: 7-point star stencil
                    v[0] = -1; col[0].i = i;   col[0].j = j;   col[0].k = k-1;
                    v[1] = -1; col[1].i = i;   col[1].j = j-1; col[1].k = k;
                    v[2] = -1; col[2].i = i-1; col[2].j = j;   col[2].k = k;
                    v[3] =  6; col[3].i = row.i; col[3].j = row.j; col[3].k = row.k;
                    v[4] = -1; col[4].i = i+1; col[4].j = j;   col[4].k = k;
                    v[5] = -1; col[5].i = i;   col[5].j = j+1; col[5].k = k;
                    v[6] = -1; col[6].i = i;   col[6].j = j;   col[6].k = k+1;
                    ierr = MatSetValuesStencil(B,1,&row,7,col,v,INSERT_VALUES);CHKERRQ(ierr);
                }
            }
        }
    }
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
}