Hi Hong,

Sorry for my late reply and thanks for the fix. Does that mean that I will be able to run that matrix on 10 procs in the future (petsc 3.6.2?)?

Thanks

Anthony


On 08/11/2015 09:58 AM, Hong wrote:
Anthony,
I pushed a fix
https://bitbucket.org/petsc/petsc/commits/ceeba3afeff0c18262ed13ef92e2508ca68b0ecf

Once it passes our nightly tests, I'll merge it to petsc-maint, then petsc-dev.
Thanks for reporting it!

Hong

On Mon, Aug 10, 2015 at 4:27 PM, Barry Smith <bsm...@mcs.anl.gov <mailto:bsm...@mcs.anl.gov>> wrote:


      Anthony,

       This crash is in PETSc code before it calls the SuperLU_DIST
    numeric factorization; likely we have a mistake such as assuming a
    process has at least one row of the matrix and need to fix it.

       Barry

    > 0x00007fe6ba609297 in MatLUFactorNumeric_SuperLU_DIST (F=0x1922b50,
    >    A=0x14a6a70, info=0x19099f8)
    >    at
    
/home/anthony/LIB/petsc-3.6.1/src/mat/impls/aij/mpi/superlu_dist/superlu_dist.c:368
    > 368           colA_start = rstart + ajj[0]; /* the smallest
    global col index of A */



    > On Aug 10, 2015, at 3:50 PM, Anthony Haas <a...@email.arizona.edu
    <mailto:a...@email.arizona.edu>> wrote:
    >
    > Hi Sherry,
    >
    > I recently submitted a matrix for which I noticed that
    Superlu_dist was hanging when running on 4 processors with
    parallel symbolic factorization. I have been using the latest
    version of Superlu_dist and the code is not hanging anymore.
    However, I noticed that when running the same matrix (I have
    attached the matrix), the code crashes with the following SIGSEGV
    when running on 10 procs (with or without parallel symbolic
    factorization). It is probably overkill to run such a 'small'
    matrix on 10 procs but I thought that it might still be useful to
    report the problem?? See below for the error obtained when running
    with gdb and also a code snippet to reproduce the error.
    >
    > Thanks,
    >
    >
    > Anthony
    >
    >
    >
    > 1) ERROR in GDB
    >
    > Program received signal SIGSEGV, Segmentation fault.
    > 0x00007fe6ba609297 in MatLUFactorNumeric_SuperLU_DIST (F=0x1922b50,
    >    A=0x14a6a70, info=0x19099f8)
    >    at
    
/home/anthony/LIB/petsc-3.6.1/src/mat/impls/aij/mpi/superlu_dist/superlu_dist.c:368
    > 368           colA_start = rstart + ajj[0]; /* the smallest
    global col index of A */
    > (gdb)
    >
    >
    >
    > 2) PORTION OF CODE TO REPRODUCE ERROR
    >
    >    Subroutine HowBigLUCanBe(rank)
    >
    >      IMPLICIT NONE
    >
    >      integer(i4b),intent(in) :: rank
    >      integer(i4b)            :: i,ct
    >      real(dp)                :: begin,endd
    >      complex(dpc)            :: sigma
    >
    >      PetscErrorCode ierr
    >
    >
    >      if (rank==0) call cpu_time(begin)
    >
    >      if (rank==0) then
    >         write(*,*)
    >         write(*,*)'Testing How Big LU Can Be...'
    >         write(*,*)'============================'
    >         write(*,*)
    >      endif
    >
    >      !sigma = (1.0d0,0.0d0)
    >      !call MatAXPY(A,-sigma,B,DIFFERENT_NONZERO_PATTERN,ierr) !
    on exit A = A-sigma*B
    >
    >      !call MatView(A,PETSC_VIEWER_STDOUT_WORLD,ierr)
    >
    > !.....Write Matrix to ASCII and Binary Format
    >      !call
    PetscViewerASCIIOpen(PETSC_COMM_WORLD,"Amat.m",viewer,ierr)
    >      !call MatView(DXX,viewer,ierr)
    >      !call PetscViewerDestroy(viewer,ierr)
    >
    >      !call
    
PetscViewerBinaryOpen(PETSC_COMM_WORLD,"Amat_binary.m",FILE_MODE_WRITE,viewer,ierr)
    >      !call MatView(A,viewer,ierr)
    >      !call PetscViewerDestroy(viewer,ierr)
    >
    > !...Load a Matrix in Binary Format
    >      call
    
PetscViewerBinaryOpen(PETSC_COMM_WORLD,"Amat_binary.m",FILE_MODE_READ,viewer,ierr)
    >      call MatCreate(PETSC_COMM_WORLD,DLOAD,ierr)
    >      call MatSetType(DLOAD,MATAIJ,ierr)
    >      call MatLoad(DLOAD,viewer,ierr)
    >      call PetscViewerDestroy(viewer,ierr)
    >
    >      !call MatView(DLOAD,PETSC_VIEWER_STDOUT_WORLD,ierr)
    >
    >
    > !.....Create Linear Solver Context
    >      call KSPCreate(PETSC_COMM_WORLD,ksp,ierr)
    >
    > !.....Set operators. Here the matrix that defines the linear
    system also serves as the preconditioning matrix.
    >      !call
    KSPSetOperators(ksp,A,A,DIFFERENT_NONZERO_PATTERN,ierr) !aha
    commented and replaced by next line
    >
    >      !call KSPSetOperators(ksp,A,A,ierr) ! remember: here A =
    A-sigma*B
    >      call KSPSetOperators(ksp,DLOAD,DLOAD,ierr) ! remember: here
    A = A-sigma*B
    >
    > !.....Set Relative and Absolute Tolerances and Uses Default for
    Divergence Tol
    >      tol = 1.e-10
    >      call
    KSPSetTolerances(ksp,tol,tol,PETSC_DEFAULT_REAL,PETSC_DEFAULT_INTEGER,ierr)
    >
    > !.....Set the Direct (LU) Solver
    >      call KSPSetType(ksp,KSPPREONLY,ierr)
    >      call KSPGetPC(ksp,pc,ierr)
    >      call PCSetType(pc,PCLU,ierr)
    >      call
    PCFactorSetMatSolverPackage(pc,MATSOLVERSUPERLU_DIST,ierr) !
    MATSOLVERSUPERLU_DIST MATSOLVERMUMPS
    >
    > !.....Create Right-Hand-Side Vector
    >      !call MatCreateVecs(A,frhs,PETSC_NULL_OBJECT,ierr)
    >      !call MatCreateVecs(A,sol,PETSC_NULL_OBJECT,ierr)
    >
    >      call MatCreateVecs(DLOAD,frhs,PETSC_NULL_OBJECT,ierr)
    >      call MatCreateVecs(DLOAD,sol,PETSC_NULL_OBJECT,ierr)
    >
    >      call
    MatGetOwnershipRange(DLOAD,IstartA,IendA,ierr)!;CHKERRQ(ierr)
    >
    >      allocate(xwork1(IendA-IstartA))
    >      allocate(loc(IendA-IstartA))
    >
    >      ct=0
    >      do i=IstartA,IendA-1
    >         ct=ct+1
    >         loc(ct)=i
    >         xwork1(ct)=(1.0d0,0.0d0)
    >      enddo
    >
    >      call
    VecSetValues(frhs,IendA-IstartA,loc,xwork1,INSERT_VALUES,ierr)
    >      call VecZeroEntries(sol,ierr)
    >
    >      deallocate(xwork1,loc)
    >
    > !.....Assemble Vectors
    >      call VecAssemblyBegin(frhs,ierr)
    >      call VecAssemblyEnd(frhs,ierr)
    >
    > !.....Solve the Linear System
    >      call KSPSolve(ksp,frhs,sol,ierr)
    >
    >      !call VecView(sol,PETSC_VIEWER_STDOUT_WORLD,ierr)
    >
    >      if (rank==0) then
    >         call cpu_time(endd)
    >         write(*,*)
    >         print '("Total time for HowBigLUCanBe = ",f21.3,"
    seconds.")',endd-begin
    >      endif
    >
    >      call SlepcFinalize(ierr)
    >
    >      STOP
    >
    >
    >    end Subroutine HowBigLUCanBe
    >
    > <Amat_binary.m.info <http://Amat_binary.m.info>>



Reply via email to