Sorry if this gets reposted, but I think my last mail got lost in the ether.

I am trying to parallelize a computation over HDF5 files, but I only get a minor speedup from the parallelization. I thought I could access the same file from multiple worker processes as long as I open it in read mode, but it seems to be faster to access the data from separate files. Is that expected? Is there anything I can do to improve the performance of my code?

Consider the following two files:

myparallel.jl
```julia
module MyParallel
using HDF5

const N_objs = 20
const N_floats = 100_000_000
const singlefn = "junk.h5"
const multiplefns = ["junk_$(i).h5" for i in 1:N_objs]

# write N_objs random datasets into a single file
function write_junk_singlefile()
    h5open(singlefn, "w") do f
        for i in 1:N_objs
            write(f, "data_$(i)", rand(N_floats))
        end
    end
end

# write one random dataset into each of N_objs separate files
function write_junk_multifiles()
    for i in 1:N_objs
        h5open(f->write(f, "data", rand(N_floats)), multiplefns[i], "w")
    end
end

# mean of dataset i, read from the single shared file
calc_singlefile(obj_id::Int) = h5open(f->mean(f["data_$(obj_id)"][:]), singlefn, "r")
calc_singlefile(obj_ids::AbstractArray{Int}) = [calc_singlefile(i) for i in obj_ids]
calc_singlefile_parallel(obj_ids::AbstractArray{Int}) = pmap(calc_singlefile, obj_ids)

# mean of the dataset in file i, one file per object
calc_multifiles(file_id::Int) = h5open(f->mean(f["data"][:]), multiplefns[file_id], "r")
calc_multifiles(file_ids::AbstractArray{Int}) = [calc_multifiles(i) for i in file_ids]
calc_multifiles_parallel(file_ids::AbstractArray{Int}) = pmap(calc_multifiles, file_ids)

export N_objs, write_junk_singlefile, write_junk_multifiles, calc_singlefile,
    calc_singlefile_parallel, calc_multifiles, calc_multifiles_parallel
end
```

and

run_paralleltest.jl

```julia
addprocs(7)
@everywhere include("myparallel.jl")
@everywhere using MyParallel

# generate the test data (only needs to run once):
#write_junk_singlefile()
#write_junk_multifiles()

println("singlefile single core processing:")
@time calc_singlefile(1:N_objs)
println("multifiles single core processing:")
@time calc_multifiles(1:N_objs)
println("singlefile multi core processing:")
@time calc_singlefile_parallel(1:N_objs)
println("multifiles multi core processing:")
@time calc_multifiles_parallel(1:N_objs)
```
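
For reference, I generate the junk data once before the timing runs; that is what the two commented-out write calls in the script are for. Roughly, something like this (run once):

```julia
# One-off generation of the test data, using the write functions from MyParallel above.
include("myparallel.jl")
using MyParallel

write_junk_singlefile()   # 20 datasets in a single junk.h5
write_junk_multifiles()   # one dataset in each of junk_1.h5 ... junk_20.h5
```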

When I run `julia run_paralleltest.jl` on my MacBook Pro, I get the following results:

```
WARNING: replacing module HDF5
WARNING: replacing module HDF5
WARNING: replacing module HDF5
WARNING: replacing module HDF5
WARNING: replacing module HDF5
WARNING: replacing module HDF5
WARNING: replacing module HDF5
singlefile single core processing:
 16.758472 seconds (762.67 k allocations: 14.933 GB, 7.55% gc time)
multifiles single core processing:
 15.962462 seconds (27.59 k allocations: 14.902 GB, 7.74% gc time)
singlefile multi core processing:
 19.293451 seconds (5.73 M allocations: 241.379 MB, 1.08% gc time)
multifiles multi core processing:
 13.152688 seconds (3.15 k allocations: 204.315 KB)
```
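
As a sanity check on the numbers (my own back-of-the-envelope arithmetic, so take it with a grain of salt): each dataset is 100,000,000 Float64 values at 8 bytes each, i.e. roughly 0.75 GiB per dataset and about 14.9 GiB for all 20 objects, which matches the ~14.9 GB of allocations reported for the two single-core runs. So I suspect most of the time simply goes into reading that amount of data.

```julia
# Back-of-the-envelope data volume (8 bytes per Float64)
N_objs = 20
N_floats = 100_000_000
per_dataset_GiB = N_floats * 8 / 2^30    # ≈ 0.745 GiB per dataset
total_GiB = N_objs * per_dataset_GiB     # ≈ 14.9 GiB in total
println((per_dataset_GiB, total_GiB))
```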

Is there a way to get better performance out of the parallel calculations? Also, are the warnings expected?

best,
Jon Alm Eriksen
