On Tue, Mar 29, 2016 at 12:43 PM, 博陈 <chenphysic...@gmail.com> wrote:
> I tried the built-in profiler, and find that the problem lies in lines I > end with ******, the result is shown below: > that proved my guess, how can I pre-allocate these arrays? If I don't want > to divide this code into several parts that calculate these arrays > separately. > I don't understand what you mean by `divide this code into several parts that calculate these arrays separately` > | lines | backtrace | > > | 169 | 9011 | *********** > > | 173 | 1552 | > > | 175 | 2604 | > > | 179 | 2906 | > > | 181 | 1541 | > > | 192 | 4458 | > > | 211 | 13332 ************| > > | 214 | 8431 |************ > > | 218 | 15871 |*********** > > | 221 | 2538 | > > > 在 2016年3月29日星期二 UTC+8下午9:27:27,Stefan Karpinski写道: >> >> Have you tried: >> >> (a) calling @code_typewarn on your function >> (b) using the built-in profiler? >> >> >> On Tue, Mar 29, 2016 at 9:23 AM, 博陈 <chenph...@gmail.com> wrote: >> >>> First of all, have a look at the result. >>> >>> >>> <https://lh3.googleusercontent.com/-anNt-E4P1vM/Vvp-TybegZI/AAAAAAAAABE/ZvDO2xarndMSgKVOXy_hcPd5NTh-7QcEA/s1600/QQ%25E5%259B%25BE%25E7%2589%258720160329210732.png> >>> >>> >>> >>> >>> >>> >>> >>> >>> My code calculates the evolution of 1-d 2-electron system in the >>> electric field, some variables are calculated during the evolution. >>> According to the result of @time evolution, my code must have a >>> pre-allocation problem. Before you see the long code, i suggest that the >>> hotspot might be around the Arrays prop_e, \phio, pp. I have learnt that I >>> can use m = Array(Float64, 1) outside a "for" loop and empty!(m) and >>> push!(m, new_m) inside the loop to pre-allocate the variable m, but in my >>> situations, I don't know how to pre-allocate these arrays. >>> >>> Below is the script (precisely, the main function) itself. >>> >>> function evolution(ϕ::Array{Complex{Float64}, 2}, >>> ele::Array{Float64, 1}, dx::Float64, dt::Float64, >>> flags::Tuple{Int64, Int64, Int64, Int64}) >>> ϕg = copy(ϕ) >>> FFTW.set_num_threads(8) >>> ns = length( ϕ[:, 1] ) >>> x = get_x(ns, dx) >>> p = get_p(ns, dx) >>> if flags[4] == 1 >>> pp = similar(p) >>> A = -cumsum(ele) * dt >>> A² = A.*A >>> ##### splitting >>> r_sp = 150.0 >>> δ_sp = 5.0 >>> splitter = Array(Float64, ns, ns) >>> end >>> nt = length( ele ) >>> >>> # ##### Pre-allocate result and temporary arrays >>> #if flags[1] == 1 >>> σ = zeros(Complex128, nt) >>> #end >>> #if flags[2] == 1 >>> a = zeros(Float64, nt) >>> #end >>> #if flags[3] == 1 >>> r_ionization = 20.0 >>> n1 = round(Int, ns/2 - r_ionization/dx) >>> n2 = round(Int, ns/2 + r_ionization/dx) >>> ip = zeros(Float64, nt) >>> #end >>> >>> ##### FFT plan >>> p_fft! = plan_fft!( similar(ϕ), flags=FFTW.MEASURE ) >>> >>> prop_x = similar( ϕ ) >>> prop_p = similar( prop_x ) >>> prop_e = similar( prop_x ) >>> # this two versions just cost the same time >>> xplusy = Array(Float64, ns, ns) >>> #xplusy = Array( Float64, ns^2) >>> >>> ##### absorb boundary >>> r_a = ns * dx /2 - 50.0 >>> δ = 10.0 >>> absorb = Array(Float64, ns, ns) >>> >>> k0 = 2π / (ns * dx) >>> >>> @inbounds for j in 1:ns >>> @inbounds for i in 1:ns >>> prop_x[i, j] = exp( -im * get_potential(x[i], x[j]) * dt / 2 >>> ) >>> prop_p[i, j] = exp( -im * (p[i]^2 + p[j]^2)/2 * dt ) >>> >>> xplusy[i, j] = x[i] + x[j] >>> >>> absorb[i, j] = (1.0 - get_out(x[i], r_a, δ ))* (1.0 - >>> get_out(x[j], >>> r_a, δ)) >>> end >>> end >>> >>> if flags[2] == 1 >>> pvpx = Array(Float64, ns, ns) >>> @inbounds for j in 1:ns >>> @inbounds for i in 1:ns >>> pvpx[i, j] = get_pvpx(x[i], x[j]) >>> end >>> end >>> end >>> >>> if flags[4] == 1 >>> ϕo = zeros(Complex128, ns, ns) >>> ϕp = zeros(Complex128, ns, ns) >>> @inbounds for j in 1:ns >>> @inbounds for i in 1:ns >>> splitter[i, j] = get_out(x[i], r_sp, δ_sp) * >>> get_out(x[j], r_sp, δ_sp) >>> end >>> end >>> end >>> >>> for i in 1:nt >>> for j in eachindex(ϕ) >>> prop_e[j] = exp( -im * ele[i] * xplusy[j] * dt/2.0) >>> ************************************169 >>> >>> You might be hitting a stupid inlining issue here, try adding parenthesis to the multiplication (i.e. instead of `a * b * c * d` do `(a * b) * (c * d)`) > end >>> >>> for j in eachindex(ϕ) >>> ϕ[j] *= prop_x[j] * prop_e[j] >>> end >>> p_fft! * ϕ >>> for j in eachindex(ϕ) >>> ϕ[j] *= prop_p[j] >>> end >>> p_fft! \ ϕ >>> for j in eachindex(ϕ) >>> ϕ[j] *= prop_x[j] * prop_e[j] >>> end >>> ########## autocorrelation function σ(t) >>> if flags[1] == 1 >>> for j in eachindex(ϕ) >>> σ[i] += conj(ϕg[j]) * ϕ[j] >>> end >>> end >>> ########## dipole acceleration a(t) >>> if flags[2] == 1 >>> for j in eachindex(ϕ) >>> a[i] += abs(ϕ[j])^2 * (pvpx[j] + 2ele[i]) >>> end >>> end >>> ########## ionization probability ip(t) >>> if flags[3] == 1 >>> for j1 in n1:n2 >>> for j2 in 1:ns >>> ip[i] += abs( ϕ[j2+ns*(j1-1)] )^2 >>> end >>> end >>> for j1 in [1:n1-1; n2+1:ns] >>> for j2 in n1:n2 >>> ip[i] += abs( ϕ[j2+ns*(j1-1)] )^2 >>> end >>> end >>> end >>> ########## get momentum >>> if flags[4] == 1 >>> for j in eachindex(ϕo) >>> ϕo[j] = ϕ[j] * splitter[j] * exp( -im * A[i]*xplusy[j] ) >>> **********************************211 >>> >>> Same with above > end >>> for j in eachindex(p) >>> pp[j] = p[j]^2 /2 * (nt-i) - p[j] *sum( A[i:nt] ) + sum( >>> A²[1:nt] ) /2 ******************214 >>> >>> write out the sum directly, you can do with a helper function Using subarray would also eliminate the data copy but is still suboptimum as it is now. > end >>> for j2 in 1:ns >>> for j1 in 1:ns >>> ϕo[j1, j2] = ϕo[j1, j2] * exp( -im * (pp[j1] + >>> pp[j2]) * dt)************************218 >>> >>> I don't see any obvious problem, (apart from the potential inlining issue as above) but it does look like a keep loop with c function call so it won't be surprising if most of the time is spent here. > end >>> end >>> p_fft! * ϕo >>> for j in eachindex(ϕp) >>> ϕp[j] += ϕo[j] >>> end >>> end >>> >>> ########## absorb boundary >>> if mod(i, 300) == 0 >>> for j in eachindex(ϕ) >>> ϕ[j] *= absorb[j] >>> end >>> end >>> >>> if (mod(i, 500) == 0) >>> println("i = $i") >>> flush(STDOUT) >>> end >>> end >>> σ *= dx^2 >>> a *= dx^2 >>> ip *= dx^2 >>> >>> save("data/fs.jld", "ϕ", ϕ) >>> if flags[1] == 1 >>> save("data/sigma.jld", "σ", σ) >>> end >>> if flags[2] == 1 >>> save("data/a.jld", "a", a) >>> end >>> if flags[3] == 1 >>> save("data/ip.jld", "ip", ip) >>> end >>> if flags[4] == 1 >>> save("data/pf.jld", "ϕp", ϕp) >>> end >>> >>> #return σ, a, ip, ϕ >>> nothing >>> end >>> >>> >>> >>