Loop code:

# TODO: figure out memory issue
function train!(m::Model, s::Adagrad; xmax=100, alpha=0.75)
    J = 0.0
    shuffle!(m.covec)

    vecsize = size(m.W_main, 1)
    eltype = typeof(m.b_main[1])
    vm = zeros(eltype, vecsize)
    vc = zeros(eltype, vecsize)
    grad_main = zeros(eltype, vecsize)
    grad_ctx = zeros(eltype, vecsize)

    for n = 1:s.niter
        # shuffle indices
        for i = 1:length(m.covec)
            @inbounds l1 = m.covec[i].i # main index
            @inbounds l2 = m.covec[i].j # context index
            @inbounds v = m.covec[i].v

            #= vm[:] = m.W_main[:, l1] =#
            #= vc[:] = m.W_ctx[:, l2] =#
            @inbounds for j = 1:vecsize
                vm[j] = m.W_main[j, l1]
                vc[j] = m.W_ctx[j, l2]
            end

            diff = dot(vec(vm), vec(vc)) + m.b_main[l1] + m.b_ctx[l2] - log(v)
            fdiff = ifelse(v < xmax, (v / xmax) ^ alpha, 1.0) * diff
            J += 0.5 * fdiff * diff

            fdiff *= s.lrate
            # inc memory by ~200 MB && running time by 2x
            #= grad_main[:] = fdiff * m.W_ctx[:, l2] =#
            #= grad_ctx[:] = fdiff * m.W_main[:, l1] =#
            @inbounds for j = 1:vecsize
                grad_main[j] = fdiff * m.W_ctx[j, l2]
                grad_ctx[j] = fdiff * m.W_main[j, l1]
            end

            # Adaptive learning
            # inc ~ 600MB + 0.75s
            #= m.W_main[:, l1] -= grad_main ./ sqrt(m.W_main_grad[:, l1]) =#
            #= m.W_ctx[:, l2] -= grad_ctx ./ sqrt(m.W_ctx_grad[:, l2]) =#
            #= m.b_main[l1] -= fdiff ./ sqrt(m.b_main_grad[l1]) =#
            #= m.b_ctx[l2] -= fdiff ./ sqrt(m.b_ctx_grad[l2]) =#
            @inbounds for j = 1:vecsize
                m.W_main[j, l1] -= grad_main[j] / sqrt(m.W_main_grad[j, l1])
                m.W_ctx[j, l2] -= grad_ctx[j] / sqrt(m.W_ctx_grad[j, l2])
            end
            m.b_main[l1] -= fdiff ./ sqrt(m.b_main_grad[l1])
            m.b_ctx[l2] -= fdiff ./ sqrt(m.b_ctx_grad[l2])

            # Gradients
            fdiff *= fdiff
            #= m.W_main_grad[:, l1] += grad_main .^ 2 =#
            #= m.W_ctx_grad[:, l2] += grad_ctx .^ 2 =#
            #= m.b_main_grad[l1] += fdiff =#
            #= m.b_ctx_grad[l2] += fdiff =#
            @inbounds for j = 1:vecsize
                m.W_main_grad[j, l1] += grad_main[j] ^ 2
                m.W_ctx_grad[j, l2] += grad_ctx[j] ^ 2
            end
            m.b_main_grad[l1] += fdiff
            m.b_ctx_grad[l2] += fdiff
        end

        #= if n % 10 == 0 =#
        #=     println("iteration $n, cost $J") =#
        #= end =#
    end
end

And the respective timings:

@time GloVe.train!(model, GloVe.Adagrad(500))
7.097 seconds (96237 k allocations: 1468 MB, 7.01% gc time)

Slower, and it allocates more memory.
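
A couple of standard ways to narrow down where those allocations come from (a sketch, not from the original thread; the script name is made up):

# Per-line allocation tracking: run the script with
#
#   julia --track-allocation=user run_glove.jl
#
# and inspect the *.mem files written next to the source files.
# Reset the counters after a warm-up call so compilation isn't counted:
GloVe.train!(model, GloVe.Adagrad(1))    # warm up / compile
Profile.clear_malloc_data()
GloVe.train!(model, GloVe.Adagrad(500))  # only this run is measured

# Type instabilities are a common cause of per-iteration allocations;
# this prints the inferred types and flags anything inferred as Any:
@code_warntype GloVe.train!(model, GloVe.Adagrad(500))
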
On Sun, May 24, 2015 at 4:21 AM, Mauro <mauro...@runbox.com> wrote:
Loops should run without allocations. Can you post your loop-code?

> A[i, :] = 0.5 * B[i, :]

To state the obvious, as a loop:

for j = 1:size(A, 2)
    A[i, j] = 0.5 * B[i, j]
end

This shouldn't allocate if i is an integer. Unless A and B have
different element types, in which case allocation might happen.
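
A quick way to check that (a sketch with made-up sizes, not part of the original message):

A = zeros(1000, 1000); B = rand(1000, 1000); i = 7

slice_assign!(A, B, i) = (A[i, :] = 0.5 * B[i, :])

function loop_assign!(A, B, i)
    for j = 1:size(A, 2)
        A[i, j] = 0.5 * B[i, j]
    end
end

slice_assign!(A, B, i); loop_assign!(A, B, i)  # warm up so compilation isn't measured
@allocated slice_assign!(A, B, i)  # > 0 bytes: 0.5 * B[i, :] builds a temporary row
@allocated loop_assign!(A, B, i)   # ~0 bytes: writes in place, no temporaries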

On Sun, 2015-05-24 at 05:00, Dom Luna <dluna...@gmail.com> wrote:
> Reposting this from Gitter chat since it seems this is more active.
>
> I'm writing a GloVe module to learn Julia.
>
> How can I avoid memory allocations? My main function deals with a lot of
> random indexing in Matrices.
>
> A[i, :] = 0.5 * B[i, :]
>
> In this case *i* isn't from a linear sequence. I'm not sure that matters.
> Anyway, I’ve done analysis and I know B[i, :] is the issue here since it’s
> creating a copy.
>
> https://github.com/JuliaLang/julia/blob/master/base/array.jl#L309 makes the copy
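
(Aside, not from the quoted message: the copy is easy to see, and on 0.3/0.4 sub gives a non-copying view of the same row.)

B = rand(100, 19)
r1 = B[7, :]        # getindex: allocates a fresh copy of the row on every call
r2 = sub(B, 7, :)   # SubArray: no copy; reads and writes go through to B
                    # (sub/slice on Julia 0.3/0.4; view on later versions)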
>
>
> I tried to do it via a loop but it looks like that doesn't help either. In
> fact, it seems to allocate slightly more memory, which seems really odd.
>
> Here’s some of the code, it’s a little messy since I’m commenting different
> approaches I’m trying out.
>
> type Model{T}
>     W_main::Matrix{T}
>     W_ctx::Matrix{T}
>     b_main::Vector{T}
>     b_ctx::Vector{T}
>     W_main_grad::Matrix{T}
>     W_ctx_grad::Matrix{T}
>     b_main_grad::Vector{T}
>     b_ctx_grad::Vector{T}
>     covec::Vector{Cooccurence}
> end
>
> # Each vocab word is associated with a main vector and a context vector.
> # The paper initializes them to values in [-0.5, 0.5] / (vecsize + 1) and
> # the gradients to 1.0.
> #
> # The +1 term is for the bias.
> function Model(comatrix; vecsize=100)
>     vs = size(comatrix, 1)
>     Model(
>         (rand(vecsize, vs) - 0.5) / (vecsize + 1),
>         (rand(vecsize, vs) - 0.5) / (vecsize + 1),
>         (rand(vs) - 0.5) / (vecsize + 1),
>         (rand(vs) - 0.5) / (vecsize + 1),
>         ones(vecsize, vs),
>         ones(vecsize, vs),
>         ones(vs),
>         ones(vs),
>         CoVector(comatrix), # not required in 0.4
>     )
> end
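
(A hypothetical usage sketch, not from the quoted message; comatrix here is a stand-in for a real co-occurrence matrix built elsewhere in the package:)

comatrix = rand(19, 19) .+ 1.0         # stand-in: 19-word vocab, nonzero counts
model = Model(comatrix, vecsize=100)   # 100x19 weight matrices, length-19 biases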
>
> # TODO: figure out memory issue
> # the memory comments are from 500 loop test with vecsize=100
> function train!(m::Model, s::Adagrad; xmax=100, alpha=0.75)
>     J = 0.0
>     shuffle!(m.covec)
>
>     vecsize = size(m.W_main, 1)
>     eltype = typeof(m.b_main[1])
>     vm = zeros(eltype, vecsize)
>     vc = zeros(eltype, vecsize)
>     grad_main = zeros(eltype, vecsize)
>     grad_ctx = zeros(eltype, vecsize)
>
>     for n = 1:s.niter
>         # shuffle indices
>         for i = 1:length(m.covec)
>             @inbounds l1 = m.covec[i].i # main index
>             @inbounds l2 = m.covec[i].j # context index
>             @inbounds v = m.covec[i].v
>
>             vm[:] = m.W_main[:, l1]
>             vc[:] = m.W_ctx[:, l2]
>
>             diff = dot(vec(vm), vec(vc)) + m.b_main[l1] + m.b_ctx[l2] - log(v)
>             fdiff = ifelse(v < xmax, (v / xmax) ^ alpha, 1.0) * diff
>             J += 0.5 * fdiff * diff
>
>             fdiff *= s.lrate
>             # inc memory by ~200 MB && running time by 2x
>             grad_main[:] = fdiff * m.W_ctx[:, l2]
>             grad_ctx[:] = fdiff * m.W_main[:, l1]
>
>             # Adaptive learning
>             # inc ~ 600MB + 0.75s
>             #= @inbounds for ii = 1:vecsize =#
>             #=     m.W_main[ii, l1] -= grad_main[ii] / sqrt(m.W_main_grad[ii, l1]) =#
>             #=     m.W_ctx[ii, l2] -= grad_ctx[ii] / sqrt(m.W_ctx_grad[ii, l2]) =#
>             #=     m.b_main[l1] -= fdiff ./ sqrt(m.b_main_grad[l1]) =#
>             #=     m.b_ctx[l2] -= fdiff ./ sqrt(m.b_ctx_grad[l2]) =#
>             #= end =#
>
>             m.W_main[:, l1] -= grad_main ./ sqrt(m.W_main_grad[:, l1])
>             m.W_ctx[:, l2] -= grad_ctx ./ sqrt(m.W_ctx_grad[:, l2])
>             m.b_main[l1] -= fdiff ./ sqrt(m.b_main_grad[l1])
>             m.b_ctx[l2] -= fdiff ./ sqrt(m.b_ctx_grad[l2])
>
>             # Gradients
>             fdiff *= fdiff
>             m.W_main_grad[:, l1] += grad_main .^ 2
>             m.W_ctx_grad[:, l2] += grad_ctx .^ 2
>             m.b_main_grad[l1] += fdiff
>             m.b_ctx_grad[l2] += fdiff
>         end
>
>         #= if n % 10 == 0 =#
>         #=     println("iteration $n, cost $J") =#
>         #= end =#
>     end
> end
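
(For reference, the inner loop above accumulates the weighted least-squares cost from the GloVe paper; a per-pair sketch with illustrative names, not from the package:)

# f(x) down-weights rare co-occurrences and is capped at 1 above xmax
glove_weight(x, xmax, alpha) = x < xmax ? (x / xmax)^alpha : 1.0

# cost contributed by one (main, context) pair with co-occurrence count v
# (dot is in Base on 0.3/0.4; needs `using LinearAlgebra` on newer versions)
function pair_cost(w_main, w_ctx, b_main, b_ctx, v; xmax=100, alpha=0.75)
    diff = dot(w_main, w_ctx) + b_main + b_ctx - log(v)
    0.5 * glove_weight(v, xmax, alpha) * diff^2
end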
>
>
> Here's the entire repo, which might be helpful: https://github.com/domluna/GloVe.jl
>
> I tried doing some loops but it allocates more memory (oddly enough) and
> gets slower.
>
> You'll notice the word vectors are indexed by column; I changed the
> representation to that to see if it would make a difference during the loop.
> It didn't seem to.
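
(Aside, not from the quoted message: Julia arrays are column-major, so with one word vector per column the inner loop reads W[j, col] for consecutive j, which walks memory contiguously.)

W = rand(100, 19)      # one 100-element word vector per column
col = 7
s = 0.0
for j = 1:size(W, 1)
    s += W[j, col]     # fixed column, varying row: sequential memory access
end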
>
> The memory analysis showed
>
> Julia Version 0.4.0-dev+4893
> Commit eb5da26* (2015-05-19 11:51 UTC)
> Platform Info:
> System: Darwin (x86_64-apple-darwin14.4.0)
> CPU: Intel(R) Core(TM) i5-2557M CPU @ 1.70GHz
> WORD_SIZE: 64
> BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Sandybridge)
> LAPACK: libopenblas
> LIBM: libopenlibm
> LLVM: libLLVM-3.3
>
> Here the model consists of 100x19 matrices and 100-element vectors: 19 words
> in the vocab, with 100-element word vectors.
>
> @time GloVe.train!(model, GloVe.Adagrad(500))
> 1.990 seconds (6383 k allocations: 1162 MB, 10.82% gc time)
>
> 0.3 is a bit slower due to worse gc, but memory use is the same.
>
> Any help would be greatly appreciated!



cheers,

dom



