Try:

# Baseline implementation: divide every column of `a` by its norm, in place,
# and return `a`.
# NOTE(review): `norm` is not defined in this snippet; on Julia >= 0.7 it lives in
# the LinearAlgebra standard library (`using LinearAlgebra`) — confirm it is in scope.
function foo_old!(a)
    for i in 1:size(a, 2)
        # `a[:, i]` on the right-hand side copies the column, and `x /= y` expands
        # to `x = x / y`, so each iteration builds temporary vectors before the
        # result is written back into column i.
        a[:, i] /= norm(a[:, i])
    end
    return a
end

"""
    foo_new!(a)

Scale every column of the matrix `a` in place so that its Euclidean norm is 1,
and return `a`.

The column norm is accumulated with an explicit loop instead of slicing, so no
temporary column copies are created.

Notes:
- A column whose entries are all zero gets the scale factor `1 / sqrt(0) == Inf`,
  so its entries become `NaN`.
- The scaled values are stored back into `a`, so the element type must be able to
  hold them (floating-point or complex floating-point element types).
"""
function foo_new!(a)
    # Iterate over the array's own axes rather than `1:size(a, d)`, so the
    # `@inbounds` annotations below stay valid for arrays whose indices do not
    # start at 1.
    rows = axes(a, 1)
    for i in axes(a, 2)
        # First pass: sum of squared magnitudes of column i.
        s = zero(eltype(a))
        @simd for j in rows
            @inbounds s += abs2(a[j, i])
        end
        # Second pass: multiply by the reciprocal norm (one division per column).
        scale_factor = 1 / sqrt(s)
        @simd for j in rows
            @inbounds a[j, i] *= scale_factor
        end
    end
    return a
end

# Benchmark driver: time each implementation three times on a fresh random
# 1000x10000 matrix. The first timing of each function also includes compilation.
a = rand(1000, 10000);
for _ in 1:3
    @time foo_old!(a);
end
a = rand(1000, 10000);
for _ in 1:3
    @time foo_new!(a);
end

On my machine, foo_new! is at least 10x faster than foo_old!, and it also
avoids all the allocations, because it never creates the temporary copies made by the slices a[:,i].

Reply via email to