Try: function foo_old!(a) for i in 1:size(a, 2) a[:, i] /= norm(a[:, i]) end return a end
function foo_new!(a) for i in 1:size(a, 2) s = zero(eltype(a)) @simd for j = 1:size(a,1) @inbounds s += abs2(a[j, i]) end scale_factor = 1 / sqrt(s) @simd for j = 1:size(a,1) @inbounds a[j, i] *= scale_factor end end return a end a = rand(1000,10000); @time foo_old!(a); @time foo_old!(a); @time foo_old!(a); a = rand(1000,10000); @time foo_new!(a); @time foo_new!(a); @time foo_new!(a); On my machine, foo_new! is at least 10x faster than foo_old!, and also avoids all the allocations for the slices a[:,i].