You can use blas_set_num_threads(1) in julia and it will use only 1 
thread... but this is not right answer

Dňa pondelok, 21. marca 2016 9:49:23 UTC+1 Igor Cerovsky napísal(-a):
>
> Hi,
>
> Trying to write custom and using BLAS functions implementation of 
> Gram-Schmidt algorithm I got more than 2-times slower performance for Julia 
> 0.4.3 on my computer Intel i7 6700HQ (on older processor i7 5500 the 
> performance gain is 1.2-times). The code below is a bit longer, but I got 
> the slow performance in the whole context. Trying to profile parts of the 
> algorithm I got only slightly different performance.
>
> Custom implementation:
>
> function rank1update!(DST, A, R, k)
>     rows, cols = size(A)
>     for j = k+1:cols
>         @simd for i = 1:rows
>             @inbounds DST[i,j] -= A[i, k] * R[k, j]
>         end
>     end
> end
>
> function mygemv!(DST, A, k, alpha)
>     rows, cols = size(A)
>     for j in k+1:cols
>         s = 0.0
>         @simd for i in 1:rows
>             @inbounds s += A[i, k] * A[i, j]
>         end      
>         DST[k, j] = s * alpha
>     end
> end
>
> function mgsf(M)
>     rows, cols = size(M)
>     Q = copy(M)
>     R = eye(cols)
>
>     for k in 1:cols
>         alpha = 1.0 / sumabs2(sub(Q, :, k))
>         mygemv!(R, Q, k, alpha)
>         rank1update!(Q, Q, R, k)
>     end
>     Q, R
> end
>
> Implementation using BLAS functions:
> function mgs_blas(M)
>     cols = size(M, 2)
>     Q = copy(M)
>     R = eye(cols)
>
>     for k in 1:cols
>         q_k = sub(Q, :, k)
>         Q_sub = sub(Q, :, k+1:cols)
>         R_sub = sub(R, k, k+1:cols)
>
>         alpha = 1.0 / sumabs2(q_k)
>         R[k, k+1:cols] = BLAS.gemv('T', alpha, Q_sub, q_k)
>         BLAS.ger!(-1.0,  q_k, vec(R_sub), Q_sub)
>     end
>     
>     Q, R
> end
>
> And results; using BLAS the performance gain is ~2.6 times:
>
> # custom implementation
> Q2, R2 = @time mgsf(T);
>
>   0.714916 seconds (4.99 k allocations: 15.411 MB, 0.08% gc time)
>
>
> # implementation using BLAS functions 
>
> Q5, R5 = @time mgs_blas(T);
>
>   0.339278 seconds (16.45 k allocations: 23.521 MB, 0.76% gc time)
>
>
>
> A hint: Looking at performance graph in the Task Manager it seems BLAS 
> uses more cores.
> The question that remains is: what is going on?
>
> Thanks for explanation.
>
>

Reply via email to