Hi, I am trying to optimize some code which just adds a bunch of things. My
first instinct was to unravel the loops and run them as SIMD, like so:

dx = 1/400
addprocs(3)
imin = -6
jmin = -2
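# coefs, powz, poww are the 24-element Float64 arrays listed at the bottom of this post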

#Some SIMD
@time res = @sync @parallel (+) for i = imin:dx:0
  tmp = 0
  for j=jmin:dx:0
    ans = 0.0 # keep the accumulator Float64; starting from the Int 0 makes the loop type-unstable
    @simd for k=1:24
      @inbounds ans += coefs[k]*(i^(powz[k]))*(j^(poww[k]))
    end
    tmp += abs(ans)<1
  end
  tmp
end
res = res*(12/(length(imin:dx:0)*length(jmin:dx:0)))
println(res) # Mathematica gives 2.98
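
For context: the loop counts grid points where the polynomial's magnitude is
below 1 and rescales by 12, the area of the [-6,0] x [-2,0] rectangle, so res
estimates the area of the region where |p(i,j)| < 1. A plain serial version of
the same computation (the helper name area_serial is mine, just for this sketch):

function area_serial(coefs, powz, poww, dx, imin, jmin)
  hits = 0
  for i = imin:dx:0, j = jmin:dx:0
    s = 0.0
    for k = 1:24
      s += coefs[k]*i^powz[k]*j^poww[k]
    end
    hits += abs(s) < 1 # count grid points inside the region
  end
  hits*12/(length(imin:dx:0)*length(jmin:dx:0))
end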

(Coefficient arrays are posted at the bottom for completeness.) On a 4-core i7
this runs in ~4.68 seconds. I was able to beat this by replacing the
floating-point power calculations with explicit multiplications, like so:

## Full unravel
@time res = @sync @parallel (+) for i = imin:dx:0
  tmp = 0
  isq2 = i*i; isq3 = i*isq2; isq4 = isq2*isq2; isq5 = i*isq4
  isq6 = isq4*isq2; isq7 = i*isq6; isq8 = isq4*isq4
  for j=jmin:dx:0
    jsq2 = j*j; jsq4 = jsq2*jsq2; jsq6 = jsq2*jsq4; jsq8 = jsq4*jsq4
    @inbounds tmp += abs(coefs[1]*jsq2 + coefs[2]*jsq4 + coefs[3]*jsq6 + coefs[4]*jsq8 +
      coefs[5]*i + coefs[6]*i*jsq2 + coefs[7]*i*jsq4 + coefs[8]*i*jsq6 +
      coefs[9]*isq2 + coefs[10]*isq2*jsq2 + coefs[11]*isq2*jsq4 + coefs[12]*isq2*jsq6 +
      coefs[13]*isq3 + coefs[14]*isq3*jsq2 + coefs[15]*isq3*jsq4 +
      coefs[16]*isq4 + coefs[17]*isq4*jsq2 + coefs[18]*isq4*jsq4 +
      coefs[19]*isq5 + coefs[20]*isq5*jsq2 +
      coefs[21]*isq6 + coefs[22]*isq6*jsq2 +
      coefs[23]*isq7 + coefs[24]*isq8) < 1
  end
  tmp
end
res = res*(12/(length(imin:dx:0)*length(jmin:dx:0)))
println(res) # Mathematica gives 2.98

This is the best version and gets ~1.83 seconds. (FYI, it beats a version
where I distributed the sum out by the isq powers; a sketch of what I mean by
that is below.)
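
By "distributed out by isq" I mean grouping the terms by powers of i, roughly
like the following; the helper name poly_by_i is mine, for illustration only:

function poly_by_i(coefs, i, j)
  isq2 = i*i; isq3 = i*isq2; isq4 = isq2*isq2; isq5 = i*isq4
  isq6 = isq4*isq2; isq7 = i*isq6; isq8 = isq4*isq4
  jsq2 = j*j; jsq4 = jsq2*jsq2; jsq6 = jsq2*jsq4; jsq8 = jsq4*jsq4
  # the same polynomial as above, factored by powers of i
  (coefs[1]*jsq2 + coefs[2]*jsq4 + coefs[3]*jsq6 + coefs[4]*jsq8) +
    i*(coefs[5] + coefs[6]*jsq2 + coefs[7]*jsq4 + coefs[8]*jsq6) +
    isq2*(coefs[9] + coefs[10]*jsq2 + coefs[11]*jsq4 + coefs[12]*jsq6) +
    isq3*(coefs[13] + coefs[14]*jsq2 + coefs[15]*jsq4) +
    isq4*(coefs[16] + coefs[17]*jsq2 + coefs[18]*jsq4) +
    isq5*(coefs[19] + coefs[20]*jsq2) +
    isq6*(coefs[21] + coefs[22]*jsq2) +
    isq7*coefs[23] + isq8*coefs[24]
end
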
However, when I put the fast version into a function and call @code_llvm on it, I get:

define %jl_value_t* @julia_simdCheck_3161() {
top:
  %0 = alloca [8 x %jl_value_t*], align 8
  %.sub = getelementptr inbounds [8 x %jl_value_t*]* %0, i64 0, i64 0
  %1 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 2
  %2 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 3
  store %jl_value_t* inttoptr (i64 12 to %jl_value_t*), %jl_value_t** %.sub, align 8
  %3 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 1
  %4 = load %jl_value_t*** @jl_pgcstack, align 8
  %.c = bitcast %jl_value_t** %4 to %jl_value_t*
  store %jl_value_t* %.c, %jl_value_t** %3, align 8
  store %jl_value_t** %.sub, %jl_value_t*** @jl_pgcstack, align 8
  store %jl_value_t* null, %jl_value_t** %1, align 8
  store %jl_value_t* null, %jl_value_t** %2, align 8
  %5 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 4
  store %jl_value_t* null, %jl_value_t** %5, align 8
  %6 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 5
  store %jl_value_t* null, %jl_value_t** %6, align 8
  %7 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 6
  store %jl_value_t* null, %jl_value_t** %7, align 8
  %8 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 7
  store %jl_value_t* null, %jl_value_t** %8, align 8
  %9 = call %jl_value_t* @julia_sync_begin_535()
  store %jl_value_t* inttoptr (i64 2274649264 to %jl_value_t*), %jl_value_t** %2, align 8
  %10 = load %jl_value_t* (%jl_value_t*, %jl_value_t**, i32)** inttoptr (i64 2274649264 to %jl_value_t* (%jl_value_t*, %jl_value_t**, i32)**), align 16
  %11 = load %jl_value_t** inttoptr (i64 2212282536 to %jl_value_t**), align 8
  store %jl_value_t* %11, %jl_value_t** %5, align 8
  %12 = load %jl_value_t** inttoptr (i64 2197909240 to %jl_value_t**), align 8
  store %jl_value_t* %12, %jl_value_t** %6, align 8
  %13 = load %jl_value_t** inttoptr (i64 2197909288 to %jl_value_t**), align 8
  store %jl_value_t* %13, %jl_value_t** %7, align 8
  %14 = load %jl_value_t** inttoptr (i64 2212418552 to %jl_value_t**), align 8
  store %jl_value_t* %14, %jl_value_t** %8, align 8
  %15 = call %jl_value_t* %10(%jl_value_t* inttoptr (i64 2274649264 to %jl_value_t*), %jl_value_t** %5, i32 4)
  store %jl_value_t* %15, %jl_value_t** %1, align 8
  call void @julia_sync_end_541()
  %16 = load %jl_value_t** %3, align 8
  %17 = getelementptr inbounds %jl_value_t* %16, i64 0, i32 0
  store %jl_value_t** %17, %jl_value_t*** @jl_pgcstack, align 8
  ret %jl_value_t* %15
}

This shows me that LLVM is not auto-vectorizing the inner part. *Is there a
way to tell LLVM to SIMD-vectorize the big summation in this more optimized
form?*
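
For concreteness, the shape I would want the inner kernel to compile down to
is an ordinary dot-product-style reduction, which LLVM usually vectorizes
when the inputs are dense Float64 arrays. A standalone sketch (ip and jp
would hold the precomputed powers of i and j for each of the 24 terms; all
names here are just for illustration):

function dot_kernel(coefs, ip, jp)
  s = 0.0
  @simd for k = 1:length(coefs)
    @inbounds s += coefs[k]*ip[k]*jp[k] # plain fused multiply-add reduction
  end
  s
end

The open question is whether the fully unravelled sum above can be coaxed into
that form without giving back the speedup from eliminating the power calls.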

For completeness, here are the coefficient arrays (note: coefs should be
treated as variable; in principle those zeros could change, so there's no
manual optimizing to be done there):

coefs = [1.3333333333333328, 0.16666666666666669, 0.0, 0.0,
         2.0, 2.333333333333335, 0.06250000000000001, 0.0,
         2.0, 1.1666666666666659, 0.019097222222222224, 0.0,
         1.0, 3.469446951953614e-18, 0.0,
         0.25, 0.010416666666666666, 0.0,
         0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

powz = [0.0, 0.0, 0.0, 0.0,
        1.0, 1.0, 1.0, 1.0,
        2.0, 2.0, 2.0, 2.0,
        3.0, 3.0, 3.0,
        4.0, 4.0, 4.0,
        5.0, 5.0, 6.0, 6.0, 7.0, 8.0]

poww = [2.0, 4.0, 6.0, 8.0,
        0.0, 2.0, 4.0, 6.0,
        0.0, 2.0, 4.0, 6.0,
        0.0, 2.0, 4.0,
        0.0, 2.0, 4.0,
        0.0, 2.0, 0.0, 2.0, 0.0, 0.0]
