the attached code (see contract_pppp_sparse) is a kernel which I hope gets optimized well. Unfortunately, compiling (on opteron or core2) it as

gfortran -O3 -march=native -ffast-math -funroll-loops -ffree-line-length-200 test.f90

./a.out
 Sparse: time[s]   0.66804099
 New: time[s]   0.20801300
     speedup    3.2115347
      Glfops    3.1151900
 Error:   1.11022302462515654E-016

shows that the hand-optimized version (see contract_pppp_test) is about 3x faster. I played around with options, but couldn't get gcc to generate fast code for the original source. I think that this would involve unrolling a loop and scalarizing the scratch arrays buffer1 and buffer2 (as done in the hand-optimized version). So, is there any combination of options to get that effect?

Second question, even the code generated for the hand-optimized version is not quite ideal. The asm of the inner loop appears (like the source) to contain about 4*81 multiplies. However, a 'smarter' way to do the calculation would be to compute the constants used for multiplying work(i) by retaining common subexpressions (i.e. all values of sa_i * sb_j * sc_k * sd_l * work[n] can be computed in 9+9+81+81 multiplies instead of the current scheme, which has 4*81). That could bring another factor of 2 speedup. Is there a chance to have gcc see this, or does this need to be done on the source level ?

If considered useful, I can add a PR to bugzilla with the testcase.

Joost

MODULE TEST
  IMPLICIT NONE
  INTEGER :: l
  INTEGER, PARAMETER :: dp=8
  INTEGER, PARAMETER :: nco(0:3)=(/((l+1)*(l+2)/2,l=0,3)/)
  INTEGER, PARAMETER :: nso(0:3)=(/(2*l+1,l=0,3)/)
CONTAINS
  SUBROUTINE contract_pppp_sparse(work, &
                            nl_a, nl_b, nl_c, nl_d,&
                            sphi_a, sphi_b, sphi_c, sphi_d,&
                            primitives,&
                            s_offset_a, s_offset_b, s_offset_c, s_offset_d)
    REAL(dp), DIMENSION(3*3*3*3), INTENT(IN) :: work
    INTEGER :: nl_a, nl_b, nl_c, nl_d
    REAL(dp), DIMENSION(3,3*nl_a), INTENT(IN) :: sphi_a
    REAL(dp), DIMENSION(3,3*nl_b), INTENT(IN) :: sphi_b
    REAL(dp), DIMENSION(3,3*nl_c), INTENT(IN) :: sphi_c
    REAL(dp), DIMENSION(3,3*nl_d), INTENT(IN) :: sphi_d
    REAL(dp), DIMENSION(3*nl_a, 3*nl_b,3*nl_c,3*nl_d) :: primitives
    INTEGER, INTENT(IN) :: s_offset_a, s_offset_b, s_offset_c, s_offset_d
    REAL(dp), DIMENSION(3* 3*3*3) :: buffer1, buffer2
    INTEGER :: imax,jmax,kmax, ia, ib, ic, id, s_offset_a1, s_offset_b1, 
s_offset_c1, s_offset_d1,&
              i1 ,i2, i3, i, j, k

    s_offset_a1 = 0
    DO ia = 1,nl_a
      s_offset_b1 = 0
      DO ib = 1,nl_b
        s_offset_c1 = 0
        DO ic = 1,nl_c
          s_offset_d1 = 0
          DO id = 1,nl_d
            buffer1 = 0.0_dp
            imax=3*3*3
            jmax=3
            kmax=3
            DO i=1,imax
buffer1(i+imax*(3-1)) = buffer1(i+imax*(3-1)) + work(1+(i-1)*kmax) * 
sphi_a(1,3+s_offset_a1)
buffer1(i+imax*(1-1)) = buffer1(i+imax*(1-1)) + work(2+(i-1)*kmax) * 
sphi_a(2,1+s_offset_a1)
buffer1(i+imax*(2-1)) = buffer1(i+imax*(2-1)) + work(3+(i-1)*kmax) * 
sphi_a(3,2+s_offset_a1)
            ENDDO
            buffer2 = 0.0_dp
            imax=3*3*3
            jmax=3
            kmax=3
            DO i=1,imax
buffer2(i+imax*(3-1)) = buffer2(i+imax*(3-1)) + buffer1(1+(i-1)*kmax) * 
sphi_b(1,3+s_offset_b1)
buffer2(i+imax*(1-1)) = buffer2(i+imax*(1-1)) + buffer1(2+(i-1)*kmax) * 
sphi_b(2,1+s_offset_b1)
buffer2(i+imax*(2-1)) = buffer2(i+imax*(2-1)) + buffer1(3+(i-1)*kmax) * 
sphi_b(3,2+s_offset_b1)
            ENDDO
            buffer1 = 0.0_dp
            imax=3*3*3
            jmax=3
            kmax=3
            DO i=1,imax
buffer1(i+imax*(3-1)) = buffer1(i+imax*(3-1)) + buffer2(1+(i-1)*kmax) * 
sphi_c(1,3+s_offset_c1)
buffer1(i+imax*(1-1)) = buffer1(i+imax*(1-1)) + buffer2(2+(i-1)*kmax) * 
sphi_c(2,1+s_offset_c1)
buffer1(i+imax*(2-1)) = buffer1(i+imax*(2-1)) + buffer2(3+(i-1)*kmax) * 
sphi_c(3,2+s_offset_c1)
            ENDDO
            imax=3*3*3
            jmax=3
            kmax=3
            i = 0
            DO i1=1,3
            DO i2=1,3
            DO i3=1,3
              i = i + 1
primitives(s_offset_a1+i3, s_offset_b1+i2, s_offset_c1+i1, s_offset_d1+3) =&
primitives(s_offset_a1+i3, s_offset_b1+i2, s_offset_c1+i1, s_offset_d1+3) &
+ buffer1(1+(i-1)*kmax) * sphi_d(1,3+s_offset_d1)
primitives(s_offset_a1+i3, s_offset_b1+i2, s_offset_c1+i1, s_offset_d1+1) =&
primitives(s_offset_a1+i3, s_offset_b1+i2, s_offset_c1+i1, s_offset_d1+1) &
+ buffer1(2+(i-1)*kmax) * sphi_d(2,1+s_offset_d1)
primitives(s_offset_a1+i3, s_offset_b1+i2, s_offset_c1+i1, s_offset_d1+2) =&
primitives(s_offset_a1+i3, s_offset_b1+i2, s_offset_c1+i1, s_offset_d1+2) &
+ buffer1(3+(i-1)*kmax) * sphi_d(3,2+s_offset_d1)
            ENDDO
            ENDDO
            ENDDO
            s_offset_d1 = s_offset_d1 + 3
          END DO
          s_offset_c1 = s_offset_c1 + 3
        END DO
        s_offset_b1 = s_offset_b1 + 3
      END DO
      s_offset_a1 = s_offset_a1 + 3
    END DO

  END SUBROUTINE contract_pppp_sparse

  SUBROUTINE contract_pppp_test(work, &
                            nl_a, nl_b, nl_c, nl_d,&
                            sphi_a, sphi_b, sphi_c, sphi_d,&
                            primitives,&
                            s_offset_a, s_offset_b, s_offset_c, s_offset_d)
    REAL(dp), DIMENSION(3*3*3*3), INTENT(IN) :: work
    INTEGER :: nl_a, nl_b, nl_c, nl_d
    REAL(dp), DIMENSION(3,3*nl_a), INTENT(IN) :: sphi_a
    REAL(dp), DIMENSION(3,3*nl_b), INTENT(IN) :: sphi_b
    REAL(dp), DIMENSION(3,3*nl_c), INTENT(IN) :: sphi_c
    REAL(dp), DIMENSION(3,3*nl_d), INTENT(IN) :: sphi_d
    REAL(dp), DIMENSION(3*nl_a, 3*nl_b,3*nl_c,3*nl_d) :: primitives
    INTEGER, INTENT(IN) :: s_offset_a, s_offset_b, s_offset_c, s_offset_d
    REAL(dp), DIMENSION(3* 3*3*3) :: buffer1, buffer2
    INTEGER :: imax,jmax,kmax, ia, ib, ic, id, s_offset_a1, s_offset_b1, 
s_offset_c1, s_offset_d1,&
              i1 ,i2, i3, i, j, k
    REAL(dp) :: sa1,sa2,sa3,sb1,sb2,sb3,sc1,sc2,sc3,sd1,sd2,sd3

REAL(KIND=dp) :: buffer1_1
REAL(KIND=dp) :: buffer1_2
REAL(KIND=dp) :: buffer1_3
REAL(KIND=dp) :: buffer1_4
REAL(KIND=dp) :: buffer1_5
REAL(KIND=dp) :: buffer1_6
REAL(KIND=dp) :: buffer1_7
REAL(KIND=dp) :: buffer1_8
REAL(KIND=dp) :: buffer1_9
REAL(KIND=dp) :: buffer1_10
REAL(KIND=dp) :: buffer1_11
REAL(KIND=dp) :: buffer1_12
REAL(KIND=dp) :: buffer1_13
REAL(KIND=dp) :: buffer1_14
REAL(KIND=dp) :: buffer1_15
REAL(KIND=dp) :: buffer1_16
REAL(KIND=dp) :: buffer1_17
REAL(KIND=dp) :: buffer1_18
REAL(KIND=dp) :: buffer1_19
REAL(KIND=dp) :: buffer1_20
REAL(KIND=dp) :: buffer1_21
REAL(KIND=dp) :: buffer1_22
REAL(KIND=dp) :: buffer1_23
REAL(KIND=dp) :: buffer1_24
REAL(KIND=dp) :: buffer1_25
REAL(KIND=dp) :: buffer1_26
REAL(KIND=dp) :: buffer1_27
REAL(KIND=dp) :: buffer1_28
REAL(KIND=dp) :: buffer1_29
REAL(KIND=dp) :: buffer1_30
REAL(KIND=dp) :: buffer1_31
REAL(KIND=dp) :: buffer1_32
REAL(KIND=dp) :: buffer1_33
REAL(KIND=dp) :: buffer1_34
REAL(KIND=dp) :: buffer1_35
REAL(KIND=dp) :: buffer1_36
REAL(KIND=dp) :: buffer1_37
REAL(KIND=dp) :: buffer1_38
REAL(KIND=dp) :: buffer1_39
REAL(KIND=dp) :: buffer1_40
REAL(KIND=dp) :: buffer1_41
REAL(KIND=dp) :: buffer1_42
REAL(KIND=dp) :: buffer1_43
REAL(KIND=dp) :: buffer1_44
REAL(KIND=dp) :: buffer1_45
REAL(KIND=dp) :: buffer1_46
REAL(KIND=dp) :: buffer1_47
REAL(KIND=dp) :: buffer1_48
REAL(KIND=dp) :: buffer1_49
REAL(KIND=dp) :: buffer1_50
REAL(KIND=dp) :: buffer1_51
REAL(KIND=dp) :: buffer1_52
REAL(KIND=dp) :: buffer1_53
REAL(KIND=dp) :: buffer1_54
REAL(KIND=dp) :: buffer1_55
REAL(KIND=dp) :: buffer1_56
REAL(KIND=dp) :: buffer1_57
REAL(KIND=dp) :: buffer1_58
REAL(KIND=dp) :: buffer1_59
REAL(KIND=dp) :: buffer1_60
REAL(KIND=dp) :: buffer1_61
REAL(KIND=dp) :: buffer1_62
REAL(KIND=dp) :: buffer1_63
REAL(KIND=dp) :: buffer1_64
REAL(KIND=dp) :: buffer1_65
REAL(KIND=dp) :: buffer1_66
REAL(KIND=dp) :: buffer1_67
REAL(KIND=dp) :: buffer1_68
REAL(KIND=dp) :: buffer1_69
REAL(KIND=dp) :: buffer1_70
REAL(KIND=dp) :: buffer1_71
REAL(KIND=dp) :: buffer1_72
REAL(KIND=dp) :: buffer1_73
REAL(KIND=dp) :: buffer1_74
REAL(KIND=dp) :: buffer1_75
REAL(KIND=dp) :: buffer1_76
REAL(KIND=dp) :: buffer1_77
REAL(KIND=dp) :: buffer1_78
REAL(KIND=dp) :: buffer1_79
REAL(KIND=dp) :: buffer1_80
REAL(KIND=dp) :: buffer1_81
REAL(KIND=dp) :: buffer2_1
REAL(KIND=dp) :: buffer2_2
REAL(KIND=dp) :: buffer2_3
REAL(KIND=dp) :: buffer2_4
REAL(KIND=dp) :: buffer2_5
REAL(KIND=dp) :: buffer2_6
REAL(KIND=dp) :: buffer2_7
REAL(KIND=dp) :: buffer2_8
REAL(KIND=dp) :: buffer2_9
REAL(KIND=dp) :: buffer2_10
REAL(KIND=dp) :: buffer2_11
REAL(KIND=dp) :: buffer2_12
REAL(KIND=dp) :: buffer2_13
REAL(KIND=dp) :: buffer2_14
REAL(KIND=dp) :: buffer2_15
REAL(KIND=dp) :: buffer2_16
REAL(KIND=dp) :: buffer2_17
REAL(KIND=dp) :: buffer2_18
REAL(KIND=dp) :: buffer2_19
REAL(KIND=dp) :: buffer2_20
REAL(KIND=dp) :: buffer2_21
REAL(KIND=dp) :: buffer2_22
REAL(KIND=dp) :: buffer2_23
REAL(KIND=dp) :: buffer2_24
REAL(KIND=dp) :: buffer2_25
REAL(KIND=dp) :: buffer2_26
REAL(KIND=dp) :: buffer2_27
REAL(KIND=dp) :: buffer2_28
REAL(KIND=dp) :: buffer2_29
REAL(KIND=dp) :: buffer2_30
REAL(KIND=dp) :: buffer2_31
REAL(KIND=dp) :: buffer2_32
REAL(KIND=dp) :: buffer2_33
REAL(KIND=dp) :: buffer2_34
REAL(KIND=dp) :: buffer2_35
REAL(KIND=dp) :: buffer2_36
REAL(KIND=dp) :: buffer2_37
REAL(KIND=dp) :: buffer2_38
REAL(KIND=dp) :: buffer2_39
REAL(KIND=dp) :: buffer2_40
REAL(KIND=dp) :: buffer2_41
REAL(KIND=dp) :: buffer2_42
REAL(KIND=dp) :: buffer2_43
REAL(KIND=dp) :: buffer2_44
REAL(KIND=dp) :: buffer2_45
REAL(KIND=dp) :: buffer2_46
REAL(KIND=dp) :: buffer2_47
REAL(KIND=dp) :: buffer2_48
REAL(KIND=dp) :: buffer2_49
REAL(KIND=dp) :: buffer2_50
REAL(KIND=dp) :: buffer2_51
REAL(KIND=dp) :: buffer2_52
REAL(KIND=dp) :: buffer2_53
REAL(KIND=dp) :: buffer2_54
REAL(KIND=dp) :: buffer2_55
REAL(KIND=dp) :: buffer2_56
REAL(KIND=dp) :: buffer2_57
REAL(KIND=dp) :: buffer2_58
REAL(KIND=dp) :: buffer2_59
REAL(KIND=dp) :: buffer2_60
REAL(KIND=dp) :: buffer2_61
REAL(KIND=dp) :: buffer2_62
REAL(KIND=dp) :: buffer2_63
REAL(KIND=dp) :: buffer2_64
REAL(KIND=dp) :: buffer2_65
REAL(KIND=dp) :: buffer2_66
REAL(KIND=dp) :: buffer2_67
REAL(KIND=dp) :: buffer2_68
REAL(KIND=dp) :: buffer2_69
REAL(KIND=dp) :: buffer2_70
REAL(KIND=dp) :: buffer2_71
REAL(KIND=dp) :: buffer2_72
REAL(KIND=dp) :: buffer2_73
REAL(KIND=dp) :: buffer2_74
REAL(KIND=dp) :: buffer2_75
REAL(KIND=dp) :: buffer2_76
REAL(KIND=dp) :: buffer2_77
REAL(KIND=dp) :: buffer2_78
REAL(KIND=dp) :: buffer2_79
REAL(KIND=dp) :: buffer2_80
REAL(KIND=dp) :: buffer2_81


          s_offset_d1 = 0
          DO id = 1,nl_d
             sd1=sphi_d(1,3+s_offset_d1)
             sd2=sphi_d(2,1+s_offset_d1)
             sd3=sphi_d(3,2+s_offset_d1)

        s_offset_c1 = 0
        DO ic = 1,nl_c
           sc1=sphi_c(1,3+s_offset_c1)
           sc2=sphi_c(2,1+s_offset_c1)
           sc3=sphi_c(3,2+s_offset_c1)

      s_offset_b1 = 0
      DO ib = 1,nl_b
        sb1=sphi_b(1,3+s_offset_b1)
        sb2=sphi_b(2,1+s_offset_b1)
        sb3=sphi_b(3,2+s_offset_b1)

    s_offset_a1 = 0
    DO ia = 1,nl_a
      sa1=sphi_a(1,3+s_offset_a1)
      sa2=sphi_a(2,1+s_offset_a1)
      sa3=sphi_a(3,2+s_offset_a1)

buffer1_1=0.0_dp
buffer1_2=0.0_dp
buffer1_3=0.0_dp
buffer1_4=0.0_dp
buffer1_5=0.0_dp
buffer1_6=0.0_dp
buffer1_7=0.0_dp
buffer1_8=0.0_dp
buffer1_9=0.0_dp
buffer1_10=0.0_dp
buffer1_11=0.0_dp
buffer1_12=0.0_dp
buffer1_13=0.0_dp
buffer1_14=0.0_dp
buffer1_15=0.0_dp
buffer1_16=0.0_dp
buffer1_17=0.0_dp
buffer1_18=0.0_dp
buffer1_19=0.0_dp
buffer1_20=0.0_dp
buffer1_21=0.0_dp
buffer1_22=0.0_dp
buffer1_23=0.0_dp
buffer1_24=0.0_dp
buffer1_25=0.0_dp
buffer1_26=0.0_dp
buffer1_27=0.0_dp
buffer1_28=0.0_dp
buffer1_29=0.0_dp
buffer1_30=0.0_dp
buffer1_31=0.0_dp
buffer1_32=0.0_dp
buffer1_33=0.0_dp
buffer1_34=0.0_dp
buffer1_35=0.0_dp
buffer1_36=0.0_dp
buffer1_37=0.0_dp
buffer1_38=0.0_dp
buffer1_39=0.0_dp
buffer1_40=0.0_dp
buffer1_41=0.0_dp
buffer1_42=0.0_dp
buffer1_43=0.0_dp
buffer1_44=0.0_dp
buffer1_45=0.0_dp
buffer1_46=0.0_dp
buffer1_47=0.0_dp
buffer1_48=0.0_dp
buffer1_49=0.0_dp
buffer1_50=0.0_dp
buffer1_51=0.0_dp
buffer1_52=0.0_dp
buffer1_53=0.0_dp
buffer1_54=0.0_dp
buffer1_55=0.0_dp
buffer1_56=0.0_dp
buffer1_57=0.0_dp
buffer1_58=0.0_dp
buffer1_59=0.0_dp
buffer1_60=0.0_dp
buffer1_61=0.0_dp
buffer1_62=0.0_dp
buffer1_63=0.0_dp
buffer1_64=0.0_dp
buffer1_65=0.0_dp
buffer1_66=0.0_dp
buffer1_67=0.0_dp
buffer1_68=0.0_dp
buffer1_69=0.0_dp
buffer1_70=0.0_dp
buffer1_71=0.0_dp
buffer1_72=0.0_dp
buffer1_73=0.0_dp
buffer1_74=0.0_dp
buffer1_75=0.0_dp
buffer1_76=0.0_dp
buffer1_77=0.0_dp
buffer1_78=0.0_dp
buffer1_79=0.0_dp
buffer1_80=0.0_dp
buffer1_81=0.0_dp
buffer1_55 = buffer1_55+ work(1) * sa1
buffer1_1 = buffer1_1+ work(2) * sa2
buffer1_28 = buffer1_28+ work(3) * sa3
buffer1_56 = buffer1_56+ work(4) * sa1
buffer1_2 = buffer1_2+ work(5) * sa2
buffer1_29 = buffer1_29+ work(6) * sa3
buffer1_57 = buffer1_57+ work(7) * sa1
buffer1_3 = buffer1_3+ work(8) * sa2
buffer1_30 = buffer1_30+ work(9) * sa3
buffer1_58 = buffer1_58+ work(10) * sa1
buffer1_4 = buffer1_4+ work(11) * sa2
buffer1_31 = buffer1_31+ work(12) * sa3
buffer1_59 = buffer1_59+ work(13) * sa1
buffer1_5 = buffer1_5+ work(14) * sa2
buffer1_32 = buffer1_32+ work(15) * sa3
buffer1_60 = buffer1_60+ work(16) * sa1
buffer1_6 = buffer1_6+ work(17) * sa2
buffer1_33 = buffer1_33+ work(18) * sa3
buffer1_61 = buffer1_61+ work(19) * sa1
buffer1_7 = buffer1_7+ work(20) * sa2
buffer1_34 = buffer1_34+ work(21) * sa3
buffer1_62 = buffer1_62+ work(22) * sa1
buffer1_8 = buffer1_8+ work(23) * sa2
buffer1_35 = buffer1_35+ work(24) * sa3
buffer1_63 = buffer1_63+ work(25) * sa1
buffer1_9 = buffer1_9+ work(26) * sa2
buffer1_36 = buffer1_36+ work(27) * sa3
buffer1_64 = buffer1_64+ work(28) * sa1
buffer1_10 = buffer1_10+ work(29) * sa2
buffer1_37 = buffer1_37+ work(30) * sa3
buffer1_65 = buffer1_65+ work(31) * sa1
buffer1_11 = buffer1_11+ work(32) * sa2
buffer1_38 = buffer1_38+ work(33) * sa3
buffer1_66 = buffer1_66+ work(34) * sa1
buffer1_12 = buffer1_12+ work(35) * sa2
buffer1_39 = buffer1_39+ work(36) * sa3
buffer1_67 = buffer1_67+ work(37) * sa1
buffer1_13 = buffer1_13+ work(38) * sa2
buffer1_40 = buffer1_40+ work(39) * sa3
buffer1_68 = buffer1_68+ work(40) * sa1
buffer1_14 = buffer1_14+ work(41) * sa2
buffer1_41 = buffer1_41+ work(42) * sa3
buffer1_69 = buffer1_69+ work(43) * sa1
buffer1_15 = buffer1_15+ work(44) * sa2
buffer1_42 = buffer1_42+ work(45) * sa3
buffer1_70 = buffer1_70+ work(46) * sa1
buffer1_16 = buffer1_16+ work(47) * sa2
buffer1_43 = buffer1_43+ work(48) * sa3
buffer1_71 = buffer1_71+ work(49) * sa1
buffer1_17 = buffer1_17+ work(50) * sa2
buffer1_44 = buffer1_44+ work(51) * sa3
buffer1_72 = buffer1_72+ work(52) * sa1
buffer1_18 = buffer1_18+ work(53) * sa2
buffer1_45 = buffer1_45+ work(54) * sa3
buffer1_73 = buffer1_73+ work(55) * sa1
buffer1_19 = buffer1_19+ work(56) * sa2
buffer1_46 = buffer1_46+ work(57) * sa3
buffer1_74 = buffer1_74+ work(58) * sa1
buffer1_20 = buffer1_20+ work(59) * sa2
buffer1_47 = buffer1_47+ work(60) * sa3
buffer1_75 = buffer1_75+ work(61) * sa1
buffer1_21 = buffer1_21+ work(62) * sa2
buffer1_48 = buffer1_48+ work(63) * sa3
buffer1_76 = buffer1_76+ work(64) * sa1
buffer1_22 = buffer1_22+ work(65) * sa2
buffer1_49 = buffer1_49+ work(66) * sa3
buffer1_77 = buffer1_77+ work(67) * sa1
buffer1_23 = buffer1_23+ work(68) * sa2
buffer1_50 = buffer1_50+ work(69) * sa3
buffer1_78 = buffer1_78+ work(70) * sa1
buffer1_24 = buffer1_24+ work(71) * sa2
buffer1_51 = buffer1_51+ work(72) * sa3
buffer1_79 = buffer1_79+ work(73) * sa1
buffer1_25 = buffer1_25+ work(74) * sa2
buffer1_52 = buffer1_52+ work(75) * sa3
buffer1_80 = buffer1_80+ work(76) * sa1
buffer1_26 = buffer1_26+ work(77) * sa2
buffer1_53 = buffer1_53+ work(78) * sa3
buffer1_81 = buffer1_81+ work(79) * sa1
buffer1_27 = buffer1_27+ work(80) * sa2
buffer1_54 = buffer1_54+ work(81) * sa3
buffer2_1=0.0_dp
buffer2_2=0.0_dp
buffer2_3=0.0_dp
buffer2_4=0.0_dp
buffer2_5=0.0_dp
buffer2_6=0.0_dp
buffer2_7=0.0_dp
buffer2_8=0.0_dp
buffer2_9=0.0_dp
buffer2_10=0.0_dp
buffer2_11=0.0_dp
buffer2_12=0.0_dp
buffer2_13=0.0_dp
buffer2_14=0.0_dp
buffer2_15=0.0_dp
buffer2_16=0.0_dp
buffer2_17=0.0_dp
buffer2_18=0.0_dp
buffer2_19=0.0_dp
buffer2_20=0.0_dp
buffer2_21=0.0_dp
buffer2_22=0.0_dp
buffer2_23=0.0_dp
buffer2_24=0.0_dp
buffer2_25=0.0_dp
buffer2_26=0.0_dp
buffer2_27=0.0_dp
buffer2_28=0.0_dp
buffer2_29=0.0_dp
buffer2_30=0.0_dp
buffer2_31=0.0_dp
buffer2_32=0.0_dp
buffer2_33=0.0_dp
buffer2_34=0.0_dp
buffer2_35=0.0_dp
buffer2_36=0.0_dp
buffer2_37=0.0_dp
buffer2_38=0.0_dp
buffer2_39=0.0_dp
buffer2_40=0.0_dp
buffer2_41=0.0_dp
buffer2_42=0.0_dp
buffer2_43=0.0_dp
buffer2_44=0.0_dp
buffer2_45=0.0_dp
buffer2_46=0.0_dp
buffer2_47=0.0_dp
buffer2_48=0.0_dp
buffer2_49=0.0_dp
buffer2_50=0.0_dp
buffer2_51=0.0_dp
buffer2_52=0.0_dp
buffer2_53=0.0_dp
buffer2_54=0.0_dp
buffer2_55=0.0_dp
buffer2_56=0.0_dp
buffer2_57=0.0_dp
buffer2_58=0.0_dp
buffer2_59=0.0_dp
buffer2_60=0.0_dp
buffer2_61=0.0_dp
buffer2_62=0.0_dp
buffer2_63=0.0_dp
buffer2_64=0.0_dp
buffer2_65=0.0_dp
buffer2_66=0.0_dp
buffer2_67=0.0_dp
buffer2_68=0.0_dp
buffer2_69=0.0_dp
buffer2_70=0.0_dp
buffer2_71=0.0_dp
buffer2_72=0.0_dp
buffer2_73=0.0_dp
buffer2_74=0.0_dp
buffer2_75=0.0_dp
buffer2_76=0.0_dp
buffer2_77=0.0_dp
buffer2_78=0.0_dp
buffer2_79=0.0_dp
buffer2_80=0.0_dp
buffer2_81=0.0_dp
buffer2_55 = buffer2_55+ buffer1_1 * sb1
buffer2_1 = buffer2_1+ buffer1_2 * sb2
buffer2_28 = buffer2_28+ buffer1_3 * sb3
buffer2_56 = buffer2_56+ buffer1_4 * sb1
buffer2_2 = buffer2_2+ buffer1_5 * sb2
buffer2_29 = buffer2_29+ buffer1_6 * sb3
buffer2_57 = buffer2_57+ buffer1_7 * sb1
buffer2_3 = buffer2_3+ buffer1_8 * sb2
buffer2_30 = buffer2_30+ buffer1_9 * sb3
buffer2_58 = buffer2_58+ buffer1_10 * sb1
buffer2_4 = buffer2_4+ buffer1_11 * sb2
buffer2_31 = buffer2_31+ buffer1_12 * sb3
buffer2_59 = buffer2_59+ buffer1_13 * sb1
buffer2_5 = buffer2_5+ buffer1_14 * sb2
buffer2_32 = buffer2_32+ buffer1_15 * sb3
buffer2_60 = buffer2_60+ buffer1_16 * sb1
buffer2_6 = buffer2_6+ buffer1_17 * sb2
buffer2_33 = buffer2_33+ buffer1_18 * sb3
buffer2_61 = buffer2_61+ buffer1_19 * sb1
buffer2_7 = buffer2_7+ buffer1_20 * sb2
buffer2_34 = buffer2_34+ buffer1_21 * sb3
buffer2_62 = buffer2_62+ buffer1_22 * sb1
buffer2_8 = buffer2_8+ buffer1_23 * sb2
buffer2_35 = buffer2_35+ buffer1_24 * sb3
buffer2_63 = buffer2_63+ buffer1_25 * sb1
buffer2_9 = buffer2_9+ buffer1_26 * sb2
buffer2_36 = buffer2_36+ buffer1_27 * sb3
buffer2_64 = buffer2_64+ buffer1_28 * sb1
buffer2_10 = buffer2_10+ buffer1_29 * sb2
buffer2_37 = buffer2_37+ buffer1_30 * sb3
buffer2_65 = buffer2_65+ buffer1_31 * sb1
buffer2_11 = buffer2_11+ buffer1_32 * sb2
buffer2_38 = buffer2_38+ buffer1_33 * sb3
buffer2_66 = buffer2_66+ buffer1_34 * sb1
buffer2_12 = buffer2_12+ buffer1_35 * sb2
buffer2_39 = buffer2_39+ buffer1_36 * sb3
buffer2_67 = buffer2_67+ buffer1_37 * sb1
buffer2_13 = buffer2_13+ buffer1_38 * sb2
buffer2_40 = buffer2_40+ buffer1_39 * sb3
buffer2_68 = buffer2_68+ buffer1_40 * sb1
buffer2_14 = buffer2_14+ buffer1_41 * sb2
buffer2_41 = buffer2_41+ buffer1_42 * sb3
buffer2_69 = buffer2_69+ buffer1_43 * sb1
buffer2_15 = buffer2_15+ buffer1_44 * sb2
buffer2_42 = buffer2_42+ buffer1_45 * sb3
buffer2_70 = buffer2_70+ buffer1_46 * sb1
buffer2_16 = buffer2_16+ buffer1_47 * sb2
buffer2_43 = buffer2_43+ buffer1_48 * sb3
buffer2_71 = buffer2_71+ buffer1_49 * sb1
buffer2_17 = buffer2_17+ buffer1_50 * sb2
buffer2_44 = buffer2_44+ buffer1_51 * sb3
buffer2_72 = buffer2_72+ buffer1_52 * sb1
buffer2_18 = buffer2_18+ buffer1_53 * sb2
buffer2_45 = buffer2_45+ buffer1_54 * sb3
buffer2_73 = buffer2_73+ buffer1_55 * sb1
buffer2_19 = buffer2_19+ buffer1_56 * sb2
buffer2_46 = buffer2_46+ buffer1_57 * sb3
buffer2_74 = buffer2_74+ buffer1_58 * sb1
buffer2_20 = buffer2_20+ buffer1_59 * sb2
buffer2_47 = buffer2_47+ buffer1_60 * sb3
buffer2_75 = buffer2_75+ buffer1_61 * sb1
buffer2_21 = buffer2_21+ buffer1_62 * sb2
buffer2_48 = buffer2_48+ buffer1_63 * sb3
buffer2_76 = buffer2_76+ buffer1_64 * sb1
buffer2_22 = buffer2_22+ buffer1_65 * sb2
buffer2_49 = buffer2_49+ buffer1_66 * sb3
buffer2_77 = buffer2_77+ buffer1_67 * sb1
buffer2_23 = buffer2_23+ buffer1_68 * sb2
buffer2_50 = buffer2_50+ buffer1_69 * sb3
buffer2_78 = buffer2_78+ buffer1_70 * sb1
buffer2_24 = buffer2_24+ buffer1_71 * sb2
buffer2_51 = buffer2_51+ buffer1_72 * sb3
buffer2_79 = buffer2_79+ buffer1_73 * sb1
buffer2_25 = buffer2_25+ buffer1_74 * sb2
buffer2_52 = buffer2_52+ buffer1_75 * sb3
buffer2_80 = buffer2_80+ buffer1_76 * sb1
buffer2_26 = buffer2_26+ buffer1_77 * sb2
buffer2_53 = buffer2_53+ buffer1_78 * sb3
buffer2_81 = buffer2_81+ buffer1_79 * sb1
buffer2_27 = buffer2_27+ buffer1_80 * sb2
buffer2_54 = buffer2_54+ buffer1_81 * sb3
buffer1_1=0.0_dp
buffer1_2=0.0_dp
buffer1_3=0.0_dp
buffer1_4=0.0_dp
buffer1_5=0.0_dp
buffer1_6=0.0_dp
buffer1_7=0.0_dp
buffer1_8=0.0_dp
buffer1_9=0.0_dp
buffer1_10=0.0_dp
buffer1_11=0.0_dp
buffer1_12=0.0_dp
buffer1_13=0.0_dp
buffer1_14=0.0_dp
buffer1_15=0.0_dp
buffer1_16=0.0_dp
buffer1_17=0.0_dp
buffer1_18=0.0_dp
buffer1_19=0.0_dp
buffer1_20=0.0_dp
buffer1_21=0.0_dp
buffer1_22=0.0_dp
buffer1_23=0.0_dp
buffer1_24=0.0_dp
buffer1_25=0.0_dp
buffer1_26=0.0_dp
buffer1_27=0.0_dp
buffer1_28=0.0_dp
buffer1_29=0.0_dp
buffer1_30=0.0_dp
buffer1_31=0.0_dp
buffer1_32=0.0_dp
buffer1_33=0.0_dp
buffer1_34=0.0_dp
buffer1_35=0.0_dp
buffer1_36=0.0_dp
buffer1_37=0.0_dp
buffer1_38=0.0_dp
buffer1_39=0.0_dp
buffer1_40=0.0_dp
buffer1_41=0.0_dp
buffer1_42=0.0_dp
buffer1_43=0.0_dp
buffer1_44=0.0_dp
buffer1_45=0.0_dp
buffer1_46=0.0_dp
buffer1_47=0.0_dp
buffer1_48=0.0_dp
buffer1_49=0.0_dp
buffer1_50=0.0_dp
buffer1_51=0.0_dp
buffer1_52=0.0_dp
buffer1_53=0.0_dp
buffer1_54=0.0_dp
buffer1_55=0.0_dp
buffer1_56=0.0_dp
buffer1_57=0.0_dp
buffer1_58=0.0_dp
buffer1_59=0.0_dp
buffer1_60=0.0_dp
buffer1_61=0.0_dp
buffer1_62=0.0_dp
buffer1_63=0.0_dp
buffer1_64=0.0_dp
buffer1_65=0.0_dp
buffer1_66=0.0_dp
buffer1_67=0.0_dp
buffer1_68=0.0_dp
buffer1_69=0.0_dp
buffer1_70=0.0_dp
buffer1_71=0.0_dp
buffer1_72=0.0_dp
buffer1_73=0.0_dp
buffer1_74=0.0_dp
buffer1_75=0.0_dp
buffer1_76=0.0_dp
buffer1_77=0.0_dp
buffer1_78=0.0_dp
buffer1_79=0.0_dp
buffer1_80=0.0_dp
buffer1_81=0.0_dp
buffer1_55 = buffer1_55+ buffer2_1 * sc1
buffer1_1 = buffer1_1+ buffer2_2 * sc2
buffer1_28 = buffer1_28+ buffer2_3 * sc3
buffer1_56 = buffer1_56+ buffer2_4 * sc1
buffer1_2 = buffer1_2+ buffer2_5 * sc2
buffer1_29 = buffer1_29+ buffer2_6 * sc3
buffer1_57 = buffer1_57+ buffer2_7 * sc1
buffer1_3 = buffer1_3+ buffer2_8 * sc2
buffer1_30 = buffer1_30+ buffer2_9 * sc3
buffer1_58 = buffer1_58+ buffer2_10 * sc1
buffer1_4 = buffer1_4+ buffer2_11 * sc2
buffer1_31 = buffer1_31+ buffer2_12 * sc3
buffer1_59 = buffer1_59+ buffer2_13 * sc1
buffer1_5 = buffer1_5+ buffer2_14 * sc2
buffer1_32 = buffer1_32+ buffer2_15 * sc3
buffer1_60 = buffer1_60+ buffer2_16 * sc1
buffer1_6 = buffer1_6+ buffer2_17 * sc2
buffer1_33 = buffer1_33+ buffer2_18 * sc3
buffer1_61 = buffer1_61+ buffer2_19 * sc1
buffer1_7 = buffer1_7+ buffer2_20 * sc2
buffer1_34 = buffer1_34+ buffer2_21 * sc3
buffer1_62 = buffer1_62+ buffer2_22 * sc1
buffer1_8 = buffer1_8+ buffer2_23 * sc2
buffer1_35 = buffer1_35+ buffer2_24 * sc3
buffer1_63 = buffer1_63+ buffer2_25 * sc1
buffer1_9 = buffer1_9+ buffer2_26 * sc2
buffer1_36 = buffer1_36+ buffer2_27 * sc3
buffer1_64 = buffer1_64+ buffer2_28 * sc1
buffer1_10 = buffer1_10+ buffer2_29 * sc2
buffer1_37 = buffer1_37+ buffer2_30 * sc3
buffer1_65 = buffer1_65+ buffer2_31 * sc1
buffer1_11 = buffer1_11+ buffer2_32 * sc2
buffer1_38 = buffer1_38+ buffer2_33 * sc3
buffer1_66 = buffer1_66+ buffer2_34 * sc1
buffer1_12 = buffer1_12+ buffer2_35 * sc2
buffer1_39 = buffer1_39+ buffer2_36 * sc3
buffer1_67 = buffer1_67+ buffer2_37 * sc1
buffer1_13 = buffer1_13+ buffer2_38 * sc2
buffer1_40 = buffer1_40+ buffer2_39 * sc3
buffer1_68 = buffer1_68+ buffer2_40 * sc1
buffer1_14 = buffer1_14+ buffer2_41 * sc2
buffer1_41 = buffer1_41+ buffer2_42 * sc3
buffer1_69 = buffer1_69+ buffer2_43 * sc1
buffer1_15 = buffer1_15+ buffer2_44 * sc2
buffer1_42 = buffer1_42+ buffer2_45 * sc3
buffer1_70 = buffer1_70+ buffer2_46 * sc1
buffer1_16 = buffer1_16+ buffer2_47 * sc2
buffer1_43 = buffer1_43+ buffer2_48 * sc3
buffer1_71 = buffer1_71+ buffer2_49 * sc1
buffer1_17 = buffer1_17+ buffer2_50 * sc2
buffer1_44 = buffer1_44+ buffer2_51 * sc3
buffer1_72 = buffer1_72+ buffer2_52 * sc1
buffer1_18 = buffer1_18+ buffer2_53 * sc2
buffer1_45 = buffer1_45+ buffer2_54 * sc3
buffer1_73 = buffer1_73+ buffer2_55 * sc1
buffer1_19 = buffer1_19+ buffer2_56 * sc2
buffer1_46 = buffer1_46+ buffer2_57 * sc3
buffer1_74 = buffer1_74+ buffer2_58 * sc1
buffer1_20 = buffer1_20+ buffer2_59 * sc2
buffer1_47 = buffer1_47+ buffer2_60 * sc3
buffer1_75 = buffer1_75+ buffer2_61 * sc1
buffer1_21 = buffer1_21+ buffer2_62 * sc2
buffer1_48 = buffer1_48+ buffer2_63 * sc3
buffer1_76 = buffer1_76+ buffer2_64 * sc1
buffer1_22 = buffer1_22+ buffer2_65 * sc2
buffer1_49 = buffer1_49+ buffer2_66 * sc3
buffer1_77 = buffer1_77+ buffer2_67 * sc1
buffer1_23 = buffer1_23+ buffer2_68 * sc2
buffer1_50 = buffer1_50+ buffer2_69 * sc3
buffer1_78 = buffer1_78+ buffer2_70 * sc1
buffer1_24 = buffer1_24+ buffer2_71 * sc2
buffer1_51 = buffer1_51+ buffer2_72 * sc3
buffer1_79 = buffer1_79+ buffer2_73 * sc1
buffer1_25 = buffer1_25+ buffer2_74 * sc2
buffer1_52 = buffer1_52+ buffer2_75 * sc3
buffer1_80 = buffer1_80+ buffer2_76 * sc1
buffer1_26 = buffer1_26+ buffer2_77 * sc2
buffer1_53 = buffer1_53+ buffer2_78 * sc3
buffer1_81 = buffer1_81+ buffer2_79 * sc1
buffer1_27 = buffer1_27+ buffer2_80 * sc2
buffer1_54 = buffer1_54+ buffer2_81 * sc3

primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+1, s_offset_d1+1) 
=primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+1, s_offset_d1+1) 
+buffer1_2* sd2
primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+1, s_offset_d1+1) 
=primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+1, s_offset_d1+1) 
+buffer1_5* sd2
primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+1, s_offset_d1+1) 
=primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+1, s_offset_d1+1) 
+buffer1_8* sd2
primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+1, s_offset_d1+2) 
=primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+1, s_offset_d1+2) 
+buffer1_3* sd3
primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+1, s_offset_d1+2) 
=primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+1, s_offset_d1+2) 
+buffer1_6* sd3
primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+1, s_offset_d1+2) 
=primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+1, s_offset_d1+2) 
+buffer1_9* sd3
primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+1, s_offset_d1+3) 
=primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+1, s_offset_d1+3) 
+buffer1_1* sd1
primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+1, s_offset_d1+3) 
=primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+1, s_offset_d1+3) 
+buffer1_4* sd1
primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+1, s_offset_d1+3) 
=primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+1, s_offset_d1+3) 
+buffer1_7* sd1
primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+2, s_offset_d1+1) 
=primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+2, s_offset_d1+1) 
+buffer1_29* sd2
primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+2, s_offset_d1+1) 
=primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+2, s_offset_d1+1) 
+buffer1_32* sd2
primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+2, s_offset_d1+1) 
=primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+2, s_offset_d1+1) 
+buffer1_35* sd2
primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+2, s_offset_d1+2) 
=primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+2, s_offset_d1+2) 
+buffer1_30* sd3
primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+2, s_offset_d1+2) 
=primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+2, s_offset_d1+2) 
+buffer1_33* sd3
primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+2, s_offset_d1+2) 
=primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+2, s_offset_d1+2) 
+buffer1_36* sd3
primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+2, s_offset_d1+3) 
=primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+2, s_offset_d1+3) 
+buffer1_28* sd1
primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+2, s_offset_d1+3) 
=primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+2, s_offset_d1+3) 
+buffer1_31* sd1
primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+2, s_offset_d1+3) 
=primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+2, s_offset_d1+3) 
+buffer1_34* sd1
primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+3, s_offset_d1+1) 
=primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+3, s_offset_d1+1) 
+buffer1_56* sd2
primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+3, s_offset_d1+1) 
=primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+3, s_offset_d1+1) 
+buffer1_59* sd2
primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+3, s_offset_d1+1) 
=primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+3, s_offset_d1+1) 
+buffer1_62* sd2
primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+3, s_offset_d1+2) 
=primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+3, s_offset_d1+2) 
+buffer1_57* sd3
primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+3, s_offset_d1+2) 
=primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+3, s_offset_d1+2) 
+buffer1_60* sd3
primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+3, s_offset_d1+2) 
=primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+3, s_offset_d1+2) 
+buffer1_63* sd3
primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+3, s_offset_d1+3) 
=primitives(s_offset_a1+1, s_offset_b1+1, s_offset_c1+3, s_offset_d1+3) 
+buffer1_55* sd1
primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+3, s_offset_d1+3) 
=primitives(s_offset_a1+2, s_offset_b1+1, s_offset_c1+3, s_offset_d1+3) 
+buffer1_58* sd1
primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+3, s_offset_d1+3) 
=primitives(s_offset_a1+3, s_offset_b1+1, s_offset_c1+3, s_offset_d1+3) 
+buffer1_61* sd1
primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+1, s_offset_d1+1) 
=primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+1, s_offset_d1+1) 
+buffer1_11* sd2
primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+1, s_offset_d1+1) 
=primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+1, s_offset_d1+1) 
+buffer1_14* sd2
primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+1, s_offset_d1+1) 
=primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+1, s_offset_d1+1) 
+buffer1_17* sd2
primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+1, s_offset_d1+2) 
=primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+1, s_offset_d1+2) 
+buffer1_12* sd3
primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+1, s_offset_d1+2) 
=primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+1, s_offset_d1+2) 
+buffer1_15* sd3
primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+1, s_offset_d1+2) 
=primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+1, s_offset_d1+2) 
+buffer1_18* sd3
primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+1, s_offset_d1+3) 
=primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+1, s_offset_d1+3) 
+buffer1_10* sd1
primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+1, s_offset_d1+3) 
=primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+1, s_offset_d1+3) 
+buffer1_13* sd1
primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+1, s_offset_d1+3) 
=primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+1, s_offset_d1+3) 
+buffer1_16* sd1
primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+2, s_offset_d1+1) 
=primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+2, s_offset_d1+1) 
+buffer1_38* sd2
primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+2, s_offset_d1+1) 
=primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+2, s_offset_d1+1) 
+buffer1_41* sd2
primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+2, s_offset_d1+1) 
=primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+2, s_offset_d1+1) 
+buffer1_44* sd2
primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+2, s_offset_d1+2) 
=primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+2, s_offset_d1+2) 
+buffer1_39* sd3
primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+2, s_offset_d1+2) 
=primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+2, s_offset_d1+2) 
+buffer1_42* sd3
primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+2, s_offset_d1+2) 
=primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+2, s_offset_d1+2) 
+buffer1_45* sd3
primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+2, s_offset_d1+3) 
=primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+2, s_offset_d1+3) 
+buffer1_37* sd1
primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+2, s_offset_d1+3) 
=primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+2, s_offset_d1+3) 
+buffer1_40* sd1
primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+2, s_offset_d1+3) 
=primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+2, s_offset_d1+3) 
+buffer1_43* sd1
primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+3, s_offset_d1+1) 
=primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+3, s_offset_d1+1) 
+buffer1_65* sd2
primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+3, s_offset_d1+1) 
=primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+3, s_offset_d1+1) 
+buffer1_68* sd2
primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+3, s_offset_d1+1) 
=primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+3, s_offset_d1+1) 
+buffer1_71* sd2
primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+3, s_offset_d1+2) 
=primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+3, s_offset_d1+2) 
+buffer1_66* sd3
primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+3, s_offset_d1+2) 
=primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+3, s_offset_d1+2) 
+buffer1_69* sd3
primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+3, s_offset_d1+2) 
=primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+3, s_offset_d1+2) 
+buffer1_72* sd3
primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+3, s_offset_d1+3) 
=primitives(s_offset_a1+1, s_offset_b1+2, s_offset_c1+3, s_offset_d1+3) 
+buffer1_64* sd1
primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+3, s_offset_d1+3) 
=primitives(s_offset_a1+2, s_offset_b1+2, s_offset_c1+3, s_offset_d1+3) 
+buffer1_67* sd1
primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+3, s_offset_d1+3) 
=primitives(s_offset_a1+3, s_offset_b1+2, s_offset_c1+3, s_offset_d1+3) 
+buffer1_70* sd1
primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+1, s_offset_d1+1) 
=primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+1, s_offset_d1+1) 
+buffer1_20* sd2
primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+1, s_offset_d1+1) 
=primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+1, s_offset_d1+1) 
+buffer1_23* sd2
primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+1, s_offset_d1+1) 
=primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+1, s_offset_d1+1) 
+buffer1_26* sd2
primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+1, s_offset_d1+2) 
=primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+1, s_offset_d1+2) 
+buffer1_21* sd3
primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+1, s_offset_d1+2) 
=primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+1, s_offset_d1+2) 
+buffer1_24* sd3
primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+1, s_offset_d1+2) 
=primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+1, s_offset_d1+2) 
+buffer1_27* sd3
primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+1, s_offset_d1+3) 
=primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+1, s_offset_d1+3) 
+buffer1_19* sd1
primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+1, s_offset_d1+3) 
=primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+1, s_offset_d1+3) 
+buffer1_22* sd1
primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+1, s_offset_d1+3) 
=primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+1, s_offset_d1+3) 
+buffer1_25* sd1
primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+2, s_offset_d1+1) 
=primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+2, s_offset_d1+1) 
+buffer1_47* sd2
primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+2, s_offset_d1+1) 
=primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+2, s_offset_d1+1) 
+buffer1_50* sd2
primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+2, s_offset_d1+1) 
=primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+2, s_offset_d1+1) 
+buffer1_53* sd2
primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+2, s_offset_d1+2) 
=primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+2, s_offset_d1+2) 
+buffer1_48* sd3
primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+2, s_offset_d1+2) 
=primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+2, s_offset_d1+2) 
+buffer1_51* sd3
primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+2, s_offset_d1+2) 
=primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+2, s_offset_d1+2) 
+buffer1_54* sd3
primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+2, s_offset_d1+3) 
=primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+2, s_offset_d1+3) 
+buffer1_46* sd1
primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+2, s_offset_d1+3) 
=primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+2, s_offset_d1+3) 
+buffer1_49* sd1
primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+2, s_offset_d1+3) 
=primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+2, s_offset_d1+3) 
+buffer1_52* sd1
primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+3, s_offset_d1+1) 
=primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+3, s_offset_d1+1) 
+buffer1_74* sd2
primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+3, s_offset_d1+1) 
=primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+3, s_offset_d1+1) 
+buffer1_77* sd2
primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+3, s_offset_d1+1) 
=primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+3, s_offset_d1+1) 
+buffer1_80* sd2
primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+3, s_offset_d1+2) 
=primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+3, s_offset_d1+2) 
+buffer1_75* sd3
primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+3, s_offset_d1+2) 
=primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+3, s_offset_d1+2) 
+buffer1_78* sd3
primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+3, s_offset_d1+2) 
=primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+3, s_offset_d1+2) 
+buffer1_81* sd3
primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+3, s_offset_d1+3) 
=primitives(s_offset_a1+1, s_offset_b1+3, s_offset_c1+3, s_offset_d1+3) 
+buffer1_73* sd1
primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+3, s_offset_d1+3) 
=primitives(s_offset_a1+2, s_offset_b1+3, s_offset_c1+3, s_offset_d1+3) 
+buffer1_76* sd1
primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+3, s_offset_d1+3) 
=primitives(s_offset_a1+3, s_offset_b1+3, s_offset_c1+3, s_offset_d1+3) 
+buffer1_79* sd1

      s_offset_a1 = s_offset_a1 + 3
    END DO
        s_offset_b1 = s_offset_b1 + 3
      END DO
          s_offset_c1 = s_offset_c1 + 3
        END DO
            s_offset_d1 = s_offset_d1 + 3
          END DO

  END SUBROUTINE contract_pppp_test
END MODULE


    USE TEST
    IMPLICIT NONE
    INTEGER, PARAMETER     :: nl_a=2, nl_b=2, nl_c=2, nl_d=2
    INTEGER, PARAMETER     :: n_a=1, n_b=1, n_c=1, n_d=1
    REAL(dp), DIMENSION(nco(n_a)*nco(n_b)*nco(n_c)*nco(n_d))    :: p_work
    REAL(dp), DIMENSION(nco(n_a),nso(n_a)*nl_a)   :: sphi_a
    REAL(dp), DIMENSION(nco(n_b),nso(n_b)*nl_b)   :: sphi_b
    REAL(dp), DIMENSION(nco(n_c),nso(n_c)*nl_c)   :: sphi_c
    REAL(dp), DIMENSION(nco(n_d),nso(n_d)*nl_d)   :: sphi_d

    REAL(dp), DIMENSION(nso(n_a)*nl_a, 
nso(n_b)*nl_b,nso(n_c)*nl_c,nso(n_d)*nl_d) :: primitives1
    REAL(dp), DIMENSION(nso(n_a)*nl_a, 
nso(n_b)*nl_b,nso(n_c)*nl_c,nso(n_d)*nl_d) :: primitives2
    INTEGER                                  :: s_offset_a, s_offset_b,&
                                                s_offset_c, s_offset_d
    INTEGER :: i,N
    REAL :: t1,t2,told,tnew

    N=100000
    CALL RANDOM_NUMBER(p_work)
    CALL RANDOM_NUMBER(sphi_a)
    CALL RANDOM_NUMBER(sphi_b)
    CALL RANDOM_NUMBER(sphi_c)
    CALL RANDOM_NUMBER(sphi_d)
    s_offset_a=0
    s_offset_b=0
    s_offset_c=0
    s_offset_d=0
    primitives1=0
    primitives2=0

    CALL CPU_TIME(T1)
    DO i=1,N
          CALL contract_pppp_sparse(p_work, &
                            nl_a, nl_b, nl_c, nl_d,&
                            sphi_a, sphi_b, sphi_c, sphi_d,&
                            primitives1,&
                            s_offset_a, s_offset_b, s_offset_c, s_offset_d)
    ENDDO
    CALL CPU_TIME(T2)
    told=T2-T1
    write(6,*) "Sparse: time[s] ",told


    CALL CPU_TIME(T1)
    DO i=1,N
          CALL contract_pppp_test(p_work, &
                            nl_a, nl_b, nl_c, nl_d,&
                            sphi_a, sphi_b, sphi_c, sphi_d,&
                            primitives2,&
                            s_offset_a, s_offset_b, s_offset_c, s_offset_d)
    ENDDO
    CALL CPU_TIME(T2)
    tnew=T2-T1
                                     ! estimated number of flops for pppp
    write(6,*) "New: time[s] ",tnew
    write(6,*) "    speedup ",told/tnew
    write(6,*) "     Glfops ",(81*5*nl_a*nl_b*nl_c*nl_d)/((tnew)/N)/1.0E9

    primitives1=0
    primitives2=0
          CALL contract_pppp_sparse(p_work, &
                            nl_a, nl_b, nl_c, nl_d,&
                            sphi_a, sphi_b, sphi_c, sphi_d,&
                            primitives1,&
                            s_offset_a, s_offset_b, s_offset_c, s_offset_d)
          CALL contract_pppp_test(p_work, &
                            nl_a, nl_b, nl_c, nl_d,&
                            sphi_a, sphi_b, sphi_c, sphi_d,&
                            primitives2,&
                            s_offset_a, s_offset_b, s_offset_c, s_offset_d)
    write(6,*) "Error: ",MAXVAL(ABS(primitives1-primitives2))

    END

Reply via email to