Hello world,
the attached patch implements a blocked algorithm for
improving the speed of cshift for dim > 1.
It uses the fact that
integer, dimension (n1,n2,n3) :: a, b
b = cshift(a,shift,3)
is identical, as far as the memory locations is concerned.
integer, dimension (n1*n2*n3) :: c, d
d = cshift(c, shift*n1*n2, 1)
The speedup is quite large; from being really slow for
dim > 1, this patch makes it go even faster.
Below there are some comparisons for the attached benchmark,
do-1.f90. gfortran-7 uses the old library version.
Interestingly, the library version is also much faster
than an implementation of straight DO loops.
Regression-tested. OK for trunk?
Regards
Thomas
$ gfortran-7 -static-libgfortran -O3 do-1.f90 && ./a.out
Testing explicit DO loops
Dim =1 Elapsed CPU time =5.71363878
Dim =2 Elapsed CPU time =5.40494061
Dim =3 Elapsed CPU time =5.40769291
Testing built-in cshift
Dim =1 Elapsed CPU time =3.43479729
Dim =2 Elapsed CPU time =11.7110386
Dim =3 Elapsed CPU time =31.0966301
$ gfortran -static-libgfortran -O3 do-1.f90 && ./a.out
Testing explicit DO loops
Dim =1 Elapsed CPU time =5.73881340
Dim =2 Elapsed CPU time =5.38435745
Dim =3 Elapsed CPU time =5.38971329
Testing built-in cshift
Dim =1 Elapsed CPU time =3.42018127
Dim =2 Elapsed CPU time =2.24075317
Dim =3 Elapsed CPU time =2.23136330
2017-06-14 Thomas Koenig
PR fortran/52473
* m4/cshift0.m4: For arrays that are contiguous up to
shift, implement blocked algorighm for cshift.
* generated/cshift0_c10.c: Regenerated.
* generated/cshift0_c16.c: Regenerated.
* generated/cshift0_c4.c: Regenerated.
* generated/cshift0_c8.c: Regenerated.
* generated/cshift0_i1.c: Regenerated.
* generated/cshift0_i16.c: Regenerated.
* generated/cshift0_i2.c: Regenerated.
* generated/cshift0_i4.c: Regenerated.
* generated/cshift0_i8.c: Regenerated.
* generated/cshift0_r10.c: Regenerated.
* generated/cshift0_r16.c: Regenerated.
* generated/cshift0_r4.c: Regenerated.
* generated/cshift0_r8.c: Regenerated.
2017-06-14 Thomas Koenig
PR fortran/52473
* gfortran.dg/cshift_1.f90: New test.
Index: m4/cshift0.m4
===
--- m4/cshift0.m4 (Revision 249104)
+++ m4/cshift0.m4 (Arbeitskopie)
@@ -52,6 +52,9 @@
index_type len;
index_type n;
+ bool do_blocked;
+ index_type r_ex, a_ex;
+
which = which - 1;
sstride[0] = 0;
rstride[0] = 0;
@@ -64,33 +67,99 @@
soffset = 1;
len = 0;
- for (dim = 0; dim < GFC_DESCRIPTOR_RANK (array); dim++)
+ r_ex = 1;
+ a_ex = 1;
+
+ if (which > 0)
{
- if (dim == which)
-{
- roffset = GFC_DESCRIPTOR_STRIDE(ret,dim);
- if (roffset == 0)
-roffset = 1;
- soffset = GFC_DESCRIPTOR_STRIDE(array,dim);
- if (soffset == 0)
-soffset = 1;
- len = GFC_DESCRIPTOR_EXTENT(array,dim);
-}
- else
-{
- count[n] = 0;
- extent[n] = GFC_DESCRIPTOR_EXTENT(array,dim);
- rstride[n] = GFC_DESCRIPTOR_STRIDE(ret,dim);
- sstride[n] = GFC_DESCRIPTOR_STRIDE(array,dim);
- n++;
-}
+ /* Test if both ret and array are contiguous. */
+ do_blocked = true;
+ dim = GFC_DESCRIPTOR_RANK (array);
+ for (n = 0; n < dim; n ++)
+ {
+ index_type rs, as;
+ rs = GFC_DESCRIPTOR_STRIDE (ret, n);
+ if (rs != r_ex)
+ {
+ do_blocked = false;
+ break;
+ }
+ as = GFC_DESCRIPTOR_STRIDE (array, n);
+ if (as != a_ex)
+ {
+ do_blocked = false;
+ break;
+ }
+ r_ex *= GFC_DESCRIPTOR_EXTENT (ret, n);
+ a_ex *= GFC_DESCRIPTOR_EXTENT (array, n);
+ }
}
- if (sstride[0] == 0)
-sstride[0] = 1;
- if (rstride[0] == 0)
-rstride[0] = 1;
+ else
+do_blocked = false;
- dim = GFC_DESCRIPTOR_RANK (array);
+ n = 0;
+
+ if (do_blocked)
+{
+ /* For contiguous arrays, use the relationship that
+
+ dimension(n1,n2,n3) :: a, b
+ b = cshift(a,sh,3)
+
+ can be dealt with as if
+
+ dimension(n1*n2*n3) :: an, bn
+ bn = cshift(a,sh*n1*n2,1)
+
+ we can used a more blocked algorithm for dim>1. */
+ sstride[0] = 1;
+ rstride[0] = 1;
+ roffset = 1;
+ soffset = 1;
+ len = GFC_DESCRIPTOR_STRIDE(array, which)
+ * GFC_DESCRIPTOR_EXTENT(array, which);
+ shift *= GFC_DESCRIPTOR_STRIDE(array, which);
+ for (dim = which + 1; dim < GFC_DESCRIPTOR_RANK (array); dim++)
+ {
+ count[n] = 0;
+ extent[n] = GFC_DESCRIPTOR_EXTENT(array,dim);
+ rstride[n] =