Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Dirk Brandewie

On 05/30/2014 01:07 PM, Tim Chen wrote:

On Fri, 2014-05-30 at 12:38 -0700, Dirk Brandewie wrote:


Dirk,

Thanks for checking things out.

I tested on a Haswell system, and I see that the frequency
can dip below the max even when I set the min_perf_pct to 100.
Let me know if you want to log on to my system and check if
there's something I missed. It is odd that the package 1's
cores are at a much higher frequency and close to
max than package 0, once min_perf_pct is set to 100.



Can you run turbostat for a few samples? It reports an average over the sample
time.



Here it is.



You have me at a loss here. I can come in on Monday if you are around and
we can try to figure out what is happening.

--Dirk

Tim

Package Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  
CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt 
RAMWatt   PKG_%   RAM_%
-   -   -   00.0220482594   00.23
0.00   99.750.00  33  425.930.00   91.520.00   23.22
4.150.120.00
0   0   0   10.0619972594   00.16
0.00   99.780.00  32  427.920.00   91.550.00   16.88
1.950.060.00
0   0  28   00.0113382594   00.21
0   1   1   00.0216962594   00.11
0.00   99.870.00  33
0   1  29   00.0114552594   00.11
0   2   2   00.0116182594   00.07
0.00   99.910.00  30
0   2  30   00.0115132594   00.07
0   3   3   00.0117242594   00.08
0.00   99.910.00  31
0   3  31   00.0114472594   00.08
0   4   4   00.0117692594   00.06
0.00   99.920.00  32
0   4  32   00.0114832594   00.06
0   5   5   00.0116702594   00.07
0.00   99.920.00  29
0   5  33   00.0115152594   00.07
0   6   6   00.0116002594   00.07
0.00   99.920.00  33
0   6  34   00.0114122594   00.07
0   8   7   00.0115882594   00.07
0.00   99.920.00  30
0   8  35   00.0114322594   00.07
0   9   8   00.0116622594   00.11
0.00   99.880.00  32
0   9  36   00.0216582594   00.10
0  10   9   00.0115702594   00.07
0.00   99.910.00  32
0  10  37   00.0114682594   00.07
0  11  10   00.0116802594   00.07
0.00   99.920.00  31
0  11  38   00.0115112594   00.07
0  12  11   00.0116902594   00.08
0.00   99.910.00  30
0  12  39   00.0115602594   00.08
0  13  12   00.0216042594   00.11
0.00   99.870.00  29
0  13  40   00.0214362594   00.11
0  14  13   00.0216202594   00.09
0.00   99.890.00  29
0  14  41   00.0214402594   00.09
1   0  14   00.0316662594   00.16
0.00   99.820.00  28  363.940.00   91.500.006.34
2.200.060.00
1   0  42   30.0832632594   00.11
1   1  15   00.0121942594   00.09
0.00   99.900.00  30
1   1  43   00.0123582594   00.09
1   2  16   00.0126502594   00.08
0.00   99.910.00  28
1   2  44   00.0120322594   00.08
1   3  17   10.0323052594   04.11
0.00   95.860.00  30
1   3  45   00.0122902594   04.13
1   4  18   00.0123622594   00.09
0.00   99.900.00  28
1   4  46   00.0123252594   00.09
1   5  19   00.0123742594   00.07
0.00   99.920.00  30
1   5  47   00.0124422594   00.07
1   6  20   00.0124762594   00.08
0.00   99.910.00  30
1   6  48   00.012

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Tim Chen
On Fri, 2014-05-30 at 12:38 -0700, Dirk Brandewie wrote:

> > Dirk,
> >
> > Thanks for checking things out.
> >
> > I tested on a Haswell system, and I see that the frequency
> > can dip below the max even when I set the min_perf_pct to 100.
> > Let me know if you want to log on to my system and check if
> > there's something I missed. It is odd that the package 1's
> > cores are at a much higher frequency and close to
> > max than package 0, once min_perf_pct is set to 100.
> >
> 
> Can you run turbostat for a few samples? It reports an average over the sample
> time.
> 

Here it is.

Tim

Package Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  
CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt 
RAMWatt   PKG_%   RAM_% 
   -   -   -   00.0220482594   00.23
0.00   99.750.00  33  425.930.00   91.520.00   23.22
4.150.120.00
   0   0   0   10.0619972594   00.16
0.00   99.780.00  32  427.920.00   91.550.00   16.88
1.950.060.00
   0   0  28   00.0113382594   00.21
   0   1   1   00.0216962594   00.11
0.00   99.870.00  33
   0   1  29   00.0114552594   00.11
   0   2   2   00.0116182594   00.07
0.00   99.910.00  30
   0   2  30   00.0115132594   00.07
   0   3   3   00.0117242594   00.08
0.00   99.910.00  31
   0   3  31   00.0114472594   00.08
   0   4   4   00.0117692594   00.06
0.00   99.920.00  32
   0   4  32   00.0114832594   00.06
   0   5   5   00.0116702594   00.07
0.00   99.920.00  29
   0   5  33   00.0115152594   00.07
   0   6   6   00.0116002594   00.07
0.00   99.920.00  33
   0   6  34   00.0114122594   00.07
   0   8   7   00.0115882594   00.07
0.00   99.920.00  30
   0   8  35   00.0114322594   00.07
   0   9   8   00.0116622594   00.11
0.00   99.880.00  32
   0   9  36   00.0216582594   00.10
   0  10   9   00.0115702594   00.07
0.00   99.910.00  32
   0  10  37   00.0114682594   00.07
   0  11  10   00.0116802594   00.07
0.00   99.920.00  31
   0  11  38   00.0115112594   00.07
   0  12  11   00.0116902594   00.08
0.00   99.910.00  30
   0  12  39   00.0115602594   00.08
   0  13  12   00.0216042594   00.11
0.00   99.870.00  29
   0  13  40   00.0214362594   00.11
   0  14  13   00.0216202594   00.09
0.00   99.890.00  29
   0  14  41   00.0214402594   00.09
   1   0  14   00.0316662594   00.16
0.00   99.820.00  28  363.940.00   91.500.006.34
2.200.060.00
   1   0  42   30.0832632594   00.11
   1   1  15   00.0121942594   00.09
0.00   99.900.00  30
   1   1  43   00.0123582594   00.09
   1   2  16   00.0126502594   00.08
0.00   99.910.00  28
   1   2  44   00.0120322594   00.08
   1   3  17   10.0323052594   04.11
0.00   95.860.00  30
   1   3  45   00.0122902594   04.13
   1   4  18   00.0123622594   00.09
0.00   99.900.00  28
   1   4  46   00.0123252594   00.09
   1   5  19   00.0123742594   00.07
0.00   99.920.00  30
   1   5  47   00.0124422594   00.07
   1   6  20   00.0124762594   00.08
0.00   99.910.00  30
   1   6  48   00.0123822594   00.07
   1   8  21   00.0126692594   00.09
0.00   99.900.00  29
   1   8  49   0

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Dirk Brandewie

On 05/30/2014 12:32 PM, Tim Chen wrote:

On Fri, 2014-05-30 at 11:45 -0700, Dirk Brandewie wrote:



With turbostat from rc7.
[root@echolake turbostat]# ./turbostat
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.0811783492   00.120.08
0.01   99.71  29  29   99.230.000.000.002.180.00
0.00
0   0   20.1911893492   00.220.30
0.00   99.29  29  29   99.240.000.000.002.180.00
0.00
0   4   10.1212533492   00.29
1   1   00.0310653492   00.030.00
0.00   99.93  23
1   5   00.0111043492   00.05
2   2   00.0212753492   00.220.00
0.03   99.73  24
2   6   20.1812203492   00.06
3   3   00.01 9923492   00.070.00
0.01   99.90  23
3   7   00.05 9153492   00.04
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.0610343492   00.095.15
0.00   94.70  28  28   99.490.000.000.002.480.01
0.00
0   0   10.0910663492   00.170.01
0.00   99.73  28  28   99.490.000.000.002.480.01
0.00
0   4   10.1210363492   00.14
1   1   00.0410093492   00.05   20.59
0.00   79.32  24
1   5   00.02 9223492   00.07
2   2   00.03 9243492   00.150.00
0.00   99.82  25
2   6   10.1211173492   00.06
3   3   00.01 9113492   00.040.01
0.00   99.94  22
3   7   00.03 8563492   00.02
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.08 8893492   00.120.03
0.06   99.71  29  29   99.320.000.000.002.210.00
0.00
0   0   10.11 8673492   00.200.02
0.22   99.44  29  29   99.320.000.000.002.210.00
0.00
0   4   10.14 9073492   00.17
1   1   10.12 8093492   00.040.11
0.01   99.73  24
1   5   00.01 7983492   00.14
2   2   00.03 8633492   00.180.00
0.01   99.78  24
2   6   10.1410133492   00.07
3   3   00.02 8533492   00.090.00
0.00   99.89  23
3   7   10.06 8153492   00.05
^C
[root@echolake turbostat]# echo 100 > 
/sys/devices/system/cpu/intel_pstate/min_perf_pct
[root@echolake turbostat]# ./turbostat
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.0334893492   02.430.01
0.00   97.53  30  30   90.200.000.000.002.850.06
0.00
0   0   10.0434703492   00.090.00
0.00   99.88  30  30   90.200.000.000.002.850.06
0.00
0   4   20.0634923492   00.07
1   1   10.0234953492   00.050.03
0.00   99.90  25
1   5   00.0034943492   00.07
2   2   00.0134923492   09.530.00
0.01   90.45  25
2   6   10.0434923492   09.50
3   3   10.0334923492   00.050.01
0.00   99.91  23
3   7   10.0234933492   00.06
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.0234923492   04.930.00
0.00   95.04  30  30   80.190.000.000.003.540.10
0.00
0   0   10.0234913492   00.080.01
0.00   99.89  30  30   80.190.000.000.003.540.10
0.0

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Tim Chen
On Fri, 2014-05-30 at 11:45 -0700, Dirk Brandewie wrote:

> 
> With turbostat from rc7.
> [root@echolake turbostat]# ./turbostat 
> Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
> CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt 
> CorWatt GFXWatt 
>-   -   10.0811783492   00.120.08
> 0.01   99.71  29  29   99.230.000.000.002.180.00  
>   0.00
>0   0   20.1911893492   00.220.30
> 0.00   99.29  29  29   99.240.000.000.002.180.00  
>   0.00
>0   4   10.1212533492   00.29
>1   1   00.0310653492   00.030.00
> 0.00   99.93  23
>1   5   00.0111043492   00.05
>2   2   00.0212753492   00.220.00
> 0.03   99.73  24
>2   6   20.1812203492   00.06
>3   3   00.01 9923492   00.070.00
> 0.01   99.90  23
>3   7   00.05 9153492   00.04
> Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
> CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt 
> CorWatt GFXWatt 
>-   -   10.0610343492   00.095.15
> 0.00   94.70  28  28   99.490.000.000.002.480.01  
>   0.00
>0   0   10.0910663492   00.170.01
> 0.00   99.73  28  28   99.490.000.000.002.480.01  
>   0.00
>0   4   10.1210363492   00.14
>1   1   00.0410093492   00.05   20.59
> 0.00   79.32  24
>1   5   00.02 9223492   00.07
>2   2   00.03 9243492   00.150.00
> 0.00   99.82  25
>2   6   10.1211173492   00.06
>3   3   00.01 9113492   00.040.01
> 0.00   99.94  22
>3   7   00.03 8563492   00.02
> Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
> CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt 
> CorWatt GFXWatt 
>-   -   10.08 8893492   00.120.03
> 0.06   99.71  29  29   99.320.000.000.002.210.00  
>   0.00
>0   0   10.11 8673492   00.200.02
> 0.22   99.44  29  29   99.320.000.000.002.210.00  
>   0.00
>0   4   10.14 9073492   00.17
>1   1   10.12 8093492   00.040.11
> 0.01   99.73  24
>1   5   00.01 7983492   00.14
>2   2   00.03 8633492   00.180.00
> 0.01   99.78  24
>2   6   10.1410133492   00.07
>3   3   00.02 8533492   00.090.00
> 0.00   99.89  23
>3   7   10.06 8153492   00.05
> ^C
> [root@echolake turbostat]# echo 100 > 
> /sys/devices/system/cpu/intel_pstate/min_perf_pct 
> [root@echolake turbostat]# ./turbostat 
> Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
> CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt 
> CorWatt GFXWatt 
>-   -   10.0334893492   02.430.01
> 0.00   97.53  30  30   90.200.000.000.002.850.06  
>   0.00
>0   0   10.0434703492   00.090.00
> 0.00   99.88  30  30   90.200.000.000.002.850.06  
>   0.00
>0   4   20.0634923492   00.07
>1   1   10.0234953492   00.050.03
> 0.00   99.90  25
>1   5   00.0034943492   00.07
>2   2   00.0134923492   09.530.00
> 0.01   90.45  25
>2   6   10.0434923492   09.50
>3   3   10.0334923492   00.050.01
> 0.00   99.91  23
>3   7   10.0234933492   00.06
> Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
> CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt 
> CorWatt GFXWatt 
>-   -   10.0234923492   04.930.00
> 0.00   95.04  30  30   80.190.000.000.003.540.10  
>   0.00
>0   0   10.0234913492   0   

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Dirk Brandewie
On 05/30/2014 10:56 AM, Tim Chen wrote:
> On Thu, 2014-05-29 at 21:16 -0400, Dave Jones wrote:
>> On Thu, May 29, 2014 at 06:07:16PM -0700, Tim Chen wrote:
>>   > On Thu, 2014-05-29 at 19:54 -0400, George Spelvin wrote:
>>   > > Sorry for the delay; my Ivy Bridge test machine isn't in my
>>   > > office and getting to the console to tweak the BIOS is a
>>   > > bit of a bother.
>>   > >
>>   > > Anyway, i7-4930K, turbo boost & hyperthreading disabled,
>>   > > $ cat /sys/devices/system/cpu/cpu?/cpufreq/scaling_governor
>>   > > performance
>>   > > performance
>>   > > performance
>>   > > performance
>>   > > performance
>>   > > performance
>>   > >
>>   > > Oddly, though, CPU speed still seems to be fluctuating:
>>   > > $ grep MHz /proc/cpuinfo
>>   > > cpu MHz : 1255.875
>>   > > cpu MHz : 3168.375
>>   > > cpu MHz : 3062.125
>>   > > cpu MHz : 1468.375
>>   > > cpu MHz : 1309.000
>>   > > cpu MHz : 2212.125
>>   > > $ grep MHz /proc/cpuinfo
>>   > > cpu MHz : 1255.875
>>   > > cpu MHz : 2690.250
>>   > > cpu MHz : 1255.875
>>   > > cpu MHz : 2530.875
>>   > > cpu MHz : 2212.125
>>   > > cpu MHz : 1521.500
>>   >
>>   > This is odd.  On my Ivy Bridge system the CPU speed from /proc/cpuinfo
>>   > is at max freq once I set the performance governor.
>>   > The numbers above almost look like
>>   > the cpu frequency is fluctuating and an average is taken.
>>   > What version of the kernel are you running?  Is
>>   > CONFIG_CPU_FREQ_GOV_PERFORMANCE compiled in?
>>   >
>>   > Does /sys/devices/system/cpu/cpu?/cpufreq/scaling_cur_freq
>>   > also changes?
>>   >
>>   > Can you check what are the available governors in your system
>>   > and available frequencies?
>>   >
>>   > cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors
>>   > cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
>>   >
>>   > If userspace governor is available, you can try set the governor
>>   > to userspace, then pin frequency to 3400 MHz (assuming that's your
>>   > max) with command like:
>>   
>> intel_pstate overrides any governor choice you make through sysfs.
>>
>>  Dave
>>
> 
> Dirk,
> 
> Wonder if this is the right behavior for intel_pstate that when I set the
> governor to performance, intel_pstate driver still adjusts
> the cpu frequencies around?

No, the value returned is a measured/delivered frequency instead of the P state
requested which is what the other governors return.

> 
> Turbostat also confirms that the frequencies are not at max,
> even though the max_perf_pct and min_perf_pct are both set at 100.
> 

I calculate frequency the same way turbostat does but my samples are a *lot* 
shorter.
 

> I ran on my HSW system with 3.15-rc7 kernel and see similar
> issue that George reported.
> 
> It is really a pain when we need to do performance benchmarking and
> need to have a constant cpu frequency.
> 

With turbostat from rc7.
[root@echolake turbostat]# ./turbostat 
Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt 
   -   -   10.0811783492   00.120.08
0.01   99.71  29  29   99.230.000.000.002.180.00
0.00
   0   0   20.1911893492   00.220.30
0.00   99.29  29  29   99.240.000.000.002.180.00
0.00
   0   4   10.1212533492   00.29
   1   1   00.0310653492   00.030.00
0.00   99.93  23
   1   5   00.0111043492   00.05
   2   2   00.0212753492   00.220.00
0.03   99.73  24
   2   6   20.1812203492   00.06
   3   3   00.01 9923492   00.070.00
0.01   99.90  23
   3   7   00.05 9153492   00.04
Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt 
   -   -   10.0610343492   00.095.15
0.00   94.70  28  28   99.490.000.000.002.480.01
0.00
   0   0   10.0910663492   00.170.01
0.00   99.73  28  28   99.490.000.000.002.480.01
0.00
   0   4   10.1210363492   00.14
   1   1   00.0410093492   00.05   20.59
0.00   79.32  24
   1   5   00.02 9223492   00.07
   2   2   00.03 9243492   00.150.00
0.00   99.82  25
   2   6   10.1211173492   00.06
   3   3   00.

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Tim Chen
On Thu, 2014-05-29 at 21:16 -0400, Dave Jones wrote:
> On Thu, May 29, 2014 at 06:07:16PM -0700, Tim Chen wrote:
>  > On Thu, 2014-05-29 at 19:54 -0400, George Spelvin wrote:
>  > > Sorry for the delay; my Ivy Bridge test machine isn't in my
>  > > office and getting to the console to tweak the BIOS is a
>  > > bit of a bother.
>  > > 
>  > > Anyway, i7-4930K, turbo boost & hyperthreading disabled,
>  > > $ cat /sys/devices/system/cpu/cpu?/cpufreq/scaling_governor
>  > > performance
>  > > performance
>  > > performance
>  > > performance
>  > > performance
>  > > performance
>  > > 
>  > > Oddly, though, CPU speed still seems to be fluctuating:
>  > > $ grep MHz /proc/cpuinfo
>  > > cpu MHz : 1255.875
>  > > cpu MHz : 3168.375
>  > > cpu MHz : 3062.125
>  > > cpu MHz : 1468.375
>  > > cpu MHz : 1309.000
>  > > cpu MHz : 2212.125
>  > > $ grep MHz /proc/cpuinfo
>  > > cpu MHz : 1255.875
>  > > cpu MHz : 2690.250
>  > > cpu MHz : 1255.875
>  > > cpu MHz : 2530.875
>  > > cpu MHz : 2212.125
>  > > cpu MHz : 1521.500
>  > 
>  > This is odd.  On my Ivy Bridge system the CPU speed from /proc/cpuinfo 
>  > is at max freq once I set the performance governor.  
>  > The numbers above almost look like
>  > the cpu frequency is fluctuating and an average is taken.
>  > What version of the kernel are you running?  Is 
>  > CONFIG_CPU_FREQ_GOV_PERFORMANCE compiled in?
>  > 
>  > Does /sys/devices/system/cpu/cpu?/cpufreq/scaling_cur_freq
>  > also changes?
>  > 
>  > Can you check what are the available governors in your system
>  > and available frequencies?
>  > 
>  > cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors
>  > cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
>  > 
>  > If userspace governor is available, you can try set the governor
>  > to userspace, then pin frequency to 3400 MHz (assuming that's your
>  > max) with command like:
>  
> intel_pstate overrides any governor choice you make through sysfs.
> 
>   Dave
> 

Dirk,

Wonder if this is the right behavior for intel_pstate that when I set the
governor to performance, intel_pstate driver still adjusts 
the cpu frequencies around?

Turbostat also confirms that the frequencies are not at max,
even though the max_perf_pct and min_perf_pct are both set at 100.  

I ran on my HSW system with 3.15-rc7 kernel and see similar
issue that George reported.

It is really a pain when we need to do performance benchmarking and 
need to have a constant cpu frequency.  

Thanks.

Tim

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Tim Chen
On Fri, 2014-05-30 at 12:52 -0400, George Spelvin wrote:
> > That's very small (less than 0.2%) so I think it's acceptable.
> 
> Thank you!  May I take this as an Acked-by: ?

Yes, with the caveat that you still have a v3 of this patch
that reorganizes the K table to rodata.

Tim
> 
> I'll work on some performance improvements, but they probably
> won't be ready for the 3.16 merge window.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread George Spelvin
> That's very small (less than 0.2%) so I think it's acceptable.

Thank you!  May I take this as an Acked-by: ?

I'll work on some performance improvements, but they probably
won't be ready for the 3.16 merge window.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Tim Chen
On Fri, 2014-05-30 at 01:25 -0400, George Spelvin wrote:

> 
> Averaging the 8K bytes per update, I do see an average of 3.2 cycles per
> operation (that is, per 8K of data processed) lost, or about 1 cycle per
> (3K or less) block processed.  I'm hoping the reduced D-cache pollution
> makes it up somewhere else.

That's very small (less than 0.2%) so I think it's acceptable.

Tim


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-29 Thread George Spelvin
Okay, recompiled with the acpi-cpufreq driver, so the performance governor
actually works, pegging the frequency at 3900 MHz.

Existing (old) code:
[  455.641397] 
[  455.641397] testing speed of crc32c
[  455.641403] test  0 (   16 byte blocks,   16 bytes per update,   1 updates): 
73 cycles/operation,4 cycles/byte
[  455.641406] test  1 (   64 byte blocks,   16 bytes per update,   4 updates): 
   418 cycles/operation,6 cycles/byte
[  455.641409] test  2 (   64 byte blocks,   64 bytes per update,   1 updates): 
89 cycles/operation,1 cycles/byte
[  455.641411] test  3 (  256 byte blocks,   16 bytes per update,  16 updates): 
  1330 cycles/operation,5 cycles/byte
[  455.641417] test  4 (  256 byte blocks,   64 bytes per update,   4 updates): 
   502 cycles/operation,1 cycles/byte
[  455.641420] test  5 (  256 byte blocks,  256 bytes per update,   1 updates): 
   170 cycles/operation,0 cycles/byte
[  455.641422] test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates): 
  4971 cycles/operation,4 cycles/byte
[  455.641440] test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates): 
   805 cycles/operation,0 cycles/byte
[  455.641445] test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates): 
   371 cycles/operation,0 cycles/byte
[  455.641448] test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates): 
  9839 cycles/operation,4 cycles/byte
[  455.641484] test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates): 
  1436 cycles/operation,0 cycles/byte
[  455.641490] test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates): 
   824 cycles/operation,0 cycles/byte
[  455.641494] test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates): 
   494 cycles/operation,0 cycles/byte
[  455.641498] test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates): 
 19561 cycles/operation,4 cycles/byte
[  455.641568] test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates): 
  2757 cycles/operation,0 cycles/byte
[  455.641579] test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates): 
  1633 cycles/operation,0 cycles/byte
[  455.641586] test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates): 
   861 cycles/operation,0 cycles/byte
[  455.641590] test 17 ( 8192 byte blocks,   16 bytes per update, 512 updates): 
 39015 cycles/operation,4 cycles/byte
[  455.641729] test 18 ( 8192 byte blocks,  256 bytes per update,  32 updates): 
  5412 cycles/operation,0 cycles/byte
[  455.641749] test 19 ( 8192 byte blocks, 1024 bytes per update,   8 updates): 
  3106 cycles/operation,0 cycles/byte
[  455.641762] test 20 ( 8192 byte blocks, 4096 bytes per update,   2 updates): 
  1656 cycles/operation,0 cycles/byte
[  455.641769] test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates): 
  1639 cycles/operation,0 cycles/byte
[  480.885336] 
[  480.885336] testing speed of crc32c
[  480.885342] test  0 (   16 byte blocks,   16 bytes per update,   1 updates): 
81 cycles/operation,5 cycles/byte
[  480.885345] test  1 (   64 byte blocks,   16 bytes per update,   4 updates): 
   426 cycles/operation,6 cycles/byte
[  480.885348] test  2 (   64 byte blocks,   64 bytes per update,   1 updates): 
96 cycles/operation,1 cycles/byte
[  480.885350] test  3 (  256 byte blocks,   16 bytes per update,  16 updates): 
  1331 cycles/operation,5 cycles/byte
[  480.885356] test  4 (  256 byte blocks,   64 bytes per update,   4 updates): 
   497 cycles/operation,1 cycles/byte
[  480.885359] test  5 (  256 byte blocks,  256 bytes per update,   1 updates): 
   179 cycles/operation,0 cycles/byte
[  480.885361] test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates): 
  4961 cycles/operation,4 cycles/byte
[  480.885380] test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates): 
   795 cycles/operation,0 cycles/byte
[  480.885384] test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates): 
   366 cycles/operation,0 cycles/byte
[  480.885387] test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates): 
  9827 cycles/operation,4 cycles/byte
[  480.885423] test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates): 
  1445 cycles/operation,0 cycles/byte
[  480.885430] test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates): 
   834 cycles/operation,0 cycles/byte
[  480.885434] test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates): 
   495 cycles/operation,0 cycles/byte
[  480.885437] test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates): 
 19560 cycles/operation,4 cycles/byte
[  480.885507] test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates): 
  2767 cycles/operation,0 cycles/byte
[  480.885518] test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates): 
  1643 cycles/operation,0 cycles/byte
[  480.885525] test 16 ( 4096 byte blocks, 4096 bytes per update,  

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-29 Thread George Spelvin
> This is odd.  On my Ivy Bridge system the CPU speed from /proc/cpuinfo 
> is at max freq once I set the performance governor.  
> The numbers above almost look like
> the cpu frequency is fluctuating and an average is taken.
> What version of the kernel are you running?  Is 
> CONFIG_CPU_FREQ_GOV_PERFORMANCE compiled in?

Yes; I have

CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_GOV_COMMON=y
CONFIG_CPU_FREQ_STAT=y
# CONFIG_CPU_FREQ_STAT_DETAILS is not set
# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE=y
CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
CONFIG_CPU_FREQ_GOV_POWERSAVE=y
# CONFIG_CPU_FREQ_GOV_USERSPACE is not set
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y

However, scaling_available_governors only lists "performance powersave"

> Does /sys/devices/system/cpu/cpu?/cpufreq/scaling_cur_freq
> also changes?

That file does not exist.  However,
/sys/devices/system/cpu/cpu?/cpufreq/cpuinfo_cur_freq
exists and changes.  Several snapshots:

Snap1   Snap2   Snap3   Snap4
cpu01255875 1255875 1255875 1255875
cpu11202750 1202750 1202750 1415250
cpu21680875 1255875 1468375 1468375
cpu31202750 1255875 1521500 1521500
cpu41946500 1255875 1255875 1255875
cpu52690250 2371500 1946500 1734000

> Can you check what are the available governors in your system
> and available frequencies?

> cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors
performance powersave
> cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
cat: /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies: No 
such file or directory
$ ls /sys/devices/system/cpu/cpu0/cpufreq/
affected_cpus cpuinfo_transition_latency   scaling_governor
cpuinfo_cur_freq  related_cpus scaling_max_freq
cpuinfo_max_freq  scaling_available_governors  scaling_min_freq
cpuinfo_min_freq  scaling_driver   scaling_setspeed
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed


> If userspace governor is available, you can try set the governor
> to userspace, then pin frequency to 3400 MHz (assuming that's your
> max) with command like:

I'll have to recompile and reboot, but sure.

Do you want me to change from the intel_pstate driver while I'm at it?

> BTW, why do you place the K table in .text, instead of .rodata? 

Because the jump table before it was in .text, and if I try to move
*that* to .rodata I get a linker error.  So I just put the K_table
right next to it.

However, it's all moot: my current v3 does move K_table to .rodata.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-29 Thread Dave Jones
On Thu, May 29, 2014 at 06:07:16PM -0700, Tim Chen wrote:
 > On Thu, 2014-05-29 at 19:54 -0400, George Spelvin wrote:
 > > Sorry for the delay; my Ivy Bridge test machine isn't in my
 > > office and getting to the console to tweak the BIOS is a
 > > bit of a bother.
 > > 
 > > Anyway, i7-4930K, turbo boost & hyperthreading disabled,
 > > $ cat /sys/devices/system/cpu/cpu?/cpufreq/scaling_governor
 > > performance
 > > performance
 > > performance
 > > performance
 > > performance
 > > performance
 > > 
 > > Oddly, though, CPU speed still seems to be fluctuating:
 > > $ grep MHz /proc/cpuinfo
 > > cpu MHz : 1255.875
 > > cpu MHz : 3168.375
 > > cpu MHz : 3062.125
 > > cpu MHz : 1468.375
 > > cpu MHz : 1309.000
 > > cpu MHz : 2212.125
 > > $ grep MHz /proc/cpuinfo
 > > cpu MHz : 1255.875
 > > cpu MHz : 2690.250
 > > cpu MHz : 1255.875
 > > cpu MHz : 2530.875
 > > cpu MHz : 2212.125
 > > cpu MHz : 1521.500
 > 
 > This is odd.  On my Ivy Bridge system the CPU speed from /proc/cpuinfo 
 > is at max freq once I set the performance governor.  
 > The numbers above almost look like
 > the cpu frequency is fluctuating and an average is taken.
 > What version of the kernel are you running?  Is 
 > CONFIG_CPU_FREQ_GOV_PERFORMANCE compiled in?
 > 
 > Does /sys/devices/system/cpu/cpu?/cpufreq/scaling_cur_freq
 > also changes?
 > 
 > Can you check what are the available governors in your system
 > and available frequencies?
 > 
 > cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors
 > cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
 > 
 > If userspace governor is available, you can try set the governor
 > to userspace, then pin frequency to 3400 MHz (assuming that's your
 > max) with command like:
 
intel_pstate overrides any governor choice you make through sysfs.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-29 Thread Tim Chen
On Thu, 2014-05-29 at 19:54 -0400, George Spelvin wrote:
> Sorry for the delay; my Ivy Bridge test machine isn't in my
> office and getting to the console to tweak the BIOS is a
> bit of a bother.
> 
> Anyway, i7-4930K, turbo boost & hyperthreading disabled,
> $ cat /sys/devices/system/cpu/cpu?/cpufreq/scaling_governor
> performance
> performance
> performance
> performance
> performance
> performance
> 
> Oddly, though, CPU speed still seems to be fluctuating:
> $ grep MHz /proc/cpuinfo
> cpu MHz : 1255.875
> cpu MHz : 3168.375
> cpu MHz : 3062.125
> cpu MHz : 1468.375
> cpu MHz : 1309.000
> cpu MHz : 2212.125
> $ grep MHz /proc/cpuinfo
> cpu MHz : 1255.875
> cpu MHz : 2690.250
> cpu MHz : 1255.875
> cpu MHz : 2530.875
> cpu MHz : 2212.125
> cpu MHz : 1521.500

This is odd.  On my Ivy Bridge system the CPU speed from /proc/cpuinfo 
is at max freq once I set the performance governor.  
The numbers above almost look like
the cpu frequency is fluctuating and an average is taken.
What version of the kernel are you running?  Is 
CONFIG_CPU_FREQ_GOV_PERFORMANCE compiled in?

Does /sys/devices/system/cpu/cpu?/cpufreq/scaling_cur_freq
also change?

Can you check what are the available governors in your system
and available frequencies?

cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies

If the userspace governor is available, you can try setting the governor
to userspace, then pin the frequency to 3400 MHz (assuming that's your
max) with a command like:

i=0
num_cpus=`cat /proc/cpuinfo| grep "^processor"| wc -l `
while [ $i -lt $num_cpus ]
do
  echo userspace > /sys/devices/system/cpu/cpu$i/cpufreq/scaling_governor
  echo 3400000 > /sys/devices/system/cpu/cpu$i/cpufreq/scaling_setspeed
  i=`expr $i + 1`
done


> 
> It does this even if I set scaling_min_freq to 3400000.
> Very annoying.  Should I be using a different
> scaling_governor than intel_pstate?
> 
> >> It doesn't look like a slowdown; more like a 1% speedup.
> >
> > You will need to throw away the first few iterations of
> > the test to account for cache warming effects.
> 
> You're absolutely right; that's exactly *why* I ran it 24 times and
> listed them all separately.  The "1%" number was B.S. and I was not
> thinking when I quoted it.
> 
> What I had legitimately noticed was that the code with the patch took
> slightly fewer cycles most of the time, even after discounting the
> first few.  Not statistically significant, but enough to argue that it
> didn't cause a noticeable slowdown.
> 
> 
> Anyway, two iterations each of "modprobe tcrypt mode=319".
> 
> Old code:
> [ 1530.513529] 
> [ 1530.513529] testing speed of crc32c
> [ 1530.513535] test  0 (   16 byte blocks,   16 bytes per update,   1 
> updates): 75 cycles/operation,4 cycles/byte
> [ 1530.513537] test  1 (   64 byte blocks,   16 bytes per update,   4 
> updates):413 cycles/operation,6 cycles/byte
> [ 1530.513540] test  2 (   64 byte blocks,   64 bytes per update,   1 
> updates): 88 cycles/operation,1 cycles/byte
> [ 1530.513542] test  3 (  256 byte blocks,   16 bytes per update,  16 
> updates):   1327 cycles/operation,5 cycles/byte
> [ 1530.513548] test  4 (  256 byte blocks,   64 bytes per update,   4 
> updates):503 cycles/operation,1 cycles/byte
> [ 1530.513551] test  5 (  256 byte blocks,  256 bytes per update,   1 
> updates):178 cycles/operation,0 cycles/byte
> [ 1530.513553] test  6 ( 1024 byte blocks,   16 bytes per update,  64 
> updates):   4972 cycles/operation,4 cycles/byte
> [ 1530.513572] test  7 ( 1024 byte blocks,  256 bytes per update,   4 
> updates):806 cycles/operation,0 cycles/byte
> [ 1530.513576] test  8 ( 1024 byte blocks, 1024 bytes per update,   1 
> updates):370 cycles/operation,0 cycles/byte
> [ 1530.513579] test  9 ( 2048 byte blocks,   16 bytes per update, 128 
> updates):   9835 cycles/operation,4 cycles/byte
> [ 1530.513615] test 10 ( 2048 byte blocks,  256 bytes per update,   8 
> updates):   1461 cycles/operation,0 cycles/byte
> [ 1530.513622] test 11 ( 2048 byte blocks, 1024 bytes per update,   2 
> updates):847 cycles/operation,0 cycles/byte
> [ 1530.513626] test 12 ( 2048 byte blocks, 2048 bytes per update,   1 
> updates):495 cycles/operation,0 cycles/byte
> [ 1530.513630] test 13 ( 4096 byte blocks,   16 bytes per update, 256 
> updates):  19571 cycles/operation,4 cycles/byte
> [ 1530.513700] test 14 ( 4096 byte blocks,  256 bytes per update,  16 
> updates):   2758 cycles/operation,0 cycles/byte
> [ 1530.513711] test 15 ( 4096 byte blocks, 1024 bytes per update,   4 
> updates):   1676 cycles/operation,0 cycles/byte
> [ 1530.513718] test 16 ( 4096 byte blocks, 4096 bytes per update,   1 
> updates):859 cycles/operation,0 cycles/byte
> [ 1530.513722] test 17 ( 8192 byte bl

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-29 Thread George Spelvin
Sorry for the delay; my Ivy Bridge test machine isn't in my
office and getting to the console to tweak the BIOS is a
bit of a bother.

Anyway, i7-4930K, turbo boost & hyperthreading disabled,
$ cat /sys/devices/system/cpu/cpu?/cpufreq/scaling_governor
performance
performance
performance
performance
performance
performance

Oddly, though, CPU speed still seems to be fluctuating:
$ grep MHz /proc/cpuinfo
cpu MHz : 1255.875
cpu MHz : 3168.375
cpu MHz : 3062.125
cpu MHz : 1468.375
cpu MHz : 1309.000
cpu MHz : 2212.125
$ grep MHz /proc/cpuinfo
cpu MHz : 1255.875
cpu MHz : 2690.250
cpu MHz : 1255.875
cpu MHz : 2530.875
cpu MHz : 2212.125
cpu MHz : 1521.500

It does this even if I set scaling_min_freq to 3400000.
Very annoying.  Should I be using a different
scaling_governor than intel_pstate?

>> It doesn't look like a slowdown; more like a 1% speedup.
>
> You will need to throw away the first few iterations of
> the test to account for cache warming effects.

You're absolutely right; that's exactly *why* I ran it 24 times and
listed them all separately.  The "1%" number was B.S. and I was not
thinking when I quoted it.

What I had legitimately noticed was that the code with the patch took
slightly fewer cycles most of the time, even after discounting the
first few.  Not statistically significant, but enough to argue that it
didn't cause a noticeable slowdown.


Anyway, two iterations each of "modprobe tcrypt mode=319".

Old code:
[ 1530.513529] 
[ 1530.513529] testing speed of crc32c
[ 1530.513535] test  0 (   16 byte blocks,   16 bytes per update,   1 updates): 
75 cycles/operation,4 cycles/byte
[ 1530.513537] test  1 (   64 byte blocks,   16 bytes per update,   4 updates): 
   413 cycles/operation,6 cycles/byte
[ 1530.513540] test  2 (   64 byte blocks,   64 bytes per update,   1 updates): 
88 cycles/operation,1 cycles/byte
[ 1530.513542] test  3 (  256 byte blocks,   16 bytes per update,  16 updates): 
  1327 cycles/operation,5 cycles/byte
[ 1530.513548] test  4 (  256 byte blocks,   64 bytes per update,   4 updates): 
   503 cycles/operation,1 cycles/byte
[ 1530.513551] test  5 (  256 byte blocks,  256 bytes per update,   1 updates): 
   178 cycles/operation,0 cycles/byte
[ 1530.513553] test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates): 
  4972 cycles/operation,4 cycles/byte
[ 1530.513572] test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates): 
   806 cycles/operation,0 cycles/byte
[ 1530.513576] test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates): 
   370 cycles/operation,0 cycles/byte
[ 1530.513579] test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates): 
  9835 cycles/operation,4 cycles/byte
[ 1530.513615] test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates): 
  1461 cycles/operation,0 cycles/byte
[ 1530.513622] test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates): 
   847 cycles/operation,0 cycles/byte
[ 1530.513626] test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates): 
   495 cycles/operation,0 cycles/byte
[ 1530.513630] test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates): 
 19571 cycles/operation,4 cycles/byte
[ 1530.513700] test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates): 
  2758 cycles/operation,0 cycles/byte
[ 1530.513711] test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates): 
  1676 cycles/operation,0 cycles/byte
[ 1530.513718] test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates): 
   859 cycles/operation,0 cycles/byte
[ 1530.513722] test 17 ( 8192 byte blocks,   16 bytes per update, 512 updates): 
 39012 cycles/operation,4 cycles/byte
[ 1530.513861] test 18 ( 8192 byte blocks,  256 bytes per update,  32 updates): 
  5417 cycles/operation,0 cycles/byte
[ 1530.513882] test 19 ( 8192 byte blocks, 1024 bytes per update,   8 updates): 
  3162 cycles/operation,0 cycles/byte
[ 1530.513894] test 20 ( 8192 byte blocks, 4096 bytes per update,   2 updates): 
  1678 cycles/operation,0 cycles/byte
[ 1530.513901] test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates): 
  1653 cycles/operation,0 cycles/byte

[ 1662.359717] 
[ 1662.359717] testing speed of crc32c
[ 1662.359723] test  0 (   16 byte blocks,   16 bytes per update,   1 updates): 
80 cycles/operation,5 cycles/byte
[ 1662.359725] test  1 (   64 byte blocks,   16 bytes per update,   4 updates): 
   430 cycles/operation,6 cycles/byte
[ 1662.359729] test  2 (   64 byte blocks,   64 bytes per update,   1 updates): 
81 cycles/operation,1 cycles/byte
[ 1662.359730] test  3 (  256 byte blocks,   16 bytes per update,  16 updates): 
  1324 cycles/operation,5 cycles/byte
[ 1662.359736] test  4 (  256 byte blocks,   64 bytes per update,   4 updates): 
   503 cycles/operation,1 cycles/byte
[ 1662.359740] test  5

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-28 Thread Jan Beulich
>>> "George Spelvin"  05/28/14 11:47 PM >>>
>Jan Beulich  wrote:
>> "George Spelvin"  05/28/14 4:40 PM
>>> Jan: Is support for SLE10's pre-2.18 binutils still required?
>>> Your PEXTRD fix was only a year ago, so I expect, but I wanted to ask.
>
>> I'd much appreciate if I would be able to build the kernel that way for
>> another while.
>
>Does it matter that the code I'm working on is 64-bit only?

No.

>It already
>uses crc32q instruction (added with SSE4.2) with no assembler workarounds,
>so I figure pmovzxdq (part of SSE 4.1) doesn't make it any worse.

If that's the case, then adding another (earlier) one shouldn't be an issue.

Jan

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-28 Thread Tim Chen
On Wed, 2014-05-28 at 19:01 -0400, George Spelvin wrote:
> Thanks for the reply!
> 
> > Changing from the aligned move (movdqa) to unaligned move and zeroing
> > (pmovzxdq), is going to make things slower.  If the table is aligned
> > on 8 byte boundary, some of the table can span 2 cache lines, which
> > can slow things further.
> 
> Um, two notes:
> 1) This load is performed once per 3072-byte block, which
>is a minimum of 128 cycles just for the crc32q instructions,
>never mind all the pclmulqdq folderol.
> 
>Is it really more than 2 cycles?  Heck, is it *any* overall
>time given that it's preceded by a stretch of 384 instructions
>that it's not data-dependent on?
> 
>I'll do some benchmarking to find out.
> 
> 2) The shrunk table entries are 8 bytes long, and so can't
>span a cache line.  Is there any benefit to using a
>larger alignment, other than the very small issue of the
>full table needing 1 more cache line to be fully cached?

I think you are fine.  Each entry should fit in a cache line
entirely.  With the reduced entry size, we will be fitting
twice as many entries per cache line so it may help to reduce
the cache miss.

>
> > We are trading speed for only 4096 bytes of memory save,
> > which is likely not a good trade for most systems except for 
> > those really constrained of memory.  For this kind of non-performance
> > critical system, it may as well use the generic crc32c algorithm and
> > compile out this module.
> 
> I hadn't intended to cause any speed penalty at all.
> Do you really think there will be one?

If you can do some benchmarking to find out the change's
speed impact, that will help to eliminate concerns about
speed penalty.  

Thanks.

Tim

Tim



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-28 Thread George Spelvin
Thanks for the reply!

> Changing from the aligned move (movdqa) to unaligned move and zeroing
> (pmovzxdq), is going to make things slower.  If the table is aligned
> on 8 byte boundary, some of the table can span 2 cache lines, which
> can slow things further.

Um, two notes:
1) This load is performed once per 3072-byte block, which
   is a minimum of 128 cycles just for the crc32q instructions,
   never mind all the pclmulqdq folderol.

   Is it really more than 2 cycles?  Heck, is it *any* overall
   time given that it's preceded by a stretch of 384 instructions
   that it's not data-dependent on?

   I'll do some benchmarking to find out.

2) The shrunk table entries are 8 bytes long, and so can't
   span a cache line.  Is there any benefit to using a
   larger alignment, other than the very small issue of the
   full table needing 1 more cache line to be fully cached?
   
> We are trading speed for only 4096 bytes of memory save,
> which is likely not a good trade for most systems except for 
> those really constrained of memory.  For this kind of non-performance
> critical system, it may as well use the generic crc32c algorithm and
> compile out this module.

I hadn't intended to cause any speed penalty at all.
Do you really think there will be one?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-28 Thread Tim Chen
On Wed, 2014-05-28 at 10:40 -0400, George Spelvin wrote:
> While following a number of tangents in the code (I was figuring out
> how to edit lib/Kconfig; don't ask), I came across a table of 256 64-bit
> words, all of which had the high half set to zero.
> 
> Since the code depends on both pclmulq and crc32, SSE 4.1 is obviously
> present, so it could use pmovzxdq and save 1K of kernel data.
> 
> The following patch obviously lacks the kludges for old binutils,
> but should convey the general idea.
> 
> Jan: Is support for SLE10's pre-2.18 binutils still required?
> Your PEXTRD fix was only a year ago, so I expect, but I wanted to ask.
> 
> Two other minor additional changes:
> 
> 1. The current code unnecessarily puts the table in the read-write
>.data section.  Moved to .text.
> 2. I'm also not sure why it's necessary to force such large alignment
>on K_table.  Comments on reducing it?
> 
> Signed-off-by: George Spelvin 
> 
> 
> diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S 
> b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
> index dbc4339b..9f885ee4 100644
> --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
> +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
> @@ -216,15 +216,11 @@ LABEL crc_ %i
>   ## 4) Combine three results:
>   
>  
> - lea (K_table-16)(%rip), bufp# first entry is for idx 1
> + lea (K_table-8)(%rip), bufp # first entry is for idx 1
>   shlq$3, %rax# rax *= 8
> - subq%rax, tmp   # tmp -= rax*8
> - shlq$1, %rax
> - subq%rax, tmp   # tmp -= rax*16
> - # (total tmp -= rax*24)
> - addq%rax, bufp
> -
> - movdqa  (bufp), %xmm0   # 2 consts: K1:K2
> + pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2

Changing from the aligned move (movdqa) to unaligned move and zeroing
(pmovzxdq), is going to make things slower.  If the table is aligned
on 8 byte boundary, some of the table can span 2 cache lines, which
can slow things further.

We are trading speed for only 4096 bytes of memory save,
which is likely not a good trade for most systems except for 
those really constrained of memory.  For this kind of non-performance
critical system, it may as well use the generic crc32c algorithm and
compile out this module.

Thanks.

Tim



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-28 Thread George Spelvin
Jan Beulich  wrote:
> "George Spelvin"  05/28/14 4:40 PM
>> Jan: Is support for SLE10's pre-2.18 binutils still required?
>> Your PEXTRD fix was only a year ago, so I expect, but I wanted to ask.

> I'd much appreciate if I would be able to build the kernel that way for
> another while.

Does it matter that the code I'm working on is 64-bit only?  It already
uses crc32q instruction (added with SSE4.2) with no assembler workarounds,
so I figure pmovzxdq (part of SSE 4.1) doesn't make it any worse.

The annoying thing about doing it with macros is that it would be a
PITA to support a memory operand; I'd probably have to punt to .byte.

> Putting data into .text seems wrong - it should go into .rodata.

I don't really care, but it's being accessed PC-relative the same as
a jump table that's already in .text, so I just figured I'd be lazy.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-28 Thread Jan Beulich
>>> "George Spelvin"  05/28/14 4:40 PM >>>
>Jan: Is support for SLE10's pre-2.18 binutils still required?
>Your PEXTRD fix was only a year ago, so I expect, but I wanted to ask.

I'd much appreciate if I would be able to build the kernel that way for another 
while.

>Two other minor additional changes:
>
>1. The current code unnecessarily puts the table in the read-write
   >.data section.  Moved to .text.

Putting data into .text seems wrong - it should go into .rodata.

Jan

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-28 Thread George Spelvin
Um, yeah, I just noticed the problem with that patch: half of the numbers
in that table are 33 bits, and cause a pile of warnings (not errors,
unfortunately!) from gas that scrolled by when I wasn't looking.

Logically, there should be no need for 33-bit values; they should all be
reducible modulo the polynomial.  But that is going to take a slightly
larger change.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-28 Thread George Spelvin
While following a number of tangents in the code (I was figuring out
how to edit lib/Kconfig; don't ask), I came across a table of 256 64-bit
words, all of which had the high half set to zero.

Since the code depends on both pclmulq and crc32, SSE 4.1 is obviously
present, so it could use pmovzxdq and save 1K of kernel data.

The following patch obviously lacks the kludges for old binutils,
but should convey the general idea.

Jan: Is support for SLE10's pre-2.18 binutils still required?
Your PEXTRD fix was only a year ago, so I expect, but I wanted to ask.

Two other minor additional changes:

1. The current code unnecessarily puts the table in the read-write
   .data section.  Moved to .text.
2. I'm also not sure why it's necessary to force such large alignment
   on K_table.  Comments on reducing it?

Signed-off-by: George Spelvin 


diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S 
b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index dbc4339b..9f885ee4 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -216,15 +216,11 @@ LABEL crc_ %i
## 4) Combine three results:

 
-   lea (K_table-16)(%rip), bufp    # first entry is for idx 1
+   lea (K_table-8)(%rip), bufp     # first entry is for idx 1
    shlq    $3, %rax                # rax *= 8
-   subq    %rax, tmp               # tmp -= rax*8
-   shlq    $1, %rax
-   subq    %rax, tmp               # tmp -= rax*16
-                                   # (total tmp -= rax*24)
-   addq    %rax, bufp
-
-   movdqa  (bufp), %xmm0           # 2 consts: K1:K2
+   pmovzxdq (bufp,%rax), %xmm0     # 2 consts: K1:K2
+   leal    (%eax,%eax,2), %eax     # rax *= 3 (total *24)
+   subq    %rax, tmp               # tmp -= rax*24
 
    movq    crc_init, %xmm1         # CRC for block 1
    PCLMULQDQ 0x00,%xmm0,%xmm1      # Multiply by K2
@@ -331,136 +327,135 @@ ENDPROC(crc_pcl)
 

## PCLMULQDQ tables
-   ## Table is 128 entries x 2 quad words each
+   ## Table is 128 entries x 2 words (8 bytes) each

-.data
-.align 64
+.align 8
 K_table:
-.quad 0x14cd00bd6,0x105ec76f0
+.long 0x14cd00bd6,0x105ec76f0
-.quad 0x0ba4fc28e,0x14cd00bd6
+.long 0x0ba4fc28e,0x14cd00bd6
-.quad 0x1d82c63da,0x0f20c0dfe
+.long 0x1d82c63da,0x0f20c0dfe
-.quad 0x09e4addf8,0x0ba4fc28e
+.long 0x09e4addf8,0x0ba4fc28e
-.quad 0x039d3b296,0x1384aa63a
+.long 0x039d3b296,0x1384aa63a
-.quad 0x102f9b8a2,0x1d82c63da
+.long 0x102f9b8a2,0x1d82c63da
-.quad 0x14237f5e6,0x01c291d04
+.long 0x14237f5e6,0x01c291d04
-.quad 0x00d3b6092,0x09e4addf8
+.long 0x00d3b6092,0x09e4addf8

(Remaining boring bits of this hunk elided.)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/