Re: [Intel-gfx] [PATCH 58/67] drm/i915/cnl: Cannonlake color init.

2017-04-24 Thread Vivi, Rodrigo


> On Apr 24, 2017, at 10:57 AM, Ville Syrjälä  
> wrote:
> 
>> On Thu, Apr 06, 2017 at 12:15:54PM -0700, Rodrigo Vivi wrote:
>> Cannonlake has same color setup as Geminilake.
>> Legacy color load luts doesn't work anymore on Cannonlake+.
> 
> Not sure what that means. The legacy 8bpc LUT is no longer there?
> The code still depends on that working, and we also still expose the C8
> format which at least used to depend on the legacy LUT.

Not sure... what I tried to say is that by using old sequences available for 
luts we just get black screen... while in previous platforms I remember that 
old setup sequences worked while we didn't have the new ones...

> 
>> 
>> Signed-off-by: Rodrigo Vivi 
>> ---
>> drivers/gpu/drm/i915/i915_pci.c  | 1 +
>> drivers/gpu/drm/i915/intel_color.c   | 2 +-
>> drivers/gpu/drm/i915/intel_display.c | 4 ++--
>> drivers/gpu/drm/i915/intel_sprite.c  | 2 +-
>> 4 files changed, 5 insertions(+), 4 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_pci.c 
>> b/drivers/gpu/drm/i915/i915_pci.c
>> index bace848..1e8e0ac 100644
>> --- a/drivers/gpu/drm/i915/i915_pci.c
>> +++ b/drivers/gpu/drm/i915/i915_pci.c
>> @@ -438,6 +438,7 @@
>>.gen = 10,
>>.ddb_size = 1024,
>>.has_csr = 1,
>> +.color = { .degamma_lut_size = 0, .gamma_lut_size = 1024 }
>> };
>> 
>> /*
>> diff --git a/drivers/gpu/drm/i915/intel_color.c 
>> b/drivers/gpu/drm/i915/intel_color.c
>> index 306c6b0..f85d575 100644
>> --- a/drivers/gpu/drm/i915/intel_color.c
>> +++ b/drivers/gpu/drm/i915/intel_color.c
>> @@ -615,7 +615,7 @@ void intel_color_init(struct drm_crtc *crtc)
>>   IS_BROXTON(dev_priv)) {
>>dev_priv->display.load_csc_matrix = i9xx_load_csc_matrix;
>>dev_priv->display.load_luts = broadwell_load_luts;
>> -} else if (IS_GEMINILAKE(dev_priv)) {
>> +} else if (IS_GEMINILAKE(dev_priv) || IS_CANNONLAKE(dev_priv)) {
>>dev_priv->display.load_csc_matrix = i9xx_load_csc_matrix;
>>dev_priv->display.load_luts = glk_load_luts;
>>} else {
>> diff --git a/drivers/gpu/drm/i915/intel_display.c 
>> b/drivers/gpu/drm/i915/intel_display.c
>> index 3adee22..697c112 100644
>> --- a/drivers/gpu/drm/i915/intel_display.c
>> +++ b/drivers/gpu/drm/i915/intel_display.c
>> @@ -3302,7 +3302,7 @@ u32 skl_plane_ctl(const struct intel_crtc_state 
>> *crtc_state,
>> 
>>plane_ctl = PLANE_CTL_ENABLE;
>> 
>> -if (!IS_GEMINILAKE(dev_priv)) {
>> +if (!IS_GEMINILAKE(dev_priv) && !IS_CANNONLAKE(dev_priv)) {
>>plane_ctl |=
>>PLANE_CTL_PIPE_GAMMA_ENABLE |
>>PLANE_CTL_PIPE_CSC_ENABLE |
>> @@ -3359,7 +3359,7 @@ static void skylake_update_primary_plane(struct 
>> drm_plane *plane,
>> 
>>spin_lock_irqsave(_priv->uncore.lock, irqflags);
>> 
>> -if (IS_GEMINILAKE(dev_priv)) {
>> +if (IS_GEMINILAKE(dev_priv) || IS_CANNONLAKE(dev_priv)) {
>>I915_WRITE_FW(PLANE_COLOR_CTL(pipe, plane_id),
>>  PLANE_COLOR_PIPE_GAMMA_ENABLE |
>>  PLANE_COLOR_PIPE_CSC_ENABLE |
>> diff --git a/drivers/gpu/drm/i915/intel_sprite.c 
>> b/drivers/gpu/drm/i915/intel_sprite.c
>> index f7d4314..a002c1a 100644
>> --- a/drivers/gpu/drm/i915/intel_sprite.c
>> +++ b/drivers/gpu/drm/i915/intel_sprite.c
>> @@ -240,7 +240,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc, 
>> struct intel_flip_work *work
>> 
>>spin_lock_irqsave(_priv->uncore.lock, irqflags);
>> 
>> -if (IS_GEMINILAKE(dev_priv)) {
>> +if (IS_GEMINILAKE(dev_priv) || IS_CANNONLAKE(dev_priv)) {
>>I915_WRITE_FW(PLANE_COLOR_CTL(pipe, plane_id),
>>  PLANE_COLOR_PIPE_GAMMA_ENABLE |
>>  PLANE_COLOR_PIPE_CSC_ENABLE |
>> -- 
>> 1.9.1
>> 
>> ___
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> -- 
> Ville Syrjälä
> Intel OTC
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [Regression report] Weekly regression report WW16

2017-04-24 Thread Jairo Miramontes
Link to FDO regression list: 
https://bugs.freedesktop.org/buglist.cgi?bug_status=NEW_status=ASSIGNED_status=REOPENED_status=NEEDINFO=DRM%2FIntel=OP=OP=short_desc=keywords=CP=CP=OR_name=i915%20regressions_id=600614=anywordssubstr=anywordssubstr_sys=All_sys=Linux%20%28All%29_sys=other=bug_id%20DESC=DRI_based_on=i915%20regressions_format=advanced_platform=All_platform=x86%20%28IA32%29_platform=x86-64%20%28AMD64%29_platform=IA64%20%28Itanium%29_platform=Other=regression%20bisect=regression%20bisected%20pending_bisect


Total regressions: 30

This week regressions:4

++-+++

| BugId| Summary| Created on | Bisect |

++-+++

| 100731 | [GLK - IGT] [regression] igt@pm_rps@basic-api| 2017-04-20 | No|

| 100730 | [GLK - IGT] [regression] igt@kms_frontbuffer_tracking@basic | 
2017-04-19 | No|


| 100729 | [GLK - IGT] [ regression] igt@gem_tiled* @basic| 2017-04-19 | No|

| 100728 | [GLK - IGT] [regression] - igt@gem_render* @basic| 2017-04-19 
| No|


++-+++

Previous regressions:26

++--+++

| BugId| Summary| Created on | Bisect |

++--+++

| 99929| [i915] Black/gray screen with kernel 4.9.3| 2017-02-23 | Yes|

| 99578| [SKL][bisected] Screen regularly returns from power save mode| 
2017-01-28 | Yes|


| 99575| [ilk bisected] "drm/i915: Type safe register read/write" -> 
poor/nonexistent IPS | 2017-01-28 | Yes|


| 99362| no resolutions >=1080p with Acer P7500 and Thinkpad X1| 
2017-01-11 | No|


| 99093| [PNV][regression] N450 and D510 machines get stuck in 
igt@gem_ringfill@basic-def | 2016-12-15 | Yes|


| 98690| [SKL bisected] System freeze when starting X using kernel 
4.9-rc1 or later| 2016-11-11 | Yes|


| 98517| Skylake gen6 suspend/resume video regression 4.9| 2016-10-31 | No|

| 97918| [bdw regression 4.8] Severe graphics regression, rainbow 
glitching and flickerin | 2016-09-25 | No|


| 96781| [skl dp-mst] oops in atomic commit| 2016-07-02 | Yes|

| 96428| [IVB bisected] [drm:intel_dp_aux_ch] *ERROR* dp aux hw did not 
signal timeout (h | 2016-06-07 | Yes|


| 95736| [IVB bisected] *ERROR* uncleared fifo underrun on pipe A| 
2016-05-24 | Yes|


| 94590| [KBL] igt/kms_fbcon_fbt/psr-suspend regression| 2016-03-17 | No|

| 93782| [i9xx TV][BISECT] vblank wait timeout on crtc| 2016-01-19 | Yes|

| 93486| [HP Compaq dc7800 Small Form Factor PC][REGRESSION] 
suspend/resume failure| 2015-12-23 | No|


| 93361| 12bpc hdmi causes wrong real refresh rate (swapbuffers return 
time)| 2015-12-12 | Yes|


| 92050| [regression]/bug introduced by commit 
[0e572fe7383a376992364914694c39aa7fe44c1d] | 2015-09-19 | Yes|


| 91974| [bisected] unrecoverable black screen after killing X| 
2015-09-11 | Yes|


| 89632| [i965 
regression]igt/kms_universal_plane/universal-plane-pipe-A-functional 
cause | 2015-03-18 | No|


| 88124| i915: regression: after DP connected (via docking station) 
monitor is turned off | 2015-01-06 | Yes|


| 87726| [BDW Bisected] OglDrvCtx performance reduced by ~30% after use 
true PPGTT in Gen | 2014-12-26 | Yes|


| 87725| [BDW Bisected] OglBatch7 performance reduced by ~7% after 
enable execlists| 2014-12-26 | Yes|


| 87131| [PNV regression] igt/gem_exec_lut_handle takes more than 10 
minutes| 2014-12-09 | No|


| 100617 | [Regression bisected] Gnome Shell slow animations| 2017-04-08 
| Yes|


| 100610 | WARNING: CPU: 1 PID: 16615 at 
drivers/gpu/drm/drm_debugfs_crc.c:185 crtc_crc_ope | 2017-04-07 | Yes|


| 100548 | [BAT][CTG] DRM-Tip 4.11.0-rc5 introduced kms_pipe_crc_basic 
regression on C2D| 2017-04-04 | No|


| 100221 | [SKL] Resume from suspend to disk fails - bisected| 
2017-03-15 | Yes|


++--+++

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 05/20] drm/i915/tdr: Add support for per engine reset recovery

2017-04-24 Thread Michel Thierry



On 20/04/17 17:17, Michel Thierry wrote:

Hmm. Interesting. This relies on i915_gem_retire_requests() (i.e.
struct_mutex) to skip replaying innocent requests, but here we should be
asserting that we do have the hung request.

i.e.
request = i915_gem_find_active_request(engine);
if (!request)
goto skip.

Bonus points for tying that into i915_gem_reset_prepare_engine() so that
we only search for the active_request once.



Will this do it?
https://patchwork.freedesktop.org/patch/152494/  (ignore the DRM_ERROR I 
still have to change)


I'm not sure about reusing the active request in full-reset (what if we 
have more than one engine hung?).


Thanks
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v2 1/2] PCI / PM: Add needs_resume flag to avoid suspend complete optimization

2017-04-24 Thread Rafael J. Wysocki
On Monday, April 24, 2017 10:42:42 PM Lukas Wunner wrote:
> On Mon, Apr 24, 2017 at 10:02:30PM +0200, Lukas Wunner wrote:
> > On Mon, Apr 24, 2017 at 05:27:42PM +0300, Imre Deak wrote:
> > > Some drivers - like i915 - may not support the system suspend direct
> > > complete optimization due to differences in their runtime and system
> > > suspend sequence. Add a flag that when set resumes the device before
> > > calling the driver's system suspend handlers which effectively disables
> > > the optimization.
> > 
> > FWIW, there are at least two alternative solutions to this problem which
> > do not require changes to the PCI core:
> > 
> > (1) Add a ->prepare hook to i915_pm_ops which calls pm_runtime_get_sync()
> > and a ->complete hook which calls pm_runtime_put().
> 
> Thinking a bit more about this, it's even simpler:  The PM core acquires
> a runtime PM ref in device_prepare() and releases it in device_complete(),
> so it's sufficient to just call pm_runtime_resume() in a ->prepare hook
> that's newly added to i915.  No ->complete hook necessary.  Tentative
> patch below, based on drm-intel-fixes, would replace both of your patches.

Calling it in ->prepare() means that everybody is now waiting for you to resume.

Not quite optimal IMO.

Thanks,
Rafael

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v2 1/2] PCI / PM: Add needs_resume flag to avoid suspend complete optimization

2017-04-24 Thread Lukas Wunner
On Mon, Apr 24, 2017 at 10:02:30PM +0200, Lukas Wunner wrote:
> On Mon, Apr 24, 2017 at 05:27:42PM +0300, Imre Deak wrote:
> > Some drivers - like i915 - may not support the system suspend direct
> > complete optimization due to differences in their runtime and system
> > suspend sequence. Add a flag that when set resumes the device before
> > calling the driver's system suspend handlers which effectively disables
> > the optimization.
> 
> FWIW, there are at least two alternative solutions to this problem which
> do not require changes to the PCI core:
> 
> (1) Add a ->prepare hook to i915_pm_ops which calls pm_runtime_get_sync()
> and a ->complete hook which calls pm_runtime_put().

Thinking a bit more about this, it's even simpler:  The PM core acquires
a runtime PM ref in device_prepare() and releases it in device_complete(),
so it's sufficient to just call pm_runtime_resume() in a ->prepare hook
that's newly added to i915.  No ->complete hook necessary.  Tentative
patch below, based on drm-intel-fixes, would replace both of your patches.

Thanks,

Lukas

-- >8 --
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 5c089b3..6ef156b 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1820,6 +1820,11 @@ void i915_reset(struct drm_i915_private *dev_priv)
goto wakeup;
 }
 
+static int i915_pm_prepare(struct device *kdev)
+{
+   pm_runtime_resume(kdev);
+}
+
 static int i915_pm_suspend(struct device *kdev)
 {
struct pci_dev *pdev = to_pci_dev(kdev);
@@ -2451,6 +2456,7 @@ static int intel_runtime_resume(struct device *kdev)
 * S0ix (via system suspend) and S3 event handlers [PMSG_SUSPEND,
 * PMSG_RESUME]
 */
+   .prepare = i915_pm_prepare,
.suspend = i915_pm_suspend,
.suspend_late = i915_pm_suspend_late,
.resume_early = i915_pm_resume_early,
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v2 2/2] drm/i915: Prevent the system suspend complete optimization

2017-04-24 Thread Lukas Wunner
On Mon, Apr 24, 2017 at 05:27:43PM +0300, Imre Deak wrote:
> Since
> 
> commit bac2a909a096c9110525c18cbb8ce73c660d5f71
> Author: Rafael J. Wysocki 
> Date:   Wed Jan 21 02:17:42 2015 +0100
> 
> PCI / PM: Avoid resuming PCI devices during system suspend

This is not the commit you are looking for. :-)  See below.


> PCI devices will default to allowing the system suspend complete
> optimization where devices are not woken up during system suspend if
> they were already runtime suspended. This however breaks the i915/HDA
> drivers for two reasons:
> 
> - The i915 driver has system suspend specific steps that it needs to
>   run, that bring the device to a different state than its runtime
>   suspended state.
> 
> - The HDA driver's suspend handler requires power that it will request
>   from the i915 driver's power domain handler. This in turn requires the
>   i915 driver to runtime resume itself, but this won't be possible if the
>   suspend complete optimization is in effect: in this case the i915
>   runtime PM is disabled and trying to get an RPM reference returns
>   -EACCESS.

Hm, sounds like something that needs to be solved with device_links.


> 
> Solve this by requiring the PCI/PM core to resume the device during
> system suspend which in effect disables the suspend complete optimization.
> 
> One possibility for this bug staying hidden for so long is that the
> optimization for a device is disabled if it's disabled for any of its
> children devices. i915 may have a backlight device as its child which
> doesn't support runtime PM and so doesn't allow the optimization either.
> So if this backlight device got registered the bug stayed hidden.

No, the reason this hasn't popped up earlier is because direct_complete
has only been enabled for DRM devices for a few months now, to be specific
since

commit d14d2a8453d650bea32a1c5271af1458cd283a0f
Author: Lukas Wunner 
Date:   Wed Jun 8 12:49:29 2016 +0200

drm: Remove dev_pm_ops from drm_class

which landed in v4.8.

(Sorry for not raising my voice earlier, this patch appeared on my radar
just now.)

Kind regards,

Lukas

> 
> Credits to Marta, Tomi and David who enabled pstore logging,
> that caught one instance of this issue across a suspend/
> resume-to-ram and Ville who remembered that the optimization was enabled
> for some devices at one point.
> 
> The first WARN triggered by the problem:
> 
> [ 6250.746445] WARNING: CPU: 2 PID: 17384 at 
> drivers/gpu/drm/i915/intel_runtime_pm.c:2846 intel_runtime_pm_get+0x6b/0xd0 
> [i915]
> [ 6250.746448] pm_runtime_get_sync() failed: -13
> [ 6250.746451] Modules linked in: snd_hda_intel i915 vgem snd_hda_codec_hdmi 
> x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul
> snd_hda_codec_realtek snd_hda_codec_generic ghash_clmulni_intel e1000e 
> snd_hda_codec snd_hwdep snd_hda_core ptp mei_me pps_core snd_pcm lpc_ich mei 
> prime_
> numbers i2c_hid i2c_designware_platform i2c_designware_core [last unloaded: 
> i915]
> [ 6250.746512] CPU: 2 PID: 17384 Comm: kworker/u8:0 Tainted: G U  W   
> 4.11.0-rc5-CI-CI_DRM_334+ #1
> [ 6250.746515] Hardware name:  /NUC5i5RYB, BIOS 
> RYBDWi35.86A.0362.2017.0118.0940 01/18/2017
> [ 6250.746521] Workqueue: events_unbound async_run_entry_fn
> [ 6250.746525] Call Trace:
> [ 6250.746530]  dump_stack+0x67/0x92
> [ 6250.746536]  __warn+0xc6/0xe0
> [ 6250.746542]  ? pci_restore_standard_config+0x40/0x40
> [ 6250.746546]  warn_slowpath_fmt+0x46/0x50
> [ 6250.746553]  ? __pm_runtime_resume+0x56/0x80
> [ 6250.746584]  intel_runtime_pm_get+0x6b/0xd0 [i915]
> [ 6250.746610]  intel_display_power_get+0x1b/0x40 [i915]
> [ 6250.746646]  i915_audio_component_get_power+0x15/0x20 [i915]
> [ 6250.746654]  snd_hdac_display_power+0xc8/0x110 [snd_hda_core]
> [ 6250.746661]  azx_runtime_resume+0x218/0x280 [snd_hda_intel]
> [ 6250.746667]  pci_pm_runtime_resume+0x76/0xa0
> [ 6250.746672]  __rpm_callback+0xb4/0x1f0
> [ 6250.746677]  ? pci_restore_standard_config+0x40/0x40
> [ 6250.746682]  rpm_callback+0x1f/0x80
> [ 6250.746686]  ? pci_restore_standard_config+0x40/0x40
> [ 6250.746690]  rpm_resume+0x4ba/0x740
> [ 6250.746698]  __pm_runtime_resume+0x49/0x80
> [ 6250.746703]  pci_pm_suspend+0x57/0x140
> [ 6250.746709]  dpm_run_callback+0x6f/0x330
> [ 6250.746713]  ? pci_pm_freeze+0xe0/0xe0
> [ 6250.746718]  __device_suspend+0xf9/0x370
> [ 6250.746724]  ? dpm_watchdog_set+0x60/0x60
> [ 6250.746730]  async_suspend+0x1a/0x90
> [ 6250.746735]  async_run_entry_fn+0x34/0x160
> [ 6250.746741]  process_one_work+0x1f2/0x6d0
> [ 6250.746749]  worker_thread+0x49/0x4a0
> [ 6250.746755]  kthread+0x107/0x140
> [ 6250.746759]  ? process_one_work+0x6d0/0x6d0
> [ 6250.746763]  ? kthread_create_on_node+0x40/0x40
> [ 6250.746768]  ret_from_fork+0x2e/0x40
> [ 6250.746778] ---[ end trace 102a62fd2160f5e6 ]---
> 
> v2:
> - Use the new pci_dev->needs_resume flag, to avoid any overhead during
>   the ->pm_prepare 

Re: [Intel-gfx] [PATCH 50/67] drm/i915/gen10+: use the SKL code for reading WM latencies

2017-04-24 Thread Ville Syrjälä
On Mon, Apr 24, 2017 at 04:10:41PM -0300, Paulo Zanoni wrote:
> Em Seg, 2017-04-24 às 21:22 +0300, Ville Syrjälä escreveu:
> > On Thu, Apr 06, 2017 at 12:15:46PM -0700, Rodrigo Vivi wrote:
> > > 
> > > From: Paulo Zanoni 
> > > 
> > > Gen 10 should use the exact same code as Gen 9, so change the check
> > > to
> > > take this into consideration, and also assume that future platforms
> > > will run this code.
> > > 
> > > Also add a MISSING_CASE(), just in case we do something wrong,
> > > instead
> > > of silently failing.
> > > 
> > > Signed-off-by: Paulo Zanoni 
> > > Signed-off-by: Rodrigo Vivi 
> > > ---
> > >  drivers/gpu/drm/i915/intel_pm.c | 6 --
> > >  1 file changed, 4 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/intel_pm.c
> > > b/drivers/gpu/drm/i915/intel_pm.c
> > > index 4c07b91..a2b2509 100644
> > > --- a/drivers/gpu/drm/i915/intel_pm.c
> > > +++ b/drivers/gpu/drm/i915/intel_pm.c
> > > @@ -2291,7 +2291,7 @@ static void ilk_compute_wm_level(const struct
> > > drm_i915_private *dev_priv,
> > >  static void intel_read_wm_latency(struct drm_i915_private
> > > *dev_priv,
> > >     uint16_t wm[8])
> > >  {
> > > - if (IS_GEN9(dev_priv)) {
> > > + if (INTEL_GEN(dev_priv) >= 9) {
> > >   uint32_t val;
> > >   int ret, i;
> > >   int level, max_level = ilk_wm_max_level(dev_priv);
> > > @@ -2351,7 +2351,7 @@ static void intel_read_wm_latency(struct
> > > drm_i915_private *dev_priv,
> > >   }
> > >  
> > >   /*
> > > -  * WaWmMemoryReadLatency:skl,glk
> > > +  * WaWmMemoryReadLatency:skl+,glk
> > 
> > When did we start to use the '+' notation in the w/a notes?
> 
> The + is because this code, when written, was going to be run for every
> gen >= 9. Stuff changed, rebases happened, so I guess I'll need to re-
> check to see that's still the case.
> 
> > 
> > What would it mean to say 'skl+,glk'? Is BXT included or not?
> 
> When I originally wrote this patch, I changed from "skl" to "skl+",
> there was no glk, and it was added by not-me during some rebase at some
> point.

Well, up to now we've never used + in these. So maybe just list every
platform explicitly. Makes it clear where it really applies. The + stuff
gets hard when the w/a is no longer needed by some more recent platform.

> 
> > 
> > > 
> > >    *
> > >    * punit doesn't take into account the read
> > > latency so we need
> > >    * to add 2us to the various latency levels we
> > > retrieve from the
> > > @@ -2390,6 +2390,8 @@ static void intel_read_wm_latency(struct
> > > drm_i915_private *dev_priv,
> > >   wm[0] = 7;
> > >   wm[1] = (mltr >> MLTR_WM1_SHIFT) & ILK_SRLT_MASK;
> > >   wm[2] = (mltr >> MLTR_WM2_SHIFT) & ILK_SRLT_MASK;
> > > + } else {
> > > + MISSING_CASE(INTEL_DEVID(dev_priv));
> > >   }
> > >  }
> > >  
> > > -- 
> > > 1.9.1
> > > 
> > > ___
> > > Intel-gfx mailing list
> > > Intel-gfx@lists.freedesktop.org
> > > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
> > 

-- 
Ville Syrjälä
Intel OTC
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v2 1/2] PCI / PM: Add needs_resume flag to avoid suspend complete optimization

2017-04-24 Thread Lukas Wunner
On Mon, Apr 24, 2017 at 05:27:42PM +0300, Imre Deak wrote:
> Some drivers - like i915 - may not support the system suspend direct
> complete optimization due to differences in their runtime and system
> suspend sequence. Add a flag that when set resumes the device before
> calling the driver's system suspend handlers which effectively disables
> the optimization.

FWIW, there are at least two alternative solutions to this problem which
do not require changes to the PCI core:

(1) Add a ->prepare hook to i915_pm_ops which calls pm_runtime_get_sync()
and a ->complete hook which calls pm_runtime_put().

(2) Assign a struct dev_pm_domain to the GPU which overrides the PCI
->prepare hook to return 0 unconditionally.  See here for an example:
http://lxr.free-electrons.com/source/drivers/gpu/vga/vga_switcheroo.c#L1056

It may be worth considering approach (1) (which is the simpler of the two)
in favor of this patch.  Because I'm not sure there are many drivers that
will make use of this change to the PCI core.

It's unfortunate that the return value of PCI drivers' ->prepare hooks
has totally different semantics than that of bus types, thereby
necessitating such workarounds. :-(

Best regards,

Lukas

> 
> Needed by the next patch fixing suspend/resume on i915.
> 
> Suggested by Rafael.
> 
> Cc: Rafael J. Wysocki 
> Cc: Bjorn Helgaas 
> Cc: linux-...@vger.kernel.org
> Cc: sta...@vger.kernel.org
> Signed-off-by: Imre Deak 
> ---
>  drivers/pci/pci.c   | 3 ++-
>  include/linux/pci.h | 7 +++
>  2 files changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 7904d02..020f02d 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -2141,7 +2141,8 @@ bool pci_dev_keep_suspended(struct pci_dev *pci_dev)
>  
>   if (!pm_runtime_suspended(dev)
>   || pci_target_state(pci_dev) != pci_dev->current_state
> - || platform_pci_need_resume(pci_dev))
> + || platform_pci_need_resume(pci_dev)
> + || pci_dev->needs_resume)
>   return false;
>  
>   /*
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index e2d1a12..3fe00a6 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -316,6 +316,9 @@ struct pci_dev {
>   unsigned inthotplug_user_indicators:1; /* SlotCtl indicators
> controlled exclusively by
> user sysfs */
> + unsigned intneeds_resume:1; /* Resume before calling the driver's
> +system suspend hooks, disabling the
> +direct_complete optimization. */
>   unsigned intd3_delay;   /* D3->D0 transition time in ms */
>   unsigned intd3cold_delay;   /* D3cold->D0 transition time in ms */
>  
> @@ -1113,6 +1116,10 @@ bool pci_check_pme_status(struct pci_dev *dev);
>  void pci_pme_wakeup_bus(struct pci_bus *bus);
>  void pci_d3cold_enable(struct pci_dev *dev);
>  void pci_d3cold_disable(struct pci_dev *dev);
> +static inline void pci_resume_before_suspend(struct pci_dev *dev, bool 
> enable)
> +{
> + dev->needs_resume = enable;
> +}
>  
>  static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state,
> bool enable)
> -- 
> 2.5.0
> 
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 50/67] drm/i915/gen10+: use the SKL code for reading WM latencies

2017-04-24 Thread Paulo Zanoni
Em Seg, 2017-04-24 às 21:22 +0300, Ville Syrjälä escreveu:
> On Thu, Apr 06, 2017 at 12:15:46PM -0700, Rodrigo Vivi wrote:
> > 
> > From: Paulo Zanoni 
> > 
> > Gen 10 should use the exact same code as Gen 9, so change the check
> > to
> > take this into consideration, and also assume that future platforms
> > will run this code.
> > 
> > Also add a MISSING_CASE(), just in case we do something wrong,
> > instead
> > of silently failing.
> > 
> > Signed-off-by: Paulo Zanoni 
> > Signed-off-by: Rodrigo Vivi 
> > ---
> >  drivers/gpu/drm/i915/intel_pm.c | 6 --
> >  1 file changed, 4 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_pm.c
> > b/drivers/gpu/drm/i915/intel_pm.c
> > index 4c07b91..a2b2509 100644
> > --- a/drivers/gpu/drm/i915/intel_pm.c
> > +++ b/drivers/gpu/drm/i915/intel_pm.c
> > @@ -2291,7 +2291,7 @@ static void ilk_compute_wm_level(const struct
> > drm_i915_private *dev_priv,
> >  static void intel_read_wm_latency(struct drm_i915_private
> > *dev_priv,
> >       uint16_t wm[8])
> >  {
> > -   if (IS_GEN9(dev_priv)) {
> > +   if (INTEL_GEN(dev_priv) >= 9) {
> >     uint32_t val;
> >     int ret, i;
> >     int level, max_level = ilk_wm_max_level(dev_priv);
> > @@ -2351,7 +2351,7 @@ static void intel_read_wm_latency(struct
> > drm_i915_private *dev_priv,
> >     }
> >  
> >     /*
> > -    * WaWmMemoryReadLatency:skl,glk
> > +    * WaWmMemoryReadLatency:skl+,glk
> 
> When did we start to use the '+' notation in the w/a notes?

The + is because this code, when written, was going to be run for every
gen >= 9. Stuff changed, rebases happened, so I guess I'll need to re-
check to see that's still the case.

> 
> What would it mean to say 'skl+,glk'? Is BXT included or not?

When I originally wrote this patch, I changed from "skl" to "skl+",
there was no glk, and it was added by not-me during some rebase at some
point.

> 
> > 
> >      *
> >      * punit doesn't take into account the read
> > latency so we need
> >      * to add 2us to the various latency levels we
> > retrieve from the
> > @@ -2390,6 +2390,8 @@ static void intel_read_wm_latency(struct
> > drm_i915_private *dev_priv,
> >     wm[0] = 7;
> >     wm[1] = (mltr >> MLTR_WM1_SHIFT) & ILK_SRLT_MASK;
> >     wm[2] = (mltr >> MLTR_WM2_SHIFT) & ILK_SRLT_MASK;
> > +   } else {
> > +   MISSING_CASE(INTEL_DEVID(dev_priv));
> >     }
> >  }
> >  
> > -- 
> > 1.9.1
> > 
> > ___
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 40/67] drm/i915/cnl: Enable loadgen_select bit for vswing sequence

2017-04-24 Thread Ville Syrjälä
On Thu, Apr 06, 2017 at 12:15:36PM -0700, Rodrigo Vivi wrote:
> From: Clint Taylor 
> 
> vswing programming sequence step 2 requires the Loadgen_select bit to
> be set in PORT_TX_DW4 lane registers per table defined by Bit rate and
> lane width. Implemented the change that was marked as FIXME in the
> driver.
> 
> v2: (Rodrigo) checkpatch fixes.
> 
> Signed-off-by: Clint Taylor 
> Signed-off-by: Rodrigo Vivi 
> ---
>  drivers/gpu/drm/i915/intel_ddi.c | 38 --
>  1 file changed, 32 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_ddi.c 
> b/drivers/gpu/drm/i915/intel_ddi.c
> index a4d7061..3f461c3 100644
> --- a/drivers/gpu/drm/i915/intel_ddi.c
> +++ b/drivers/gpu/drm/i915/intel_ddi.c
> @@ -1846,10 +1846,24 @@ static void cnl_ddi_vswing_program(struct 
> drm_i915_private *dev_priv,
>   I915_WRITE(CNL_PORT_TX_DW7_GRP(port), val);
>  }
>  
> -static void cnl_ddi_vswing_sequence(struct drm_i915_private *dev_priv,
> - u32 level, enum port port, int type)
> +static void cnl_ddi_vswing_sequence(struct intel_encoder *encoder, u32 level)

This change looks unrelated. And it reminds me that I should resurrect
my remaining encoder->type fixings...

>  {
> + struct drm_i915_private *dev_priv = to_i915(encoder->base.dev);
> + struct intel_dp *intel_dp = enc_to_intel_dp(>base);
> + enum port port = intel_ddi_get_encoder_port(encoder);
> + int type = encoder->type;

> + int width = 0;
> + int rate = 0;
>   u32 val;
> + int ln = 0;
> +
> + if ((intel_dp) && (type == INTEL_OUTPUT_EDP || type == 
> INTEL_OUTPUT_DP)) {

intel_dp isn't needed outside this branch, so the declaration could be
moved here.

> + width = intel_dp->lane_count;
> + rate = intel_dp->link_rate;
> + } else {
> + width = 4;
> + /* Rate is always < than 6GHz for HDMI */
> + }
>  
>   /*
>* 1. If port type is eDP or DP,
> @@ -1865,8 +1879,21 @@ static void cnl_ddi_vswing_sequence(struct 
> drm_i915_private *dev_priv,
>  
>   /* 2. Program loadgen select */
>   /*
> -  * FIXME: Program PORT_TX_DW4_LN depending on Bit rate and used lanes
> +  * Program PORT_TX_DW4_LN depending on Bit rate and used lanes
> +  * <= 6 GHz and 4 lanes (LN0=0, LN1=1, LN2=1, LN3=1)
> +  * <= 6 GHz and 1,2 lanes (LN0=0, LN1=1, LN2=1, LN3=0)
> +  * > 6 GHz (LN0=0, LN1=0, LN2=0, LN3=0)
>*/
> + for (ln = 0; ln <= 3; ln++) {

< 4 would look more typical.

> + val = I915_READ(CNL_PORT_TX_DW4_LN(port, ln));
> + val &= ~LOADGEN_SELECT;
> +
> + if (((rate < 60) && (width == 4) && (ln >= 1))  ||
> + ((rate < 60) && (width < 4) && ((ln == 1) || (ln == 
> 2 {

< looks wrong based on the comment. Should be <=

Looks lispy. Could trim some of the parens.

> + val |= LOADGEN_SELECT;
> + }
> + I915_WRITE(CNL_PORT_TX_DW4_LN(port, ln), val);
> + }
>  
>   /* 3. Set PORT_CL_DW5 SUS Clock Config to 11b */
>   val = I915_READ(CNL_PORT_CL1CM_DW5);
> @@ -1920,7 +1947,7 @@ uint32_t ddi_signal_levels(struct intel_dp *intel_dp)
>   else if (IS_GEN9_LP(dev_priv))
>   bxt_ddi_vswing_sequence(dev_priv, level, port, encoder->type);
>   else if (IS_CANNONLAKE(dev_priv)) {
> - cnl_ddi_vswing_sequence(dev_priv, level, port, encoder->type);
> + cnl_ddi_vswing_sequence(encoder, level);
>   /* DDI_BUF_CTL bits 27:24 are reserved on CNL */
>   return 0;
>   }
> @@ -2022,8 +2049,7 @@ static void intel_ddi_pre_enable_hdmi(struct 
> intel_encoder *encoder,
>   bxt_ddi_vswing_sequence(dev_priv, level, port,
>   INTEL_OUTPUT_HDMI);
>   else if (IS_CANNONLAKE(dev_priv))
> - cnl_ddi_vswing_sequence(dev_priv, level, port,
> - INTEL_OUTPUT_HDMI);
> + cnl_ddi_vswing_sequence(encoder, level);
>  
>   intel_hdmi->set_infoframes(drm_encoder,
>  has_hdmi_sink,
> -- 
> 1.9.1
> 
> ___
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] ✗ Fi.CI.BAT: failure for Enable OA unit for Gen 8 and 9 in i915 perf (rev7)

2017-04-24 Thread Patchwork
== Series Details ==

Series: Enable OA unit for Gen 8 and 9 in i915 perf (rev7)
URL   : https://patchwork.freedesktop.org/series/20084/
State : failure

== Summary ==

cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/gvt/scheduler.o' failed
make[4]: *** [drivers/gpu/drm/i915/gvt/scheduler.o] Error 1
cc1: all warnings being treated as errors
cc1: all warnings being treated as errors
cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/i915_gpu_error.o' failed
make[4]: *** [drivers/gpu/drm/i915/i915_gpu_error.o] Error 1
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/i915_vgpu.o' failed
make[4]: *** [drivers/gpu/drm/i915/i915_vgpu.o] Error 1
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/intel_fbdev.o' failed
make[4]: *** [drivers/gpu/drm/i915/intel_fbdev.o] Error 1
cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/intel_hdmi.o' failed
make[4]: *** [drivers/gpu/drm/i915/intel_hdmi.o] Error 1
cc1: all warnings being treated as errors
cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/i915_oa_hsw.o' failed
make[4]: *** [drivers/gpu/drm/i915/i915_oa_hsw.o] Error 1
cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/i915_perf.o' failed
make[4]: *** [drivers/gpu/drm/i915/i915_perf.o] Error 1
cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/selftests/i915_selftest.o' failed
make[4]: *** [drivers/gpu/drm/i915/selftests/i915_selftest.o] Error 1
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/dvo_tfp410.o' failed
make[4]: *** [drivers/gpu/drm/i915/dvo_tfp410.o] Error 1
scripts/Makefile.build:294: recipe for target 'drivers/gpu/drm/i915/intel_dp.o' 
failed
make[4]: *** [drivers/gpu/drm/i915/intel_dp.o] Error 1
cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/gvt/cmd_parser.o' failed
make[4]: *** [drivers/gpu/drm/i915/gvt/cmd_parser.o] Error 1
cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 'drivers/gpu/drm/i915/intel_pm.o' 
failed
make[4]: *** [drivers/gpu/drm/i915/intel_pm.o] Error 1
cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 'drivers/gpu/drm/i915/gvt/gtt.o' 
failed
make[4]: *** [drivers/gpu/drm/i915/gvt/gtt.o] Error 1
cc1: all warnings being treated as errors
cc1: all warnings being treated as errors
  LD [M]  drivers/net/ethernet/intel/e1000/e1000.o
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/gvt/execlist.o' failed
make[4]: *** [drivers/gpu/drm/i915/gvt/execlist.o] Error 1
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/intel_sdvo.o' failed
make[4]: *** [drivers/gpu/drm/i915/intel_sdvo.o] Error 1
  LD  drivers/scsi/scsi_mod.o
cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/gvt/handlers.o' failed
make[4]: *** [drivers/gpu/drm/i915/gvt/handlers.o] Error 1
  LD  drivers/tty/serial/8250/8250_base.o
  LD  drivers/tty/serial/8250/built-in.o
  LD [M]  drivers/net/ethernet/intel/igbvf/igbvf.o
cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 
'drivers/gpu/drm/i915/intel_display.o' failed
make[4]: *** [drivers/gpu/drm/i915/intel_display.o] Error 1
scripts/Makefile.build:553: recipe for target 'drivers/gpu/drm/i915' failed
make[3]: *** [drivers/gpu/drm/i915] Error 2
make[3]: *** Waiting for unfinished jobs
  AR  lib/lib.a
  LD  drivers/tty/serial/built-in.o
  EXPORTS lib/lib-ksyms.o
  LD  net/xfrm/built-in.o
  LD  drivers/usb/core/usbcore.o
  LD  lib/built-in.o
  CC  arch/x86/kernel/cpu/capflags.o
  LD  arch/x86/kernel/cpu/built-in.o
  LD  drivers/usb/core/built-in.o
  LD  net/ipv4/built-in.o
  LD  fs/btrfs/btrfs.o
scripts/Makefile.build:553: recipe for target 'arch/x86/kernel' failed
make[1]: *** [arch/x86/kernel] Error 2
Makefile:1002: recipe for target 'arch/x86' failed
make: *** [arch/x86] Error 2
make: *** Waiting for unfinished jobs
  LD  fs/btrfs/built-in.o
  LD  drivers/scsi/sd_mod.o
  LD  drivers/scsi/built-in.o
scripts/Makefile.build:553: recipe for target 'drivers/gpu/drm' failed
make[2]: *** [drivers/gpu/drm] Error 2
scripts/Makefile.build:553: recipe for target 'drivers/gpu' failed
make[1]: *** [drivers/gpu] Error 2
make[1]: *** Waiting for unfinished jobs
  LD  drivers/usb/host/xhci-hcd.o
  LD  drivers/tty/vt/built-in.o
  LD  drivers/tty/built-in.o
  LD  drivers/md/md-mod.o
  LD  drivers/md/built-in.o
  LD  net/core/built-in.o
  LD  fs/ext4/ext4.o
  LD  fs/ext4/built-in.o
  LD  net/built-in.o
 

[Intel-gfx] [PATCH v6 12/15] drm/i915/perf: Add OA unit support for Gen 8+

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

Enables access to OA unit metrics for BDW, CHV, SKL and BXT which all
share (more-or-less) the same OA unit design.

Of particular note in comparison to Haswell: some OA unit HW config
state has become per-context state and as a consequence it is somewhat
more complicated to manage synchronous state changes from the cpu while
there's no guarantee of what context (if any) is currently actively
running on the gpu.

The periodic sampling frequency which can be particularly useful for
system-wide analysis (as opposed to command stream synchronised
MI_REPORT_PERF_COUNT commands) is perhaps the most surprising state to
have become per-context save and restored (while the OABUFFER
destination is still a shared, system-wide resource).

This support for gen8+ takes care to consider a number of timing
challenges involved in synchronously updating per-context state
primarily by programming all config state from the cpu and updating all
current and saved contexts synchronously while the OA unit is still
disabled.

The driver intentionally avoids depending on command streamer
programming to update OA state considering the lack of synchronization
between the automatic loading of OACTXCONTROL state (that includes the
periodic sampling state and enable state) on context restore and the
parsing of any general purpose BB the driver can control. I.e. this
implementation is careful to avoid the possibility of a context restore
temporarily enabling any out-of-date periodic sampling state. In
addition to the risk of transiently-out-of-date state being loaded
automatically; there are also internal HW latencies involved in the
loading of MUX configurations which would be difficult to account for
from the command streamer (and we only want to enable the unit once
the MUX configuration is complete).

Since the Gen8+ OA unit design no longer supports clock gating the unit
off for a single given context (which effectively stopped any progress
of counters while any other context was running) and instead supports
tagging OA reports with a context ID for filtering on the CPU, it means
we can no longer hide the system-wide progress of counters from a
non-privileged application only interested in metrics for its own
context. Although we could theoretically try and subtract the progress
of other contexts before forwarding reports via read() we aren't in a
position to filter reports captured via MI_REPORT_PERF_COUNT commands.
As a result, for Gen8+, we always require the
dev.i915.perf_stream_paranoid to be unset for any access to OA metrics
if not root.

v5: Drain submitted requests when enabling metric set to ensure no
lite-restore erases the context image we just updated (Lionel)

v6: In addition to drain, switch to kernel context & update all
context in place (Chris)

Signed-off-by: Robert Bragg 
Signed-off-by: Lionel Landwerlin 
Reviewed-by: Matthew Auld  \o/
---
 drivers/gpu/drm/i915/i915_drv.h  |  45 +-
 drivers/gpu/drm/i915/i915_perf.c | 957 ---
 drivers/gpu/drm/i915/i915_reg.h  |  22 +
 drivers/gpu/drm/i915/intel_lrc.c |   2 +
 include/uapi/drm/i915_drm.h  |  19 +-
 5 files changed, 952 insertions(+), 93 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ffa1fc5eddfd..676b1227067c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2064,9 +2064,17 @@ struct i915_oa_ops {
void (*init_oa_buffer)(struct drm_i915_private *dev_priv);

/**
-* @enable_metric_set: Applies any MUX configuration to set up the
-* Boolean and Custom (B/C) counters that are part of the counter
-* reports being sampled. May apply system constraints such as
+* @select_metric_set: The auto generated code that checks whether a
+* requested OA config is applicable to the system and if so sets up
+* the mux, oa and flex eu register config pointers according to the
+* current dev_priv->perf.oa.metrics_set.
+*/
+   int (*select_metric_set)(struct drm_i915_private *dev_priv);
+
+   /**
+* @enable_metric_set: Selects and applies any MUX configuration to set
+* up the Boolean and Custom (B/C) counters that are part of the
+* counter reports being sampled. May apply system constraints such as
 * disabling EU clock gating as required.
 */
int (*enable_metric_set)(struct drm_i915_private *dev_priv);
@@ -2097,20 +2105,13 @@ struct i915_oa_ops {
size_t *offset);

/**
-* @oa_buffer_check: Check for OA buffer data + update tail
-*
-* This is either called via fops or the poll check hrtimer (atomic
-* ctx) without any locks taken.
+* @oa_hw_tail_read: read the OA tail pointer register
 *
-* It's safe to read OA 

Re: [Intel-gfx] [PATCH 50/67] drm/i915/gen10+: use the SKL code for reading WM latencies

2017-04-24 Thread Ville Syrjälä
On Thu, Apr 06, 2017 at 12:15:46PM -0700, Rodrigo Vivi wrote:
> From: Paulo Zanoni 
> 
> Gen 10 should use the exact same code as Gen 9, so change the check to
> take this into consideration, and also assume that future platforms
> will run this code.
> 
> Also add a MISSING_CASE(), just in case we do something wrong, instead
> of silently failing.
> 
> Signed-off-by: Paulo Zanoni 
> Signed-off-by: Rodrigo Vivi 
> ---
>  drivers/gpu/drm/i915/intel_pm.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
> index 4c07b91..a2b2509 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -2291,7 +2291,7 @@ static void ilk_compute_wm_level(const struct 
> drm_i915_private *dev_priv,
>  static void intel_read_wm_latency(struct drm_i915_private *dev_priv,
> uint16_t wm[8])
>  {
> - if (IS_GEN9(dev_priv)) {
> + if (INTEL_GEN(dev_priv) >= 9) {
>   uint32_t val;
>   int ret, i;
>   int level, max_level = ilk_wm_max_level(dev_priv);
> @@ -2351,7 +2351,7 @@ static void intel_read_wm_latency(struct 
> drm_i915_private *dev_priv,
>   }
>  
>   /*
> -  * WaWmMemoryReadLatency:skl,glk
> +  * WaWmMemoryReadLatency:skl+,glk

When did we start to use the '+' notation in the w/a notes?

What would it mean to say 'skl+,glk'? Is BXT included or not?

>*
>* punit doesn't take into account the read latency so we need
>* to add 2us to the various latency levels we retrieve from the
> @@ -2390,6 +2390,8 @@ static void intel_read_wm_latency(struct 
> drm_i915_private *dev_priv,
>   wm[0] = 7;
>   wm[1] = (mltr >> MLTR_WM1_SHIFT) & ILK_SRLT_MASK;
>   wm[2] = (mltr >> MLTR_WM2_SHIFT) & ILK_SRLT_MASK;
> + } else {
> + MISSING_CASE(INTEL_DEVID(dev_priv));
>   }
>  }
>  
> -- 
> 1.9.1
> 
> ___
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 59/67] drm/i915/cnl: Fix Cannonlake scaler mode programing.

2017-04-24 Thread Ville Syrjälä
On Thu, Apr 06, 2017 at 12:15:55PM -0700, Rodrigo Vivi wrote:
> Like Geminilake scalers, Cannonlake scalers also don't need and don't have
> the "high quality" mode programming.
> 
> Cc: Ander Conselvan de Oliveira 
> Signed-off-by: Rodrigo Vivi 
> ---
>  drivers/gpu/drm/i915/intel_atomic.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_atomic.c 
> b/drivers/gpu/drm/i915/intel_atomic.c
> index 50fb1f7..7c36b20 100644
> --- a/drivers/gpu/drm/i915/intel_atomic.c
> +++ b/drivers/gpu/drm/i915/intel_atomic.c
> @@ -248,7 +248,7 @@ int intel_atomic_setup_scalers(struct drm_i915_private 
> *dev_priv,
>   }
>  
>   /* set scaler mode */
> - if (IS_GEMINILAKE(dev_priv)) {
> + if (IS_GEMINILAKE(dev_priv) || IS_CANNONLAKE(dev_priv)) {
>   scaler_state->scalers[*scaler_id].mode = 0;

So this now controls normal vs. adaptive mode. We don't program anything
for the adaptive mode so far so we definitely shouldn't enable it.

Reviewed-by: Ville Syrjälä 

>   } else if (num_scalers_need == 1 && intel_crtc->pipe != PIPE_C) 
> {
>   /*
> -- 
> 1.9.1
> 
> ___
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 58/67] drm/i915/cnl: Cannonlake color init.

2017-04-24 Thread Ville Syrjälä
On Thu, Apr 06, 2017 at 12:15:54PM -0700, Rodrigo Vivi wrote:
> Cannonlake has same color setup as Geminilake.
> Legacy color load luts doesn't work anymore on Cannonlake+.

Not sure what that means. The legacy 8bpc LUT is no longer there?
The code still depends on that working, and we also still expose the C8
format which at least used to depend on the legacy LUT.

> 
> Signed-off-by: Rodrigo Vivi 
> ---
>  drivers/gpu/drm/i915/i915_pci.c  | 1 +
>  drivers/gpu/drm/i915/intel_color.c   | 2 +-
>  drivers/gpu/drm/i915/intel_display.c | 4 ++--
>  drivers/gpu/drm/i915/intel_sprite.c  | 2 +-
>  4 files changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> index bace848..1e8e0ac 100644
> --- a/drivers/gpu/drm/i915/i915_pci.c
> +++ b/drivers/gpu/drm/i915/i915_pci.c
> @@ -438,6 +438,7 @@
>   .gen = 10,
>   .ddb_size = 1024,
>   .has_csr = 1,
> + .color = { .degamma_lut_size = 0, .gamma_lut_size = 1024 }
>  };
>  
>  /*
> diff --git a/drivers/gpu/drm/i915/intel_color.c 
> b/drivers/gpu/drm/i915/intel_color.c
> index 306c6b0..f85d575 100644
> --- a/drivers/gpu/drm/i915/intel_color.c
> +++ b/drivers/gpu/drm/i915/intel_color.c
> @@ -615,7 +615,7 @@ void intel_color_init(struct drm_crtc *crtc)
>  IS_BROXTON(dev_priv)) {
>   dev_priv->display.load_csc_matrix = i9xx_load_csc_matrix;
>   dev_priv->display.load_luts = broadwell_load_luts;
> - } else if (IS_GEMINILAKE(dev_priv)) {
> + } else if (IS_GEMINILAKE(dev_priv) || IS_CANNONLAKE(dev_priv)) {
>   dev_priv->display.load_csc_matrix = i9xx_load_csc_matrix;
>   dev_priv->display.load_luts = glk_load_luts;
>   } else {
> diff --git a/drivers/gpu/drm/i915/intel_display.c 
> b/drivers/gpu/drm/i915/intel_display.c
> index 3adee22..697c112 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -3302,7 +3302,7 @@ u32 skl_plane_ctl(const struct intel_crtc_state 
> *crtc_state,
>  
>   plane_ctl = PLANE_CTL_ENABLE;
>  
> - if (!IS_GEMINILAKE(dev_priv)) {
> + if (!IS_GEMINILAKE(dev_priv) && !IS_CANNONLAKE(dev_priv)) {
>   plane_ctl |=
>   PLANE_CTL_PIPE_GAMMA_ENABLE |
>   PLANE_CTL_PIPE_CSC_ENABLE |
> @@ -3359,7 +3359,7 @@ static void skylake_update_primary_plane(struct 
> drm_plane *plane,
>  
>   spin_lock_irqsave(_priv->uncore.lock, irqflags);
>  
> - if (IS_GEMINILAKE(dev_priv)) {
> + if (IS_GEMINILAKE(dev_priv) || IS_CANNONLAKE(dev_priv)) {
>   I915_WRITE_FW(PLANE_COLOR_CTL(pipe, plane_id),
> PLANE_COLOR_PIPE_GAMMA_ENABLE |
> PLANE_COLOR_PIPE_CSC_ENABLE |
> diff --git a/drivers/gpu/drm/i915/intel_sprite.c 
> b/drivers/gpu/drm/i915/intel_sprite.c
> index f7d4314..a002c1a 100644
> --- a/drivers/gpu/drm/i915/intel_sprite.c
> +++ b/drivers/gpu/drm/i915/intel_sprite.c
> @@ -240,7 +240,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc, 
> struct intel_flip_work *work
>  
>   spin_lock_irqsave(_priv->uncore.lock, irqflags);
>  
> - if (IS_GEMINILAKE(dev_priv)) {
> + if (IS_GEMINILAKE(dev_priv) || IS_CANNONLAKE(dev_priv)) {
>   I915_WRITE_FW(PLANE_COLOR_CTL(pipe, plane_id),
> PLANE_COLOR_PIPE_GAMMA_ENABLE |
> PLANE_COLOR_PIPE_CSC_ENABLE |
> -- 
> 1.9.1
> 
> ___
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v7 0/4] Adding driver-private objects to atomic state

2017-04-24 Thread Harry Wentland

Patches 1-3: Reviewed-by: Harry Wentland 
Patch 4: Acked-by: Harry Wentland 

Harry

On 2017-04-21 01:51 AM, Dhinakaran Pandiyan wrote:

Changes in this version:
Used connector->atomic_check() to release vcpi slots instead of the
atomic_release() callback.

This series introduces void * type driver-private objects in core and adds
helper functions that operate on these private objects. Drivers need to
implement object-specific functions to swap and clear object states. The
advantage of having void * for these objects in the core is objects of different
types can be managed in the same atomic state array. The series implements
DP-MST link bw tracking using the driver-private object infrastructure.

Pandiyan, Dhinakaran (4):
  drm: Add driver-private objects to atomic state
  drm/dp: Introduce MST topology state to track available link bandwidth
  drm/dp: Add DP MST helpers to atomically find and release vcpi slots
  drm/dp: Track MST link bandwidth

 drivers/gpu/drm/drm_atomic.c  |  65 +++
 drivers/gpu/drm/drm_atomic_helper.c   |   5 ++
 drivers/gpu/drm/drm_dp_mst_topology.c | 150 ++
 drivers/gpu/drm/i915/intel_dp_mst.c   |  57 +++--
 include/drm/drm_atomic.h  | 101 +++
 include/drm/drm_dp_mst_helper.h   |  26 ++
 6 files changed, 398 insertions(+), 6 deletions(-)


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 4/4] drm/i915: Include interesting seqno in the missed breadcrumb debug

2017-04-24 Thread Chris Wilson
On Mon, Apr 24, 2017 at 03:52:25PM +0300, Mika Kuoppala wrote:
> Chris Wilson  writes:
> 
> > Knowing the neighbouring seqno (current on hw, last submitted to hw)
> > provide some useful breadcrumbs to the debug log.
> >
> > Signed-off-by: Chris Wilson 
> 
> Reviewed-by: Mika Kuoppala 

Pushed the debug patch. I would like to get some traction on the fix in
patch 1... :-p
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] ✓ Fi.CI.BAT: success for series starting with [v2,1/2] PCI / PM: Add needs_resume flag to avoid suspend complete optimization

2017-04-24 Thread Patchwork
== Series Details ==

Series: series starting with [v2,1/2] PCI / PM: Add needs_resume flag to avoid 
suspend complete optimization
URL   : https://patchwork.freedesktop.org/series/23454/
State : success

== Summary ==

Series 23454v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/23454/revisions/1/mbox/

Test gem_exec_flush:
Subgroup basic-batch-kernel-default-uc:
fail   -> PASS   (fi-snb-2600) fdo#17
Test gem_exec_suspend:
Subgroup basic-s4-devices:
pass   -> DMESG-WARN (fi-kbl-7560u) fdo#100125

fdo#17 https://bugs.freedesktop.org/show_bug.cgi?id=17
fdo#100125 https://bugs.freedesktop.org/show_bug.cgi?id=100125

fi-bdw-5557u total:278  pass:267  dwarn:0   dfail:0   fail:0   skip:11  
time:430s
fi-bdw-gvtdvmtotal:278  pass:256  dwarn:8   dfail:0   fail:0   skip:14  
time:432s
fi-bsw-n3050 total:278  pass:242  dwarn:0   dfail:0   fail:0   skip:36  
time:573s
fi-bxt-j4205 total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  
time:507s
fi-bxt-t5700 total:278  pass:258  dwarn:0   dfail:0   fail:0   skip:20  
time:540s
fi-byt-j1900 total:278  pass:254  dwarn:0   dfail:0   fail:0   skip:24  
time:493s
fi-byt-n2820 total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  
time:480s
fi-hsw-4770  total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  
time:409s
fi-hsw-4770r total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  
time:404s
fi-ilk-650   total:278  pass:228  dwarn:0   dfail:0   fail:0   skip:50  
time:418s
fi-ivb-3520m total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  
time:490s
fi-ivb-3770  total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  
time:467s
fi-kbl-7500u total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  
time:455s
fi-kbl-7560u total:278  pass:267  dwarn:1   dfail:0   fail:0   skip:10  
time:568s
fi-skl-6260u total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  
time:461s
fi-skl-6700hqtotal:278  pass:261  dwarn:0   dfail:0   fail:0   skip:17  
time:569s
fi-skl-6700k total:278  pass:256  dwarn:4   dfail:0   fail:0   skip:18  
time:458s
fi-skl-6770hqtotal:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  
time:490s
fi-skl-gvtdvmtotal:278  pass:265  dwarn:0   dfail:0   fail:0   skip:13  
time:432s
fi-snb-2520m total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  
time:526s
fi-snb-2600  total:278  pass:249  dwarn:0   dfail:0   fail:0   skip:29  
time:404s

9fb4b60bfa8c532ad6eda05af002c6b2f090d97f drm-tip: 2017y-04m-24d-09h-40m-53s UTC 
integration manifest
1d78856 PCI / PM: Add needs_resume flag to avoid suspend complete optimization

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_4537/
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH i-g-t v4] igt/scripts: trace.pl to parse the i915 tracepoints

2017-04-24 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Given a log file created via perf with some interesting trace
events enabled, this tool can generate the timeline graph of
requests getting queued, their dependencies resolved, sent to
the GPU for executing and finally completed.

This can be useful when analyzing certain classes of performance
issues. More help is available in the tool itself.

The tool will also calculate some overall per engine statistics,
like total time engine was idle and similar.

v2:
 * Address missing git add.
 * Make html output optional (--html switch) and by default
   just output aggregated per engine stats to stdout.

v3:
 * Added --trace option which invokes perf with the correct
   options automatically.
 * Added --avg-delay-stats which prints averages for things
   like waiting on ready, waiting on GPU and context save
   duration.
 * Fix warnings when no waits on an engine.
 * Correct help text.

v4:
 * Add --squash-ctx-id to subtract engine id from ctx id
   when parsing to make it easier to identify which context
   is which with new i915 ctx id allocation scheme.
 * Reconstruct request_out events where they are missing.

Signed-off-by: Tvrtko Ursulin 
Cc: Chris Wilson 
Cc: Harri Syrja 
Cc: Krzysztof E Olinski 
---
 scripts/Makefile.am |   2 +-
 scripts/trace.pl| 990 
 2 files changed, 991 insertions(+), 1 deletion(-)
 create mode 100755 scripts/trace.pl

diff --git a/scripts/Makefile.am b/scripts/Makefile.am
index 85d4a5cf4e5c..641715294936 100644
--- a/scripts/Makefile.am
+++ b/scripts/Makefile.am
@@ -1,2 +1,2 @@
-dist_noinst_SCRIPTS = intel-gfx-trybot who.sh run-tests.sh
+dist_noinst_SCRIPTS = intel-gfx-trybot who.sh run-tests.sh trace.pl
 noinst_PYTHON = throttle.py
diff --git a/scripts/trace.pl b/scripts/trace.pl
new file mode 100755
index ..1f524aaa0f89
--- /dev/null
+++ b/scripts/trace.pl
@@ -0,0 +1,990 @@
+#! /usr/bin/perl
+#
+# Copyright © 2017 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+#
+
+use strict;
+use warnings;
+use 5.010;
+
+my $gid = 0;
+my (%db, %queue, %submit, %notify, %rings, %ctxdb, %ringmap, %reqwait);
+my @freqs;
+
+my $max_items = 3000;
+my $width_us = 32000;
+my $correct_durations = 0;
+my %ignore_ring;
+my %skip_box;
+my $html = 0;
+my $trace = 0;
+my $avg_delay_stats = 0;
+my $squash_context_id = 0;
+
+my @args;
+
+sub arg_help
+{
+   return unless scalar(@_);
+
+   if ($_[0] eq '--help' or $_[0] eq '-h') {
+   shift @_;
+print <trace.log
+
+   This file in turn should be piped into this tool which will generate some
+   statistics out of it, or if --html was given HTML output.
+
+   HTML can be viewed from a directory containing the 'vis' JavaScript module.
+   On Ubuntu this can be installed like this:
+
+   apt-get install npm
+   npm install vis

Re: [Intel-gfx] [PATCH 1/2] drm/i915: Calculate ironlake intermediate watermarks correctly

2017-04-24 Thread Ville Syrjälä
On Mon, Apr 24, 2017 at 11:30:28AM +0200, Maarten Lankhorst wrote:
> The watermarks it should calculate against are the old optimal watermarks.
> The currently active crtc watermarks are pure fiction, and are invalid in
> case of a nonblocking modeset, page flip enabling/disabling planes or any
> other reason.
> 
> When the crtc is disabled or during a modeset the intermediate watermarks
> don't need to be programmed separately, and could be directly assigned
> to the optimal watermarks.
> 
> Signed-off-by: Maarten Lankhorst 
> Cc: Ville Syrjälä 
> ---
>  drivers/gpu/drm/i915/intel_pm.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
> index cacb65fa2dd5..7da701dc3caa 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -2639,7 +2639,9 @@ static int ilk_compute_intermediate_wm(struct 
> drm_device *dev,
>  struct intel_crtc_state *newstate)
>  {
>   struct intel_pipe_wm *a = >wm.ilk.intermediate;
> - struct intel_pipe_wm *b = _crtc->wm.active.ilk;
> + const struct drm_crtc_state *old_drm_state =
> + drm_atomic_get_old_crtc_state(newstate->base.state, 
> _crtc->base);

Can we get a intel_atomic_get_old_crtc_state() please?

> + const struct intel_pipe_wm *b = 
> _intel_crtc_state(old_drm_state)->wm.ilk.optimal;
>   int level, max_level = ilk_wm_max_level(to_i915(dev));
>  
>   /*
> @@ -2648,6 +2650,9 @@ static int ilk_compute_intermediate_wm(struct 
> drm_device *dev,
>* and after the vblank.
>*/
>   *a = newstate->wm.ilk.optimal;
> + if (!newstate->base.active || 
> drm_atomic_crtc_needs_modeset(>base))
> + return 0;
> +
>   a->pipe_enabled |= b->pipe_enabled;
>   a->sprites_enabled |= b->sprites_enabled;
>   a->sprites_scaled |= b->sprites_scaled;
> -- 
> 2.7.4

-- 
Ville Syrjälä
Intel OTC
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v2 2/2] drm/i915: Prevent the system suspend complete optimization

2017-04-24 Thread Imre Deak
Since

commit bac2a909a096c9110525c18cbb8ce73c660d5f71
Author: Rafael J. Wysocki 
Date:   Wed Jan 21 02:17:42 2015 +0100

PCI / PM: Avoid resuming PCI devices during system suspend

PCI devices will default to allowing the system suspend complete
optimization where devices are not woken up during system suspend if
they were already runtime suspended. This however breaks the i915/HDA
drivers for two reasons:

- The i915 driver has system suspend specific steps that it needs to
  run, that bring the device to a different state than its runtime
  suspended state.

- The HDA driver's suspend handler requires power that it will request
  from the i915 driver's power domain handler. This in turn requires the
  i915 driver to runtime resume itself, but this won't be possible if the
  suspend complete optimization is in effect: in this case the i915
  runtime PM is disabled and trying to get an RPM reference returns
  -EACCESS.

Solve this by requiring the PCI/PM core to resume the device during
system suspend which in effect disables the suspend complete optimization.

One possibility for this bug staying hidden for so long is that the
optimization for a device is disabled if it's disabled for any of its
children devices. i915 may have a backlight device as its child which
doesn't support runtime PM and so doesn't allow the optimization either.
So if this backlight device got registered the bug stayed hidden.

Credits to Marta, Tomi and David who enabled pstore logging,
that caught one instance of this issue across a suspend/
resume-to-ram and Ville who remembered that the optimization was enabled
for some devices at one point.

The first WARN triggered by the problem:

[ 6250.746445] WARNING: CPU: 2 PID: 17384 at 
drivers/gpu/drm/i915/intel_runtime_pm.c:2846 intel_runtime_pm_get+0x6b/0xd0 
[i915]
[ 6250.746448] pm_runtime_get_sync() failed: -13
[ 6250.746451] Modules linked in: snd_hda_intel i915 vgem snd_hda_codec_hdmi 
x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul
snd_hda_codec_realtek snd_hda_codec_generic ghash_clmulni_intel e1000e 
snd_hda_codec snd_hwdep snd_hda_core ptp mei_me pps_core snd_pcm lpc_ich mei 
prime_
numbers i2c_hid i2c_designware_platform i2c_designware_core [last unloaded: 
i915]
[ 6250.746512] CPU: 2 PID: 17384 Comm: kworker/u8:0 Tainted: G U  W   
4.11.0-rc5-CI-CI_DRM_334+ #1
[ 6250.746515] Hardware name:  /NUC5i5RYB, BIOS 
RYBDWi35.86A.0362.2017.0118.0940 01/18/2017
[ 6250.746521] Workqueue: events_unbound async_run_entry_fn
[ 6250.746525] Call Trace:
[ 6250.746530]  dump_stack+0x67/0x92
[ 6250.746536]  __warn+0xc6/0xe0
[ 6250.746542]  ? pci_restore_standard_config+0x40/0x40
[ 6250.746546]  warn_slowpath_fmt+0x46/0x50
[ 6250.746553]  ? __pm_runtime_resume+0x56/0x80
[ 6250.746584]  intel_runtime_pm_get+0x6b/0xd0 [i915]
[ 6250.746610]  intel_display_power_get+0x1b/0x40 [i915]
[ 6250.746646]  i915_audio_component_get_power+0x15/0x20 [i915]
[ 6250.746654]  snd_hdac_display_power+0xc8/0x110 [snd_hda_core]
[ 6250.746661]  azx_runtime_resume+0x218/0x280 [snd_hda_intel]
[ 6250.746667]  pci_pm_runtime_resume+0x76/0xa0
[ 6250.746672]  __rpm_callback+0xb4/0x1f0
[ 6250.746677]  ? pci_restore_standard_config+0x40/0x40
[ 6250.746682]  rpm_callback+0x1f/0x80
[ 6250.746686]  ? pci_restore_standard_config+0x40/0x40
[ 6250.746690]  rpm_resume+0x4ba/0x740
[ 6250.746698]  __pm_runtime_resume+0x49/0x80
[ 6250.746703]  pci_pm_suspend+0x57/0x140
[ 6250.746709]  dpm_run_callback+0x6f/0x330
[ 6250.746713]  ? pci_pm_freeze+0xe0/0xe0
[ 6250.746718]  __device_suspend+0xf9/0x370
[ 6250.746724]  ? dpm_watchdog_set+0x60/0x60
[ 6250.746730]  async_suspend+0x1a/0x90
[ 6250.746735]  async_run_entry_fn+0x34/0x160
[ 6250.746741]  process_one_work+0x1f2/0x6d0
[ 6250.746749]  worker_thread+0x49/0x4a0
[ 6250.746755]  kthread+0x107/0x140
[ 6250.746759]  ? process_one_work+0x6d0/0x6d0
[ 6250.746763]  ? kthread_create_on_node+0x40/0x40
[ 6250.746768]  ret_from_fork+0x2e/0x40
[ 6250.746778] ---[ end trace 102a62fd2160f5e6 ]---

v2:
- Use the new pci_dev->needs_resume flag, to avoid any overhead during
  the ->pm_prepare hook. (Rafael)

Fixes: bac2a909a096 ("PCI / PM: Avoid resuming PCI devices during system 
suspend")
References: https://bugs.freedesktop.org/show_bug.cgi?id=100378
Cc: Rafael J. Wysocki 
Cc: Marta Lofstedt 
Cc: David Weinehall 
Cc: Tomi Sarvela 
Cc: Ville Syrjälä 
Cc: Mika Kuoppala 
Cc: Chris Wilson 
Cc: Takashi Iwai 
Cc: Bjorn Helgaas 
Cc: linux-...@vger.kernel.org
Cc: sta...@vger.kernel.org
Signed-off-by: Imre Deak 
Reviewed-by: Chris Wilson  (v1)
---
 drivers/gpu/drm/i915/i915_drv.c | 9 +
 1 file changed, 9 insertions(+)

diff --git 

[Intel-gfx] [PATCH v2 1/2] PCI / PM: Add needs_resume flag to avoid suspend complete optimization

2017-04-24 Thread Imre Deak
Some drivers - like i915 - may not support the system suspend direct
complete optimization due to differences in their runtime and system
suspend sequence. Add a flag that when set resumes the device before
calling the driver's system suspend handlers which effectively disables
the optimization.

Needed by the next patch fixing suspend/resume on i915.

Suggested by Rafael.

Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: linux-...@vger.kernel.org
Cc: sta...@vger.kernel.org
Signed-off-by: Imre Deak 
---
 drivers/pci/pci.c   | 3 ++-
 include/linux/pci.h | 7 +++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7904d02..020f02d 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2141,7 +2141,8 @@ bool pci_dev_keep_suspended(struct pci_dev *pci_dev)
 
if (!pm_runtime_suspended(dev)
|| pci_target_state(pci_dev) != pci_dev->current_state
-   || platform_pci_need_resume(pci_dev))
+   || platform_pci_need_resume(pci_dev)
+   || pci_dev->needs_resume)
return false;
 
/*
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e2d1a12..3fe00a6 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -316,6 +316,9 @@ struct pci_dev {
unsigned inthotplug_user_indicators:1; /* SlotCtl indicators
  controlled exclusively by
  user sysfs */
+   unsigned intneeds_resume:1; /* Resume before calling the driver's
+  system suspend hooks, disabling the
+  direct_complete optimization. */
unsigned intd3_delay;   /* D3->D0 transition time in ms */
unsigned intd3cold_delay;   /* D3cold->D0 transition time in ms */
 
@@ -1113,6 +1116,10 @@ bool pci_check_pme_status(struct pci_dev *dev);
 void pci_pme_wakeup_bus(struct pci_bus *bus);
 void pci_d3cold_enable(struct pci_dev *dev);
 void pci_d3cold_disable(struct pci_dev *dev);
+static inline void pci_resume_before_suspend(struct pci_dev *dev, bool enable)
+{
+   dev->needs_resume = enable;
+}
 
 static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state,
  bool enable)
-- 
2.5.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 00/15] drm/i915: Two stage watermarks for g4x

2017-04-24 Thread Lofstedt, Marta
Thanks Ville,

I have verified that below patch-set fix the CI regression on the Core2 duo, 
see bug: https://bugs.freedesktop.org/show_bug.cgi?id=100548

So, if/when this patch-set lands, I assume we could revert the revert of the 
"sched/clock: Fix broken stable to unstable transfer": git@841c8c9  

/Marta 

> -Original Message-
> From: Intel-gfx [mailto:intel-gfx-boun...@lists.freedesktop.org] On Behalf
> Of ville.syrj...@linux.intel.com
> Sent: Friday, April 21, 2017 9:14 PM
> To: intel-gfx@lists.freedesktop.org
> Subject: [Intel-gfx] [PATCH 00/15] drm/i915: Two stage watermarks for g4x
> 
> From: Ville Syrjälä 
> 
> This makes g4x follow the two stage watermark programming approach as
> well, and as a bonus exposes the video sprites on g4x.
> 
> There is one slight problem with merging the wms from multiple pipes; If one
> pipe is currently enabled and we're about to enabled another one, we
> should turn off CxSR before the second pipe gets enabled as the FIFO will get
> repartitioned. This could also happen when there's a parallel watermark
> update on the first pipe, so a dumb approach of just disabling CxSR in the
> modeset path doesn't really work. I think the proper fix will involve some
> shuffling of code in the modeset path because it's currently a bit of a mess.
> So with the current code there could be an occasional underrun reported
> when enabling the second pipe.
> 
> Entire series available here:
> git://github.com/vsyrjala/linux.git g4x_atomic_wm_8
> 
> Ville Syrjälä (15):
>   drm/i915: s/vlv_plane_wm_compute/vlv_raw_plane_wm_compute/ etc.
>   drm/i915: Drop the debug message from vlv_get_fifo_size()
>   drm/i915: s/vlv_num_wm_levels/intel_wm_num_levels/
>   drm/i915: Rename bunch of vlv_ watermark structures to g4x_
>   drm/i915: Make vlv/chv watermark debug print less cryptic
>   drm/i915: Document CxSR
>   drm/i915: Fix cursor 'cpp' in watermark calculatins for old platforms
>   drm/i915: Fix the g4x watermark TLB miss workaround
>   drm/i915: Refactor the g4x TLB miss w/a to a helper
>   drm/i915: Refactor wm calculations
>   drm/i915: Apply the g4x TLB miss w/a to SR watermarks as well
>   drm/i915: Two stage watermarks for g4x
>   drm/i915: Enable HPLL watermarks on g4x
>   drm/i915: Add g4x watermark tracepoint
>   drm/i915: Add support for sprites on g4x
> 
>  drivers/gpu/drm/i915/i915_debugfs.c  |   12 +-
>  drivers/gpu/drm/i915/i915_drv.h  |   20 +-
>  drivers/gpu/drm/i915/i915_trace.h|   49 ++
>  drivers/gpu/drm/i915/intel_device_info.c |2 +-
>  drivers/gpu/drm/i915/intel_display.c |   29 +-
>  drivers/gpu/drm/i915/intel_drv.h |   34 +-
>  drivers/gpu/drm/i915/intel_pm.c  | 1254 ++---
> -
>  drivers/gpu/drm/i915/intel_sprite.c  |   18 +-
>  8 files changed, 1081 insertions(+), 337 deletions(-)
> 
> --
> 2.10.2
> 
> ___
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 07/27] drm/i915: Squash repeated awaits on the same fence

2017-04-24 Thread Chris Wilson
On Mon, Apr 24, 2017 at 02:19:54PM +0100, Chris Wilson wrote:
> On Mon, Apr 24, 2017 at 02:03:25PM +0100, Tvrtko Ursulin wrote:
> > 
> > On 19/04/2017 10:41, Chris Wilson wrote:
> > >Track the latest fence waited upon on each context, and only add a new
> > >asynchronous wait if the new fence is more recent than the recorded
> > >fence for that context. This requires us to filter out unordered
> > >timelines, which are noted by DMA_FENCE_NO_CONTEXT. However, in the
> > >absence of a universal identifier, we have to use our own
> > >i915->mm.unordered_timeline token.
> > 
> > (._.), a bit later... @_@!
> > 
> > What does this fix and is the complexity worth it?
> 
> It's a recovery of the optimisation that we used to have from the
> initial multiple engine semaphore synchronisation - that of avoiding
> repeating the same synchronisation barriers.
> 
> In the current setup, the cost of repeat fence synchronisation is
> obfuscated, it just causes a tight loop between
> 
>  /<-\
>  |   ^
> i915_sw_fence_complete -> i915_sw_fence_commit ->|
> 
> and extra depth in the dependency trees, which is generally not
> observed in normal usage.
> 
> When you know what you are looking for, the reduction of all those
> atomic ops from underneath hardirq is definitely worth it, even for
> fairly simple operations, and there tends to be repetition from all the
> buffers being tracked between requests (and clients).

And it also says, to me at least, that the cost of the lookup must be
less than the cost of a couple of atomics.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 07/27] drm/i915: Squash repeated awaits on the same fence

2017-04-24 Thread Chris Wilson
On Mon, Apr 24, 2017 at 02:03:25PM +0100, Tvrtko Ursulin wrote:
> 
> On 19/04/2017 10:41, Chris Wilson wrote:
> >Track the latest fence waited upon on each context, and only add a new
> >asynchronous wait if the new fence is more recent than the recorded
> >fence for that context. This requires us to filter out unordered
> >timelines, which are noted by DMA_FENCE_NO_CONTEXT. However, in the
> >absence of a universal identifier, we have to use our own
> >i915->mm.unordered_timeline token.
> 
> (._.), a bit later... @_@!
> 
> What does this fix and is the complexity worth it?

It's a recovery of the optimisation that we used to have from the
initial multiple engine semaphore synchronisation - that of avoiding
repeating the same synchronisation barriers.

In the current setup, the cost of repeat fence synchronisation is
obfuscated, it just causes a tight loop between

 /<-\
 |   ^
i915_sw_fence_complete -> i915_sw_fence_commit ->|

and extra depth in the dependency trees, which is generally not
observed in normal usage.

When you know what you are looking for, the reduction of all those
atomic ops from underneath hardirq is definitely worth it, even for
fairly simple operations, and there tends to be repetition from all the
buffers being tracked between requests (and clients).

Using a seqno map avoids the cost of tracking fences (i.e. keeping old
fences forever) and allows it to be kept on the timeline, rather than
the request itself (a ht under the request can squash simple repeats,
but using the timeline is more complete).

2 small routines to implement a compressed radixtree -- it's
comparatively simple compared to having to accommodate RCU walkers!
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 12/15] drm/i915: Two stage watermarks for g4x

2017-04-24 Thread Ville Syrjälä
On Mon, Apr 24, 2017 at 09:34:42AM +0200, Maarten Lankhorst wrote:
> On 21-04-17 20:14, ville.syrj...@linux.intel.com wrote:
> > From: Ville Syrjälä 
> >
> > Implement proper two stage watermark programming for g4x. As with
> > other pre-SKL platforms, the watermark registers aren't double
> > buffered on g4x. Hence we must sequence the watermark update
> > carefully around plane updates.
> >
> > The code is quite heavily modelled on the VLV/CHV code, with some
> > fairly significant differences due to the different hardware
> > architecture:
> > * g4x doesn't use inverted watermark values
> > * CxSR actually affects the watermarks since it controls memory self
> >   refresh in addition to the max FIFO mode
> > * A further HPLL SR mode is possible with higher memory wakeup
> >   latency
> > * g4x has FBC2 and so it also has FBC watermarks
> > * max FIFO mode for primary plane only (cursor is allowed, sprite is not)
> > * g4x has no manual FIFO repartitioning
> > * some TLB miss related workarounds are needed for the watermarks
> >
> > Actually the hardware is quite similar to ILK+ in many ways. The
> > > most visible differences are in the actual watermark register
> > layout. ILK revamped that part quite heavily whereas g4x is still
> > using the layout inherited from earlier platforms.
> >
> > Note that we didn't previously enable the HPLL SR on g4x. So in order
> > to not introduce too many functional changes in this patch I've not
> > actually enabled it here either, even though the code is now fully
> > ready for it. We'll enable it separately later on.
> >
> > Signed-off-by: Ville Syrjälä 
> > ---
> >  drivers/gpu/drm/i915/i915_debugfs.c  |  12 +-
> >  drivers/gpu/drm/i915/i915_drv.h  |  12 +
> >  drivers/gpu/drm/i915/intel_display.c |  25 +-
> >  drivers/gpu/drm/i915/intel_drv.h |  28 ++
> >  drivers/gpu/drm/i915/intel_pm.c  | 942 
> > +++
> >  5 files changed, 792 insertions(+), 227 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
> > b/drivers/gpu/drm/i915/i915_debugfs.c
> > index 870c470177b5..69550d31099e 100644
> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> > @@ -3898,6 +3898,8 @@ static void wm_latency_show(struct seq_file *m, const 
> > uint16_t wm[8])
> > num_levels = 3;
> > else if (IS_VALLEYVIEW(dev_priv))
> > num_levels = 1;
> > +   else if (IS_G4X(dev_priv))
> > +   num_levels = 3;
> > else
> > num_levels = ilk_wm_max_level(dev_priv) + 1;
> >  
> > @@ -3910,8 +3912,10 @@ static void wm_latency_show(struct seq_file *m, 
> > const uint16_t wm[8])
> >  * - WM1+ latency values in 0.5us units
> >  * - latencies are in us on gen9/vlv/chv
> >  */
> > -   if (INTEL_GEN(dev_priv) >= 9 || IS_VALLEYVIEW(dev_priv) ||
> > -   IS_CHERRYVIEW(dev_priv))
> > +   if (INTEL_GEN(dev_priv) >= 9 ||
> > +   IS_VALLEYVIEW(dev_priv) ||
> > +   IS_CHERRYVIEW(dev_priv) ||
> > +   IS_G4X(dev_priv))
> > latency *= 10;
> > else if (level > 0)
> > latency *= 5;
> > @@ -3972,7 +3976,7 @@ static int pri_wm_latency_open(struct inode *inode, 
> > struct file *file)
> >  {
> > struct drm_i915_private *dev_priv = inode->i_private;
> >  
> > -   if (INTEL_GEN(dev_priv) < 5)
> > +   if (INTEL_GEN(dev_priv) < 5 && !IS_G4X(dev_priv))
> > return -ENODEV;
> >  
> > return single_open(file, pri_wm_latency_show, dev_priv);
> > @@ -4014,6 +4018,8 @@ static ssize_t wm_latency_write(struct file *file, 
> > const char __user *ubuf,
> > num_levels = 3;
> > else if (IS_VALLEYVIEW(dev_priv))
> > num_levels = 1;
> > +   else if (IS_G4X(dev_priv))
> > +   num_levels = 3;
> > else
> > num_levels = ilk_wm_max_level(dev_priv) + 1;
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h 
> > b/drivers/gpu/drm/i915/i915_drv.h
> > index 0a393fdc53d1..6df8bff7f5a7 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1762,11 +1762,13 @@ struct ilk_wm_values {
> >  
> >  struct g4x_pipe_wm {
> > uint16_t plane[I915_MAX_PLANES];
> > +   uint16_t fbc;
> >  };
> >  
> >  struct g4x_sr_wm {
> > uint16_t plane;
> > uint16_t cursor;
> > +   uint16_t fbc;
> >  };
> >  
> >  struct vlv_wm_ddl_values {
> > @@ -1781,6 +1783,15 @@ struct vlv_wm_values {
> > bool cxsr;
> >  };
> >  
> > +struct g4x_wm_values {
> > +   struct g4x_pipe_wm pipe[2];
> > +   struct g4x_sr_wm sr;
> > +   struct g4x_sr_wm hpll;
> > +   bool cxsr;
> > +   bool hpll_en;
> > +   bool fbc_en;
> > +};
> > +
> >  struct skl_ddb_entry {
> > uint16_t start, end;/* in number of blocks, 'end' is exclusive */
> >  };
> > @@ -2410,6 +2421,7 @@ struct drm_i915_private {
> >

Re: [Intel-gfx] [PATCH 15/27] drm/i915: Split execlist priority queue into rbtree + linked list

2017-04-24 Thread Chris Wilson
On Mon, Apr 24, 2017 at 01:44:53PM +0100, Tvrtko Ursulin wrote:
> 
> On 24/04/2017 12:07, Chris Wilson wrote:
> >On Mon, Apr 24, 2017 at 11:28:32AM +0100, Tvrtko Ursulin wrote:
> >>
> >>On 19/04/2017 10:41, Chris Wilson wrote:
> >>>All the requests at the same priority are executed in FIFO order. They
> >>>do not need to be stored in the rbtree themselves, as they are a simple
> >>>list within a level. If we move the requests at one priority into a list,
> >>>we can then reduce the rbtree to the set of priorities. This should keep
> >>>the height of the rbtree small, as the number of active priorities can not
> >>>exceed the number of active requests and should be typically only a few.
> >>>
> >>>Currently, we have ~2k possible different priority levels, that may
> >>>increase to allow even more fine grained selection. Allocating those in
> >>>advance seems a waste (and may be impossible), so we opt for allocating
> >>>upon first use, and freeing after its requests are depleted. To avoid
> >>>the possibility of an allocation failure causing us to lose a request,
> >>>we preallocate the default priority (0) and bump any request to that
> >>>priority if we fail to allocate it the appropriate plist. Having a
> >>>request (that is ready to run, so not leading to corruption) execute
> >>>out-of-order is better than leaking the request (and its dependency
> >>>tree) entirely.
> >>>
> >>>There should be a benefit to reducing execlists_dequeue() to principally
> >>>using a simple list (and reducing the frequency of both rbtree iteration
> >>>and balancing on erase) but for typical workloads, request coalescing
> >>>should be small enough that we don't notice any change. The main gain is
> >>>from improving PI calls to schedule, and the explicit list within a
> >>>level should make request unwinding simpler (we just need to insert at
> >>>the head of the list rather than the tail and not have to make the
> >>>rbtree search more complicated).
> >>
> >>Sounds attractive! What workloads show the benefit and how much?
> >
> >The default will show the best, since everything is priority 0 more or
> >less and so we reduce the rbtree search to a single lookup and list_add.
> >It's hard to measure the impact of the rbtree though. On the dequeue
> >side, the mmio access dominates. On the schedule side, if we have lots
> >of requests, the dfs dominates.
> >
> >I have an idea on how we might stress the rbtree in submit_request - but
> >still it requires long queues untypical of most workloads. Still tbd.
> >
> >>>-static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
> >>>+static bool
> >>>+insert_request(struct intel_engine_cs *engine,
> >>>+ struct i915_priotree *pt,
> >>>+ int prio)
> >>>{
> >>>+  struct execlist_priolist *plist;
> >>>   struct rb_node **p, *rb;
> >>>   bool first = true;
> >>>
> >>>+find_plist:
> >>>   /* most positive priority is scheduled first, equal priorities fifo */
> >>>   rb = NULL;
> >>>-  p = >rb_node;
> >>>+  p = >execlist_queue.rb_node;
> >>>   while (*p) {
> >>>-  struct i915_priotree *pos;
> >>>-
> >>>   rb = *p;
> >>>-  pos = rb_entry(rb, typeof(*pos), node);
> >>>-  if (pt->priority > pos->priority) {
> >>>+  plist = rb_entry(rb, typeof(*plist), node);
> >>>+  if (prio > plist->priority) {
> >>>   p = >rb_left;
> >>>-  } else {
> >>>+  } else if (prio < plist->priority) {
> >>>   p = >rb_right;
> >>>   first = false;
> >>>+  } else {
> >>>+  list_add_tail(>link, >requests);
> >>>+  return false;
> >>>   }
> >>>   }
> >>>-  rb_link_node(>node, rb, p);
> >>>-  rb_insert_color(>node, root);
> >>>+
> >>>+  if (!prio) {
> >>>+  plist = >default_priolist;
> >>
> >>Should be "prio == I915_PRIO_DEFAULT" (give or take).
> >>
> >>But I am not completely happy with special casing the default
> >>priority for two reasons.
> >>
> >>Firstly, userspace can opt to lower its priority and completely
> >>defeat this path.
> >>
> >>Secondly, we already have flip priority which perhaps should have
> >>it's own fast path / avoid allocation as well.
> >>
> >>Those two combined make me unsure whether the optimisation is worth
> >>it. What would be the pros and cons of three steps:
> >>
> >>1. No optimisation.
> >>2. prio == default optimisation like above.
> >>3. Better system with caching of frequently used levels.
> >>
> >>Last is definitely complicated, second is not, but is the second
> >>much better than the first?
> >
> >It was not intended as an optimisation. It is for handling the
> >ENOMEM here. We cannot abort the request at such a late stage, so we
> >need somewhere to hold it. That dictated having a preallocted slot. I
> >also didn't like having to preallocate all possible levels as that seems
> >a waste, especially as I like to invent new levels and suspect that we
> >may end up using a full u32 

Re: [Intel-gfx] [PATCH 07/27] drm/i915: Squash repeated awaits on the same fence

2017-04-24 Thread Tvrtko Ursulin


On 19/04/2017 10:41, Chris Wilson wrote:

Track the latest fence waited upon on each context, and only add a new
asynchronous wait if the new fence is more recent than the recorded
fence for that context. This requires us to filter out unordered
timelines, which are noted by DMA_FENCE_NO_CONTEXT. However, in the
absence of a universal identifier, we have to use our own
i915->mm.unordered_timeline token.


(._.), a bit later... @_@!

What does this fix and is the complexity worth it?

Regards,

Tvrtko




v2: Throw around the debug crutches
v3: Inline the likely case of the pre-allocation cache being full.
v4: Drop the pre-allocation support, we can lose the most recent fence
in case of allocation failure -- it just means we may emit more awaits
than strictly necessary but will not break.
v5: Trim allocation size for leaf nodes, they only need an array of u32
not pointers.

Signed-off-by: Chris Wilson 
Cc: Tvrtko Ursulin 
Cc: Joonas Lahtinen 
---
 drivers/gpu/drm/i915/i915_gem_request.c|  67 +++---
 drivers/gpu/drm/i915/i915_gem_timeline.c   | 260 +
 drivers/gpu/drm/i915/i915_gem_timeline.h   |  14 ++
 drivers/gpu/drm/i915/selftests/i915_gem_timeline.c | 123 ++
 .../gpu/drm/i915/selftests/i915_mock_selftests.h   |   1 +
 5 files changed, 438 insertions(+), 27 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/selftests/i915_gem_timeline.c

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c 
b/drivers/gpu/drm/i915/i915_gem_request.c
index 97c07986b7c1..fb6c31ba3ef9 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -730,9 +730,7 @@ int
 i915_gem_request_await_dma_fence(struct drm_i915_gem_request *req,
 struct dma_fence *fence)
 {
-   struct dma_fence_array *array;
int ret;
-   int i;

if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, >flags))
return 0;
@@ -744,39 +742,54 @@ i915_gem_request_await_dma_fence(struct 
drm_i915_gem_request *req,
if (fence->context == req->fence.context)
return 0;

-   if (dma_fence_is_i915(fence))
-   return i915_gem_request_await_request(req, to_request(fence));
+   /* Squash repeated waits to the same timelines, picking the latest */
+   if (fence->context != req->i915->mm.unordered_timeline &&
+   intel_timeline_sync_get(req->timeline,
+   fence->context, fence->seqno))
+   return 0;

-   if (!dma_fence_is_array(fence)) {
+   if (dma_fence_is_i915(fence)) {
+   ret = i915_gem_request_await_request(req, to_request(fence));
+   if (ret < 0)
+   return ret;
+   } else if (!dma_fence_is_array(fence)) {
ret = i915_sw_fence_await_dma_fence(>submit,
fence, I915_FENCE_TIMEOUT,
GFP_KERNEL);
-   return ret < 0 ? ret : 0;
-   }
-
-   /* Note that if the fence-array was created in signal-on-any mode,
-* we should *not* decompose it into its individual fences. However,
-* we don't currently store which mode the fence-array is operating
-* in. Fortunately, the only user of signal-on-any is private to
-* amdgpu and we should not see any incoming fence-array from
-* sync-file being in signal-on-any mode.
-*/
-
-   array = to_dma_fence_array(fence);
-   for (i = 0; i < array->num_fences; i++) {
-   struct dma_fence *child = array->fences[i];
-
-   if (dma_fence_is_i915(child))
-   ret = i915_gem_request_await_request(req,
-to_request(child));
-   else
-   ret = i915_sw_fence_await_dma_fence(>submit,
-   child, 
I915_FENCE_TIMEOUT,
-   GFP_KERNEL);
if (ret < 0)
return ret;
+   } else {
+   struct dma_fence_array *array = to_dma_fence_array(fence);
+   int i;
+
+   /* Note that if the fence-array was created in signal-on-any
+* mode, we should *not* decompose it into its individual
+* fences. However, we don't currently store which mode the
+* fence-array is operating in. Fortunately, the only user of
+* signal-on-any is private to amdgpu and we should not see any
+* incoming fence-array from sync-file being in signal-on-any
+* mode.
+*/
+
+   for (i = 0; i < array->num_fences; i++) {
+   struct dma_fence *child = 

[Intel-gfx] [PATCH i-g-t] tests/pm_rc6_residency: Add subtest to check RC6 suspend handling

2017-04-24 Thread Ewelina Musial
In some cases we observed that forcewake isn't kept after
resume and then RC6 residency is not constant.

References: HSD#1804921797
Cc: Arkadiusz Hiler 
Cc: Michal Winiarski 
Cc: Lukasz Fiedorowicz 
Signed-off-by: Ewelina Musial 
---
 tests/pm_rc6_residency.c | 53 
 1 file changed, 53 insertions(+)

diff --git a/tests/pm_rc6_residency.c b/tests/pm_rc6_residency.c
index bdb9747..4e61326 100644
--- a/tests/pm_rc6_residency.c
+++ b/tests/pm_rc6_residency.c
@@ -165,6 +165,44 @@ static void measure_residencies(int devid, unsigned int 
rc6_mask,
res->rc6 += res->rc6p;
 }
 
+enum sleep_state { NOSLEEP, SUSPEND, HIBERNATE };
+static void test_rc6_forcewake(enum sleep_state sleep_state)
+{
+   int fd, fw_fd;
+   unsigned long residency_pre, residency_post;
+
+   fd = drm_open_driver(DRIVER_INTEL);
+   igt_assert_lte(0, fd);
+
+   fw_fd = igt_open_forcewake_handle(fd);
+   igt_assert_lte(0, fw_fd);
+
+   switch (sleep_state) {
+   case NOSLEEP:
+   break;
+   case SUSPEND:
+   igt_system_suspend_autoresume(SUSPEND_STATE_MEM,
+ SUSPEND_TEST_NONE);
+   break;
+   case HIBERNATE:
+   igt_system_suspend_autoresume(SUSPEND_STATE_DISK,
+ SUSPEND_TEST_NONE);
+   break;
+   }
+
+   sleep(1); // time to fully resume
+
+   // forcewake should keep residency constant after resume
+   residency_pre = read_rc6_residency("rc6");
+   sleep(SLEEP_DURATION);
+   residency_post = read_rc6_residency("rc6");
+
+   igt_assert_eq(residency_pre, residency_post);
+
+   close(fw_fd);
+   close(fd);
+}
+
 igt_main
 {
unsigned int rc6_mask;
@@ -209,4 +247,19 @@ igt_main
 
residency_accuracy(res.rc6pp, res.duration, "rc6pp");
}
+   igt_subtest("rc6-forcewake") {
+   igt_skip_on(!(rc6_mask & RC6_ENABLED));
+
+   test_rc6_forcewake(NOSLEEP);
+   }
+   igt_subtest("rc6-forcewake-suspend") {
+   igt_skip_on(!(rc6_mask & RC6_ENABLED));
+
+   test_rc6_forcewake(SUSPEND);
+   }
+   igt_subtest("rc6-forcewake-hibernate") {
+   igt_skip_on(!(rc6_mask & RC6_ENABLED));
+
+   test_rc6_forcewake(HIBERNATE);
+   }
 }
-- 
2.9.3

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 3/4] drm/i915: Compute the ring->space slightly less pessimistically

2017-04-24 Thread Chris Wilson
On Mon, Apr 24, 2017 at 03:37:44PM +0300, Mika Kuoppala wrote:
> Chris Wilson  writes:
> 
> > We only have to prevent the RING_TAIL from catching the RING_HEAD
> > cacheline and do not need to enforce a whole cacheline separation, and in
> > the process we can remove one branch from the computation.
> >
> > Signed-off-by: Chris Wilson 
> > ---
> >  drivers/gpu/drm/i915/intel_ringbuffer.c | 20 ++--
> >  drivers/gpu/drm/i915/intel_ringbuffer.h | 24 +---
> >  2 files changed, 23 insertions(+), 21 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> > b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index 1ec98851a621..48d4b5b24b21 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -39,12 +39,15 @@
> >   */
> >  #define LEGACY_REQUEST_SIZE 200
> >  
> > -static int __intel_ring_space(int head, int tail, int size)
> > +static unsigned int __intel_ring_space(unsigned int head,
> > +  unsigned int tail,
> > +  unsigned int size)
> >  {
> > -   int space = head - tail;
> > -   if (space <= 0)
> > -   space += size;
> > -   return space - I915_RING_FREE_SPACE;
> > +   /* "If the Ring Buffer Head Pointer and the Tail Pointer are on the
> > +* same cacheline, the Head Pointer must not be greater than the Tail
> > +* Pointer."
> > +*/
> > +   return (round_down(head, CACHELINE_BYTES) - tail - 8) & (size - 1);
> >  }
> >  
> >  void intel_ring_update_space(struct intel_ring *ring)
> > @@ -1618,12 +1621,9 @@ static int wait_for_space(struct 
> > drm_i915_gem_request *req, int bytes)
> > GEM_BUG_ON(!req->reserved_space);
> >  
> > list_for_each_entry(target, >request_list, ring_link) {
> > -   unsigned space;
> > -
> > /* Would completion of this request free enough space? */
> > -   space = __intel_ring_space(target->postfix, ring->emit,
> > -  ring->size);
> > -   if (space >= bytes)
> > +   if (bytes <= __intel_ring_space(target->postfix,
> > +   ring->emit, ring->size))
> > break;
> > }
> >  
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
> > b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > index 31998ea46fb8..e28ffc98c520 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > @@ -17,17 +17,6 @@
> >  #define CACHELINE_BYTES 64
> >  #define CACHELINE_DWORDS (CACHELINE_BYTES / sizeof(uint32_t))
> >  
> > -/*
> > - * Gen2 BSpec "1. Programming Environment" / 1.4.4.6 "Ring Buffer Use"
> > - * Gen3 BSpec "vol1c Memory Interface Functions" / 2.3.4.5 "Ring Buffer 
> > Use"
> > - * Gen4+ BSpec "vol1c Memory Interface and Command Stream" / 5.3.4.5 "Ring 
> > Buffer Use"
> > - *
> > - * "If the Ring Buffer Head Pointer and the Tail Pointer are on the same
> > - * cacheline, the Head Pointer must not be greater than the Tail
> > - * Pointer."
> > - */
> > -#define I915_RING_FREE_SPACE 64
> > -
> >  struct intel_hw_status_page {
> > struct i915_vma *vma;
> > u32 *page_addr;
> > @@ -545,6 +534,19 @@ assert_ring_tail_valid(const struct intel_ring *ring, 
> > unsigned int tail)
> >  */
> > GEM_BUG_ON(!IS_ALIGNED(tail, 8));
> > GEM_BUG_ON(tail >= ring->size);
> > +
> > +   /* "Ring Buffer Use"
> > +*  Gen2 BSpec "1. Programming Environment" / 1.4.4.6
> > +*  Gen3 BSpec "1c Memory Interface Functions" / 2.3.4.5
> > +*  Gen4+ BSpec "1c Memory Interface and Command Stream" / 5.3.4.5
> > +* "If the Ring Buffer Head Pointer and the Tail Pointer are on the
> > +* same cacheline, the Head Pointer must not be greater than the Tail
> > +* Pointer."
> > +*/
> > +#define cacheline(a) round_down(a, CACHELINE_BYTES)
> > +   GEM_BUG_ON(cacheline(tail) == cacheline(ring->head) &&
> 
> We are about to set the hw tail. But we are using our
> outdated bookkeeping value of ring head to do the check.
> 
> I am confused. Is this something that can happen only when
> it wraps. And why we dont need to check the HW head?

We know that the real HEAD is at least ring->head. We know that when we
computed the space for the request, we used at most ring->head.
Therefore from our logic, we have treated ring->head == RING_HEAD and
should never violate the cacheline for our own bookkeeping. That it may
not result in the HW tripping up is not as important as confirming our
own logic to prevent the trip.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 4/4] drm/i915: Include interesting seqno in the missed breadcrumb debug

2017-04-24 Thread Mika Kuoppala
Chris Wilson  writes:

> Knowing the neighbouring seqno (current on hw, last submitted to hw)
> provide some useful breadcrumbs to the debug log.
>
> Signed-off-by: Chris Wilson 

Reviewed-by: Mika Kuoppala 

> ---
>  drivers/gpu/drm/i915/intel_breadcrumbs.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c 
> b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 9ccbf26124c6..35da5928bd8a 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -64,10 +64,12 @@ static unsigned long wait_timeout(void)
>  
>  static noinline void missed_breadcrumb(struct intel_engine_cs *engine)
>  {
> - DRM_DEBUG_DRIVER("%s missed breadcrumb at %pF, irq posted? %s\n",
> + DRM_DEBUG_DRIVER("%s missed breadcrumb at %pF, irq posted? %s, current 
> seqno=%x, last=%x\n",
>engine->name, __builtin_return_address(0),
>yesno(test_bit(ENGINE_IRQ_BREADCRUMB,
> - >irq_posted)));
> + >irq_posted)),
> +  intel_engine_get_seqno(engine),
> +  intel_engine_last_submit(engine));
>  
>   set_bit(engine->id, >i915->gpu_error.missed_irq_rings);
>  }
> -- 
> 2.11.0
>
> ___
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/4] drm/i915: Poison the request before emitting commands

2017-04-24 Thread Chris Wilson
On Mon, Apr 24, 2017 at 03:29:48PM +0300, Mika Kuoppala wrote:
> Chris Wilson  writes:
> 
> > If we poison the request before we emit commands, it should be easier to
> > spot when we execute an uninitialised request.
> >
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=100144
> > Signed-off-by: Chris Wilson 
> > ---
> >  drivers/gpu/drm/i915/intel_ringbuffer.c | 1 +
> >  1 file changed, 1 insertion(+)
> >
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> > b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index 8c874996c617..1ec98851a621 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -1693,6 +1693,7 @@ u32 *intel_ring_begin(struct drm_i915_gem_request 
> > *req, int num_dwords)
> >  
> > GEM_BUG_ON(ring->emit > ring->size - bytes);
> > cs = ring->vaddr + ring->emit;
> > +   GEM_DEBUG_EXEC(memset(cs, POISON_INUSE, bytes));
> 
> Will more modern hardware just skipover these? If so
> then perhaps more advanced solution would be to fill with bb
> start into the previous address to get a guaranteed
> hang on the spot.

It's not so much the expectation that the value will cause a hang, but
that the value is much more recognisable when diagnosing after something
goes wrong. Such as we do write a complete LRI but forget the value, the
command is correct and so no harm caused, but we may spot the poison
later on.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/2] drm: Make fbdev inherit the crtc's initial rotation

2017-04-24 Thread Ville Syrjälä
On Sun, Apr 23, 2017 at 06:11:04PM +0200, Hans de Goede wrote:
> Hi All,
> 
> So I recently bought a (second-hand) Bay Trail tablet which has its LCD
> mounted upside-down. As such I've ported Ville Syrjala's patches to deal
> with this to current mainline and I'm hereby posting them upstream
> for merging.
> 
> These patches fix the kernel-console as well as the boot-splash
> (at leats plymouth does not reset the rotation) being upside down as
> soon as a native kms driver such as the i915 driver is loaded.
> 
> This fixes the orientation of the displayed image from boot till the
> Xserver or a Wayland compositor takes over.

It doesn't work for X? Using -modesetting perhaps? IIRC it worked just
fine with -intel way back when.

> 
> I've a patch for iio-sensor-proxy which fixes the rotation under Xorg /
> Wayland when using a desktop environment which honors iio-sensor-proxy's
> rotation detection:
> https://github.com/hadess/iio-sensor-proxy/pull/162

Or is it just this thing that clobbers what the DDX inherited from the
kernel as the initial rotation?

-- 
Ville Syrjälä
Intel OTC
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 15/27] drm/i915: Split execlist priority queue into rbtree + linked list

2017-04-24 Thread Tvrtko Ursulin


On 24/04/2017 12:07, Chris Wilson wrote:

On Mon, Apr 24, 2017 at 11:28:32AM +0100, Tvrtko Ursulin wrote:


On 19/04/2017 10:41, Chris Wilson wrote:

All the requests at the same priority are executed in FIFO order. They
do not need to be stored in the rbtree themselves, as they are a simple
list within a level. If we move the requests at one priority into a list,
we can then reduce the rbtree to the set of priorities. This should keep
the height of the rbtree small, as the number of active priorities can not
exceed the number of active requests and should be typically only a few.

Currently, we have ~2k possible different priority levels, that may
increase to allow even more fine grained selection. Allocating those in
advance seems a waste (and may be impossible), so we opt for allocating
upon first use, and freeing after its requests are depleted. To avoid
the possibility of an allocation failure causing us to lose a request,
we preallocate the default priority (0) and bump any request to that
priority if we fail to allocate it the appropriate plist. Having a
request (that is ready to run, so not leading to corruption) execute
out-of-order is better than leaking the request (and its dependency
tree) entirely.

There should be a benefit to reducing execlists_dequeue() to principally
using a simple list (and reducing the frequency of both rbtree iteration
and balancing on erase) but for typical workloads, request coalescing
should be small enough that we don't notice any change. The main gain is

from improving PI calls to schedule, and the explicit list within a

level should make request unwinding simpler (we just need to insert at
the head of the list rather than the tail and not have to make the
rbtree search more complicated).


Sounds attractive! What workloads show the benefit and how much?


The default will show the best, since everything is priority 0 more or
less and so we reduce the rbtree search to a single lookup and list_add.
It's hard to measure the impact of the rbtree though. On the dequeue
side, the mmio access dominates. On the schedule side, if we have lots
of requests, the dfs dominates.

I have an idea on how we might stress the rbtree in submit_request - but
still it requires long queues untypical of most workloads. Still tbd.


-static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
+static bool
+insert_request(struct intel_engine_cs *engine,
+  struct i915_priotree *pt,
+  int prio)
{
+   struct execlist_priolist *plist;
struct rb_node **p, *rb;
bool first = true;

+find_plist:
/* most positive priority is scheduled first, equal priorities fifo */
rb = NULL;
-   p = &root->rb_node;
+   p = &engine->execlist_queue.rb_node;
while (*p) {
-   struct i915_priotree *pos;
-
rb = *p;
-   pos = rb_entry(rb, typeof(*pos), node);
-   if (pt->priority > pos->priority) {
+   plist = rb_entry(rb, typeof(*plist), node);
+   if (prio > plist->priority) {
p = >rb_left;
-   } else {
+   } else if (prio < plist->priority) {
p = >rb_right;
first = false;
+   } else {
+   list_add_tail(&pt->link, &plist->requests);
+   return false;
}
}
-   rb_link_node(&pt->node, rb, p);
-   rb_insert_color(&pt->node, root);
+
+   if (!prio) {
+   plist = &engine->default_priolist;


Should be "prio == I915_PRIO_DEFAULT" (give or take).

But I am not completely happy with special casing the default
priority for two reasons.

Firstly, userspace can opt to lower its priority and completely
defeat this path.

Secondly, we already have flip priority which perhaps should have
it's own fast path / avoid allocation as well.

Those two combined make me unsure whether the optimisation is worth
it. What would be the pros and cons of three steps:

1. No optimisation.
2. prio == default optimisation like above.
3. Better system with caching of frequently used levels.

Last is definitely complicated, second is not, but is the second
much better than the first?


It was not intended as an optimisation. It is for handling the
ENOMEM here. We cannot abort the request at such a late stage, so we
need somewhere to hold it. That dictated having a preallocted slot. I
also didn't like having to preallocate all possible levels as that seems
a waste, especially as I like to invent new levels and suspect that we
may end up using a full u32 range.

Using it for the default priority was then to take advantage of the
preallocation.


Perhaps a simplification of 3) where we would defer the freeing of
unused priority levels until the busy to idle transition? That would
also drop the existence and need for special handling of
engine->default_prio.


+   } else {
+   plist = kmalloc(sizeof(*plist), 

Re: [Intel-gfx] [PATCH 1/4] drm/i915: Differentiate between sw write location into ring and last hw read

2017-04-24 Thread Chris Wilson
On Mon, Apr 24, 2017 at 01:32:03PM +0100, Chris Wilson wrote:
> On Mon, Apr 24, 2017 at 03:21:56PM +0300, Mika Kuoppala wrote:
> > Chris Wilson  writes:
> > 
> > > We need to keep track of the last location we ask the hw to read up to
> > > (RING_TAIL) separately from our last write location into the ring, so
> > > that in the event of a GPU reset we do not tell the HW to proceed into
> > > a partially written request (which can happen if that request is waiting
> > > for an external signal before being executed).
> > 
> > But is it so that this can happen also without external signal?
> > Our submit is already async and waiting for the prev. And we have
> > already pushed the tail into the partial 'current'.
> 
> You need something to delay the request submission. In gem_exec_fence,
> it is waiting on a request from another engine (that happens to be a
> deliberate hang). Similarly, any third party signal would leave the
> request incomplete whilst we continue to write new requests after it.

To remind myself, the culprit was the I915_WRITE_TAIL(ring->tail) in
engine->reset_hw() that didn't just restore the previous I915_WRITE_TAIL
from the last submit_request, but would advance into the partials.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 3/4] drm/i915: Compute the ring->space slightly less pessimistically

2017-04-24 Thread Mika Kuoppala
Chris Wilson  writes:

> We only have to prevent the RING_TAIL from catching the RING_HEAD
> cacheline and do not need to enforce a whole cacheline separation, and in
> the process we can remove one branch from the computation.
>
> Signed-off-by: Chris Wilson 
> ---
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 20 ++--
>  drivers/gpu/drm/i915/intel_ringbuffer.h | 24 +---
>  2 files changed, 23 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 1ec98851a621..48d4b5b24b21 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -39,12 +39,15 @@
>   */
>  #define LEGACY_REQUEST_SIZE 200
>  
> -static int __intel_ring_space(int head, int tail, int size)
> +static unsigned int __intel_ring_space(unsigned int head,
> +unsigned int tail,
> +unsigned int size)
>  {
> - int space = head - tail;
> - if (space <= 0)
> - space += size;
> - return space - I915_RING_FREE_SPACE;
> + /* "If the Ring Buffer Head Pointer and the Tail Pointer are on the
> +  * same cacheline, the Head Pointer must not be greater than the Tail
> +  * Pointer."
> +  */
> + return (round_down(head, CACHELINE_BYTES) - tail - 8) & (size - 1);
>  }
>  
>  void intel_ring_update_space(struct intel_ring *ring)
> @@ -1618,12 +1621,9 @@ static int wait_for_space(struct drm_i915_gem_request 
> *req, int bytes)
>   GEM_BUG_ON(!req->reserved_space);
>  
>   list_for_each_entry(target, &ring->request_list, ring_link) {
> - unsigned space;
> -
>   /* Would completion of this request free enough space? */
> - space = __intel_ring_space(target->postfix, ring->emit,
> -ring->size);
> - if (space >= bytes)
> + if (bytes <= __intel_ring_space(target->postfix,
> + ring->emit, ring->size))
>   break;
>   }
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
> b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 31998ea46fb8..e28ffc98c520 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -17,17 +17,6 @@
>  #define CACHELINE_BYTES 64
>  #define CACHELINE_DWORDS (CACHELINE_BYTES / sizeof(uint32_t))
>  
> -/*
> - * Gen2 BSpec "1. Programming Environment" / 1.4.4.6 "Ring Buffer Use"
> - * Gen3 BSpec "vol1c Memory Interface Functions" / 2.3.4.5 "Ring Buffer Use"
> - * Gen4+ BSpec "vol1c Memory Interface and Command Stream" / 5.3.4.5 "Ring 
> Buffer Use"
> - *
> - * "If the Ring Buffer Head Pointer and the Tail Pointer are on the same
> - * cacheline, the Head Pointer must not be greater than the Tail
> - * Pointer."
> - */
> -#define I915_RING_FREE_SPACE 64
> -
>  struct intel_hw_status_page {
>   struct i915_vma *vma;
>   u32 *page_addr;
> @@ -545,6 +534,19 @@ assert_ring_tail_valid(const struct intel_ring *ring, 
> unsigned int tail)
>*/
>   GEM_BUG_ON(!IS_ALIGNED(tail, 8));
>   GEM_BUG_ON(tail >= ring->size);
> +
> + /* "Ring Buffer Use"
> +  *  Gen2 BSpec "1. Programming Environment" / 1.4.4.6
> +  *  Gen3 BSpec "1c Memory Interface Functions" / 2.3.4.5
> +  *  Gen4+ BSpec "1c Memory Interface and Command Stream" / 5.3.4.5
> +  * "If the Ring Buffer Head Pointer and the Tail Pointer are on the
> +  * same cacheline, the Head Pointer must not be greater than the Tail
> +  * Pointer."
> +  */
> +#define cacheline(a) round_down(a, CACHELINE_BYTES)
> + GEM_BUG_ON(cacheline(tail) == cacheline(ring->head) &&

We are about to set the hw tail. But we are using our
outdated bookkeepping value of ring head to do the check.

I am confused. Is this something that can happen only when
it wraps. And why we dont need to check the HW head?

-Mika


> +tail < ring->head);
> +#undef cacheline
>  }
>  
>  static inline unsigned int
> -- 
> 2.11.0
>
> ___
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/4] drm/i915: Differentiate between sw write location into ring and last hw read

2017-04-24 Thread Chris Wilson
On Mon, Apr 24, 2017 at 03:21:56PM +0300, Mika Kuoppala wrote:
> Chris Wilson  writes:
> 
> > We need to keep track of the last location we ask the hw to read up to
> > (RING_TAIL) separately from our last write location into the ring, so
> > that in the event of a GPU reset we do not tell the HW to proceed into
> > a partially written request (which can happen if that request is waiting
> > for an external signal before being executed).
> 
> But is it so that this can happen also without external signal?
> Our submit is already async and waiting for the prev. And we have
> already pushed the tail into the partial 'current'.

You need something to delay the request submission. In gem_exec_fence,
it is waiting on a request from another engine (that happens to be a
deliberate hang). Similarly, any third party signal would leave the
request incomplete whilst we continue to write new requests after it.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/4] drm/i915: Poison the request before emitting commands

2017-04-24 Thread Mika Kuoppala
Chris Wilson  writes:

> If we poison the request before we emit commands, it should be easier to
> spot when we execute an uninitialised request.
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=100144
> Signed-off-by: Chris Wilson 
> ---
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 8c874996c617..1ec98851a621 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1693,6 +1693,7 @@ u32 *intel_ring_begin(struct drm_i915_gem_request *req, 
> int num_dwords)
>  
>   GEM_BUG_ON(ring->emit > ring->size - bytes);
>   cs = ring->vaddr + ring->emit;
> + GEM_DEBUG_EXEC(memset(cs, POISON_INUSE, bytes));

Will more modern hardware just skipover these? If so
then perhaps more advanced solution would be to fill with bb
start into the previous address to get a guaranteed
hang on the spot.

Reviewed-by: Mika Kuoppala 

>   ring->emit += bytes;
>   ring->space -= bytes;
>   GEM_BUG_ON(ring->space < 0);
> -- 
> 2.11.0
>
> ___
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/4] drm/i915: Differentiate between sw write location into ring and last hw read

2017-04-24 Thread Mika Kuoppala
Chris Wilson  writes:

> We need to keep track of the last location we ask the hw to read up to
> (RING_TAIL) separately from our last write location into the ring, so
> that in the event of a GPU reset we do not tell the HW to proceed into
> a partially written request (which can happen if that request is waiting
> for an external signal before being executed).

But is it so that this can happen also without external signal?
Our submit is already async and waiting for the prev. And we have
already pushed the tail into the partial 'current'.

-Mika

>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100144
> Testcase: igt/gem_exec_fence/await-hang
> Fixes: 821ed7df6e2a ("drm/i915: Update reset path to fix incomplete requests")
> Fixes: d55ac5bf97c6 ("drm/i915: Defer transfer onto execution timeline to 
> actual hw submission")
> Signed-off-by: Chris Wilson 
> Cc: Tvrtko Ursulin 
> Cc: Mika Kuoppala 
> ---
>  drivers/gpu/drm/i915/i915_gem_request.c|  2 +-
>  drivers/gpu/drm/i915/i915_guc_submission.c |  4 +--
>  drivers/gpu/drm/i915/intel_lrc.c   |  6 ++---
>  drivers/gpu/drm/i915/intel_ringbuffer.c| 42 
> +++---
>  drivers/gpu/drm/i915/intel_ringbuffer.h| 17 +++-
>  5 files changed, 48 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c 
> b/drivers/gpu/drm/i915/i915_gem_request.c
> index e2ec42b2bf24..99146de1dc93 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -651,7 +651,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
>* GPU processing the request, we never over-estimate the
>* position of the head.
>*/
> - req->head = req->ring->tail;
> + req->head = req->ring->emit;
>  
>   /* Check that we didn't interrupt ourselves with a new request */
>   GEM_BUG_ON(req->timeline->seqno != req->fence.seqno);
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c 
> b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 1642fff9cf13..ab5140ba108d 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -480,9 +480,7 @@ static void guc_wq_item_append(struct i915_guc_client 
> *client,
>   GEM_BUG_ON(freespace < wqi_size);
>  
>   /* The GuC firmware wants the tail index in QWords, not bytes */
> - tail = rq->tail;
> - assert_ring_tail_valid(rq->ring, rq->tail);
> - tail >>= 3;
> + tail = intel_ring_set_tail(rq->ring, rq->tail) >> 3;
>   GEM_BUG_ON(tail > WQ_RING_TAIL_MAX);
>  
>   /* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
> b/drivers/gpu/drm/i915/intel_lrc.c
> index 7df278fe492e..d3612969098f 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -326,8 +326,7 @@ static u64 execlists_update_context(struct 
> drm_i915_gem_request *rq)
>   rq->ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
>   u32 *reg_state = ce->lrc_reg_state;
>  
> - assert_ring_tail_valid(rq->ring, rq->tail);
> - reg_state[CTX_RING_TAIL+1] = rq->tail;
> + reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);
>  
>   /* True 32b PPGTT with dynamic page allocation: update PDP
>* registers and point the unallocated PDPs to scratch page.
> @@ -2054,7 +2053,8 @@ void intel_lr_context_resume(struct drm_i915_private 
> *dev_priv)
>   ce->state->obj->mm.dirty = true;
>   i915_gem_object_unpin_map(ce->state->obj);
>  
> - ce->ring->head = ce->ring->tail = 0;
> + GEM_BUG_ON(!list_empty(&ce->ring->request_list));
> + ce->ring->head = ce->ring->tail = ce->ring->emit = 0;
>   intel_ring_update_space(ce->ring);
>   }
>   }
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 32afac6c754f..8c874996c617 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -49,7 +49,7 @@ static int __intel_ring_space(int head, int tail, int size)
>  
>  void intel_ring_update_space(struct intel_ring *ring)
>  {
> - ring->space = __intel_ring_space(ring->head, ring->tail, ring->size);
> + ring->space = __intel_ring_space(ring->head, ring->emit, ring->size);
>  }
>  
>  static int
> @@ -774,8 +774,8 @@ static void i9xx_submit_request(struct 
> drm_i915_gem_request *request)
>  
>   i915_gem_request_submit(request);
>  
> - assert_ring_tail_valid(request->ring, request->tail);
> - I915_WRITE_TAIL(request->engine, request->tail);
> + I915_WRITE_TAIL(request->engine,
> + intel_ring_set_tail(request->ring, request->tail));
>  }
>  
>  static void 

Re: [Intel-gfx] [PATCH 15/27] drm/i915: Split execlist priority queue into rbtree + linked list

2017-04-24 Thread Chris Wilson
On Mon, Apr 24, 2017 at 12:07:47PM +0100, Chris Wilson wrote:
> On Mon, Apr 24, 2017 at 11:28:32AM +0100, Tvrtko Ursulin wrote:
> > 
> > On 19/04/2017 10:41, Chris Wilson wrote:
> > Sounds attractive! What workloads show the benefit and how much?
> 
> The default will show the best, since everything is priority 0 more or
> less and so we reduce the rbtree search to a single lookup and list_add.
> It's hard to measure the impact of the rbtree though. On the dequeue
> side, the mmio access dominates. On the schedule side, if we have lots
> of requests, the dfs dominates.
> 
> I have an idea on how we might stress the rbtree in submit_request - but
> still it requires long queues untypical of most workloads. Still tbd.

I have something that does show a difference in that path (which is
potentially in hardirq). Overal time is completely dominated by the
reservation_object (ofc, we'll get back around to its scalability
patches at some point). For a few thousand prio=0 requests inflight, the
difference in execlists_submit_request() is about 6x, and for
intel_lrc_irq_hander() is about 2x (just a factor that I sent a lot of
coalesceable requests and so the reduction of rb_next to list_next).

Completely synthetic testing, I would be worried if the rbtree was that
tall in practice (request generation >> execution). The neat part of the
split, I think is that make the resubmission of a gazzumped request
easier - instead of writing a parallel rbtree sort, we just put the old
request at the head of the plist.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v4.1 01/9] drm/atomic: Handle aspect ratio and scaling mode in core, v2.

2017-04-24 Thread Maarten Lankhorst
On 19-04-17 17:43, Daniel Vetter wrote:
> On Thu, Apr 13, 2017 at 11:15:37AM +0200, Maarten Lankhorst wrote:
>> This is required to for i915 to convert connector properties to atomic.
>>
>> Changes since v1:
>> - Add docbook info. (danvet)
>> - Change picture_aspect_ratio to enum hdmi_picture_aspect.
>>
>> Signed-off-by: Maarten Lankhorst 
>> Cc: dri-de...@lists.freedesktop.org
>> Acked-by: Dave Airlie 
>> ---
>>  drivers/gpu/drm/drm_atomic.c |  8 
>>  include/drm/drm_connector.h  | 16 
>>  2 files changed, 24 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c
>> index 30229ab719c0..25ea6f797a54 100644
>> --- a/drivers/gpu/drm/drm_atomic.c
>> +++ b/drivers/gpu/drm/drm_atomic.c
>> @@ -1123,6 +1123,10 @@ int drm_atomic_connector_set_property(struct 
>> drm_connector *connector,
>>   */
>>  if (state->link_status != DRM_LINK_STATUS_GOOD)
>>  state->link_status = val;
>> +} else if (property == config->aspect_ratio_property) {
>> +state->picture_aspect_ratio = val;
>> +} else if (property == config->scaling_mode_property) {
>> +state->scaling_mode = val;
>>  } else if (connector->funcs->atomic_set_property) {
>>  return connector->funcs->atomic_set_property(connector,
>>  state, property, val);
>> @@ -1199,6 +1203,10 @@ drm_atomic_connector_get_property(struct 
>> drm_connector *connector,
>>  *val = state->tv.hue;
>>  } else if (property == config->link_status_property) {
>>  *val = state->link_status;
>> +} else if (property == config->aspect_ratio_property) {
>> +*val = state->picture_aspect_ratio;
>> +} else if (property == config->scaling_mode_property) {
>> +*val = state->scaling_mode;
>>  } else if (connector->funcs->atomic_get_property) {
>>  return connector->funcs->atomic_get_property(connector,
>>  state, property, val);
>> diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
>> index 4eeda120e46d..5b50bc2db6fb 100644
>> --- a/include/drm/drm_connector.h
>> +++ b/include/drm/drm_connector.h
>> @@ -25,6 +25,7 @@
>>  
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  
>>  #include 
>> @@ -326,6 +327,21 @@ struct drm_connector_state {
>>  struct drm_atomic_state *state;
>>  
>>  struct drm_tv_connector_state tv;
>> +
>> +/**
>> + * @picture_aspect_ratio: Connector property to control the
>> + * HDMI infoframe aspect ratio setting.
>> + *
>> + * The DRM_MODE_PICTURE_ASPECT_* values much match the
> I think the above will upset sphinx and spew a warning, you need
> ASPECT_\* or something like that. Or just spell them out and enumerate
> them all. Fixed either way this is
I checked and make htmldocs worked just fine, but I'll quote the *.

Thanks,
Maarten
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915/glk: Don't allow 12 bpc when htotal is too big

2017-04-24 Thread Jani Nikula
On Mon, 24 Apr 2017, Ander Conselvan De Oliveira  wrote:
> On Mon, 2017-04-24 at 13:47 +0300, Ander Conselvan de Oliveira wrote:
>> Display workaround #1139 for Geminilake instructs us to restrict HDMI
>> to 8 bpc when htotal is greater than 5460. Otherwise, the pipe is unable
>> to generate a proper signal and is left in a state where corruption is
>> seen with other modes.
>> 
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100440
>> Cc: Shashank Sharma 
>> Signed-off-by: Ander Conselvan de Oliveira 
>> 
>
> Perhaps this should have
>
> Fixes: 14292b7ff86f ("drm/i915: allow HDMI 2.0 clock rates")
>
> since it is only after that patch that the issue would be exposed, even though
> is not the cause for it. Jani?

I don't mind/care too much either way, because it's only GLK <= A1.

BR,
Jani.



>
> Ander
>
>> ---
>>  drivers/gpu/drm/i915/intel_hdmi.c | 5 +
>>  1 file changed, 5 insertions(+)
>> 
>> diff --git a/drivers/gpu/drm/i915/intel_hdmi.c 
>> b/drivers/gpu/drm/i915/intel_hdmi.c
>> index 6efc3cb..52f0b2d 100644
>> --- a/drivers/gpu/drm/i915/intel_hdmi.c
>> +++ b/drivers/gpu/drm/i915/intel_hdmi.c
>> @@ -1327,6 +1327,11 @@ static bool hdmi_12bpc_possible(struct 
>> intel_crtc_state *crtc_state)
>>  return false;
>>  }
>>  
>> +/* Display Wa #1139 */
>> +if (IS_GLK_REVID(dev_priv, 0, GLK_REVID_A1) &&
>> +crtc_state->base.adjusted_mode.htotal > 5460)
>> +return false;
>> +
>>  return true;
>>  }
>>  

-- 
Jani Nikula, Intel Open Source Technology Center
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/glk: Don't allow 12 bpc when htotal is too big

2017-04-24 Thread Patchwork
== Series Details ==

Series: drm/i915/glk: Don't allow 12 bpc when htotal is too big
URL   : https://patchwork.freedesktop.org/series/23451/
State : success

== Summary ==

Series 23451v1 drm/i915/glk: Don't allow 12 bpc when htotal is too big
https://patchwork.freedesktop.org/api/1.0/series/23451/revisions/1/mbox/

Test gem_exec_flush:
Subgroup basic-batch-kernel-default-uc:
fail   -> PASS   (fi-snb-2600) fdo#17

fdo#17 https://bugs.freedesktop.org/show_bug.cgi?id=17

fi-bdw-5557u total:278  pass:267  dwarn:0   dfail:0   fail:0   skip:11  
time:429s
fi-bdw-gvtdvmtotal:278  pass:256  dwarn:8   dfail:0   fail:0   skip:14  
time:430s
fi-bsw-n3050 total:278  pass:242  dwarn:0   dfail:0   fail:0   skip:36  
time:576s
fi-bxt-j4205 total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  
time:506s
fi-bxt-t5700 total:278  pass:258  dwarn:0   dfail:0   fail:0   skip:20  
time:564s
fi-byt-j1900 total:278  pass:254  dwarn:0   dfail:0   fail:0   skip:24  
time:490s
fi-byt-n2820 total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  
time:493s
fi-hsw-4770  total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  
time:415s
fi-hsw-4770r total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  
time:410s
fi-ilk-650   total:278  pass:228  dwarn:0   dfail:0   fail:0   skip:50  
time:425s
fi-ivb-3520m total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  
time:494s
fi-ivb-3770  total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  
time:464s
fi-kbl-7500u total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  
time:461s
fi-kbl-7560u total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  
time:562s
fi-skl-6260u total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  
time:450s
fi-skl-6700hqtotal:278  pass:261  dwarn:0   dfail:0   fail:0   skip:17  
time:575s
fi-skl-6700k total:278  pass:256  dwarn:4   dfail:0   fail:0   skip:18  
time:457s
fi-skl-6770hqtotal:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  
time:497s
fi-skl-gvtdvmtotal:278  pass:265  dwarn:0   dfail:0   fail:0   skip:13  
time:429s
fi-snb-2520m total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  
time:529s
fi-snb-2600  total:278  pass:249  dwarn:0   dfail:0   fail:0   skip:29  
time:398s

9fb4b60bfa8c532ad6eda05af002c6b2f090d97f drm-tip: 2017y-04m-24d-09h-40m-53s UTC 
integration manifest
7fa2f63 drm/i915/glk: Don't allow 12 bpc when htotal is too big

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_4536/
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915/glk: Don't allow 12 bpc when htotal is too big

2017-04-24 Thread Sharma, Shashank

Regards

Shashank


On 4/24/2017 4:17 PM, Ander Conselvan de Oliveira wrote:

Display workaround #1139 for Geminilake instructs us to restrict HDMI
to 8 bpc when htotal is greater than 5460. Otherwise, the pipe is unable
to generate a proper signal and is left in a state where corruption is
seen with other modes.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100440
Cc: Shashank Sharma 
Signed-off-by: Ander Conselvan de Oliveira 

---
  drivers/gpu/drm/i915/intel_hdmi.c | 5 +
  1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_hdmi.c 
b/drivers/gpu/drm/i915/intel_hdmi.c
index 6efc3cb..52f0b2d 100644
--- a/drivers/gpu/drm/i915/intel_hdmi.c
+++ b/drivers/gpu/drm/i915/intel_hdmi.c
@@ -1327,6 +1327,11 @@ static bool hdmi_12bpc_possible(struct intel_crtc_state 
*crtc_state)
return false;
}
  
+	/* Display Wa #1139 */

+   if (IS_GLK_REVID(dev_priv, 0, GLK_REVID_A1) &&
+   crtc_state->base.adjusted_mode.htotal > 5460)

Small inputs:
- We might wanna extend this for CNL too, as the WAR is valid for it 
too. I am not sure it should be this same patch or not.
- We need to modify this check not to affect the YCBCR 420 modes, as 
there won't be any change between the timings of a YCBCR mode and a
  normal mode (apart from a flag), but htotal should be valid in YCBCR 
420 case. But I can handle this part while sending second patch set of

  YCBCR 420 handling.
- Also, should we reject 12BPC totally, or just clamp htotal to max 
possible (5460) and go ahead? Ville ?


In any case, please feel free to use:
Reviewed-by: Shashank Sharma 

+   return false;
+
return true;
  }
  


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 15/27] drm/i915: Split execlist priority queue into rbtree + linked list

2017-04-24 Thread Chris Wilson
On Mon, Apr 24, 2017 at 11:28:32AM +0100, Tvrtko Ursulin wrote:
> 
> On 19/04/2017 10:41, Chris Wilson wrote:
> >All the requests at the same priority are executed in FIFO order. They
> >do not need to be stored in the rbtree themselves, as they are a simple
> >list within a level. If we move the requests at one priority into a list,
> >we can then reduce the rbtree to the set of priorities. This should keep
> >the height of the rbtree small, as the number of active priorities can not
> >exceed the number of active requests and should be typically only a few.
> >
> >Currently, we have ~2k possible different priority levels, that may
> >increase to allow even more fine grained selection. Allocating those in
> >advance seems a waste (and may be impossible), so we opt for allocating
> >upon first use, and freeing after its requests are depleted. To avoid
> >the possibility of an allocation failure causing us to lose a request,
> >we preallocate the default priority (0) and bump any request to that
> >priority if we fail to allocate it the appropriate plist. Having a
> >request (that is ready to run, so not leading to corruption) execute
> >out-of-order is better than leaking the request (and its dependency
> >tree) entirely.
> >
> >There should be a benefit to reducing execlists_dequeue() to principally
> >using a simple list (and reducing the frequency of both rbtree iteration
> >and balancing on erase) but for typical workloads, request coalescing
> >should be small enough that we don't notice any change. The main gain is
> >from improving PI calls to schedule, and the explicit list within a
> >level should make request unwinding simpler (we just need to insert at
> >the head of the list rather than the tail and not have to make the
> >rbtree search more complicated).
> 
> Sounds attractive! What workloads show the benefit and how much?

The default will show the best, since everything is priority 0 more or
less and so we reduce the rbtree search to a single lookup and list_add.
It's hard to measure the impact of the rbtree though. On the dequeue
side, the mmio access dominates. On the schedule side, if we have lots
of requests, the dfs dominates.

I have an idea on how we might stress the rbtree in submit_request - but
still it requires long queues untypical of most workloads. Still tbd.

> >-static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
> >+static bool
> >+insert_request(struct intel_engine_cs *engine,
> >+   struct i915_priotree *pt,
> >+   int prio)
> > {
> >+struct execlist_priolist *plist;
> > struct rb_node **p, *rb;
> > bool first = true;
> >
> >+find_plist:
> > /* most positive priority is scheduled first, equal priorities fifo */
> > rb = NULL;
> >-p = &root->rb_node;
> >+p = &engine->execlist_queue.rb_node;
> > while (*p) {
> >-struct i915_priotree *pos;
> >-
> > rb = *p;
> >-pos = rb_entry(rb, typeof(*pos), node);
> >-if (pt->priority > pos->priority) {
> >+plist = rb_entry(rb, typeof(*plist), node);
> >+if (prio > plist->priority) {
> > p = >rb_left;
> >-} else {
> >+} else if (prio < plist->priority) {
> > p = >rb_right;
> > first = false;
> >+} else {
> >+list_add_tail(&pt->link, &plist->requests);
> >+return false;
> > }
> > }
> >-rb_link_node(&pt->node, rb, p);
> >-rb_insert_color(&pt->node, root);
> >+
> >+if (!prio) {
> >+plist = &engine->default_priolist;
> 
> Should be "prio == I915_PRIO_DEFAULT" (give or take).
> 
> But I am not completely happy with special casing the default
> priority for two reasons.
> 
> Firstly, userspace can opt to lower its priority and completely
> defeat this path.
> 
> Secondly, we already have flip priority which perhaps should have
> it's own fast path / avoid allocation as well.
> 
> Those two combined make me unsure whether the optimisation is worth
> it. What would be the pros and cons of three steps:
> 
> 1. No optimisation.
> 2. prio == default optimisation like above.
> 3. Better system with caching of frequently used levels.
> 
> Last is definitely complicated, second is not, but is the second
> much better than the first?

It was not intended as an optimisation. It is for handling the
ENOMEM here. We cannot abort the request at such a late stage, so we
need somewhere to hold it. That dictated having a preallocted slot. I
also didn't like having to preallocate all possible levels as that seems
a waste, especially as I like to invent new levels and suspect that we
may end up using a full u32 range.

Using it for the default priority was then to take advantage of the
preallocation.

> Perhaps a simplification of 3) where we would defer the freeing of
> unused priority levels until the busy to idle transition? That would
> also drop the existence and need for special 

Re: [Intel-gfx] [PATCH] drm/i915/glk: Don't allow 12 bpc when htotal is too big

2017-04-24 Thread Ander Conselvan De Oliveira
On Mon, 2017-04-24 at 13:47 +0300, Ander Conselvan de Oliveira wrote:
> Display workaround #1139 for Geminilake instructs us to restrict HDMI
> to 8 bpc when htotal is greater than 5460. Otherwise, the pipe is unable
> to generate a proper signal and is left in a state where corruption is
> seen with other modes.
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100440
> Cc: Shashank Sharma 
> Signed-off-by: Ander Conselvan de Oliveira 
> 

Perhaps this should have

Fixes: 14292b7ff86f ("drm/i915: allow HDMI 2.0 clock rates")

since it is only after that patch that the issue would be exposed, even though
is not the cause for it. Jani?

Ander

> ---
>  drivers/gpu/drm/i915/intel_hdmi.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/intel_hdmi.c 
> b/drivers/gpu/drm/i915/intel_hdmi.c
> index 6efc3cb..52f0b2d 100644
> --- a/drivers/gpu/drm/i915/intel_hdmi.c
> +++ b/drivers/gpu/drm/i915/intel_hdmi.c
> @@ -1327,6 +1327,11 @@ static bool hdmi_12bpc_possible(struct 
> intel_crtc_state *crtc_state)
>   return false;
>   }
>  
> + /* Display Wa #1139 */
> + if (IS_GLK_REVID(dev_priv, 0, GLK_REVID_A1) &&
> + crtc_state->base.adjusted_mode.htotal > 5460)
> + return false;
> +
>   return true;
>  }
>  
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH] drm/i915/glk: Don't allow 12 bpc when htotal is too big

2017-04-24 Thread Ander Conselvan de Oliveira
Display workaround #1139 for Geminilake instructs us to restrict HDMI
to 8 bpc when htotal is greater than 5460. Otherwise, the pipe is unable
to generate a proper signal and is left in a state where corruption is
seen with other modes.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100440
Cc: Shashank Sharma 
Signed-off-by: Ander Conselvan de Oliveira 

---
 drivers/gpu/drm/i915/intel_hdmi.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_hdmi.c 
b/drivers/gpu/drm/i915/intel_hdmi.c
index 6efc3cb..52f0b2d 100644
--- a/drivers/gpu/drm/i915/intel_hdmi.c
+++ b/drivers/gpu/drm/i915/intel_hdmi.c
@@ -1327,6 +1327,11 @@ static bool hdmi_12bpc_possible(struct intel_crtc_state 
*crtc_state)
return false;
}
 
+   /* Display Wa #1139 */
+   if (IS_GLK_REVID(dev_priv, 0, GLK_REVID_A1) &&
+   crtc_state->base.adjusted_mode.htotal > 5460)
+   return false;
+
return true;
 }
 
-- 
2.9.3

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v5 12/15] drm/i915/perf: Add OA unit support for Gen 8+

2017-04-24 Thread Chris Wilson
On Mon, Apr 24, 2017 at 12:17:48AM -0700, Lionel Landwerlin wrote:
> +static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv)
> +{
> + struct i915_gem_context *ctx;
> + int ret;
> +
> + ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> + if (ret)
> + return ret;

Missed a switch to kernel context, to ensure that the last context is
also saved to memory.

> +
> + /* The OA register config is setup through the context image. This image
> +  * might be written to by the GPU on context switch (in particular on
> +  * lite-restore). This means we can't safely update a context's image,
> +  * if this context is scheduled/submitted to run on the GPU.
> +  *
> +  * We could emit the OA register config through the batch buffer but
> +  * this might leave small interval of time where the OA unit is
> +  * configured at an invalid sampling period.
> +  *
> +  * So far the best way to work around this issue seems to be draining
> +  * the GPU from any submitted work.
> +  */
> +ret = i915_gem_wait_for_idle(dev_priv,
> + I915_WAIT_INTERRUPTIBLE |
> + I915_WAIT_LOCKED);
> +if (ret) {
> + mutex_unlock(&dev_priv->drm.struct_mutex);
> +return ret;
> + }
> +
> + /* Since execlist submission may be happening asynchronously here then
> +  * we first mark existing contexts dirty before we update the current
> +  * context so if any switches happen in the middle we can expect
> +  * that the act of scheduling will have itself ensured a consistent
> +  * OA state update.
> +  */
> + list_for_each_entry(ctx, &dev_priv->context_list, link) {
> + /* The actual update of the register state context will happen
> +  * the next time this logical ring is submitted. (See
> +  * i915_oa_update_reg_state() which hooks into
> +  * execlists_update_context())
> +  */
> + atomic_set(&ctx->engine[RCS].oa_state_dirty, 1);

Considering that everything is now available to be modified, you can
change the contexts right here and now and remove the checks from the
much more common path.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 15/27] drm/i915: Split execlist priority queue into rbtree + linked list

2017-04-24 Thread Tvrtko Ursulin


On 19/04/2017 10:41, Chris Wilson wrote:

All the requests at the same priority are executed in FIFO order. They
do not need to be stored in the rbtree themselves, as they are a simple
list within a level. If we move the requests at one priority into a list,
we can then reduce the rbtree to the set of priorities. This should keep
the height of the rbtree small, as the number of active priorities can not
exceed the number of active requests and should be typically only a few.

Currently, we have ~2k possible different priority levels, that may
increase to allow even more fine grained selection. Allocating those in
advance seems a waste (and may be impossible), so we opt for allocating
upon first use, and freeing after its requests are depleted. To avoid
the possibility of an allocation failure causing us to lose a request,
we preallocate the default priority (0) and bump any request to that
priority if we fail to allocate it the appropriate plist. Having a
request (that is ready to run, so not leading to corruption) execute
out-of-order is better than leaking the request (and its dependency
tree) entirely.

There should be a benefit to reducing execlists_dequeue() to principally
using a simple list (and reducing the frequency of both rbtree iteration
and balancing on erase) but for typical workloads, request coalescing
should be small enough that we don't notice any change. The main gain is
from improving PI calls to schedule, and the explicit list within a
level should make request unwinding simpler (we just need to insert at
the head of the list rather than the tail and not have to make the
rbtree search more complicated).


Sounds attractive! What workloads show the benefit and how much?


v2: Avoid use-after-free when deleting a depleted priolist

Signed-off-by: Chris Wilson 
Cc: Michał Winiarski 
Cc: Tvrtko Ursulin 
Cc: Joonas Lahtinen 
---
 drivers/gpu/drm/i915/i915_debugfs.c| 12 +++--
 drivers/gpu/drm/i915/i915_gem_request.c|  4 +-
 drivers/gpu/drm/i915/i915_gem_request.h|  2 +-
 drivers/gpu/drm/i915/i915_guc_submission.c | 20 ++--
 drivers/gpu/drm/i915/intel_lrc.c   | 75 ++
 drivers/gpu/drm/i915/intel_ringbuffer.h|  7 +++
 6 files changed, 90 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 0b5d7142d8d9..a8c7788d986e 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3314,7 +3314,6 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)

if (i915.enable_execlists) {
u32 ptr, read, write;
-   struct rb_node *rb;
unsigned int idx;

seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
@@ -3358,9 +3357,14 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
rcu_read_unlock();

spin_lock_irq(&engine->timeline->lock);
-   for (rb = engine->execlist_first; rb; rb = rb_next(rb)) 
{
-   rq = rb_entry(rb, typeof(*rq), priotree.node);
-   print_request(m, rq, "\t\tQ ");
+   for (rb = engine->execlist_first; rb; rb = rb_next(rb)){
+   struct execlist_priolist *plist =
+   rb_entry(rb, typeof(*plist), node);
+
+   list_for_each_entry(rq,
+   &plist->requests,
+   priotree.link)
+   print_request(m, rq, "\t\tQ ");
}
spin_unlock_irq(&engine->timeline->lock);
} else if (INTEL_GEN(dev_priv) > 6) {
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c 
b/drivers/gpu/drm/i915/i915_gem_request.c
index 83b1584b3deb..59c0e0b00028 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -159,7 +159,7 @@ i915_priotree_fini(struct drm_i915_private *i915, struct 
i915_priotree *pt)
 {
struct i915_dependency *dep, *next;

-   GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
+   GEM_BUG_ON(!list_empty(&pt->link));

/* Everyone we depended upon (the fences we wait to be signaled)
 * should retire before us and remove themselves from our list.
@@ -185,7 +185,7 @@ i915_priotree_init(struct i915_priotree *pt)
 {
	INIT_LIST_HEAD(&pt->signalers_list);
	INIT_LIST_HEAD(&pt->waiters_list);
-   RB_CLEAR_NODE(&pt->node);
+   INIT_LIST_HEAD(&pt->link);
pt->priority = INT_MIN;
 }

diff --git a/drivers/gpu/drm/i915/i915_gem_request.h 
b/drivers/gpu/drm/i915/i915_gem_request.h
index 4ccab5affd3c..0a1d717b9fa7 100644
--- 

[Intel-gfx] ✓ Fi.CI.BAT: success for series starting with [1/2] drm/i915: Calculate ironlake intermediate watermarks correctly

2017-04-24 Thread Patchwork
== Series Details ==

Series: series starting with [1/2] drm/i915: Calculate ironlake intermediate 
watermarks correctly
URL   : https://patchwork.freedesktop.org/series/23444/
State : success

== Summary ==

Series 23444v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/23444/revisions/1/mbox/

Test gem_exec_flush:
Subgroup basic-batch-kernel-default-uc:
fail   -> PASS   (fi-snb-2600) fdo#17

fdo#17 https://bugs.freedesktop.org/show_bug.cgi?id=17

fi-bdw-5557u total:278  pass:267  dwarn:0   dfail:0   fail:0   skip:11  
time:430s
fi-bdw-gvtdvmtotal:278  pass:256  dwarn:8   dfail:0   fail:0   skip:14  
time:428s
fi-bsw-n3050 total:278  pass:242  dwarn:0   dfail:0   fail:0   skip:36  
time:577s
fi-bxt-j4205 total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  
time:508s
fi-byt-j1900 total:278  pass:254  dwarn:0   dfail:0   fail:0   skip:24  
time:486s
fi-byt-n2820 total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  
time:478s
fi-hsw-4770  total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  
time:412s
fi-hsw-4770r total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  
time:403s
fi-ilk-650   total:278  pass:228  dwarn:0   dfail:0   fail:0   skip:50  
time:427s
fi-ivb-3520m total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  
time:486s
fi-ivb-3770  total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  
time:483s
fi-kbl-7500u total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  
time:457s
fi-kbl-7560u total:278  pass:267  dwarn:1   dfail:0   fail:0   skip:10  
time:567s
fi-skl-6260u total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  
time:458s
fi-skl-6700hqtotal:278  pass:261  dwarn:0   dfail:0   fail:0   skip:17  
time:573s
fi-skl-6700k total:278  pass:256  dwarn:4   dfail:0   fail:0   skip:18  
time:464s
fi-skl-6770hqtotal:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  
time:495s
fi-skl-gvtdvmtotal:278  pass:265  dwarn:0   dfail:0   fail:0   skip:13  
time:433s
fi-snb-2520m total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  
time:528s
fi-snb-2600  total:278  pass:249  dwarn:0   dfail:0   fail:0   skip:29  
time:405s
fi-bxt-t5700 failed to collect. IGT log at Patchwork_4535/fi-bxt-t5700/igt.log

bc781a3cf9a80e4c5ed0d47fd0c67923bcfcdf2c drm-tip: 2017y-04m-22d-08h-49m-55s UTC 
integration manifest
ecb65dc drm/i915: Calculate vlv/chv intermediate watermarks correctly
99ba5c6 drm/i915: Calculate ironlake intermediate watermarks correctly

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_4535/
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH 2/2] drm/i915: Calculate vlv/chv intermediate watermarks correctly

2017-04-24 Thread Maarten Lankhorst
The watermarks it should calculate against are the old optimal watermarks.
The currently active crtc watermarks are pure fiction, and are invalid in
case of a nonblocking modeset, page flip enabling/disabling planes or any
other reason.

When the crtc is disabled or during a modeset the intermediate watermarks
don't need to be programmed separately, and could be directly assigned
to the optimal watermarks.

Also rename crtc_state to new_crtc_state, to distinguish it from the old state.

Signed-off-by: Maarten Lankhorst 
Cc: Ville Syrjälä 
---
 drivers/gpu/drm/i915/intel_pm.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 7da701dc3caa..9ee73ef1dd1b 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -1458,16 +1458,24 @@ static void vlv_atomic_update_fifo(struct 
intel_atomic_state *state,
 
 static int vlv_compute_intermediate_wm(struct drm_device *dev,
   struct intel_crtc *crtc,
-  struct intel_crtc_state *crtc_state)
+  struct intel_crtc_state *new_crtc_state)
 {
-   struct vlv_wm_state *intermediate = &crtc_state->wm.vlv.intermediate;
-   const struct vlv_wm_state *optimal = &crtc_state->wm.vlv.optimal;
-   const struct vlv_wm_state *active = &crtc->wm.active.vlv;
+   struct vlv_wm_state *intermediate =
+   &new_crtc_state->wm.vlv.intermediate;
+   const struct vlv_wm_state *optimal = &new_crtc_state->wm.vlv.optimal;
+   const struct drm_crtc_state *old_drm_crtc_state =
+   drm_atomic_get_old_crtc_state(new_crtc_state->base.state,
+ &crtc->base);
+   const struct vlv_wm_state *active =
+   &to_intel_crtc_state(old_drm_crtc_state)->wm.vlv.optimal;
int level;
 
+   if (!new_crtc_state->base.active ||
+   drm_atomic_crtc_needs_modeset(&new_crtc_state->base)) {
+   *intermediate = *optimal;
+
+   return 0;
+   }
+
intermediate->num_levels = min(optimal->num_levels, active->num_levels);
intermediate->cxsr = optimal->cxsr && active->cxsr &&
-   !crtc_state->disable_cxsr;
+   !new_crtc_state->disable_cxsr;
 
for (level = 0; level < intermediate->num_levels; level++) {
enum plane_id plane_id;
@@ -1491,7 +1499,7 @@ static int vlv_compute_intermediate_wm(struct drm_device 
*dev,
 * omit the post-vblank programming; only update if it's different.
 */
if (memcmp(intermediate, optimal, sizeof(*intermediate)) != 0)
-   crtc_state->wm.need_postvbl_update = true;
+   new_crtc_state->wm.need_postvbl_update = true;
 
return 0;
 }
-- 
2.7.4

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH 1/2] drm/i915: Calculate ironlake intermediate watermarks correctly

2017-04-24 Thread Maarten Lankhorst
The watermarks it should calculate against are the old optimal watermarks.
The currently active crtc watermarks are pure fiction, and are invalid in
case of a nonblocking modeset, page flip enabling/disabling planes or any
other reason.

When the crtc is disabled or during a modeset the intermediate watermarks
don't need to be programmed separately, and could be directly assigned
to the optimal watermarks.

Signed-off-by: Maarten Lankhorst 
Cc: Ville Syrjälä 
---
 drivers/gpu/drm/i915/intel_pm.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index cacb65fa2dd5..7da701dc3caa 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -2639,7 +2639,9 @@ static int ilk_compute_intermediate_wm(struct drm_device 
*dev,
   struct intel_crtc_state *newstate)
 {
	struct intel_pipe_wm *a = &newstate->wm.ilk.intermediate;
-   struct intel_pipe_wm *b = &intel_crtc->wm.active.ilk;
+   const struct drm_crtc_state *old_drm_state =
+   drm_atomic_get_old_crtc_state(newstate->base.state,
+ &intel_crtc->base);
+   const struct intel_pipe_wm *b =
+   &to_intel_crtc_state(old_drm_state)->wm.ilk.optimal;
int level, max_level = ilk_wm_max_level(to_i915(dev));
 
/*
@@ -2648,6 +2650,9 @@ static int ilk_compute_intermediate_wm(struct drm_device 
*dev,
 * and after the vblank.
 */
*a = newstate->wm.ilk.optimal;
+   if (!newstate->base.active ||
+   drm_atomic_crtc_needs_modeset(&newstate->base))
+   return 0;
+
a->pipe_enabled |= b->pipe_enabled;
a->sprites_enabled |= b->sprites_enabled;
a->sprites_scaled |= b->sprites_scaled;
-- 
2.7.4

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] ✓ Fi.CI.BAT: success for RFC: Design: DRM: Blending pipeline using DRM plane properties

2017-04-24 Thread Patchwork
== Series Details ==

Series: RFC: Design: DRM: Blending pipeline using DRM plane properties
URL   : https://patchwork.freedesktop.org/series/23443/
State : success

== Summary ==

Series 23443v1 RFC: Design: DRM: Blending pipeline using DRM plane properties
https://patchwork.freedesktop.org/api/1.0/series/23443/revisions/1/mbox/

Test gem_exec_flush:
Subgroup basic-batch-kernel-default-uc:
fail   -> PASS   (fi-snb-2600) fdo#17
Test gem_exec_suspend:
Subgroup basic-s4-devices:
dmesg-warn -> PASS   (fi-kbl-7560u) fdo#100125

fdo#17 https://bugs.freedesktop.org/show_bug.cgi?id=17
fdo#100125 https://bugs.freedesktop.org/show_bug.cgi?id=100125

fi-bdw-5557u total:278  pass:267  dwarn:0   dfail:0   fail:0   skip:11  
time:429s
fi-bdw-gvtdvmtotal:278  pass:256  dwarn:8   dfail:0   fail:0   skip:14  
time:428s
fi-bsw-n3050 total:278  pass:242  dwarn:0   dfail:0   fail:0   skip:36  
time:573s
fi-bxt-j4205 total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  
time:509s
fi-bxt-t5700 total:278  pass:258  dwarn:0   dfail:0   fail:0   skip:20  
time:553s
fi-byt-j1900 total:278  pass:254  dwarn:0   dfail:0   fail:0   skip:24  
time:487s
fi-byt-n2820 total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  
time:483s
fi-hsw-4770  total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  
time:407s
fi-hsw-4770r total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  
time:401s
fi-ilk-650   total:278  pass:228  dwarn:0   dfail:0   fail:0   skip:50  
time:428s
fi-ivb-3520m total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  
time:485s
fi-ivb-3770  total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  
time:461s
fi-kbl-7500u total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  
time:457s
fi-kbl-7560u total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  
time:570s
fi-skl-6260u total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  
time:448s
fi-skl-6700hqtotal:278  pass:261  dwarn:0   dfail:0   fail:0   skip:17  
time:576s
fi-skl-6700k total:278  pass:256  dwarn:4   dfail:0   fail:0   skip:18  
time:462s
fi-skl-6770hqtotal:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  
time:495s
fi-skl-gvtdvmtotal:278  pass:265  dwarn:0   dfail:0   fail:0   skip:13  
time:431s
fi-snb-2520m total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  
time:533s
fi-snb-2600  total:278  pass:249  dwarn:0   dfail:0   fail:0   skip:29  
time:399s

bc781a3cf9a80e4c5ed0d47fd0c67923bcfcdf2c drm-tip: 2017y-04m-22d-08h-49m-55s UTC 
integration manifest
cc35e8d RFC: Design: DRM: Blending pipeline using DRM plane properties

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_4534/
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH] RFC: Design: DRM: Blending pipeline using DRM plane properties

2017-04-24 Thread Shashank Sharma
This patch proposes a RFC design to handle blending of various
framebuffers with different color spaces, using the DRM color
properties.

Signed-off-by: Shashank Sharma 
---
 drivers/gpu/drm/rfc-design-blending.txt | 52 +
 1 file changed, 52 insertions(+)
 create mode 100644 drivers/gpu/drm/rfc-design-blending.txt

diff --git a/drivers/gpu/drm/rfc-design-blending.txt 
b/drivers/gpu/drm/rfc-design-blending.txt
new file mode 100644
index 000..55d96e9
--- /dev/null
+++ b/drivers/gpu/drm/rfc-design-blending.txt
@@ -0,0 +1,52 @@
+Hi all,
+
+I wanted to send this design in a separate thread, but then I realized we can 
use this thread itself as many of the
+stakeholders are already active here.
+
+As you all know, we will be dealing with the complex situations of blending of 
two(or more) different buffers
+in the pipeline, which are in different format and different color space.
+(PS: This ascii block design is best visible in a widescreen monitor, or in 
HTML page)
+
+=
+
+  property 1 = CSCproperty 2 = Degamma  property 3 = 
Gamut  property 4 = palette
+  +--++---+  
+--+   +---+  RGB REC 2020 buffer
+YUV   |color space   ||Linearizion|  |Gamut 
mapping |   |Non-Linearizion+---+
+REC709--> |conversion+--->+(Degamma)  
+->+(REC709->REC2020) +---+(Gamma)|   |
+  |(YUV->RGB)||   |  | 
 |   |   |+--v---+
+  +--++---+  
+--+   +---+|   
   |
+   
  |  |
+   
  | Blending unit|
+   
  |  |--> 
blended output
+RGB REC 2020 buffer (Bypass everything)
  |  |
+  
+->
|  |
+   
  |  |
+   
  +--+
+=
+
+This is a design proposal of a blending pipeline, using a sequence of plane 
level DRM properties.
+The description of the block is:
+- Input buffers are two different streams for example
+   - YCBCR buffer in REC709 colorspace
+   - RGB buffer in BT2020 colorspace
+- Aim is to make blending accurate by providing similar input to the blending 
unit (same in format as well as color space).
+- Every block here represents a plane level DRM property, with specific task 
to do.
+   - first block is CSC property, which is for conversion from YCBCR->RGB 
only (This doesn't do gamut mapping)
+   - second block is the property which will linearize the input, a 
degamma kind of property
+   - third block is a Gamut mapping property, which will do a gamut 
conversion (ex 709->2020)
+   - fourth block is a Non-Linearizion block, which will apply back the 
curve (like Gamma) required
+   - The output of this pipeline is a RGB buffer in REC2020 color space
+   - Any driver can map its HW's plane level color correction capabilities 
to these properties.
+   - Once blending is done, driver can apply any post blending CRTC level 
color property, to:
+   - Change output type (ex. changing RGB->YUV using CRTC level 
CSC property)
+   - Apply a curve on the blended output (using CRTC level 
gamma/LUT property)
+
+- Important points:
+   - The sequence of the properties has to be fixed (almost in this 
order), considering the linear data requirement of Gamut mapping
+   - The color space of blending needs to be decided in the userspace, by 
UI manager, with some policies like (examples):
+   - If any of the buffer is REC2020, all buffers should be 

Re: [Intel-gfx] [RFC 2/2] drm/i915: Select engines via class and instance in execbuffer2

2017-04-24 Thread Tvrtko Ursulin


On 18/04/2017 22:10, Chris Wilson wrote:

On Tue, Apr 18, 2017 at 05:56:15PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Building on top of the previous patch which exported the concept
of engine classes and instances, we can also use this instead of
the current awkward engine selection uAPI.

This is primarily interesting for the VCS engine selection which
is a) currently done via disjoint set of flags, and b) the
current I915_EXEC_BSD flags has different semantics depending on
the underlying hardware which is bad.

Proposed idea here is to reserve 16-bits of flags, to pass in
the engine class and instance (8 bits each), and a new flag
named I915_EXEC_CLASS_INSTANCE to tell the kernel this new engine
selection API is in use.

The new uAPI also removes access to the weak VCS engine
balancing as currently existing in the driver.

Example usage to send a command to VCS0:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 0);

Or to send a command to VCS1:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 1);


To save a bit of space, we can use the ring selector as a class selector
if bit18 is set, with 19-27 as instance. That limits us to 64 classes -
hopefully not a problem for near future. At least I might have you sold
you on a flexible execbuf3 by then.


I was considering re-using those bits yes. I was thinking about the pro 
of keeping it completely separate but I suppose there is not much value 
in that. So I can re-use the ring selector just as well and have a 
smaller impact on number of bits left over.



(As a digression, some cryptic notes for an implementation I did over Easter:
/*
 * Execbuf3!
 *
 * ringbuffer
 *  - per context
 *  - per engine


We have this already so I am missing something I guess.


 *  - PAGE_SIZE ctl [ro head, rw tai] + user pot
 *  - kthread [i915/$ctx-$engine] (optional?)


No idea what these two are. :)


 *  - assumes NO_RELOC-esque awareness


Ok ok NO_RELOC. :)


 *
 * SYNC flags [wait/signal], handle [semaphore/fence]


Sync fence in out just as today, but probably more?


 *
 * BIND handle, offset [user provided]
 * ALLOC[32,64] handle, flags, *offset [kernel provided, need RELOC]
 * RELOC[32,64] handle, target_handle, offset, delta
 * CLEAR flags, handle
 * UNBIND handle


Explicit VMA management? Separate ioctl maybe would be better?


 *
 * BATCH flags, handle, offset
 * [or SVM flags, address]
 *   PIN flags (MAY_RELOC), count, handle[count]
 *   FENCE flags, count, handle[count]
 * SUBMIT handle [fence/NULL with error]
 */


No idea again. :)


At the moment it is just trying to do execbuf2, but more compactly and
with fewer ioctls. But one of the main selling points is that we can
extend the information passed around more freely than execbuf2.)


I have nothing against a better eb since I trust you know much better it 
is needed and when. But I don't know how long it will take to get there. 
This class/instance idea could be implemented quickly to solve the sore 
point of VCS/VCS2 engine selection. But yeah, it is another uABI to keep 
in that case.


Regards,

Tvrtko
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH i-g-t v2] tests/meta_test: Add a meta test for sanity checks of CI systems

2017-04-24 Thread Abdiel Janulgue
On 19.04.2017 13:13, Marta Lofstedt wrote:
> +
> +__attribute__((format(printf, 1, 2)))
> +static void kmsg(const char *format, ...)
> +#define KERN_EMER"<0>"
> +#define KERN_ALERT   "<1>"
> +#define KERN_CRIT"<2>"
> +#define KERN_ERR "<3>"
> +#define KERN_WARNING "<4>"
> +#define KERN_NOTICE  "<5>"
> +#define KERN_INFO"<6>"
> +#define KERN_DEBUG   "<7>"
> +{
> + va_list ap;
> + FILE *file;
> +
> + file = fopen("/dev/kmsg", "w");
> + if (file == NULL)
> + return;
> +
> + va_start(ap, format);
> + vfprintf(file, format, ap);
> + va_end(ap);
> + fclose(file);
> +}
> +

igt_core.c already implements kmsg(). Is there a reason why not to
export and re-use that instead?

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [Mesa-dev] [RFC 1/2] drm/i915: Engine discovery uAPI

2017-04-24 Thread Tvrtko Ursulin


On 19/04/2017 06:22, Kenneth Graunke wrote:

On Tuesday, April 18, 2017 9:56:14 AM PDT Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Engine discovery uAPI allows userspace to probe for engine
configuration and features without needing to maintain the
internal PCI id based database.


I don't understand why I would want to query the existence of engines
from the kernel.  As a userspace driver developer, I have to know how to
program the specific generation of hardware I'm on.  I better know what
engines that GPU has, or else there's no way I can competently program it.

In Mesa, we recently imported libdrm and deleted all the engine checks
(a460e1eb51406e5ca54abda42112bfb8523ff046).  All generations have an
RCS, Gen6+ has a separate BLT, and we don't need to use the others.
It's completely statically determinable with a simple check.  Runtime
checks make sense for optional things...but not "is there a 3D engine?".

Plus, even if you add this to the kernel, we still support back to 3.6
(and ChromeOS needs us to continue supporting 3.8), so we won't be able
to magically use the new uABI - we'd need to support both.  Which, if
the point is to delete code...we'd actually have /more/ code for a few
years.  Or, we could not use it...but then nobody would be testing it,
and if a bug creeps in...that pushes it back more years still.


Okay the argument of more code for a while is I suppose always true with 
these types of work. But in general, the idea is to consolidate the 
queries and avoid (may be only partially) duplicate PCI databases across 
components.


Because I suspect today you do some device discovery via libpciaccess 
(or something) and some via i915 GET_PARAMs and so. So the idea is to 
consolidate all that and do it via i915. Since another argument you 
raise, is that you have to know how does the GPU looks like to be able 
to competently program it, in which case who knows better than the 
kernel driver?


But I think the main part of the argument is that why collect and derive 
this information from various sources when perhaps it could only be one.


Maybe the exact idea is not so interesting for Mesa, which I wouldn't be 
surprised at all since the idea was born from libva and the BSD engine 
usage there. In which case perhaps Mesa could become more interested if 
the proposal was exporting some other data to userspace?


I don't think it is critical to find something like that in Mesa, but it 
may be interesting. I think Ben mentioned at one point he had some ideas 
in this area, or something was discussed in the past which may be 
similar. I forgot the exact details now.


So I think for now, if there is nothing which would be interesting in 
Mesa along the lines described so far, please just keep an eye on this. 
Just to make sure if some other component will be interested, and we end 
up starting to implement something, it is at least not hindering you, if 
we cannot find anything useful for you in here.


Regards,

Tvrtko
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [RFC 1/2] drm/i915: Engine discovery uAPI

2017-04-24 Thread Tvrtko Ursulin


On 18/04/2017 21:13, Chris Wilson wrote:

On Tue, Apr 18, 2017 at 05:56:14PM +0100, Tvrtko Ursulin wrote:

+enum drm_i915_gem_engine_class {
+   DRM_I915_ENGINE_CLASS_OTHER = 0,
+   DRM_I915_ENGINE_CLASS_RENDER = 1,
+   DRM_I915_ENGINE_CLASS_COPY = 2,
+   DRM_I915_ENGINE_CLASS_VIDEO_DECODE = 3,
+   DRM_I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
+   DRM_I915_ENGINE_CLASS_MAX /* non-ABI */
+};
+
+struct drm_i915_engine_info {
+   /** Engine instance number. */
+   __u32   instance;
+   __u32   rsvd;
+
+   /** Engine specific info. */
+#define DRM_I915_ENGINE_HAS_HEVC   BIT(0)
+   __u64 info;
+};


So the main question is how can we extend this in future, keeping
forwards/backwards compat?

I think if we put a query version into info and then the kernel supplies
an array matching that version (or reports the most recent version
supported if the request is too modern.)

The kernel has to keep all the old struct variants and exporters
indefinitely.


Versioning sounds good to me.


Another alternative would get an ENGINE_GETPARAM where we just have a
switch of all possibily questions. Maybe better as a CONTEXT_GETPARAM if
we start thinking about allowing CONTEXT_SETPARAM to fine tune
individual clients.


This idea I did not get - what is the switch of all possible questions? 
You mean new ioctl like ENGINE_GETPARAM which would return a list of 
queries supported by CONTEXT_GETPARAM? Which would effectively be a 
dispatcher-in-dispatcher kind of thing?



+struct drm_i915_gem_engine_info {
+   /** in: Engine class to probe (enum drm_i915_gem_engine_class). */
+   __u32 engine_class;


__u32 [in/out] version ? (see above)


+
+   /** out: Actual number of hardware engines. */
+   __u32 num_engines;
+
+   /**
+* in: Number of struct drm_i915_engine_ifo entries in the provided
+* info array.
+*/
+   __u32 info_size;


This is superfluous with num_engines. The standard 2 pass, discovery of
size, followed by allocation and final query.


This is also fine. I was one the fence whether to actually condense it 
to one field in the first posting or not myself.


Regards,

Tvrtko
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 12/15] drm/i915: Two stage watermarks for g4x

2017-04-24 Thread Maarten Lankhorst
On 21-04-17 20:14, ville.syrj...@linux.intel.com wrote:
> From: Ville Syrjälä 
>
> Implement proper two stage watermark programming for g4x. As with
> other pre-SKL platforms, the watermark registers aren't double
> buffered on g4x. Hence we must sequence the watermark update
> carefully around plane updates.
>
> The code is quite heavily modelled on the VLV/CHV code, with some
> fairly significant differences due to the different hardware
> architecture:
> * g4x doesn't use inverted watermark values
> * CxSR actually affects the watermarks since it controls memory self
>   refresh in addition to the max FIFO mode
> * A further HPLL SR mode is possible with higher memory wakeup
>   latency
> * g4x has FBC2 and so it also has FBC watermarks
> * max FIFO mode for primary plane only (cursor is allowed, sprite is not)
> * g4x has no manual FIFO repartitioning
> * some TLB miss related workarounds are needed for the watermarks
>
> Actually the hardware is quite similar to ILK+ in many ways. The
> most visible differences are in the actual watermakr register
> layout. ILK revamped that part quite heavily whereas g4x is still
> using the layout inherited from earlier platforms.
>
> Note that we didn't previously enable the HPLL SR on g4x. So in order
> to not introduce too many functional changes in this patch I've not
> actually enabled it here either, even though the code is now fully
> ready for it. We'll enable it separately later on.
>
> Signed-off-by: Ville Syrjälä 
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c  |  12 +-
>  drivers/gpu/drm/i915/i915_drv.h  |  12 +
>  drivers/gpu/drm/i915/intel_display.c |  25 +-
>  drivers/gpu/drm/i915/intel_drv.h |  28 ++
>  drivers/gpu/drm/i915/intel_pm.c  | 942 
> +++
>  5 files changed, 792 insertions(+), 227 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
> b/drivers/gpu/drm/i915/i915_debugfs.c
> index 870c470177b5..69550d31099e 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3898,6 +3898,8 @@ static void wm_latency_show(struct seq_file *m, const 
> uint16_t wm[8])
>   num_levels = 3;
>   else if (IS_VALLEYVIEW(dev_priv))
>   num_levels = 1;
> + else if (IS_G4X(dev_priv))
> + num_levels = 3;
>   else
>   num_levels = ilk_wm_max_level(dev_priv) + 1;
>  
> @@ -3910,8 +3912,10 @@ static void wm_latency_show(struct seq_file *m, const 
> uint16_t wm[8])
>* - WM1+ latency values in 0.5us units
>* - latencies are in us on gen9/vlv/chv
>*/
> - if (INTEL_GEN(dev_priv) >= 9 || IS_VALLEYVIEW(dev_priv) ||
> - IS_CHERRYVIEW(dev_priv))
> + if (INTEL_GEN(dev_priv) >= 9 ||
> + IS_VALLEYVIEW(dev_priv) ||
> + IS_CHERRYVIEW(dev_priv) ||
> + IS_G4X(dev_priv))
>   latency *= 10;
>   else if (level > 0)
>   latency *= 5;
> @@ -3972,7 +3976,7 @@ static int pri_wm_latency_open(struct inode *inode, 
> struct file *file)
>  {
>   struct drm_i915_private *dev_priv = inode->i_private;
>  
> - if (INTEL_GEN(dev_priv) < 5)
> + if (INTEL_GEN(dev_priv) < 5 && !IS_G4X(dev_priv))
>   return -ENODEV;
>  
>   return single_open(file, pri_wm_latency_show, dev_priv);
> @@ -4014,6 +4018,8 @@ static ssize_t wm_latency_write(struct file *file, 
> const char __user *ubuf,
>   num_levels = 3;
>   else if (IS_VALLEYVIEW(dev_priv))
>   num_levels = 1;
> + else if (IS_G4X(dev_priv))
> + num_levels = 3;
>   else
>   num_levels = ilk_wm_max_level(dev_priv) + 1;
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 0a393fdc53d1..6df8bff7f5a7 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1762,11 +1762,13 @@ struct ilk_wm_values {
>  
>  struct g4x_pipe_wm {
>   uint16_t plane[I915_MAX_PLANES];
> + uint16_t fbc;
>  };
>  
>  struct g4x_sr_wm {
>   uint16_t plane;
>   uint16_t cursor;
> + uint16_t fbc;
>  };
>  
>  struct vlv_wm_ddl_values {
> @@ -1781,6 +1783,15 @@ struct vlv_wm_values {
>   bool cxsr;
>  };
>  
> +struct g4x_wm_values {
> + struct g4x_pipe_wm pipe[2];
> + struct g4x_sr_wm sr;
> + struct g4x_sr_wm hpll;
> + bool cxsr;
> + bool hpll_en;
> + bool fbc_en;
> +};
> +
>  struct skl_ddb_entry {
>   uint16_t start, end;/* in number of blocks, 'end' is exclusive */
>  };
> @@ -2410,6 +2421,7 @@ struct drm_i915_private {
>   struct ilk_wm_values hw;
>   struct skl_wm_values skl_hw;
>   struct vlv_wm_values vlv;
> + struct g4x_wm_values g4x;
>   };
>  
>   uint8_t 

[Intel-gfx] [PATCH v5 11/15] drm/i915/perf: Add 'render basic' Gen8+ OA unit configs

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

Adds a static OA unit, MUX, B Counter + Flex EU configurations for basic
render metrics on Broadwell, Cherryview, Skylake and Broxton. These are
auto generated from an XML description of metric sets, currently
maintained in gputop, ref:

 https://github.com/rib/gputop
 > gputop-data/oa-*.xml
 > scripts/i915-perf-kernelgen.py

 $ make -C gputop-data -f Makefile.xml WHITELIST=RenderBasic

v2: add newlines to debug messages + fix comment (Matthew Auld)

Signed-off-by: Robert Bragg 
Reviewed-by: Matthew Auld 
Acked-by: Lionel Landwerlin 
---
 drivers/gpu/drm/i915/Makefile |   8 +-
 drivers/gpu/drm/i915/i915_drv.h   |   2 +
 drivers/gpu/drm/i915/i915_oa_bdw.c| 380 ++
 drivers/gpu/drm/i915/i915_oa_bdw.h|  38 
 drivers/gpu/drm/i915/i915_oa_bxt.c| 238 +
 drivers/gpu/drm/i915/i915_oa_bxt.h|  38 
 drivers/gpu/drm/i915/i915_oa_chv.c| 225 
 drivers/gpu/drm/i915/i915_oa_chv.h|  38 
 drivers/gpu/drm/i915/i915_oa_sklgt2.c | 228 
 drivers/gpu/drm/i915/i915_oa_sklgt2.h |  38 
 drivers/gpu/drm/i915/i915_oa_sklgt3.c | 236 +
 drivers/gpu/drm/i915/i915_oa_sklgt3.h |  38 
 drivers/gpu/drm/i915/i915_oa_sklgt4.c | 247 ++
 drivers/gpu/drm/i915/i915_oa_sklgt4.h |  38 
 14 files changed, 1791 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/i915/i915_oa_bdw.c
 create mode 100644 drivers/gpu/drm/i915/i915_oa_bdw.h
 create mode 100644 drivers/gpu/drm/i915/i915_oa_bxt.c
 create mode 100644 drivers/gpu/drm/i915/i915_oa_bxt.h
 create mode 100644 drivers/gpu/drm/i915/i915_oa_chv.c
 create mode 100644 drivers/gpu/drm/i915/i915_oa_chv.h
 create mode 100644 drivers/gpu/drm/i915/i915_oa_sklgt2.c
 create mode 100644 drivers/gpu/drm/i915/i915_oa_sklgt2.h
 create mode 100644 drivers/gpu/drm/i915/i915_oa_sklgt3.c
 create mode 100644 drivers/gpu/drm/i915/i915_oa_sklgt3.h
 create mode 100644 drivers/gpu/drm/i915/i915_oa_sklgt4.c
 create mode 100644 drivers/gpu/drm/i915/i915_oa_sklgt4.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 2cf04504e494..41400a138a1e 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -127,7 +127,13 @@ i915-y += i915_vgpu.o

 # perf code
 i915-y += i915_perf.o \
- i915_oa_hsw.o
+ i915_oa_hsw.o \
+ i915_oa_bdw.o \
+ i915_oa_chv.o \
+ i915_oa_sklgt2.o \
+ i915_oa_sklgt3.o \
+ i915_oa_sklgt4.o \
+ i915_oa_bxt.o

 ifeq ($(CONFIG_DRM_I915_GVT),y)
 i915-y += intel_gvt.o
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index fb2188c8d72f..ffa1fc5eddfd 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2466,6 +2466,8 @@ struct drm_i915_private {
int mux_regs_len;
const struct i915_oa_reg *b_counter_regs;
int b_counter_regs_len;
+   const struct i915_oa_reg *flex_regs;
+   int flex_regs_len;

struct {
struct i915_vma *vma;
diff --git a/drivers/gpu/drm/i915/i915_oa_bdw.c 
b/drivers/gpu/drm/i915/i915_oa_bdw.c
new file mode 100644
index ..b0b1b75fb431
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_oa_bdw.c
@@ -0,0 +1,380 @@
+/*
+ * Autogenerated file, DO NOT EDIT manually!
+ *
+ * Copyright (c) 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include 
+
+#include "i915_drv.h"
+#include "i915_oa_bdw.h"
+
+enum metric_set_id {
+   METRIC_SET_ID_RENDER_BASIC = 1,
+};
+
+int i915_oa_n_builtin_metric_sets_bdw = 1;
+
+static const struct i915_oa_reg 

[Intel-gfx] [PATCH v5 01/15] drm/i915/perf: fix gen7_append_oa_reports comment

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

If I'm going to complain about a back-to-front convention then the least
I can do is not muddle the comment up too.

Signed-off-by: Robert Bragg 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/i915_perf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 060b171480d5..78fef53b45c9 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -431,7 +431,7 @@ static int append_oa_sample(struct i915_perf_stream *stream,
  * userspace.
  *
  * Note: reports are consumed from the head, and appended to the
- * tail, so the head chases the tail?... If you think that's mad
+ * tail, so the tail chases the head?... If you think that's mad
  * and back-to-front you're not alone, but this follows the
  * Gen PRM naming convention.
  *
--
2.11.0
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v5 02/15] drm/i915/perf: avoid poll, read, EAGAIN busy loops

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

If the function for checking whether there is OA buffer data available
(during a poll or blocking read) has false positives then we want to
avoid a situation where the subsequent read() returns EAGAIN (after
a more accurate check) followed by a poll() immediately reporting
the same false positive POLLIN event and effectively maintaining a
busy loop until there really is data.

This makes sure that we clear the .pollin event status whenever we
return EAGAIN to userspace which will throttle subsequent POLLIN events
and repeated attempts to read to the 5ms intervals of the hrtimer
callback we have.

Signed-off-by: Robert Bragg 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/i915_perf.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 78fef53b45c9..f59f6dd20922 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1352,7 +1352,15 @@ static ssize_t i915_perf_read(struct file *file,
mutex_unlock(_priv->perf.lock);
}

-   if (ret >= 0) {
+   /* We allow the poll checking to sometimes report false positive POLLIN
+* events where we might actually report EAGAIN on read() if there's
+* not really any data available. In this situation though we don't
+* want to enter a busy loop between poll() reporting a POLLIN event
+* and read() returning -EAGAIN. Clearing the oa.pollin state here
+* effectively ensures we back off until the next hrtimer callback
+* before reporting another POLLIN event.
+*/
+   if (ret >= 0 || ret == -EAGAIN) {
/* Maybe make ->pollin per-stream state if we support multiple
 * concurrent streams in the future.
 */
--
2.11.0
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH 00/15] Enable OA unit for Gen 8 and 9 in i915 perf

2017-04-24 Thread Lionel Landwerlin
Hi,

Taking over from Rob for this v5. This series only has the following
changes from v4 :

 - patch 9 & 10 : updated number for GETPARAM after rebase

 - patch 12 : drain the GPU before reconfiguring the OA unit to work
   around a race condition where the CPU & GPU update the context
   image at the same time (see gen8_configure_all_contexts)

The second item had been causing instabilities for a while in the IGT
tests. With that out of the way, tests are a lot more stable now.

Cheers,

Robert Bragg (15):
  drm/i915/perf: fix gen7_append_oa_reports comment
  drm/i915/perf: avoid poll, read, EAGAIN busy loops
  drm/i915/perf: avoid read back of head register
  drm/i915/perf: no head/tail ref in gen7_oa_read
  drm/i915/perf: improve tail race workaround
  drm/i915/perf: improve invalid OA format debug message
  drm/i915/perf: better pipeline aged/aging tail updates
  drm/i915/perf: rate limit spurious oa report notice
  drm/i915: expose _SLICE_MASK GETPARM
  drm/i915: expose _SUBSLICE_MASK GETPARM
  drm/i915/perf: Add 'render basic' Gen8+ OA unit configs
  drm/i915/perf: Add OA unit support for Gen 8+
  drm/i915/perf: Add more OA configs for BDW, CHV, SKL + BXT
  drm/i915/perf: per-gen timebase for checking sample freq
  drm/i915/perf: remove perf.hook_lock

 drivers/gpu/drm/i915/Makefile   |8 +-
 drivers/gpu/drm/i915/i915_drv.c |   10 +
 drivers/gpu/drm/i915/i915_drv.h |  121 +-
 drivers/gpu/drm/i915/i915_gem_context.h |1 +
 drivers/gpu/drm/i915/i915_oa_bdw.c  | 5154 +++
 drivers/gpu/drm/i915/i915_oa_bdw.h  |   38 +
 drivers/gpu/drm/i915/i915_oa_bxt.c  | 2541 +++
 drivers/gpu/drm/i915/i915_oa_bxt.h  |   38 +
 drivers/gpu/drm/i915/i915_oa_chv.c  | 2730 
 drivers/gpu/drm/i915/i915_oa_chv.h  |   38 +
 drivers/gpu/drm/i915/i915_oa_hsw.c  |   58 +-
 drivers/gpu/drm/i915/i915_oa_sklgt2.c   | 3302 
 drivers/gpu/drm/i915/i915_oa_sklgt2.h   |   38 +
 drivers/gpu/drm/i915/i915_oa_sklgt3.c   | 2856 +
 drivers/gpu/drm/i915/i915_oa_sklgt3.h   |   38 +
 drivers/gpu/drm/i915/i915_oa_sklgt4.c   | 2910 +
 drivers/gpu/drm/i915/i915_oa_sklgt4.h   |   38 +
 drivers/gpu/drm/i915/i915_perf.c| 1361 ++--
 drivers/gpu/drm/i915/i915_reg.h |   22 +
 drivers/gpu/drm/i915/intel_lrc.c|5 +
 include/uapi/drm/i915_drm.h |   27 +-
 21 files changed, 21096 insertions(+), 238 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_oa_bdw.c
 create mode 100644 drivers/gpu/drm/i915/i915_oa_bdw.h
 create mode 100644 drivers/gpu/drm/i915/i915_oa_bxt.c
 create mode 100644 drivers/gpu/drm/i915/i915_oa_bxt.h
 create mode 100644 drivers/gpu/drm/i915/i915_oa_chv.c
 create mode 100644 drivers/gpu/drm/i915/i915_oa_chv.h
 create mode 100644 drivers/gpu/drm/i915/i915_oa_sklgt2.c
 create mode 100644 drivers/gpu/drm/i915/i915_oa_sklgt2.h
 create mode 100644 drivers/gpu/drm/i915/i915_oa_sklgt3.c
 create mode 100644 drivers/gpu/drm/i915/i915_oa_sklgt3.h
 create mode 100644 drivers/gpu/drm/i915/i915_oa_sklgt4.c
 create mode 100644 drivers/gpu/drm/i915/i915_oa_sklgt4.h

--
2.11.0
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v5 05/15] drm/i915/perf: improve tail race workaround

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

There's a HW race condition between OA unit tail pointer register
updates and writes to memory whereby the tail pointer can sometimes get
ahead of what's been written out to the OA buffer so far (in terms of
what's visible to the CPU).

Although this can be observed explicitly while copying reports to
userspace by checking for a zeroed report-id field in tail reports, we
want to account for this earlier, as part of the _oa_buffer_check to
avoid lots of redundant read() attempts.

Previously the driver used to define an effective tail pointer that
lagged the real pointer by a 'tail margin' measured in bytes derived
from OA_TAIL_MARGIN_NSEC and the configured sampling frequency.
Unfortunately this was flawed considering that the OA unit may also
automatically generate non-periodic reports (such as on context switch)
or the OA unit may be enabled without any periodic sampling.

This improves how we define a tail pointer for reading that lags the
real tail pointer by at least %OA_TAIL_MARGIN_NSEC nanoseconds, which
gives enough time for the corresponding reports to become visible to the
CPU.

The driver now maintains two tail pointers:
 1) An 'aging' tail with an associated timestamp that is tracked until we
can trust the corresponding data is visible to the CPU; at which point
it is considered 'aged'.
 2) An 'aged' tail that can be used for read()ing.

The two separate pointers let us decouple read()s from tail pointer aging.

The tail pointers are checked and updated at a limited rate within a
hrtimer callback (the same callback that is used for delivering POLLIN
events) and since we're now measuring the wall clock time elapsed since
a given tail pointer was read the mechanism no longer cares about
the OA unit's periodic sampling frequency.

The natural place to handle the tail pointer updates was in
gen7_oa_buffer_is_empty() which is called as part of blocking reads and
the hrtimer callback used for polling, and so this was renamed to
oa_buffer_check() considering the added side effect while checking
whether the buffer contains data.

Signed-off-by: Robert Bragg 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/i915_drv.h  |  60 -
 drivers/gpu/drm/i915/i915_perf.c | 277 ++-
 2 files changed, 241 insertions(+), 96 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b618a8bc54f6..69a914870774 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2097,7 +2097,7 @@ struct i915_oa_ops {
size_t *offset);

/**
-* @oa_buffer_is_empty: Check if OA buffer empty (false positives OK)
+* @oa_buffer_check: Check for OA buffer data + update tail
 *
 * This is either called via fops or the poll check hrtimer (atomic
 * ctx) without any locks taken.
@@ -2110,7 +2110,7 @@ struct i915_oa_ops {
 * here, which will be handled gracefully - likely resulting in an
 * %EAGAIN error for userspace.
 */
-   bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv);
+   bool (*oa_buffer_check)(struct drm_i915_private *dev_priv);
 };

 struct intel_cdclk_state {
@@ -2453,9 +2453,6 @@ struct drm_i915_private {

bool periodic;
int period_exponent;
-   int timestamp_frequency;
-
-   int tail_margin;

int metrics_set;

@@ -2471,6 +2468,59 @@ struct drm_i915_private {
int format_size;

/**
+* Locks reads and writes to all head/tail state
+*
+* Consider: the head and tail pointer state
+* needs to be read consistently from a hrtimer
+* callback (atomic context) and read() fop
+* (user context) with tail pointer updates
+* happening in atomic context and head updates
+* in user context and the (unlikely)
+* possibility of read() errors needing to
+* reset all head/tail state.
+*
+* Note: Contention or performance aren't
+* currently a significant concern here
+* considering the relatively low frequency of
+* hrtimer callbacks (5ms period) and that
+* reads typically only happen in response to a
+* hrtimer event and likely complete before the
+* next callback.
+   

[Intel-gfx] [PATCH v5 12/15] drm/i915/perf: Add OA unit support for Gen 8+

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

Enables access to OA unit metrics for BDW, CHV, SKL and BXT which all
share (more-or-less) the same OA unit design.

Of particular note in comparison to Haswell: some OA unit HW config
state has become per-context state and as a consequence it is somewhat
more complicated to manage synchronous state changes from the cpu while
there's no guarantee of what context (if any) is currently actively
running on the gpu.

The periodic sampling frequency which can be particularly useful for
system-wide analysis (as opposed to command stream synchronised
MI_REPORT_PERF_COUNT commands) is perhaps the most surprising state to
have become per-context save and restored (while the OABUFFER
destination is still a shared, system-wide resource).

This support for gen8+ takes care to consider a number of timing
challenges involved in synchronously updating per-context state
primarily by programming all config state from the cpu and updating all
current and saved contexts synchronously while the OA unit is still
disabled.

The driver intentionally avoids depending on command streamer
programming to update OA state considering the lack of synchronization
between the automatic loading of OACTXCONTROL state (that includes the
periodic sampling state and enable state) on context restore and the
parsing of any general purpose BB the driver can control. I.e. this
implementation is careful to avoid the possibility of a context restore
temporarily enabling any out-of-date periodic sampling state. In
addition to the risk of transiently-out-of-date state being loaded
automatically; there are also internal HW latencies involved in the
loading of MUX configurations which would be difficult to account for
from the command streamer (and we only want to enable the unit once
the MUX configuration is complete).

Since the Gen8+ OA unit design no longer supports clock gating the unit
off for a single given context (which effectively stopped any progress
of counters while any other context was running) and instead supports
tagging OA reports with a context ID for filtering on the CPU, it means
we can no longer hide the system-wide progress of counters from a
non-privileged application only interested in metrics for its own
context. Although we could theoretically try and subtract the progress
of other contexts before forwarding reports via read() we aren't in a
position to filter reports captured via MI_REPORT_PERF_COUNT commands.
As a result, for Gen8+, we always require the
dev.i915.perf_stream_paranoid to be unset for any access to OA metrics
if not root.

v5: Drain submitted requests when enabling metric set to ensure no
lite-restore erases the context image we just updated (Lionel)

Signed-off-by: Robert Bragg 
Signed-off-by: Lionel Landwerlin 
Reviewed-by: Matthew Auld  \o/
---
 drivers/gpu/drm/i915/i915_drv.h |  45 +-
 drivers/gpu/drm/i915/i915_gem_context.h |   1 +
 drivers/gpu/drm/i915/i915_perf.c| 969 +---
 drivers/gpu/drm/i915/i915_reg.h |  22 +
 drivers/gpu/drm/i915/intel_lrc.c|   5 +
 include/uapi/drm/i915_drm.h |  19 +-
 6 files changed, 968 insertions(+), 93 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ffa1fc5eddfd..41a97ade39ea 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2064,9 +2064,17 @@ struct i915_oa_ops {
void (*init_oa_buffer)(struct drm_i915_private *dev_priv);

/**
-* @enable_metric_set: Applies any MUX configuration to set up the
-* Boolean and Custom (B/C) counters that are part of the counter
-* reports being sampled. May apply system constraints such as
+* @select_metric_set: The auto generated code that checks whether a
+* requested OA config is applicable to the system and if so sets up
+* the mux, oa and flex eu register config pointers according to the
+* current dev_priv->perf.oa.metrics_set.
+*/
+   int (*select_metric_set)(struct drm_i915_private *dev_priv);
+
+   /**
+* @enable_metric_set: Selects and applies any MUX configuration to set
+* up the Boolean and Custom (B/C) counters that are part of the
+* counter reports being sampled. May apply system constraints such as
 * disabling EU clock gating as required.
 */
int (*enable_metric_set)(struct drm_i915_private *dev_priv);
@@ -2097,20 +2105,13 @@ struct i915_oa_ops {
size_t *offset);

/**
-* @oa_buffer_check: Check for OA buffer data + update tail
-*
-* This is either called via fops or the poll check hrtimer (atomic
-* ctx) without any locks taken.
+* @oa_hw_tail_read: read the OA tail pointer register
 *
-* It's safe to read OA config state here 

[Intel-gfx] [PATCH v5 08/15] drm/i915/perf: rate limit spurious oa report notice

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

This change is pre-emptively aiming to avoid a potential cause of kernel
logging noise in case some condition were to result in us seeing invalid
OA reports.

The workaround for the OA unit's tail pointer race condition is what
avoids the primary known cause of invalid reports being seen and with
that in place we aren't expecting to see this notice but it can't be
entirely ruled out.

Just in case some condition does lead to the notice then it's likely
that it will be triggered repeatedly while attempting to append a
sequence of reports and depending on the configured OA sampling
frequency that might be a large number of repeat notices.

v2: (Chris) avoid inconsistent warning on throttle with
printk_ratelimit()
v3: (Matt) init and summarise with stream init/close not driver init/fini

Signed-off-by: Robert Bragg 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/i915_drv.h  |  6 ++
 drivers/gpu/drm/i915/i915_perf.c | 28 +++-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 69a914870774..fb2188c8d72f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2451,6 +2451,12 @@ struct drm_i915_private {
wait_queue_head_t poll_wq;
bool pollin;

+   /**
+* For rate limiting any notifications of spurious
+* invalid OA reports
+*/
+   struct ratelimit_state spurious_report_rs;
+
bool periodic;
int period_exponent;

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 5738b99caa5b..3277a52ce98e 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -632,7 +632,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream 
*stream,
 * copying it to userspace...
 */
if (report32[0] == 0) {
-   DRM_NOTE("Skipping spurious, invalid OA report\n");
+   if (__ratelimit(_priv->perf.oa.spurious_report_rs))
+   DRM_NOTE("Skipping spurious, invalid OA 
report\n");
continue;
}

@@ -913,6 +914,11 @@ static void i915_oa_stream_destroy(struct i915_perf_stream 
*stream)
oa_put_render_ctx_id(stream);

dev_priv->perf.oa.exclusive_stream = NULL;
+
+   if (dev_priv->perf.oa.spurious_report_rs.missed) {
+   DRM_NOTE("%d spurious OA report notices suppressed due to 
ratelimiting\n",
+dev_priv->perf.oa.spurious_report_rs.missed);
+   }
 }

 static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
@@ -1268,6 +1274,26 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
return -EINVAL;
}

+   /* We set up some ratelimit state to potentially throttle any _NOTES
+* about spurious, invalid OA reports which we don't forward to
+* userspace.
+*
+* The initialization is associated with opening the stream (not driver
+* init) considering we print a _NOTE about any throttling when closing
+* the stream instead of waiting until driver _fini which no one would
+* ever see.
+*
+* Using the same limiting factors as printk_ratelimit()
+*/
+   ratelimit_state_init(_priv->perf.oa.spurious_report_rs,
+5 * HZ, 10);
+   /* Since we use a DRM_NOTE for spurious reports it would be
+* inconsistent to let __ratelimit() automatically print a warning for
+* throttling.
+*/
+   ratelimit_set_flags(_priv->perf.oa.spurious_report_rs,
+   RATELIMIT_MSG_ON_RELEASE);
+
stream->sample_size = sizeof(struct drm_i915_perf_record_header);

format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size;
--
2.11.0
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v5 09/15] drm/i915: expose _SLICE_MASK GETPARM

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

Enables userspace to determine the number of slices enabled and also
know what specific slices are enabled. This information is required, for
example, to be able to analyse some OA counter reports where the counter
configuration depends on the HW slice configuration.

Signed-off-by: Robert Bragg 
Reviewed-by: Matthew Auld 
Acked-by: Lionel Landwerlin 
---
 drivers/gpu/drm/i915/i915_drv.c | 5 +
 include/uapi/drm/i915_drm.h | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index cc7393e65e99..f13b2c3ce6eb 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -358,6 +358,11 @@ static int i915_getparam(struct drm_device *dev, void 
*data,
 */
value = 1;
break;
+   case I915_PARAM_SLICE_MASK:
+   value = INTEL_INFO(dev_priv)->sseu.slice_mask;
+   if (!value)
+   return -ENODEV;
+   break;
default:
DRM_DEBUG("Unknown parameter %d\n", param->param);
return -EINVAL;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index f24a80d2d42e..25695c3d9a76 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -418,6 +418,9 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_EXEC_CAPTURE 45

+/* Query the mask of slices available for this system */
+#define I915_PARAM_SLICE_MASK   46
+
 typedef struct drm_i915_getparam {
__s32 param;
/*
--
2.11.0
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v5 03/15] drm/i915/perf: avoid read back of head register

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

There's no need for the driver to keep reading back the head pointer
from hardware since the hardware doesn't update it automatically. This
way we can treat any invalid head pointer value as a software/driver
bug instead of spurious hardware behaviour.

This change is also a small stepping stone towards re-working how
the head and tail state is managed as part of an improved workaround
for the tail register race condition.

Signed-off-by: Robert Bragg 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/i915_drv.h  | 11 ++
 drivers/gpu/drm/i915/i915_perf.c | 46 ++--
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 357b6c6c2f04..b618a8bc54f6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2469,6 +2469,17 @@ struct drm_i915_private {
u8 *vaddr;
int format;
int format_size;
+
+   /**
+* Although we can always read back the head
+* pointer register, we prefer to avoid
+* trusting the HW state, just to avoid any
+* risk that some hardware condition could
+* somehow bump the head pointer unpredictably
+* and cause us to forward the wrong OA buffer
+* data to userspace.
+*/
+   u32 head;
} oa_buffer;

u32 gen7_latched_oastatus1;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index f59f6dd20922..f47d1cc2144b 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -322,9 +322,8 @@ struct perf_open_properties {
 static bool gen7_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private 
*dev_priv)
 {
int report_size = dev_priv->perf.oa.oa_buffer.format_size;
-   u32 oastatus2 = I915_READ(GEN7_OASTATUS2);
u32 oastatus1 = I915_READ(GEN7_OASTATUS1);
-   u32 head = oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
+   u32 head = dev_priv->perf.oa.oa_buffer.head;
u32 tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;

return OA_TAKEN(tail, head) <
@@ -458,16 +457,24 @@ static int gen7_append_oa_reports(struct i915_perf_stream 
*stream,
return -EIO;

head = *head_ptr - gtt_offset;
+
+   /* An out of bounds or misaligned head pointer implies a driver bug
+* since we are in full control of head pointer which should only
+* be incremented by multiples of the report size (notably also
+* all a power of two).
+*/
+   if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size,
+ "Inconsistent OA buffer head pointer = %u\n", head))
+   return -EIO;
+
tail -= gtt_offset;

/* The OA unit is expected to wrap the tail pointer according to the OA
-* buffer size and since we should never write a misaligned head
-* pointer we don't expect to read one back either...
+* buffer size
 */
-   if (tail > OA_BUFFER_SIZE || head > OA_BUFFER_SIZE ||
-   head % report_size) {
-   DRM_ERROR("Inconsistent OA buffer pointer (head = %u, tail = 
%u): force restart\n",
- head, tail);
+   if (tail > OA_BUFFER_SIZE) {
+   DRM_ERROR("Inconsistent OA buffer tail pointer = %u: force 
restart\n",
+ tail);
dev_priv->perf.oa.ops.oa_disable(dev_priv);
dev_priv->perf.oa.ops.oa_enable(dev_priv);
*head_ptr = I915_READ(GEN7_OASTATUS2) &
@@ -562,8 +569,6 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
size_t *offset)
 {
struct drm_i915_private *dev_priv = stream->dev_priv;
-   int report_size = dev_priv->perf.oa.oa_buffer.format_size;
-   u32 oastatus2;
u32 oastatus1;
u32 head;
u32 tail;
@@ -572,10 +577,9 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
if (WARN_ON(!dev_priv->perf.oa.oa_buffer.vaddr))
return -EIO;

-   oastatus2 = I915_READ(GEN7_OASTATUS2);
oastatus1 = I915_READ(GEN7_OASTATUS1);

-   head = oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
+   head = dev_priv->perf.oa.oa_buffer.head;
tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;

/* XXX: On Haswell we don't have a safe way to clear oastatus1
@@ -616,10 +620,9 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
dev_priv->perf.oa.ops.oa_disable(dev_priv);

[Intel-gfx] [PATCH v5 10/15] drm/i915: expose _SUBSLICE_MASK GETPARM

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

Assuming a uniform mask across all slices, this enables userspace to
determine the specific sub slices enabled. This information is required,
for example, to be able to analyse some OA counter reports where the
counter configuration depends on the HW sub slice configuration.

Signed-off-by: Robert Bragg 
Reviewed-by: Matthew Auld 
Acked-by: Lionel Landwerlin 
---
 drivers/gpu/drm/i915/i915_drv.c | 5 +
 include/uapi/drm/i915_drm.h | 5 +
 2 files changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index f13b2c3ce6eb..8769692b9408 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -363,6 +363,11 @@ static int i915_getparam(struct drm_device *dev, void 
*data,
if (!value)
return -ENODEV;
break;
+   case I915_PARAM_SUBSLICE_MASK:
+   value = INTEL_INFO(dev_priv)->sseu.subslice_mask;
+   if (!value)
+   return -ENODEV;
+   break;
default:
DRM_DEBUG("Unknown parameter %d\n", param->param);
return -EINVAL;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 25695c3d9a76..464547d08173 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -421,6 +421,11 @@ typedef struct drm_i915_irq_wait {
 /* Query the mask of slices available for this system */
 #define I915_PARAM_SLICE_MASK   46

+/* Assuming it's uniform for each slice, this queries the mask of subslices
+ * per-slice for this system.
+ */
+#define I915_PARAM_SUBSLICE_MASK47
+
 typedef struct drm_i915_getparam {
__s32 param;
/*
--
2.11.0
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v5 15/15] drm/i915/perf: remove perf.hook_lock

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

In earlier iterations of the i915-perf driver we had a number of
callbacks/hooks from other parts of the i915 driver to e.g. notify us
when a legacy context was pinned and these could run asynchronously with
respect to the stream file operations and might also run in atomic
context.

dev_priv->perf.hook_lock had been for serialising access to state needed
within these callbacks, but as the code has evolved some of the hooks
have gone away or are implemented to avoid needing to lock any state.

The remaining use of this lock was actually redundant considering how
the gen7 oacontrol state used to be updated as part of a context pin
hook.

Signed-off-by: Robert Bragg 
Reviewed-by: Matthew Auld 
Acked-by: Lionel Landwerlin 
---
 drivers/gpu/drm/i915/i915_drv.h  |  2 --
 drivers/gpu/drm/i915/i915_perf.c | 32 ++--
 2 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 30fe1fc04234..a6aacd70ce7e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2441,8 +2441,6 @@ struct drm_i915_private {
struct mutex lock;
struct list_head streams;

-   spinlock_t hook_lock;
-
struct {
struct i915_perf_stream *exclusive_stream;

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index bbd3f7fdd52a..d36918a2bb70 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1710,9 +1710,17 @@ static void gen8_disable_metric_set(struct 
drm_i915_private *dev_priv)
/* NOP */
 }

-static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
+static void gen7_oa_enable(struct drm_i915_private *dev_priv)
 {
-   lockdep_assert_held(&dev_priv->perf.hook_lock);
+   /* Reset buf pointers so we don't forward reports from before now.
+*
+* Think carefully if considering trying to avoid this, since it
+* also ensures status flags and the buffer itself are cleared
+* in error paths, and we have checks for invalid reports based
+* on the assumption that certain fields are written to zeroed
+* memory which this helps maintains.
+*/
+   gen7_init_oa_buffer(dev_priv);

if (dev_priv->perf.oa.exclusive_stream->enabled) {
struct i915_gem_context *ctx =
@@ -1735,25 +1743,6 @@ static void gen7_update_oacontrol_locked(struct 
drm_i915_private *dev_priv)
I915_WRITE(GEN7_OACONTROL, 0);
 }

-static void gen7_oa_enable(struct drm_i915_private *dev_priv)
-{
-   unsigned long flags;
-
-   /* Reset buf pointers so we don't forward reports from before now.
-*
-* Think carefully if considering trying to avoid this, since it
-* also ensures status flags and the buffer itself are cleared
-* in error paths, and we have checks for invalid reports based
-* on the assumption that certain fields are written to zeroed
-* memory which this helps maintains.
-*/
-   gen7_init_oa_buffer(dev_priv);
-
-   spin_lock_irqsave(&dev_priv->perf.hook_lock, flags);
-   gen7_update_oacontrol_locked(dev_priv);
-   spin_unlock_irqrestore(&dev_priv->perf.hook_lock, flags);
-}
-
 static void gen8_oa_enable(struct drm_i915_private *dev_priv)
 {
u32 report_format = dev_priv->perf.oa.oa_buffer.format;
@@ -3034,7 +3023,6 @@ void i915_perf_init(struct drm_i915_private *dev_priv)

	INIT_LIST_HEAD(&dev_priv->perf.streams);
	mutex_init(&dev_priv->perf.lock);
-   spin_lock_init(&dev_priv->perf.hook_lock);
	spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);

oa_sample_rate_hard_limit =
--
2.11.0
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v5 04/15] drm/i915/perf: no head/tail ref in gen7_oa_read

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

This avoids redundantly passing an (inout) head and tail pointer to
gen7_append_oa_reports() from gen7_oa_read which doesn't need to
reference either itself.

Moving the head/tail reads and writes into gen7_append_oa_reports should
have no functional effect except to avoid some redundant head pointer
writes in cases where nothing was copied to userspace.

This is a stepping stone towards updating how the head and tail pointer
state is managed to improve the workaround for the OA unit's tail
pointer race. It reduces the number of places we need to read/write the
head and tail pointers.

Signed-off-by: Robert Bragg 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/i915_perf.c | 51 +++-
 1 file changed, 19 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index f47d1cc2144b..83dc67a635fb 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -420,8 +420,6 @@ static int append_oa_sample(struct i915_perf_stream *stream,
  * @buf: destination buffer given by userspace
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
- * @head_ptr: (inout): the current oa buffer cpu read position
- * @tail: the current oa buffer gpu write position
  *
  * Notably any error condition resulting in a short read (-%ENOSPC or
  * -%EFAULT) will be returned even though one or more records may
@@ -439,9 +437,7 @@ static int append_oa_sample(struct i915_perf_stream *stream,
 static int gen7_append_oa_reports(struct i915_perf_stream *stream,
  char __user *buf,
  size_t count,
- size_t *offset,
- u32 *head_ptr,
- u32 tail)
+ size_t *offset)
 {
struct drm_i915_private *dev_priv = stream->dev_priv;
int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -449,14 +445,15 @@ static int gen7_append_oa_reports(struct i915_perf_stream 
*stream,
int tail_margin = dev_priv->perf.oa.tail_margin;
u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
u32 mask = (OA_BUFFER_SIZE - 1);
-   u32 head;
+   size_t start_offset = *offset;
+   u32 head, oastatus1, tail;
u32 taken;
int ret = 0;

if (WARN_ON(!stream->enabled))
return -EIO;

-   head = *head_ptr - gtt_offset;
+   head = dev_priv->perf.oa.oa_buffer.head - gtt_offset;

/* An out of bounds or misaligned head pointer implies a driver bug
 * since we are in full control of head pointer which should only
@@ -467,7 +464,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream 
*stream,
  "Inconsistent OA buffer head pointer = %u\n", head))
return -EIO;

-   tail -= gtt_offset;
+   oastatus1 = I915_READ(GEN7_OASTATUS1);
+   tail = (oastatus1 & GEN7_OASTATUS1_TAIL_MASK) - gtt_offset;

/* The OA unit is expected to wrap the tail pointer according to the OA
 * buffer size
@@ -477,8 +475,6 @@ static int gen7_append_oa_reports(struct i915_perf_stream 
*stream,
  tail);
dev_priv->perf.oa.ops.oa_disable(dev_priv);
dev_priv->perf.oa.ops.oa_enable(dev_priv);
-   *head_ptr = I915_READ(GEN7_OASTATUS2) &
-   GEN7_OASTATUS2_HEAD_MASK;
return -EIO;
}

@@ -542,7 +538,18 @@ static int gen7_append_oa_reports(struct i915_perf_stream 
*stream,
report32[0] = 0;
}

-   *head_ptr = gtt_offset + head;
+
+   if (start_offset != *offset) {
+   /* We removed the gtt_offset for the copy loop above, indexing
+* relative to oa_buf_base so put back here...
+*/
+   head += gtt_offset;
+
+   I915_WRITE(GEN7_OASTATUS2,
+  ((head & GEN7_OASTATUS2_HEAD_MASK) |
+   OA_MEM_SELECT_GGTT));
+   dev_priv->perf.oa.oa_buffer.head = head;
+   }

return ret;
 }
@@ -570,8 +577,6 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
 {
struct drm_i915_private *dev_priv = stream->dev_priv;
u32 oastatus1;
-   u32 head;
-   u32 tail;
int ret;

if (WARN_ON(!dev_priv->perf.oa.oa_buffer.vaddr))
@@ -579,9 +584,6 @@ static int gen7_oa_read(struct i915_perf_stream *stream,

oastatus1 = I915_READ(GEN7_OASTATUS1);

-   head = dev_priv->perf.oa.oa_buffer.head;
-   tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
-
/* XXX: On Haswell we don't have a safe way to clear oastatus1
 * bits while the OA unit is enabled (while the tail 

[Intel-gfx] [PATCH v5 07/15] drm/i915/perf: better pipeline aged/aging tail updates

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

This updates the tail pointer race workaround handling to updating the
'aged' pointer before looking to start aging a new one. There's the
possibility that there is already new data available and so we can
immediately start aging a new pointer without having to first wait for a
later hrtimer callback (and then another to age).

Signed-off-by: Robert Bragg 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/i915_perf.c | 41 ++--
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 08cc2b0dd734..5738b99caa5b 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -391,6 +391,29 @@ static bool gen7_oa_buffer_check_unlocked(struct 
drm_i915_private *dev_priv)

now = ktime_get_mono_fast_ns();

+   /* Update the aged tail
+*
+* Flip the tail pointer available for read()s once the aging tail is
+* old enough to trust that the corresponding data will be visible to
+* the CPU...
+*
+* Do this before updating the aging pointer in case we may be able to
+* immediately start aging a new pointer too (if new data has become
+* available) without needing to wait for a later hrtimer callback.
+*/
+   if (aging_tail != INVALID_TAIL_PTR &&
+   ((now - dev_priv->perf.oa.oa_buffer.aging_timestamp) >
+OA_TAIL_MARGIN_NSEC)) {
+   aged_idx ^= 1;
+   dev_priv->perf.oa.oa_buffer.aged_tail_idx = aged_idx;
+
+   aged_tail = aging_tail;
+
+   /* Mark that we need a new pointer to start aging... */
+   dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset = 
INVALID_TAIL_PTR;
+   aging_tail = INVALID_TAIL_PTR;
+   }
+
/* Update the aging tail
 *
 * We throttle aging tail updates until we have a new tail that
@@ -420,24 +443,6 @@ static bool gen7_oa_buffer_check_unlocked(struct 
drm_i915_private *dev_priv)
}
}

-   /* Update the aged tail
-*
-* Flip the tail pointer available for read()s once the aging tail is
-* old enough to trust that the corresponding data will be visible to
-* the CPU...
-*/
-   if (aging_tail != INVALID_TAIL_PTR &&
-   ((now - dev_priv->perf.oa.oa_buffer.aging_timestamp) >
-OA_TAIL_MARGIN_NSEC)) {
-   aged_idx ^= 1;
-   dev_priv->perf.oa.oa_buffer.aged_tail_idx = aged_idx;
-
-   aged_tail = aging_tail;
-
-   /* Mark that we need a new pointer to start aging... */
-   dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset = 
INVALID_TAIL_PTR;
-   }
-
	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);

return aged_tail == INVALID_TAIL_PTR ?
--
2.11.0
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v5 06/15] drm/i915/perf: improve invalid OA format debug message

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

A minor improvement to debugging output

Signed-off-by: Robert Bragg 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/i915_perf.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 18734e1926b9..08cc2b0dd734 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1904,11 +1904,13 @@ static int read_properties_unlocked(struct 
drm_i915_private *dev_priv,
break;
case DRM_I915_PERF_PROP_OA_FORMAT:
if (value == 0 || value >= I915_OA_FORMAT_MAX) {
-   DRM_DEBUG("Invalid OA report format\n");
+   DRM_DEBUG("Out-of-range OA report format 
%llu\n",
+ value);
return -EINVAL;
}
if (!dev_priv->perf.oa.oa_formats[value].size) {
-   DRM_DEBUG("Invalid OA report format\n");
+   DRM_DEBUG("Unsupported OA report format %llu\n",
+ value);
return -EINVAL;
}
props->oa_format = value;
--
2.11.0
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v5 14/15] drm/i915/perf: per-gen timebase for checking sample freq

2017-04-24 Thread Lionel Landwerlin
From: Robert Bragg 

An oa_exponent_to_ns() utility and per-gen timebase constants where
recently removed when updating the tail pointer race condition WA, and
this restores those so we can update the _PROP_OA_EXPONENT validation
done in read_properties_unlocked() to not assume we have a 12.5MHz
timebase as we did for Haswell.

Accordingly the oa_sample_rate_hard_limit value that's referenced by
proc_dointvec_minmax defining the absolute limit for the OA sampling
frequency is now initialized to (timestamp_frequency / 2) instead of the
6.25MHz constant for Haswell.

v2:
Specify frequency of 19.2MHz for BXT (Ville)
Initialize oa_sample_rate_hard_limit per-gen too (Lionel)

Signed-off-by: Robert Bragg 
Cc: Lionel Landwerlin 
Cc: Ville Syrjälä 
Reviewed-by: Matthew Auld 
Acked-by: Lionel Landwerlin 
---
 drivers/gpu/drm/i915/i915_drv.h  |  1 +
 drivers/gpu/drm/i915/i915_perf.c | 37 ++---
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 41a97ade39ea..30fe1fc04234 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2460,6 +2460,7 @@ struct drm_i915_private {

bool periodic;
int period_exponent;
+   int timestamp_frequency;

int metrics_set;

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 96fd59359109..bbd3f7fdd52a 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -288,10 +288,12 @@ static u32 i915_perf_stream_paranoid = true;

 /* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate
  *
- * 160ns is the smallest sampling period we can theoretically program the OA
- * unit with on Haswell, corresponding to 6.25MHz.
+ * The highest sampling frequency we can theoretically program the OA unit
+ * with is always half the timestamp frequency: E.g. 6.25Mhz for Haswell.
+ *
+ * Initialized just before we register the sysctl parameter.
  */
-static int oa_sample_rate_hard_limit = 6250000; /* 6.25 MHz */
+static int oa_sample_rate_hard_limit;

 /* Theoretically we can program the OA unit to sample every 160ns but don't
  * allow that by default unless root...
@@ -2580,6 +2582,12 @@ i915_perf_open_ioctl_locked(struct drm_i915_private 
*dev_priv,
return ret;
 }

+static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
+{
+   return div_u64(1000000000ULL * (2ULL << exponent),
+  dev_priv->perf.oa.timestamp_frequency);
+}
+
 /**
  * read_properties_unlocked - validate + copy userspace stream open properties
  * @dev_priv: i915 device instance
@@ -2676,16 +2684,13 @@ static int read_properties_unlocked(struct 
drm_i915_private *dev_priv,
}

/* Theoretically we can program the OA unit to sample
-* every 160ns but don't allow that by default unless
-* root.
-*
-* On Haswell the period is derived from the exponent
-* as:
-*
-*   period = 80ns * 2^(exponent + 1)
+* e.g. every 160ns for HSW, 167ns for BDW/SKL or 104ns
+* for BXT. We don't allow such high sampling
+* frequencies by default unless root.
 */
+
BUILD_BUG_ON(sizeof(oa_period) != 8);
-   oa_period = 80ull * (2ull << value);
+   oa_period = oa_exponent_to_ns(dev_priv, value);

/* This check is primarily to ensure that oa_period <=
 * UINT32_MAX (before passing to do_div which only
@@ -2941,6 +2946,8 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
dev_priv->perf.oa.ops.oa_hw_tail_read =
gen7_oa_hw_tail_read;

+   dev_priv->perf.oa.timestamp_frequency = 12500000;
+
dev_priv->perf.oa.oa_formats = hsw_oa_formats;

dev_priv->perf.oa.n_builtin_sets =
@@ -2954,6 +2961,8 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
 */

if (IS_GEN8(dev_priv)) {
+   dev_priv->perf.oa.timestamp_frequency = 12500000;
+
dev_priv->perf.oa.ctx_oactxctrl_offset = 0x120;
dev_priv->perf.oa.ctx_flexeu0_offset = 0x2ce;
dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<25);
@@ -2970,6 +2979,8 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
i915_oa_select_metric_set_chv;