Re: [Intel-gfx] [PATCH v7 10/17] drm/i915/pxp: interfaces for using protected objects

2021-09-01 Thread Rodrigo Vivi
On Tue, Aug 31, 2021 at 03:01:51PM -0700, Daniele Ceraolo Spurio wrote:
> 
> 
> > > +}
> > > +
> > > +void intel_pxp_invalidate(struct intel_pxp *pxp)
> > > +{
> > > + struct drm_i915_private *i915 = pxp_to_gt(pxp)->i915;
> > > + struct i915_gem_context *ctx, *cn;
> > > +
> > > + /* ban all contexts marked as protected */
> > > + spin_lock_irq(>gem.contexts.lock);
> > > + list_for_each_entry_safe(ctx, cn, >gem.contexts.list, link) {
> > > + struct i915_gem_engines_iter it;
> > > + struct intel_context *ce;
> > > +
> > > + if (!kref_get_unless_zero(>ref))
> > > + continue;
> > > +
> > > + if (likely(!i915_gem_context_uses_protected_content(ctx))) {
> > > + i915_gem_context_put(ctx);
> > > + continue;
> > > + }
> > > +
> > > + spin_unlock_irq(>gem.contexts.lock);
> > > +
> > > + /*
> > > +  * By the time we get here, the HW keys are already long gone,
> > > +  * so any batch using them that's already on the engines is very
> > > +  * likely a lost cause (and it has probably already hung the
> > > +  * engine). Therefore, we skip attempting to pull the running
> > > +  * context out of the HW and we prioritize bringing the session
> > > +  * back as soon as possible.
> > > +  * For each context we ban we increase the ctx->guilty_count, so
> > > +  * that userspace can see that all the intel contexts have been
> > > +  * banned (on a non-recoverable gem context, guilty intel
> > > +  * contexts are banned immediately on reset, so we report the
> > > +  * same way here).
> > hmm... but guilty specifically means that they indeed caused the GPU hang.
> > does the umd really need this indication? any other way of doing this?
> 
> The request from Daniel was to re-use the existing interface and AFAICT the
> guilty_count is the only one we have for this. The alternative would be to
> add a new flag (like I had in the previous version of this patch), but that
> was shot down already. Lionel can probably comment more on the UMD
> requirements for this since it was a request from the mesa side.

Daniel and Lionel, could you please comment here?
Are we really expanding the guilty from the hang meaning?
Is it really better than creating the new flag?

> 
> 
> > > +  */
> > > + for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it)
> > > + if (!intel_context_ban(ce, NULL))
> > > + atomic_inc(>guilty_count);
> > > + i915_gem_context_unlock_engines(ctx);
> > > +
> > > + /*
> > > +  * The context has been killed, no need to keep the wakeref.
> > > +  * This is safe from races because the only other place this
> > > +  * is touched is context_close and we're holding a ctx ref
> > > +  */
> > The comments make sense, but maybe we should avoid the optimization here,
> > but maybe we should avoid the optimization and just keep it locked?
> 
> The lock released above the comment and the one taken after the pm_put are
> different ones, so even if we don't release the wakeref here we still need
> to do the same locking steps. Or did you mean something different with
> keeping it locked?

oh, please ignore me here. I thought it was the same locking...

> 
> Thanks,
> Daniele
> 
> > 
> > > + if (ctx->pxp_wakeref) {
> > > + intel_runtime_pm_put(>runtime_pm,
> > > +  ctx->pxp_wakeref);
> > > + ctx->pxp_wakeref = 0;
> > > + }
> > > +
> > > + spin_lock_irq(>gem.contexts.lock);
> > > + list_safe_reset_next(ctx, cn, link);
> > > + i915_gem_context_put(ctx);
> > > + }
> > > + spin_unlock_irq(>gem.contexts.lock);
> > > +}
> > > +
> > > diff --git a/drivers/gpu/drm/i915/pxp/intel_pxp.h 
> > > b/drivers/gpu/drm/i915/pxp/intel_pxp.h
> > > index 8f1e86caa53f..f942bdd2af0c 100644
> > > --- a/drivers/gpu/drm/i915/pxp/intel_pxp.h
> > > +++ b/drivers/gpu/drm/i915/pxp/intel_pxp.h
> > > @@ -9,6 +9,8 @@
> > >   #include "gt/intel_gt_types.h"
> > >   #include "intel_pxp_types.h"
> > > +struct drm_i915_gem_object;
> > > +
> > >   static inline struct intel_gt *pxp_to_gt(const struct intel_pxp *pxp)
> > >   {
> > >   return container_of(pxp, struct intel_gt, pxp);
> > > @@ -33,6 +35,10 @@ void intel_pxp_fini_hw(struct intel_pxp *pxp);
> > >   void intel_pxp_mark_termination_in_progress(struct intel_pxp *pxp);
> > >   int intel_pxp_wait_for_arb_start(struct intel_pxp *pxp);
> > > +
> > > +int intel_pxp_key_check(struct intel_pxp *pxp, struct 
> > > drm_i915_gem_object *obj);
> > > +
> > > +void intel_pxp_invalidate(struct intel_pxp *pxp);
> > >   #else
> > >   static inline void intel_pxp_init(struct intel_pxp *pxp)
> > >   {
> > > @@ -46,6 +52,12 @@ static inline int intel_pxp_wait_for_arb_start(struct 
> > > intel_pxp *pxp)
> > >   {
> > >  

Re: [PATCH v7 10/17] drm/i915/pxp: interfaces for using protected objects

2021-08-31 Thread Daniele Ceraolo Spurio




+}
+
+void intel_pxp_invalidate(struct intel_pxp *pxp)
+{
+   struct drm_i915_private *i915 = pxp_to_gt(pxp)->i915;
+   struct i915_gem_context *ctx, *cn;
+
+   /* ban all contexts marked as protected */
+   spin_lock_irq(>gem.contexts.lock);
+   list_for_each_entry_safe(ctx, cn, >gem.contexts.list, link) {
+   struct i915_gem_engines_iter it;
+   struct intel_context *ce;
+
+   if (!kref_get_unless_zero(>ref))
+   continue;
+
+   if (likely(!i915_gem_context_uses_protected_content(ctx))) {
+   i915_gem_context_put(ctx);
+   continue;
+   }
+
+   spin_unlock_irq(>gem.contexts.lock);
+
+   /*
+* By the time we get here, the HW keys are already long gone,
+* so any batch using them that's already on the engines is very
+* likely a lost cause (and it has probably already hung the
+* engine). Therefore, we skip attempting to pull the running
+* context out of the HW and we prioritize bringing the session
+* back as soon as possible.
+* For each context we ban we increase the ctx->guilty_count, so
+* that userspace can see that all the intel contexts have been
+* banned (on a non-recoverable gem context, guilty intel
+* contexts are banned immediately on reset, so we report the
+* same way here).

hmm... but guilty specifically means that they indeed caused the GPU hang.
does the umd really need this indication? any other way of doing this?


The request from Daniel was to re-use the existing interface and AFAICT 
the guilty_count is the only one we have for this. The alternative would 
be to add a new flag (like I had in the previous version of this patch), 
but that was shot down already. Lionel can probably comment more on the 
UMD requirements for this since it was a request from the mesa side.




+*/
+   for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it)
+   if (!intel_context_ban(ce, NULL))
+   atomic_inc(>guilty_count);
+   i915_gem_context_unlock_engines(ctx);
+
+   /*
+* The context has been killed, no need to keep the wakeref.
+* This is safe from races because the only other place this
+* is touched is context_close and we're holding a ctx ref
+*/

The comments make sense, but maybe we should avoid the optimization here,
but maybe we should avoid the optimization and just keep it locked?


The lock released above the comment and the one taken after the pm_put 
are different ones, so even if we don't release the wakeref here we 
still need to do the same locking steps. Or did you mean something 
different with keeping it locked?


Thanks,
Daniele




+   if (ctx->pxp_wakeref) {
+   intel_runtime_pm_put(>runtime_pm,
+ctx->pxp_wakeref);
+   ctx->pxp_wakeref = 0;
+   }
+
+   spin_lock_irq(>gem.contexts.lock);
+   list_safe_reset_next(ctx, cn, link);
+   i915_gem_context_put(ctx);
+   }
+   spin_unlock_irq(>gem.contexts.lock);
+}
+
diff --git a/drivers/gpu/drm/i915/pxp/intel_pxp.h 
b/drivers/gpu/drm/i915/pxp/intel_pxp.h
index 8f1e86caa53f..f942bdd2af0c 100644
--- a/drivers/gpu/drm/i915/pxp/intel_pxp.h
+++ b/drivers/gpu/drm/i915/pxp/intel_pxp.h
@@ -9,6 +9,8 @@
  #include "gt/intel_gt_types.h"
  #include "intel_pxp_types.h"
  
+struct drm_i915_gem_object;

+
  static inline struct intel_gt *pxp_to_gt(const struct intel_pxp *pxp)
  {
return container_of(pxp, struct intel_gt, pxp);
@@ -33,6 +35,10 @@ void intel_pxp_fini_hw(struct intel_pxp *pxp);
  
  void intel_pxp_mark_termination_in_progress(struct intel_pxp *pxp);

  int intel_pxp_wait_for_arb_start(struct intel_pxp *pxp);
+
+int intel_pxp_key_check(struct intel_pxp *pxp, struct drm_i915_gem_object 
*obj);
+
+void intel_pxp_invalidate(struct intel_pxp *pxp);
  #else
  static inline void intel_pxp_init(struct intel_pxp *pxp)
  {
@@ -46,6 +52,12 @@ static inline int intel_pxp_wait_for_arb_start(struct 
intel_pxp *pxp)
  {
return -ENODEV;
  }
+
+static inline int intel_pxp_key_check(struct intel_pxp *pxp,
+ struct drm_i915_gem_object *obj)
+{
+   return -ENODEV;
+}
  #endif
  
  #endif /* __INTEL_PXP_H__ */

diff --git a/drivers/gpu/drm/i915/pxp/intel_pxp_session.c 
b/drivers/gpu/drm/i915/pxp/intel_pxp_session.c
index 67c30e534d50..c6a5e4197e40 100644
--- a/drivers/gpu/drm/i915/pxp/intel_pxp_session.c
+++ b/drivers/gpu/drm/i915/pxp/intel_pxp_session.c
@@ -72,6 +72,9 @@ static int pxp_create_arb_session(struct intel_pxp *pxp)

Re: [PATCH v7 10/17] drm/i915/pxp: interfaces for using protected objects

2021-08-31 Thread Rodrigo Vivi
On Fri, Aug 27, 2021 at 06:27:31PM -0700, Daniele Ceraolo Spurio wrote:
> This api allow user mode to create protected buffers and to mark
> contexts as making use of such objects. Only when using contexts
> marked in such a way is the execution guaranteed to work as expected.
> 
> Contexts can only be marked as using protected content at creation time
> (i.e. the parameter is immutable) and they must be both bannable and not
> recoverable. Given that the protected session gets invalidated on
> suspend, contexts created this way hold a runtime pm wakeref until
> they're either destroyed or invalidated.
> 
> All protected objects and contexts will be considered invalid when the
> PXP session is destroyed and all new submissions using them will be
> rejected. All intel contexts within the invalidated gem contexts will be
> marked banned. Userspace can detect that an invalidation has occurred via
> the RESET_STATS ioctl, where we report it the same way as a ban due to a
> hang.
> 
> v5: squash patches, rebase on proto_ctx, update kerneldoc
> 
> v6: rebase on obj create_ext changes
> 
> v7: Use session counter to check if an object it valid, hold wakeref in
> context, don't add a new flag to RESET_STATS (Daniel)
> 
> Signed-off-by: Daniele Ceraolo Spurio 
> Signed-off-by: Bommu Krishnaiah 
> Cc: Rodrigo Vivi 
> Cc: Chris Wilson 
> Cc: Lionel Landwerlin 
> Cc: Jason Ekstrand 
> Cc: Daniel Vetter 
> Reviewed-by: Rodrigo Vivi  #v5
> ---
>  drivers/gpu/drm/i915/gem/i915_gem_context.c   | 98 ---
>  drivers/gpu/drm/i915/gem/i915_gem_context.h   |  6 ++
>  .../gpu/drm/i915/gem/i915_gem_context_types.h | 28 ++
>  drivers/gpu/drm/i915/gem/i915_gem_create.c| 72 ++
>  .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 18 
>  drivers/gpu/drm/i915/gem/i915_gem_object.c|  1 +
>  drivers/gpu/drm/i915/gem/i915_gem_object.h|  6 ++
>  .../gpu/drm/i915/gem/i915_gem_object_types.h  |  8 ++
>  .../gpu/drm/i915/gem/selftests/mock_context.c |  4 +-
>  drivers/gpu/drm/i915/pxp/intel_pxp.c  | 83 
>  drivers/gpu/drm/i915/pxp/intel_pxp.h  | 12 +++
>  drivers/gpu/drm/i915/pxp/intel_pxp_session.c  |  6 ++
>  drivers/gpu/drm/i915/pxp/intel_pxp_types.h|  9 ++
>  include/uapi/drm/i915_drm.h   | 55 ++-
>  14 files changed, 371 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index fd169cf2f75a..f411e26768fd 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -77,6 +77,8 @@
>  #include "gt/intel_gpu_commands.h"
>  #include "gt/intel_ring.h"
>  
> +#include "pxp/intel_pxp.h"
> +
>  #include "i915_gem_context.h"
>  #include "i915_trace.h"
>  #include "i915_user_extensions.h"
> @@ -186,10 +188,13 @@ static int validate_priority(struct drm_i915_private 
> *i915,
>   return 0;
>  }
>  
> -static void proto_context_close(struct i915_gem_proto_context *pc)
> +static void proto_context_close(struct drm_i915_private *i915,
> + struct i915_gem_proto_context *pc)
>  {
>   int i;
>  
> + if (pc->pxp_wakeref)
> + intel_runtime_pm_put(>runtime_pm, pc->pxp_wakeref);
>   if (pc->vm)
>   i915_vm_put(pc->vm);
>   if (pc->user_engines) {
> @@ -241,6 +246,33 @@ static int proto_context_set_persistence(struct 
> drm_i915_private *i915,
>   return 0;
>  }
>  
> +static int proto_context_set_protected(struct drm_i915_private *i915,
> +struct i915_gem_proto_context *pc,
> +bool protected)
> +{
> + int ret = 0;
> +
> + if (!intel_pxp_is_enabled(>gt.pxp)) {
> + ret = -ENODEV;
> + } else if (!protected) {
> + pc->uses_protected_content = false;
> + } else if ((pc->user_flags & BIT(UCONTEXT_RECOVERABLE)) ||
> +!(pc->user_flags & BIT(UCONTEXT_BANNABLE))) {
> + ret = -EPERM;
> + } else {
> + pc->uses_protected_content = true;
> +
> + /*
> +  * protected context usage requires the PXP session to be up,
> +  * which in turn requires the device to be active.
> +  */
> + pc->pxp_wakeref = intel_runtime_pm_get(>runtime_pm);
> + ret = intel_pxp_wait_for_arb_start(>gt.pxp);
> + }
> +
> + return ret;
> +}
> +
>  static struct i915_gem_proto_context *
>  proto_context_create(struct drm_i915_private *i915, unsigned int flags)
>  {
> @@ -269,7 +301,7 @@ proto_context_create(struct drm_i915_private *i915, 
> unsigned int flags)
>   return pc;
>  
>  proto_close:
> - proto_context_close(pc);
> + proto_context_close(i915, pc);
>   return err;
>  }
>  
> @@ -693,6 +725,8 @@ static int set_proto_ctx_param(struct 
> drm_i915_file_private *fpriv,
>   ret = -EPERM;
>   else 

[PATCH v7 10/17] drm/i915/pxp: interfaces for using protected objects

2021-08-27 Thread Daniele Ceraolo Spurio
This api allow user mode to create protected buffers and to mark
contexts as making use of such objects. Only when using contexts
marked in such a way is the execution guaranteed to work as expected.

Contexts can only be marked as using protected content at creation time
(i.e. the parameter is immutable) and they must be both bannable and not
recoverable. Given that the protected session gets invalidated on
suspend, contexts created this way hold a runtime pm wakeref until
they're either destroyed or invalidated.

All protected objects and contexts will be considered invalid when the
PXP session is destroyed and all new submissions using them will be
rejected. All intel contexts within the invalidated gem contexts will be
marked banned. Userspace can detect that an invalidation has occurred via
the RESET_STATS ioctl, where we report it the same way as a ban due to a
hang.

v5: squash patches, rebase on proto_ctx, update kerneldoc

v6: rebase on obj create_ext changes

v7: Use session counter to check if an object it valid, hold wakeref in
context, don't add a new flag to RESET_STATS (Daniel)

Signed-off-by: Daniele Ceraolo Spurio 
Signed-off-by: Bommu Krishnaiah 
Cc: Rodrigo Vivi 
Cc: Chris Wilson 
Cc: Lionel Landwerlin 
Cc: Jason Ekstrand 
Cc: Daniel Vetter 
Reviewed-by: Rodrigo Vivi  #v5
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 98 ---
 drivers/gpu/drm/i915/gem/i915_gem_context.h   |  6 ++
 .../gpu/drm/i915/gem/i915_gem_context_types.h | 28 ++
 drivers/gpu/drm/i915/gem/i915_gem_create.c| 72 ++
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 18 
 drivers/gpu/drm/i915/gem/i915_gem_object.c|  1 +
 drivers/gpu/drm/i915/gem/i915_gem_object.h|  6 ++
 .../gpu/drm/i915/gem/i915_gem_object_types.h  |  8 ++
 .../gpu/drm/i915/gem/selftests/mock_context.c |  4 +-
 drivers/gpu/drm/i915/pxp/intel_pxp.c  | 83 
 drivers/gpu/drm/i915/pxp/intel_pxp.h  | 12 +++
 drivers/gpu/drm/i915/pxp/intel_pxp_session.c  |  6 ++
 drivers/gpu/drm/i915/pxp/intel_pxp_types.h|  9 ++
 include/uapi/drm/i915_drm.h   | 55 ++-
 14 files changed, 371 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index fd169cf2f75a..f411e26768fd 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -77,6 +77,8 @@
 #include "gt/intel_gpu_commands.h"
 #include "gt/intel_ring.h"
 
+#include "pxp/intel_pxp.h"
+
 #include "i915_gem_context.h"
 #include "i915_trace.h"
 #include "i915_user_extensions.h"
@@ -186,10 +188,13 @@ static int validate_priority(struct drm_i915_private 
*i915,
return 0;
 }
 
-static void proto_context_close(struct i915_gem_proto_context *pc)
+static void proto_context_close(struct drm_i915_private *i915,
+   struct i915_gem_proto_context *pc)
 {
int i;
 
+   if (pc->pxp_wakeref)
+   intel_runtime_pm_put(>runtime_pm, pc->pxp_wakeref);
if (pc->vm)
i915_vm_put(pc->vm);
if (pc->user_engines) {
@@ -241,6 +246,33 @@ static int proto_context_set_persistence(struct 
drm_i915_private *i915,
return 0;
 }
 
+static int proto_context_set_protected(struct drm_i915_private *i915,
+  struct i915_gem_proto_context *pc,
+  bool protected)
+{
+   int ret = 0;
+
+   if (!intel_pxp_is_enabled(>gt.pxp)) {
+   ret = -ENODEV;
+   } else if (!protected) {
+   pc->uses_protected_content = false;
+   } else if ((pc->user_flags & BIT(UCONTEXT_RECOVERABLE)) ||
+  !(pc->user_flags & BIT(UCONTEXT_BANNABLE))) {
+   ret = -EPERM;
+   } else {
+   pc->uses_protected_content = true;
+
+   /*
+* protected context usage requires the PXP session to be up,
+* which in turn requires the device to be active.
+*/
+   pc->pxp_wakeref = intel_runtime_pm_get(>runtime_pm);
+   ret = intel_pxp_wait_for_arb_start(>gt.pxp);
+   }
+
+   return ret;
+}
+
 static struct i915_gem_proto_context *
 proto_context_create(struct drm_i915_private *i915, unsigned int flags)
 {
@@ -269,7 +301,7 @@ proto_context_create(struct drm_i915_private *i915, 
unsigned int flags)
return pc;
 
 proto_close:
-   proto_context_close(pc);
+   proto_context_close(i915, pc);
return err;
 }
 
@@ -693,6 +725,8 @@ static int set_proto_ctx_param(struct drm_i915_file_private 
*fpriv,
ret = -EPERM;
else if (args->value)
pc->user_flags |= BIT(UCONTEXT_BANNABLE);
+   else if (pc->uses_protected_content)
+   ret = -EPERM;
else
pc->user_flags &=