On Tue, Mar 9, 2010 at 4:45 PM, Jerome Glisse <jgli...@redhat.com> wrote:
> This patch cleanup the fence code, it drops the timeout field of
> fence as the time to complete each IB is unpredictable and shouldn't
> be bound.
>
> The fence cleanup lead to GPU lockup detection improvement, this
> patch introduce a callback, allowing to do asic specific test for
> lockup detection. In this patch the CP is use as a first indicator
> of GPU lockup. If CP doesn't make progress during 1second we assume
> we are facing a GPU lockup.
>
> To avoid overhead of testing GPU lockup frequently due to fence
> taking time to be signaled we query the lockup callback every
> 500msec. There is plenty code comment explaining the design & choise
> inside the code.
>
> This have been tested mostly on R3XX/R5XX hw, in normal running
> destkop (compiz firefox, quake3 running) the lockup callback wasn't
> call once (1 hour session). Also tested with forcing GPU lockup and
> lockup was reported after the 1s CP activity timeout.
>
> V2 switch to 500ms timeout so GPU lockup get call at least 2 times
>   in less than 2sec.
> V3 store last jiffies in fence struct so on ERESTART, EBUSY we keep
>   track of how long we already wait for a given fence
> V4 make sure we got up to date cp read pointer so we don't have
>   false positive
>
> Signed-off-by: Jerome Glisse <jgli...@redhat.com>
> ---
>  drivers/gpu/drm/radeon/evergreen.c    |    6 ++
>  drivers/gpu/drm/radeon/r100.c         |   86 +++++++++++++++++++++++++++
>  drivers/gpu/drm/radeon/r300.c         |   28 ++++++++-
>  drivers/gpu/drm/radeon/r600.c         |   34 ++++++++++-
>  drivers/gpu/drm/radeon/radeon.h       |  104 
> +++++++++++++++++++--------------
>  drivers/gpu/drm/radeon/radeon_asic.h  |   20 ++++++-
>  drivers/gpu/drm/radeon/radeon_fence.c |  102 +++++++++++++++++---------------
>  drivers/gpu/drm/radeon/rv770.c        |    6 --
>  8 files changed, 280 insertions(+), 106 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/evergreen.c 
> b/drivers/gpu/drm/radeon/evergreen.c
> index bd2e7aa..8988df7 100644
> --- a/drivers/gpu/drm/radeon/evergreen.c
> +++ b/drivers/gpu/drm/radeon/evergreen.c
> @@ -490,6 +490,12 @@ int evergreen_mc_init(struct radeon_device *rdev)
>        return 0;
>  }
>
> +bool evergreen_gpu_is_lockup(struct radeon_device *rdev)
> +{
> +       /* FIXME: implement for evergreen */
> +       return false;
> +}
> +
>  int evergreen_gpu_reset(struct radeon_device *rdev)
>  {
>        /* FIXME: implement for evergreen */
> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> index 91eb762..e4487f3 100644
> --- a/drivers/gpu/drm/radeon/r100.c
> +++ b/drivers/gpu/drm/radeon/r100.c
> @@ -1772,6 +1772,92 @@ int r100_rb2d_reset(struct radeon_device *rdev)
>        return -1;
>  }
>
> +void r100_gpu_lockup_update(struct r100_gpu_lockup *lockup, struct radeon_cp 
> *cp)
> +{
> +       lockup->last_cp_rptr = cp->rptr;
> +       lockup->last_jiffies = jiffies;
> +}
> +
> +/**
> + * r100_gpu_cp_is_lockup() - check if CP is lockup by recording information
> + * @rdev:      radeon device structure
> + * @lockup:    r100_gpu_lockup structure holding CP lockup tracking 
> informations
> + * @cp:                radeon_cp structure holding CP information
> + *
> + * We don't need to initialize the lockup tracking information as we will 
> either
> + * have CP rptr to a different value of jiffies wrap around which will force
> + * initialization of the lockup tracking informations.
> + *
> + * A possible false positivie is if we get call after while and last_cp_rptr 
> ==
> + * the current CP rptr, even if it's unlikely it might happen. To avoid this
> + * if the elapsed time since last call is bigger than 2 second than we return
> + * false and update the tracking information. Due to this the caller must 
> call
> + * r100_gpu_cp_is_lockup several time in less than 2sec for lockup to be 
> reported
> + * the fencing code should be cautious about that.
> + *
> + * Caller should write to the ring to force CP to do something so we don't 
> get
> + * false positive when CP is just gived nothing to do.
> + *
> + **/
> +bool r100_gpu_cp_is_lockup(struct radeon_device *rdev, struct 
> r100_gpu_lockup *lockup, struct radeon_cp *cp)
> +{
> +       unsigned long cjiffies, elapsed;
> +
> +       cjiffies = jiffies;
> +       if (!time_after(cjiffies, lockup->last_jiffies)) {

time_after(0x00000010, 0xfeffffff) returns true. time_after macro is
defined to deal correctly with wrapping. Only time when it returns
false is when difference of times is more than the half of maximum
jiffies.

> +               /* likely a wrap around */
> +               lockup->last_cp_rptr = cp->rptr;
> +               lockup->last_jiffies = jiffies;
> +               return false;
> +       }
> +       if (cp->rptr != lockup->last_cp_rptr) {
> +               /* CP is still working no lockup */
> +               lockup->last_cp_rptr = cp->rptr;
> +               lockup->last_jiffies = jiffies;
> +               return false;
> +       }
> +       elapsed = jiffies_to_msecs(cjiffies - lockup->last_jiffies);

This handles the wrapping correctly. (small_jiffies - large_jiffies =
small_difference)

> +       if (elapsed >= 3000) {
> +               /* very likely the improbable case where current
> +                * rptr is equal to last recorded, a while ago, rptr
> +                * this is more likely a false positive update tracking
> +                * information which should force us to be recall at
> +                * latter point
> +                */
> +               lockup->last_cp_rptr = cp->rptr;
> +               lockup->last_jiffies = jiffies;
> +               return false;
> +       }
> +       if (elapsed >= 1000) {
> +               dev_err(rdev->dev, "GPU lockup CP stall for more than 
> %lumsec\n", elapsed);
> +               return true;
> +       }
> +       /* give a chance to the GPU ... */
> +       return false;
> +}
> +
> +bool r100_gpu_is_lockup(struct radeon_device *rdev)
> +{
> +       u32 rbbm_status;
> +       int r;
> +
> +       rbbm_status = RREG32(R_000E40_RBBM_STATUS);
> +       if (!G_000E40_GUI_ACTIVE(rbbm_status)) {
> +               r100_gpu_lockup_update(&rdev->config.r100.lockup, &rdev->cp);
> +               return false;
> +       }
> +       /* force CP activities */
> +       r = radeon_ring_lock(rdev, 2);
> +       if (!r) {
> +               /* PACKET2 NOP */
> +               radeon_ring_write(rdev, 0x80000000);
> +               radeon_ring_write(rdev, 0x80000000);
> +               radeon_ring_unlock_commit(rdev);
> +       }
> +       rdev->cp.rptr = RREG32(RADEON_CP_RB_RPTR);
> +       return r100_gpu_cp_is_lockup(rdev, &rdev->config.r100.lockup, 
> &rdev->cp);
> +}
> +
>  int r100_gpu_reset(struct radeon_device *rdev)
>  {
>        uint32_t status;
> diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
> index 4cef90c..23346f6 100644
> --- a/drivers/gpu/drm/radeon/r300.c
> +++ b/drivers/gpu/drm/radeon/r300.c
> @@ -26,8 +26,9 @@
>  *          Jerome Glisse
>  */
>  #include <linux/seq_file.h>
> -#include "drmP.h"
> -#include "drm.h"
> +#include <drm/drmP.h>
> +#include <drm/drm.h>
> +#include <drm/drm_crtc_helper.h>
>  #include "radeon_reg.h"
>  #include "radeon.h"
>  #include "radeon_drm.h"
> @@ -424,12 +425,35 @@ int r300_ga_reset(struct radeon_device *rdev)
>        return -1;
>  }
>
> +bool r300_gpu_is_lockup(struct radeon_device *rdev)
> +{
> +       u32 rbbm_status;
> +       int r;
> +
> +       rbbm_status = RREG32(R_000E40_RBBM_STATUS);
> +       if (!G_000E40_GUI_ACTIVE(rbbm_status)) {
> +               r100_gpu_lockup_update(&rdev->config.r300.lockup, &rdev->cp);
> +               return false;
> +       }
> +       /* force CP activities */
> +       r = radeon_ring_lock(rdev, 2);
> +       if (!r) {
> +               /* PACKET2 NOP */
> +               radeon_ring_write(rdev, 0x80000000);
> +               radeon_ring_write(rdev, 0x80000000);
> +               radeon_ring_unlock_commit(rdev);
> +       }
> +       rdev->cp.rptr = RREG32(RADEON_CP_RB_RPTR);
> +       return r100_gpu_cp_is_lockup(rdev, &rdev->config.r300.lockup, 
> &rdev->cp);
> +}
> +

Is here any difference to r100 version?

>  int r300_gpu_reset(struct radeon_device *rdev)
>  {
>        uint32_t status;
>
>        /* reset order likely matter */
>        status = RREG32(RADEON_RBBM_STATUS);
> +       dev_info(rdev->dev, "(%s:%d) RBBM_STATUS=0x%08X\n", __func__, 
> __LINE__, status);
>        /* reset HDP */
>        r100_hdp_reset(rdev);
>        /* reset rb2d */
> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
> index c522901..4520685 100644
> --- a/drivers/gpu/drm/radeon/r600.c
> +++ b/drivers/gpu/drm/radeon/r600.c
> @@ -788,7 +788,7 @@ int r600_gpu_soft_reset(struct radeon_device *rdev)
>                dev_info(rdev->dev, "  R_008020_GRBM_SOFT_RESET=0x%08X\n", 
> tmp);
>                WREG32(R_008020_GRBM_SOFT_RESET, tmp);
>                (void)RREG32(R_008020_GRBM_SOFT_RESET);
> -               udelay(50);
> +               mdelay(1);
>                WREG32(R_008020_GRBM_SOFT_RESET, 0);
>                (void)RREG32(R_008020_GRBM_SOFT_RESET);
>        }
> @@ -828,16 +828,16 @@ int r600_gpu_soft_reset(struct radeon_device *rdev)
>        dev_info(rdev->dev, "  R_000E60_SRBM_SOFT_RESET=0x%08X\n", srbm_reset);
>        WREG32(R_000E60_SRBM_SOFT_RESET, srbm_reset);
>        (void)RREG32(R_000E60_SRBM_SOFT_RESET);
> -       udelay(50);
> +       mdelay(1);
>        WREG32(R_000E60_SRBM_SOFT_RESET, 0);
>        (void)RREG32(R_000E60_SRBM_SOFT_RESET);
>        WREG32(R_000E60_SRBM_SOFT_RESET, srbm_reset);
>        (void)RREG32(R_000E60_SRBM_SOFT_RESET);
> -       udelay(50);
> +       mdelay(1);
>        WREG32(R_000E60_SRBM_SOFT_RESET, 0);
>        (void)RREG32(R_000E60_SRBM_SOFT_RESET);
>        /* Wait a little for things to settle down */
> -       udelay(50);
> +       mdelay(1);
>        dev_info(rdev->dev, "  R_008010_GRBM_STATUS=0x%08X\n",
>                RREG32(R_008010_GRBM_STATUS));
>        dev_info(rdev->dev, "  R_008014_GRBM_STATUS2=0x%08X\n",
> @@ -852,6 +852,32 @@ int r600_gpu_soft_reset(struct radeon_device *rdev)
>        return 0;
>  }
>
> +bool r600_gpu_is_lockup(struct radeon_device *rdev)
> +{
> +       u32 srbm_status;
> +       u32 grbm_status;
> +       u32 grbm_status2;
> +       int r;
> +
> +       srbm_status = RREG32(R_000E50_SRBM_STATUS);
> +       grbm_status = RREG32(R_008010_GRBM_STATUS);
> +       grbm_status2 = RREG32(R_008014_GRBM_STATUS2);
> +       if (!G_008010_GUI_ACTIVE(grbm_status)) {
> +               r100_gpu_lockup_update(&rdev->config.r300.lockup, &rdev->cp);
> +               return false;
> +       }
> +       /* force CP activities */
> +       r = radeon_ring_lock(rdev, 2);
> +       if (!r) {
> +               /* PACKET2 NOP */
> +               radeon_ring_write(rdev, 0x80000000);
> +               radeon_ring_write(rdev, 0x80000000);
> +               radeon_ring_unlock_commit(rdev);
> +       }
> +       rdev->cp.rptr = RREG32(R600_CP_RB_RPTR);
> +       return r100_gpu_cp_is_lockup(rdev, &rdev->config.r300.lockup, 
> &rdev->cp);
> +}
> +
>  int r600_gpu_reset(struct radeon_device *rdev)
>  {
>        return r600_gpu_soft_reset(rdev);
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 829e26e..e97a118 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -97,6 +97,7 @@ extern int radeon_audio;
>  * symbol;
>  */
>  #define RADEON_MAX_USEC_TIMEOUT                100000  /* 100 ms */
> +#define RADEON_FENCE_JIFFIES_TIMEOUT   (HZ / 2)
>  /* RADEON_IB_POOL_SIZE must be a power of 2 */
>  #define RADEON_IB_POOL_SIZE            16
>  #define RADEON_DEBUGFS_MAX_NUM_FILES   32
> @@ -179,7 +180,8 @@ struct radeon_fence_driver {
>        uint32_t                        scratch_reg;
>        atomic_t                        seq;
>        uint32_t                        last_seq;
> -       unsigned long                   count_timeout;
> +       unsigned long                   last_jiffies;
> +       unsigned long                   last_timeout;
>        wait_queue_head_t               queue;
>        rwlock_t                        lock;
>        struct list_head                created;
> @@ -194,7 +196,6 @@ struct radeon_fence {
>        struct list_head                list;
>        /* protected by radeon_fence.lock */
>        uint32_t                        seq;
> -       unsigned long                   timeout;
>        bool                            emited;
>        bool                            signaled;
>  };
> @@ -742,6 +743,7 @@ struct radeon_asic {
>        int (*resume)(struct radeon_device *rdev);
>        int (*suspend)(struct radeon_device *rdev);
>        void (*vga_set_state)(struct radeon_device *rdev, bool state);
> +       bool (*gpu_is_lockup)(struct radeon_device *rdev);
>        int (*gpu_reset)(struct radeon_device *rdev);
>        void (*gart_tlb_flush)(struct radeon_device *rdev);
>        int (*gart_set_page)(struct radeon_device *rdev, int i, uint64_t addr);
> @@ -800,59 +802,68 @@ struct radeon_asic {
>  /*
>  * Asic structures
>  */
> +struct r100_gpu_lockup {
> +       unsigned long   last_jiffies;
> +       u32             last_cp_rptr;
> +};
> +
>  struct r100_asic {
> -       const unsigned  *reg_safe_bm;
> -       unsigned        reg_safe_bm_size;
> -       u32             hdp_cntl;
> +       const unsigned          *reg_safe_bm;
> +       unsigned                reg_safe_bm_size;
> +       u32                     hdp_cntl;
> +       struct r100_gpu_lockup  lockup;
>  };
>
>  struct r300_asic {
> -       const unsigned  *reg_safe_bm;
> -       unsigned        reg_safe_bm_size;
> -       u32             resync_scratch;
> -       u32             hdp_cntl;
> +       const unsigned          *reg_safe_bm;
> +       unsigned                reg_safe_bm_size;
> +       u32                     resync_scratch;
> +       u32                     hdp_cntl;
> +       struct r100_gpu_lockup  lockup;
>  };
>
>  struct r600_asic {
> -       unsigned max_pipes;
> -       unsigned max_tile_pipes;
> -       unsigned max_simds;
> -       unsigned max_backends;
> -       unsigned max_gprs;
> -       unsigned max_threads;
> -       unsigned max_stack_entries;
> -       unsigned max_hw_contexts;
> -       unsigned max_gs_threads;
> -       unsigned sx_max_export_size;
> -       unsigned sx_max_export_pos_size;
> -       unsigned sx_max_export_smx_size;
> -       unsigned sq_num_cf_insts;
> -       unsigned tiling_nbanks;
> -       unsigned tiling_npipes;
> -       unsigned tiling_group_size;
> +       unsigned                max_pipes;
> +       unsigned                max_tile_pipes;
> +       unsigned                max_simds;
> +       unsigned                max_backends;
> +       unsigned                max_gprs;
> +       unsigned                max_threads;
> +       unsigned                max_stack_entries;
> +       unsigned                max_hw_contexts;
> +       unsigned                max_gs_threads;
> +       unsigned                sx_max_export_size;
> +       unsigned                sx_max_export_pos_size;
> +       unsigned                sx_max_export_smx_size;
> +       unsigned                sq_num_cf_insts;
> +       unsigned                tiling_nbanks;
> +       unsigned                tiling_npipes;
> +       unsigned                tiling_group_size;
> +       struct r100_gpu_lockup  lockup;
>  };
>
>  struct rv770_asic {
> -       unsigned max_pipes;
> -       unsigned max_tile_pipes;
> -       unsigned max_simds;
> -       unsigned max_backends;
> -       unsigned max_gprs;
> -       unsigned max_threads;
> -       unsigned max_stack_entries;
> -       unsigned max_hw_contexts;
> -       unsigned max_gs_threads;
> -       unsigned sx_max_export_size;
> -       unsigned sx_max_export_pos_size;
> -       unsigned sx_max_export_smx_size;
> -       unsigned sq_num_cf_insts;
> -       unsigned sx_num_of_sets;
> -       unsigned sc_prim_fifo_size;
> -       unsigned sc_hiz_tile_fifo_size;
> -       unsigned sc_earlyz_tile_fifo_fize;
> -       unsigned tiling_nbanks;
> -       unsigned tiling_npipes;
> -       unsigned tiling_group_size;
> +       unsigned                max_pipes;
> +       unsigned                max_tile_pipes;
> +       unsigned                max_simds;
> +       unsigned                max_backends;
> +       unsigned                max_gprs;
> +       unsigned                max_threads;
> +       unsigned                max_stack_entries;
> +       unsigned                max_hw_contexts;
> +       unsigned                max_gs_threads;
> +       unsigned                sx_max_export_size;
> +       unsigned                sx_max_export_pos_size;
> +       unsigned                sx_max_export_smx_size;
> +       unsigned                sq_num_cf_insts;
> +       unsigned                sx_num_of_sets;
> +       unsigned                sc_prim_fifo_size;
> +       unsigned                sc_hiz_tile_fifo_size;
> +       unsigned                sc_earlyz_tile_fifo_fize;
> +       unsigned                tiling_nbanks;
> +       unsigned                tiling_npipes;
> +       unsigned                tiling_group_size;
> +       struct r100_gpu_lockup  lockup;
>  };
>
>  union radeon_asic_config {
> @@ -1135,6 +1146,7 @@ static inline void radeon_ring_write(struct 
> radeon_device *rdev, uint32_t v)
>  #define radeon_suspend(rdev) (rdev)->asic->suspend((rdev))
>  #define radeon_cs_parse(p) rdev->asic->cs_parse((p))
>  #define radeon_vga_set_state(rdev, state) 
> (rdev)->asic->vga_set_state((rdev), (state))
> +#define radeon_gpu_is_lockup(rdev) (rdev)->asic->gpu_is_lockup((rdev))
>  #define radeon_gpu_reset(rdev) (rdev)->asic->gpu_reset((rdev))
>  #define radeon_gart_tlb_flush(rdev) (rdev)->asic->gart_tlb_flush((rdev))
>  #define radeon_gart_set_page(rdev, i, p) (rdev)->asic->gart_set_page((rdev), 
> (i), (p))
> @@ -1233,6 +1245,8 @@ extern int r100_cs_packet_parse(struct radeon_cs_parser 
> *p,
>                                unsigned idx);
>  extern void r100_enable_bm(struct radeon_device *rdev);
>  extern void r100_set_common_regs(struct radeon_device *rdev);
> +extern void r100_gpu_lockup_update(struct r100_gpu_lockup *lockup, struct 
> radeon_cp *cp);
> +extern bool r100_gpu_cp_is_lockup(struct radeon_device *rdev, struct 
> r100_gpu_lockup *lockup, struct radeon_cp *cp);
>
>  /* rv200,rv250,rv280 */
>  extern void r200_set_safe_registers(struct radeon_device *rdev);
> diff --git a/drivers/gpu/drm/radeon/radeon_asic.h 
> b/drivers/gpu/drm/radeon/radeon_asic.h
> index d3a157b..cb1afb2 100644
> --- a/drivers/gpu/drm/radeon/radeon_asic.h
> +++ b/drivers/gpu/drm/radeon/radeon_asic.h
> @@ -52,6 +52,7 @@ extern int r100_resume(struct radeon_device *rdev);
>  uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg);
>  void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v);
>  void r100_vga_set_state(struct radeon_device *rdev, bool state);
> +bool r100_gpu_is_lockup(struct radeon_device *rdev);
>  int r100_gpu_reset(struct radeon_device *rdev);
>  u32 r100_get_vblank_counter(struct radeon_device *rdev, int crtc);
>  void r100_pci_gart_tlb_flush(struct radeon_device *rdev);
> @@ -89,6 +90,7 @@ static struct radeon_asic r100_asic = {
>        .suspend = &r100_suspend,
>        .resume = &r100_resume,
>        .vga_set_state = &r100_vga_set_state,
> +       .gpu_is_lockup = &r100_gpu_is_lockup,
>        .gpu_reset = &r100_gpu_reset,
>        .gart_tlb_flush = &r100_pci_gart_tlb_flush,
>        .gart_set_page = &r100_pci_gart_set_page,
> @@ -135,6 +137,7 @@ static struct radeon_asic r200_asic = {
>        .suspend = &r100_suspend,
>        .resume = &r100_resume,
>        .vga_set_state = &r100_vga_set_state,
> +       .gpu_is_lockup = &r100_gpu_is_lockup,
>        .gpu_reset = &r100_gpu_reset,
>        .gart_tlb_flush = &r100_pci_gart_tlb_flush,
>        .gart_set_page = &r100_pci_gart_set_page,
> @@ -174,6 +177,7 @@ extern int r300_init(struct radeon_device *rdev);
>  extern void r300_fini(struct radeon_device *rdev);
>  extern int r300_suspend(struct radeon_device *rdev);
>  extern int r300_resume(struct radeon_device *rdev);
> +extern bool r300_gpu_is_lockup(struct radeon_device *rdev);
>  extern int r300_gpu_reset(struct radeon_device *rdev);
>  extern void r300_ring_start(struct radeon_device *rdev);
>  extern void r300_fence_ring_emit(struct radeon_device *rdev,
> @@ -192,6 +196,7 @@ static struct radeon_asic r300_asic = {
>        .suspend = &r300_suspend,
>        .resume = &r300_resume,
>        .vga_set_state = &r100_vga_set_state,
> +       .gpu_is_lockup = &r300_gpu_is_lockup,
>        .gpu_reset = &r300_gpu_reset,
>        .gart_tlb_flush = &r100_pci_gart_tlb_flush,
>        .gart_set_page = &r100_pci_gart_set_page,
> @@ -231,6 +236,7 @@ static struct radeon_asic r300_asic_pcie = {
>        .suspend = &r300_suspend,
>        .resume = &r300_resume,
>        .vga_set_state = &r100_vga_set_state,
> +       .gpu_is_lockup = &r300_gpu_is_lockup,
>        .gpu_reset = &r300_gpu_reset,
>        .gart_tlb_flush = &rv370_pcie_gart_tlb_flush,
>        .gart_set_page = &rv370_pcie_gart_set_page,
> @@ -275,6 +281,7 @@ static struct radeon_asic r420_asic = {
>        .suspend = &r420_suspend,
>        .resume = &r420_resume,
>        .vga_set_state = &r100_vga_set_state,
> +       .gpu_is_lockup = &r300_gpu_is_lockup,
>        .gpu_reset = &r300_gpu_reset,
>        .gart_tlb_flush = &rv370_pcie_gart_tlb_flush,
>        .gart_set_page = &rv370_pcie_gart_set_page,
> @@ -325,6 +332,7 @@ static struct radeon_asic rs400_asic = {
>        .suspend = &rs400_suspend,
>        .resume = &rs400_resume,
>        .vga_set_state = &r100_vga_set_state,
> +       .gpu_is_lockup = &r300_gpu_is_lockup,
>        .gpu_reset = &r300_gpu_reset,
>        .gart_tlb_flush = &rs400_gart_tlb_flush,
>        .gart_set_page = &rs400_gart_set_page,
> @@ -385,6 +393,7 @@ static struct radeon_asic rs600_asic = {
>        .suspend = &rs600_suspend,
>        .resume = &rs600_resume,
>        .vga_set_state = &r100_vga_set_state,
> +       .gpu_is_lockup = &r300_gpu_is_lockup,
>        .gpu_reset = &r300_gpu_reset,
>        .gart_tlb_flush = &rs600_gart_tlb_flush,
>        .gart_set_page = &rs600_gart_set_page,
> @@ -434,6 +443,7 @@ static struct radeon_asic rs690_asic = {
>        .suspend = &rs690_suspend,
>        .resume = &rs690_resume,
>        .vga_set_state = &r100_vga_set_state,
> +       .gpu_is_lockup = &r300_gpu_is_lockup,
>        .gpu_reset = &r300_gpu_reset,
>        .gart_tlb_flush = &rs400_gart_tlb_flush,
>        .gart_set_page = &rs400_gart_set_page,
> @@ -487,6 +497,7 @@ static struct radeon_asic rv515_asic = {
>        .suspend = &rv515_suspend,
>        .resume = &rv515_resume,
>        .vga_set_state = &r100_vga_set_state,
> +       .gpu_is_lockup = &r300_gpu_is_lockup,
>        .gpu_reset = &rv515_gpu_reset,
>        .gart_tlb_flush = &rv370_pcie_gart_tlb_flush,
>        .gart_set_page = &rv370_pcie_gart_set_page,
> @@ -531,6 +542,7 @@ static struct radeon_asic r520_asic = {
>        .suspend = &rv515_suspend,
>        .resume = &r520_resume,
>        .vga_set_state = &r100_vga_set_state,
> +       .gpu_is_lockup = &r300_gpu_is_lockup,
>        .gpu_reset = &rv515_gpu_reset,
>        .gart_tlb_flush = &rv370_pcie_gart_tlb_flush,
>        .gart_set_page = &rv370_pcie_gart_set_page,
> @@ -587,6 +599,7 @@ int r600_copy_dma(struct radeon_device *rdev,
>                  struct radeon_fence *fence);
>  int r600_irq_process(struct radeon_device *rdev);
>  int r600_irq_set(struct radeon_device *rdev);
> +bool r600_gpu_is_lockup(struct radeon_device *rdev);
>  int r600_gpu_reset(struct radeon_device *rdev);
>  int r600_set_surface_reg(struct radeon_device *rdev, int reg,
>                         uint32_t tiling_flags, uint32_t pitch,
> @@ -611,6 +624,7 @@ static struct radeon_asic r600_asic = {
>        .resume = &r600_resume,
>        .cp_commit = &r600_cp_commit,
>        .vga_set_state = &r600_vga_set_state,
> +       .gpu_is_lockup = &r600_gpu_is_lockup,
>        .gpu_reset = &r600_gpu_reset,
>        .gart_tlb_flush = &r600_pcie_gart_tlb_flush,
>        .gart_set_page = &rs600_gart_set_page,
> @@ -648,7 +662,6 @@ int rv770_init(struct radeon_device *rdev);
>  void rv770_fini(struct radeon_device *rdev);
>  int rv770_suspend(struct radeon_device *rdev);
>  int rv770_resume(struct radeon_device *rdev);
> -int rv770_gpu_reset(struct radeon_device *rdev);
>
>  static struct radeon_asic rv770_asic = {
>        .init = &rv770_init,
> @@ -656,7 +669,8 @@ static struct radeon_asic rv770_asic = {
>        .suspend = &rv770_suspend,
>        .resume = &rv770_resume,
>        .cp_commit = &r600_cp_commit,
> -       .gpu_reset = &rv770_gpu_reset,
> +       .gpu_is_lockup = &r600_gpu_is_lockup,
> +       .gpu_reset = &r600_gpu_reset,
>        .vga_set_state = &r600_vga_set_state,
>        .gart_tlb_flush = &r600_pcie_gart_tlb_flush,
>        .gart_set_page = &rs600_gart_set_page,
> @@ -694,6 +708,7 @@ int evergreen_init(struct radeon_device *rdev);
>  void evergreen_fini(struct radeon_device *rdev);
>  int evergreen_suspend(struct radeon_device *rdev);
>  int evergreen_resume(struct radeon_device *rdev);
> +bool evergreen_gpu_is_lockup(struct radeon_device *rdev);
>  int evergreen_gpu_reset(struct radeon_device *rdev);
>  void evergreen_bandwidth_update(struct radeon_device *rdev);
>  void evergreen_hpd_init(struct radeon_device *rdev);
> @@ -708,6 +723,7 @@ static struct radeon_asic evergreen_asic = {
>        .suspend = &evergreen_suspend,
>        .resume = &evergreen_resume,
>        .cp_commit = NULL,
> +       .gpu_is_lockup = &evergreen_gpu_is_lockup,
>        .gpu_reset = &evergreen_gpu_reset,
>        .vga_set_state = &r600_vga_set_state,
>        .gart_tlb_flush = &r600_pcie_gart_tlb_flush,
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c 
> b/drivers/gpu/drm/radeon/radeon_fence.c
> index 8495d4e..3931542 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -57,7 +57,6 @@ int radeon_fence_emit(struct radeon_device *rdev, struct 
> radeon_fence *fence)
>                radeon_fence_ring_emit(rdev, fence);
>
>        fence->emited = true;
> -       fence->timeout = jiffies + ((2000 * HZ) / 1000);
>        list_del(&fence->list);
>        list_add_tail(&fence->list, &rdev->fence_drv.emited);
>        write_unlock_irqrestore(&rdev->fence_drv.lock, irq_flags);
> @@ -70,15 +69,34 @@ static bool radeon_fence_poll_locked(struct radeon_device 
> *rdev)
>        struct list_head *i, *n;
>        uint32_t seq;
>        bool wake = false;
> +       unsigned long cjiffies;
>
> -       if (rdev == NULL) {
> -               return true;
> -       }
> -       if (rdev->shutdown) {
> -               return true;
> -       }
>        seq = RREG32(rdev->fence_drv.scratch_reg);
> -       rdev->fence_drv.last_seq = seq;
> +       if (seq != rdev->fence_drv.last_seq) {
> +               rdev->fence_drv.last_seq = seq;
> +               rdev->fence_drv.last_jiffies = jiffies;
> +               rdev->fence_drv.last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
> +       } else {
> +               cjiffies = jiffies;
> +               if (time_after(cjiffies, rdev->fence_drv.last_jiffies)) {
> +                       cjiffies -= rdev->fence_drv.last_jiffies;
> +                       if (time_after(rdev->fence_drv.last_timeout, 
> cjiffies)) {
> +                               /* update the timeout */
> +                               rdev->fence_drv.last_timeout -= cjiffies;
> +                       } else {
> +                               /* the 500ms timeout is elapsed we should test
> +                                * for GPU lockup
> +                                */
> +                               rdev->fence_drv.last_timeout = 1;
> +                       }
> +               } else {
> +                       /* wrap around update last jiffies, we will just wait
> +                        * a little longer
> +                        */
> +                       rdev->fence_drv.last_jiffies = cjiffies;
> +               }
If we hit this else branch we were stalled for more than max_jiffies/2
time that would mean bad stuff in any case.
> +               return false;
> +       }
>        n = NULL;
>        list_for_each(i, &rdev->fence_drv.emited) {
>                fence = list_entry(i, struct radeon_fence, list);
> @@ -170,9 +188,8 @@ bool radeon_fence_signaled(struct radeon_fence *fence)
>  int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>  {
>        struct radeon_device *rdev;
> -       unsigned long cur_jiffies;
> -       unsigned long timeout;
> -       bool expired = false;
> +       unsigned long irq_flags, timeout;
> +       u32 seq;
>        int r;
>
>        if (fence == NULL) {
> @@ -183,14 +200,10 @@ int radeon_fence_wait(struct radeon_fence *fence, bool 
> intr)
>        if (radeon_fence_signaled(fence)) {
>                return 0;
>        }
> -
> +       timeout = rdev->fence_drv.last_timeout;
>  retry:
> -       cur_jiffies = jiffies;
> -       timeout = HZ / 100;
> -       if (time_after(fence->timeout, cur_jiffies)) {
> -               timeout = fence->timeout - cur_jiffies;
> -       }
> -
> +       /* save current sequence used to check for GPU lockup */
> +       seq = rdev->fence_drv.last_seq;
>        if (intr) {
>                radeon_irq_kms_sw_irq_get(rdev);
>                r = wait_event_interruptible_timeout(rdev->fence_drv.queue,
> @@ -205,38 +218,34 @@ retry:
>                radeon_irq_kms_sw_irq_put(rdev);
>        }
>        if (unlikely(!radeon_fence_signaled(fence))) {
> -               if (unlikely(r == 0)) {
> -                       expired = true;
> +               /* we were interrupted for some reason and fence isn't
> +                * isn't signaled yet, resume wait
> +                */
> +               if (r) {
> +                       timeout = r;
> +                       goto retry;
>                }
> -               if (unlikely(expired)) {
> -                       timeout = 1;
> -                       if (time_after(cur_jiffies, fence->timeout)) {
> -                               timeout = cur_jiffies - fence->timeout;
> -                       }
> -                       timeout = jiffies_to_msecs(timeout);
> -                       if (timeout > 500) {
> -                               DRM_ERROR("fence(%p:0x%08X) %lums timeout "
> -                                         "going to reset GPU\n",
> -                                         fence, fence->seq, timeout);
> -                               radeon_gpu_reset(rdev);
> -                               WREG32(rdev->fence_drv.scratch_reg, 
> fence->seq);
> -                       }
> +               /* don't protect read access to rdev->fence_drv.last_seq
> +                * if we experiencing a lockup the value doesn't change
> +                */
> +               if (seq == rdev->fence_drv.last_seq && 
> radeon_gpu_is_lockup(rdev)) {
> +                       /* good news we believe it's a lockup */
> +                       dev_warn(rdev->dev, "GPU lockup (last fence id 
> 0x%08X)\n", seq);
> +                       r = radeon_gpu_reset(rdev);
> +                       if (r)
> +                               return r;
> +                       /* FIXME: what should we do ? marking everyone
> +                        * as signaled for now
> +                        */
> +                       WREG32(rdev->fence_drv.scratch_reg, fence->seq);
>                }
> +               timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
> +               write_lock_irqsave(&rdev->fence_drv.lock, irq_flags);
> +               rdev->fence_drv.last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
> +               rdev->fence_drv.last_jiffies = jiffies;
> +               write_unlock_irqrestore(&rdev->fence_drv.lock, irq_flags);
>                goto retry;
>        }
> -       if (unlikely(expired)) {
> -               rdev->fence_drv.count_timeout++;
> -               cur_jiffies = jiffies;
> -               timeout = 1;
> -               if (time_after(cur_jiffies, fence->timeout)) {
> -                       timeout = cur_jiffies - fence->timeout;
> -               }
> -               timeout = jiffies_to_msecs(timeout);
> -               DRM_ERROR("fence(%p:0x%08X) %lums timeout\n",
> -                         fence, fence->seq, timeout);
> -               DRM_ERROR("last signaled fence(0x%08X)\n",
> -                         rdev->fence_drv.last_seq);
> -       }
>        return 0;
>  }
>
> @@ -332,7 +341,6 @@ int radeon_fence_driver_init(struct radeon_device *rdev)
>        INIT_LIST_HEAD(&rdev->fence_drv.created);
>        INIT_LIST_HEAD(&rdev->fence_drv.emited);
>        INIT_LIST_HEAD(&rdev->fence_drv.signaled);
> -       rdev->fence_drv.count_timeout = 0;
>        init_waitqueue_head(&rdev->fence_drv.queue);
>        rdev->fence_drv.initialized = true;
>        write_unlock_irqrestore(&rdev->fence_drv.lock, irq_flags);
> diff --git a/drivers/gpu/drm/radeon/rv770.c b/drivers/gpu/drm/radeon/rv770.c
> index 37887de..16c793b 100644
> --- a/drivers/gpu/drm/radeon/rv770.c
> +++ b/drivers/gpu/drm/radeon/rv770.c
> @@ -917,12 +917,6 @@ int rv770_mc_init(struct radeon_device *rdev)
>        return 0;
>  }
>
> -int rv770_gpu_reset(struct radeon_device *rdev)
> -{
> -       /* FIXME: implement any rv770 specific bits */
> -       return r600_gpu_reset(rdev);
> -}
> -
>  static int rv770_startup(struct radeon_device *rdev)
>  {
>        int r;
> --
> 1.6.6.1
>

It would be good idea to check if jiffies are ticking anytime when we
can't run the driver code.(like suspend) I don't think there should be
any case but it would be better to be safe than sorry.

------------------------------------------------------------------------------
Download Intel&#174; Parallel Studio Eval
Try the new software tools for yourself. Speed compiling, find bugs
proactively, and fine-tune applications for parallel performance.
See why Intel Parallel Studio got high marks during beta.
http://p.sf.net/sfu/intel-sw-dev
--
_______________________________________________
Dri-devel mailing list
Dri-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/dri-devel

Reply via email to