Re: [PATCH 1/2] drm/vc4: Add T-format scanout support.

2017-06-13 Thread Boris Brezillon
On Wed,  7 Jun 2017 17:13:35 -0700
Eric Anholt  wrote:

> The T tiling format is what V3D uses for textures, with no raster
> support at all until later revisions of the hardware (and always at a
> large 3D performance penalty).  If we can't scan out V3D's format,
> then we often need to do a relayout at some stage of the pipeline,
> either right before texturing from the scanout buffer (common in X11
> without a compositor) or between a tiled screen buffer right before
> scanout (an option I've considered in trying to resolve this
> inconsistency, but which means needing to use the dirty fb ioctl and
> having some update policy).
> 
> T-format scanout lets us avoid either of those shadow copies, for a
> massive, obvious performance improvement to X11 window dragging
> without a compositor.  Unfortunately, enabling a compositor to work
> around the discrepancy has turned out to be too costly in memory
> consumption for the Raspbian distribution.
> 
> Because the HVS operates a scanline at a time, compositing from T does
> increase the memory bandwidth cost of scanout.  On my 1920x1080@32bpp
> display on a RPi3, we go from about 15% of system memory bandwidth
> with linear to about 20% with tiled.  However, for X11 this still ends
> up being a huge performance win in active usage.
> 
> This patch doesn't yet handle src_x/src_y offsetting within the tiled
> buffer.  However, we fail to do so for untiled buffers already.
> 
> Signed-off-by: Eric Anholt 

Reviewed-by: Boris Brezillon 

> ---
>  drivers/gpu/drm/vc4/vc4_plane.c | 31 +++
>  drivers/gpu/drm/vc4/vc4_regs.h  | 19 +++
>  include/uapi/drm/drm_fourcc.h   | 23 ++-
>  3 files changed, 68 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
> index da18dec21696..fa6809d8b0fe 100644
> --- a/drivers/gpu/drm/vc4/vc4_plane.c
> +++ b/drivers/gpu/drm/vc4/vc4_plane.c
> @@ -500,8 +500,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
>   u32 ctl0_offset = vc4_state->dlist_count;
>   const struct hvs_format *format = 
> vc4_get_hvs_format(fb->format->format);
>   int num_planes = drm_format_num_planes(format->drm);
> - u32 scl0, scl1;
> - u32 lbm_size;
> + u32 scl0, scl1, pitch0;
> + u32 lbm_size, tiling;
>   unsigned long irqflags;
>   int ret, i;
>  
> @@ -542,11 +542,31 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
>   scl1 = vc4_get_scl_field(state, 0);
>   }
>  
> + switch (fb->modifier) {
> + case DRM_FORMAT_MOD_LINEAR:
> + tiling = SCALER_CTL0_TILING_LINEAR;
> + pitch0 = VC4_SET_FIELD(fb->pitches[0], SCALER_SRC_PITCH);
> + break;
> + case DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED:
> + tiling = SCALER_CTL0_TILING_256B_OR_T;
> +
> + pitch0 = (VC4_SET_FIELD(0, SCALER_PITCH0_TILE_Y_OFFSET),
> +   VC4_SET_FIELD(0, SCALER_PITCH0_TILE_WIDTH_L),
> +   VC4_SET_FIELD((vc4_state->src_w[0] + 31) >> 5,
> + SCALER_PITCH0_TILE_WIDTH_R));
> + break;
> + default:
> + DRM_DEBUG_KMS("Unsupported FB tiling flag 0x%16llx",
> +   (long long)fb->modifier);
> + return -EINVAL;
> + }
> +
>   /* Control word */
>   vc4_dlist_write(vc4_state,
>   SCALER_CTL0_VALID |
>   (format->pixel_order << SCALER_CTL0_ORDER_SHIFT) |
>   (format->hvs << SCALER_CTL0_PIXEL_FORMAT_SHIFT) |
> + VC4_SET_FIELD(tiling, SCALER_CTL0_TILING) |
>   (vc4_state->is_unity ? SCALER_CTL0_UNITY : 0) |
>   VC4_SET_FIELD(scl0, SCALER_CTL0_SCL0) |
>   VC4_SET_FIELD(scl1, SCALER_CTL0_SCL1));
> @@ -600,8 +620,11 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
>   for (i = 0; i < num_planes; i++)
>   vc4_dlist_write(vc4_state, 0xc0c0c0c0);
>  
> - /* Pitch word 0/1/2 */
> - for (i = 0; i < num_planes; i++) {
> + /* Pitch word 0 */
> + vc4_dlist_write(vc4_state, pitch0);
> +
> + /* Pitch word 1/2 */
> + for (i = 1; i < num_planes; i++) {
>   vc4_dlist_write(vc4_state,
>   VC4_SET_FIELD(fb->pitches[i], 
> SCALER_SRC_PITCH));
>   }
> diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
> index 932093936178..d382c34c1b9e 100644
> --- a/drivers/gpu/drm/vc4/vc4_regs.h
> +++ b/drivers/gpu/drm/vc4/vc4_regs.h
> @@ -709,6 +709,13 @@ enum hvs_pixel_format {
>  #define SCALER_CTL0_SIZE_MASKVC4_MASK(29, 24)
>  #define SCALER_CTL0_SIZE_SHIFT   24
>  
> +#define SCALER_CTL0_TILING_MASK  VC4_MASK(21, 20)
> +#define SCALER_CTL0_TILING_SHIFT   

[PATCH 1/2] drm/vc4: Add T-format scanout support.

2017-06-07 Thread Eric Anholt
The T tiling format is what V3D uses for textures, with no raster
support at all until later revisions of the hardware (and always at a
large 3D performance penalty).  If we can't scan out V3D's format,
then we often need to do a relayout at some stage of the pipeline,
either right before texturing from the scanout buffer (common in X11
without a compositor) or between a tiled screen buffer right before
scanout (an option I've considered in trying to resolve this
inconsistency, but which means needing to use the dirty fb ioctl and
having some update policy).

T-format scanout lets us avoid either of those shadow copies, for a
massive, obvious performance improvement to X11 window dragging
without a compositor.  Unfortunately, enabling a compositor to work
around the discrepancy has turned out to be too costly in memory
consumption for the Raspbian distribution.

Because the HVS operates a scanline at a time, compositing from T does
increase the memory bandwidth cost of scanout.  On my 1920x1080@32bpp
display on a RPi3, we go from about 15% of system memory bandwidth
with linear to about 20% with tiled.  However, for X11 this still ends
up being a huge performance win in active usage.

This patch doesn't yet handle src_x/src_y offsetting within the tiled
buffer.  However, we fail to do so for untiled buffers already.

Signed-off-by: Eric Anholt 
---
 drivers/gpu/drm/vc4/vc4_plane.c | 31 +++
 drivers/gpu/drm/vc4/vc4_regs.h  | 19 +++
 include/uapi/drm/drm_fourcc.h   | 23 ++-
 3 files changed, 68 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index da18dec21696..fa6809d8b0fe 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -500,8 +500,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
u32 ctl0_offset = vc4_state->dlist_count;
const struct hvs_format *format = 
vc4_get_hvs_format(fb->format->format);
int num_planes = drm_format_num_planes(format->drm);
-   u32 scl0, scl1;
-   u32 lbm_size;
+   u32 scl0, scl1, pitch0;
+   u32 lbm_size, tiling;
unsigned long irqflags;
int ret, i;
 
@@ -542,11 +542,31 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
scl1 = vc4_get_scl_field(state, 0);
}
 
+   switch (fb->modifier) {
+   case DRM_FORMAT_MOD_LINEAR:
+   tiling = SCALER_CTL0_TILING_LINEAR;
+   pitch0 = VC4_SET_FIELD(fb->pitches[0], SCALER_SRC_PITCH);
+   break;
+   case DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED:
+   tiling = SCALER_CTL0_TILING_256B_OR_T;
+
+   pitch0 = (VC4_SET_FIELD(0, SCALER_PITCH0_TILE_Y_OFFSET),
+ VC4_SET_FIELD(0, SCALER_PITCH0_TILE_WIDTH_L),
+ VC4_SET_FIELD((vc4_state->src_w[0] + 31) >> 5,
+   SCALER_PITCH0_TILE_WIDTH_R));
+   break;
+   default:
+   DRM_DEBUG_KMS("Unsupported FB tiling flag 0x%16llx",
+ (long long)fb->modifier);
+   return -EINVAL;
+   }
+
/* Control word */
vc4_dlist_write(vc4_state,
SCALER_CTL0_VALID |
(format->pixel_order << SCALER_CTL0_ORDER_SHIFT) |
(format->hvs << SCALER_CTL0_PIXEL_FORMAT_SHIFT) |
+   VC4_SET_FIELD(tiling, SCALER_CTL0_TILING) |
(vc4_state->is_unity ? SCALER_CTL0_UNITY : 0) |
VC4_SET_FIELD(scl0, SCALER_CTL0_SCL0) |
VC4_SET_FIELD(scl1, SCALER_CTL0_SCL1));
@@ -600,8 +620,11 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
for (i = 0; i < num_planes; i++)
vc4_dlist_write(vc4_state, 0xc0c0c0c0);
 
-   /* Pitch word 0/1/2 */
-   for (i = 0; i < num_planes; i++) {
+   /* Pitch word 0 */
+   vc4_dlist_write(vc4_state, pitch0);
+
+   /* Pitch word 1/2 */
+   for (i = 1; i < num_planes; i++) {
vc4_dlist_write(vc4_state,
VC4_SET_FIELD(fb->pitches[i], 
SCALER_SRC_PITCH));
}
diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
index 932093936178..d382c34c1b9e 100644
--- a/drivers/gpu/drm/vc4/vc4_regs.h
+++ b/drivers/gpu/drm/vc4/vc4_regs.h
@@ -709,6 +709,13 @@ enum hvs_pixel_format {
 #define SCALER_CTL0_SIZE_MASK  VC4_MASK(29, 24)
 #define SCALER_CTL0_SIZE_SHIFT 24
 
+#define SCALER_CTL0_TILING_MASKVC4_MASK(21, 20)
+#define SCALER_CTL0_TILING_SHIFT   20
+#define SCALER_CTL0_TILING_LINEAR  0
+#define SCALER_CTL0_TILING_64B 1
+#define SCALER_CTL0_TILING_128B2
+#define SCALER_CTL0_TILING_256B_OR_T   3
+
 #define SCALER_CTL0_HFLIP