Re: [PATCH v15 1/2] drm/tegra: dc: Support memory bandwidth management

2021-03-16 Thread Dmitry Osipenko
15.03.2021 21:39, Dmitry Osipenko пишет:
>>> +   /*
>>> +* Horizontal downscale needs a lower memory latency, which roughly
>>> +* depends on the scaled width.  Trying to tune latency of a memory
>>> +* client alone will likely result in a strong negative impact on
>>> +* other memory clients, hence we will request a higher bandwidth
>>> +* since latency depends on bandwidth.  This allows to prevent memory
>>> +* FIFO underflows for a large plane downscales, meanwhile allowing
>>> +* display to share bandwidth fairly with other memory clients.
>>> +*/
>>> +   if (src_w > dst_w)
>>> +   mul = (src_w - dst_w) * bpp / 2048 + 1;
>>> +   else
>>> +   mul = 1;
>> [...]
>>
>> One point is unexplained yet: why is the multiplier proportional to a
>> *difference* between src and dst widths? Also, I would expect max (worst
>> case) is pixclock * read_size when src_w/dst_w >= read_size.
> IIRC, the difference gives a more adequate/practical result than the
> proportion. Although, downstream driver uses proportion. I'll try to
> revisit this for the next version of the patch.

I tried to re-test everything and can't reproduce problems that existed
previously. We didn't have a finished memory drivers back then and I
think that Tegra30 latency tuning support and various Tegra20 changes
fixed those problems. I'll remove this hunk in the next version.


Re: [PATCH v15 1/2] drm/tegra: dc: Support memory bandwidth management

2021-03-15 Thread Dmitry Osipenko
15.03.2021 01:31, Michał Mirosław пишет:
> On Thu, Mar 11, 2021 at 08:22:54PM +0300, Dmitry Osipenko wrote:
>> Display controller (DC) performs isochronous memory transfers, and thus,
>> has a requirement for a minimum memory bandwidth that shall be fulfilled,
>> otherwise framebuffer data can't be fetched fast enough and this results
>> in a DC's data-FIFO underflow that follows by a visual corruption.
> [...]
>> +static unsigned long
>> +tegra_plane_overlap_mask(struct drm_crtc_state *state,
>> + const struct drm_plane_state *plane_state)
>> +{
>> +const struct drm_plane_state *other_state;
>> +const struct tegra_plane *tegra;
>> +unsigned long overlap_mask = 0;
>> +struct drm_plane *plane;
>> +struct drm_rect rect;
>> +
>> +if (!plane_state->visible || !plane_state->fb)
>> +return 0;
>> +
>> +drm_atomic_crtc_state_for_each_plane_state(plane, other_state, state) {
> [...]
>> +/*
>> + * Data-prefetch FIFO will easily help to overcome temporal memory
>> + * pressure if other plane overlaps with the cursor plane.
>> + */
>> +if (tegra_plane_is_cursor(plane_state) && overlap_mask)
>> +return 0;
>> +
>> +return overlap_mask;
>> +}
> 
> Since for cursor plane this always returns 0, you could test
> tegra_plane_is_cursor() at the start of the function.

Yes, thanks.

>> +static int tegra_crtc_calculate_memory_bandwidth(struct drm_crtc *crtc,
>> + struct drm_atomic_state *state)
> [...]
>> +/*
>> + * For overlapping planes pixel's data is fetched for each plane at
>> + * the same time, hence bandwidths are accumulated in this case.
>> + * This needs to be taken into account for calculating total bandwidth
>> + * consumed by all planes.
>> + *
>> + * Here we get the overlapping state of each plane, which is a
>> + * bitmask of plane indices telling with what planes there is an
>> + * overlap. Note that bitmask[plane] includes BIT(plane) in order
>> + * to make further code nicer and simpler.
>> + */
>> +drm_atomic_crtc_state_for_each_plane_state(plane, plane_state, 
>> new_state) {
>> +tegra_state = to_const_tegra_plane_state(plane_state);
>> +tegra = to_tegra_plane(plane);
>> +
>> +if (WARN_ON_ONCE(tegra->index >= TEGRA_DC_LEGACY_PLANES_NUM))
>> +return -EINVAL;
>> +
>> +plane_peak_bw[tegra->index] = 
>> tegra_state->peak_memory_bandwidth;
>> +mask = tegra_plane_overlap_mask(new_state, plane_state);
>> +overlap_mask[tegra->index] = mask;
>> +
>> +if (hweight_long(mask) != 3)
>> +all_planes_overlap_simultaneously = false;
>> +}
>> +
>> +old_state = drm_atomic_get_old_crtc_state(state, crtc);
>> +old_dc_state = to_const_dc_state(old_state);
>> +
>> +/*
>> + * Then we calculate maximum bandwidth of each plane state.
>> + * The bandwidth includes the plane BW + BW of the "simultaneously"
>> + * overlapping planes, where "simultaneously" means areas where DC
>> + * fetches from the planes simultaneously during of scan-out process.
>> + *
>> + * For example, if plane A overlaps with planes B and C, but B and C
>> + * don't overlap, then the peak bandwidth will be either in area where
>> + * A-and-B or A-and-C planes overlap.
>> + *
>> + * The plane_peak_bw[] contains peak memory bandwidth values of
>> + * each plane, this information is needed by interconnect provider
>> + * in order to set up latency allowness based on the peak BW, see
>> + * tegra_crtc_update_memory_bandwidth().
>> + */
>> +for (i = 0; i < ARRAY_SIZE(plane_peak_bw); i++) {
>> +overlap_bw = 0;
>> +
>> +for_each_set_bit(k, _mask[i], 3) {
>> +if (k == i)
>> +continue;
>> +
>> +if (all_planes_overlap_simultaneously)
>> +overlap_bw += plane_peak_bw[k];
>> +else
>> +overlap_bw = max(overlap_bw, plane_peak_bw[k]);
>> +}
>> +
>> +new_dc_state->plane_peak_bw[i] = plane_peak_bw[i] + overlap_bw;
>> +
>> +/*
>> + * If plane's peak bandwidth changed (for example plane isn't
>> + * overlapped anymore) and plane isn't in the atomic state,
>> + * then add plane to the state in order to have the bandwidth
>> + * updated.
>> + */
>> +if (old_dc_state->plane_peak_bw[i] !=
>> +new_dc_state->plane_peak_bw[i]) {
>> +plane = tegra_crtc_get_plane_by_index(crtc, i);
>> +if (!plane)
>> +continue;
>> +
>> +plane_state = drm_atomic_get_plane_state(state, plane);
>> +if (IS_ERR(plane_state))
>> + 

Re: [PATCH v15 1/2] drm/tegra: dc: Support memory bandwidth management

2021-03-14 Thread Michał Mirosław
On Thu, Mar 11, 2021 at 08:22:54PM +0300, Dmitry Osipenko wrote:
> Display controller (DC) performs isochronous memory transfers, and thus,
> has a requirement for a minimum memory bandwidth that shall be fulfilled,
> otherwise framebuffer data can't be fetched fast enough and this results
> in a DC's data-FIFO underflow that follows by a visual corruption.
[...]
> +static unsigned long
> +tegra_plane_overlap_mask(struct drm_crtc_state *state,
> +  const struct drm_plane_state *plane_state)
> +{
> + const struct drm_plane_state *other_state;
> + const struct tegra_plane *tegra;
> + unsigned long overlap_mask = 0;
> + struct drm_plane *plane;
> + struct drm_rect rect;
> +
> + if (!plane_state->visible || !plane_state->fb)
> + return 0;
> +
> + drm_atomic_crtc_state_for_each_plane_state(plane, other_state, state) {
[...]
> + /*
> +  * Data-prefetch FIFO will easily help to overcome temporal memory
> +  * pressure if other plane overlaps with the cursor plane.
> +  */
> + if (tegra_plane_is_cursor(plane_state) && overlap_mask)
> + return 0;
> +
> + return overlap_mask;
> +}

Since for cursor plane this always returns 0, you could test
tegra_plane_is_cursor() at the start of the function.

> +static int tegra_crtc_calculate_memory_bandwidth(struct drm_crtc *crtc,
> +  struct drm_atomic_state *state)
[...]
> + /*
> +  * For overlapping planes pixel's data is fetched for each plane at
> +  * the same time, hence bandwidths are accumulated in this case.
> +  * This needs to be taken into account for calculating total bandwidth
> +  * consumed by all planes.
> +  *
> +  * Here we get the overlapping state of each plane, which is a
> +  * bitmask of plane indices telling with what planes there is an
> +  * overlap. Note that bitmask[plane] includes BIT(plane) in order
> +  * to make further code nicer and simpler.
> +  */
> + drm_atomic_crtc_state_for_each_plane_state(plane, plane_state, 
> new_state) {
> + tegra_state = to_const_tegra_plane_state(plane_state);
> + tegra = to_tegra_plane(plane);
> +
> + if (WARN_ON_ONCE(tegra->index >= TEGRA_DC_LEGACY_PLANES_NUM))
> + return -EINVAL;
> +
> + plane_peak_bw[tegra->index] = 
> tegra_state->peak_memory_bandwidth;
> + mask = tegra_plane_overlap_mask(new_state, plane_state);
> + overlap_mask[tegra->index] = mask;
> +
> + if (hweight_long(mask) != 3)
> + all_planes_overlap_simultaneously = false;
> + }
> +
> + old_state = drm_atomic_get_old_crtc_state(state, crtc);
> + old_dc_state = to_const_dc_state(old_state);
> +
> + /*
> +  * Then we calculate maximum bandwidth of each plane state.
> +  * The bandwidth includes the plane BW + BW of the "simultaneously"
> +  * overlapping planes, where "simultaneously" means areas where DC
> +  * fetches from the planes simultaneously during of scan-out process.
> +  *
> +  * For example, if plane A overlaps with planes B and C, but B and C
> +  * don't overlap, then the peak bandwidth will be either in area where
> +  * A-and-B or A-and-C planes overlap.
> +  *
> +  * The plane_peak_bw[] contains peak memory bandwidth values of
> +  * each plane, this information is needed by interconnect provider
> +  * in order to set up latency allowness based on the peak BW, see
> +  * tegra_crtc_update_memory_bandwidth().
> +  */
> + for (i = 0; i < ARRAY_SIZE(plane_peak_bw); i++) {
> + overlap_bw = 0;
> +
> + for_each_set_bit(k, _mask[i], 3) {
> + if (k == i)
> + continue;
> +
> + if (all_planes_overlap_simultaneously)
> + overlap_bw += plane_peak_bw[k];
> + else
> + overlap_bw = max(overlap_bw, plane_peak_bw[k]);
> + }
> +
> + new_dc_state->plane_peak_bw[i] = plane_peak_bw[i] + overlap_bw;
> +
> + /*
> +  * If plane's peak bandwidth changed (for example plane isn't
> +  * overlapped anymore) and plane isn't in the atomic state,
> +  * then add plane to the state in order to have the bandwidth
> +  * updated.
> +  */
> + if (old_dc_state->plane_peak_bw[i] !=
> + new_dc_state->plane_peak_bw[i]) {
> + plane = tegra_crtc_get_plane_by_index(crtc, i);
> + if (!plane)
> + continue;
> +
> + plane_state = drm_atomic_get_plane_state(state, plane);
> + if (IS_ERR(plane_state))
> + return PTR_ERR(plane_state);
> + }
> + }
[...]

Does it 

[PATCH v15 1/2] drm/tegra: dc: Support memory bandwidth management

2021-03-11 Thread Dmitry Osipenko
Display controller (DC) performs isochronous memory transfers, and thus,
has a requirement for a minimum memory bandwidth that shall be fulfilled,
otherwise framebuffer data can't be fetched fast enough and this results
in a DC's data-FIFO underflow that follows by a visual corruption.

The Memory Controller drivers provide facility for memory bandwidth
management via interconnect API. Let's wire up the interconnect API
support to the DC driver in order to fix the distorted display output
on T30 Ouya, T124 TK1 and other Tegra devices.

Tested-by: Peter Geis  # Ouya T30
Tested-by: Matt Merhar  # Ouya T30
Tested-by: Nicolas Chauvet  # PAZ00 T20 and TK1 T124
Signed-off-by: Dmitry Osipenko 
---
 drivers/gpu/drm/tegra/Kconfig |   1 +
 drivers/gpu/drm/tegra/dc.c| 352 ++
 drivers/gpu/drm/tegra/dc.h|  14 ++
 drivers/gpu/drm/tegra/drm.c   |  14 ++
 drivers/gpu/drm/tegra/hub.c   |   3 +
 drivers/gpu/drm/tegra/plane.c | 127 
 drivers/gpu/drm/tegra/plane.h |  15 ++
 7 files changed, 526 insertions(+)

diff --git a/drivers/gpu/drm/tegra/Kconfig b/drivers/gpu/drm/tegra/Kconfig
index 5043dcaf1cf9..1650a448eabd 100644
--- a/drivers/gpu/drm/tegra/Kconfig
+++ b/drivers/gpu/drm/tegra/Kconfig
@@ -9,6 +9,7 @@ config DRM_TEGRA
select DRM_MIPI_DSI
select DRM_PANEL
select TEGRA_HOST1X
+   select INTERCONNECT
select IOMMU_IOVA
select CEC_CORE if CEC_NOTIFIER
help
diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c
index 0ae3a025efe9..49fa488cf930 100644
--- a/drivers/gpu/drm/tegra/dc.c
+++ b/drivers/gpu/drm/tegra/dc.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -616,6 +617,9 @@ static int tegra_plane_atomic_check(struct drm_plane *plane,
struct tegra_dc *dc = to_tegra_dc(state->crtc);
int err;
 
+   plane_state->peak_memory_bandwidth = 0;
+   plane_state->avg_memory_bandwidth = 0;
+
/* no need for further checks if the plane is being disabled */
if (!state->crtc)
return 0;
@@ -802,6 +806,12 @@ static struct drm_plane *tegra_primary_plane_create(struct 
drm_device *drm,
formats = dc->soc->primary_formats;
modifiers = dc->soc->modifiers;
 
+   err = tegra_plane_interconnect_init(plane);
+   if (err) {
+   kfree(plane);
+   return ERR_PTR(err);
+   }
+
err = drm_universal_plane_init(drm, >base, possible_crtcs,
   _plane_funcs, formats,
   num_formats, modifiers, type, NULL);
@@ -833,9 +843,13 @@ static const u32 tegra_cursor_plane_formats[] = {
 static int tegra_cursor_atomic_check(struct drm_plane *plane,
 struct drm_plane_state *state)
 {
+   struct tegra_plane_state *plane_state = to_tegra_plane_state(state);
struct tegra_plane *tegra = to_tegra_plane(plane);
int err;
 
+   plane_state->peak_memory_bandwidth = 0;
+   plane_state->avg_memory_bandwidth = 0;
+
/* no need for further checks if the plane is being disabled */
if (!state->crtc)
return 0;
@@ -973,6 +987,12 @@ static struct drm_plane 
*tegra_dc_cursor_plane_create(struct drm_device *drm,
num_formats = ARRAY_SIZE(tegra_cursor_plane_formats);
formats = tegra_cursor_plane_formats;
 
+   err = tegra_plane_interconnect_init(plane);
+   if (err) {
+   kfree(plane);
+   return ERR_PTR(err);
+   }
+
err = drm_universal_plane_init(drm, >base, possible_crtcs,
   _plane_funcs, formats,
   num_formats, NULL,
@@ -1087,6 +1107,12 @@ static struct drm_plane 
*tegra_dc_overlay_plane_create(struct drm_device *drm,
num_formats = dc->soc->num_overlay_formats;
formats = dc->soc->overlay_formats;
 
+   err = tegra_plane_interconnect_init(plane);
+   if (err) {
+   kfree(plane);
+   return ERR_PTR(err);
+   }
+
if (!cursor)
type = DRM_PLANE_TYPE_OVERLAY;
else
@@ -1204,6 +1230,7 @@ tegra_crtc_atomic_duplicate_state(struct drm_crtc *crtc)
 {
struct tegra_dc_state *state = to_dc_state(crtc->state);
struct tegra_dc_state *copy;
+   unsigned int i;
 
copy = kmalloc(sizeof(*copy), GFP_KERNEL);
if (!copy)
@@ -1215,6 +1242,9 @@ tegra_crtc_atomic_duplicate_state(struct drm_crtc *crtc)
copy->div = state->div;
copy->planes = state->planes;
 
+   for (i = 0; i < ARRAY_SIZE(state->plane_peak_bw); i++)
+   copy->plane_peak_bw[i] = state->plane_peak_bw[i];
+
return >base;
 }
 
@@ -1741,6 +1771,106 @@ static int tegra_dc_wait_idle(struct tegra_dc *dc, 
unsigned long timeout)
return -ETIMEDOUT;
 }
 
+static void
+tegra_crtc_update_memory_bandwidth(struct drm_crtc