Hi,

2014-07-10 19:35 GMT+02:00 Michael Niedermayer <michae...@gmx.at>:
> AV_COPY, AV_SWAP, AV_ZERO
> AV_[RW]N[8-64]A need aligned memory
>
> AV_[RW][BLN][8-64]
> AV_COPY*U doesnt need aligned memory, but might be faster if its
> aligned.
> These might be slower than the "aligned only" variants

The documentation has been sent in a separate patch.

If I have to assume that AV_ZERO64 needs 8-byte-aligned addresses,
then unfortunately, forcing that alignment on the MvField struct bumps
its size from 11 to 16 bytes. Maybe it's worth it performance-wise, but
I'm not going to make us spend more time on this part.

So here's a patch where I align various things to 4 bytes, which allows
using AV_ZERO32/AV_RN32A.

-- 
Christophe
From 86fcce0a680e799eff3df86eb28ee77a88736811 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet <christophe.gisq...@gmail.com>
Date: Sat, 12 Jul 2014 16:51:09 +0200
Subject: [PATCH 2/4] hevc: use intreadwrite

When dealing with MVs, both components (x and y) can be processed at
once with a single 32-bit access.

On Win64, derive_spatial_merge_candidates goes from 560 to 539 cycles.
---
 libavcodec/hevc.c     |  3 +--
 libavcodec/hevc.h     |  2 +-
 libavcodec/hevc_mvs.c | 26 +++++++++++---------------
 3 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index afb2baa..2cc4eca 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -1696,8 +1696,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
                 }
 
                 if (s->sh.mvd_l1_zero_flag == 1 && inter_pred_idc == PRED_BI) {
-                    lc->pu.mvd.x = 0;
-                    lc->pu.mvd.y = 0;
+                    AV_ZERO32(&lc->pu.mvd);
                 } else {
                     ff_hevc_hls_mvd_coding(s, x0, y0, 1);
                 }
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
index 2a5ce25..04be643 100644
--- a/libavcodec/hevc.h
+++ b/libavcodec/hevc.h
@@ -660,7 +660,7 @@ typedef struct Mv {
 } Mv;
 
 typedef struct MvField {
-    Mv mv[2];
+    DECLARE_ALIGNED(4, Mv, mv)[2];
     int8_t ref_idx[2];
     int8_t pred_flag;
 } MvField;
diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c
index 023fb55..b1a1ffc 100644
--- a/libavcodec/hevc_mvs.c
+++ b/libavcodec/hevc_mvs.c
@@ -125,6 +125,7 @@ static int isDiffMER(HEVCContext *s, int xN, int yN, int xP, int yP)
            yN >> plevel == yP >> plevel;
 }
 
+#define MATCH_MV(x) (AV_RN32A(&A.x) == AV_RN32A(&B.x))
 #define MATCH(x) (A.x == B.x)
 
 // check if the mv's and refidx are the same between A and B
@@ -134,12 +135,12 @@ static int compareMVrefidx(struct MvField A, struct MvField B)
     int b_pf = B.pred_flag;
     if (a_pf == b_pf) {
         if (a_pf == PF_BI) {
-            return MATCH(ref_idx[0]) && MATCH(mv[0].x) && MATCH(mv[0].y) &&
-                   MATCH(ref_idx[1]) && MATCH(mv[1].x) && MATCH(mv[1].y);
+            return MATCH(ref_idx[0]) && MATCH_MV(mv[0]) &&
+                   MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
         } else if (a_pf == PF_L0) {
-            return MATCH(ref_idx[0]) && MATCH(mv[0].x) && MATCH(mv[0].y);
+            return MATCH(ref_idx[0]) && MATCH_MV(mv[0]);
         } else if (a_pf == PF_L1) {
-            return MATCH(ref_idx[1]) && MATCH(mv[1].x) && MATCH(mv[1].y);
+            return MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
         }
     }
     return 0;
@@ -505,15 +506,12 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
             if ((l0_cand.pred_flag & PF_L0) && (l1_cand.pred_flag & PF_L1) &&
                 (refPicList[0].list[l0_cand.ref_idx[0]] !=
                  refPicList[1].list[l1_cand.ref_idx[1]] ||
-                 l0_cand.mv[0].x != l1_cand.mv[1].x ||
-                 l0_cand.mv[0].y != l1_cand.mv[1].y)) {
+                 AV_RN32A(&l0_cand.mv[0]) != AV_RN32A(&l1_cand.mv[1]))) {
                 mergecandlist[nb_merge_cand].ref_idx[0]   = l0_cand.ref_idx[0];
                 mergecandlist[nb_merge_cand].ref_idx[1]   = l1_cand.ref_idx[1];
                 mergecandlist[nb_merge_cand].pred_flag    = PF_BI;
-                mergecandlist[nb_merge_cand].mv[0].x      = l0_cand.mv[0].x;
-                mergecandlist[nb_merge_cand].mv[0].y      = l0_cand.mv[0].y;
-                mergecandlist[nb_merge_cand].mv[1].x      = l1_cand.mv[1].x;
-                mergecandlist[nb_merge_cand].mv[1].y      = l1_cand.mv[1].y;
+                AV_COPY32(&mergecandlist[nb_merge_cand].mv[0], &l0_cand.mv[0]);
+                AV_COPY32(&mergecandlist[nb_merge_cand].mv[1], &l1_cand.mv[1]);
                 if (merge_idx == nb_merge_cand) return;
                 nb_merge_cand++;
             }
@@ -523,10 +521,8 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     // append Zero motion vector candidates
     while (nb_merge_cand < s->sh.max_num_merge_cand) {
         mergecandlist[nb_merge_cand].pred_flag    = PF_L0 + ((s->sh.slice_type == B_SLICE) << 1);
-        mergecandlist[nb_merge_cand].mv[0].x      = 0;
-        mergecandlist[nb_merge_cand].mv[0].y      = 0;
-        mergecandlist[nb_merge_cand].mv[1].x      = 0;
-        mergecandlist[nb_merge_cand].mv[1].y      = 0;
+        AV_ZERO32(mergecandlist[nb_merge_cand].mv+0);
+        AV_ZERO32(mergecandlist[nb_merge_cand].mv+1);
         mergecandlist[nb_merge_cand].ref_idx[0]   = zero_idx < nb_refs ? zero_idx : 0;
         mergecandlist[nb_merge_cand].ref_idx[1]   = zero_idx < nb_refs ? zero_idx : 0;
 
@@ -545,7 +541,7 @@ void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, int nPbW,
 {
     int singleMCLFlag = 0;
     int nCS = 1 << log2_cb_size;
-    struct MvField mergecand_list[MRG_MAX_NUM_CANDS] = { { { { 0 } } } };
+    LOCAL_ALIGNED(4, MvField, mergecand_list, [MRG_MAX_NUM_CANDS]);
     int nPbW2 = nPbW;
     int nPbH2 = nPbH;
     HEVCLocalContext *lc = s->HEVClc;
-- 
1.9.2.msysgit.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to