> -----Original Message-----
> From: lng-odp [mailto:lng-odp-boun...@lists.linaro.org] On Behalf Of
> ext Ivan Khoronzhuk
> Sent: Monday, September 07, 2015 7:15 PM
> To: lng-odp@lists.linaro.org
> Subject: Re: [lng-odp] [API-NEXT PATCH 5/5] performance: sched: update
> scheduling test to use cycle counts
> 
> 
> 
> On 07.09.15 15:42, Petri Savolainen wrote:
> > Use CPU cycle count API instead of time API to measure CPU
> > cycles.
> >
> > Signed-off-by: Petri Savolainen <petri.savolai...@nokia.com>
> > ---
> >   test/performance/odp_scheduling.c | 132 ++++++++++++++++-----------
> -----------
> >   1 file changed, 57 insertions(+), 75 deletions(-)
> >
> > diff --git a/test/performance/odp_scheduling.c
> b/test/performance/odp_scheduling.c
> > index 2a7e531..b1206e8 100644
> > --- a/test/performance/odp_scheduling.c
> > +++ b/test/performance/odp_scheduling.c
> > @@ -183,9 +183,9 @@ static int test_alloc_single(int thr, odp_pool_t
> pool)
> >   {
> >     int i;
> >     odp_buffer_t temp_buf;
> > -   uint64_t t1, t2, cycles, ns;
> > +   uint64_t c1, c2, cycles;
> >
> > -   t1 = odp_time_cycles();
> > +   c1 = odp_cpu_cycles();
> >
> >     for (i = 0; i < ALLOC_ROUNDS; i++) {
> >             temp_buf = odp_buffer_alloc(pool);
> > @@ -198,12 +198,11 @@ static int test_alloc_single(int thr,
> odp_pool_t pool)
> >             odp_buffer_free(temp_buf);
> >     }
> >
> > -   t2     = odp_time_cycles();
> > -   cycles = odp_time_diff_cycles(t1, t2);
> > -   ns     = odp_time_cycles_to_ns(cycles);
> > +   c2     = odp_cpu_cycles();
> > +   cycles = odp_cpu_cycles_diff(c1, c2);
> >
> > -   printf("  [%i] alloc_sng alloc+free   %"PRIu64" cycles, %"PRIu64"
> ns\n",
> > -          thr, cycles/ALLOC_ROUNDS, ns/ALLOC_ROUNDS);
> > +   printf("  [%i] alloc_sng alloc+free   %4" PRIu64 " CPU cycles\n",
> > +          thr, cycles / ALLOC_ROUNDS);
> >
> >     return 0;
> >   }
> > @@ -220,9 +219,9 @@ static int test_alloc_multi(int thr, odp_pool_t
> pool)
> >   {
> >     int i, j;
> >     odp_buffer_t temp_buf[MAX_ALLOCS];
> > -   uint64_t t1, t2, cycles, ns;
> > +   uint64_t c1, c2, cycles;
> >
> > -   t1 = odp_time_cycles();
> > +   c1 = odp_cpu_cycles();
> >
> >     for (i = 0; i < ALLOC_ROUNDS; i++) {
> >             for (j = 0; j < MAX_ALLOCS; j++) {
> > @@ -238,13 +237,11 @@ static int test_alloc_multi(int thr, odp_pool_t
> pool)
> >                     odp_buffer_free(temp_buf[j-1]);
> >     }
> >
> > -   t2     = odp_time_cycles();
> > -   cycles = odp_time_diff_cycles(t1, t2);
> > -   ns     = odp_time_cycles_to_ns(cycles);
> > +   c2     = odp_cpu_cycles();
> > +   cycles = odp_cpu_cycles_diff(c1, c2);
> >
> > -   printf("  [%i] alloc_multi alloc+free %"PRIu64" cycles, %"PRIu64"
> ns\n",
> > -          thr, cycles/(ALLOC_ROUNDS*MAX_ALLOCS),
> > -          ns/(ALLOC_ROUNDS*MAX_ALLOCS));
> > +   printf("  [%i] alloc_multi alloc+free %4" PRIu64 " CPU cycles\n",
> I'd rather not limit it to 4 (%4) and leave as is, as we never can be
> sure that
> test works as expected. I mean all printfs in this patch.

It's not limiting. It reserves space for 4 digits. More digits than that would 
just misalign.

Anyway, changed to 6 digits.

> 
> > +          thr, cycles / (ALLOC_ROUNDS * MAX_ALLOCS));
> >
> >     return 0;
> >   }
> > @@ -265,7 +262,7 @@ static int test_poll_queue(int thr, odp_pool_t
> msg_pool)
> >     odp_buffer_t buf;
> >     test_message_t *t_msg;
> >     odp_queue_t queue;
> > -   uint64_t t1, t2, cycles, ns;
> > +   uint64_t c1, c2, cycles;
> >     int i;
> >
> >     /* Alloc test message */
> > @@ -289,7 +286,7 @@ static int test_poll_queue(int thr, odp_pool_t
> msg_pool)
> >             return -1;
> >     }
> >
> > -   t1 = odp_time_cycles();
> > +   c1 = odp_cpu_cycles();
> >
> >     for (i = 0; i < QUEUE_ROUNDS; i++) {
> >             ev = odp_buffer_to_event(buf);
> > @@ -310,12 +307,11 @@ static int test_poll_queue(int thr, odp_pool_t
> msg_pool)
> >             }
> >     }
> >
> > -   t2     = odp_time_cycles();
> > -   cycles = odp_time_diff_cycles(t1, t2);
> > -   ns     = odp_time_cycles_to_ns(cycles);
> > +   c2     = odp_cpu_cycles();
> > +   cycles = odp_cpu_cycles_diff(c1, c2);
> >
> > -   printf("  [%i] poll_queue enq+deq     %"PRIu64" cycles, %"PRIu64"
> ns\n",
> > -          thr, cycles/QUEUE_ROUNDS, ns/QUEUE_ROUNDS);
> > +   printf("  [%i] poll_queue enq+deq     %4" PRIu64 " CPU cycles\n",
> > +          thr, cycles / QUEUE_ROUNDS);
> >
> >     odp_buffer_free(buf);
> >     return 0;
> > @@ -341,14 +337,14 @@ static int test_schedule_single(const char
> *str, int thr,
> >   {
> >     odp_event_t ev;
> >     odp_queue_t queue;
> > -   uint64_t t1, t2, cycles, ns;
> > +   uint64_t c1, c2, cycles;
> >     uint32_t i;
> >     uint32_t tot;
> >
> >     if (create_queue(thr, msg_pool, prio))
> >             return -1;
> >
> > -   t1 = odp_time_cycles();
> > +   c1 = odp_cpu_cycles();
> >
> >     for (i = 0; i < QUEUE_ROUNDS; i++) {
> >             ev = odp_schedule(&queue, ODP_SCHED_WAIT);
> > @@ -382,18 +378,15 @@ static int test_schedule_single(const char
> *str, int thr,
> >
> >     odp_schedule_resume();
> >
> > -   t2     = odp_time_cycles();
> > -   cycles = odp_time_diff_cycles(t1, t2);
> > -   ns     = odp_time_cycles_to_ns(cycles);
> > +   c2     = odp_cpu_cycles();
> > +   cycles = odp_cpu_cycles_diff(c1, c2);
> >
> >     odp_barrier_wait(barrier);
> >     clear_sched_queues();
> >
> >     cycles = cycles/tot;
> > -   ns     = ns/tot;
> >
> > -   printf("  [%i] %s enq+deq %"PRIu64" cycles, %"PRIu64" ns\n",
> > -          thr, str, cycles, ns);
> > +   printf("  [%i] %s enq+deq %4" PRIu64 " CPU cycles\n", thr, str,
> cycles);
> >
> >     return 0;
> >   }
> > @@ -419,9 +412,7 @@ static int test_schedule_many(const char *str,
> int thr,
> >   {
> >     odp_event_t ev;
> >     odp_queue_t queue;
> > -   uint64_t t1;
> > -   uint64_t t2;
> > -   uint64_t cycles, ns;
> > +   uint64_t c1, c2, cycles;
> >     uint32_t i;
> >     uint32_t tot;
> >
> > @@ -429,7 +420,7 @@ static int test_schedule_many(const char *str,
> int thr,
> >             return -1;
> >
> >     /* Start sched-enq loop */
> > -   t1 = odp_time_cycles();
> > +   c1 = odp_cpu_cycles();
> >
> >     for (i = 0; i < QUEUE_ROUNDS; i++) {
> >             ev = odp_schedule(&queue, ODP_SCHED_WAIT);
> > @@ -463,18 +454,15 @@ static int test_schedule_many(const char *str,
> int thr,
> >
> >     odp_schedule_resume();
> >
> > -   t2     = odp_time_cycles();
> > -   cycles = odp_time_diff_cycles(t1, t2);
> > -   ns     = odp_time_cycles_to_ns(cycles);
> > +   c2     = odp_cpu_cycles();
> > +   cycles = odp_cpu_cycles_diff(c1, c2);
> >
> >     odp_barrier_wait(barrier);
> >     clear_sched_queues();
> >
> >     cycles = cycles/tot;
> I know that it's not from change, but printf is changed,
> maybe in order to keep module style, put cycles / tot in printf.

It's a matter of taste and not relevant for this change. Anyway, changed other 
locations to use this style.

> 
> > -   ns     = ns/tot;
> >
> > -   printf("  [%i] %s enq+deq %"PRIu64" cycles, %"PRIu64" ns\n",
> > -          thr, str, cycles, ns);
> > +   printf("  [%i] %s enq+deq %4" PRIu64 " CPU cycles\n", thr, str,
> cycles);
> >
> >     return 0;
> >   }
> > @@ -496,9 +484,7 @@ static int test_schedule_multi(const char *str,
> int thr,
> >   {
> >     odp_event_t ev[MULTI_BUFS_MAX];
> >     odp_queue_t queue;
> > -   uint64_t t1;
> > -   uint64_t t2;
> > -   uint64_t cycles, ns;
> > +   uint64_t c1, c2, cycles;
> >     int i, j;
> >     int num;
> >     uint32_t tot = 0;
> > @@ -547,7 +533,7 @@ static int test_schedule_multi(const char *str,
> int thr,
> >     }
> >
> >     /* Start sched-enq loop */
> > -   t1 = odp_time_cycles();
> > +   c1 = odp_cpu_cycles();
> >
> >     for (i = 0; i < QUEUE_ROUNDS; i++) {
> >             num = odp_schedule_multi(&queue, ODP_SCHED_WAIT, ev,
> > @@ -584,23 +570,18 @@ static int test_schedule_multi(const char *str,
> int thr,
> >     odp_schedule_resume();
> >
> >
> > -   t2     = odp_time_cycles();
> > -   cycles = odp_time_diff_cycles(t1, t2);
> > -   ns     = odp_time_cycles_to_ns(cycles);
> > +   c2     = odp_cpu_cycles();
> > +   cycles = odp_cpu_cycles_diff(c1, c2);
> >
> >     odp_barrier_wait(barrier);
> >     clear_sched_queues();
> >
> > -   if (tot) {
> > +   if (tot)
> >             cycles = cycles/tot;
> > -           ns     = ns/tot;
> > -   } else {
> > +   else
> >             cycles = 0;
> > -           ns     = 0;
> > -   }
> >
> > -   printf("  [%i] %s enq+deq %"PRIu64" cycles, %"PRIu64" ns\n",
> > -          thr, str, cycles, ns);
> > +   printf("  [%i] %s enq+deq %4" PRIu64 " CPU cycles\n", thr, str,
> cycles);
> >
> >     return 0;
> >   }
> > @@ -714,21 +695,21 @@ static void *run_thread(void *arg)
> >   }
> >
> >   /**
> > - * @internal Test cycle counter accuracy
> > + * @internal Test cycle counter frequency
> >    */
> > -static void test_time(void)
> > +static void test_cpu_freq(void)
> Why? maybe test_cpu_cycles*...

It measures cpu cycle count frequency against the max CPU frequency. In 
practice, how close to max freq the CPU is running during those two seconds.

> 
> >   {
> >     struct timespec tp1, tp2;
> > -   uint64_t t1, t2;
> > -   uint64_t ns1, ns2, cycles;
> > -   double err;
> > +   uint64_t c1, c2, cycles;
> > +   uint64_t nsec;
> > +   double diff_max_hz, max_cycles;
> >
> >     if (clock_gettime(CLOCK_MONOTONIC, &tp2)) {
> >             LOG_ERR("clock_gettime failed.\n");
> >             return;
> >     }
> >
> > -   printf("\nTime accuracy test (%i sec)\n", TEST_SEC);
> > +   printf("\nCPU cycle count accuracy test (%i sec)\n", TEST_SEC);
> Maybe beter to put this line under clock_gettime? In order to not miss
> t1 == t2? Maybe here it's not so important but if it's going to be
> replaced on odp_time, better to do it here, anyway printf is changed.

Moved above. It would not break anything here, but results more accurate 2 sec 
test period. Actual nsecs are used anyway.

> 
> >
> >     do {
> >             if (clock_gettime(CLOCK_MONOTONIC, &tp1)) {
> > @@ -738,7 +719,7 @@ static void test_time(void)
> >
> >     } while (tp1.tv_sec == tp2.tv_sec);
> >
> > -   t1 = odp_time_cycles();
> > +   c1 = odp_cpu_cycles();
> >
> >     do {
> >             if (clock_gettime(CLOCK_MONOTONIC, &tp2)) {
> > @@ -748,24 +729,25 @@ static void test_time(void)
> >
> >     } while ((tp2.tv_sec - tp1.tv_sec) < TEST_SEC);
> >
> > -   t2 = odp_time_cycles();
> > +   c2 = odp_cpu_cycles();
> >
> > -   ns1 = (tp2.tv_sec - tp1.tv_sec)*1000000000;
> > +   nsec = (tp2.tv_sec - tp1.tv_sec) * 1000000000;
> Maybe better to use sec here? Then no need to divide on 10000000000
> double later.
> Divide only in printf.
> and maybe ODP_TIME_SEC?
> Aslo is it correct to use clock_gettime here?
> Better to replace on time API probably. (I am ready to do that after)

Timespec is X sec + Y nsec. That needs to be turned into one nsec figure.

Yes, later ODP time API can be used instead of Posix API.


> 
> >
> >     if (tp2.tv_nsec > tp1.tv_nsec)
> > -           ns1 += tp2.tv_nsec - tp1.tv_nsec;
> > +           nsec += tp2.tv_nsec - tp1.tv_nsec;
> >     else
> > -           ns1 -= tp1.tv_nsec - tp2.tv_nsec;
> > +           nsec -= tp1.tv_nsec - tp2.tv_nsec;
> >
> > -   cycles = odp_time_diff_cycles(t1, t2);
> > -   ns2    = odp_time_cycles_to_ns(cycles);
> > +   cycles     = odp_cpu_cycles_diff(c1, c2);
> > +   max_cycles = (nsec * odp_sys_cpu_hz()) / 1000000000.0;
> That case could be impacted by cpufreq change...I believe that it's
> static.

Current, there's no max cpu freq call. This needs to be changed for that later.


> 
> >
> > -   err = ((double)(ns2) - (double)ns1) / (double)ns1;
> > +   /* Compare measured CPU cycles to maximum theoretical CPU cycle
> count */
> > +   diff_max_hz = ((double)(cycles) - max_cycles) / max_cycles;
> It's expected here to be -X%, but due to "time measurement" can have
> resolution impact,
> can happen that it will be +X%...that is confusing. Is it OK?

The result is how many percent we are off from the theoretical max CPU freq 
returned by the API. Depending on frequency scaling, turbo mode, API returning 
wrong CPU max hz, etc... it can be above (+) or under (-) the max freq.


-Petri


> 
> >
> > -   printf("clock_gettime         %"PRIu64" ns\n",    ns1);
> > -   printf("odp_time_cycles       %"PRIu64" cycles\n", cycles);
> > -   printf("odp_time_cycles_to_ns %"PRIu64" ns\n",    ns2);
> > -   printf("odp get cycle error   %f%%\n", err*100.0);
> > +   printf("clock_gettime          %" PRIu64 " ns\n", nsec);
> > +   printf("odp_cpu_cycles         %" PRIu64 " CPU cycles\n",
> cycles);
> > +   printf("odp_sys_cpu_hz         %" PRIu64 " hz\n",
> odp_sys_cpu_hz());
> > +   printf("Diff from max CPU freq %f%%\n", diff_max_hz * 100.0);
> >
> >     printf("\n");
> >   }
> > @@ -879,7 +861,7 @@ int main(int argc, char *argv[])
> >     printf("---------------\n");
> >     printf("ODP API version: %s\n",        odp_version_api_str());
> >     printf("CPU model:       %s\n",        odp_sys_cpu_model_str());
> > -   printf("CPU freq (hz):   %"PRIu64"\n", odp_sys_cpu_hz());
> > +   printf("CPU freq (hz):   %" PRIu64 "\n", odp_sys_cpu_hz());
> >     printf("Cache line size: %i\n",
> odp_sys_cache_line_size());
> >     printf("Max CPU count:   %i\n",        odp_cpu_count());
> >
> > @@ -898,8 +880,8 @@ int main(int argc, char *argv[])
> >     printf("first CPU:          %i\n", odp_cpumask_first(&cpumask));
> >     printf("cpu mask:           %s\n", cpumaskstr);
> >
> > -   /* Test cycle count accuracy */
> > -   test_time();
> > +   /* Test cycle count frequency */
> > +   test_cpu_freq();
> >
> >     shm = odp_shm_reserve("test_globals",
> >                           sizeof(test_globals_t), ODP_CACHE_LINE_SIZE,
> 0);
> >
> 
> --
> Regards,
> Ivan Khoronzhuk
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> https://lists.linaro.org/mailman/listinfo/lng-odp
_______________________________________________
lng-odp mailing list
lng-odp@lists.linaro.org
https://lists.linaro.org/mailman/listinfo/lng-odp

Reply via email to