[Patch 3/3] OProfile SPU event profiling support for IBM Cell processor

2008-11-24 Thread Carl Love

This is the second of the two kernel  patches for adding SPU profiling 
for the IBM Cell processor.  This patch contains the spu event profiling
setup, start and stop routines.

Signed-off-by: Carl Love <[EMAIL PROTECTED]>

Index: Cell_kernel_11_10_2008/arch/powerpc/oprofile/op_model_cell.c
===
--- Cell_kernel_11_10_2008.orig/arch/powerpc/oprofile/op_model_cell.c
+++ Cell_kernel_11_10_2008/arch/powerpc/oprofile/op_model_cell.c
@@ -41,10 +41,19 @@
 #include "cell/pr_util.h"
 
 static void cell_global_stop_spu_cycles(void);
+static void cell_global_stop_spu_events(void);
 
 #define PPU_PROFILING 0
 #define SPU_PROFILING_CYCLES 1
 #define SPU_PROFILING_EVENTS 2
+#define PHYS_SPU_INIT 0 /* Which spu in the node to start
+   * profiling on.
+   */
+#define SPU_EVENT_NUM_START  4100
+#define SPU_EVENT_NUM_STOP   4399
+#define SPU_PROFILE_EVENT_ADDR  4363  /* spu, address trace, decimal */
+#define SPU_PROFILE_EVENT_ADDR_MASK_A   0x146 /* sub unit set to zero */
+#define SPU_PROFILE_EVENT_ADDR_MASK_B   0x186 /* sub unit set to zero */
 
 #define NUM_SPUS_PER_NODE 8
 #define SPU_CYCLES_EVENT_NUM 2 /*  event number for SPU_CYCLES */
@@ -58,11 +67,17 @@ static void cell_global_stop_spu_cycles(
 #define NUM_THREADS 2 /* number of physical threads in
   * physical processor
   */
-#define NUM_DEBUG_BUS_WORDS 4
+#define NUM_TRACE_BUS_WORDS 4
 #define NUM_INPUT_BUS_WORDS 2
 
 #define MAX_SPU_COUNT 0xFFFFFF /* maximum 24 bit LFSR value */
 
+/* Minimum HW interval timer setting to send value to trace buffer is 10 cycle.
+ * To configure counter to send value every N cycles set counter to
+ * 2^32 - 1 - N.
+ */
+#define NUM_INTERVAL_CYC  0xFFFFFFFF - 10
+
 /*
  * spu_cycle_reset is the number of cycles between samples.
  * This variable is used for SPU profiling and should ONLY be set
@@ -70,7 +85,7 @@ static void cell_global_stop_spu_cycles(
  */
 static unsigned int spu_cycle_reset;
 static unsigned int profiling_mode;
-
+static int spu_evnt_phys_spu_indx;
 
 struct pmc_cntrl_data {
unsigned long vcntr;
@@ -111,6 +126,8 @@ struct pm_cntrl {
u16 trace_mode;
u16 freeze;
u16 count_mode;
+   u16 spu_addr_trace;
+   u8  trace_buf_ovflw;
 };
 
 static struct {
@@ -128,7 +145,7 @@ static struct {
 #define GET_INPUT_CONTROL(x) ((x & 0x0004) >> 2)
 
 static DEFINE_PER_CPU(unsigned long[NR_PHYS_CTRS], pmc_values);
-
+static unsigned long spu_pm_cnt[MAX_NUMNODES * NUM_SPUS_PER_NODE];
 static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];
 
 /*
@@ -158,6 +175,7 @@ static u32 hdw_thread;
 
 static u32 virt_cntr_inter_mask;
 static struct timer_list timer_virt_cntr;
+static struct timer_list timer_spu_event_swap;
 
 /*
  * pm_signal needs to be global since it is initialized in
@@ -171,7 +189,7 @@ static int spu_rtas_token;   /* token fo
 static u32 reset_value[NR_PHYS_CTRS];
 static int num_counters;
 static int oprofile_running;
-static DEFINE_SPINLOCK(virt_cntr_lock);
+static DEFINE_SPINLOCK(cntr_lock);
 
 static u32 ctr_enabled;
 
@@ -242,7 +260,6 @@ static int pm_rtas_activate_signals(u32 
i = 0;
for (j = 0; j < count; j++) {
if (pm_signal[j].signal_group != PPU_CYCLES_GRP_NUM) {
-
/* fw expects physical cpu # */
pm_signal_local[i].cpu = node;
pm_signal_local[i].signal_group
@@ -265,7 +282,6 @@ static int pm_rtas_activate_signals(u32 
return -EIO;
}
}
-
return 0;
 }
 
@@ -339,7 +355,7 @@ static void set_pm_event(u32 ctr, int ev
p->bit = signal_bit;
}
 
-   for (i = 0; i < NUM_DEBUG_BUS_WORDS; i++) {
+   for (i = 0; i < NUM_TRACE_BUS_WORDS; i++) {
if (bus_word & (1 << i)) {
pm_regs.debug_bus_control |=
(bus_type << (30 - (2 * i)));
@@ -373,12 +389,16 @@ static void write_pm_cntrl(int cpu)
if (pm_regs.pm_cntrl.stop_at_max == 1)
val |= CBE_PM_STOP_AT_MAX;
 
-   if (pm_regs.pm_cntrl.trace_mode == 1)
+   if (pm_regs.pm_cntrl.trace_mode != 0)
val |= CBE_PM_TRACE_MODE_SET(pm_regs.pm_cntrl.trace_mode);
 
+   if (pm_regs.pm_cntrl.trace_buf_ovflw == 1)
+   val |= CBE_PM_TRACE_BUF_OVFLW(pm_regs.pm_cntrl.trace_buf_ovflw);
if (pm_regs.pm_cntrl.freeze == 1)
val |= CBE_PM_FREEZE_ALL_CTRS;
 
+   val |= CBE_PM_SPU_ADDR_TRACE_SET(pm_regs.pm_cntrl.spu_addr_trace);
+
/*
 * Routine set_count_mode must be called previously to set
 * the count mode based on the user selection of user and kernel.
@@ -447,7 +467,7 @@ static void cell_virtual_cntr(unsigned l
 * not both pl

Re: [Patch 3/3] OProfile SPU event profiling support for IBM Cell processor

2008-11-25 Thread Arnd Bergmann
On Tuesday 25 November 2008, Carl Love wrote:
> 
> This is the second of the two kernel  patches for adding SPU profiling 
> for the IBM Cell processor.  This patch contains the spu event profiling
> setup, start and stop routines.
> 
> Signed-off-by: Carl Love <[EMAIL PROTECTED]>

Maybe a little more detailed description would be good. Explain
what this patch adds that isn't already there and why people would
want to have that in the kernel.
 
>  
>  static void cell_global_stop_spu_cycles(void);
> +static void cell_global_stop_spu_events(void);

Can you reorder the functions so that you don't need any forward
declarations? In general, it gets easier to understand if the
definition order matches the call graph.

>  static unsigned int spu_cycle_reset;
>  static unsigned int profiling_mode;
> -
> +static int spu_evnt_phys_spu_indx;
>  
>  struct pmc_cntrl_data {
>   unsigned long vcntr;
> @@ -111,6 +126,8 @@ struct pm_cntrl {
>   u16 trace_mode;
>   u16 freeze;
>   u16 count_mode;
> + u16 spu_addr_trace;
> + u8  trace_buf_ovflw;
>  };
>  
>  static struct {
> @@ -128,7 +145,7 @@ static struct {
>  #define GET_INPUT_CONTROL(x) ((x & 0x0004) >> 2)
>  
>  static DEFINE_PER_CPU(unsigned long[NR_PHYS_CTRS], pmc_values);
> -
> +static unsigned long spu_pm_cnt[MAX_NUMNODES * NUM_SPUS_PER_NODE];
>  static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];

Can't you add this data to an existing data structure, e.g. struct spu,
rather than adding a new global?
  
>  static u32 reset_value[NR_PHYS_CTRS];
>  static int num_counters;
>  static int oprofile_running;
> -static DEFINE_SPINLOCK(virt_cntr_lock);
> +static DEFINE_SPINLOCK(cntr_lock);
>  
>  static u32 ctr_enabled;
>  
> @@ -242,7 +260,6 @@ static int pm_rtas_activate_signals(u32 
>   i = 0;
>   for (j = 0; j < count; j++) {
>   if (pm_signal[j].signal_group != PPU_CYCLES_GRP_NUM) {
> -
>   /* fw expects physical cpu # */
>   pm_signal_local[i].cpu = node;
>   pm_signal_local[i].signal_group
> @@ -265,7 +282,6 @@ static int pm_rtas_activate_signals(u32 
>   return -EIO;
>   }
>   }
> -
>   return 0;
>  }
>  

These look like a cleanups that should go into the first patch, right?

> +static void spu_evnt_swap(unsigned long data)

This function could use a comment about why we need to swap and what
the overall effect of swapping is.

>  int spu_prof_running;
>  static unsigned int profiling_interval;
> +DEFINE_SPINLOCK(oprof_spu_smpl_arry_lck);
> +unsigned long oprof_spu_smpl_arry_lck_flags;
>  
>  #define NUM_SPU_BITS_TRBUF 16
>  #define SPUS_PER_TB_ENTRY   4
> +#define SPUS_PER_NODE   8
>  
>  #define SPU_PC_MASK   0xFFFF
>  
> -static DEFINE_SPINLOCK(sample_array_lock);
> -unsigned long sample_array_lock_flags;
> -

This also looks like a rename that should go into the first patch.

> +/*
> + * Entry point for SPU event profiling.
> + * NOTE:  SPU profiling is done system-wide, not per-CPU.
> + *
> + * cycles_reset is the count value specified by the user when
> + * setting up OProfile to count SPU_CYCLES.
> + */
> +void start_spu_profiling_events(void)
> +{
> + spu_prof_running = 1;
> + schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
> +
> + return;
> +}
> +
> +void stop_spu_profiling_cycles(void)
>  {
>   spu_prof_running = 0;
>   hrtimer_cancel(&timer);
>   kfree(samples);
> - pr_debug("SPU_PROF: stop_spu_profiling issued\n");
> + pr_debug("SPU_PROF: stop_spu_profiling_cycles issued\n");
> +}
> +
> +void stop_spu_profiling_events(void)
> +{
> + spu_prof_running = 0;
>  }

Is this atomic? What if two CPUs access the spu_prof_running variable at
the same time?

Arnd <><
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [Patch 3/3] OProfile SPU event profiling support for IBM Cell processor

2008-11-25 Thread Carl Love

On Tue, 2008-11-25 at 16:58 +0100, Arnd Bergmann wrote:



> >  struct pmc_cntrl_data {
> > unsigned long vcntr;
> > @@ -111,6 +126,8 @@ struct pm_cntrl {
> > u16 trace_mode;
> > u16 freeze;
> > u16 count_mode;
> > +   u16 spu_addr_trace;
> > +   u8  trace_buf_ovflw;
> >  };
> >  
> >  static struct {
> > @@ -128,7 +145,7 @@ static struct {
> >  #define GET_INPUT_CONTROL(x) ((x & 0x0004) >> 2)
> >  
> >  static DEFINE_PER_CPU(unsigned long[NR_PHYS_CTRS], pmc_values);
> > -
> > +static unsigned long spu_pm_cnt[MAX_NUMNODES * NUM_SPUS_PER_NODE];
> >  static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];
> 
> Can't you add this data to an existing data structure, e.g. struct spu,
> rather than adding a new global?

The data structure above holds flags for how the performance counters
are to be setup on each node.  This is not per SPU data.  It is per
system data.  The flags are setup once and then used when configuring
the various performance counter control registers on each node via one
of the PPU threads on each node.



> > +
> > +void stop_spu_profiling_events(void)
> > +{
> > +   spu_prof_running = 0;
> >  }
> 
> Is this atomic? What if two CPUs access the spu_prof_running variable at
> the same time?
> 
>   Arnd <><

When the user issues the command opcontrol --start then a series of
functions gets called by one CPU in the system that eventually gets down
to making the assignment spu_prof_running = 1.  Similarly, the variable
is set to 0 when the user executes the command opcontrol --stop.  In
each case, only one CPU in the system is executing the code to change
the value of the flag.  Once OProfile is started, you can't change the
event being profiled until after oprofile is stopped.  Hence, you can't
get the situation where spu_prof_running is set by the start and not
reset by the next stop command.  Now, as for multiple users issuing
opcontrol --start and/or opcontrol --stop at the same time, there is no
protection at the user level to prevent this.  If this occurs, then what
happens will depend on the order the OProfile file system serializes the
writes file for start/stop.  It is up to the users to not step on each
other.  If they do, the chances are they both will get bad data.

The other things you noted for this patch were easily fixed up.

___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev