On Thu, Jul 26, 2007 at 09:53:53AM +0200, Ingo Molnar wrote:
> 
> * Ankita Garg <[EMAIL PROTECTED]> wrote:
> 
> > > I'd suggest to not put a probe into a preempt-off section - put it 
> > > to the beginning and to the end of schedule() to capture 
> > > context-switches. _stp_print_flush() is in the systemtap-generated 
> > > module, right? Maybe the problem is resolved by changing that 
> > > spinlock to use raw_spinlock_t / DEFINE_RAW_SPIN_LOCK.
> > 
> > Yes, _stp_print_flush is in the systemtap-generated kprobe module. 
> > Placing the probe at the beginning of schedule() also has the same 
> > effect. Will try by changing the spinlock to raw_spinlock_t...
> 
> could you send us that module source ST generates? Perhaps there are 
> preempt_disable() (or local_irq_disable()) calls in it too.

Attaching the generated module as it is huge...

Looks like SystemTap makes use of relayfs for printing the buffer
contents. The _stp_print_flush() (line 269 in the attached module) implemented 
in the systemtap library function as calling the following:

static int _stp_relay_write (const void *data, unsigned length)
{
        unsigned long flags;
        struct rchan_buf *buf;

        if (unlikely(length == 0))
                return 0;

        local_irq_save(flags);
        buf = _stp_chan->buf[smp_processor_id()];
        if (unlikely(buf->offset + length > _stp_chan->subbuf_size))
                length = relay_switch_subbuf(buf, length);
        memcpy(buf->data + buf->offset, data, length);
        buf->offset += length;
        local_irq_restore(flags);

        if (unlikely(length == 0))
                return -1;

        return length;
}

The above does a local_irq_save().

-- 
Regards,
Ankita Garg ([EMAIL PROTECTED])
Linux Technology Center
IBM India Systems & Technology Labs, 
Bangalore, India   
#define TEST_MODE 0

#ifndef MAXNESTING
#define MAXNESTING 10
#endif
#ifndef MAXSTRINGLEN
#define MAXSTRINGLEN 128
#endif
#ifndef MAXACTION
#define MAXACTION 1000
#endif
#ifndef MAXACTION_INTERRUPTIBLE
#define MAXACTION_INTERRUPTIBLE (MAXACTION * 10)
#endif
#ifndef MAXTRYLOCK
#define MAXTRYLOCK MAXACTION
#endif
#ifndef TRYLOCKDELAY
#define TRYLOCKDELAY 100
#endif
#ifndef MAXMAPENTRIES
#define MAXMAPENTRIES 2048
#endif
#ifndef MAXERRORS
#define MAXERRORS 0
#endif
#ifndef MAXSKIPPED
#define MAXSKIPPED 100
#endif
#ifndef MINSTACKSPACE
#define MINSTACKSPACE 1024
#endif
#ifndef STP_OVERLOAD_INTERVAL
#define STP_OVERLOAD_INTERVAL 1000000000LL
#endif
#ifndef STP_OVERLOAD_THRESHOLD
#define STP_OVERLOAD_THRESHOLD 500000000LL
#endif
#ifndef STP_NO_OVERLOAD
#define STP_OVERLOAD
#endif
#include "runtime.h"
#include "regs.c"
#include "stack.c"
#include "regs-ia64.c"
#include "stat.c"
#include <linux/string.h>
#include <linux/timer.h>
#include <linux/delay.h>
#include <linux/profile.h>
#include <linux/random.h>
#include <linux/utsname.h>
#include "loc2c-runtime.h" 
#ifndef read_trylock
#define read_trylock(x) ({ read_lock(x); 1; })
#endif
#if defined(CONFIG_MARKERS)
#include <linux/marker.h>
#endif
typedef char string_t[MAXSTRINGLEN];

#define STAP_SESSION_STARTING 0
#define STAP_SESSION_RUNNING 1
#define STAP_SESSION_ERROR 2
#define STAP_SESSION_STOPPING 3
#define STAP_SESSION_STOPPED 4
atomic_t session_state = ATOMIC_INIT (STAP_SESSION_STARTING);
atomic_t error_count = ATOMIC_INIT (0);
atomic_t skipped_count = ATOMIC_INIT (0);

struct context {
  atomic_t busy;
  const char *probe_point;
  int actionremaining;
  unsigned nesting;
  const char *last_error;
  const char *last_stmt;
  struct pt_regs *regs;
  struct kretprobe_instance *pi;
  va_list mark_va_list;
  #ifdef STP_TIMING
  Stat *statp;
  #endif
  #ifdef STP_OVERLOAD
  cycles_t cycles_base;
  cycles_t cycles_sum;
  #endif
  union {
    struct probe_1493_locals {
      string_t argstr;
      union {
        struct {
          string_t __tmp0;
          int64_t __tmp1;
          int64_t __tmp2;
          string_t __tmp3;
        };
      };
    } probe_1493;
    struct function_execname_locals {
      string_t __retvalue;
    } function_execname;
    struct function_pid_locals {
      int64_t __retvalue;
    } function_pid;
    struct function_tid_locals {
      int64_t __retvalue;
    } function_tid;
  } locals [MAXNESTING];
};

void *contexts = NULL; /* alloc_percpu */



static void function_execname (struct context * __restrict__ c);

static void function_pid (struct context * __restrict__ c);

static void function_tid (struct context * __restrict__ c);

void function_execname (struct context* __restrict__ c) {
  struct function_execname_locals *  __restrict__ l =
    & c->locals[c->nesting].function_execname;
  (void) l;
  #define CONTEXT c
  #define THIS l
  if (0) goto out;
  l->__retvalue[0] = '\0';
  {
     /* pure */
	strlcpy (THIS->__retvalue, current->comm, MAXSTRINGLEN);

  }
out:
  ;
  #undef CONTEXT
  #undef THIS
}


void function_pid (struct context* __restrict__ c) {
  struct function_pid_locals *  __restrict__ l =
    & c->locals[c->nesting].function_pid;
  (void) l;
  #define CONTEXT c
  #define THIS l
  if (0) goto out;
  l->__retvalue = 0;
  {
     /* pure */
	THIS->__retvalue = current->tgid;

  }
out:
  ;
  #undef CONTEXT
  #undef THIS
}


void function_tid (struct context* __restrict__ c) {
  struct function_tid_locals *  __restrict__ l =
    & c->locals[c->nesting].function_tid;
  (void) l;
  #define CONTEXT c
  #define THIS l
  if (0) goto out;
  l->__retvalue = 0;
  {
     /* pure */
	THIS->__retvalue = current->pid;

  }
out:
  ;
  #undef CONTEXT
  #undef THIS
}


#ifdef STP_TIMING
static __cacheline_aligned Stat time_probe_0;
#endif

static void probe_1493 (struct context * __restrict__ c) {
  struct probe_1493_locals * __restrict__ l =
    & c->locals[0].probe_1493;
  (void) l;
  #ifdef STP_TIMING
  c->statp = & time_probe_0;
  #endif
  {
    unsigned numtrylock = 0;
    (void) numtrylock;
    if (0) goto unlock_;
  }
  l->argstr[0] = '\0';
  {
    if (unlikely (c->last_error)) goto out;
    c->last_stmt = "identifier 'printf' at trace.stp:3:2";
    c->actionremaining -= 1;
    if (unlikely (c->actionremaining <= 0)) {
      c->last_error = "MAXACTION exceeded";
      goto out;
    }
    (void) 
    ({
      c->last_stmt = "identifier 'execname' at trace.stp:3:49";
      strlcpy (l->__tmp0, 
      ({
        
        if (unlikely (c->nesting+2 >= MAXNESTING)) {
          c->last_error = "MAXNESTING exceeded";
          c->last_stmt = "identifier 'execname' at trace.stp:3:49";
        } else if (likely (! c->last_error)) {
          c->nesting ++;
          function_execname (c);
          c->nesting --;
          if (c->last_error && ! c->last_error[0])
            c->last_error = 0;
        }
        c->locals[c->nesting+1].function_execname.__retvalue;
      }), MAXSTRINGLEN);
      c->last_stmt = "identifier 'pid' at trace.stp:3:61";
      l->__tmp1 = 
      ({
        
        if (unlikely (c->nesting+2 >= MAXNESTING)) {
          c->last_error = "MAXNESTING exceeded";
          c->last_stmt = "identifier 'pid' at trace.stp:3:61";
        } else if (likely (! c->last_error)) {
          c->nesting ++;
          function_pid (c);
          c->nesting --;
          if (c->last_error && ! c->last_error[0])
            c->last_error = 0;
        }
        c->locals[c->nesting+1].function_pid.__retvalue;
      });
      c->last_stmt = "identifier 'tid' at trace.stp:3:68";
      l->__tmp2 = 
      ({
        
        if (unlikely (c->nesting+2 >= MAXNESTING)) {
          c->last_error = "MAXNESTING exceeded";
          c->last_stmt = "identifier 'tid' at trace.stp:3:68";
        } else if (likely (! c->last_error)) {
          c->nesting ++;
          function_tid (c);
          c->nesting --;
          if (c->last_error && ! c->last_error[0])
            c->last_error = 0;
        }
        c->locals[c->nesting+1].function_tid.__retvalue;
      });
      c->last_stmt = "identifier 'argstr' at trace.stp:3:75";
      strlcpy (l->__tmp3, l->argstr, MAXSTRINGLEN);
      if (likely (! c->last_error)) {
        _stp_printf ("%s (pid: %lld, tid: %lld argstr: %s ) \n", l->__tmp0, l->__tmp1, l->__tmp2, l->__tmp3);
      }
      ((int64_t)0LL);
    });
    
    /* null */;
    
  }
out:
  _stp_print_flush();
unlock_: ;
}


/* ---- dwarf probes ---- */
#if ! defined(CONFIG_KPROBES)
#error "Need CONFIG_KPROBES!"
#endif

static int enter_kprobe_probe (struct kprobe *inst, struct pt_regs *regs);
static int enter_kretprobe_probe (struct kretprobe_instance *inst, struct pt_regs *regs);
struct stap_dwarf_probe {
  union { struct kprobe kp; struct kretprobe krp; } u;
  unsigned return_p:1;
  unsigned maxactive_p:1;
  unsigned registered_p:1;
  const char *module;
  const char *section;
  unsigned long address;
  unsigned long maxactive_val;
  const char *pp;
  void (*ph) (struct context*);
} stap_dwarf_probes[] = {
  { .address=0x2850deUL, .module="kernel", .section="_stext", .pp="kernel.function(\"[EMAIL PROTECTED]/sched.c:3792\")", .ph=&probe_1493 },
};

static int enter_kprobe_probe (struct kprobe *inst, struct pt_regs *regs) {
  struct stap_dwarf_probe *sdp = container_of(inst, struct stap_dwarf_probe, u.kp);
  struct context* __restrict__ c;
  unsigned long flags;
  #if defined(STP_TIMING) || defined(STP_OVERLOAD)
  cycles_t cycles_atstart = get_cycles ();
  #endif
  local_irq_save (flags);
  if (unlikely ((((unsigned long) (& c)) & (THREAD_SIZE-1))
    < (MINSTACKSPACE + sizeof (struct thread_info)))) {
    if (unlikely (atomic_inc_return (& skipped_count) > MAXSKIPPED)) {
      atomic_set (& session_state, STAP_SESSION_ERROR);
      _stp_exit ();
    }
    goto probe_epilogue;
  }
  if (atomic_read (&session_state) != STAP_SESSION_RUNNING)
    goto probe_epilogue;
  c = per_cpu_ptr (contexts, smp_processor_id());
  if (unlikely (atomic_inc_return (&c->busy) != 1)) {
    if (atomic_inc_return (& skipped_count) > MAXSKIPPED) {
      atomic_set (& session_state, STAP_SESSION_ERROR);
      _stp_exit ();
    }
    atomic_dec (& c->busy);
    goto probe_epilogue;
  }
  
  c->last_error = 0;
  c->nesting = 0;
  c->regs = 0;
  c->pi = 0;
  c->probe_point = 0;
  c->actionremaining = MAXACTION;
  #ifdef STP_TIMING
  c->statp = 0;
  #endif
  c->probe_point = sdp->pp;
  c->regs = regs;
  (*sdp->ph) (c);
  #if defined(STP_TIMING) || defined(STP_OVERLOAD)
  {
    cycles_t cycles_atend = get_cycles ();
    int32_t cycles_elapsed = ((int32_t)cycles_atend > (int32_t)cycles_atstart)
      ? ((int32_t)cycles_atend - (int32_t)cycles_atstart)
      : (~(int32_t)0) - (int32_t)cycles_atstart + (int32_t)cycles_atend + 1;
    #ifdef STP_TIMING
    if (likely (c->statp)) _stp_stat_add(*c->statp, cycles_elapsed);
    #endif
    #ifdef STP_OVERLOAD
    {
      cycles_t interval = (cycles_atend > c->cycles_base)
        ? (cycles_atend - c->cycles_base)
        : (STP_OVERLOAD_INTERVAL + 1);
      c->cycles_sum += cycles_elapsed;
      if (interval > STP_OVERLOAD_INTERVAL) {
        if (c->cycles_sum > STP_OVERLOAD_THRESHOLD) {
          _stp_error ("probe overhead exceeded threshold");
          atomic_set (&session_state, STAP_SESSION_ERROR);
          atomic_inc (&error_count);
        }
        c->cycles_base = cycles_atend;
        c->cycles_sum = 0;
      }
    }
    #endif
  }
  #endif
  if (unlikely (c->last_error && c->last_error[0])) {
    if (c->last_stmt != NULL)
      _stp_softerror ("%s near %s", c->last_error, c->last_stmt);
    else
      _stp_softerror ("%s", c->last_error);
    atomic_inc (& error_count);
    if (atomic_read (& error_count) > MAXERRORS) {
      atomic_set (& session_state, STAP_SESSION_ERROR);
      _stp_exit ();
    }
  }
  atomic_dec (&c->busy);
probe_epilogue:
  local_irq_restore (flags);
  return 0;
}

static int enter_kretprobe_probe (struct kretprobe_instance *inst, struct pt_regs *regs) {
  struct kretprobe *krp = inst->rp;
  struct stap_dwarf_probe *sdp = container_of(krp, struct stap_dwarf_probe, u.krp);
  struct context* __restrict__ c;
  unsigned long flags;
  #if defined(STP_TIMING) || defined(STP_OVERLOAD)
  cycles_t cycles_atstart = get_cycles ();
  #endif
  local_irq_save (flags);
  if (unlikely ((((unsigned long) (& c)) & (THREAD_SIZE-1))
    < (MINSTACKSPACE + sizeof (struct thread_info)))) {
    if (unlikely (atomic_inc_return (& skipped_count) > MAXSKIPPED)) {
      atomic_set (& session_state, STAP_SESSION_ERROR);
      _stp_exit ();
    }
    goto probe_epilogue;
  }
  if (atomic_read (&session_state) != STAP_SESSION_RUNNING)
    goto probe_epilogue;
  c = per_cpu_ptr (contexts, smp_processor_id());
  if (unlikely (atomic_inc_return (&c->busy) != 1)) {
    if (atomic_inc_return (& skipped_count) > MAXSKIPPED) {
      atomic_set (& session_state, STAP_SESSION_ERROR);
      _stp_exit ();
    }
    atomic_dec (& c->busy);
    goto probe_epilogue;
  }
  
  c->last_error = 0;
  c->nesting = 0;
  c->regs = 0;
  c->pi = 0;
  c->probe_point = 0;
  c->actionremaining = MAXACTION;
  #ifdef STP_TIMING
  c->statp = 0;
  #endif
  c->probe_point = sdp->pp;
  c->regs = regs;
  c->pi = inst;
  (*sdp->ph) (c);
  #if defined(STP_TIMING) || defined(STP_OVERLOAD)
  {
    cycles_t cycles_atend = get_cycles ();
    int32_t cycles_elapsed = ((int32_t)cycles_atend > (int32_t)cycles_atstart)
      ? ((int32_t)cycles_atend - (int32_t)cycles_atstart)
      : (~(int32_t)0) - (int32_t)cycles_atstart + (int32_t)cycles_atend + 1;
    #ifdef STP_TIMING
    if (likely (c->statp)) _stp_stat_add(*c->statp, cycles_elapsed);
    #endif
    #ifdef STP_OVERLOAD
    {
      cycles_t interval = (cycles_atend > c->cycles_base)
        ? (cycles_atend - c->cycles_base)
        : (STP_OVERLOAD_INTERVAL + 1);
      c->cycles_sum += cycles_elapsed;
      if (interval > STP_OVERLOAD_INTERVAL) {
        if (c->cycles_sum > STP_OVERLOAD_THRESHOLD) {
          _stp_error ("probe overhead exceeded threshold");
          atomic_set (&session_state, STAP_SESSION_ERROR);
          atomic_inc (&error_count);
        }
        c->cycles_base = cycles_atend;
        c->cycles_sum = 0;
      }
    }
    #endif
  }
  #endif
  if (unlikely (c->last_error && c->last_error[0])) {
    if (c->last_stmt != NULL)
      _stp_softerror ("%s near %s", c->last_error, c->last_stmt);
    else
      _stp_softerror ("%s", c->last_error);
    atomic_inc (& error_count);
    if (atomic_read (& error_count) > MAXERRORS) {
      atomic_set (& session_state, STAP_SESSION_ERROR);
      _stp_exit ();
    }
  }
  atomic_dec (&c->busy);
probe_epilogue:
  local_irq_restore (flags);
  return 0;
}

int systemtap_module_init (void) {
  int rc = 0;
  int i=0, j=0;
  const char *probe_point = "";
  down_read (& uts_sem);
  {
    #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19)
    const char* machine = utsname()->machine;
    const char* release = utsname()->release;
    #else
    const char* machine = system_utsname.machine;
    const char* release = system_utsname.release;
    #endif
    if (strcmp (machine, "x86_64")) {
      _stp_error ("module machine mismatch (%s vs %s)", machine, "x86_64");
      rc = -EINVAL;
    }
    if (strcmp (release, "2.6.22.1-rt3")) {
      _stp_error ("module release mismatch (%s vs %s)", release, "2.6.22.1-rt3");
      rc = -EINVAL;
    }
  }
  up_read (& uts_sem);
  if (rc) goto out;
  (void) probe_point;
  (void) i;
  (void) j;
  atomic_set (&session_state, STAP_SESSION_STARTING);
  if (sizeof (struct context) <= 131072)
    contexts = alloc_percpu (struct context);
  if (contexts == NULL) {
    _stp_error ("percpu context (size %lu) allocation failed", sizeof (struct context));
    rc = -ENOMEM;
    goto out;
  }
  #ifdef STP_TIMING
  time_probe_0 = _stp_stat_init (HIST_NONE);
  #endif
  for (i=0; i<1; i++) {
    struct stap_dwarf_probe *sdp = & stap_dwarf_probes[i];
    unsigned long relocated_addr = _stp_module_relocate (sdp->module, sdp->section, sdp->address);
    if (relocated_addr == 0) continue;
    probe_point = sdp->pp;
    if (sdp->return_p) {
      sdp->u.krp.kp.addr = (void *) relocated_addr;
      if (sdp->maxactive_p) {
        sdp->u.krp.maxactive = sdp->maxactive_val;
      } else {
        sdp->u.krp.maxactive = max(10, 4*NR_CPUS);
      }
      sdp->u.krp.handler = &enter_kretprobe_probe;
      rc = register_kretprobe (& sdp->u.krp);
    } else {
      sdp->u.kp.addr = (void *) relocated_addr;
      sdp->u.kp.pre_handler = &enter_kprobe_probe;
      rc = register_kprobe (& sdp->u.kp);
    }
    if (rc) {
      for (j=i-1; j>=0; j--) {
        struct stap_dwarf_probe *sdp2 = & stap_dwarf_probes[j];
        if (sdp2->return_p) unregister_kretprobe (&sdp2->u.krp);
        else unregister_kprobe (&sdp2->u.kp);
      }
      break;
    }
    else sdp->registered_p = 1;
  }
  if (rc) {
    _stp_error ("probe %s registration error (rc %d)", probe_point, rc);
    atomic_set (&session_state, STAP_SESSION_ERROR);
    goto out;
  }
  printk (KERN_DEBUG "%s: systemtap: 0.5.15/Red Hat elfutils 0.125, base: %p, memory: %lu+%lu+%lu+%lu+%lu data+text+ctx+io+glob, probes: 1\n", THIS_MODULE->name, THIS_MODULE->module_core, (unsigned long) (THIS_MODULE->core_size - THIS_MODULE->core_text_size), (unsigned long) THIS_MODULE->core_text_size, (unsigned long) (num_online_cpus() * sizeof(struct context)), (unsigned long) _stp_allocated_net_memory, (unsigned long) _stp_allocated_memory);
  atomic_set (&session_state, STAP_SESSION_RUNNING);
  return 0;
out:
  return rc;
}


void systemtap_module_exit (void) {
  int holdon;
  int i=0, j=0;
  (void) i;
  (void) j;
  if (atomic_read (&session_state) == STAP_SESSION_STARTING)
    return;
  if (atomic_read (&session_state) == STAP_SESSION_RUNNING)
    atomic_set (&session_state, STAP_SESSION_STOPPING);
  do {
    int i;
    holdon = 0;
    for (i=0; i < NR_CPUS; i++)
      if (cpu_possible (i) && atomic_read (& ((struct context *)per_cpu_ptr(contexts, i))->busy)) holdon = 1;
      cpu_relax ();
    } while (holdon);
  
  for (i=0; i<1; i++) {
    struct stap_dwarf_probe *sdp = & stap_dwarf_probes[i];
    if (! sdp->registered_p) continue;
    if (sdp->return_p) {
      unregister_kretprobe (&sdp->u.krp);
      atomic_add (sdp->u.krp.nmissed, & skipped_count);
      atomic_add (sdp->u.krp.kp.nmissed, & skipped_count);
    } else {
      unregister_kprobe (&sdp->u.kp);
      atomic_add (sdp->u.kp.nmissed, & skipped_count);
    }
    sdp->registered_p = 0;
  }
  free_percpu (contexts);
  #ifdef STP_TIMING
  {
    if (likely (time_probe_0)) {
      const char *probe_point = "kernel.function(\"schedule\")";
      const char *decl_location = "trace.stp:1:1";
      struct stat_data *stats = _stp_stat_get (time_probe_0, 0);
      const char *error;
      if (stats->count) {
        int64_t avg = _stp_div64 (&error, stats->sum, stats->count);
        _stp_printf ("probe %s (%s), hits: %lld, cycles: %lldmin/%lldavg/%lldmax\n",
        probe_point, decl_location, (long long) stats->count, (long long) stats->min, (long long) avg, (long long) stats->max);
      }
    }
    _stp_print_flush();
  }
  #endif
  if (atomic_read (& skipped_count) || atomic_read (& error_count)) {
    _stp_printf ("WARNING: Number of errors: %d, skipped probes: %d\n", (int) atomic_read (& error_count), (int) atomic_read (& skipped_count));
    _stp_print_flush();
  }
}


int probe_start () {
  return systemtap_module_init () ? -1 : 0;
}

void probe_exit () {
  systemtap_module_exit ();
}
MODULE_DESCRIPTION("systemtap probe");
MODULE_LICENSE("GPL");
/* filled in by runtime */
struct stap_symbol *stap_symbols;
unsigned stap_num_symbols;

Reply via email to