Author: Richard Plangger <planri...@gmail.com>
Branch: 
Changeset: r91967:e19ef006ba32
Date: 2017-07-23 16:46 -0400
http://bitbucket.org/pypy/pypy/changeset/e19ef006ba32/

Log:    reapply fix

diff too long, truncating to 2000 out of 2191 lines

diff --git a/pypy/module/_vmprof/test/test__vmprof.py 
b/pypy/module/_vmprof/test/test__vmprof.py
--- a/pypy/module/_vmprof/test/test__vmprof.py
+++ b/pypy/module/_vmprof/test/test__vmprof.py
@@ -115,3 +115,31 @@
                 assert fd1.read() == tmpfile.read()
         _vmprof.disable()
         assert _vmprof.get_profile_path() is None
+
+    def test_stop_sampling(self):
+        import os
+        import _vmprof
+        tmpfile = open(self.tmpfilename, 'wb')
+        native = 1
+        def f():
+            import sys
+            import math
+            j = sys.maxsize
+            for i in range(500):
+                j = math.sqrt(j)
+        _vmprof.enable(tmpfile.fileno(), 0.01, 0, native, 0, 0)
+        # get_vmprof_stack() always returns 0 here!
+        # see vmprof_common.c and assume RPYTHON_LL2CTYPES is defined!
+        f()
+        fileno = _vmprof.stop_sampling()
+        pos = os.lseek(fileno, 0, os.SEEK_CUR)
+        f()
+        pos2 = os.lseek(fileno, 0, os.SEEK_CUR)
+        assert pos == pos2
+        _vmprof.start_sampling()
+        f()
+        fileno = _vmprof.stop_sampling()
+        pos3 = os.lseek(fileno, 0, os.SEEK_CUR)
+        assert pos3 > pos
+        _vmprof.disable()
+
diff --git a/rpython/rlib/rvmprof/cintf.py b/rpython/rlib/rvmprof/cintf.py
--- a/rpython/rlib/rvmprof/cintf.py
+++ b/rpython/rlib/rvmprof/cintf.py
@@ -20,7 +20,8 @@
 
 compile_extra = ['-DRPYTHON_VMPROF', '-O3']
 separate_module_files = [
-    SHARED.join('symboltable.c')
+    SHARED.join('symboltable.c'),
+    SHARED.join('vmprof_unix.c')
 ]
 if sys.platform.startswith('linux'):
     separate_module_files += [
@@ -40,7 +41,7 @@
     compile_extra += ['-DVMPROF_LINUX']
 elif sys.platform == 'win32':
     compile_extra = ['-DRPYTHON_VMPROF', '-DVMPROF_WINDOWS']
-    separate_module_files = [SHARED.join('vmprof_main_win32.c')]
+    separate_module_files = [SHARED.join('vmprof_win.c')]
     _libs = []
 else:
     # Guessing a BSD-like Unix platform
@@ -58,7 +59,9 @@
         SHARED.join('compat.c'),
         SHARED.join('machine.c'),
         SHARED.join('vmp_stack.c'),
-        SHARED.join('vmprof_main.c'),
+        SHARED.join('vmprof_mt.c'),
+        SHARED.join('vmprof_memory.c'),
+        SHARED.join('vmprof_common.c'),
         # symbol table already in separate_module_files
     ] + separate_module_files,
     post_include_bits=[],
diff --git a/rpython/rlib/rvmprof/src/rvmprof.c 
b/rpython/rlib/rvmprof/src/rvmprof.c
--- a/rpython/rlib/rvmprof/src/rvmprof.c
+++ b/rpython/rlib/rvmprof/src/rvmprof.c
@@ -15,9 +15,9 @@
 
 #include "shared/vmprof_get_custom_offset.h"
 #ifdef VMPROF_UNIX
-#include "shared/vmprof_main.h"
+#include "shared/vmprof_unix.h"
 #else
-#include "shared/vmprof_main_win32.h"
+#include "shared/vmprof_win.h"
 #endif
 
 
diff --git a/rpython/rlib/rvmprof/src/shared/_vmprof.c 
b/rpython/rlib/rvmprof/src/shared/_vmprof.c
--- a/rpython/rlib/rvmprof/src/shared/_vmprof.c
+++ b/rpython/rlib/rvmprof/src/shared/_vmprof.c
@@ -9,8 +9,8 @@
 #include <signal.h>
 
 #include "_vmprof.h"
+#include "vmprof_common.h"
 
-static volatile int is_enabled = 0;
 static destructor Original_code_dealloc = 0;
 static PyObject* (*_default_eval_loop)(PyFrameObject *, int) = 0;
 
@@ -18,9 +18,9 @@
 #include "trampoline.h"
 #include "machine.h"
 #include "symboltable.h"
-#include "vmprof_main.h"
+#include "vmprof_unix.h"
 #else
-#include "vmprof_main_win32.h"
+#include "vmprof_win.h"
 #endif
 #include "vmp_stack.h"
 
@@ -156,7 +156,7 @@
 
 static void cpyprof_code_dealloc(PyObject *co)
 {
-    if (is_enabled) {
+    if (vmprof_is_enabled()) {
         emit_code_object((PyCodeObject *)co);
         /* xxx error return values are ignored */
     }
@@ -187,7 +187,7 @@
         return NULL;
     }
 
-    if (is_enabled) {
+    if (vmprof_is_enabled()) {
         PyErr_SetString(PyExc_ValueError, "vmprof is already enabled");
         return NULL;
     }
@@ -217,13 +217,13 @@
         return NULL;
     }
 
-    is_enabled = 1;
+    vmprof_set_enabled(1);
 
     Py_RETURN_NONE;
 }
 
 static PyObject * vmp_is_enabled(PyObject *module, PyObject *noargs) {
-    if (is_enabled) {
+    if (vmprof_is_enabled()) {
         Py_RETURN_TRUE;
     }
     Py_RETURN_FALSE;
@@ -237,7 +237,7 @@
         return NULL;
     }
 
-    is_enabled = 0;
+    vmprof_set_enabled(0);
 
     if (PyErr_Occurred())
         return NULL;
@@ -362,7 +362,7 @@
 #ifdef VMPROF_UNIX
 static PyObject * vmp_get_profile_path(PyObject *module, PyObject *noargs) {
     PyObject * o;
-    if (is_enabled) {
+    if (vmprof_is_enabled()) {
         char buffer[4096];
         buffer[0] = 0;
         ssize_t buffer_len = vmp_fd_to_path(vmp_profile_fileno(), buffer, 
4096);
@@ -382,21 +382,19 @@
 insert_real_time_thread(PyObject *module, PyObject * noargs) {
     ssize_t thread_count;
 
-    if (!is_enabled) {
+    if (!vmprof_is_enabled()) {
         PyErr_SetString(PyExc_ValueError, "vmprof is not enabled");
         return NULL;
     }
 
-    if (signal_type != SIGALRM) {
+    if (vmprof_get_signal_type() != SIGALRM) {
         PyErr_SetString(PyExc_ValueError, "vmprof is not in real time mode");
         return NULL;
     }
 
-    while (__sync_lock_test_and_set(&spinlock, 1)) {
-    }
-
+    vmprof_aquire_lock();
     thread_count = insert_thread(pthread_self(), -1);
-    __sync_lock_release(&spinlock);
+    vmprof_release_lock();
 
     return PyLong_FromSsize_t(thread_count);
 }
@@ -405,21 +403,19 @@
 remove_real_time_thread(PyObject *module, PyObject * noargs) {
     ssize_t thread_count;
 
-    if (!is_enabled) {
+    if (!vmprof_is_enabled()) {
         PyErr_SetString(PyExc_ValueError, "vmprof is not enabled");
         return NULL;
     }
 
-    if (signal_type != SIGALRM) {
+    if (vmprof_get_signal_type() != SIGALRM) {
         PyErr_SetString(PyExc_ValueError, "vmprof is not in real time mode");
         return NULL;
     }
 
-    while (__sync_lock_test_and_set(&spinlock, 1)) {
-    }
-
+    vmprof_aquire_lock();
     thread_count = remove_thread(pthread_self(), -1);
-    __sync_lock_release(&spinlock);
+    vmprof_release_lock();
 
     return PyLong_FromSsize_t(thread_count);
 }
diff --git a/rpython/rlib/rvmprof/src/shared/machine.c 
b/rpython/rlib/rvmprof/src/shared/machine.c
--- a/rpython/rlib/rvmprof/src/shared/machine.c
+++ b/rpython/rlib/rvmprof/src/shared/machine.c
@@ -27,6 +27,8 @@
     #endif
 #elif __linux__
     return "linux";
+#elif __FreeBSD__
+    return "freebsd";
 #else
     #error "Unknown compiler"
 #endif
@@ -38,7 +40,7 @@
     char proffs[24];
     (void)snprintf(proffs, 24, "/proc/self/fd/%d", fd);
     return readlink(proffs, buffer, buffer_len);
-#elif defined(VMPROF_UNIX)
+#elif defined(VMPROF_UNIX) && !defined(__FreeBSD__)
     fcntl(fd, F_GETPATH, buffer);
     return strlen(buffer);
 #endif
diff --git a/rpython/rlib/rvmprof/src/shared/vmp_stack.c 
b/rpython/rlib/rvmprof/src/shared/vmp_stack.c
--- a/rpython/rlib/rvmprof/src/shared/vmp_stack.c
+++ b/rpython/rlib/rvmprof/src/shared/vmp_stack.c
@@ -523,7 +523,7 @@
 
 int vmp_native_enable(void) {
 #ifdef VMPROF_LINUX
-    if (!unw_get_reg) {
+    if (libhandle == NULL) {
         if ((libhandle = dlopen(LIBUNWIND, RTLD_LAZY | RTLD_LOCAL)) == NULL) {
             goto bail_out;
         }
@@ -570,6 +570,7 @@
             vmprof_error = dlerror();
             fprintf(stderr, "could not close libunwind at runtime. error: 
%s\n", vmprof_error);
         }
+        libhandle = NULL;
     }
 
     vmp_native_traces_enabled = 0;
diff --git a/rpython/rlib/rvmprof/src/shared/vmprof.h 
b/rpython/rlib/rvmprof/src/shared/vmprof.h
--- a/rpython/rlib/rvmprof/src/shared/vmprof.h
+++ b/rpython/rlib/rvmprof/src/shared/vmprof.h
@@ -1,5 +1,11 @@
 #pragma once
 
+#define _GNU_SOURCE 1
+
+#ifndef RPYTHON_VMPROF
+#include <Python.h>
+#endif
+
 #ifdef VMPROF_UNIX
 #include <unistd.h>
 #endif
@@ -79,3 +85,7 @@
 
 #endif
 
+void set_current_codes(void * to);
+int opened_profile(const char *interp_name, int memory, int proflines, int 
native, int real_time);
+void flush_codes(void);
+
diff --git a/rpython/rlib/rvmprof/src/shared/vmprof_common.c 
b/rpython/rlib/rvmprof/src/shared/vmprof_common.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/shared/vmprof_common.c
@@ -0,0 +1,303 @@
+#include "vmprof_common.h"
+
+#include <assert.h>
+#include <errno.h>
+
+#ifdef RPYTHON_VMPROF
+#ifdef RPYTHON_LL2CTYPES
+   /* only for testing: ll2ctypes sets RPY_EXTERN from the command-line */
+
+#else
+#  include "common_header.h"
+#  include "structdef.h"
+#  include "src/threadlocal.h"
+#  include "rvmprof.h"
+#  include "forwarddecl.h"
+#endif
+#endif
+
+#ifdef VMP_SUPPORTS_NATIVE_PROFILING
+#include "vmp_stack.h" // reduces warings
+#endif
+
+
+static volatile int is_enabled = 0;
+static long prepare_interval_usec = 0;
+static long profile_interval_usec = 0;
+
+#ifdef VMPROF_UNIX
+static int signal_type = SIGPROF;
+static int itimer_type = ITIMER_PROF;
+static pthread_t *threads = NULL;
+static size_t threads_size = 0;
+static size_t thread_count = 0;
+static size_t threads_size_step = 8;
+#endif
+
+int vmprof_get_itimer_type(void) {
+    return itimer_type;
+}
+
+int vmprof_is_enabled(void) {
+    return is_enabled;
+}
+
+void vmprof_set_enabled(int value) {
+    is_enabled = value;
+}
+
+long vmprof_get_prepare_interval_usec(void) {
+    return prepare_interval_usec;
+}
+
+long vmprof_get_profile_interval_usec(void) {
+    return profile_interval_usec;
+}
+
+void vmprof_set_prepare_interval_usec(long value) {
+    prepare_interval_usec = value;
+}
+
+void vmprof_set_profile_interval_usec(long value) {
+    profile_interval_usec = value;
+}
+
+int vmprof_get_signal_type(void) {
+    return signal_type;
+}
+
+char *vmprof_init(int fd, double interval, int memory,
+                  int proflines, const char *interp_name, int native, int 
real_time)
+{
+    if (!(interval >= 1e-6 && interval < 1.0)) {   /* also if it is NaN */
+        return "bad value for 'interval'";
+    }
+    prepare_interval_usec = (int)(interval * 1000000.0);
+
+    if (prepare_concurrent_bufs() < 0)
+        return "out of memory";
+#if VMPROF_UNIX
+    if (real_time) {
+        signal_type = SIGALRM;
+        itimer_type = ITIMER_REAL;
+    } else {
+        signal_type = SIGPROF;
+        itimer_type = ITIMER_PROF;
+    }
+    set_current_codes(NULL);
+    assert(fd >= 0);
+#else
+    if (memory) {
+        return "memory tracking only supported on unix";
+    }
+    if (native) {
+        return "native profiling only supported on unix";
+    }
+#endif
+    vmp_set_profile_fileno(fd);
+    if (opened_profile(interp_name, memory, proflines, native, real_time) < 0) 
{
+        vmp_set_profile_fileno(0);
+        return strerror(errno);
+    }
+    return NULL;
+}
+
+int opened_profile(const char *interp_name, int memory, int proflines, int 
native, int real_time)
+{
+    int success;
+    int bits;
+    struct {
+        long hdr[5];
+        char interp_name[259];
+    } header;
+
+    const char * machine;
+    size_t namelen = strnlen(interp_name, 255);
+
+    machine = vmp_machine_os_name();
+
+    header.hdr[0] = 0;
+    header.hdr[1] = 3;
+    header.hdr[2] = 0;
+    header.hdr[3] = prepare_interval_usec;
+    if (strstr(machine, "win64") != 0) {
+        header.hdr[4] = 1;
+    } else {
+        header.hdr[4] = 0;
+    }
+    header.interp_name[0] = MARKER_HEADER;
+    header.interp_name[1] = '\x00';
+    header.interp_name[2] = VERSION_TIMESTAMP;
+    header.interp_name[3] = memory*PROFILE_MEMORY + proflines*PROFILE_LINES + \
+                            native*PROFILE_NATIVE + 
real_time*PROFILE_REAL_TIME;
+#ifdef RPYTHON_VMPROF
+    header.interp_name[3] += PROFILE_RPYTHON;
+#endif
+    header.interp_name[4] = (char)namelen;
+
+    memcpy(&header.interp_name[5], interp_name, namelen);
+    success = vmp_write_all((char*)&header, 5 * sizeof(long) + 5 + namelen);
+    if (success < 0) {
+        return success;
+    }
+
+    /* Write the time and the zone to the log file, profiling will start now */
+    (void)vmp_write_time_now(MARKER_TIME_N_ZONE);
+
+    /* write some more meta information */
+    vmp_write_meta("os", machine);
+    bits = vmp_machine_bits();
+    if (bits == 64) {
+        vmp_write_meta("bits", "64");
+    } else if (bits == 32) {
+        vmp_write_meta("bits", "32");
+    }
+
+    return success;
+}
+
+
+/* Seems that CPython 3.5.1 made our job harder.  Did not find out how
+   to do that without these hacks.  We can't use PyThreadState_GET(),
+   because that calls PyThreadState_Get() which fails an assert if the
+   result is NULL. */
+#if PY_MAJOR_VERSION >= 3 && !defined(_Py_atomic_load_relaxed)
+                             /* this was abruptly un-defined in 3.5.1 */
+void *volatile _PyThreadState_Current;
+   /* XXX simple volatile access is assumed atomic */
+#  define _Py_atomic_load_relaxed(pp)  (*(pp))
+#endif
+
+#ifdef RPYTHON_VMPROF
+#ifndef RPYTHON_LL2CTYPES
+PY_STACK_FRAME_T *get_vmprof_stack(void)
+{
+    struct pypy_threadlocal_s *tl;
+    _OP_THREADLOCALREF_ADDR_SIGHANDLER(tl);
+    if (tl == NULL) {
+        return NULL;
+    } else {
+        return tl->vmprof_tl_stack;
+    }
+}
+#else
+PY_STACK_FRAME_T *get_vmprof_stack(void)
+{
+    return 0;
+}
+#endif
+
+intptr_t vmprof_get_traceback(void *stack, void *ucontext,
+                              intptr_t *result_p, intptr_t result_length)
+{
+    int n;
+    int enabled;
+#ifdef VMPROF_WINDOWS
+    intptr_t pc = 0;   /* XXX implement me */
+#else
+    intptr_t pc = ucontext ? (intptr_t)GetPC((ucontext_t *)ucontext) : 0;
+#endif
+    if (stack == NULL) {
+        stack = get_vmprof_stack();
+    }
+#ifdef VMP_SUPPORTS_NATIVE_PROFILING
+    enabled = vmp_native_enabled();
+    vmp_native_disable();
+#endif
+    n = get_stack_trace(stack, result_p, result_length - 2, pc);
+#ifdef VMP_SUPPORTS_NATIVE_PROFILING
+    if (enabled) {
+        vmp_native_enable();
+    }
+#endif
+    return (intptr_t)n;
+}
+#endif
+
+#ifdef VMPROF_UNIX
+
+ssize_t search_thread(pthread_t tid, ssize_t i)
+{
+    if (i < 0)
+        i = 0;
+    while ((size_t)i < thread_count) {
+        if (pthread_equal(threads[i], tid))
+            return i;
+        i++;
+    }
+    return -1;
+}
+
+ssize_t insert_thread(pthread_t tid, ssize_t i)
+{
+    assert(signal_type == SIGALRM);
+    i = search_thread(tid, i);
+    if (i > 0)
+        return -1;
+    if (thread_count == threads_size) {
+        threads_size += threads_size_step;
+        threads = realloc(threads, sizeof(pthread_t) * threads_size);
+        assert(threads != NULL);
+        memset(threads + thread_count, 0, sizeof(pthread_t) * threads_size_step);
+    }
+    threads[thread_count++] = tid;
+    return thread_count;
+}
+
+ssize_t remove_thread(pthread_t tid, ssize_t i)
+{
+    assert(signal_type == SIGALRM);
+    if (thread_count == 0)
+        return -1;
+    if (threads == NULL)
+        return -1;
+    i = search_thread(tid, i);
+    if (i < 0)
+        return -1;
+    threads[i] = threads[--thread_count];
+    threads[thread_count] = 0;
+    return thread_count;
+}
+
+ssize_t remove_threads(void)
+{
+    assert(signal_type == SIGALRM);
+    if (threads != NULL) {
+        free(threads);
+        threads = NULL;
+    }
+    thread_count = 0;
+    threads_size = 0;
+    return 0;
+}
+
+int broadcast_signal_for_threads(void)
+{
+    int done = 1;
+    size_t i = 0;
+    pthread_t self = pthread_self();
+    pthread_t tid;
+    while (i < thread_count) {
+        tid = threads[i];
+        if (pthread_equal(tid, self)) {
+            done = 0;
+        } else if (pthread_kill(tid, SIGALRM)) {
+            remove_thread(tid, i);
+        }
+        i++;
+    }
+    return done;
+}
+
+int is_main_thread(void)
+{
+#ifdef VMPROF_LINUX
+    return (getpid() == (pid_t) syscall(SYS_gettid));
+#elif defined(VMPROF_APPLE)
+    return pthread_main_np();
+#else
+    return -1; /* unknown on this platform */
+#endif
+}
+
+#endif
diff --git a/rpython/rlib/rvmprof/src/shared/vmprof_common.h 
b/rpython/rlib/rvmprof/src/shared/vmprof_common.h
--- a/rpython/rlib/rvmprof/src/shared/vmprof_common.h
+++ b/rpython/rlib/rvmprof/src/shared/vmprof_common.h
@@ -8,84 +8,27 @@
 #include <time.h>
 #include <stdlib.h>
 
-#ifndef VMPROF_WINDOWS
+#ifdef VMPROF_UNIX
 #include <sys/time.h>
 #include "vmprof_mt.h"
+#include <signal.h>
+#include <pthread.h>
 #endif
 
+#include "vmprof_getpc.h"
+
 #ifdef VMPROF_LINUX
 #include <syscall.h>
 #endif
 
 #define MAX_FUNC_NAME 1024
 
-static long prepare_interval_usec = 0;
-static long profile_interval_usec = 0;
-
-static int opened_profile(const char *interp_name, int memory, int proflines, 
int native, int real_time);
-
-#ifdef VMPROF_UNIX
-static int signal_type = SIGPROF;
-static int itimer_type = ITIMER_PROF;
-static pthread_t *threads = NULL;
-static size_t threads_size = 0;
-static size_t thread_count = 0;
-static size_t threads_size_step = 8;
-static struct profbuf_s *volatile current_codes;
-#endif
-
 #ifdef VMPROF_UNIX
 
-static inline ssize_t search_thread(pthread_t tid, ssize_t i) {
-    if (i < 0)
-        i = 0;
-    while ((size_t)i < thread_count) {
-        if (pthread_equal(threads[i], tid))
-            return i;
-        i++;
-    }
-    return -1;
-}
-
-ssize_t insert_thread(pthread_t tid, ssize_t i) {
-    assert(signal_type == SIGALRM);
-    i = search_thread(tid, i);
-    if (i > 0)
-        return -1;
-    if (thread_count == threads_size) {
-        threads_size += threads_size_step;
-        threads = realloc(threads, sizeof(pid_t) * threads_size);
-        assert(threads != NULL);
-        memset(threads + thread_count, 0, sizeof(pid_t) * threads_size_step);
-    }
-    threads[thread_count++] = tid;
-    return thread_count;
-}
-
-ssize_t remove_thread(pthread_t tid, ssize_t i) {
-    assert(signal_type == SIGALRM);
-    if (thread_count == 0)
-        return -1;
-    if (threads == NULL)
-        return -1;
-    i = search_thread(tid, i);
-    if (i < 0)
-        return -1;
-    threads[i] = threads[--thread_count];
-    threads[thread_count] = 0;
-    return thread_count;
-}
-
-ssize_t remove_threads(void) {
-    assert(signal_type == SIGALRM);
-    if (threads != NULL) {
-        free(threads);
-        threads = NULL;
-    }
-    thread_count = 0;
-    threads_size = 0;
-    return 0;
-}
+ssize_t search_thread(pthread_t tid, ssize_t i);
+ssize_t insert_thread(pthread_t tid, ssize_t i);
+ssize_t remove_thread(pthread_t tid, ssize_t i);
+ssize_t remove_threads(void);
 
 #endif
 
@@ -130,95 +73,9 @@
 
 RPY_EXTERN
 char *vmprof_init(int fd, double interval, int memory,
-                  int proflines, const char *interp_name, int native, int 
real_time)
-{
-    if (!(interval >= 1e-6 && interval < 1.0)) {   /* also if it is NaN */
-        return "bad value for 'interval'";
-    }
-    prepare_interval_usec = (int)(interval * 1000000.0);
+                  int proflines, const char *interp_name, int native, int 
real_time);
 
-    if (prepare_concurrent_bufs() < 0)
-        return "out of memory";
-#if VMPROF_UNIX
-    if (real_time) {
-        signal_type = SIGALRM;
-        itimer_type = ITIMER_REAL;
-    } else {
-        signal_type = SIGPROF;
-        itimer_type = ITIMER_PROF;
-    }
-    current_codes = NULL;
-    assert(fd >= 0);
-#else
-    if (memory) {
-        return "memory tracking only supported on unix";
-    }
-    if (native) {
-        return "native profiling only supported on unix";
-    }
-#endif
-    vmp_set_profile_fileno(fd);
-    if (opened_profile(interp_name, memory, proflines, native, real_time) < 0) 
{
-        vmp_set_profile_fileno(0);
-        return strerror(errno);
-    }
-    return NULL;
-}
-
-static int opened_profile(const char *interp_name, int memory, int proflines, 
int native, int real_time)
-{
-    int success;
-    int bits;
-    struct {
-        long hdr[5];
-        char interp_name[259];
-    } header;
-
-    const char * machine;
-    size_t namelen = strnlen(interp_name, 255);
-
-    machine = vmp_machine_os_name();
-
-    header.hdr[0] = 0;
-    header.hdr[1] = 3;
-    header.hdr[2] = 0;
-    header.hdr[3] = prepare_interval_usec;
-    if (strstr(machine, "win64") != 0) {
-        header.hdr[4] = 1;
-    } else {
-        header.hdr[4] = 0;
-    }
-    header.interp_name[0] = MARKER_HEADER;
-    header.interp_name[1] = '\x00';
-    header.interp_name[2] = VERSION_TIMESTAMP;
-    header.interp_name[3] = memory*PROFILE_MEMORY + proflines*PROFILE_LINES + \
-                            native*PROFILE_NATIVE + 
real_time*PROFILE_REAL_TIME;
-#ifdef RPYTHON_VMPROF
-    header.interp_name[3] += PROFILE_RPYTHON;
-#endif
-    header.interp_name[4] = (char)namelen;
-
-    memcpy(&header.interp_name[5], interp_name, namelen);
-    success = vmp_write_all((char*)&header, 5 * sizeof(long) + 5 + namelen);
-    if (success < 0) {
-        return success;
-    }
-
-    /* Write the time and the zone to the log file, profiling will start now */
-    (void)vmp_write_time_now(MARKER_TIME_N_ZONE);
-
-    /* write some more meta information */
-    vmp_write_meta("os", machine);
-    bits = vmp_machine_bits();
-    if (bits == 64) {
-        vmp_write_meta("bits", "64");
-    } else if (bits == 32) {
-        vmp_write_meta("bits", "32");
-    }
-
-    return success;
-}
-
+int opened_profile(const char *interp_name, int memory, int proflines, int 
native, int real_time);
 
 /* Seems that CPython 3.5.1 made our job harder.  Did not find out how
    to do that without these hacks.  We can't use PyThreadState_GET(),
@@ -233,46 +90,22 @@
 
 #ifdef RPYTHON_VMPROF
 #ifndef RPYTHON_LL2CTYPES
-static PY_STACK_FRAME_T *get_vmprof_stack(void)
-{
-    struct pypy_threadlocal_s *tl;
-    _OP_THREADLOCALREF_ADDR_SIGHANDLER(tl);
-    if (tl == NULL)
-        return NULL;
-    else
-        return tl->vmprof_tl_stack;
-}
-#else
-static PY_STACK_FRAME_T *get_vmprof_stack(void)
-{
-    return 0;
-}
+PY_STACK_FRAME_T *get_vmprof_stack(void);
+#endif
+RPY_EXTERN
+intptr_t vmprof_get_traceback(void *stack, void *ucontext,
+                              intptr_t *result_p, intptr_t result_length);
 #endif
 
-RPY_EXTERN
-intptr_t vmprof_get_traceback(void *stack, void *ucontext,
-                              intptr_t *result_p, intptr_t result_length)
-{
-    int n;
-    int enabled;
-#ifdef VMPROF_WINDOWS
-    intptr_t pc = 0;   /* XXX implement me */
-#else
-    intptr_t pc = ucontext ? (intptr_t)GetPC((ucontext_t *)ucontext) : 0;
+int vmprof_get_signal_type(void);
+long vmprof_get_prepare_interval_usec(void);
+long vmprof_get_profile_interval_usec(void);
+void vmprof_set_prepare_interval_usec(long value);
+void vmprof_set_profile_interval_usec(long value);
+int vmprof_is_enabled(void);
+void vmprof_set_enabled(int value);
+int vmprof_get_itimer_type(void);
+#ifdef VMPROF_UNIX
+int broadcast_signal_for_threads(void);
+int is_main_thread(void);
 #endif
-    if (stack == NULL) {
-        stack = get_vmprof_stack();
-    }
-#ifdef VMP_SUPPORTS_NATIVE_PROFILING
-    enabled = vmp_native_enabled();
-    vmp_native_disable();
-#endif
-    n = get_stack_trace(stack, result_p, result_length - 2, pc);
-#ifdef VMP_SUPPORTS_NATIVE_PROFILING
-    if (enabled) {
-        vmp_native_enable();
-    }
-#endif
-    return (intptr_t)n;
-}
-#endif
diff --git a/rpython/rlib/rvmprof/src/shared/vmprof_getpc.h 
b/rpython/rlib/rvmprof/src/shared/vmprof_getpc.h
--- a/rpython/rlib/rvmprof/src/shared/vmprof_getpc.h
+++ b/rpython/rlib/rvmprof/src/shared/vmprof_getpc.h
@@ -142,6 +142,7 @@
 // the right value for your system, and add it to the list in
 // vmrpof_config.h
 #else
+
 static intptr_t GetPC(ucontext_t *signal_ucontext) {
   return signal_ucontext->PC_FROM_UCONTEXT;   // defined in config.h
 }
diff --git a/rpython/rlib/rvmprof/src/shared/vmprof_memory.c 
b/rpython/rlib/rvmprof/src/shared/vmprof_memory.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/shared/vmprof_memory.c
@@ -0,0 +1,81 @@
+#include "vmprof_memory.h"
+
+#ifdef VMPROF_APPLE
+/* On OS X we can get RSS using the Mach API. */
+#include <mach/mach.h>
+#include <mach/message.h>
+#include <mach/kern_return.h>
+#include <mach/task_info.h>
+
+static mach_port_t mach_task;
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+/* On '''normal''' Unices we can get RSS from '/proc/<pid>/status'. */
+static int proc_file = -1;
+#endif
+
+int setup_rss(void)
+{
+#ifdef VMPROF_LINUX
+    char buf[128];
+
+    sprintf(buf, "/proc/%d/status", getpid());
+    proc_file = open(buf, O_RDONLY);
+    return proc_file;
+#elif defined(VMPROF_APPLE)
+    mach_task = mach_task_self();
+    return 0;
+#else
+    return 0;
+#endif
+}
+
+int teardown_rss(void)
+{
+#ifdef VMPROF_LINUX
+    close(proc_file);
+    proc_file = -1;
+    return 0;
+#else
+    return 0;
+#endif
+}
+
+long get_current_proc_rss(void)
+{
+#ifdef VMPROF_LINUX
+    char buf[1024];
+    int i = 0;
+
+    if (lseek(proc_file, 0, SEEK_SET) == -1)
+        return -1;
+    if (read(proc_file, buf, 1024) == -1)
+        return -1;
+    while (i < 1020) {
+        if (strncmp(buf + i, "VmRSS:\t", 7) == 0) {
+            i += 7;
+            return atoi(buf + i);
+        }
+        i++;
+    }
+    return -1;
+#elif defined(VMPROF_APPLE)
+    mach_msg_type_number_t out_count = MACH_TASK_BASIC_INFO_COUNT;
+    mach_task_basic_info_data_t taskinfo = { .resident_size = 0 };
+
+    kern_return_t error = task_info(mach_task, MACH_TASK_BASIC_INFO, 
(task_info_t)&taskinfo, &out_count);
+    if (error == KERN_SUCCESS) {
+        return (long)(taskinfo.resident_size / 1024);
+    } else {
+        return -1;
+    }
+#else
+    return -1; // not implemented
+#endif
+}
diff --git a/rpython/rlib/rvmprof/src/shared/vmprof_memory.h 
b/rpython/rlib/rvmprof/src/shared/vmprof_memory.h
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/shared/vmprof_memory.h
@@ -0,0 +1,5 @@
+#pragma once
+
+int setup_rss(void);
+int teardown_rss(void);
+long get_current_proc_rss(void);
diff --git a/rpython/rlib/rvmprof/src/shared/vmprof_mt.c 
b/rpython/rlib/rvmprof/src/shared/vmprof_mt.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/shared/vmprof_mt.c
@@ -0,0 +1,181 @@
+#include "vmprof_mt.h"
+/* Support for multithreaded write() operations (implementation) */
+
+#include <assert.h>
+
+#if defined(__i386__) || defined(__amd64__)
+  static inline void write_fence(void) { asm("" : : : "memory"); }
+#else
+  static inline void write_fence(void) { __sync_synchronize(); }
+#endif
+
+static char volatile profbuf_state[MAX_NUM_BUFFERS];
+static struct profbuf_s *profbuf_all_buffers = NULL;
+static int volatile profbuf_write_lock = 2;
+static long profbuf_pending_write;
+
+
+static void unprepare_concurrent_bufs(void)
+{
+    if (profbuf_all_buffers != NULL) {
+        munmap(profbuf_all_buffers, sizeof(struct profbuf_s) * 
MAX_NUM_BUFFERS);
+        profbuf_all_buffers = NULL;
+    }
+}
+
+int prepare_concurrent_bufs(void)
+{
+    assert(sizeof(struct profbuf_s) == 8192);
+
+    unprepare_concurrent_bufs();
+    profbuf_all_buffers = mmap(NULL, sizeof(struct profbuf_s) * 
MAX_NUM_BUFFERS,
+                               PROT_READ | PROT_WRITE,
+                               MAP_PRIVATE | MAP_ANONYMOUS,
+                               -1, 0);
+    if (profbuf_all_buffers == MAP_FAILED) {
+        profbuf_all_buffers = NULL;
+        return -1;
+    }
+    memset((char *)profbuf_state, PROFBUF_UNUSED, sizeof(profbuf_state));
+    profbuf_write_lock = 0;
+    profbuf_pending_write = -1;
+    return 0;
+}
+
+static int _write_single_ready_buffer(int fd, long i)
+{
+    /* Try to write to disk the buffer number 'i'.  This function must
+       only be called while we hold the write lock. */
+    assert(profbuf_write_lock != 0);
+
+    if (profbuf_pending_write >= 0) {
+        /* A partially written buffer is waiting.  We'll write the
+           rest of this buffer now, instead of 'i'. */
+        i = profbuf_pending_write;
+        assert(profbuf_state[i] == PROFBUF_READY);
+    }
+
+    if (profbuf_state[i] != PROFBUF_READY) {
+        /* this used to be a race condition: the buffer was written by a
+           different thread already, nothing to do now */
+        return 0;
+    }
+
+    int err;
+    struct profbuf_s *p = &profbuf_all_buffers[i];
+    ssize_t count = write(fd, p->data + p->data_offset, p->data_size);
+    if (count == p->data_size) {
+        profbuf_state[i] = PROFBUF_UNUSED;
+        profbuf_pending_write = -1;
+    }
+    else {
+        if (count > 0) {
+            p->data_offset += count;
+            p->data_size -= count;
+        }
+        profbuf_pending_write = i;
+        if (count < 0)
+            return -1;
+    }
+    return 0;
+}
+
+static void _write_ready_buffers(int fd)
+{
+    long i;
+    int has_write_lock = 0;
+
+    for (i = 0; i < MAX_NUM_BUFFERS; i++) {
+        if (profbuf_state[i] == PROFBUF_READY) {
+            if (!has_write_lock) {
+                if (!__sync_bool_compare_and_swap(&profbuf_write_lock, 0, 1))
+                    return;   /* can't acquire the write lock, give up */
+                has_write_lock = 1;
+            }
+            if (_write_single_ready_buffer(fd, i) < 0)
+                break;
+        }
+    }
+    if (has_write_lock)
+        profbuf_write_lock = 0;
+}
+
+struct profbuf_s *reserve_buffer(int fd)
+{
+    /* Tries to enter a region of code that fills one buffer.  If
+       successful, returns the profbuf_s.  It fails only if the
+       concurrent buffers are all busy (extreme multithreaded usage).
+
+       This might call write() to emit the data sitting in
+       previously-prepared buffers.  In case of write() error, the
+       error is ignored but unwritten data stays in the buffers.
+    */
+    long i;
+
+    _write_ready_buffers(fd);
+
+    for (i = 0; i < MAX_NUM_BUFFERS; i++) {
+        if (profbuf_state[i] == PROFBUF_UNUSED &&
+            __sync_bool_compare_and_swap(&profbuf_state[i], PROFBUF_UNUSED,
+                                         PROFBUF_FILLING)) {
+            struct profbuf_s *p = &profbuf_all_buffers[i];
+            p->data_size = 0;
+            p->data_offset = 0;
+            return p;
+        }
+    }
+    /* no unused buffer found */
+    return NULL;
+}
+
+void commit_buffer(int fd, struct profbuf_s *buf)
+{
+    /* Leaves a region of code that filled 'buf'.
+
+       This might call write() to emit the data now ready.  In case of
+       write() error, the error is ignored but unwritten data stays in
+       the buffers.
+    */
+
+    /* Make sure every thread sees the full content of 'buf' */
+    write_fence();
+
+    /* Then set the 'ready' flag */
+    long i = buf - profbuf_all_buffers;
+    assert(profbuf_state[i] == PROFBUF_FILLING);
+    profbuf_state[i] = PROFBUF_READY;
+
+    if (!__sync_bool_compare_and_swap(&profbuf_write_lock, 0, 1)) {
+        /* can't acquire the write lock, ignore */
+    }
+    else {
+        _write_single_ready_buffer(fd, i);
+        profbuf_write_lock = 0;
+    }
+}
+
+void cancel_buffer(struct profbuf_s *buf)
+{
+    long i = buf - profbuf_all_buffers;
+    assert(profbuf_state[i] == PROFBUF_FILLING);
+    profbuf_state[i] = PROFBUF_UNUSED;
+}
+
+int shutdown_concurrent_bufs(int fd)
+{
+    /* no signal handler can be running concurrently here, because we
+       already did vmprof_ignore_signals(1) */
+    assert(profbuf_write_lock == 0);
+    profbuf_write_lock = 2;
+
+    /* last attempt to flush buffers */
+    int i;
+    for (i = 0; i < MAX_NUM_BUFFERS; i++) {
+        while (profbuf_state[i] == PROFBUF_READY) {
+            if (_write_single_ready_buffer(fd, i) < 0)
+                return -1;
+        }
+    }
+    unprepare_concurrent_bufs();
+    return 0;
+}
diff --git a/rpython/rlib/rvmprof/src/shared/vmprof_mt.h 
b/rpython/rlib/rvmprof/src/shared/vmprof_mt.h
--- a/rpython/rlib/rvmprof/src/shared/vmprof_mt.h
+++ b/rpython/rlib/rvmprof/src/shared/vmprof_mt.h
@@ -1,11 +1,11 @@
 #pragma once
 /* Support for multithreaded write() operations */
 
+#include "vmprof.h"
+
 #include <string.h>
 #include <sys/mman.h>
 
-#include "vmprof.h"
-
 /* The idea is that we have MAX_NUM_BUFFERS available, all of size
    SINGLE_BUF_SIZE.  Threads and signal handlers can ask to reserve a
    buffer, fill it, and finally "commit" it, at which point its
@@ -29,12 +29,6 @@
 */
 #define MAX_NUM_BUFFERS  20
 
-#if defined(__i386__) || defined(__amd64__)
-  static inline void write_fence(void) { asm("" : : : "memory"); }
-#else
-  static inline void write_fence(void) { __sync_synchronize(); }
-#endif
-
 #ifndef MAP_ANONYMOUS
 #define MAP_ANONYMOUS MAP_ANON
 #endif
@@ -50,173 +44,8 @@
     char data[SINGLE_BUF_SIZE];
 };
 
-static char volatile profbuf_state[MAX_NUM_BUFFERS];
-static struct profbuf_s *profbuf_all_buffers = NULL;
-static int volatile profbuf_write_lock = 2;
-static long profbuf_pending_write;
-
-
-static void unprepare_concurrent_bufs(void)
-{
-    if (profbuf_all_buffers != NULL) {
-        munmap(profbuf_all_buffers, sizeof(struct profbuf_s) * 
MAX_NUM_BUFFERS);
-        profbuf_all_buffers = NULL;
-    }
-}
-
-static int prepare_concurrent_bufs(void)
-{
-    assert(sizeof(struct profbuf_s) == 8192);
-
-    unprepare_concurrent_bufs();
-    profbuf_all_buffers = mmap(NULL, sizeof(struct profbuf_s) * 
MAX_NUM_BUFFERS,
-                               PROT_READ | PROT_WRITE,
-                               MAP_PRIVATE | MAP_ANONYMOUS,
-                               -1, 0);
-    if (profbuf_all_buffers == MAP_FAILED) {
-        profbuf_all_buffers = NULL;
-        return -1;
-    }
-    memset((char *)profbuf_state, PROFBUF_UNUSED, sizeof(profbuf_state));
-    profbuf_write_lock = 0;
-    profbuf_pending_write = -1;
-    return 0;
-}
-
-static int _write_single_ready_buffer(int fd, long i)
-{
-    /* Try to write to disk the buffer number 'i'.  This function must
-       only be called while we hold the write lock. */
-    assert(profbuf_write_lock != 0);
-
-    if (profbuf_pending_write >= 0) {
-        /* A partially written buffer is waiting.  We'll write the
-           rest of this buffer now, instead of 'i'. */
-        i = profbuf_pending_write;
-        assert(profbuf_state[i] == PROFBUF_READY);
-    }
-
-    if (profbuf_state[i] != PROFBUF_READY) {
-        /* this used to be a race condition: the buffer was written by a
-           different thread already, nothing to do now */
-        return 0;
-    }
-
-    int err;
-    struct profbuf_s *p = &profbuf_all_buffers[i];
-    ssize_t count = write(fd, p->data + p->data_offset, p->data_size);
-    if (count == p->data_size) {
-        profbuf_state[i] = PROFBUF_UNUSED;
-        profbuf_pending_write = -1;
-    }
-    else {
-        if (count > 0) {
-            p->data_offset += count;
-            p->data_size -= count;
-        }
-        profbuf_pending_write = i;
-        if (count < 0)
-            return -1;
-    }
-    return 0;
-}
-
-static void _write_ready_buffers(int fd)
-{
-    long i;
-    int has_write_lock = 0;
-
-    for (i = 0; i < MAX_NUM_BUFFERS; i++) {
-        if (profbuf_state[i] == PROFBUF_READY) {
-            if (!has_write_lock) {
-                if (!__sync_bool_compare_and_swap(&profbuf_write_lock, 0, 1))
-                    return;   /* can't acquire the write lock, give up */
-                has_write_lock = 1;
-            }
-            if (_write_single_ready_buffer(fd, i) < 0)
-                break;
-        }
-    }
-    if (has_write_lock)
-        profbuf_write_lock = 0;
-}
-
-static struct profbuf_s *reserve_buffer(int fd)
-{
-    /* Tries to enter a region of code that fills one buffer.  If
-       successful, returns the profbuf_s.  It fails only if the
-       concurrent buffers are all busy (extreme multithreaded usage).
-
-       This might call write() to emit the data sitting in
-       previously-prepared buffers.  In case of write() error, the
-       error is ignored but unwritten data stays in the buffers.
-    */
-    long i;
-
-    _write_ready_buffers(fd);
-
-    for (i = 0; i < MAX_NUM_BUFFERS; i++) {
-        if (profbuf_state[i] == PROFBUF_UNUSED &&
-            __sync_bool_compare_and_swap(&profbuf_state[i], PROFBUF_UNUSED,
-                                         PROFBUF_FILLING)) {
-            struct profbuf_s *p = &profbuf_all_buffers[i];
-            p->data_size = 0;
-            p->data_offset = 0;
-            return p;
-        }
-    }
-    /* no unused buffer found */
-    return NULL;
-}
-
-static void commit_buffer(int fd, struct profbuf_s *buf)
-{
-    /* Leaves a region of code that filled 'buf'.
-
-       This might call write() to emit the data now ready.  In case of
-       write() error, the error is ignored but unwritten data stays in
-       the buffers.
-    */
-
-    /* Make sure every thread sees the full content of 'buf' */
-    write_fence();
-
-    /* Then set the 'ready' flag */
-    long i = buf - profbuf_all_buffers;
-    assert(profbuf_state[i] == PROFBUF_FILLING);
-    profbuf_state[i] = PROFBUF_READY;
-
-    if (!__sync_bool_compare_and_swap(&profbuf_write_lock, 0, 1)) {
-        /* can't acquire the write lock, ignore */
-    }
-    else {
-        _write_single_ready_buffer(fd, i);
-        profbuf_write_lock = 0;
-    }
-}
-
-static void cancel_buffer(struct profbuf_s *buf)
-{
-    long i = buf - profbuf_all_buffers;
-    assert(profbuf_state[i] == PROFBUF_FILLING);
-    profbuf_state[i] = PROFBUF_UNUSED;
-}
-
-static int shutdown_concurrent_bufs(int fd)
-{
-    /* no signal handler can be running concurrently here, because we
-       already did vmprof_ignore_signals(1) */
-    assert(profbuf_write_lock == 0);
-    profbuf_write_lock = 2;
-
-    /* last attempt to flush buffers */
-    int i;
-    for (i = 0; i < MAX_NUM_BUFFERS; i++) {
-        while (profbuf_state[i] == PROFBUF_READY) {
-            if (_write_single_ready_buffer(fd, i) < 0)
-                return -1;
-        }
-    }
-    unprepare_concurrent_bufs();
-    return 0;
-}
+int prepare_concurrent_bufs(void);
+struct profbuf_s *reserve_buffer(int fd);
+void commit_buffer(int fd, struct profbuf_s *buf);
+void cancel_buffer(struct profbuf_s *buf);
+int shutdown_concurrent_bufs(int fd);
diff --git a/rpython/rlib/rvmprof/src/shared/vmprof_unix.c 
b/rpython/rlib/rvmprof/src/shared/vmprof_unix.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/shared/vmprof_unix.c
@@ -0,0 +1,496 @@
+#include "vmprof_unix.h"
+
+#ifdef VMPROF_UNIX
+
+#if VMPROF_LINUX
+#include <syscall.h>
+#endif
+
+
+#include <dlfcn.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+
+#include "vmp_stack.h"
+#include "vmprof_mt.h"
+#include "vmprof_getpc.h"
+#include "vmprof_common.h"
+#include "vmprof_memory.h"
+#include "compat.h"
+
+
+
+/* value: LSB bit is 1 if signals must be ignored; all other bits
+   are a counter for how many threads are currently in a signal handler */
+static long volatile signal_handler_ignore = 1;
+static long volatile signal_handler_entries = 0;
+static char atfork_hook_installed = 0;
+static volatile int spinlock;
+static jmp_buf restore_point;
+static struct profbuf_s *volatile current_codes;
+
+
+void vmprof_ignore_signals(int ignored)
+{
+    if (ignored) {
+        /* set the last bit, and wait until concurrently-running signal
+           handlers finish */
+        __sync_add_and_fetch(&signal_handler_ignore, 1L);
+        while (signal_handler_entries != 0L) {
+            usleep(1);
+        }
+    } else {
+        __sync_sub_and_fetch(&signal_handler_ignore, 1L);
+    }
+}
+
+long vmprof_enter_signal(void)
+{
+    __sync_fetch_and_add(&signal_handler_entries, 1L);
+    return signal_handler_ignore;
+}
+
+long vmprof_exit_signal(void)
+{
+    return __sync_sub_and_fetch(&signal_handler_entries, 1L);
+}
+
+int install_pthread_atfork_hooks(void) {
+    /* this is needed to prevent the problems described there:
+         - http://code.google.com/p/gperftools/issues/detail?id=278
+         - http://lists.debian.org/debian-glibc/2010/03/msg00161.html
+
+        TL;DR: if the RSS of the process is large enough, the clone() syscall
+        will be interrupted by the SIGPROF before it can complete, then
+        retried, interrupted again and so on, in an endless loop.  The
+        solution is to disable the timer around the fork, and re-enable it
+        only inside the parent.
+    */
+    if (atfork_hook_installed)
+        return 0;
+    int ret = pthread_atfork(atfork_disable_timer, atfork_enable_timer, 
atfork_close_profile_file);
+    if (ret != 0)
+        return -1;
+    atfork_hook_installed = 1;
+    return 0;
+}
+
+void segfault_handler(int arg)
+{
+    longjmp(restore_point, SIGSEGV);
+}
+
+int _vmprof_sample_stack(struct profbuf_s *p, PY_THREAD_STATE_T * tstate, 
ucontext_t * uc)
+{
+    int depth;
+    struct prof_stacktrace_s *st = (struct prof_stacktrace_s *)p->data;
+    st->marker = MARKER_STACKTRACE;
+    st->count = 1;
+#ifdef RPYTHON_VMPROF
+    depth = get_stack_trace(get_vmprof_stack(), st->stack, MAX_STACK_DEPTH-1, 
(intptr_t)GetPC(uc));
+#else
+    depth = get_stack_trace(tstate, st->stack, MAX_STACK_DEPTH-1, 
(intptr_t)NULL);
+#endif
+    // useful for tests (see test_stop_sampling)
+#ifndef RPYTHON_LL2CTYPES
+    if (depth == 0) {
+        return 0;
+    }
+#endif
+    st->depth = depth;
+    st->stack[depth++] = tstate;
+    long rss = get_current_proc_rss();
+    if (rss >= 0)
+        st->stack[depth++] = (void*)rss;
+    p->data_offset = offsetof(struct prof_stacktrace_s, marker);
+    p->data_size = (depth * sizeof(void *) +
+                    sizeof(struct prof_stacktrace_s) -
+                    offsetof(struct prof_stacktrace_s, marker));
+    return 1;
+}
+
+#ifndef RPYTHON_VMPROF
+PY_THREAD_STATE_T * _get_pystate_for_this_thread(void) {
+    // see issue 116 on github.com/vmprof/vmprof-python.
+    // PyGILState_GetThisThreadState(); can hang forever
+    //
+    PyInterpreterState * istate;
+    PyThreadState * state;
+    long mythread_id;
+
+    mythread_id = PyThread_get_thread_ident();
+    istate = PyInterpreterState_Head();
+    if (istate == NULL) {
+        fprintf(stderr, "WARNING: interp state head is null (for thread id 
%ld)\n", mythread_id);
+        return NULL;
+    }
+    // fish fish fish, it will NOT lock the keymutex in pythread
+    do {
+        state = PyInterpreterState_ThreadHead(istate);
+        do {
+            if (state->thread_id == mythread_id) {
+                return state;
+            }
+        } while ((state = PyThreadState_Next(state)) != NULL);
+    } while ((istate = PyInterpreterState_Next(istate)) != NULL);
+
+    // uh? not found?
+    fprintf(stderr, "WARNING: cannot find thread state (for thread id %ld), 
sample will be thrown away\n", mythread_id);
+    return NULL;
+}
+#endif
+
+void flush_codes(void)
+{
+    struct profbuf_s *p = current_codes;
+    if (p != NULL) {
+        current_codes = NULL;
+        commit_buffer(vmp_profile_fileno(), p);
+    }
+}
+
+void set_current_codes(void * to) {
+    current_codes = to;
+}
+
+#endif
+
+void vmprof_aquire_lock(void) {
+    while (__sync_lock_test_and_set(&spinlock, 1)) {
+    }
+}
+
+void vmprof_release_lock(void) {
+    __sync_lock_release(&spinlock);
+}
+
+void sigprof_handler(int sig_nr, siginfo_t* info, void *ucontext)
+{
+    int commit;
+    PY_THREAD_STATE_T * tstate = NULL;
+    void (*prevhandler)(int);
+
+#ifndef RPYTHON_VMPROF
+
+    // Even though the docs say that this function call is for 'esoteric use'
+    // it seems to be correctly set when the interpreter is teared down!
+    if (!Py_IsInitialized()) {
+        return;
+    }
+
+    // TERRIBLE HACK AHEAD
+    // on OS X, the thread local storage is sometimes uninitialized
+    // when the signal handler runs - it means it's impossible to read errno
+    // or call any syscall or read PyThread_Current or pthread_self. 
Additionally,
+    // it seems impossible to read the register gs.
+    // here we register segfault handler (all guarded by a spinlock) and call
+    // longjmp in case segfault happens while reading a thread local
+    //
+    // We do the same error detection for linux to ensure that
+    // get_current_thread_state returns a sane result
+    while (__sync_lock_test_and_set(&spinlock, 1)) {
+    }
+
+#ifdef VMPROF_UNIX
+    // SIGNAL ABUSE AHEAD
+    // On linux, the prof timer will deliver the signal to the thread which 
triggered the timer,
+    // because these timers are based on process and system time, and as such, 
are thread-aware.
+    // For the real timer, the signal gets delivered to the main thread, 
seemingly always.
+    // Consequently if we want to sample multiple threads, we need to forward 
this signal.
+    if (vmprof_get_signal_type() == SIGALRM) {
+        if (is_main_thread() && broadcast_signal_for_threads()) {
+            __sync_lock_release(&spinlock);
+            return;
+        }
+    }
+#endif
+
+    prevhandler = signal(SIGSEGV, &segfault_handler);
+    int fault_code = setjmp(restore_point);
+    if (fault_code == 0) {
+        pthread_self();
+        tstate = _get_pystate_for_this_thread();
+    } else {
+        signal(SIGSEGV, prevhandler);
+        __sync_lock_release(&spinlock);
+        return;
+    }
+    signal(SIGSEGV, prevhandler);
+    __sync_lock_release(&spinlock);
+#endif
+
+    long val = vmprof_enter_signal();
+
+    if (val == 0) {
+        int saved_errno = errno;
+        int fd = vmp_profile_fileno();
+        assert(fd >= 0);
+
+        struct profbuf_s *p = reserve_buffer(fd);
+        if (p == NULL) {
+            /* ignore this signal: there are no free buffers right now */
+        } else {
+#ifdef RPYTHON_VMPROF
+            commit = _vmprof_sample_stack(p, NULL, (ucontext_t*)ucontext);
+#else
+            commit = _vmprof_sample_stack(p, tstate, (ucontext_t*)ucontext);
+#endif
+            if (commit) {
+                commit_buffer(fd, p);
+            } else {
+#ifndef RPYTHON_VMPROF
+                fprintf(stderr, "WARNING: canceled buffer, no stack trace was 
written\n");
+#else
+                fprintf(stderr, "WARNING: canceled buffer, no stack trace was 
written\n");
+#endif
+                cancel_buffer(p);
+            }
+        }
+
+        errno = saved_errno;
+    }
+
+    vmprof_exit_signal();
+}
+
+int install_sigprof_handler(void)
+{
+    struct sigaction sa;
+    memset(&sa, 0, sizeof(sa));
+    sa.sa_sigaction = sigprof_handler;
+    sa.sa_flags = SA_RESTART | SA_SIGINFO;
+    if (sigemptyset(&sa.sa_mask) == -1 ||
+        sigaction(vmprof_get_signal_type(), &sa, NULL) == -1)
+        return -1;
+    return 0;
+}
+
+int remove_sigprof_handler(void)
+{
+    struct sigaction ign_sigint, prev;
+    ign_sigint.sa_handler = SIG_IGN;
+    ign_sigint.sa_flags = 0;
+    sigemptyset(&ign_sigint.sa_mask);
+
+    if (sigaction(vmprof_get_signal_type(), &ign_sigint, NULL) < 0) {
+        fprintf(stderr, "Could not remove the signal handler (for 
profiling)\n");
+        return -1;
+    }
+    return 0;
+}
+
+int install_sigprof_timer(void)
+{
+    static struct itimerval timer;
+    timer.it_interval.tv_sec = 0;
+    timer.it_interval.tv_usec = (int)vmprof_get_profile_interval_usec();
+    timer.it_value = timer.it_interval;
+    if (setitimer(vmprof_get_itimer_type(), &timer, NULL) != 0)
+        return -1;
+    return 0;
+}
+
+int remove_sigprof_timer(void)
+{
+    static struct itimerval timer;
+    timerclear(&(timer.it_interval));
+    timerclear(&(timer.it_value));
+    if (setitimer(vmprof_get_itimer_type(), &timer, NULL) != 0) {
+        fprintf(stderr, "Could not disable the signal handler (for 
profiling)\n");
+        return -1;
+    }
+    return 0;
+}
+
+void atfork_disable_timer(void)
+{
+    if (vmprof_get_profile_interval_usec() > 0) {
+        remove_sigprof_timer();
+        vmprof_set_enabled(0);
+    }
+}
+
+void atfork_close_profile_file(void)
+{
+    int fd = vmp_profile_fileno();
+    if (fd != -1)
+        close(fd);
+    vmp_set_profile_fileno(-1);
+}
+void atfork_enable_timer(void)
+{
+    if (vmprof_get_profile_interval_usec() > 0) {
+        install_sigprof_timer();
+        vmprof_set_enabled(1);
+    }
+}
+
+#ifdef VMP_SUPPORTS_NATIVE_PROFILING
+void init_cpyprof(int native)
+{
+    // skip this if native should not be enabled
+    if (!native) {
+        vmp_native_disable();
+        return;
+    }
+    vmp_native_enable();
+}
+
+static void disable_cpyprof(void)
+{
+    vmp_native_disable();
+}
+#endif
+
+int vmprof_enable(int memory, int native, int real_time)
+{
+#ifdef VMP_SUPPORTS_NATIVE_PROFILING
+    init_cpyprof(native);
+#endif
+    assert(vmp_profile_fileno() >= 0);
+    assert(vmprof_get_prepare_interval_usec() > 0);
+    vmprof_set_profile_interval_usec(vmprof_get_prepare_interval_usec());
+    if (memory && setup_rss() == -1)
+        goto error;
+#if VMPROF_UNIX
+    if (real_time && insert_thread(pthread_self(), -1) == -1)
+        goto error;
+#endif
+    if (install_pthread_atfork_hooks() == -1)
+        goto error;
+    if (install_sigprof_handler() == -1)
+        goto error;
+    if (install_sigprof_timer() == -1)
+        goto error;
+    vmprof_ignore_signals(0);
+    return 0;
+
+ error:
+    vmp_set_profile_fileno(-1);
+    vmprof_set_profile_interval_usec(0);
+    return -1;
+}
+
+
+int close_profile(void)
+{
+    int fileno = vmp_profile_fileno();
+    fsync(fileno);
+    (void)vmp_write_time_now(MARKER_TRAILER);
+    teardown_rss();
+
+    /* don't close() the file descriptor from here */
+    vmp_set_profile_fileno(-1);
+    return 0;
+}
+
+int vmprof_disable(void)
+{
+    vmprof_ignore_signals(1);
+    vmprof_set_profile_interval_usec(0);
+#ifdef VMP_SUPPORTS_NATIVE_PROFILING
+    disable_cpyprof();
+#endif
+
+    if (remove_sigprof_timer() == -1) {
+        return -1;
+    }
+    if (remove_sigprof_handler() == -1) {
+        return -1;
+    }
+#ifdef VMPROF_UNIX
+    if ((vmprof_get_signal_type() == SIGALRM) && remove_threads() == -1) {
+        return -1;
+    }
+#endif
+    flush_codes();
+    if (shutdown_concurrent_bufs(vmp_profile_fileno()) < 0)
+        return -1;
+    return close_profile();
+}
+
+int vmprof_register_virtual_function(char *code_name, intptr_t code_uid,
+                                     int auto_retry)
+{
+    long namelen = strnlen(code_name, 1023);
+    long blocklen = 1 + sizeof(intptr_t) + sizeof(long) + namelen;
+    struct profbuf_s *p;
+    char *t;
+
+ retry:
+    p = current_codes;
+    if (p != NULL) {
+        if (__sync_bool_compare_and_swap(&current_codes, p, NULL)) {
+            /* grabbed 'current_codes': we will append the current block
+               to it if it contains enough room */
+            size_t freesize = SINGLE_BUF_SIZE - p->data_size;
+            if (freesize < (size_t)blocklen) {
+                /* full: flush it */
+                commit_buffer(vmp_profile_fileno(), p);
+                p = NULL;
+            }
+        }
+        else {
+            /* compare-and-swap failed, don't try again */
+            p = NULL;
+        }
+    }
+
+    if (p == NULL) {
+        p = reserve_buffer(vmp_profile_fileno());
+        if (p == NULL) {
+            /* can't get a free block; should almost never be the
+               case.  Spin loop if allowed, or return a failure code
+               if not (e.g. we're in a signal handler) */
+            if (auto_retry > 0) {
+                auto_retry--;
+                usleep(1);
+                goto retry;
+            }
+            return -1;
+        }
+    }
+
+    t = p->data + p->data_size;
+    p->data_size += blocklen;
+    assert(p->data_size <= SINGLE_BUF_SIZE);
+    *t++ = MARKER_VIRTUAL_IP;
+    memcpy(t, &code_uid, sizeof(intptr_t)); t += sizeof(intptr_t);
+    memcpy(t, &namelen, sizeof(long)); t += sizeof(long);
+    memcpy(t, code_name, namelen);
+
+    /* try to reattach 'p' to 'current_codes' */
+    if (!__sync_bool_compare_and_swap(&current_codes, NULL, p)) {
+        /* failed, flush it */
+        commit_buffer(vmp_profile_fileno(), p);
+    }
+    return 0;
+}
+
+int get_stack_trace(PY_THREAD_STATE_T * current, void** result, int max_depth, 
intptr_t pc)
+{
+    PY_STACK_FRAME_T * frame;
+#ifdef RPYTHON_VMPROF
+    // do nothing here,
+    frame = (PY_STACK_FRAME_T*)current;
+#else
+    if (current == NULL) {
+        fprintf(stderr, "WARNING: get_stack_trace, current is NULL\n");
+        return 0;
+    }
+    frame = current->frame;
+#endif
+    if (frame == NULL) {
+        fprintf(stderr, "WARNING: get_stack_trace, frame is NULL\n");
+        return 0;
+    }
+    return vmp_walk_and_record_stack(frame, result, max_depth, 1, pc);
+}
diff --git a/rpython/rlib/rvmprof/src/shared/vmprof_unix.h 
b/rpython/rlib/rvmprof/src/shared/vmprof_unix.h
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/shared/vmprof_unix.h
@@ -0,0 +1,86 @@
+#pragma once
+
+/* VMPROF
+ *
+ * statistical sampling profiler specifically designed to profile programs
+ * which run on a Virtual Machine and/or bytecode interpreter, such as Python,
+ * etc.
+ *
+ * The logic to dump the C stack traces is partly stolen from the code in
+ * gperftools.
+ * The file "getpc.h" has been entirely copied from gperftools.
+ *
+ * Tested only on gcc, linux, x86_64.
+ *
+ * Copyright (C) 2014-2017
+ *   Antonio Cuni - anto.c...@gmail.com
+ *   Maciej Fijalkowski - fij...@gmail.com
+ *   Armin Rigo - ar...@tunes.org
+ *   Richard Plangger - planri...@gmail.com
+ *
+ */
+
+#include "vmprof.h"
+
+#include "vmprof_mt.h"
+
+#include <signal.h>
+
+RPY_EXTERN void vmprof_ignore_signals(int ignored);
+RPY_EXTERN long vmprof_enter_signal(void);
+RPY_EXTERN long vmprof_exit_signal(void);
+
+/* *************************************************************
+ * functions to dump the stack trace
+ * *************************************************************
+ */
+
+#ifndef RPYTHON_VMPROF
+PY_THREAD_STATE_T * _get_pystate_for_this_thread(void);
+#endif
+int get_stack_trace(PY_THREAD_STATE_T * current, void** result, int max_depth, 
intptr_t pc);
+
+/* *************************************************************
+ * the signal handler
+ * *************************************************************
+ */
+
+#include <setjmp.h>
+
+void segfault_handler(int arg);
+int _vmprof_sample_stack(struct profbuf_s *p, PY_THREAD_STATE_T * tstate, 
ucontext_t * uc);
+void sigprof_handler(int sig_nr, siginfo_t* info, void *ucontext);
+
+
+/* *************************************************************
+ * the setup and teardown functions
+ * *************************************************************
+ */
+
+int install_sigprof_handler(void);
+int remove_sigprof_handler(void);
+int install_sigprof_timer(void);
+int remove_sigprof_timer(void);
+void atfork_disable_timer(void);
+void atfork_enable_timer(void);
+void atfork_close_profile_file(void);
+int install_pthread_atfork_hooks(void);
+
+#ifdef VMP_SUPPORTS_NATIVE_PROFILING
+void init_cpyprof(int native);
+static void disable_cpyprof(void);
+#endif
+
+int close_profile(void);
+
+RPY_EXTERN
+int vmprof_enable(int memory, int native, int real_time);
+RPY_EXTERN
+int vmprof_disable(void);
+RPY_EXTERN
+int vmprof_register_virtual_function(char *code_name, intptr_t code_uid,
+                                     int auto_retry);
+
+
+void vmprof_aquire_lock(void);
+void vmprof_release_lock(void);
diff --git a/rpython/rlib/rvmprof/src/shared/vmprof_win.c 
b/rpython/rlib/rvmprof/src/shared/vmprof_win.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/shared/vmprof_win.c
@@ -0,0 +1,42 @@
+// cannot include this header because it also has definitions
+#include "windows.h"
+#include "compat.h"
+#include "vmp_stack.h"
+
+HANDLE write_mutex;
+
+int prepare_concurrent_bufs(void)
+{
+    if (!(write_mutex = CreateMutex(NULL, FALSE, NULL)))
+        return -1;
+    return 0;
+}
+
+#include <tlhelp32.h>
+
+int vmp_write_all(const char *buf, size_t bufsize)
+{
+    int res;
+    int fd;
+    int count;
+
+    res = WaitForSingleObject(write_mutex, INFINITE);
+    fd = vmp_profile_fileno();
+
+    if (fd == -1) {
+        ReleaseMutex(write_mutex);
+        return -1;
+    }
+    while (bufsize > 0) {
+        count = _write(fd, buf, (long)bufsize);
+        if (count <= 0) {
+            ReleaseMutex(write_mutex);
+            return -1;   /* failed */
+        }
+        buf += count;
+        bufsize -= count;
+    }
+    ReleaseMutex(write_mutex);
+    return 0;
+}
+
diff --git a/rpython/rlib/rvmprof/src/shared/vmprof_win.h 
b/rpython/rlib/rvmprof/src/shared/vmprof_win.h
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/shared/vmprof_win.h
@@ -0,0 +1,203 @@
+#pragma once
+
+#include "windows.h"
+#include "compat.h"
+#include "vmp_stack.h"
+
+HANDLE write_mutex;
+
+int prepare_concurrent_bufs(void);
+
+#include "vmprof_common.h"
+#include <tlhelp32.h>
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to