Ping! Could someone who can approve please review the patch?

Thanks,
-Andi

Nuno Diegues <n...@ist.utl.pt> writes:

> Hello,
>
> I have taken the chance to improve the patch by addressing the
> comments above in this thread.  Namely:
>  - use a simple random-number generator managed inside the library only
>  - remove floating-point usage and replace it with fixed-point arithmetic
>  - add some comments where relevant
>
> Re-running the STAMP benchmarks shows gains similar to those shown
> above, relative to an unmodified GCC 5.0.0:
>
>   benchmark:      speedup
>   genome:         1.58
>   intruder:       1.78
>   labyrinth:      1.00
>   ssca2:          1.01
>   yada:           1.00
>   kmeans-high:    1.16
>   kmeans-low:     1.16
>   vacation-high:  2.05
>   vacation-low:   2.81
>
> I appreciate any feedback and comments!
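>
> Since the fixed-point helpers may be easier to review in isolation,
> here is a minimal standalone sketch (illustrative only, not part of
> the patch) that mirrors the Q32.32 helpers introduced in libitm_i.h
> below:
>
>   #include <stdint.h>
>   #include <stdio.h>
>
>   typedef uint64_t fixed_pt_t;      // Q32.32: high 32 bits hold the integer part.
>   typedef __uint128_t fixed_pt_td;  // Double-width type for intermediates.
>
>   #define FRAC_WIDTH 32
>   #define ONE ((fixed_pt_t)1 << FRAC_WIDTH)
>
>   #define fixed_mul(A,B) \
>     ((fixed_pt_t)(((fixed_pt_td)(A) * (fixed_pt_td)(B)) >> FRAC_WIDTH))
>   #define fixed_div(A,B) \
>     ((fixed_pt_t)(((fixed_pt_td)(A) << FRAC_WIDTH) / (fixed_pt_td)(B)))
>   #define fixed_const(R) \
>     ((fixed_pt_t)((R) * ONE + ((R) >= 0 ? 0.5 : -0.5)))
>
>   int main(void)
>   {
>     fixed_pt_t a = fixed_const(1.5);
>     fixed_pt_t b = fixed_const(2.0);
>     // The double-width product is shifted back down to renormalize;
>     // prints 3.000000.
>     printf("%f\n", (double)fixed_mul(a, b) / ONE);
>     // Division pre-shifts the dividend instead; prints 0.750000.
>     printf("%f\n", (double)fixed_div(a, b) / ONE);
>     return 0;
>   }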
>
> Index: libitm/libitm_i.h
> ===================================================================
> --- libitm/libitm_i.h	(revision 219316)
> +++ libitm/libitm_i.h	(working copy)
> @@ -242,6 +242,9 @@ struct gtm_thread
>    uint32_t restart_reason[NUM_RESTARTS];
>    uint32_t restart_total;
>
> +  // Keeps track of how many transactions were successfully executed.
> +  uint64_t number_executed_txns;
> +
>    // *** The shared part of gtm_thread starts here. ***
>    // Shared state is on separate cachelines to avoid false sharing with
>    // thread-local parts of gtm_thread.
> @@ -286,6 +289,8 @@ struct gtm_thread
>    static void *operator new(size_t);
>    static void operator delete(void *);
>
> +  static void reoptimize_htm_execution();
> +
>    // Invoked from assembly language, thus the "asm" specifier on
>    // the name, avoiding complex name mangling.
>    static uint32_t begin_transaction(uint32_t, const gtm_jmpbuf *)
> @@ -309,6 +314,59 @@ struct gtm_thread
>    void commit_user_actions ();
>  };
>
> +// Different ways to deal with capacity aborts in HTM execution.
> +enum gtm_capacity_abort_mode
> +{
> +  STUBBORN,
> +  HALVEN,
> +  GIVEUP
> +};
> +
> +// Definition of fixed point arithmetic types.
> +// Half the bits are dedicated to the fractional part, and the rest to the
> +// "whole" part.
> +#define FIXED_PT_WIDTH 64
> +#define FIXED_PT_INTEGER_WIDTH 32
> +typedef uint64_t fixed_pt_t;
> +typedef __uint128_t fixed_pt_td;
> +
> +#define FIXED_PT_FRAC_WIDTH (FIXED_PT_WIDTH - FIXED_PT_INTEGER_WIDTH)
> +#define FIXED_PT_ONE ((fixed_pt_t)((fixed_pt_t)1 << FIXED_PT_FRAC_WIDTH))
> +#define FIXED_PT_TWO (FIXED_PT_ONE + FIXED_PT_ONE)
> +
> +#define fixed_mul(A,B) \
> +  ((fixed_pt_t)(((fixed_pt_td)(A) * (fixed_pt_td)(B)) >> FIXED_PT_FRAC_WIDTH))
> +#define fixed_div(A,B) \
> +  ((fixed_pt_t)(((fixed_pt_td)(A) << FIXED_PT_FRAC_WIDTH) / (fixed_pt_td)(B)))
> +#define fixed_const(R) \
> +  ((fixed_pt_t)((R) * FIXED_PT_ONE + ((R) >= 0 ? 0.5 : -0.5)))
> +
> +// Maintains the current values optimized for HTM execution and the
> +// corresponding statistics gathered for the decision-making.
> +struct gtm_global_optimizer
> +{
> +  // Mode chosen to currently deal with capacity aborts.
> +  gtm_capacity_abort_mode optimized_mode;
> +  // Number of attempts chosen to currently insist on HTM execution.
> +  uint32_t optimized_attempts;
> +
> +  uint64_t last_cycles;
> +  uint64_t last_total_txs_executed;
> +
> +  fixed_pt_t last_throughput;
> +  uint32_t last_attempts;
> +
> +  fixed_pt_t best_ever_throughput;
> +  uint32_t best_ever_attempts;
> +
> +  uint64_t txns_while_stubborn;
> +  uint64_t cycles_while_stubborn;
> +  uint64_t txns_while_halven;
> +  uint64_t cycles_while_halven;
> +  uint64_t txns_while_giveup;
> +  uint64_t cycles_while_giveup;
> +};
> +
>  } // namespace GTM
>
>  #include "tls.h"
> @@ -346,6 +404,9 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cache
>  // the name, avoiding complex name mangling.
>  extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath");
>
> +// Maintains the optimization for HTM execution.
> +extern gtm_global_optimizer optimizer;
> +
>  } // namespace GTM
>
>  #endif // LIBITM_I_H
> Index: libitm/libitm.h
> ===================================================================
> --- libitm/libitm.h	(revision 219316)
> +++ libitm/libitm.h	(working copy)
> @@ -101,7 +101,8 @@ typedef enum
>    /* These are not part of the ABI but used for custom HTM fast paths. See
>       ITM_beginTransaction and gtm_thread::begin_transaction. */
>    pr_HTMRetryableAbort = 0x800000,
> -  pr_HTMRetriedAfterAbort = 0x1000000
> +  pr_HTMRetriedAfterAbort = 0x1000000,
> +  pr_HTMCapacityAbort = 0x2000000
>  } _ITM_codeProperties;
>
>  /* Result from startTransaction that describes what actions to take.
> Index: libitm/method-serial.cc
> ===================================================================
> --- libitm/method-serial.cc	(revision 219316)
> +++ libitm/method-serial.cc	(working copy)
> @@ -223,7 +223,23 @@ struct htm_mg : public method_group
>      // initially disabled.
>  #ifdef USE_HTM_FASTPATH
>      htm_fastpath = htm_init();
> +#ifdef HAVE_AS_RTM
> +    optimizer.optimized_mode = STUBBORN;
> +    optimizer.optimized_attempts = htm_fastpath;
> +    optimizer.last_cycles = rdtsc();
> +    optimizer.last_total_txs_executed = 0;
> +    optimizer.last_throughput = fixed_const(0.0001);
> +    optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1;
> +    optimizer.best_ever_throughput = optimizer.last_throughput;
> +    optimizer.best_ever_attempts = htm_fastpath;
> +    optimizer.txns_while_stubborn = 1;
> +    optimizer.cycles_while_stubborn = 100;
> +    optimizer.txns_while_halven = 1;
> +    optimizer.cycles_while_halven = 100;
> +    optimizer.txns_while_giveup = 1;
> +    optimizer.cycles_while_giveup = 100;
>  #endif
> +#endif
>    }
>    virtual void fini()
>    {
> Index: libitm/beginend.cc
> ===================================================================
> --- libitm/beginend.cc	(revision 219316)
> +++ libitm/beginend.cc	(working copy)
> @@ -25,7 +25,6 @@
>  #include "libitm_i.h"
>  #include <pthread.h>
>
> -
>  using namespace GTM;
>
>  #if !defined(HAVE_ARCH_GTM_THREAD) || !defined(HAVE_ARCH_GTM_THREAD_DISP)
> @@ -39,6 +38,9 @@ unsigned GTM::gtm_thread::number_of_threads = 0;
>  gtm_stmlock GTM::gtm_stmlock_array[LOCK_ARRAY_SIZE];
>  atomic<gtm_version> GTM::gtm_clock;
>
> +// Optimization of HTM executions.
> +gtm_global_optimizer GTM::optimizer;
> +
>  /* ??? Move elsewhere when we figure out library initialization.  */
>  uint64_t GTM::gtm_spin_count_var = 1000;
>
> @@ -149,6 +151,225 @@ choose_code_path(uint32_t prop, abi_dispatch *disp
>    return a_runInstrumentedCode;
>  }
>
> +static inline fixed_pt_t
> +fixed_sqrt(fixed_pt_t n)
> +{
> +  int invert = 0;
> +  int iter = FIXED_PT_FRAC_WIDTH;
> +  int l, i;
> +
> +  if (n == 0 || n == FIXED_PT_ONE)
> +    {
> +      return n;
> +    }
> +  if (n < FIXED_PT_ONE && n > 6)
> +    {
> +      invert = 1;
> +      n = fixed_div(FIXED_PT_ONE, n);
> +    }
> +  if (n > FIXED_PT_ONE)
> +    {
> +      int s = n;
> +      iter = 0;
> +      while (s > 0)
> +        {
> +          s >>= 2;
> +          iter++;
> +        }
> +    }
> +
> +  l = (n >> 1) + 1;
> +  for (i = 0; i < iter; i++)
> +    {
> +      l = (l + fixed_div(n, l)) >> 1;
> +    }
> +  if (invert)
> +    {
> +      return (fixed_div(FIXED_PT_ONE, l));
> +    }
> +  return l;
> +}
> +
> +static inline fixed_pt_t
> +fixed_ln(fixed_pt_t x)
> +{
> +  const fixed_pt_t LN2 = fixed_const(0.69314718055994530942);
> +  const fixed_pt_t LG[7] = {
> +    fixed_const(6.666666666666735130e-01),
> +    fixed_const(3.999999999940941908e-01),
> +    fixed_const(2.857142874366239149e-01),
> +    fixed_const(2.222219843214978396e-01),
> +    fixed_const(1.818357216161805012e-01),
> +    fixed_const(1.531383769920937332e-01),
> +    fixed_const(1.479819860511658591e-01)
> +  };
> +
> +  if (x == 0)
> +    {
> +      return 0xffffffff;
> +    }
> +
> +  fixed_pt_t log2 = 0;
> +  fixed_pt_t xi = x;
> +  while (xi > FIXED_PT_TWO)
> +    {
> +      xi >>= 1;
> +      log2++;
> +    }
> +
> +  fixed_pt_t f = xi - FIXED_PT_ONE;
> +  fixed_pt_t s = fixed_div(f, FIXED_PT_TWO + f);
> +  fixed_pt_t z = fixed_mul(s, s);
> +  fixed_pt_t w = fixed_mul(z, z);
> +  fixed_pt_t R =
> +    fixed_mul(w, LG[1] + fixed_mul(w, LG[3] + fixed_mul(w, LG[5])))
> +    + fixed_mul(z, LG[0] + fixed_mul(w, LG[2]
> +        + fixed_mul(w, LG[4] + fixed_mul(w, LG[6]))));
> +  return fixed_mul(LN2, (log2 << FIXED_PT_FRAC_WIDTH))
> +    + f - fixed_mul(s, f - R);
> +}
> +
> +// State not synchronized; assumes a single thread uses it at any time.
> +// Invoked only by the thread reoptimizing, so the assumption holds.
> +int
> +obtainRandomInt(int max)
> +{
> +  static uint32_t seed = 123456789;
> +  // Linear congruential step; the uint32_t type makes the reduction
> +  // modulo 2^32 implicit.
> +  seed = 1103515245 * seed + 12345;
> +  return seed % max;
> +}
> +
> +// Called by the thread at the tail of the list of threads.
> +void
> +GTM::gtm_thread::reoptimize_htm_execution()
> +{
> +  gtm_thread *tx = gtm_thr();
> +  uint64_t total_txs_executed = 0;
> +
> +  // Collect the statistics obtained so far.
> +  serial_lock.read_lock(tx);
> +  gtm_thread *ptr = list_of_threads;
> +  for (; ptr; ptr = ptr->next_thread)
> +    {
> +      total_txs_executed += ptr->number_executed_txns;
> +    }
> +  serial_lock.read_unlock(tx);
> +
> +  // Obtain the delta performance with respect to the last period.
> +  const uint64_t current_cycles = rdtsc();
> +  const uint64_t cycles_used = current_cycles - optimizer.last_cycles;
> +  const uint64_t new_txs_executed =
> +    total_txs_executed - optimizer.last_total_txs_executed;
> +  optimizer.last_cycles = current_cycles;
> +  optimizer.last_total_txs_executed = total_txs_executed;
> +  if (optimizer.optimized_mode == STUBBORN)
> +    {
> +      optimizer.txns_while_stubborn += new_txs_executed;
> +      optimizer.cycles_while_stubborn += cycles_used;
> +    }
> +  else if (optimizer.optimized_mode == HALVEN)
> +    {
> +      optimizer.txns_while_halven += new_txs_executed;
> +      optimizer.cycles_while_halven += cycles_used;
> +    }
> +  else
> +    {
> +      optimizer.txns_while_giveup += new_txs_executed;
> +      optimizer.cycles_while_giveup += cycles_used;
> +    }
> +
> +  // Compute Upper Confidence Bounds for the mode to choose next.
> +  // Use fixed point arithmetic types to spare floating point usage.
> +  const fixed_pt_t log_sum =
> +    fixed_mul(FIXED_PT_TWO,
> +              fixed_ln(fixed_const(optimizer.txns_while_stubborn)
> +                       + fixed_const(optimizer.txns_while_halven)
> +                       + fixed_const(optimizer.txns_while_giveup)));
> +  const fixed_pt_t ucb_stubborn =
> +    fixed_div(fixed_div(fixed_const(optimizer.txns_while_stubborn),
> +                        fixed_const(optimizer.cycles_while_stubborn)),
> +              optimizer.best_ever_throughput)
> +    + fixed_sqrt(fixed_div(log_sum,
> +                           fixed_const(optimizer.txns_while_stubborn)));
> +  const fixed_pt_t ucb_halven =
> +    fixed_div(fixed_div(fixed_const(optimizer.txns_while_halven),
> +                        fixed_const(optimizer.cycles_while_halven)),
> +              optimizer.best_ever_throughput)
> +    + fixed_sqrt(fixed_div(log_sum,
> +                           fixed_const(optimizer.txns_while_halven)));
> +  const fixed_pt_t ucb_giveup =
> +    fixed_div(fixed_div(fixed_const(optimizer.txns_while_giveup),
> +                        fixed_const(optimizer.cycles_while_giveup)),
> +              optimizer.best_ever_throughput)
> +    + fixed_sqrt(fixed_div(log_sum,
> +                           fixed_const(optimizer.txns_while_giveup)));
> +
> +  if (ucb_stubborn > ucb_halven && ucb_stubborn > ucb_giveup)
> +    {
> +      optimizer.optimized_mode = STUBBORN;
> +    }
> +  else if (ucb_halven > ucb_giveup)
> +    {
> +      optimizer.optimized_mode = HALVEN;
> +    }
> +  else
> +    {
> +      optimizer.optimized_mode = GIVEUP;
> +    }
> +
> +  // Compute gradient descent for the number of retries.
> +  const fixed_pt_t current_throughput =
> +    fixed_div(fixed_const(new_txs_executed), fixed_const(cycles_used));
> +  const fixed_pt_t change_for_better =
> +    fixed_div(current_throughput, optimizer.last_throughput);
> +  const fixed_pt_t change_for_worse =
> +    fixed_div(optimizer.last_throughput, current_throughput);
> +  int32_t last_attempts = optimizer.last_attempts;
> +  int32_t current_attempts = optimizer.optimized_attempts;
> +  int32_t new_attempts = current_attempts;
> +  if (unlikely(change_for_worse > fixed_const(1.40)))
> +    {
> +      optimizer.optimized_attempts = optimizer.best_ever_attempts;
> +      optimizer.last_throughput = current_throughput;
> +      optimizer.last_attempts = current_attempts;
> +      return;
> +    }
> +
> +  if (unlikely(obtainRandomInt(100) == 0))
> +    {
> +      optimizer.last_attempts = optimizer.optimized_attempts;
> +      optimizer.last_throughput = current_throughput;
> +      optimizer.optimized_attempts = obtainRandomInt(18) + 2;
> +      return;
> +    }
> +
> +  if (change_for_better > fixed_const(1.05))
> +    {
> +      new_attempts += current_attempts - last_attempts;
> +      if (current_attempts == last_attempts)
> +        new_attempts += obtainRandomInt(2) == 0 ? -1 : 1;
> +    }
> +  else if (change_for_worse > fixed_const(1.05))
> +    {
> +      new_attempts -= current_attempts - last_attempts;
> +      if (current_attempts == last_attempts)
> +        new_attempts += obtainRandomInt(2) == 0 ? -1 : 1;
> +    }
> +  if (unlikely(new_attempts > 20))
> +    new_attempts = 20;
> +  else if (unlikely(new_attempts < 2))
> +    new_attempts = 2;
> +  if (current_attempts != new_attempts)
> +    {
> +      optimizer.last_attempts = current_attempts;
> +      optimizer.last_throughput = current_throughput;
> +    }
> +  optimizer.optimized_attempts = new_attempts;
> +  if (unlikely(current_throughput > optimizer.best_ever_throughput))
> +    {
> +      optimizer.best_ever_throughput = current_throughput;
> +      optimizer.best_ever_attempts = current_attempts;
> +    }
> +}
> +
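>
> A note for reviewers (my summary, not code from the patch): the
> scores computed above follow the standard UCB1 rule from the
> multi-armed-bandit literature.  With t_m transactions committed over
> c_m cycles in mode m, and T = t_stubborn + t_halven + t_giveup, each
> mode is scored as
>
>   ucb_m = (t_m / c_m) / best_ever_throughput + sqrt(2 * ln(T) / t_m)
>
> i.e. throughput normalized against the best seen so far, plus an
> exploration bonus that shrinks as a mode accumulates samples.  The
> mode with the highest score is used for the next window of
> transactions.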
>  uint32_t
>  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
>  {
> @@ -190,7 +411,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>  #ifndef HTM_CUSTOM_FASTPATH
>    if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
>      {
> -      for (uint32_t t = htm_fastpath; t; t--)
> +      for (uint32_t t = optimizer.optimized_attempts; t; t--)
>  	{
>  	  uint32_t ret = htm_begin();
>  	  if (htm_begin_success(ret))
> @@ -209,19 +430,36 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>  	    }
>  	  // The transaction has aborted.  Don't retry if it's unlikely that
>  	  // retrying the transaction will be successful.
> -	  if (!htm_abort_should_retry(ret))
> +#ifdef HAVE_AS_RTM
> +	  if (htm_is_capacity_abort(ret))
> +	    {
> +	      gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
> +	      if (capacity_mode == HALVEN)
> +		t = t / 2 + 1;
> +	      else if (capacity_mode == GIVEUP)
> +		t = 1;
> +	    }
> +	  // Only one thread performs this to avoid the need for
> +	  // synchronization.  We use the oldest thread that is active.
> +	  // We also reoptimize at most once per transaction.
> +	  if (unlikely(tx->next_thread == NULL
> +		       && tx->number_executed_txns % 500 == 0 && t == 1))
> +	    reoptimize_htm_execution();
> +#else
> +	  if (!htm_abort_should_retry(ret))
>  	    break;
> +#endif
>  	  // Wait until any concurrent serial-mode transactions have finished.
>  	  // This is an empty critical section, but won't be elided.
>  	  if (serial_lock.is_write_locked())
>  	    {
> -	       tx = gtm_thr();
> -	       if (unlikely(tx == NULL))
> -	         {
> -	           // See below.
> -	           tx = new gtm_thread();
> -	           set_gtm_thr(tx);
> -	         }
> +	      tx = gtm_thr();
> +	      if (unlikely(tx == NULL))
> +		{
> +		  // See below.
> +		  tx = new gtm_thread();
> +		  set_gtm_thr(tx);
> +		}
>  	      // Check whether there is an enclosing serial-mode transaction;
>  	      // if so, we just continue as a nested transaction and don't
>  	      // try to use the HTM fastpath.  This case can happen when an
> @@ -262,12 +500,26 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>        // other fallback will use serial transactions, which don't use
>        // restart_total but will reset it when committing.
>        if (!(prop & pr_HTMRetriedAfterAbort))
> -	tx->restart_total = htm_fastpath;
> +	tx->restart_total = optimizer.optimized_attempts;
>
>        if (--tx->restart_total > 0)
>  	{
>  	  // Wait until any concurrent serial-mode transactions have finished.
>  	  // Essentially the same code as above.
> +#ifdef HAVE_AS_RTM
> +	  if (prop & pr_HTMCapacityAbort)
> +	    {
> +	      gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
> +	      if (capacity_mode == HALVEN)
> +		tx->restart_total = tx->restart_total / 2;
> +	      else if (capacity_mode == GIVEUP)
> +		goto stop_custom_htm_fastpath;
> +	    }
> +
> +	  if (unlikely(tx->next_thread == NULL
> +		       && tx->number_executed_txns % 500 == 0
> +		       && tx->restart_total == 1))
> +	    reoptimize_htm_execution();
> +#endif
>  	  if (serial_lock.is_write_locked())
>  	    {
>  	      if (tx->nesting > 0)
> @@ -665,12 +917,21 @@ _ITM_commitTransaction(void)
>    if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
>      {
>        htm_commit();
> +      gtm_thread *tx = gtm_thr();
> +      if (unlikely(tx == NULL))
> +	{
> +	  tx = new gtm_thread();
> +	  set_gtm_thr(tx);
> +	}
> +      tx->number_executed_txns++;
>        return;
>      }
>  #endif
>    gtm_thread *tx = gtm_thr();
>    if (!tx->trycommit ())
>      tx->restart (RESTART_VALIDATE_COMMIT);
> +
> +  tx->number_executed_txns++;
>  }
>
>  void ITM_REGPARM
> @@ -681,6 +942,13 @@ _ITM_commitTransactionEH(void *exc_ptr)
>    if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
>      {
>        htm_commit();
> +      gtm_thread *tx = gtm_thr();
> +      if (unlikely(tx == NULL))
> +	{
> +	  tx = new gtm_thread();
> +	  set_gtm_thr(tx);
> +	}
> +      tx->number_executed_txns++;
>        return;
>      }
>  #endif
> @@ -690,4 +958,5 @@ _ITM_commitTransactionEH(void *exc_ptr)
>        tx->eh_in_flight = exc_ptr;
>        tx->restart (RESTART_VALIDATE_COMMIT);
>      }
> +  tx->number_executed_txns++;
>  }
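>
> To summarize the retry policy that the non-custom fast path follows
> after this change, here is a simplified sketch of the loop above (not
> compilable on its own; it elides the serial-lock waits and thread
> setup):
>
>   for (uint32_t t = optimizer.optimized_attempts; t; t--)
>     {
>       uint32_t ret = htm_begin();
>       if (htm_begin_success(ret))
>         return;                  // Run transactionally in hardware.
>       if (htm_is_capacity_abort(ret))
>         {
>           if (optimizer.optimized_mode == HALVEN)
>             t = t / 2 + 1;       // Halve the remaining budget.
>           else if (optimizer.optimized_mode == GIVEUP)
>             t = 1;               // Loop exits after this decrement.
>         }
>       // STUBBORN just keeps retrying until the budget runs out.
>     }
>   // Budget exhausted: fall back to the software path.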
> Index: libitm/config/x86/target.h
> ===================================================================
> --- libitm/config/x86/target.h	(revision 219316)
> +++ libitm/config/x86/target.h	(working copy)
> @@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret)
>    return begin_ret & _XABORT_RETRY;
>  }
>
> +static inline bool
> +htm_is_capacity_abort(uint32_t begin_ret)
> +{
> +  return begin_ret & _XABORT_CAPACITY;
> +}
> +
>  /* Returns true iff a hardware transaction is currently being executed.  */
>  static inline bool
>  htm_transaction_active ()
>  {
>    return _xtest() != 0;
>  }
> +
> +static inline uint64_t rdtsc()
> +{
> +  uint32_t hi, lo;
> +  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
> +  return ((uint64_t)lo) | (((uint64_t)hi) << 32);
> +}
>  #endif
>
>
> Index: libitm/config/x86/sjlj.S
> ===================================================================
> --- libitm/config/x86/sjlj.S	(revision 219316)
> +++ libitm/config/x86/sjlj.S	(working copy)
> @@ -59,12 +59,14 @@
>  #define pr_hasNoAbort		0x08
>  #define pr_HTMRetryableAbort	0x800000
>  #define pr_HTMRetriedAfterAbort	0x1000000
> +#define pr_HTMCapacityAbort	0x2000000
>  #define a_runInstrumentedCode	0x01
>  #define a_runUninstrumentedCode	0x02
>  #define a_tryHTMFastPath	0x20
>
>  #define _XABORT_EXPLICIT	(1 << 0)
>  #define _XABORT_RETRY		(1 << 1)
> +#define _XABORT_CAPACITY	(1 << 3)
>
>  	.text
>
> @@ -108,9 +110,12 @@ SYM(_ITM_beginTransaction):
>  .Ltxn_abort:
>  	/* If it might make sense to retry the HTM fast path, let the C++
>  	   code decide.  */
> -	testl	$(_XABORT_RETRY|_XABORT_EXPLICIT), %eax
> +	testl	$(_XABORT_RETRY|_XABORT_EXPLICIT|_XABORT_CAPACITY), %eax
>  	jz	.Lno_htm
>  	orl	$pr_HTMRetryableAbort, %edi
> +	testl	$(_XABORT_CAPACITY), %eax
> +	jz	.Lno_htm
> +	orl	$pr_HTMCapacityAbort, %edi
>  	/* Let the C++ code handle the retry policy.  */
>  .Lno_htm:
>  #endif
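>
> As a concrete (illustrative) walk-through of the hill-climbing step
> in reoptimize_htm_execution: suppose the last change moved the retry
> budget from 8 to 10 attempts.  If throughput since then improved by
> more than 5%, the step continues in the same direction and tries 12;
> if it degraded by more than 5%, it steps back towards 8; if it
> collapsed by more than 40%, the budget reverts outright to the best
> configuration ever seen.  If the budget did not change last time, a
> coin flip perturbs it by +/-1, and with probability 1/100 the budget
> instead jumps to a random value in [2, 20) to escape local maxima.
> The budget is always clamped to [2, 20].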
>
> On Fri, Apr 10, 2015 at 8:24 AM, Andi Kleen <a...@firstfloor.org> wrote:
>> Nuno Diegues <n...@ist.utl.pt> writes:
>>>
>>> One general question on how to proceed: given that I make some
>>> further changes, should I post the whole patch again?
>>
>> Yes please resend the patch.
>>
>> -Andi

--
a...@linux.intel.com -- Speaking for myself only