+1 for benchmarking on multiprocessor machine (4+ processors?). Looks like
it is better to use highly multithreaded benchmark to see the worst impact
on performance.
Aleksey.
On 11/24/06, Robin Garner <[EMAIL PROTECTED]> wrote:
Salikh Zakirov wrote:
> Hi,
>
> As a result of numerous class unloading discussions,
> I've hacked vtable marking proposals into GC_CC directly, and measured
> their impact on the performance. I've attached the two patches
> corresponding to "vtable marks" and "indirect marks".
>
> Benchmark: dacapo-2006-10 hsqldb
> Machine: IBM Thinkpad T41p, Pentium M 1700 MHz (1 core), 1 Gb
> Windows XP SP2, MSVC 7.0, release build
> Benchmark arguments:
>
> java -verbose:gc -jar c:/work/dacapo/dacapo-2006-10.jar -s default -n
> 3 hsqldb
>
> Benchmarks results:
>
> no vtable marks: ===== DaCapo hsqldb PASSED in 6168 msec =====
> vtable marks: ===== DaCapo hsqldb PASSED in 6218 msec =====
> (0.8% slowdown)
> indirect marks: ===== DaCapo hsqldb PASSED in 6409 msec
=====
> (3.9% slowdown)
>
> Garbage collection times:
> (garbage collection times were collected for the whole dacapo run,
> including warmup benchmark runs).
>
> no vtable marks:
> COMPACT avg 614.375 +/- 117.537 = 4915.000 / 8, min 50.000, max
911.000
> COPY avg 255.000 +/- 39.325 = 2040.000 / 8, min 90.000, max
490.000
> FORCED avg 189.333 +/- 7.589 = 2840.000 / 15, min 140.000, max
240.000
>
> vtable marks:
> COMPACT avg 615.500 +/- 119.544 = 4924.000 / 8, min 40.000, max
931.000
> COPY avg 260.000 +/- 27.839 = 2340.000 / 9, min 160.000, max
460.000
> FORCED avg 186.667 +/- 7.411 = 2800.000 / 15, min 140.000, max
240.000
>
> indirect marks:
> COMPACT avg 619.375 +/- 123.104 = 4955.000 / 8, min 30.000, max
941.000
> COPY avg 265.000 +/- 38.868 = 2120.000 / 8, min 110.000, max
500.000
> FORCED avg 194.000 +/- 8.095 = 2910.000 / 15, min 150.000, max
250.000
>
> Summary: as was predicted, adding an unconditional write to object scanning
> does not have much impact on the garbage collection time. However,
> overall impact is visible on benchmark level.
>
> Regarding the false sharing when writing vtable marks,
> the benchmarking should be run on a multiprocessor machine and with a
> parallel GC.
Actually I think the results show that the vtable marks are in the
noise. hsqldb is a highly multithreaded benchmark, and so prone to
timing discrepancies. What was the variability of the results? A
single-threaded benchmark like bloat, antlr or pmd might give less
variation.
The other interesting point is the side data structure, something like
MARK_BYTES = size_of_vtable_space >> log_min_vtable_align;
byte[MARK_BYTES] mark_bytes;
mark_bytes[((int)vtable) >> log_min_vtable_align] = 1;
of course this is most space efficient if you coarsely align vtables,
and constrain them to a particular area of the heap.
cheers
> ------------------------------------------------------------------------
>
> diff --git vm/gc_cc/src/collect_copy.cpp vm/gc_cc/src/collect_copy.cpp
> index a3b6a96..a4663fc 100644
> --- vm/gc_cc/src/collect_copy.cpp
> +++ vm/gc_cc/src/collect_copy.cpp
> @@ -168,6 +168,7 @@ static bool gc_copy_process_reference(Sl
> // move the object?
> #define pos ((unsigned char*) obj)
> Partial_Reveal_VTable *vtable = ah_to_vtable(vt);
> + vtable->mark = 1;
> GC_VTable_Info *gcvt = vtable->get_gcvt();
>
> if (pos >= heap.compaction_region_start() && pos <
heap.compaction_region_end()) {
> diff --git vm/gc_cc/src/collect_forced.cpp
vm/gc_cc/src/collect_forced.cpp
> index 072f21e..92bf167 100644
> --- vm/gc_cc/src/collect_forced.cpp
> +++ vm/gc_cc/src/collect_forced.cpp
> @@ -64,6 +64,7 @@ static void forced_process_reference(Par
> obj->obj_info() = (info & ~MARK_BITS) | heap_mark_phase;
>
> Partial_Reveal_VTable *vtable = obj->vtable();
> + vtable->mark = 1;
> GC_VTable_Info *gcvt = vtable->get_gcvt();
>
> if (gcvt->is_array()) { // is array
> diff --git vm/gc_cc/src/collect_slide_compact.cpp
vm/gc_cc/src/collect_slide_compact.cpp
> index e5b4f54..985b94e 100644
> --- vm/gc_cc/src/collect_slide_compact.cpp
> +++ vm/gc_cc/src/collect_slide_compact.cpp
> @@ -454,6 +454,7 @@ static void slide_process_object(Partial
> assert(obj->vt() & ~RESCAN_BIT); // has vt
>
> Partial_Reveal_VTable *vtable = ah_to_vtable(vt & ~RESCAN_BIT);
> + vtable->mark = 1;
> GC_VTable_Info *gcvt = vtable->get_gcvt();
>
> // process slots
> diff --git vm/gc_cc/src/gc_types.h vm/gc_cc/src/gc_types.h
> index 1ac4236..849aaf0 100644
> --- vm/gc_cc/src/gc_types.h
> +++ vm/gc_cc/src/gc_types.h
> @@ -152,6 +152,9 @@ typedef struct Partial_Reveal_VTable {
> private:
> GC_VTable_Info *gcvt;
> public:
> + /// class reachability mark,
> + /// used for class unloading
> + size_t mark;
>
> void set_gcvt(struct GC_VTable_Info *new_gcvt) { gcvt = new_gcvt; }
> struct GC_VTable_Info *get_gcvt() { return gcvt; }
> diff --git vm/vmcore/include/vtable.h vm/vmcore/include/vtable.h
> index a1fc8b4..eb08687 100644
> --- vm/vmcore/include/vtable.h
> +++ vm/vmcore/include/vtable.h
> @@ -53,6 +53,7 @@ typedef struct Intfc_Table {
>
> typedef struct VTable {
> Byte _gc_private_information[GC_BYTES_IN_VTABLE];
> + size_t mark;
> Class* clss;
>
> // See the masks in vm_for_gc.h.
>
>
> ------------------------------------------------------------------------
>
> diff --git vm/gc_cc/src/collect_copy.cpp vm/gc_cc/src/collect_copy.cpp
> index a3b6a96..c2caac2 100644
> --- vm/gc_cc/src/collect_copy.cpp
> +++ vm/gc_cc/src/collect_copy.cpp
> @@ -168,6 +168,7 @@ static bool gc_copy_process_reference(Sl
> // move the object?
> #define pos ((unsigned char*) obj)
> Partial_Reveal_VTable *vtable = ah_to_vtable(vt);
> + *vtable->mark = 1;
> GC_VTable_Info *gcvt = vtable->get_gcvt();
>
> if (pos >= heap.compaction_region_start() && pos <
heap.compaction_region_end()) {
> diff --git vm/gc_cc/src/collect_forced.cpp
vm/gc_cc/src/collect_forced.cpp
> index 072f21e..7e4de43 100644
> --- vm/gc_cc/src/collect_forced.cpp
> +++ vm/gc_cc/src/collect_forced.cpp
> @@ -64,6 +64,7 @@ static void forced_process_reference(Par
> obj->obj_info() = (info & ~MARK_BITS) | heap_mark_phase;
>
> Partial_Reveal_VTable *vtable = obj->vtable();
> + *vtable->mark = 1;
> GC_VTable_Info *gcvt = vtable->get_gcvt();
>
> if (gcvt->is_array()) { // is array
> diff --git vm/gc_cc/src/collect_slide_compact.cpp
vm/gc_cc/src/collect_slide_compact.cpp
> index e5b4f54..4a3ee9c 100644
> --- vm/gc_cc/src/collect_slide_compact.cpp
> +++ vm/gc_cc/src/collect_slide_compact.cpp
> @@ -454,6 +454,7 @@ static void slide_process_object(Partial
> assert(obj->vt() & ~RESCAN_BIT); // has vt
>
> Partial_Reveal_VTable *vtable = ah_to_vtable(vt & ~RESCAN_BIT);
> + *vtable->mark = 1;
> GC_VTable_Info *gcvt = vtable->get_gcvt();
>
> // process slots
> diff --git vm/gc_cc/src/gc_types.h vm/gc_cc/src/gc_types.h
> index 1ac4236..da9a48c 100644
> --- vm/gc_cc/src/gc_types.h
> +++ vm/gc_cc/src/gc_types.h
> @@ -152,6 +152,9 @@ typedef struct Partial_Reveal_VTable {
> private:
> GC_VTable_Info *gcvt;
> public:
> + /// pointer to the class reachability mark,
> + /// used for class unloading
> + size_t *mark;
>
> void set_gcvt(struct GC_VTable_Info *new_gcvt) { gcvt = new_gcvt; }
> struct GC_VTable_Info *get_gcvt() { return gcvt; }
> diff --git vm/vmcore/include/Class.h vm/vmcore/include/Class.h
> index 7194edb..a6c198c 100644
> --- vm/vmcore/include/Class.h
> +++ vm/vmcore/include/Class.h
> @@ -772,6 +772,8 @@ enum AccessAndPropertiesFlags {
> * calling the verifier, preparing, resolving and initializing the
class.*/
>
> struct Class {
> + /// mark used for the class unloading
> + size_t mark;
> private:
> typedef struct {
> union {
--
Robin Garner
Dept. of Computer Science
Australian National University
http://cs.anu.edu.au/people/Robin.Garner/