Re: [Qemu-devel] [PATCH] x86_64: optimise muldiv64 for x86_64 architecture
On 01/09/2015 01:53 AM, Frediano Ziglio wrote: As this platform can do multiply/divide using 128 bit precision use these instructions to implement it. Signed-off-by: Frediano Ziglio frediano.zig...@huawei.com --- include/qemu-common.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/qemu-common.h b/include/qemu-common.h index f862214..5366220 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -370,6 +370,7 @@ static inline uint8_t from_bcd(uint8_t val) } /* compute with 96 bit intermediate result: (a*b)/c */ +#ifndef __x86_64__ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) { union { @@ -392,6 +393,18 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c; return res.ll; } +#else +static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) +{ +uint64_t res; + +asm ("mulq %2\n\tdivq %3" + : "=a"(res) + : "a"(a), "qm"((uint64_t) b), "qm"((uint64_t)c) + : "rdx", "cc"); +return res; +} +#endif Honestly, this ought to move into qemu/host-utils.h, and it should use __int128 for targets that support it. Which includes x86_64, but also other 64-bit hosts. r~
[Qemu-devel] [PATCH] x86_64: optimise muldiv64 for x86_64 architecture
As this platform can do multiply/divide using 128 bit precision use these instructions to implement it. Signed-off-by: Frediano Ziglio frediano.zig...@huawei.com --- include/qemu-common.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/qemu-common.h b/include/qemu-common.h index f862214..5366220 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -370,6 +370,7 @@ static inline uint8_t from_bcd(uint8_t val) } /* compute with 96 bit intermediate result: (a*b)/c */ +#ifndef __x86_64__ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) { union { @@ -392,6 +393,18 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c; return res.ll; } +#else +static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) +{ +uint64_t res; + +asm ("mulq %2\n\tdivq %3" + : "=a"(res) + : "a"(a), "qm"((uint64_t) b), "qm"((uint64_t)c) + : "rdx", "cc"); +return res; +} +#endif /* Round number down to multiple */ #define QEMU_ALIGN_DOWN(n, m) ((n) / (m) * (m)) -- 1.9.1
Re: [Qemu-devel] [PATCH] x86_64: optimise muldiv64 for x86_64 architecture
2015-01-09 11:24 GMT+00:00 Paolo Bonzini pbonz...@redhat.com: On 09/01/2015 12:04, Frediano Ziglio wrote: 2015-01-09 10:35 GMT+00:00 Paolo Bonzini pbonz...@redhat.com: On 09/01/2015 11:27, Frediano Ziglio wrote: Signed-off-by: Frediano Ziglio frediano.zig...@huawei.com --- include/qemu-common.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/qemu-common.h b/include/qemu-common.h index f862214..5366220 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -370,6 +370,7 @@ static inline uint8_t from_bcd(uint8_t val) } /* compute with 96 bit intermediate result: (a*b)/c */ +#ifndef __x86_64__ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) { union { @@ -392,6 +393,18 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c; return res.ll; } +#else +static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) +{ +uint64_t res; + +asm ("mulq %2\n\tdivq %3" + : "=a"(res) + : "a"(a), "qm"((uint64_t) b), "qm"((uint64_t)c) + : "rdx", "cc"); +return res; +} +#endif Good idea. However, if you have __int128, you can just do return (__int128)a * b / c and the compiler should generate the right code. Conveniently, there is already CONFIG_INT128 that you can use. Well, it works but in our case b <= c, that is a * b / c is always < 2^64. This is not necessarily the case. Quick grep: hw/timer/hpet.c:return (muldiv64(value, HPET_CLK_PERIOD, FS_PER_NS)); hw/timer/hpet.c:return (muldiv64(value, FS_PER_NS, HPET_CLK_PERIOD)); One of the two must disprove your assertion. :) Unless FS_PER_NS == HPET_CLK_PERIOD :) But it's true that we expect no overflow. This is enough! This leads to no integer overflow in the last division. However the compiler does not know this so it does the entire (a*b) / c division, which mainly consists in two integer divisions instead of one (not taking into account that it is implemented using a helper function). I think that I'll write two patches. 
One implementing it using int128 as you suggested (which is much easier to read than the current one and the assembly one) and another for the x86_64 optimization. Right, that's even better. Out of curiosity, have you seen it in some profiles? Paolo No, just looked at the complicated code and generated code and thought: why use a dozen instructions if two are enough? :) Frediano
[Qemu-devel] [PATCH] x86_64: optimise muldiv64 for x86_64 architecture
As this platform can do multiply/divide using 128 bit precision use these instructions to implement it. Signed-off-by: Frediano Ziglio frediano.zig...@huawei.com --- include/qemu-common.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/qemu-common.h b/include/qemu-common.h index f862214..5366220 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -370,6 +370,7 @@ static inline uint8_t from_bcd(uint8_t val) } /* compute with 96 bit intermediate result: (a*b)/c */ +#ifndef __x86_64__ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) { union { @@ -392,6 +393,18 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c; return res.ll; } +#else +static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) +{ +uint64_t res; + +asm ("mulq %2\n\tdivq %3" + : "=a"(res) + : "a"(a), "qm"((uint64_t) b), "qm"((uint64_t)c) + : "rdx", "cc"); +return res; +} +#endif /* Round number down to multiple */ #define QEMU_ALIGN_DOWN(n, m) ((n) / (m) * (m)) -- 1.9.1
Re: [Qemu-devel] [PATCH] x86_64: optimise muldiv64 for x86_64 architecture
On 09/01/2015 11:27, Frediano Ziglio wrote: Signed-off-by: Frediano Ziglio frediano.zig...@huawei.com --- include/qemu-common.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/qemu-common.h b/include/qemu-common.h index f862214..5366220 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -370,6 +370,7 @@ static inline uint8_t from_bcd(uint8_t val) } /* compute with 96 bit intermediate result: (a*b)/c */ +#ifndef __x86_64__ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) { union { @@ -392,6 +393,18 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c; return res.ll; } +#else +static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) +{ +uint64_t res; + +asm ("mulq %2\n\tdivq %3" + : "=a"(res) + : "a"(a), "qm"((uint64_t) b), "qm"((uint64_t)c) + : "rdx", "cc"); +return res; +} +#endif Good idea. However, if you have __int128, you can just do return (__int128)a * b / c and the compiler should generate the right code. Conveniently, there is already CONFIG_INT128 that you can use. Paolo
Re: [Qemu-devel] [PATCH] x86_64: optimise muldiv64 for x86_64 architecture
2015-01-09 10:35 GMT+00:00 Paolo Bonzini pbonz...@redhat.com: On 09/01/2015 11:27, Frediano Ziglio wrote: Signed-off-by: Frediano Ziglio frediano.zig...@huawei.com --- include/qemu-common.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/qemu-common.h b/include/qemu-common.h index f862214..5366220 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -370,6 +370,7 @@ static inline uint8_t from_bcd(uint8_t val) } /* compute with 96 bit intermediate result: (a*b)/c */ +#ifndef __x86_64__ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) { union { @@ -392,6 +393,18 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c; return res.ll; } +#else +static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) +{ +uint64_t res; + +asm ("mulq %2\n\tdivq %3" + : "=a"(res) + : "a"(a), "qm"((uint64_t) b), "qm"((uint64_t)c) + : "rdx", "cc"); +return res; +} +#endif Good idea. However, if you have __int128, you can just do return (__int128)a * b / c and the compiler should generate the right code. Conveniently, there is already CONFIG_INT128 that you can use. Paolo Well, it works but in our case b <= c, that is a * b / c is always < 2^64. This leads to no integer overflow in the last division. However the compiler does not know this so it does the entire (a*b) / c division, which mainly consists in two integer divisions instead of one (not taking into account that it is implemented using a helper function). I think that I'll write two patches. One implementing it using int128 as you suggested (which is much easier to read than the current one and the assembly one) and another for the x86_64 optimization. Frediano
Re: [Qemu-devel] [PATCH] x86_64: optimise muldiv64 for x86_64 architecture
On 09/01/2015 12:04, Frediano Ziglio wrote: 2015-01-09 10:35 GMT+00:00 Paolo Bonzini pbonz...@redhat.com: On 09/01/2015 11:27, Frediano Ziglio wrote: Signed-off-by: Frediano Ziglio frediano.zig...@huawei.com --- include/qemu-common.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/qemu-common.h b/include/qemu-common.h index f862214..5366220 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -370,6 +370,7 @@ static inline uint8_t from_bcd(uint8_t val) } /* compute with 96 bit intermediate result: (a*b)/c */ +#ifndef __x86_64__ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) { union { @@ -392,6 +393,18 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c; return res.ll; } +#else +static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) +{ +uint64_t res; + +asm ("mulq %2\n\tdivq %3" + : "=a"(res) + : "a"(a), "qm"((uint64_t) b), "qm"((uint64_t)c) + : "rdx", "cc"); +return res; +} +#endif Good idea. However, if you have __int128, you can just do return (__int128)a * b / c and the compiler should generate the right code. Conveniently, there is already CONFIG_INT128 that you can use. Well, it works but in our case b <= c, that is a * b / c is always < 2^64. This is not necessarily the case. Quick grep: hw/timer/hpet.c:return (muldiv64(value, HPET_CLK_PERIOD, FS_PER_NS)); hw/timer/hpet.c:return (muldiv64(value, FS_PER_NS, HPET_CLK_PERIOD)); One of the two must disprove your assertion. :) But it's true that we expect no overflow. This leads to no integer overflow in the last division. However the compiler does not know this so it does the entire (a*b) / c division, which mainly consists in two integer divisions instead of one (not taking into account that it is implemented using a helper function). I think that I'll write two patches. 
One implementing it using int128 as you suggested (which is much easier to read than the current one and the assembly one) and another for the x86_64 optimization. Right, that's even better. Out of curiosity, have you seen it in some profiles? Paolo
Re: [Qemu-devel] [PATCH] x86_64: optimise muldiv64 for x86_64 architecture
On 9 January 2015 at 11:24, Paolo Bonzini pbonz...@redhat.com wrote: On 09/01/2015 12:04, Frediano Ziglio wrote: I think that I'll write two patches. One implementing using the int128 as you suggested (which is much easier to read that current one and assembly ones) that another for x86_64 optimization. Right, that's even better. Personally I would prefer we didn't write inline assembly functions if we can avoid them. So I'd rather see an int128 version, and if the compiler doesn't do a good enough job then go talk to the compiler folks to improve things. Out of curiosity, have you seen it in some profiles? I would absolutely want to see significant perf uplift on a real workload before we start putting inline asm into qemu-common.h... -- PMM