Cédric Le Goater <c...@kaod.org> writes:
> Hello Alex, > > Thanks for putting some time into this problem, > > On 8/4/22 11:20, Alex Bennée wrote: >> Investigating why some BMC models are so slow compared to a plain ARM >> virt machines I did some profiling of: >> ./qemu-system-arm -M romulus-bmc -nic user \ >> -drive >> file=obmc-phosphor-image-romulus.static.mtd,format=raw,if=mtd \ >> -nographic -serial mon:stdio >> And saw that object_dynamic_cast was dominating the profile times. >> We >> have a number of cases in the CPU hot path and more importantly for >> this model in the SSI bus. As the class is static once the object is >> created we just cache it and use it instead of the dynamic case >> macros. >> [AJB: I suspect a proper fix for this is for QOM to support a cached >> class lookup, abortive macro attempt #if 0'd in this patch]. >> Signed-off-by: Alex Bennée <alex.ben...@linaro.org> >> Cc: Cédric Le Goater <c...@kaod.org> > > > Here are some results, > > * romulus-bmc, OpenBmc login prompt > > without : 82s > with : 56s Looks like I lucked out picking the lowest hanging fruit. > > * ast2500-evb,execute-in-place=true, U-boot 2019.04 prompt > > without : 30s > with : 22s > > * witherspoon-bmc,execute-in-place=true, U-boot 2016.07 prompt > > without : 5.5s > with : 4.1s > > There is definitely an improvement in all scenarios. > > Applying a similar technique on AspeedSMCClass, I could gain > another ~10% and boot the ast2500-evb,execute-in-place=true > machine, up to the U-boot 2019.04 prompt, in less then 20s. There are some fundamentals to XIP which means they will be slower if each instruction is being sucked through io_readx/device emulation although I'm not sure what the exact mechanism is because surely a ROM can just be mapped into the address space and run from there? > However, newer u-boot are still quite slow to boot when executing > from the flash device. For any of those machines? Whats the next command line for me to dig into? > > Thanks, > > C. > >> --- >> include/hw/core/cpu.h | 2 ++ >> include/hw/ssi/ssi.h | 3 +++ >> include/qom/object.h | 29 +++++++++++++++++++++++++++++ >> accel/tcg/cputlb.c | 12 ++++++++---- >> hw/ssi/ssi.c | 10 +++++++--- >> 5 files changed, 49 insertions(+), 7 deletions(-) >> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h >> index 500503da13..70027a772e 100644 >> --- a/include/hw/core/cpu.h >> +++ b/include/hw/core/cpu.h >> @@ -317,6 +317,8 @@ struct qemu_work_item; >> struct CPUState { >> /*< private >*/ >> DeviceState parent_obj; >> + /* cache to avoid expensive CPU_GET_CLASS */ >> + CPUClass *cc; >> /*< public >*/ >> int nr_cores; >> diff --git a/include/hw/ssi/ssi.h b/include/hw/ssi/ssi.h >> index f411858ab0..6950f86810 100644 >> --- a/include/hw/ssi/ssi.h >> +++ b/include/hw/ssi/ssi.h >> @@ -59,6 +59,9 @@ struct SSIPeripheralClass { >> struct SSIPeripheral { >> DeviceState parent_obj; >> + /* cache the class */ >> + SSIPeripheralClass *spc; >> + >> /* Chip select state */ >> bool cs; >> }; >> diff --git a/include/qom/object.h b/include/qom/object.h >> index ef7258a5e1..2202dbfa43 100644 >> --- a/include/qom/object.h >> +++ b/include/qom/object.h >> @@ -198,6 +198,35 @@ struct Object >> OBJ_NAME##_CLASS(const void *klass) \ >> { return OBJECT_CLASS_CHECK(ClassType, klass, TYPENAME); } >> +#if 0 >> +/** >> + * DECLARE_CACHED_CLASS_CHECKER: >> + * @InstanceType: instance struct name >> + * @ClassType: class struct name >> + * @OBJ_NAME: the object name in uppercase with underscore separators >> + * @TYPENAME: type name >> + * >> + * This variant of DECLARE_CLASS_CHECKERS allows for the caching of >> + * class in the parent object instance. This is useful for very hot >> + * path code at the expense of an extra indirection and check. As per >> + * the original direct usage of this macro should be avoided if the >> + * complete OBJECT_DECLARE_TYPE macro has been used. >> + * >> + * This macro will provide the class type cast functions for a >> + * QOM type. >> + */ >> +#define DECLARE_CACHED_CLASS_CHECKERS(InstanceType, ClassType, OBJ_NAME, >> TYPENAME) \ >> + DECLARE_CLASS_CHECKERS(ClassType, OBJ_NAME, TYPENAME) \ >> + static inline G_GNUC_UNUSED ClassType * \ >> + OBJ_NAME##_GET_CACHED_CLASS(const void *obj) \ >> + { \ >> + InstanceType *p = (InstanceType *) obj; \ >> + p->cc = p->cc ? p->cc : OBJECT_GET_CLASS(ClassType, obj, TYPENAME);\ >> + return p->cc; \ >> + } >> + >> +#endif >> + >> /** >> * DECLARE_OBJ_CHECKERS: >> * @InstanceType: instance struct name >> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c >> index a46f3a654d..882315f7dd 100644 >> --- a/accel/tcg/cputlb.c >> +++ b/accel/tcg/cputlb.c >> @@ -1303,8 +1303,9 @@ static inline ram_addr_t >> qemu_ram_addr_from_host_nofail(void *ptr) >> static void tlb_fill(CPUState *cpu, target_ulong addr, int size, >> MMUAccessType access_type, int mmu_idx, uintptr_t >> retaddr) >> { >> - CPUClass *cc = CPU_GET_CLASS(cpu); >> + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu); >> bool ok; >> + cpu->cc = cc; >> /* >> * This is not a probe, so only valid return is success; failure >> @@ -1319,7 +1320,8 @@ static inline void cpu_unaligned_access(CPUState *cpu, >> vaddr addr, >> MMUAccessType access_type, >> int mmu_idx, uintptr_t retaddr) >> { >> - CPUClass *cc = CPU_GET_CLASS(cpu); >> + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu); >> + cpu->cc = cc; >> cc->tcg_ops->do_unaligned_access(cpu, addr, access_type, >> mmu_idx, retaddr); >> } >> @@ -1331,7 +1333,8 @@ static inline void cpu_transaction_failed(CPUState >> *cpu, hwaddr physaddr, >> MemTxResult response, >> uintptr_t retaddr) >> { >> - CPUClass *cc = CPU_GET_CLASS(cpu); >> + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu); >> + cpu->cc = cc; >> if (!cpu->ignore_memory_transaction_failures && >> cc->tcg_ops->do_transaction_failed) { >> @@ -1606,7 +1609,8 @@ static int probe_access_internal(CPUArchState *env, >> target_ulong addr, >> if (!tlb_hit_page(tlb_addr, page_addr)) { >> if (!victim_tlb_hit(env, mmu_idx, index, elt_ofs, page_addr)) { >> CPUState *cs = env_cpu(env); >> - CPUClass *cc = CPU_GET_CLASS(cs); >> + CPUClass *cc = cs->cc ? cs->cc : CPU_GET_CLASS(cs); >> + cs->cc = cc; >> if (!cc->tcg_ops->tlb_fill(cs, addr, fault_size, >> access_type, >> mmu_idx, nonfault, retaddr)) { >> diff --git a/hw/ssi/ssi.c b/hw/ssi/ssi.c >> index 003931fb50..f749feb6e3 100644 >> --- a/hw/ssi/ssi.c >> +++ b/hw/ssi/ssi.c >> @@ -38,7 +38,8 @@ static void ssi_cs_default(void *opaque, int n, int level) >> bool cs = !!level; >> assert(n == 0); >> if (s->cs != cs) { >> - SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s); >> + /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s); */ >> + SSIPeripheralClass *ssc = s->spc; >> if (ssc->set_cs) { >> ssc->set_cs(s, cs); >> } >> @@ -48,7 +49,8 @@ static void ssi_cs_default(void *opaque, int n, int level) >> static uint32_t ssi_transfer_raw_default(SSIPeripheral *dev, >> uint32_t val) >> { >> - SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev); >> + /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev); */ >> + SSIPeripheralClass *ssc = dev->spc; >> if ((dev->cs && ssc->cs_polarity == SSI_CS_HIGH) || >> (!dev->cs && ssc->cs_polarity == SSI_CS_LOW) || >> @@ -67,6 +69,7 @@ static void ssi_peripheral_realize(DeviceState *dev, Error >> **errp) >> ssc->cs_polarity != SSI_CS_NONE) { >> qdev_init_gpio_in_named(dev, ssi_cs_default, SSI_GPIO_CS, 1); >> } >> + s->spc = ssc; >> ssc->realize(s, errp); >> } >> @@ -120,7 +123,8 @@ uint32_t ssi_transfer(SSIBus *bus, uint32_t val) >> QTAILQ_FOREACH(kid, &b->children, sibling) { >> SSIPeripheral *peripheral = SSI_PERIPHERAL(kid->child); >> - ssc = SSI_PERIPHERAL_GET_CLASS(peripheral); >> + /* ssc = SSI_PERIPHERAL_GET_CLASS(peripheral); */ >> + ssc = peripheral->spc; >> r |= ssc->transfer_raw(peripheral, val); >> } >> -- Alex Bennée