[PATCH v3 7/7] RTC:Allow to migrate from old version
The new logic is compatible with old. So it should not block to migrate from old version. But new version cannot migrate to old. Signed-off-by: Yang Zhang --- hw/mc146818rtc.c | 48 1 files changed, 44 insertions(+), 4 deletions(-) diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c index 384bdc1..02f7ea0 100644 --- a/hw/mc146818rtc.c +++ b/hw/mc146818rtc.c @@ -635,11 +635,51 @@ static int rtc_post_load(void *opaque, int version_id) return 0; } +static int rtc_load_old(QEMUFile *f, void *opaque, int version_id) +{ +RTCState *s = opaque; + +if (version_id > 2) { +return -EINVAL; +} + +qemu_get_buffer(f, s->cmos_data, sizeof(s->cmos_data)); +/* dummy load for compatibility */ +qemu_get_byte(f); /* cmos_index */ +qemu_get_be32(f); /* tm_sec */ +qemu_get_be32(f); /* tm_min */ +qemu_get_be32(f); /* tm_hour */ +qemu_get_be32(f); /* tm_wday */ +qemu_get_be32(f); /* tm_mday */ +qemu_get_be32(f); /* tm_mon */ +qemu_get_be32(f); /* tm_year */ +qemu_get_be64(f); /* periodic_timer */ +qemu_get_be64(f); /* next_periodic_time */ +qemu_get_be64(f); /* next_second_time */ +qemu_get_be64(f); /* second_timer */ +qemu_get_be64(f); /* second_timer2 */ +qemu_get_be32(f); /* irq_coalesced */ +qemu_get_be32(f); /* period */ + + +rtc_set_date_from_host(&s->dev); +periodic_timer_update(s, qemu_get_clock_ns(rtc_clock)); +check_update_timer(s); + +#ifdef TARGET_I386 +if (s->lost_tick_policy == LOST_TICK_SLEW) { +rtc_coalesced_timer_update(s); +} +#endif +return 0; +} + static const VMStateDescription vmstate_rtc = { .name = "mc146818rtc", -.version_id = 2, -.minimum_version_id = 1, -.minimum_version_id_old = 1, +.version_id = 3, +.minimum_version_id = 3, +.minimum_version_id_old = 2, +.load_state_old = rtc_load_old, .post_load = rtc_post_load, .fields = (VMStateField []) { VMSTATE_BUFFER(cmos_data, RTCState), @@ -777,7 +817,7 @@ static int rtc_initfn(ISADevice *dev) memory_region_init_io(&s->io, &cmos_ops, s, "rtc", 2); isa_register_ioport(dev, &s->io, base); -qdev_set_legacy_instance_id(&dev->qdev, base, 2); +qdev_set_legacy_instance_id(&dev->qdev, base, 3); qemu_register_reset(rtc_reset, s); object_property_add(OBJECT(s), "date", "struct tm", -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 6/7] RTC:Add alarm support
Add the alarm check when update cycle ended. If alarm is fired, also AIE bit is setting, then raise a interrupt Signed-off-by: Yang Zhang --- hw/mc146818rtc.c | 48 ++-- 1 files changed, 46 insertions(+), 2 deletions(-) diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c index fae049e..384bdc1 100644 --- a/hw/mc146818rtc.c +++ b/hw/mc146818rtc.c @@ -114,6 +114,7 @@ typedef struct RTCState { static void rtc_set_time(RTCState *s); static void rtc_calibrate_time(RTCState *s); static void rtc_set_cmos(RTCState *s); +static inline int rtc_from_bcd(RTCState *s, int a); static int32_t divider_reset; @@ -267,15 +268,58 @@ static void rtc_update_timer(void *opaque) } } +static inline uint8_t convert_hour(RTCState *s, uint8_t hour) +{ +if (!(s->cmos_data[RTC_REG_B] & REG_B_24H)) { +hour %= 12; +if (s->cmos_data[RTC_HOURS] & 0x80) { +hour += 12; +} +} +return hour; +} +static uint32_t check_alarm(RTCState *s) +{ +uint8_t alarm_hour, alarm_min, alarm_sec; +uint8_t cur_hour, cur_min, cur_sec; + +rtc_calibrate_time(s); +rtc_set_cmos(s); + +alarm_sec = rtc_from_bcd(s, s->cmos_data[RTC_SECONDS_ALARM]); +alarm_min = rtc_from_bcd(s, s->cmos_data[RTC_MINUTES_ALARM]); +alarm_hour = rtc_from_bcd(s, s->cmos_data[RTC_HOURS_ALARM]); +alarm_hour = convert_hour(s, alarm_hour); + +cur_sec = rtc_from_bcd(s, s->cmos_data[RTC_SECONDS]); +cur_min = rtc_from_bcd(s, s->cmos_data[RTC_MINUTES]); +cur_hour = rtc_from_bcd(s, s->cmos_data[RTC_HOURS]); +cur_hour = convert_hour(s, cur_hour); + +if (((alarm_sec & 0xc0) == 0xc0 || alarm_sec == cur_sec) && +((alarm_min & 0xc0) == 0xc0 || alarm_min == cur_min) && +((alarm_hour & 0xc0) == 0xc0 || alarm_hour == cur_hour)) { +return 1; +} +return 0; + +} + static void rtc_update_timer2(void *opaque) { RTCState *s = opaque; if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) { s->cmos_data[RTC_REG_C] |= REG_C_UF; +if (check_alarm(s)) { +s->cmos_data[RTC_REG_C] |= REG_C_AF; +} + s->cmos_data[RTC_REG_A] &= ~REG_A_UIP; -s->cmos_data[RTC_REG_C] |= REG_C_IRQF; -qemu_irq_raise(s->irq); +if (s->cmos_data[RTC_REG_B] & (REG_B_AIE | REG_B_UIE)) { +s->cmos_data[RTC_REG_C] |= REG_C_IRQF; +qemu_irq_raise(s->irq); +} } check_update_timer(s); } -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 4/7] RTC: Set internal millisecond register to 500ms when reset divider
The first update cycle begins one - half seconds later when divider reset is removing. Signed-off-by: Yang Zhang --- hw/mc146818rtc.c | 38 +- 1 files changed, 33 insertions(+), 5 deletions(-) diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c index 6ebb8f6..5e7fbb5 100644 --- a/hw/mc146818rtc.c +++ b/hw/mc146818rtc.c @@ -110,6 +110,8 @@ static void rtc_set_time(RTCState *s); static void rtc_calibrate_time(RTCState *s); static void rtc_set_cmos(RTCState *s); +static int32_t divider_reset; + static uint64_t get_guest_rtc_us(RTCState *s) { int64_t host_usec, offset_usec, guest_usec; @@ -220,16 +222,24 @@ static void rtc_periodic_timer(void *opaque) } } -static void rtc_set_offset(RTCState *s) +static void rtc_set_offset(RTCState *s, int32_t start_usec) { struct tm *tm = &s->current_tm; -int64_t host_usec, guest_sec, guest_usec; +int64_t host_usec, guest_sec, guest_usec, offset_usec, old_guest_usec; host_usec = qemu_get_clock_ns(host_clock) / NS_PER_USEC; +offset_usec = s->offset_sec * USEC_PER_SEC + s->offset_usec; +old_guest_usec = (host_usec + offset_usec) % USEC_PER_SEC; guest_sec = mktimegm(tm); -guest_usec = guest_sec * USEC_PER_SEC; +/* start_usec equal 0 means rtc internal millisecond is + * same with before */ +if (start_usec == 0) { +guest_usec = guest_sec * USEC_PER_SEC + old_guest_usec; +} else { +guest_usec = guest_sec * USEC_PER_SEC + start_usec; +} s->offset_sec = (guest_usec - host_usec) / USEC_PER_SEC; s->offset_usec = (guest_usec - host_usec) % USEC_PER_SEC; } @@ -260,10 +270,22 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, uint32_t data) /* if in set mode, do not update the time */ if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) { rtc_set_time(s); -rtc_set_offset(s); +rtc_set_offset(s, 0); } break; case RTC_REG_A: +/* when the divider reset is removed, the first update cycle + * begins one-half second later*/ +if (((s->cmos_data[RTC_REG_A] & 0x60) == 0x60) && +((data & 0x70) >> 4) <= 2) { +divider_reset = 1; +if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) { +rtc_calibrate_time(s); +rtc_set_offset(s, 50); +s->cmos_data[RTC_REG_A] &= ~REG_A_UIP; +divider_reset = 0; +} +} /* UIP bit is read only */ s->cmos_data[RTC_REG_A] = (data & ~REG_A_UIP) | (s->cmos_data[RTC_REG_A] & REG_A_UIP); @@ -283,7 +305,13 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, uint32_t data) /* if disabling set mode, update the time */ if (s->cmos_data[RTC_REG_B] & REG_B_SET) { rtc_set_time(s); -rtc_set_offset(s); +if (divider_reset == 1) { +rtc_set_offset(s, 50); +s->cmos_data[RTC_REG_A] &= ~REG_A_UIP; +divider_reset = 0; +} else { +rtc_set_offset(s, 0); +} } } s->cmos_data[RTC_REG_B] = data; -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 5/7] RTC:Add RTC update-ended interrupt support
Use a timer to emulate update cycle. When update cycle ended and UIE is setting, then raise an interrupt. The timer runs only when UF or AF is cleared. Signed-off-by: Yang Zhang --- hw/mc146818rtc.c | 86 ++ 1 files changed, 80 insertions(+), 6 deletions(-) diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c index 5e7fbb5..fae049e 100644 --- a/hw/mc146818rtc.c +++ b/hw/mc146818rtc.c @@ -97,6 +97,11 @@ typedef struct RTCState { /* periodic timer */ QEMUTimer *periodic_timer; int64_t next_periodic_time; +/* update-ended timer */ +QEMUTimer *update_timer; +QEMUTimer *update_timer2; +uint64_t next_update_time; +uint32_t use_timer; uint16_t irq_reinject_on_ack_count; uint32_t irq_coalesced; uint32_t period; @@ -157,7 +162,8 @@ static void rtc_coalesced_timer(void *opaque) } #endif -static void rtc_timer_update(RTCState *s, int64_t current_time) +/* handle periodic timer */ +static void periodic_timer_update(RTCState *s, int64_t current_time) { int period_code, period; int64_t cur_clock, next_irq_clock; @@ -195,7 +201,7 @@ static void rtc_periodic_timer(void *opaque) { RTCState *s = opaque; -rtc_timer_update(s, s->next_periodic_time); +periodic_timer_update(s, s->next_periodic_time); s->cmos_data[RTC_REG_C] |= REG_C_PF; if (s->cmos_data[RTC_REG_B] & REG_B_PIE) { s->cmos_data[RTC_REG_C] |= REG_C_IRQF; @@ -222,6 +228,58 @@ static void rtc_periodic_timer(void *opaque) } } +/* handle update-ended timer */ +static void check_update_timer(RTCState *s) +{ +uint64_t next_update_time, expire_time; +uint64_t guest_usec; +qemu_del_timer(s->update_timer); +qemu_del_timer(s->update_timer2); + +if (!((s->cmos_data[RTC_REG_C] & (REG_C_UF | REG_C_AF)) == +(REG_C_UF | REG_C_AF)) && !(s->cmos_data[RTC_REG_B] & REG_B_SET)) { +s->use_timer = 1; +guest_usec = get_guest_rtc_us(s) % USEC_PER_SEC; +if (guest_usec >= (USEC_PER_SEC - 244)) { +/* RTC is in update cycle when enabling UIE */ +s->cmos_data[RTC_REG_A] |= REG_A_UIP; +next_update_time = (USEC_PER_SEC - guest_usec) * NS_PER_USEC; +expire_time = qemu_get_clock_ns(rtc_clock) + next_update_time; +qemu_mod_timer(s->update_timer2, expire_time); +} else { +next_update_time = (USEC_PER_SEC - guest_usec - 244) * NS_PER_USEC; +expire_time = qemu_get_clock_ns(rtc_clock) + next_update_time; +s->next_update_time = expire_time; +qemu_mod_timer(s->update_timer, expire_time); +} +} else { +s->use_timer = 0; +} +} + +static void rtc_update_timer(void *opaque) +{ +RTCState *s = opaque; + +if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) { +s->cmos_data[RTC_REG_A] |= REG_A_UIP; +qemu_mod_timer(s->update_timer2, s->next_update_time + 244000UL); +} +} + +static void rtc_update_timer2(void *opaque) +{ +RTCState *s = opaque; + +if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) { +s->cmos_data[RTC_REG_C] |= REG_C_UF; +s->cmos_data[RTC_REG_A] &= ~REG_A_UIP; +s->cmos_data[RTC_REG_C] |= REG_C_IRQF; +qemu_irq_raise(s->irq); +} +check_update_timer(s); +} + static void rtc_set_offset(RTCState *s, int32_t start_usec) { struct tm *tm = &s->current_tm; @@ -283,13 +341,14 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, uint32_t data) rtc_calibrate_time(s); rtc_set_offset(s, 50); s->cmos_data[RTC_REG_A] &= ~REG_A_UIP; +check_update_timer(s); divider_reset = 0; } } /* UIP bit is read only */ s->cmos_data[RTC_REG_A] = (data & ~REG_A_UIP) | (s->cmos_data[RTC_REG_A] & REG_A_UIP); -rtc_timer_update(s, qemu_get_clock_ns(rtc_clock)); +periodic_timer_update(s, qemu_get_clock_ns(rtc_clock)); break; case RTC_REG_B: if (data & REG_B_SET) { @@ -315,7 +374,8 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, uint32_t data) } } s->cmos_data[RTC_REG_B] = data; -rtc_timer_update(s, qemu_get_clock_ns(rtc_clock)); +periodic_timer_update(s, qemu_get_clock_ns(rtc_clock)); +check_update_timer(s); break; case RTC_REG_C: case RTC_REG_D: @@ -445,7 +505,7 @@ static uint32_t cmos_ioport_read(void *opaque, uint32_t addr) break; case RTC_REG_A: ret = s->cmos_data[s->cmos_index]; -if (update_in_progress(s)) { +if ((s->use_timer == 0) && update_in_progress(s)) { ret |= REG_A_UIP; } break; @@ -453,6 +513,12 @@ static uint32_t cmos_ioport_read(void *
[PATCH v3 3/7] RTC: Add UIP(update in progress) check logic
The UIP(update in progress) is set when RTC is updating. And the update cycle begins 244us later after UIP is set. And it is cleared when update end. . Signed-off-by: Yang Zhang --- hw/mc146818rtc.c | 18 ++ 1 files changed, 18 insertions(+), 0 deletions(-) diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c index 82a5b8a..6ebb8f6 100644 --- a/hw/mc146818rtc.c +++ b/hw/mc146818rtc.c @@ -377,6 +377,21 @@ static void rtc_calibrate_time(RTCState *s) s->current_tm = *ret; } +static int update_in_progress(RTCState *s) +{ +int64_t guest_usec; + +if (s->cmos_data[RTC_REG_B] & REG_B_SET) { +return 0; +} +guest_usec = get_guest_rtc_us(s); +/* UIP bit will be set at last 244us of every second. */ +if ((guest_usec % USEC_PER_SEC) >= (USEC_PER_SEC - 244)) { +return 1; +} +return 0; +} + static uint32_t cmos_ioport_read(void *opaque, uint32_t addr) { RTCState *s = opaque; @@ -402,6 +417,9 @@ static uint32_t cmos_ioport_read(void *opaque, uint32_t addr) break; case RTC_REG_A: ret = s->cmos_data[s->cmos_index]; +if (update_in_progress(s)) { +ret |= REG_A_UIP; +} break; case RTC_REG_C: ret = s->cmos_data[s->cmos_index]; -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 2/7] RTC: Update the RTC clock only when reading it
There has no need to use two periodic timer to update RTC time. In this patch, we only update it when guest reading it. Signed-off-by: Yang Zhang --- hw/mc146818rtc.c | 207 +- 1 files changed, 66 insertions(+), 141 deletions(-) diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c index 9b49cbc..82a5b8a 100644 --- a/hw/mc146818rtc.c +++ b/hw/mc146818rtc.c @@ -44,6 +44,9 @@ # define DPRINTF_C(format, ...) do { } while (0) #endif +#define USEC_PER_SEC100L +#define NS_PER_USEC 1000L + #define RTC_REINJECT_ON_ACK_COUNT 20 #define RTC_SECONDS 0 @@ -85,6 +88,8 @@ typedef struct RTCState { uint8_t cmos_data[128]; uint8_t cmos_index; struct tm current_tm; +int64_t offset_sec; +int32_t offset_usec; int32_t base_year; qemu_irq irq; qemu_irq sqw_irq; @@ -92,21 +97,29 @@ typedef struct RTCState { /* periodic timer */ QEMUTimer *periodic_timer; int64_t next_periodic_time; -/* second update */ -int64_t next_second_time; uint16_t irq_reinject_on_ack_count; uint32_t irq_coalesced; uint32_t period; QEMUTimer *coalesced_timer; -QEMUTimer *second_timer; -QEMUTimer *second_timer2; Notifier clock_reset_notifier; LostTickPolicy lost_tick_policy; Notifier suspend_notifier; } RTCState; static void rtc_set_time(RTCState *s); -static void rtc_copy_date(RTCState *s); +static void rtc_calibrate_time(RTCState *s); +static void rtc_set_cmos(RTCState *s); + +static uint64_t get_guest_rtc_us(RTCState *s) +{ +int64_t host_usec, offset_usec, guest_usec; + +host_usec = qemu_get_clock_ns(host_clock) / NS_PER_USEC; +offset_usec = s->offset_sec * USEC_PER_SEC + s->offset_usec; +guest_usec = host_usec + offset_usec; + +return guest_usec; +} #ifdef TARGET_I386 static void rtc_coalesced_timer_update(RTCState *s) @@ -207,6 +220,20 @@ static void rtc_periodic_timer(void *opaque) } } +static void rtc_set_offset(RTCState *s) +{ +struct tm *tm = &s->current_tm; +int64_t host_usec, guest_sec, guest_usec; + +host_usec = qemu_get_clock_ns(host_clock) / NS_PER_USEC; + +guest_sec = mktimegm(tm); +guest_usec = guest_sec * USEC_PER_SEC; + +s->offset_sec = (guest_usec - host_usec) / USEC_PER_SEC; +s->offset_usec = (guest_usec - host_usec) % USEC_PER_SEC; +} + static void cmos_ioport_write(void *opaque, uint32_t addr, uint32_t data) { RTCState *s = opaque; @@ -233,6 +260,7 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, uint32_t data) /* if in set mode, do not update the time */ if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) { rtc_set_time(s); +rtc_set_offset(s); } break; case RTC_REG_A: @@ -243,6 +271,11 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, uint32_t data) break; case RTC_REG_B: if (data & REG_B_SET) { +/* update cmos to when the rtc was stopping */ +if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) { +rtc_calibrate_time(s); +rtc_set_cmos(s); +} /* set mode: reset UIP mode */ s->cmos_data[RTC_REG_A] &= ~REG_A_UIP; data &= ~REG_B_UIE; @@ -250,6 +283,7 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, uint32_t data) /* if disabling set mode, update the time */ if (s->cmos_data[RTC_REG_B] & REG_B_SET) { rtc_set_time(s); +rtc_set_offset(s); } } s->cmos_data[RTC_REG_B] = data; @@ -305,7 +339,7 @@ static void rtc_set_time(RTCState *s) rtc_change_mon_event(tm); } -static void rtc_copy_date(RTCState *s) +static void rtc_set_cmos(RTCState *s) { const struct tm *tm = &s->current_tm; int year; @@ -331,122 +365,16 @@ static void rtc_copy_date(RTCState *s) s->cmos_data[RTC_YEAR] = rtc_to_bcd(s, year); } -/* month is between 0 and 11. */ -static int get_days_in_month(int month, int year) -{ -static const int days_tab[12] = { -31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 -}; -int d; -if ((unsigned )month >= 12) -return 31; -d = days_tab[month]; -if (month == 1) { -if ((year % 4) == 0 && ((year % 100) != 0 || (year % 400) == 0)) -d++; -} -return d; -} - -/* update 'tm' to the next second */ -static void rtc_next_second(struct tm *tm) +static void rtc_calibrate_time(RTCState *s) { -int days_in_month; - -tm->tm_sec++; -if ((unsigned)tm->tm_sec >= 60) { -tm->tm_sec = 0; -tm->tm_min++; -if ((unsigned)tm->tm_min >= 60) { -tm->tm_min = 0; -tm->tm_hour++; -if ((unsigned)tm->tm_hour >= 24) { -tm->tm_hour = 0; -
[PATCH v3 1/7] RTC: Remove the logic to update time format when DM bit changed
Change DM(date mode) and 24/12 control bit don't affect the internal registers. It only indicates what format is using for those registers. So we don't need to update time format when it is modified. Signed-off-by: Yang Zhang --- hw/mc146818rtc.c | 10 +- 1 files changed, 1 insertions(+), 9 deletions(-) diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c index a46fdfc..9b49cbc 100644 --- a/hw/mc146818rtc.c +++ b/hw/mc146818rtc.c @@ -252,15 +252,7 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, uint32_t data) rtc_set_time(s); } } -if (((s->cmos_data[RTC_REG_B] ^ data) & (REG_B_DM | REG_B_24H)) && -!(data & REG_B_SET)) { -/* If the time format has changed and not in set mode, - update the registers immediately. */ -s->cmos_data[RTC_REG_B] = data; -rtc_copy_date(s); -} else { -s->cmos_data[RTC_REG_B] = data; -} +s->cmos_data[RTC_REG_B] = data; rtc_timer_update(s, qemu_get_clock_ns(rtc_clock)); break; case RTC_REG_C: -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 0/7] RTC: New logic to emulate RTC
Changes in v3: Rebase to latest head. Remove the logic to update time format when DM bit changed. Allow to migrate from old version. Solve the async when reading UF and UIP Changes in v2: Add UIP check logic. Add logic that next second tick will occur in exactly 500ms later after reset divider Current RTC emulation uses periodic timer(2 timers per second) to update RTC clock. And it will stop CPU staying at deep C-state for long period. Our experience shows the Pkg C6 residency reduced 6% when running 64 idle guest. The following patch stop the two periodic timer and only updating RTC clock when guest try to read it. --- Yang Zhang (7): RTC: Remove the logic to update time format when DM bit changed RTC: Update the RTC clock only when reading it RTC: Add UIP(update in progress) check logic RTC: Set internal millisecond register to 500ms when reset divider RTC: Add RTC update-ended interrupt support RTC: Add alarm support RTC: Allow to migrate from old version hw/mc146818rtc.c | 427 ++--- 1 files changed, 274 insertions(+), 153 deletions(-) best regards yang -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
can't restore a guest saved by an older qemu
Hi Avi, I can't restore a guest saved by an older qemu. I wonder if this issue is introduced by some latest commits on the file 'hw/i8254.c'. The new commit: b5ed4b6f (on Mar 1)I tried to restore a guest using this qemu. The old commit: 9d636ae7 (on Feb 11)I saved a guest using this qemu. I got some error when restoring a guest saved by an old qemu now. [root@vt-nhm9 jay]# qemu-system-x86_64 -smp 4 -m 1024 -hda rhel6u1.qcow -incoming "exec:dd if=save.img" Could not open option rom 'vapic.bin': No such file or directory qemu-system-x86_64: pci_add_option_rom: failed to find romfile "pxe-rtl8139.rom" VNC server running on `::1:5900' Unknown ramblock ":00:03.0/rtl8139.rom", cannot accept migration qemu: warning: error while loading state for instance 0x0 of device 'ram' load of migration failed dd: writing to `standard output': Broken pipe 193+0 records in 192+0 records out 98304 bytes (98 kB) copied, 0.00713571 s, 13.8 MB/s A good one is something like the following. [root@vt-nhm9 jay]# ~/qemu-kvm/x86_64-softmmu/qemu-system-x86_64 -smp 4 -m 1024 -hda rhel6u1.qcow -incoming "exec:dd if=save1.img" VNC server running on `::1:5900' 781430+1 records in 781430+1 records out 400092473 bytes (400 MB) copied, 1.03926 s, 385 MB/s Best Regards, Yongjie Ren (Jay) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/4] KVM: Avoid checking huge page mappings in get_dirty_log()
Takuya Yoshikawa wrote: > Takuya Yoshikawa wrote: > > > + while (mask) { > > + rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; > > + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); > > > > - return write_protected; > > + /* clear the first set bit */ > > + mask &= mask - 1; > > + } > > } > > while (mask) { > fsbit = __ffs(mask); > gfn_offset += fsbit; > mask >>= fsbit + 1; > > rmapp = &slot->rmap[gfn_offset]; > __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); > } > > Seems to be better: BSF friendly. I was wrong. "and < shift" and bsf did not show any difference for this kind of loop on my box. So we should keep the original implementation. Takuya -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH 3/4] net: split hostname and service by last colon
On 24/02/12 17:29, Kevin Wolf wrote: Am 10.02.2012 07:27, schrieb Amos Kong: IPv6 address contains colons, parse will be wrong. [2312::8274]:5200 Signed-off-by: Amos Kong --- net.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/net.c b/net.c index f63014c..9e1ef9e 100644 --- a/net.c +++ b/net.c @@ -84,7 +84,7 @@ static int get_str_sep(char *buf, int buf_size, const char **pp, int sep) const char *p, *p1; int len; p = *pp; -p1 = strchr(p, sep); +p1 = strrchr(p, sep); if (!p1) return -1; len = p1 - p; And what if the port isn't specified? I think you would erroneously interpret the last part of the IP address as port. IPv6 address have paired colons, need more precision check. -- Amos. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH 1/4] Use getaddrinfo for migration
On 24/02/12 17:08, Kevin Wolf wrote: Am 10.02.2012 07:27, schrieb Amos Kong: This allows us to use ipv4/ipv6 for migration addresses. Once there, it also uses /etc/services names (it came free). Signed-off-by: Juan Quintela Signed-off-by: Amos Kong --- migration-tcp.c | 60 --- net.c | 108 +++ qemu_socket.h |3 ++ 3 files changed, 127 insertions(+), 44 deletions(-) This patch is making too many changes at once: 1. Move code around 2. Remove duplicated code between client and server 3. Replace parse_host_port() with an open-coded version 4. Modify the open-coded parse_host_port() to use getaddrinfo instead of inet_aton/gethostbyname (Why can't we just change parse_host_port? Are there valid reasons to use the old implementation? Which?) tcp_common_start() needs to use the address list which is got by getaddrinfo(), But I agree with verifying host/port by getaddrinfo() in parse_host_port. This makes it rather hard to review. diff --git a/migration-tcp.c b/migration-tcp.c index 35a5781..644bb8f 100644 --- a/migration-tcp.c +++ b/migration-tcp.c @@ -81,43 +81,27 @@ static void tcp_wait_for_connect(void *opaque) int tcp_start_outgoing_migration(MigrationState *s, const char *host_port) { -struct sockaddr_in addr; int ret; - -ret = parse_host_port(&addr, host_port); -if (ret< 0) { -return ret; -} +int fd; s->get_error = socket_errno; s->write = socket_write; s->close = tcp_close; -s->fd = qemu_socket(PF_INET, SOCK_STREAM, 0); -if (s->fd == -1) { -DPRINTF("Unable to open socket"); -return -socket_error(); -} - -socket_set_nonblock(s->fd); - -do { -ret = connect(s->fd, (struct sockaddr *)&addr, sizeof(addr)); -if (ret == -1) { -ret = -socket_error(); -} -if (ret == -EINPROGRESS || ret == -EWOULDBLOCK) { -qemu_set_fd_handler2(s->fd, NULL, NULL, tcp_wait_for_connect, s); -return 0; -} -} while (ret == -EINTR); - -if (ret< 0) { +ret = tcp_client_start(host_port,&fd); +s->fd = fd; +if (ret == -EINPROGRESS || ret == -EWOULDBLOCK) { +DPRINTF("connect in progress"); +qemu_set_fd_handler2(s->fd, NULL, NULL, tcp_wait_for_connect, s); +} else if (ret< 0) { DPRINTF("connect failed\n"); -migrate_fd_error(s); +if (ret != -EINVAL) { +migrate_fd_error(s); +} This looks odd. It is probably needed to keep the old behaviour where a failed parse_host_port() wouldn't call migrate_fd_error(). Is this really required? Maybe I'm missing something, but the caller can't know which failure case we had and if migrate_fd_error() has been called. getaddrinfo() is used in tcp_common_start(), -EINVAL will be returned when getaddrinfo() doesn't return 0. return ret; +} else { +migrate_fd_connect(s); } -migrate_fd_connect(s); return 0; } @@ -157,28 +141,16 @@ out2: int tcp_start_incoming_migration(const char *host_port) { -struct sockaddr_in addr; -int val; +int ret; int s; DPRINTF("Attempting to start an incoming migration\n"); -if (parse_host_port(&addr, host_port)< 0) { -fprintf(stderr, "invalid host/port combination: %s\n", host_port); -return -EINVAL; -} - -s = qemu_socket(PF_INET, SOCK_STREAM, 0); -if (s == -1) { -return -socket_error(); +ret = tcp_server_start(host_port,&s); +if (ret< 0) { +return ret; } -val = 1; -setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (const char *)&val, sizeof(val)); - -if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) == -1) { -goto err; -} if (listen(s, 1) == -1) { goto err; } diff --git a/net.c b/net.c index c34474f..f63014c 100644 --- a/net.c +++ b/net.c @@ -99,6 +99,114 @@ static int get_str_sep(char *buf, int buf_size, const char **pp, int sep) return 0; } +static int tcp_server_bind(int fd, struct addrinfo *rp) +{ +int ret; +int val = 1; + +/* allow fast reuse */ +setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (const char *)&val, sizeof(val)); + +ret = bind(fd, rp->ai_addr, rp->ai_addrlen); + +if (ret == -1) { +ret = -socket_error(); +} +return ret; + +} + +static int tcp_client_connect(int fd, struct addrinfo *rp) +{ +int ret; + +do { +ret = connect(fd, rp->ai_addr, rp->ai_addrlen); +if (ret == -1) { +ret = -socket_error(); +} +} while (ret == -EINTR); + +return ret; +} + + +static int tcp_start_common(const char *str, int *fd, bool server) +{ +char hostname[512]; +const char *service; +const char *name; +struct addrinfo hints; +struct addrinfo *result, *rp; +int s; +int sfd; +int ret = -EINVAL; + +*fd = -1; +se
Re: [Qemu-devel] [PATCH 1/4] Use getaddrinfo for migration
On 24/02/12 17:34, Kevin Wolf wrote: Am 10.02.2012 07:27, schrieb Amos Kong: This allows us to use ipv4/ipv6 for migration addresses. Once there, it also uses /etc/services names (it came free). Signed-off-by: Juan Quintela Signed-off-by: Amos Kong --- migration-tcp.c | 60 --- net.c | 108 +++ qemu_socket.h |3 ++ 3 files changed, 127 insertions(+), 44 deletions(-) @@ -157,28 +141,16 @@ out2: int tcp_start_incoming_migration(const char *host_port) { -struct sockaddr_in addr; -int val; +int ret; int s; DPRINTF("Attempting to start an incoming migration\n"); -if (parse_host_port(&addr, host_port)< 0) { -fprintf(stderr, "invalid host/port combination: %s\n", host_port); -return -EINVAL; -} Oh, and this case doesn't print an error message any more now. The check work is done in tcp_start_common() tcp_start_incoming_migration() -> tcp_client_start() -> tcp_start_common() -- Amos. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/4] KVM: Avoid checking huge page mappings in get_dirty_log()
Takuya Yoshikawa wrote: > + while (mask) { > + rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; > + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); > > - return write_protected; > + /* clear the first set bit */ > + mask &= mask - 1; > + } > } while (mask) { fsbit = __ffs(mask); gfn_offset += fsbit; mask >>= fsbit + 1; rmapp = &slot->rmap[gfn_offset]; __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); } Seems to be better: BSF friendly. I will update this function and post the result with dirty-log-perf output! Takuya -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Bug 42829] KVM Guest with virtio network driver loses network connectivity
https://bugzilla.kernel.org/show_bug.cgi?id=42829 --- Comment #6 from Steve 2012-03-02 01:05:35 --- My results: === git bisect log: --- git bisect start # good: [2c53b436a30867eb6b47dd7bab23ba638d1fb0d2] Linux 3.0-rc3 git bisect good 2c53b436a30867eb6b47dd7bab23ba638d1fb0d2 # bad: [b0af8dfdd67699e25083478c63eedef2e72ebd85] Linux 3.0-rc5 git bisect bad b0af8dfdd67699e25083478c63eedef2e72ebd85 # bad: [c01ad4081939f91ebd7277e8e731fd90ceb3e632] Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input git bisect bad c01ad4081939f91ebd7277e8e731fd90ceb3e632 # bad: [f4ef084226f82ca923bf0a2658bb2876bd215ec1] Merge branch 'fbdev-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/lethal/fbdev-3.x git bisect bad f4ef084226f82ca923bf0a2658bb2876bd215ec1 # bad: [bd5dc17be87b3a3073d50b23802647db3ae3fa8e] uts: make default hostname configurable, rather than always using "(none)" git bisect bad bd5dc17be87b3a3073d50b23802647db3ae3fa8e # bad: [f39e8409955fad210a9a7169cc53c4c18daaef3a] Merge branch 'drm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/airlied/drm-2.6 git bisect bad f39e8409955fad210a9a7169cc53c4c18daaef3a # bad: [81eb3dd8438802138ac9ce12428632f35562c060] Merge branch 'for-linus' of git://neil.brown.name/md git bisect bad 81eb3dd8438802138ac9ce12428632f35562c060 # good: [9864c0053d3da4c5731ac8a6c4835179310bd40a] md: Using poll /proc/mdstat can monitor the events of adding a spare disks git bisect good 9864c0053d3da4c5731ac8a6c4835179310bd40a # bad: [9b2dc8b665932a8e681a7ab3237f60475e75e161] md/raid5: fix raid5_set_bi_hw_segments git bisect bad 9b2dc8b665932a8e681a7ab3237f60475e75e161 # bad: [27d5ea04d08bea37bf651090e5f3c573d2390df8] md/bitmap: use proper accessor macro git bisect bad 27d5ea04d08bea37bf651090e5f3c573d2390df8 # bad: [01393f3d5836b7d62e925e6f4658a7eb22b83a11] md: check ->hot_remove_disk when removing disk git bisect bad 01393f3d5836b7d62e925e6f4658a7eb22b83a11 git bisect message: --- 01393f3d5836b7d62e925e6f4658a7eb22b83a11 is the first bad commit commit 01393f3d5836b7d62e925e6f4658a7eb22b83a11 Author: Namhyung Kim Date: Thu Jun 9 11:42:54 2011 +1000 md: check ->hot_remove_disk when removing disk Check pers->hot_remove_disk instead of pers->hot_add_disk in slot_store() during disk removal. The linear personality only has ->hot_add_disk and no ->hot_remove_disk, so that removing disk in the array resulted to following kernel bug: $ sudo mdadm --create /dev/md0 --level=linear --raid-devices=4 /dev/loop[0-3] $ echo none | sudo tee /sys/block/md0/md/dev-loop2/slot BUG: unable to handle kernel NULL pointer dereference at (null) IP: [< (null)>] (null) PGD c9f5d067 PUD 8575a067 PMD 0 Oops: 0010 [#1] SMP CPU 2 Modules linked in: linear loop bridge stp llc kvm_intel kvm asus_atk0110 sr_mod cdrom sg Pid: 10450, comm: tee Not tainted 3.0.0-rc1-leonard+ #173 System manufacturer System Product Name/P5G41TD-M PRO RIP: 0010:[<>] [< (null)>] (null) RSP: 0018:880085757df0 EFLAGS: 00010282 RAX: a00168e0 RBX: 8800d1431800 RCX: 006e RDX: 0001 RSI: 0002 RDI: 88008543c000 RBP: 880085757e48 R08: 0002 R09: 000a R10: R11: 88008543c2e0 R12: R13: 8800b4641000 R14: 0005 R15: FS: 7fe8c9e05700() GS:88011fa0() knlGS: CS: 0010 DS: ES: CR0: 8005003b CR2: CR3: b4502000 CR4: 000406e0 DR0: DR1: DR2: DR3: DR6: 0ff0 DR7: 0400 Process tee (pid: 10450, threadinfo 880085756000, task 8800c9f08000) Stack: 8138496a 8800b4641000 88008543c268 8800b4641000 88008543c000 8800d1431868 81a78a90 8800b4641000 88008543c000 8800d1431800 880085757e98 Call Trace: [] ? slot_store+0xaa/0x265 [] rdev_attr_store+0x89/0xa8 [] sysfs_write_file+0x108/0x144 [] vfs_write+0xb1/0x10d [] ? trace_hardirqs_on_caller+0x111/0x135 [] sys_write+0x4d/0x77 [] system_call_fastpath+0x16/0x1b Code: Bad RIP value. RIP [< (null)>] (null) RSP CR2: ---[ end trace ba5fc64319a826fb ]--- Signed-off-by: Namhyung Kim Cc: sta...@kernel.org Signed-off-by: NeilBrown :04 04 e3d19f53113f5bb5faef422958607e4f0131d235 eab9913143e1c027df693bf1fa57475da77bd36e M drivers How could I help now ? Thank you for your time. -- Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email --- You are receiving this mail bec
Re: [Qemu-devel] [PATCH]qemu: deal with guest paniced event
At 03/02/2012 12:51 AM, Luiz Capitulino Wrote: > On Mon, 27 Feb 2012 11:05:58 +0800 > Wen Congyang wrote: > >> When the host knows the guest is paniced, it will set >> exit_reason to KVM_EXIT_GUEST_PANIC. So if qemu receive >> this exit_reason, we can send a event to tell management >> application that the guest is paniced. >> >> Signed-off-by: Wen Congyang >> --- >> kvm-all.c |3 +++ >> linux-headers/linux/kvm.h |1 + >> monitor.c |3 +++ >> monitor.h |1 + >> 4 files changed, 8 insertions(+), 0 deletions(-) >> >> diff --git a/kvm-all.c b/kvm-all.c >> index c4babda..ae428ab 100644 >> --- a/kvm-all.c >> +++ b/kvm-all.c >> @@ -1190,6 +1190,9 @@ int kvm_cpu_exec(CPUState *env) >> (uint64_t)run->hw.hardware_exit_reason); >> ret = -1; >> break; >> +case KVM_EXIT_GUEST_PANIC: >> +monitor_protocol_event(QEVENT_GUEST_PANICED, NULL); >> +break; > > The event alone is not enough, because the mngt app may miss it (eg. the panic > happens before the mngt app connected to qemu). > > A simple way to solve this would be to also add a new RunState called > guest-panic and make the transition to it (besides sending the event). > > A more general way would be to model this after -drive's werror/rerror > options, > say guest-error=report|ignore|stop. When guest-error=stop, the mngt app will > get a STOP event and can check the VM runstate to know if it's guest-panic. OK, I will fix it. Thanks Wen Congyang > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PULL] kvm updates
Am 01.03.2012 13:47, schrieb Avi Kivity: > Avi Kivity (1): > pc-bios: update kvmvapic.bin > > Gleb Natapov (1): > kvm: Synchronize cpu state in kvm_arch_stop_on_emulation_error() > > Jan Kiszka (10): > kvm: Set cpu_single_env only once > Remove useless casts from cpu iterators > Process pending work while waiting for initial kick-off in TCG mode > Allow to use pause_all_vcpus from VCPU context > target-i386: Add infrastructure for reporting TPR MMIO accesses > kvmvapic: Add option ROM > kvmvapic: Introduce TPR access optimization for Windows guests > kvmvapic: Simplify mp/up_set_tpr > optionsrom: Reserve space for checksum > kvmvapic: Use optionrom helpers What about my kvmclock kvm_enabled() fix? Should I rather submit that as part of my qom-cpu queue? It depends on it. Andreas -- SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Bug 42829] KVM Guest with virtio network driver loses network connectivity
https://bugzilla.kernel.org/show_bug.cgi?id=42829 Steve changed: What|Removed |Added Kernel Version|3.2-rc3 |v3.0-rc5 -- Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email --- You are receiving this mail because: --- You are watching the assignee of the bug. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Bug 42829] KVM Guest with virtio network driver loses network connectivity
https://bugzilla.kernel.org/show_bug.cgi?id=42829 --- Comment #5 from Steve 2012-03-01 23:04:37 --- Results of my test: === In all test cases host configuration is the same: - kernel: latest 3.3-rc5+ compiler: gcc (4.7.0-pre 20120225 rev. 184573) glibc-2.14.1-r2 bridge-utils-1.5 iproute2-3.1.0 ethtool-3.2 other system stuff: all latest from their git repos util-linux, net-tools, kmod, udev, seabios, qemu-kvm In all test cases guest configuration except kernel is the same: compiler: gcc 4.6.2 glibc-2.14.1-r2 iproute2-3.1.0 ethtool-3.2 other system stuff: all latest from their git repos util-linux, net-tools, kmod, udev 1) guest kernel =v2.6.39 - works fine more than 5min 2) guest kernel =v3.0 - virtio network goes down in less then 1min of test 3) guest kernel latest v3.3-rc5+ - virtio network goes down in less then 1min of test I continue with tracking in kernel tags after v2.6.39: 4) guest kernel =v3.0-rc1 - works fine more than 5min 5) guest kernel =v3.0-rc5 - virtio network goes down in less then 1min of test 6) guest kernel =v3.0-rc3 - works fine more than 5min I shortly look at =v3.0-rc5, problem could be somewhere in this tag. Later I would like to locate problematic commit with git bisecting and after I report my results. -- Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email --- You are receiving this mail because: --- You are watching the assignee of the bug. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware
On 3/1/2012 5:36 AM, Jamal Hadi Salim wrote: > On Wed, 2012-02-29 at 10:19 -0800, John Fastabend wrote: > >>> >>> I want to see a unified API so that user space control applications (RSTP, >>> TRILL?) >>> can use one set of netlink calls for both software bridge and hardware >>> offloaded >>> bridges. Does this proposal meet that requirement? >>> > > I dont see any issues with those requirements being met. > >> Jamal, so why do "They have to be different calls"? I'm not so sure >> anymore... >> moving to RTM_FDB_XXXENTRY saved some refactoring in the bridge module but >> that >> is just cosmetic. > > I may not want to use the s/ware bridge i.e I may want to use h/ware > bridge. I may want to use both. So there are 3 variations there. You > need at least 1.5 bits to represent them if you are going to use the > same interface. There may be features in either h/ware but not in > s/ware and vice-versa. > A single interface with flags which say this applies to hware:sware:both > would be good, but it may be harder to achieve - thats why i suggested > they be different. > > cheers, > jamal > Hmm so I think what I'll do is this... both: ndm_flags = 0 sw : ndm_flags = NTF_SW_FDB hw : ndm_flags = NTF_HW_FDB Then current tools will work with embedded bridges and software bridges with the interesting case being when a port supporting an offloaded FDB is attached to a SW bridge. Doing both in this case seems to be a reasonable default to me. The tricky bit will be pulling the message handlers out of the ./net/bridge code so that we don't have to always load the bridge module to add entries to a macvlan for example. I need to look at a few other things today but I'll code up a patch for this tomorrow. .John -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware
On 3/1/2012 6:14 AM, Michael S. Tsirkin wrote: > On Wed, Feb 29, 2012 at 09:25:56AM -0800, John Fastabend wrote: >> Agreed. I think adding some ndo_ops for bridging offloads here would >> work. For example the DSA infrastructure and/or macvlan devices might >> need this. Along the lines of extending this RFC, >> >> [RFC] hardware bridging support for DSA switches >> http://patchwork.ozlabs.org/patch/16578/ >> >> >> .John > > One place where this might not work well would be > macvtap which is not a network device so it doesn't have > its own address, instead it inherits one from macvlan. > But is macvtap really doing any forwarding or implementing any RX filters? Took a quick scan and it looks like the forwarding logic is all in the macvlan code paths. In this case I suspect if we enable macvlan then any device built on top of it would work. .John -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Added functionality that allows dynamically add and remove device specific reset functions
On Thu, Mar 1, 2012 at 10:18 AM, wrote: > From b4cf24d5987475862de799c78773f13f25ed2af8 Mon Sep 17 00:00:00 2001 > From: Tadeusz Struk > Date: Tue, 17 Jan 2012 16:45:46 + > Subject: [PATCH] Added functionality that allows dynamically add and remove > device specific reset functions The subject line (after removing "[PATCH"]) becomes part of the permanent changelog, so please choose one that follows the typical style for the file. A good trick is to use "git log --oneline drivers/pci/pci.h" and make yours look like the others. > I have a use case where I need to cleanup resource allocated for Virtual > Functions after a guest OS that used it crashed. This cleanup needs to > be done before the VF is being FLRed. The only possible way to do this > seems to be by using pci_dev_specific_reset() function. Unfortunately > this function only works for devices defined in a static table in the > drivers/pci/quirks.c file. This patch changes it so that specific reset > functions can be added and deleted dynamically. > > Signed-off-by: Tadeusz Struk > > --- > drivers/pci/pci.h | 19 ++- > drivers/pci/quirks.c | 63 > ++ > 2 files changed, 71 insertions(+), 11 deletions(-) > > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index 1009a5e..10929d8 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -312,18 +312,35 @@ static inline resource_size_t > pci_resource_alignment(struct pci_dev *dev, > extern void pci_enable_acs(struct pci_dev *dev); > > struct pci_dev_reset_methods { > + struct list_head list; > u16 vendor; > u16 device; > int (*reset)(struct pci_dev *dev, int probe); > }; > > #ifdef CONFIG_PCI_QUIRKS > -extern int pci_dev_specific_reset(struct pci_dev *dev, int probe); > +extern int > +pci_dev_specific_reset(struct pci_dev *dev, int probe); > +extern int > +pci_dev_specific_reset_add(const struct pci_dev_reset_methods *reset_method); > +extern int > +pci_dev_specific_reset_remove(const struct pci_dev_reset_methods > *reset_method); Please keep the existing code style, namely, function return type, name, and arguments all on the same line (broken as needed to stay in 80 columns). The "extern" is superfluous but most of the rest of the file does use it, so this patch should, too. It would be nice if you added a second patch to remove *all* the "externs" in this file. > + > #else > static inline int pci_dev_specific_reset(struct pci_dev *dev, int probe) > { > return -ENOTTY; > } > +int > +pci_dev_specific_reset_add(const struct pci_dev_reset_methods *reset_method) Function name on same line as return type (several more instances below). > +{ > + return 0; > +} > +int > +pci_dev_specific_reset_remove(const struct pci_dev_reset_methods > *reset_method) > +{ > + return 0; > +} > #endif > > #endif /* DRIVERS_PCI_H */ > diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c > index 6476547..963e527 100644 > --- a/drivers/pci/quirks.c > +++ b/drivers/pci/quirks.c > @@ -3070,26 +3070,69 @@ static int reset_intel_82599_sfp_virtfn(struct > pci_dev *dev, int probe) > } > > #define PCI_DEVICE_ID_INTEL_82599_SFP_VF 0x10ed > +static struct pci_dev_reset_methods intel_82599_sfp_vf_reset = { > + .vendor = PCI_VENDOR_ID_INTEL, > + .device = PCI_DEVICE_ID_INTEL_82599_SFP_VF, > + .reset = reset_intel_82599_sfp_virtfn, > + .list = LIST_HEAD_INIT(intel_82599_sfp_vf_reset.list) I don't think you need to initialize the .list member here. You should be able to do that when linking the method into the list. > +} ; > > -static const struct pci_dev_reset_methods pci_dev_reset_methods[] = { > - { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82599_SFP_VF, > - reset_intel_82599_sfp_virtfn }, > - { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, > - reset_intel_generic_dev }, > - { 0 } > -}; > +static struct pci_dev_reset_methods intel_generic_reset = { > + .vendor = PCI_VENDOR_ID_INTEL, > + .device = PCI_ANY_ID, > + .reset = reset_intel_generic_dev, > + .list = LIST_HEAD_INIT(intel_generic_reset.list) > +} ; > + > +static DEFINE_SPINLOCK(reset_list_lock); > +static LIST_HEAD(reset_list); > + > + > +static int __init pci_dev_specific_reset_init(void) > +{ > + pci_dev_specific_reset_add(&intel_82599_sfp_vf_reset); > + pci_dev_specific_reset_add(&intel_generic_reset); I think this would be better as: for (i = 0; i < ARRAY_SIZE(pci_dev_reset_methods); i++) pci_dev_specific_reset_add(&pci_dev_reset_methods[i]); That way, new statically included methods could be added without touching this function. Also, I don't think the pci_dev_reset_methods[] definition would need to be changed at all for this patch (though I think you could remove the trailing "{ 0 }" sentinel). > + return 0; > +} > + > +late_initcall(pci_dev_specific_reset_init); > + > +int > +pci_dev_specific_re
[PATCH] Fix warning
>From 268c5427305d59c1f6f6c1ce8047f6e32a7edcac Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Mon, 14 Feb 2011 14:38:18 + Subject: [PATCH] Fixed warning This patch fixes the following warning. # virsh start fedora16-64 kernel: [ 133.324565] pci-stub :02:01.1: claimed by stub kernel: [ 134.163769] pci-stub :02:01.1: enabling device ( -> 0002) kernel: [ 164.282679] [ cut here ] kernel: [ 164.282685] WARNING: at drivers/pci/search.c:46 pci_find_upstream_pcie_bridge+0x87/0x9f() kernel: [ 164.282687] Hardware name: SandyBridge Platform kernel: [ 164.282689] Modules linked in: sha512_generic sha256_generic icp_qa_al(O) nfs fscache auth_rpcgss nfs_acl mga drm ip6table_filter ip6_tables ebtable_nat ebtables lockd ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack xt_CHECKSUM iptable_mangle tun bridge stp llc sunrpc btrfs zlib_deflate libcrc32c virtio_net kvm_intel kvm uinput matroxfb_base matroxfb_DAC1064 matroxfb_accel matroxfb_Ti3026 matroxfb_g450 g450_pll matroxfb_misc e1000e iTCO_wdt iTCO_vendor_support igb microcode i2c_i801 shpchp serio_raw i2c_core pcspkr dca [last unloaded: scsi_wait_scan] kernel: [ 164.282724] Pid: 1233, comm: qemu-kvm Tainted: G O 3.2.5 #10 kernel: [ 164.282726] Call Trace: kernel: [ 164.282732] [] warn_slowpath_common+0x83/0x9b kernel: [ 164.282735] [] warn_slowpath_null+0x1a/0x1c kernel: [ 164.282737] [] pci_find_upstream_pcie_bridge+0x87/0x9f kernel: [ 164.282741] [] domain_context_mapping+0x50/0xe6 kernel: [ 164.282744] [] domain_add_dev_info+0x44/0xe3 kernel: [ 164.282747] [] intel_iommu_attach_device+0x14f/0x15c kernel: [ 164.282750] [] iommu_attach_device+0x1c/0x1e kernel: [ 164.282764] [] kvm_assign_device+0x4a/0x114 [kvm] kernel: [ 164.282773] [] kvm_vm_ioctl_assigned_device+0x434/0xb25 [kvm] kernel: [ 164.282777] [] ? __do_fault+0x351/0x38b kernel: [ 164.282781] [] ? arch_local_irq_save+0x15/0x1b kernel: [ 164.282784] [] ? _raw_spin_unlock_irqrestore+0x17/0x19 kernel: [ 164.282787] [] ? pci_conf1_read+0xe1/0xee kernel: [ 164.282794] [] kvm_vm_ioctl+0x377/0x3ac [kvm] kernel: [ 164.282797] [] ? pci_read_config+0xa2/0x1bd kernel: [ 164.282801] [] ? virt_to_head_page+0xe/0x31 kernel: [ 164.282804] [] do_vfs_ioctl+0x45d/0x49e kernel: [ 164.282808] [] ? fsnotify_access+0x5f/0x67 kernel: [ 164.282811] [] sys_ioctl+0x56/0x7b kernel: [ 164.282814] [] system_call_fastpath+0x16/0x1b kernel: [ 164.282816] ---[ end trace 6a834ec5ac21cba8 ]--- Signed-off-by: Tadeusz Struk --- drivers/pci/search.c |7 +-- 1 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 9d75dc8..7847c6b 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -26,6 +26,7 @@ struct pci_dev * pci_find_upstream_pcie_bridge(struct pci_dev *pdev) { struct pci_dev *tmp = NULL; + struct pci_dev *vf = pdev; if (pci_is_pcie(pdev)) return NULL; @@ -40,8 +41,10 @@ pci_find_upstream_pcie_bridge(struct pci_dev *pdev) } /* PCI device should connect to a PCIe bridge */ if (pdev->pcie_type != PCI_EXP_TYPE_PCI_BRIDGE) { - /* Busted hardware? */ - WARN_ON_ONCE(1); + if (!vf->is_virtfn) { + /* Busted hardware? */ + WARN_ON_ONCE(1); + } return NULL; } return pdev; -- 1.7.7.6 -- Intel Shannon Limited Registered in Ireland Registered Office: Collinstown Industrial Park, Leixlip, County Kildare Registered Number: 308263 Business address: Dromore House, East Park, Shannon, Co. Clare This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). Any review or distribution by others is strictly prohibited. If you are not the intended recipient, please contact the sender and delete all copies. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Added functionality that allows dynamically add and remove device specific reset functions
>From b4cf24d5987475862de799c78773f13f25ed2af8 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Tue, 17 Jan 2012 16:45:46 + Subject: [PATCH] Added functionality that allows dynamically add and remove device specific reset functions I have a use case where I need to cleanup resource allocated for Virtual Functions after a guest OS that used it crashed. This cleanup needs to be done before the VF is being FLRed. The only possible way to do this seems to be by using pci_dev_specific_reset() function. Unfortunately this function only works for devices defined in a static table in the drivers/pci/quirks.c file. This patch changes it so that specific reset functions can be added and deleted dynamically. Signed-off-by: Tadeusz Struk --- drivers/pci/pci.h| 19 ++- drivers/pci/quirks.c | 63 ++ 2 files changed, 71 insertions(+), 11 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1009a5e..10929d8 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -312,18 +312,35 @@ static inline resource_size_t pci_resource_alignment(struct pci_dev *dev, extern void pci_enable_acs(struct pci_dev *dev); struct pci_dev_reset_methods { + struct list_head list; u16 vendor; u16 device; int (*reset)(struct pci_dev *dev, int probe); }; #ifdef CONFIG_PCI_QUIRKS -extern int pci_dev_specific_reset(struct pci_dev *dev, int probe); +extern int +pci_dev_specific_reset(struct pci_dev *dev, int probe); +extern int +pci_dev_specific_reset_add(const struct pci_dev_reset_methods *reset_method); +extern int +pci_dev_specific_reset_remove(const struct pci_dev_reset_methods *reset_method); + #else static inline int pci_dev_specific_reset(struct pci_dev *dev, int probe) { return -ENOTTY; } +int +pci_dev_specific_reset_add(const struct pci_dev_reset_methods *reset_method) +{ + return 0; +} +int +pci_dev_specific_reset_remove(const struct pci_dev_reset_methods *reset_method) +{ + return 0; +} #endif #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6476547..963e527 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3070,26 +3070,69 @@ static int reset_intel_82599_sfp_virtfn(struct pci_dev *dev, int probe) } #define PCI_DEVICE_ID_INTEL_82599_SFP_VF 0x10ed +static struct pci_dev_reset_methods intel_82599_sfp_vf_reset = { + .vendor = PCI_VENDOR_ID_INTEL, + .device = PCI_DEVICE_ID_INTEL_82599_SFP_VF, + .reset = reset_intel_82599_sfp_virtfn, + .list = LIST_HEAD_INIT(intel_82599_sfp_vf_reset.list) +} ; -static const struct pci_dev_reset_methods pci_dev_reset_methods[] = { - { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82599_SFP_VF, -reset_intel_82599_sfp_virtfn }, - { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, - reset_intel_generic_dev }, - { 0 } -}; +static struct pci_dev_reset_methods intel_generic_reset = { + .vendor = PCI_VENDOR_ID_INTEL, + .device = PCI_ANY_ID, + .reset = reset_intel_generic_dev, + .list = LIST_HEAD_INIT(intel_generic_reset.list) +} ; + +static DEFINE_SPINLOCK(reset_list_lock); +static LIST_HEAD(reset_list); + + +static int __init pci_dev_specific_reset_init(void) +{ + pci_dev_specific_reset_add(&intel_82599_sfp_vf_reset); + pci_dev_specific_reset_add(&intel_generic_reset); + return 0; +} + +late_initcall(pci_dev_specific_reset_init); + +int +pci_dev_specific_reset_add(const struct pci_dev_reset_methods *reset_method) +{ + if (reset_method && reset_method->reset) { + spin_lock(&reset_list_lock); + list_add((struct list_head *)&reset_method->list, +&reset_list); + spin_unlock(&reset_list_lock); + return 0; + } + return -EINVAL; +} +EXPORT_SYMBOL(pci_dev_specific_reset_add); + +int +pci_dev_specific_reset_remove(const struct pci_dev_reset_methods *reset_method) +{ + if (reset_method) { + spin_lock(&reset_list_lock); + list_del((struct list_head *)(&reset_method->list)); + spin_unlock(&reset_list_lock); + return 0; + } + return -EINVAL; +} +EXPORT_SYMBOL(pci_dev_specific_reset_remove); int pci_dev_specific_reset(struct pci_dev *dev, int probe) { const struct pci_dev_reset_methods *i; - - for (i = pci_dev_reset_methods; i->reset; i++) { + list_for_each_entry(i, &reset_list, list) { if ((i->vendor == dev->vendor || i->vendor == (u16)PCI_ANY_ID) && (i->device == dev->device || i->device == (u16)PCI_ANY_ID)) return i->reset(dev, probe); } - return -ENOTTY; } -- 1.7.7.6 -- Intel Shannon Limited Registered in Ireland Registered Office: Collin
Re: [Qemu-devel] [PULL] Urgent memory fix for kvm with unaligned memory slots
On 03/01/2012 07:08 PM, Eric Blake wrote: > On 03/01/2012 10:03 AM, Avi Kivity wrote: > >>> > >>> -ram = memory_region_get_ram_ptr(mr) + section->offset_within_region; > >>> +ram = memory_region_get_ram_ptr(mr) + section->offset_within_region > >>> + delta; > >> > >> Am I crazy, or does this look wrong? > > > > Could be both. Why do you thing it is wrong? > > Line wrapping makes it look like we are adding two lines, one line > ending in 'section->offset_within_region', and the next line starting > with 'delta;', which is a syntax error. > > But without line wrapping, we are adding just one line with > 'offset_within_region + delta;' at the end of that line. > Ah, of course. I just copy/pasted this into thunderbird, this is not a proper patch but a pull request. Sorry about the confusion. Bobby: so it does indeed look wrong. This says nothing about your sanity though, for that consult a qualified professional instead of asking on the mailing list. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PULL] Urgent memory fix for kvm with unaligned memory slots
On Thu, Mar 1, 2012 at 12:08 PM, Eric Blake wrote: > On 03/01/2012 10:03 AM, Avi Kivity wrote: - ram = memory_region_get_ram_ptr(mr) + section->offset_within_region; + ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + delta; >>> >>> Am I crazy, or does this look wrong? >> >> Could be both. Why do you thing it is wrong? > > Line wrapping makes it look like we are adding two lines, one line > ending in 'section->offset_within_region', and the next line starting > with 'delta;', which is a syntax error. yea, patch line wrapping was making my eyes see things. okay, sorry for the noise. > > But without line wrapping, we are adding just one line with > 'offset_within_region + delta;' at the end of that line. > > -- > Eric Blake ebl...@redhat.com +1-919-301-3266 > Libvirt virtualization library http://libvirt.org > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PULL] Urgent memory fix for kvm with unaligned memory slots
On 03/01/2012 10:03 AM, Avi Kivity wrote: >>> >>> -ram = memory_region_get_ram_ptr(mr) + section->offset_within_region; >>> +ram = memory_region_get_ram_ptr(mr) + section->offset_within_region >>> + delta; >> >> Am I crazy, or does this look wrong? > > Could be both. Why do you thing it is wrong? Line wrapping makes it look like we are adding two lines, one line ending in 'section->offset_within_region', and the next line starting with 'delta;', which is a syntax error. But without line wrapping, we are adding just one line with 'offset_within_region + delta;' at the end of that line. -- Eric Blake ebl...@redhat.com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [PULL] Urgent memory fix for kvm with unaligned memory slots
On 03/01/2012 06:51 PM, Bobby Powers wrote: > > /* kvm works in page size chunks, but the function may be called > >with sub-page size and unaligned start address. */ > > -size = TARGET_PAGE_ALIGN(size); > > -start_addr = TARGET_PAGE_ALIGN(start_addr); > > +delta = TARGET_PAGE_ALIGN(size) - size; > > +if (delta > size) { > > +return; > > +} > > +start_addr += delta; > > +size -= delta; > > +size &= TARGET_PAGE_MASK; > > +if (!size || (start_addr & ~TARGET_PAGE_MASK)) { > > +return; > > +} > > > > if (!memory_region_is_ram(mr)) { > > return; > > } > > > > -ram = memory_region_get_ram_ptr(mr) + section->offset_within_region; > > +ram = memory_region_get_ram_ptr(mr) + section->offset_within_region > > + delta; > > Am I crazy, or does this look wrong? Could be both. Why do you thing it is wrong? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH]qemu: deal with guest paniced event
On Mon, 27 Feb 2012 11:05:58 +0800 Wen Congyang wrote: > When the host knows the guest is paniced, it will set > exit_reason to KVM_EXIT_GUEST_PANIC. So if qemu receive > this exit_reason, we can send a event to tell management > application that the guest is paniced. > > Signed-off-by: Wen Congyang > --- > kvm-all.c |3 +++ > linux-headers/linux/kvm.h |1 + > monitor.c |3 +++ > monitor.h |1 + > 4 files changed, 8 insertions(+), 0 deletions(-) > > diff --git a/kvm-all.c b/kvm-all.c > index c4babda..ae428ab 100644 > --- a/kvm-all.c > +++ b/kvm-all.c > @@ -1190,6 +1190,9 @@ int kvm_cpu_exec(CPUState *env) > (uint64_t)run->hw.hardware_exit_reason); > ret = -1; > break; > +case KVM_EXIT_GUEST_PANIC: > +monitor_protocol_event(QEVENT_GUEST_PANICED, NULL); > +break; The event alone is not enough, because the mngt app may miss it (eg. the panic happens before the mngt app connected to qemu). A simple way to solve this would be to also add a new RunState called guest-panic and make the transition to it (besides sending the event). A more general way would be to model this after -drive's werror/rerror options, say guest-error=report|ignore|stop. When guest-error=stop, the mngt app will get a STOP event and can check the VM runstate to know if it's guest-panic. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PULL] Urgent memory fix for kvm with unaligned memory slots
On Thu, Mar 1, 2012 at 8:40 AM, Avi Kivity wrote: > The memory core may generate RAM memory regions that are not page > aligned, but the kvm code is not prepared to handle them well and will > abort under certain conditions. This patch fixes the problem. > > Please pull from: > > git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git memory/urgent > > > Avi Kivity (1): > kvm: fix unaligned slots > > kvm-all.c | 15 --- > 1 files changed, 12 insertions(+), 3 deletions(-) > > diff --git a/kvm-all.c b/kvm-all.c > index c4babda..4b7a4ae 100644 > --- a/kvm-all.c > +++ b/kvm-all.c > @@ -541,17 +541,26 @@ static void kvm_set_phys_mem(MemoryRegionSection > *section, bool add) > target_phys_addr_t start_addr = section->offset_within_address_space; > ram_addr_t size = section->size; > void *ram = NULL; > + unsigned delta; > > /* kvm works in page size chunks, but the function may be called > with sub-page size and unaligned start address. */ > - size = TARGET_PAGE_ALIGN(size); > - start_addr = TARGET_PAGE_ALIGN(start_addr); > + delta = TARGET_PAGE_ALIGN(size) - size; > + if (delta > size) { > + return; > + } > + start_addr += delta; > + size -= delta; > + size &= TARGET_PAGE_MASK; > + if (!size || (start_addr & ~TARGET_PAGE_MASK)) { > + return; > + } > > if (!memory_region_is_ram(mr)) { > return; > } > > - ram = memory_region_get_ram_ptr(mr) + section->offset_within_region; > + ram = memory_region_get_ram_ptr(mr) + section->offset_within_region > + delta; Am I crazy, or does this look wrong? > > while (1) { > mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + > size); > > -- > error compiling committee.c: too many arguments to function > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4] KVM: Resize kvm_io_range array dynamically
On Thu, 2012-03-01 at 12:14 +0200, Sasha Levin wrote: > On Thu, Mar 1, 2012 at 9:01 AM, Amos Kong wrote: > > This patch makes the kvm_io_range array can be resized dynamically. > > Set an upper bounds for kvm_io_range to limit userspace. > > 1000 is a very large limit and not bloat the typical user. > > Whats the reason for making everything dynamic? Memory savings there > aren't that significant. We're currently looking at about 14k for these arrays with 300 entries since we have two of them. If we're going to double it or triple it to handle the maximum use case, why impose that on the typical VM? It may not be multiple megabytes, but I wouldn't say it's insignificant either. > > If you want to make it more efficient just define: > static struct kvm_io_bus io_bus[2]; > > somewhere in kvm_main.c, and just switch between them when you need to > do insertion and removal of devices. You get the benefit of zero slub > usage, no allocations in any of the paths, and much simpler logic. It's updated via rcu. The change Amos is proposing is fairly trivial; allocate the necessary size and memcpy instead of memdup. Maybe it can be optimized further, but this seems like a step in the right direction to handle worst case use and, if anything, benefit the typical user too. Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Bug 42829] KVM Guest with virtio network driver loses network connectivity
https://bugzilla.kernel.org/show_bug.cgi?id=42829 --- Comment #4 from Steve 2012-03-01 15:22:53 --- Thank you for information. On host I used last updated: kernel from: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git qemu-kvm from: git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git seabios from: git://git.seabios.org/seabios.git used compiler, libs & other stuff: gcc-4.7.0 (feb 2012) glibc-2.14.1-r2 bridge-utils-1.5 iproute2-3.1.0 ethtool-3.2 On guest I can proceed this: 1) test case one - kernel before https://bugzilla.kernel.org/userprefs.cgi?tab=email --- You are receiving this mail because: --- You are watching the assignee of the bug. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
live migration between amd fam15h-fam10h
Hi, I am getting a frozen guest when migrating from an Opteron 6274 host (amd fam15h) to an Opteron 6174 host (amd fam10h). The live migration completes succesfully, but the guest is frozen: vcn screen is still there, but no input is possible and no kernel output is seen. Trying "c" on the qemu-monitor does not help. I am using "-cpu Opteron_G3" which I assumed would be ok for both host cpus. In the opposite direction (migrating from an amd fam10h host to an amdfam15h host) the guest continues to run on the destination. However, on most of these successfull live migrations, I notice a "clocksource unstable" message on the guest kernel (using the default kvm-clock clocksource) e.g. Clocksource tsc unstable (delta = -1500533439 ns) Same situation (guest runs on destination with clocksource unstable message) happens when migrating between fam15h hosts (I have not tried between fam10h hosts) Changing the clocksource (tsc, acpi_pm, hpet) does not solve the issue. Also tried with "-cpu kvm64" with same result. qemu-kvm version: 0.15.1, 1.0 or qemu-kvm/master Host kernel: 3.0.15 (on both hosts) Guest kernel: 3.0.6 or 3.2 this is the qemu-kvm command line used on the source host: " kvm -enable-kvm -m 1024 -smp 1 -cpu Opteron_G3,check -drive \ file=/opt/test.img,if=none,id=drive-virtio-disk1,format=raw,cache=writethrough,boot=on -device virtio-blk-pci,bus=pci.0,addr=0x5,drive=drive-virtio-disk1,id=virtio-disk1 -monitor stdio -vnc 0.0.0.0:6 -vga std -chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0 -usb -device usb-tablet,id=input0 " The destination host has the same command line with an added "-incoming tcp:". I have mainly tested this with non-shared storage (but also shared storage has the same result). Migration is triggered with "migrate -b tcp:destip:" Do the TSC microarchitecture changes in amdfam15h (see AMD SW optimiization guide for fam15h, 47414 Rev 3.02 Appendix E) affect pvclock stability on migration in same family or across families? cpuid information follows in case it's helpful. 6274 host: eax ineax ebx ecx edx 000d 68747541 444d4163 69746e65 0001 00600f12 02100800 1e98220b 178bfbff 0002 0003 0004 0005 0040 0040 0003 0006 0001 0007 0008 0009 000a 000b 000c 000d 8000 801e 68747541 444d4163 69746e65 8001 00600f12 3000 01c9bfff 2fd3fbff 8002 20444d41 6574704f 286e6f72 20294d54 8003 636f7250 6f737365 32362072 20203437 8004 20202020 20202020 20202020 00202020 8005 ff20ff18 ff20ff30 10040140 40020140 8006 6400 64004200 08008140 0060e140 8007 03d9 8008 3030 500f 8009 800a 0001 0001 14ff 800b 800c 800d 800e 800f 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 f020f018 6400 801a 0003 801b 00ff 801c 80032013 00010200 800f 801d 801e 0022 0101 0100 Vendor ID: "AuthenticAMD"; CPUID level 13 AMD-specific functions Version 00600f12: Family: 15 Model: 1 [] Standard feature flags 178bfbff: Floating Point Unit Virtual Mode Extensions Debugging Extensions Page Size Extensions Time Stamp Counter (with RDTSC and CR4 disable bit) Model Specific Registers with RDMSR & WRMSR PAE - Page Address Extensions Machine Check Exception COMPXCHG8B Instruction APIC SYSCALL/SYSRET or SYSENTER/SYSEXIT instructions MTRR - Memory Type Range Registers Global paging extension Machine Check Architecture Conditional Move Instruction PAT - Page Attribute Table PSE-36 - Page Size Extensions 19 - reserved MMX instructions FXSAVE/FXRSTOR 25 - reserved 26 - reserved 28 - reserved Generation: 15 Model: 1 Extended feature flags 2fd3fbff: Floating Point Unit Virtual Mode Extensions Debugging Extensions Page Size Ext
[Bug 42829] KVM Guest with virtio network driver loses network connectivity
https://bugzilla.kernel.org/show_bug.cgi?id=42829 Kurk changed: What|Removed |Added CC||k...@shiftmail.org --- Comment #3 from Kurk 2012-03-01 14:16:36 --- Hello, I am pretty sure I read about a very similar bug report recently, dunno if that was on kvm or qemu mailing list, or in a bugzilla entry like this. The guy said that there was a reproducer: running a network test between two affected virtual machines. Maybe that was netperf, not sure. Would reproduce the bug in a short time. He said kernel 3.0 (GUEST!) was the first he tested to have this problem, while much older guest kernels did not have this problem. I suggest Steve tries this reproducer, and tell if it works in his case. If that works, debugging would be much faster. -- Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email --- You are receiving this mail because: --- You are watching the assignee of the bug. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware
On Wed, Feb 29, 2012 at 09:25:56AM -0800, John Fastabend wrote: > Agreed. I think adding some ndo_ops for bridging offloads here would > work. For example the DSA infrastructure and/or macvlan devices might > need this. Along the lines of extending this RFC, > > [RFC] hardware bridging support for DSA switches > http://patchwork.ozlabs.org/patch/16578/ > > > .John One place where this might not work well would be macvtap which is not a network device so it doesn't have its own address, instead it inherits one from macvlan. -- MST -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PULL] Urgent memory fix for kvm with unaligned memory slots
The memory core may generate RAM memory regions that are not page aligned, but the kvm code is not prepared to handle them well and will abort under certain conditions. This patch fixes the problem. Please pull from: git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git memory/urgent Avi Kivity (1): kvm: fix unaligned slots kvm-all.c | 15 --- 1 files changed, 12 insertions(+), 3 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index c4babda..4b7a4ae 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -541,17 +541,26 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add) target_phys_addr_t start_addr = section->offset_within_address_space; ram_addr_t size = section->size; void *ram = NULL; +unsigned delta; /* kvm works in page size chunks, but the function may be called with sub-page size and unaligned start address. */ -size = TARGET_PAGE_ALIGN(size); -start_addr = TARGET_PAGE_ALIGN(start_addr); +delta = TARGET_PAGE_ALIGN(size) - size; +if (delta > size) { +return; +} +start_addr += delta; +size -= delta; +size &= TARGET_PAGE_MASK; +if (!size || (start_addr & ~TARGET_PAGE_MASK)) { +return; +} if (!memory_region_is_ram(mr)) { return; } -ram = memory_region_get_ram_ptr(mr) + section->offset_within_region; +ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + delta; while (1) { mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size); -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Add -netdev to man page
On 03/01/2012 02:51 PM, Miroslav Rezanina wrote: > There's missing -netdev description in the man page for qemu. As this is > recommended way to create network backend, lack of documentation can > discourage > its usage. > > -net option is preserved but marked as obsolete way. Please post to qemu-de...@nongnu.org instead. > "socket],id=str[,option][,option][,...]\n", QEMU_ARCH_ALL) > STEXI > +@item -netdev user|tap|vde|socket,id=@var{str}[,option][,option][,...] > + > +Create a new network backend to the guest. "for the guest"? But they're actually not for the guest, at least not directly. > Network's id "The backend's id" or "network backend identifiers" > can be used with > +the -device option to "plug" a particular network device into the network > +backend, e.g. "to connect a backend with a device"? > + > +@example > +-netdev user,id=mynet -device e1000,netdev=mynet > +@end example > + > +You can use following types of backend: > + > +@table @option > + > +@item -netdev user > + > +User networking is default network backend. This backend does not require > root > +priviledges, does not allow ICMP trafic and host is not directly accessible > +from the host or the external network. > + > +Valid options are: > + > +@table @option > + > +@item restrict=y|yes|n|no > +If this options is enabled, the guest will be isolated, i.e. it will not be > +able to contact the host and no guest IP packets will be routed over the host > +to the outside. This option does not affect explicitly set forwarding rule. "rules" > + > +@item net=@var{addr}[/@var{mask}] > +Set IP network address the guest will see. Optionally specify the netmask, "the IP" > +either in the form a.b.c.d or as number of valid top-most bits. Default is > +10.0.2.0/8. > + > +@item host=@var{addr} > +Specify the guest-visible address of the host. Default is the 2nd IP in the > +guest network, i.e. x.x.x.2. > + > +@item hostname=@var{name} > +Specifies the client hostname reported by the builtin DHCP server. > + > +@item dhcpstart=@var{addr} > +Specify the first of the 16 IPs the built-in DHCP server can assign. Default > +is the 16th to 31st IP in the guest network, i.e. x.x.x.16 to x.x.x.31. > + > +@item dns=@var{addr} > +Specify the guest-visible address of the virtual nameserver. The address must > +be different from the host address. Default is the 3rd IP in the guest > network, > +i.e. x.x.x.3. > + > +@item tftp=@var{dir} > +When using the user mode network stack, activate a built-in TFTP > +server. The files in @var{dir} will be exposed as the root of a TFTP server. > +The TFTP client on the guest must be configured in binary mode (use the > command > +@code{bin} of the Unix TFTP client). > + > +@item bootfile=@var{file} > +When using the user mode network stack, broadcast @var{file} as the BOOTP > +filename. In conjunction with @option{tftp}, this can be used to network boot > +a guest from a local directory. > + > +Example (using pxelinux): > +@example > +qemu -hda linux.img -boot n -netdev > user,id=netid,tftp=/path/to/tftp/files,bootfile=/pxelinux.0 > +@end example > + > +@item smb=@var{dir}[,smbserver=@var{addr}] > +When using the user mode network stack, activate a built-in SMB > +server so that Windows OSes can access to the host files in @file{@var{dir}} > +transparently. The IP address of the SMB server can be set to @var{addr}. By > +default the 4th IP in the guest network is used, i.e. x.x.x.4. > + > +In the guest Windows OS, the line: > +@example > +10.0.2.4 smbserver > +@end example > +must "may"? > be added in the file @file{C:\WINDOWS\LMHOSTS} (for windows 9x/Me) > +or @file{C:\WINNT\SYSTEM32\DRIVERS\ETC\LMHOSTS} (Windows NT/2000). > + > +Then @file{@var{dir}} can be accessed in @file{\\smbserver\qemu}. > + > +Note that a SAMBA server must be installed on the host OS in > +@file{/usr/sbin/smbd}. QEMU was tested successfully with smbd versions from > +Red Hat 9, Fedora Core 3 and OpenSUSE 11.x. > + > +@item > hostfwd=[tcp|udp]:[@var{hostaddr}]:@var{hostport}-[@var{guestaddr}]:@var{guestport} > +Redirect incoming TCP or UDP connections to the host port @var{hostport} to > +the guest IP address @var{guestaddr} on guest port @var{guestport}. If > +@var{guestaddr} is not specified, its value is x.x.x.15 (default first > address > +given by the built-in DHCP server). By specifying @var{hostaddr}, the rule > can > +be bound to a specific host interface. If no connection type is set, TCP is > +used. This option can be given multiple times. > + > +For example, to redirect host X11 connection from screen 1 to guest > +screen 0, use the following: > + > +@example > +# on the host > +qemu -netdev user,id=netid,hostfwd=tcp:127.0.0.1:6001-:6000 [...] > +# this host xterm should open in the guest X11 server > +xterm -display :1 > +@end example > + > +To redirect telnet connections from host port to telnet port on > +the guest, use the following: > + > +@example > +# on the host > +qemu -netdev user,id=netid,
Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware
On Wed, 2012-02-29 at 10:19 -0800, John Fastabend wrote: > > > > I want to see a unified API so that user space control applications (RSTP, > > TRILL?) > > can use one set of netlink calls for both software bridge and hardware > > offloaded > > bridges. Does this proposal meet that requirement? > > I dont see any issues with those requirements being met. > Jamal, so why do "They have to be different calls"? I'm not so sure anymore... > moving to RTM_FDB_XXXENTRY saved some refactoring in the bridge module but > that > is just cosmetic. I may not want to use the s/ware bridge i.e I may want to use h/ware bridge. I may want to use both. So there are 3 variations there. You need at least 1.5 bits to represent them if you are going to use the same interface. There may be features in either h/ware but not in s/ware and vice-versa. A single interface with flags which say this applies to hware:sware:both would be good, but it may be harder to achieve - thats why i suggested they be different. cheers, jamal -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware
On Wed, 2012-02-29 at 09:25 -0800, John Fastabend wrote: > Well I think NETLINK_ROUTE is the most correct type to use in this > case. Per netlink.h its for routing and device hooks. > > #define NETLINK_ROUTE 0 /* Routing/device hook > */ > > And NETLINK_ROUTE msg_types use the RTM_* prefix. The _*NEIGH postfix > were merely a copy from the SW BRIDGE code paths. How about, > > PF_BRIDGE:RTM_FDB_NEWENTRY > PF_BRIDGE:RTM_FDB_DELENTRY > PF_BRIDGE:RTM_FDB_GETENTRY OK, I guess ;-> > And a new group RTNLGRP_FDB. Nod. > Also using NETLINK_ROUTE gives the correct > rtnl locking semantics for free. makes sense. > Agreed. I think adding some ndo_ops for bridging offloads here would > work. For example the DSA infrastructure and/or macvlan devices might > need this. Along the lines of extending this RFC, > > [RFC] hardware bridging support for DSA switches > http://patchwork.ozlabs.org/patch/16578/ Certainly - thats one approach that is reasonable. Where is Lennert? ;-> I changed his email address to one that i am familiar with. cheers, jamal -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Add -netdev to man page
There's missing -netdev description in the man page for qemu. As this is recommended way to create network backend, lack of documentation can discourage its usage. -net option is preserved but marked as obsolete way. Signed-off-by: Miroslav Rezanina --- qemu-options.hx | 261 +++ 1 files changed, 261 insertions(+), 0 deletions(-) diff --git a/qemu-options.hx b/qemu-options.hx index b129996..6e7835a 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1292,6 +1292,267 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev, #endif "socket],id=str[,option][,option][,...]\n", QEMU_ARCH_ALL) STEXI +@item -netdev user|tap|vde|socket,id=@var{str}[,option][,option][,...] + +Create a new network backend to the guest. Network's id can be used with +the -device option to "plug" a particular network device into the network +backend, e.g. + +@example +-netdev user,id=mynet -device e1000,netdev=mynet +@end example + +You can use following types of backend: + +@table @option + +@item -netdev user + +User networking is default network backend. This backend does not require root +priviledges, does not allow ICMP trafic and host is not directly accessible +from the host or the external network. + +Valid options are: + +@table @option + +@item restrict=y|yes|n|no +If this options is enabled, the guest will be isolated, i.e. it will not be +able to contact the host and no guest IP packets will be routed over the host +to the outside. This option does not affect explicitly set forwarding rule. + +@item net=@var{addr}[/@var{mask}] +Set IP network address the guest will see. Optionally specify the netmask, +either in the form a.b.c.d or as number of valid top-most bits. Default is +10.0.2.0/8. + +@item host=@var{addr} +Specify the guest-visible address of the host. Default is the 2nd IP in the +guest network, i.e. x.x.x.2. + +@item hostname=@var{name} +Specifies the client hostname reported by the builtin DHCP server. + +@item dhcpstart=@var{addr} +Specify the first of the 16 IPs the built-in DHCP server can assign. Default +is the 16th to 31st IP in the guest network, i.e. x.x.x.16 to x.x.x.31. + +@item dns=@var{addr} +Specify the guest-visible address of the virtual nameserver. The address must +be different from the host address. Default is the 3rd IP in the guest network, +i.e. x.x.x.3. + +@item tftp=@var{dir} +When using the user mode network stack, activate a built-in TFTP +server. The files in @var{dir} will be exposed as the root of a TFTP server. +The TFTP client on the guest must be configured in binary mode (use the command +@code{bin} of the Unix TFTP client). + +@item bootfile=@var{file} +When using the user mode network stack, broadcast @var{file} as the BOOTP +filename. In conjunction with @option{tftp}, this can be used to network boot +a guest from a local directory. + +Example (using pxelinux): +@example +qemu -hda linux.img -boot n -netdev user,id=netid,tftp=/path/to/tftp/files,bootfile=/pxelinux.0 +@end example + +@item smb=@var{dir}[,smbserver=@var{addr}] +When using the user mode network stack, activate a built-in SMB +server so that Windows OSes can access to the host files in @file{@var{dir}} +transparently. The IP address of the SMB server can be set to @var{addr}. By +default the 4th IP in the guest network is used, i.e. x.x.x.4. + +In the guest Windows OS, the line: +@example +10.0.2.4 smbserver +@end example +must be added in the file @file{C:\WINDOWS\LMHOSTS} (for windows 9x/Me) +or @file{C:\WINNT\SYSTEM32\DRIVERS\ETC\LMHOSTS} (Windows NT/2000). + +Then @file{@var{dir}} can be accessed in @file{\\smbserver\qemu}. + +Note that a SAMBA server must be installed on the host OS in +@file{/usr/sbin/smbd}. QEMU was tested successfully with smbd versions from +Red Hat 9, Fedora Core 3 and OpenSUSE 11.x. + +@item hostfwd=[tcp|udp]:[@var{hostaddr}]:@var{hostport}-[@var{guestaddr}]:@var{guestport} +Redirect incoming TCP or UDP connections to the host port @var{hostport} to +the guest IP address @var{guestaddr} on guest port @var{guestport}. If +@var{guestaddr} is not specified, its value is x.x.x.15 (default first address +given by the built-in DHCP server). By specifying @var{hostaddr}, the rule can +be bound to a specific host interface. If no connection type is set, TCP is +used. This option can be given multiple times. + +For example, to redirect host X11 connection from screen 1 to guest +screen 0, use the following: + +@example +# on the host +qemu -netdev user,id=netid,hostfwd=tcp:127.0.0.1:6001-:6000 [...] +# this host xterm should open in the guest X11 server +xterm -display :1 +@end example + +To redirect telnet connections from host port to telnet port on +the guest, use the following: + +@example +# on the host +qemu -netdev user,id=netid,hostfwd=tcp:::23 [...] +telnet localhost +@end example + +Then when you use on the host @code{telnet localhost }, you +connect to the guest telnet server. + +@item guestfwd=[tcp]:@var{s
[PULL] kvm updates
This batch of updates is mostly Jan's rework of qemu-kvm's TPR optimization for 32-bit Windows, making Windows XP much faster with kvm enabled on older Intel and any AMD hardware. Any similarities to the original hack are purely coincidental. Please pull from: git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git uq/master Avi Kivity (1): pc-bios: update kvmvapic.bin Gleb Natapov (1): kvm: Synchronize cpu state in kvm_arch_stop_on_emulation_error() Jan Kiszka (10): kvm: Set cpu_single_env only once Remove useless casts from cpu iterators Process pending work while waiting for initial kick-off in TCG mode Allow to use pause_all_vcpus from VCPU context target-i386: Add infrastructure for reporting TPR MMIO accesses kvmvapic: Add option ROM kvmvapic: Introduce TPR access optimization for Windows guests kvmvapic: Simplify mp/up_set_tpr optionsrom: Reserve space for checksum kvmvapic: Use optionrom helpers .gitignore|1 + Makefile |2 +- Makefile.target |3 +- cpu-all.h |3 +- cpus.c| 26 ++- hw/apic.c | 126 ++- hw/apic.h |2 + hw/apic_common.c | 69 - hw/apic_internal.h| 27 ++ hw/kvm/apic.c | 32 ++ hw/kvmvapic.c | 805 + hw/mc146818rtc.c |5 +- kvm-all.c |5 - pc-bios/kvmvapic.bin | Bin 0 -> 9216 bytes pc-bios/optionrom/Makefile|2 +- pc-bios/optionrom/kvmvapic.S | 335 + pc-bios/optionrom/optionrom.h |3 +- target-i386/cpu.h | 10 + target-i386/helper.c | 16 + target-i386/kvm.c | 26 ++- 20 files changed, 1468 insertions(+), 30 deletions(-) create mode 100644 hw/kvmvapic.c create mode 100755 pc-bios/kvmvapic.bin create mode 100644 pc-bios/optionrom/kvmvapic.S -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor
On Thu, 1 Mar 2012, Russell King - ARM Linux wrote: > On Thu, Mar 01, 2012 at 10:27:02AM +, Dave Martin wrote: > > So, where there's a compelling reason to inline these things, we can use > > the existing techniques if we're alert to the risks. But in cases where > > there isn't a compelling reason, aren't we just inviting fragility > > unnecessarily? > > The practical experience from the kernel suggests that there isn't a > problem - that's not to say that future versions of gcc won't become > a problem, and that the compiler guys may refuse to fix it. > > I think it's a feature that we should be pressing gcc guys for - it's > fairly fundamental to any programming which requires interfaces that > require certain args in certain registers, or receive results in > certain registers. > > The options over this are basically: > 1. refusing to upgrade to any version of gcc which does not allow >registers-in-asm > 2. doing the store-to-memory reload-in-asm thing > 3. hand-coding veneers for every call to marshall the registers > > Each of those has its down sides, but I suspect with (1), it may be > possible to have enough people applying pressure to the compiler guys > that they finally see sense on this matter. I tend to have a very practical approach about this sort of issues, so I am tempted to go with 1) if you agree. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Bug 42829] KVM Guest with virtio network driver loses network connectivity
https://bugzilla.kernel.org/show_bug.cgi?id=42829 Steve changed: What|Removed |Added Kernel Version|3.2 |3.2-rc3 --- Comment #2 from Steve 2012-03-01 10:46:09 --- Rusty, could you have a look at, please ? Thank you for your time. -- Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email --- You are receiving this mail because: --- You are watching the assignee of the bug. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor
On Thu, Mar 01, 2012 at 10:27:02AM +, Dave Martin wrote: > So, where there's a compelling reason to inline these things, we can use > the existing techniques if we're alert to the risks. But in cases where > there isn't a compelling reason, aren't we just inviting fragility > unnecessarily? The practical experience from the kernel suggests that there isn't a problem - that's not to say that future versions of gcc won't become a problem, and that the compiler guys may refuse to fix it. I think it's a feature that we should be pressing gcc guys for - it's fairly fundamental to any programming which requires interfaces that require certain args in certain registers, or receive results in certain registers. The options over this are basically: 1. refusing to upgrade to any version of gcc which does not allow registers-in-asm 2. doing the store-to-memory reload-in-asm thing 3. hand-coding veneers for every call to marshall the registers Each of those has its down sides, but I suspect with (1), it may be possible to have enough people applying pressure to the compiler guys that they finally see sense on this matter. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] KVM: Remove unused dirty_bitmap_head and nr_dirty_pages
Now that we do neither double buffering nor heuristic selection of the write protection method these are not needed anymore. Note: some drivers have their own implementation of set_bit_le() and making it generic needs a bit of work; so we use test_and_set_bit_le() and will later replace it with generic set_bit_le(). Signed-off-by: Takuya Yoshikawa --- include/linux/kvm_host.h |2 -- virt/kvm/kvm_main.c | 14 +- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 355e445..73c7d76 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -177,8 +177,6 @@ struct kvm_memory_slot { unsigned long flags; unsigned long *rmap; unsigned long *dirty_bitmap; - unsigned long *dirty_bitmap_head; - unsigned long nr_dirty_pages; struct kvm_arch_memory_slot arch; unsigned long userspace_addr; int user_alloc; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e4431ad..27a1083 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -522,12 +522,11 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) return; if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) - vfree(memslot->dirty_bitmap_head); + vfree(memslot->dirty_bitmap); else - kfree(memslot->dirty_bitmap_head); + kfree(memslot->dirty_bitmap); memslot->dirty_bitmap = NULL; - memslot->dirty_bitmap_head = NULL; } /* @@ -611,8 +610,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) /* * Allocation size is twice as large as the actual dirty bitmap size. - * This makes it possible to do double buffering: see x86's - * kvm_vm_ioctl_get_dirty_log(). + * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. */ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { @@ -627,8 +625,6 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) if (!memslot->dirty_bitmap) return -ENOMEM; - memslot->dirty_bitmap_head = memslot->dirty_bitmap; - memslot->nr_dirty_pages = 0; #endif /* !CONFIG_S390 */ return 0; } @@ -1476,8 +1472,8 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; - if (!test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap)) - memslot->nr_dirty_pages++; + /* TODO: introduce set_bit_le() and use it */ + test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap); } } -- 1.7.5.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/4] KVM: Switch to srcu-less get_dirty_log()
We have seen some problems of the current implementation of get_dirty_log() which uses synchronize_srcu_expedited() for updating dirty bitmaps; e.g. it is noticeable that this sometimes gives us ms order of latency when we use VGA displays. Furthermore the recent discussion on the following thread "srcu: Implement call_srcu()" http://lkml.org/lkml/2012/1/31/211 also motivated us to implement get_dirty_log() without SRCU. This patch achieves this goal without sacrificing the performance of both VGA and live migration: in practice the new code is much faster than the old one unless we have too many dirty pages. Implementation: The key part of the implementation is the use of xchg() operation for clearing dirty bits atomically. Since this allows us to update only BITS_PER_LONG pages at once, we need to iterate over the dirty bitmap until every dirty bit is cleared again for the next call. Although some people may worry about the problem of using the atomic memory instruction many times to the concurrently accessible bitmap, it is usually accessed with mmu_lock held and we rarely see concurrent accesses: so what we need to care about is the pure xchg() overheads. Another point to note is that we do not use for_each_set_bit() to check which ones in each BITS_PER_LONG pages are actually dirty. Instead we simply use __ffs() in a loop. This is much faster than repeatedly call find_next_bit(). Performance: The dirty-log-perf unit test showed nice improvements, some times faster than before, when the number of dirty pages was below 8K. For other cases we saw a bit of regression but still enough fast compared to the processing of these dirty pages in the userspace. For real workloads, both VGA and live migration, we have observed pure improvements: when the guest was reading a file, we originally saw a few ms of latency, but with the new method the latency was less than 200us. Signed-off-by: Takuya Yoshikawa --- arch/x86/kvm/x86.c | 116 +++ 1 files changed, 43 insertions(+), 73 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3bc1922..0748bab 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3041,55 +3041,32 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, } /** - * write_protect_slot - write protect a slot for dirty logging - * @kvm: the kvm instance - * @memslot: the slot we protect - * @dirty_bitmap: the bitmap indicating which pages are dirty - * @nr_dirty_pages: the number of dirty pages + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log * - * We have two ways to find all sptes to protect: - * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and - *checks ones that have a spte mapping a page in the slot. - * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap. + * We need to keep it in mind that VCPU threads can write to the bitmap + * concurrently. So, to avoid losing data, we keep the following order for + * each bit: * - * Generally speaking, if there are not so many dirty pages compared to the - * number of shadow pages, we should use the latter. + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Flush TLB's if needed. + * 4. Copy the snapshot to the userspace. * - * Note that letting others write into a page marked dirty in the old bitmap - * by using the remaining tlb entry is not a problem. That page will become - * write protected again when we flush the tlb and then be reported dirty to - * the user space by copying the old bitmap. + * Between 2 and 3, the guest may write to the page using the remaining TLB + * entry. This is not a problem because the page will be reported dirty at + * step 4 using the snapshot taken before and step 3 ensures that successive + * writes will be logged for the next call. */ -static void write_protect_slot(struct kvm *kvm, - struct kvm_memory_slot *memslot, - unsigned long *dirty_bitmap, - unsigned long nr_dirty_pages) -{ - spin_lock(&kvm->mmu_lock); - - /* Not many dirty pages compared to # of shadow pages. */ - if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { - gfn_t offset; - - for_each_set_bit(offset, dirty_bitmap, memslot->npages) - kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, 1); - - kvm_flush_remote_tlbs(kvm); - } else - kvm_mmu_slot_remove_write_access(kvm, memslot->id); - - spin_unlock(&kvm->mmu_lock); -} - -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +int kvm_vm_ioctl_get_dirty_log(stru
Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor
On Thu, Mar 01, 2012 at 10:10:29AM +, Russell King - ARM Linux wrote: > On Wed, Feb 29, 2012 at 12:58:26PM +, Dave Martin wrote: > > On Wed, Feb 29, 2012 at 09:56:02AM +, Ian Campbell wrote: > > > On Wed, 2012-02-29 at 09:34 +, Dave Martin wrote: > > > > On Tue, Feb 28, 2012 at 12:28:29PM +, Stefano Stabellini wrote: > > > > > > > > I don't have a very strong opinion on which register we should use, > > > > > but > > > > > I would like to avoid r7 if it is already actively used by gcc. > > > > > > > > But there is no framepointer for Thumb-2 code (?) > > > > > > Peter Maydell suggested there was: > > > > r7 is (used by gcc as) the Thumb frame pointer; I don't know if this > > > > makes it worth avoiding in this context. > > > > > > Sounds like it might be a gcc-ism, possibly a non-default option? > > > > > > Anyway, I think r12 will be fine for our purposes so the point is rather > > > moot. > > > > Just had a chat with some tools guys -- apparently, when passing register > > arguments to gcc inline asms there really isn't a guarantee that those > > variables will be in the expected registers on entry to the inline asm. > > The best you can do is: > > register unsigned int foo asm("r7") = value; > > asm("blah %0" : : "r" (foo)); > > and ensure that your assembly checks that %0 _is_ r7 and produces a build > error if not. See the __asmeq() macro in asm/system.h to find out how to > do that. > > This feature has been missing from ARM GCC for quite a long time - it's > used extensively on x86 GCC, where they have one register class per > register, so they can do stuff like: > > asm("blah %0" : : "a" (value)); > > and be guaranteed that %0 will be eax. > > > If you need a specific register, this means that you must set up that > > register explicitly inside the asm if you want a guarantee that the > > code will work: > > > > asm volatile ( > > "movw r12, %[hvc_num]\n\t" > > ... > > "hvc#0" > > :: [hvc_num] "i" (NUMBER) : "r12" > > ); > > > > Of course, if you need to set up more than about 5 or 6 registers in > > this way, the doubled register footprint means that the compiler will > > have to start spilling stuff to the stack. > > No it won't - it will barf instead - think about it. If you're clobbering > r0 - r5, but need to pass in six values in registers, gcc can't use r0-r5 > for that, so it must use the remaining registers. It gets rather unhappy > with that, and starts erroring out (iirc 'too many reloads' or some similar > error.) I've been there. You're right about that -- I didn't pursue my line of thought to the end, there. I have see the behaviour you describe. > If you want to do it that way, your only option is to store them to memory > and pass the address of the block into the assembly, and reload them there. > Which is extremely sucky and inefficient. > > Practically, the register variable plus asm() does seem to work, and seems > to work for virtually all gcc versions out there (there have been the odd > buggy version, but those bugs appear to get fixed.) That is inconvenient for us, but it's a not a bug. The ability for asm contraints to be able to gcc to put things in specific registers (as with the gcc "abcd" constraints for i386) would be nice, but as you point out, this capability is simply not supported by gcc right now for ARM -- the compiler guys seem to be pretty opposed to it, so I can't say I anticiapte this being supported in the near future. So, where there's a compelling reason to inline these things, we can use the existing techniques if we're alert to the risks. But in cases where there isn't a compelling reason, aren't we just inviting fragility unnecessarily? Cheers ---Dave -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/4] KVM: Avoid checking huge page mappings in get_dirty_log()
Dropped such mappings when we enabled dirty logging and we will never create new ones until we stop the logging. For this we introduce a new function which can be used to write protect a range of PT level pages: although we do not need to care about a range of pages at this point, the following patch will need this feature to optimize the write protection of many pages. Signed-off-by: Takuya Yoshikawa --- arch/x86/include/asm/kvm_host.h |5 ++- arch/x86/kvm/mmu.c | 40 +- arch/x86/kvm/x86.c |8 ++ 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74c9edf..935cbcc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -712,8 +712,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +struct kvm_memory_slot *slot, +gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 67857bd..be8a529 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1037,27 +1037,47 @@ static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level return write_protected; } -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot) +/** + * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages + * @kvm: kvm instance + * @slot: slot to protect + * @gfn_offset: start of the BITS_PER_LONG pages we care about + * @mask: indicates which pages we should protect + * + * Used when we do not need to care about huge page mappings: e.g. during dirty + * logging we do not have any such mappings. + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +struct kvm_memory_slot *slot, +gfn_t gfn_offset, unsigned long mask) { unsigned long *rmapp; - int i, write_protected = 0; - for (i = PT_PAGE_TABLE_LEVEL; -i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, i); - } + while (mask) { + rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); - return write_protected; + /* clear the first set bit */ + mask &= mask - 1; + } } static int rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; + unsigned long *rmapp; + int i; + int write_protected = 0; slot = gfn_to_memslot(kvm, gfn); - return kvm_mmu_rmap_write_protect(kvm, gfn, slot); + + for (i = PT_PAGE_TABLE_LEVEL; +i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + rmapp = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(kvm, rmapp, i); + } + + return write_protected; } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c9d99e5..3bc1922 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3069,13 +3069,11 @@ static void write_protect_slot(struct kvm *kvm, /* Not many dirty pages compared to # of shadow pages. */ if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { - unsigned long gfn_offset; + gfn_t offset; - for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { - unsigned long gfn = memslot->base_gfn + gfn_offset; + for_each_set_bit(offset, dirty_bitmap, memslot->npages) + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, 1); - kvm_mmu_rmap_write_protect(kvm, gfn, memslot); - } kvm_flush_remote_tlbs(kvm); } else kvm_mmu_slot_remove_write_access(kvm, memslot->id); -- 1.7.5.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/4] KVM: MMU: Split the main body of rmap_write_protect() off from others
We will use this in the following patch to implement another function which needs to write protect pages using the rmap information. Note that there is a small change in debug printing for large pages: we do not differentiate them from others to avoid duplicating code. Signed-off-by: Takuya Yoshikawa --- arch/x86/kvm/mmu.c | 53 ++- 1 files changed, 27 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff053ca..67857bd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1010,42 +1010,43 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot) +static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { - unsigned long *rmapp; - u64 *spte; - int i, write_protected = 0; + u64 *spte = NULL; + int write_protected = 0; - rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); - spte = rmap_next(rmapp, NULL); - while (spte) { + while ((spte = rmap_next(rmapp, spte))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writable_pte(*spte)) { + + if (!is_writable_pte(*spte)) + continue; + + if (level == PT_PAGE_TABLE_LEVEL) { mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); - write_protected = 1; + } else { + BUG_ON(!is_large_pte(*spte)); + drop_spte(kvm, spte); + --kvm->stat.lpages; + spte = NULL; } - spte = rmap_next(rmapp, spte); + + write_protected = 1; } - /* check for huge page mappings */ - for (i = PT_DIRECTORY_LEVEL; + return write_protected; +} + +int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, + struct kvm_memory_slot *slot) +{ + unsigned long *rmapp; + int i, write_protected = 0; + + for (i = PT_PAGE_TABLE_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - spte = rmap_next(rmapp, NULL); - while (spte) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - BUG_ON(!is_large_pte(*spte)); - pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writable_pte(*spte)) { - drop_spte(kvm, spte); - --kvm->stat.lpages; - spte = NULL; - write_protected = 1; - } - spte = rmap_next(rmapp, spte); - } + write_protected |= __rmap_write_protect(kvm, rmapp, i); } return write_protected; -- 1.7.5.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/4] KVM: srcu-less dirty logging -v2
v2: changed to protect masked pages Live migration gets a bit faster than v1. Takuya === from v1 This patch series is the result of the integration of my dirty logging optimization work, including preparation for the new GET_DIRTY_LOG API, and the attempt to get rid of controversial synchronize_srcu_expedited(). 1 - KVM: MMU: Split the main body of rmap_write_protect() off from others 2 - KVM: Avoid checking huge page mappings in get_dirty_log() 3 - KVM: Switch to srcu-less get_dirty_log() 4 - KVM: Remove unused dirty_bitmap_head and nr_dirty_pages Although there are still some remaining tasks, the test result obtained looks very promising. Remaining tasks: - Implement set_bit_le() for mark_page_dirty() Some drivers are using their own implementation of it and a bit of work is needed to make it generic. I want to do this separately later because it cannot be done within kvm tree. - Stop allocating extra dirty bitmap buffer area According to Peter, mmu_notifier has become preemptible. If we can change mmu_lock from spin_lock to mutex_lock, as Avi said before, this would be staightforward because we can use __put_user() right after xchg() with the mmu_lock held. Test results: 1. dirty-log-perf unit test (on Sandy Bridge core-i3 32-bit host) With some changes added since the previous post, the performance was much improved: now even when every page in the slot is dirty, the number is reasonably close to the original one. For others, needless to say, we have achieved very nice improvement. - kvm.git next average(ns)stdev ns/pagepages 147018.677604.9147018.61 158080.282211.9 79040.12 127555.680619.8 31888.94 108865.678499.3 13608.28 114707.843508.6 7169.2 16 76679.037659.8 2396.2 32 59159.820417.1 924.3 64 60418.219405.7 472.0 128 76267.021450.5 297.9 256 113182.022684.9 221.0 512 930344.2 153766.5 908.5 1K 939098.2 163800.3 458.5 2K 996813.477921.0 243.3 4K 1113232.6 107782.6 135.8 8K 1241206.482282.575.7 16K 1529526.4 116388.246.6 32K 2147538.4 227375.932.7 64K 3309619.479356.825.2 128K 6016951.8 549873.422.9 256K - kvm.git next + srcu-less series average(ns)stdev ns/pagepagesimprovement(%) 14086.0 3532.3 14086.01 944 13303.6 3317.7 6651.821088 13455.6 3315.2 3363.94 848 14125.8 3435.4 1765.78 671 15322.4 3690.1 957.6 16 649 17026.6 4037.2 532.0 32 350 21258.6 4852.3 332.1 64 178 33845.614115.8 264.4 128 79 37893.0 681.8 148.0 256 101 61707.4 1057.6 120.5 512 83 88861.4 2131.086.7 1K 947 151315.6 6490.573.8 2K 521 290579.6 8523.070.9 4K 243 518231.020412.663.2 8K 115 2271171.412064.9 138.6 16K -45 3375866.214743.3 103.0 32K -55 4408395.610720.067.2 64K -51 5915336.226538.145.1 128K -44 8497356.416441.032.4 256K -29 Note that when the number of dirty pages was large, we spent less than 100ns for getting one dirty page information: see ns/page column. As Avi noted before, this is much faster than the userspace send one page to the destination node. Furthermore, with the already proposed new GET_DIRTY_LOG API, we will be able to restrict the area from which we get the log and will not need to care about ms order of latency observed for very large number of dirty pages. 2. real workloads (on Xeon W3520 64-bit host) I traced kvm_vm_ioctl_get_dirty_log() during heavy VGA updates and during live migration. 2.1. VGA: guest was doing "x11perf -rect1 -rect10 -rect100 -rect500" As can be guessed from the result of dirty-log-perf, we observed very nice improvement. - kvm.git next For heavy updates: 100us to 300us. Worst: 300us - kvm.git next + srcu-less series For heavy updates: 3us to 10us. Worst: 50us. 2.2. live migration: guest was doing "dd if=/path/to/a/file of=/dev/null" The improvement was significant again. - kvm.git next For heavy updates: 1ms to 3ms - kvm.git next + srcu-less series For heavy updates: 50us to 300us Probably we gained a lot from the locality of WWS. Takuya -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4] KVM: Resize kvm_io_range array dynamically
On Thu, Mar 1, 2012 at 9:01 AM, Amos Kong wrote: > This patch makes the kvm_io_range array can be resized dynamically. > Set an upper bounds for kvm_io_range to limit userspace. > 1000 is a very large limit and not bloat the typical user. Whats the reason for making everything dynamic? Memory savings there aren't that significant. If you want to make it more efficient just define: static struct kvm_io_bus io_bus[2]; somewhere in kvm_main.c, and just switch between them when you need to do insertion and removal of devices. You get the benefit of zero slub usage, no allocations in any of the paths, and much simpler logic. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor
On Wed, Feb 29, 2012 at 02:44:24PM +, Ian Campbell wrote: > > If you need a specific register, this means that you must set up that > > register explicitly inside the asm if you want a guarantee that the > > code will work: > > > > asm volatile ( > > "movw r12, %[hvc_num]\n\t" > > Is gcc (or gas?) smart enough to optimise this away if it turns out that > %[hvc_num] == r12? No, and it won't do, because %[hvc_num] is specified in these operands: > > ... > > "hvc#0" > > :: [hvc_num] "i" (NUMBER) : "r12" to be an integer, not a register. > How are system calls implemented on the userspace side? I confess I > don't know what the ARM syscall ABI looks like -- is it all registers or > is some of it on the stack? It sounds like the solution ought to be > pretty similar though. All registers. We have a few which take a pointer to an in memory array, but those are for some old multiplexed syscalls. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor
On Wed, Feb 29, 2012 at 12:58:26PM +, Dave Martin wrote: > On Wed, Feb 29, 2012 at 09:56:02AM +, Ian Campbell wrote: > > On Wed, 2012-02-29 at 09:34 +, Dave Martin wrote: > > > On Tue, Feb 28, 2012 at 12:28:29PM +, Stefano Stabellini wrote: > > > > > > I don't have a very strong opinion on which register we should use, but > > > > I would like to avoid r7 if it is already actively used by gcc. > > > > > > But there is no framepointer for Thumb-2 code (?) > > > > Peter Maydell suggested there was: > > > r7 is (used by gcc as) the Thumb frame pointer; I don't know if this > > > makes it worth avoiding in this context. > > > > Sounds like it might be a gcc-ism, possibly a non-default option? > > > > Anyway, I think r12 will be fine for our purposes so the point is rather > > moot. > > Just had a chat with some tools guys -- apparently, when passing register > arguments to gcc inline asms there really isn't a guarantee that those > variables will be in the expected registers on entry to the inline asm. The best you can do is: register unsigned int foo asm("r7") = value; asm("blah %0" : : "r" (foo)); and ensure that your assembly checks that %0 _is_ r7 and produces a build error if not. See the __asmeq() macro in asm/system.h to find out how to do that. This feature has been missing from ARM GCC for quite a long time - it's used extensively on x86 GCC, where they have one register class per register, so they can do stuff like: asm("blah %0" : : "a" (value)); and be guaranteed that %0 will be eax. > If you need a specific register, this means that you must set up that > register explicitly inside the asm if you want a guarantee that the > code will work: > > asm volatile ( > "movw r12, %[hvc_num]\n\t" > ... > "hvc#0" > :: [hvc_num] "i" (NUMBER) : "r12" > ); > > Of course, if you need to set up more than about 5 or 6 registers in > this way, the doubled register footprint means that the compiler will > have to start spilling stuff to the stack. No it won't - it will barf instead - think about it. If you're clobbering r0 - r5, but need to pass in six values in registers, gcc can't use r0-r5 for that, so it must use the remaining registers. It gets rather unhappy with that, and starts erroring out (iirc 'too many reloads' or some similar error.) I've been there. If you want to do it that way, your only option is to store them to memory and pass the address of the block into the assembly, and reload them there. Which is extremely sucky and inefficient. Practically, the register variable plus asm() does seem to work, and seems to work for virtually all gcc versions out there (there have been the odd buggy version, but those bugs appear to get fixed.) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: x86: kvmclock: abstract save/restore sched_clock_state
On Tue, 7 Feb 2012, Marcelo Tosatti wrote: > > Upon resume from hibernation, CPU 0's hvclock area contains the old > values for system_time and tsc_timestamp. It is necessary for the > hypervisor to update these values with uptodate ones before the CPU uses > them. > > Abstract TSC's save/restore sched_clock_state functions and use > restore_state to write to KVM_SYSTEM_TIME MSR, forcing an update. > > Fixes suspend-to-disk with kvmclock. > > Signed-off-by: Marcelo Tosatti > > diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h > index 15d9915..c91e8b9 100644 > --- a/arch/x86/include/asm/tsc.h > +++ b/arch/x86/include/asm/tsc.h > @@ -61,7 +61,7 @@ extern void check_tsc_sync_source(int cpu); > extern void check_tsc_sync_target(void); > > extern int notsc_setup(char *); > -extern void save_sched_clock_state(void); > -extern void restore_sched_clock_state(void); > +extern void tsc_save_sched_clock_state(void); > +extern void tsc_restore_sched_clock_state(void); > > #endif /* _ASM_X86_TSC_H */ > diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h > index 5d0afac..baaca8d 100644 > --- a/arch/x86/include/asm/x86_init.h > +++ b/arch/x86/include/asm/x86_init.h > @@ -162,6 +162,8 @@ struct x86_cpuinit_ops { > * @is_untracked_pat_range exclude from PAT logic > * @nmi_init enable NMI on cpus > * @i8042_detect pre-detect if i8042 controller exists > + * @save_sched_clock_state: save state for sched_clock() on suspend > + * @restore_sched_clock_state: restore state for sched_clock() on > resume > */ > struct x86_platform_ops { > unsigned long (*calibrate_tsc)(void); > @@ -173,6 +175,8 @@ struct x86_platform_ops { > void (*nmi_init)(void); > unsigned char (*get_nmi_reason)(void); > int (*i8042_detect)(void); > + void (*save_sched_clock_state)(void); > + void (*restore_sched_clock_state)(void); > }; > > struct pci_dev; > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c > index ca4e735..57e6b78 100644 > --- a/arch/x86/kernel/kvmclock.c > +++ b/arch/x86/kernel/kvmclock.c > @@ -136,6 +136,15 @@ int kvm_register_clock(char *txt) > return ret; > } > > +void kvm_save_sched_clock_state(void) static ? > +{ > +} > + > +void kvm_restore_sched_clock_state(void) Ditto > +{ > + kvm_register_clock("primary cpu clock, resume"); > +} > + Otherwise: Reviewed-by: Thomas Gleixner -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor
On Wed, Feb 29, 2012 at 02:52:38PM +, Stefano Stabellini wrote: > On Wed, 29 Feb 2012, Dave Martin wrote: > > On Wed, Feb 29, 2012 at 09:56:02AM +, Ian Campbell wrote: > > > On Wed, 2012-02-29 at 09:34 +, Dave Martin wrote: > > > > On Tue, Feb 28, 2012 at 12:28:29PM +, Stefano Stabellini wrote: > > > > > > > > I don't have a very strong opinion on which register we should use, > > > > > but > > > > > I would like to avoid r7 if it is already actively used by gcc. > > > > > > > > But there is no framepointer for Thumb-2 code (?) > > > > > > Peter Maydell suggested there was: > > > > r7 is (used by gcc as) the Thumb frame pointer; I don't know if this > > > > makes it worth avoiding in this context. > > > > > > Sounds like it might be a gcc-ism, possibly a non-default option? > > > > > > Anyway, I think r12 will be fine for our purposes so the point is rather > > > moot. > > > > Just had a chat with some tools guys -- apparently, when passing register > > arguments to gcc inline asms there really isn't a guarantee that those > > variables will be in the expected registers on entry to the inline asm. > > > > If gcc reorders other function calls or other code around the inline asm > > (which it can do, except under certain controlled situations), then > > intervening code can clobber any registers in general. > > > > Or, to summarise another way, there is no way to control which register > > is used to pass something to an inline asm in general (often we get away > > with this, and there are a lot of inline asms in the kernel that assume > > it works, but the more you inline the more likely you are to get nasty > > surprises). There is no workaroud, except on some architectures where > > special asm constraints allow specific individual registers to be > > specified for operands (i386 for example). > > > > If you need a specific register, this means that you must set up that > > register explicitly inside the asm if you want a guarantee that the > > code will work: > > > > asm volatile ( > > "movw r12, %[hvc_num]\n\t" > > ... > > "hvc#0" > > :: [hvc_num] "i" (NUMBER) : "r12" > > ); > > > > OK, we can arrange the hypercall code to be like that. > Also with your patch series it would be "_hvc" because of the .macro, > right? Yes, but I would avoid making too many assumptions about the final form of that patch -- it looks like there's significant work to do there, since I made some unsafe assumptions about how the tools work... We might end up with a magic #define after all. > > This is the kind of problem which goes away when out-of-lining the > > hvc wrapper behind a C function interface, since the ABI then provides > > guarantees about how values are mershaled into and out of that code. > > Do you mean implementing the entire HYPERVISOR_example_op in assembly > and calling it from C? > Because I guess that gcc would still be free to mess with the registers > between the C function entry point and any inline assembly code. gcc can arrange for the relevant things to be already in r0-r3 and the relevant stack slots before branching to a function just as for inline asm. The only differences are that the compiler cannot choose which registers to use, and the branch cannot be optimised away by the compiler (the CPU may be able to optimise the branch away at runtime of course, but that's another story...) What libc appears to do is wrap each syscall in a separate function. This means that it's not necessary to shuffle all the arguments by one position when invoking the actual syscall. (The generic "syscall" function does of course need to shuffle the arguments so as to displace the syscall number from the first argument to r7 -- but that's hard to avoid without inlining.) For example: 00090b50 : 90b50: e52d7004push{r7}; (str r7, [sp, #-4]!) 90b54: e59f7010ldr r7, [pc, #16] ; 90b6c 90b58: ef00svc 0x 90b5c: e49d7004pop {r7}; (ldr r7, [sp], #4) 90b60: e3700a01cmn r0, #4096 ; 0x1000 ... Syscalls with more than 4 args still need to load the extra ones from the stack, of course: 00090090 : 90090: e92d0090push{r4, r7} 90094: e59d4008ldr r4, [sp, #8] 90098: e59f7010ldr r7, [pc, #16] ; 900b0 9009c: ef00svc 0x ... I don't know whether that makes sense for a hypervisor... it partly depends on how many different hypercalls there are. By all means implement it both ways and measure the performance difference, if possible. Cheers ---Dave -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4] Quirk for IVB graphics FLR errata
For IvyBridge Mobile platform, a system hang may occur if a FLR(Function Level Reset) is asserted to internal graphics. This quirk patch is workaround for the IVB FLR errata issue. We are disabling the FLR reset handshake between the PCH and CPU display, then manually powering down the panel power sequencing and resetting the PCH display. Changes from v3: - add X86 configuration to avoid compile problem in other architecture. Signed-off-by: Xudong Hao Signed-off-by: Kay, Allen M Reviewed-by: Xiantao Zhang --- drivers/pci/quirks.c | 59 ++ 1 files changed, 59 insertions(+), 0 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6476547..63e8b70 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3069,11 +3069,70 @@ static int reset_intel_82599_sfp_virtfn(struct pci_dev *dev, int probe) return 0; } +#ifdef CONFIG_X86 + +#include "../gpu/drm/i915/i915_reg.h" +#define MSG_CTL0x45010 + +static const int op_timeout = 10; /* set timeout 10 seconds */ +static int reset_ivb_igd(struct pci_dev *dev, int probe) { + u8 *mmio_base; + u32 val; + cycles_t cyc_op_timeout = tsc_khz*op_timeout*1000; + + if (probe) + return 0; + + mmio_base = ioremap_nocache(pci_resource_start(dev, 0), +pci_resource_len(dev, 0)); + if (!mmio_base) + return -ENOMEM; + + /* Work Around */ + *((u32 *)(mmio_base + MSG_CTL)) = 0x0002; + /* Clobbering SOUTH_CHICKEN2 register is fine only if the next +* driver loaded sets the right bits. However, this's a reset and +* the bits have been set by i915 previously, so we clobber +* SOUTH_CHICKEN2 register directly here. +*/ + *((u32 *)(mmio_base + SOUTH_CHICKEN2)) = 0x0005; + val = *((u32 *)(mmio_base + PCH_PP_CONTROL)) & 0xfffe; + *((u32 *)(mmio_base + PCH_PP_CONTROL)) = val; + do { + cycles_t start_time = get_cycles(); + while (1) { + val = *((u32 *)(mmio_base + PCH_PP_STATUS)); + if (((val & 0x8000) == 0) + && ((val & 0x3000) == 0)) + break; + if (cyc_op_timeout < (get_cycles() - start_time)) + break; + cpu_relax(); + } + } while (0); + *((u32 *)(mmio_base + 0xd0100)) = 0x0002; + + iounmap(pci_resource_start(dev, 0)); + return 0; +} +#else +static int reset_ivb_igd(struct pci_dev *dev, int probe) { } +#endif /* CONFIG_X86 */ + #define PCI_DEVICE_ID_INTEL_82599_SFP_VF 0x10ed +#define PCI_DEVICE_ID_INTEL_IVB_M_VGA 0x0156 +#define PCI_DEVICE_ID_INTEL_IVB_M2_VGA 0x0166 static const struct pci_dev_reset_methods pci_dev_reset_methods[] = { { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82599_SFP_VF, reset_intel_82599_sfp_virtfn }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_M_VGA, + reset_ivb_igd }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_M2_VGA, + reset_ivb_igd }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, reset_intel_generic_dev }, { 0 } -- 1.6.0.rc1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor
On Wed, Feb 29, 2012 at 02:44:24PM +, Ian Campbell wrote: > On Wed, 2012-02-29 at 12:58 +, Dave Martin wrote: > > On Wed, Feb 29, 2012 at 09:56:02AM +, Ian Campbell wrote: > > > On Wed, 2012-02-29 at 09:34 +, Dave Martin wrote: > > > > On Tue, Feb 28, 2012 at 12:28:29PM +, Stefano Stabellini wrote: > > > > > > > > I don't have a very strong opinion on which register we should use, > > > > > but > > > > > I would like to avoid r7 if it is already actively used by gcc. > > > > > > > > But there is no framepointer for Thumb-2 code (?) > > > > > > Peter Maydell suggested there was: > > > > r7 is (used by gcc as) the Thumb frame pointer; I don't know if this > > > > makes it worth avoiding in this context. > > > > > > Sounds like it might be a gcc-ism, possibly a non-default option? > > > > > > Anyway, I think r12 will be fine for our purposes so the point is rather > > > moot. > > > > Just had a chat with some tools guys -- apparently, when passing register > > arguments to gcc inline asms there really isn't a guarantee that those > > variables will be in the expected registers on entry to the inline asm. > > > > If gcc reorders other function calls or other code around the inline asm > > (which it can do, except under certain controlled situations), then > > intervening code can clobber any registers in general. > > > > Or, to summarise another way, there is no way to control which register > > is used to pass something to an inline asm in general (often we get away > > with this, and there are a lot of inline asms in the kernel that assume > > it works, but the more you inline the more likely you are to get nasty > > surprises). There is no workaroud, except on some architectures where > > special asm constraints allow specific individual registers to be > > specified for operands (i386 for example). > > I had assumed I just couldn't find the right syntax. Useful to know that > I couldn't find it because it doesn't exist! > > > If you need a specific register, this means that you must set up that > > register explicitly inside the asm if you want a guarantee that the > > code will work: > > > > asm volatile ( > > "movw r12, %[hvc_num]\n\t" > > Is gcc (or gas?) smart enough to optimise this away if it turns out that > %[hvc_num] == r12? No, unfortunately. Except for the information defined by the constraints, the inline asm block is completely opaque to the compiler (except for pasting in operands -- which is a string operation done with no knowledge of what the text means for the assembler). > > > ... > > "hvc#0" > > :: [hvc_num] "i" (NUMBER) : "r12" > > ); > > > > Of course, if you need to set up more than about 5 or 6 registers in > > this way, the doubled register footprint means that the compiler will > > have to start spilling stuff to the stack. > > > > > > This is the kind of problem which goes away when out-of-lining the > > hvc wrapper behind a C function interface, since the ABI then provides > > guarantees about how values are mershaled into and out of that code. > > I don't think anything would stop gcc from clobbering an argument > register right on function entry (e..g it might move r0 to r8 and > clobber r0, for whatever reason), so that they are no longer where you > expect them to be when you hit the asm. Unlikely perhaps but no more so > than the other issues you've raised? > > Or did you mean out-of-line as in "written in a .S file" as well as out > of line? Yes. Some toolchains have a concept of out-of-line assembler functions in a .c file, but gcc doesn't -- the asm is always inline in its immediate context, even if the containing function won't be inlined. However, the compiler would have to be applying pretty creative optimizations to break cases cases where an inlinable function contains, say, nothing except for declarations, the asm() and a return statement. I feel that the kernel implicitly relies on such things working in too many places for breakage of that assumption to go unnoticed. > > > Notwithstanding the above, even if we do make theoretically unsound > > (but often true) assumptions about inline asms, ARM will be no worse > > than other arches in this respect. > > This is true. > > > Other than serving as a reminder that inline asm is a deep can of > > worms, this doesn't really give us a neat solution... > > How are system calls implemented on the userspace side? I confess I > don't know what the ARM syscall ABI looks like -- is it all registers or > is some of it on the stack? It sounds like the solution ought to be > pretty similar though. I _believe_ it's now out of line in most cases. I'm not sure I totally understand it all, though: http://www.eglibc.org/cgi-bin/viewvc.cgi/trunk/ports/sysdeps/unix/sysv/linux/arm/eabi/ There is an internal inline syscall wrapper INTERNAL_SYSCALL_RAW(), but I can't see where it is used. For Thumb code it actually