date:20120301

[PATCH v3 7/7] RTC:Allow to migrate from old version

2012-03-01 Thread Zhang, Yang Z

The new logic is compatible with old. So it should not block to migrate
from old version. But new version cannot migrate to old.

Signed-off-by: Yang Zhang 
---
 hw/mc146818rtc.c |   48 
 1 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c
index 384bdc1..02f7ea0 100644
--- a/hw/mc146818rtc.c
+++ b/hw/mc146818rtc.c
@@ -635,11 +635,51 @@ static int rtc_post_load(void *opaque, int version_id)
 return 0;
 }

+static int rtc_load_old(QEMUFile *f, void *opaque, int version_id)
+{
+RTCState *s = opaque;
+
+if (version_id > 2) {
+return -EINVAL;
+}
+
+qemu_get_buffer(f, s->cmos_data, sizeof(s->cmos_data));
+/* dummy load for compatibility */
+qemu_get_byte(f); /* cmos_index */
+qemu_get_be32(f); /* tm_sec */
+qemu_get_be32(f); /* tm_min */
+qemu_get_be32(f); /* tm_hour */
+qemu_get_be32(f); /* tm_wday */
+qemu_get_be32(f); /* tm_mday */
+qemu_get_be32(f); /* tm_mon */
+qemu_get_be32(f); /* tm_year */
+qemu_get_be64(f); /* periodic_timer */
+qemu_get_be64(f); /* next_periodic_time */
+qemu_get_be64(f); /* next_second_time */
+qemu_get_be64(f); /* second_timer */
+qemu_get_be64(f); /* second_timer2 */
+qemu_get_be32(f); /* irq_coalesced */
+qemu_get_be32(f); /* period */
+
+
+rtc_set_date_from_host(&s->dev);
+periodic_timer_update(s,  qemu_get_clock_ns(rtc_clock));
+check_update_timer(s);
+
+#ifdef TARGET_I386
+if (s->lost_tick_policy == LOST_TICK_SLEW) {
+rtc_coalesced_timer_update(s);
+}
+#endif
+return 0;
+}
+
 static const VMStateDescription vmstate_rtc = {
 .name = "mc146818rtc",
-.version_id = 2,
-.minimum_version_id = 1,
-.minimum_version_id_old = 1,
+.version_id = 3,
+.minimum_version_id = 3,
+.minimum_version_id_old = 2,
+.load_state_old = rtc_load_old,
 .post_load = rtc_post_load,
 .fields  = (VMStateField []) {
 VMSTATE_BUFFER(cmos_data, RTCState),
@@ -777,7 +817,7 @@ static int rtc_initfn(ISADevice *dev)
 memory_region_init_io(&s->io, &cmos_ops, s, "rtc", 2);
 isa_register_ioport(dev, &s->io, base);

-qdev_set_legacy_instance_id(&dev->qdev, base, 2);
+qdev_set_legacy_instance_id(&dev->qdev, base, 3);
 qemu_register_reset(rtc_reset, s);

 object_property_add(OBJECT(s), "date", "struct tm",
--
1.7.1
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3 6/7] RTC:Add alarm support

2012-03-01 Thread Zhang, Yang Z

Add the alarm check when update cycle ended. If alarm is fired,
also AIE bit is setting, then raise a interrupt

Signed-off-by: Yang Zhang 
---
 hw/mc146818rtc.c |   48 ++--
 1 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c
index fae049e..384bdc1 100644
--- a/hw/mc146818rtc.c
+++ b/hw/mc146818rtc.c
@@ -114,6 +114,7 @@ typedef struct RTCState {
 static void rtc_set_time(RTCState *s);
 static void rtc_calibrate_time(RTCState *s);
 static void rtc_set_cmos(RTCState *s);
+static inline int rtc_from_bcd(RTCState *s, int a);

 static int32_t divider_reset;

@@ -267,15 +268,58 @@ static void rtc_update_timer(void *opaque)
 }
 }

+static inline uint8_t convert_hour(RTCState *s, uint8_t hour)
+{
+if (!(s->cmos_data[RTC_REG_B] & REG_B_24H)) {
+hour %= 12;
+if (s->cmos_data[RTC_HOURS] & 0x80) {
+hour += 12;
+}
+}
+return hour;
+}
+static uint32_t check_alarm(RTCState *s)
+{
+uint8_t alarm_hour, alarm_min, alarm_sec;
+uint8_t cur_hour, cur_min, cur_sec;
+
+rtc_calibrate_time(s);
+rtc_set_cmos(s);
+
+alarm_sec = rtc_from_bcd(s, s->cmos_data[RTC_SECONDS_ALARM]);
+alarm_min = rtc_from_bcd(s, s->cmos_data[RTC_MINUTES_ALARM]);
+alarm_hour = rtc_from_bcd(s, s->cmos_data[RTC_HOURS_ALARM]);
+alarm_hour = convert_hour(s, alarm_hour);
+
+cur_sec = rtc_from_bcd(s, s->cmos_data[RTC_SECONDS]);
+cur_min = rtc_from_bcd(s, s->cmos_data[RTC_MINUTES]);
+cur_hour = rtc_from_bcd(s, s->cmos_data[RTC_HOURS]);
+cur_hour = convert_hour(s, cur_hour);
+
+if (((alarm_sec & 0xc0) == 0xc0 || alarm_sec == cur_sec) &&
+((alarm_min & 0xc0) == 0xc0 || alarm_min == cur_min) &&
+((alarm_hour & 0xc0) == 0xc0 || alarm_hour == cur_hour)) {
+return 1;
+}
+return 0;
+
+}
+
 static void rtc_update_timer2(void *opaque)
 {
 RTCState *s = opaque;

 if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) {
 s->cmos_data[RTC_REG_C] |= REG_C_UF;
+if (check_alarm(s)) {
+s->cmos_data[RTC_REG_C] |= REG_C_AF;
+}
+
 s->cmos_data[RTC_REG_A] &= ~REG_A_UIP;
-s->cmos_data[RTC_REG_C] |= REG_C_IRQF;
-qemu_irq_raise(s->irq);
+if (s->cmos_data[RTC_REG_B] & (REG_B_AIE | REG_B_UIE)) {
+s->cmos_data[RTC_REG_C] |= REG_C_IRQF;
+qemu_irq_raise(s->irq);
+}
 }
 check_update_timer(s);
 }
--
1.7.1
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3 4/7] RTC: Set internal millisecond register to 500ms when reset divider

2012-03-01 Thread Zhang, Yang Z

The first update cycle begins one - half seconds later when divider
reset is removing.

Signed-off-by: Yang Zhang 
---
 hw/mc146818rtc.c |   38 +-
 1 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c
index 6ebb8f6..5e7fbb5 100644
--- a/hw/mc146818rtc.c
+++ b/hw/mc146818rtc.c
@@ -110,6 +110,8 @@ static void rtc_set_time(RTCState *s);
 static void rtc_calibrate_time(RTCState *s);
 static void rtc_set_cmos(RTCState *s);

+static int32_t divider_reset;
+
 static uint64_t get_guest_rtc_us(RTCState *s)
 {
 int64_t host_usec, offset_usec, guest_usec;
@@ -220,16 +222,24 @@ static void rtc_periodic_timer(void *opaque)
 }
 }

-static void rtc_set_offset(RTCState *s)
+static void rtc_set_offset(RTCState *s, int32_t start_usec)
 {
 struct tm *tm = &s->current_tm;
-int64_t host_usec, guest_sec, guest_usec;
+int64_t host_usec, guest_sec, guest_usec, offset_usec, old_guest_usec;

 host_usec = qemu_get_clock_ns(host_clock) / NS_PER_USEC;
+offset_usec = s->offset_sec * USEC_PER_SEC + s->offset_usec;
+old_guest_usec = (host_usec + offset_usec) % USEC_PER_SEC;

 guest_sec = mktimegm(tm);
-guest_usec = guest_sec * USEC_PER_SEC;

+/* start_usec equal 0 means rtc internal millisecond is
+ * same with before */
+if (start_usec == 0) {
+guest_usec = guest_sec * USEC_PER_SEC + old_guest_usec;
+} else {
+guest_usec = guest_sec * USEC_PER_SEC + start_usec;
+}
 s->offset_sec = (guest_usec - host_usec) / USEC_PER_SEC;
 s->offset_usec = (guest_usec - host_usec) % USEC_PER_SEC;
 }
@@ -260,10 +270,22 @@ static void cmos_ioport_write(void *opaque, uint32_t 
addr, uint32_t data)
 /* if in set mode, do not update the time */
 if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) {
 rtc_set_time(s);
-rtc_set_offset(s);
+rtc_set_offset(s, 0);
 }
 break;
 case RTC_REG_A:
+/* when the divider reset is removed, the first update cycle
+ * begins one-half second later*/
+if (((s->cmos_data[RTC_REG_A] & 0x60) == 0x60) &&
+((data & 0x70) >> 4) <= 2) {
+divider_reset = 1;
+if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) {
+rtc_calibrate_time(s);
+rtc_set_offset(s, 50);
+s->cmos_data[RTC_REG_A] &= ~REG_A_UIP;
+divider_reset = 0;
+}
+}
 /* UIP bit is read only */
 s->cmos_data[RTC_REG_A] = (data & ~REG_A_UIP) |
 (s->cmos_data[RTC_REG_A] & REG_A_UIP);
@@ -283,7 +305,13 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, 
uint32_t data)
 /* if disabling set mode, update the time */
 if (s->cmos_data[RTC_REG_B] & REG_B_SET) {
 rtc_set_time(s);
-rtc_set_offset(s);
+if (divider_reset == 1) {
+rtc_set_offset(s, 50);
+s->cmos_data[RTC_REG_A] &= ~REG_A_UIP;
+divider_reset = 0;
+} else {
+rtc_set_offset(s, 0);
+}
 }
 }
 s->cmos_data[RTC_REG_B] = data;
--
1.7.1
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3 5/7] RTC:Add RTC update-ended interrupt support

2012-03-01 Thread Zhang, Yang Z

Use a timer to emulate update cycle. When update cycle ended and UIE is 
setting, 
then raise an interrupt. The timer runs only when UF or AF is cleared. 

Signed-off-by: Yang Zhang 
---
 hw/mc146818rtc.c |   86 ++
 1 files changed, 80 insertions(+), 6 deletions(-)

diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c
index 5e7fbb5..fae049e 100644
--- a/hw/mc146818rtc.c
+++ b/hw/mc146818rtc.c
@@ -97,6 +97,11 @@ typedef struct RTCState {
 /* periodic timer */
 QEMUTimer *periodic_timer;
 int64_t next_periodic_time;
+/* update-ended timer */
+QEMUTimer *update_timer;
+QEMUTimer *update_timer2;
+uint64_t next_update_time;
+uint32_t use_timer;
 uint16_t irq_reinject_on_ack_count;
 uint32_t irq_coalesced;
 uint32_t period;
@@ -157,7 +162,8 @@ static void rtc_coalesced_timer(void *opaque)
 }
 #endif

-static void rtc_timer_update(RTCState *s, int64_t current_time)
+/* handle periodic timer */
+static void periodic_timer_update(RTCState *s, int64_t current_time)
 {
 int period_code, period;
 int64_t cur_clock, next_irq_clock;
@@ -195,7 +201,7 @@ static void rtc_periodic_timer(void *opaque)
 {
 RTCState *s = opaque;

-rtc_timer_update(s, s->next_periodic_time);
+periodic_timer_update(s, s->next_periodic_time);
 s->cmos_data[RTC_REG_C] |= REG_C_PF;
 if (s->cmos_data[RTC_REG_B] & REG_B_PIE) {
 s->cmos_data[RTC_REG_C] |= REG_C_IRQF;
@@ -222,6 +228,58 @@ static void rtc_periodic_timer(void *opaque)
 }
 }

+/* handle update-ended timer */
+static void check_update_timer(RTCState *s)
+{
+uint64_t next_update_time, expire_time;
+uint64_t guest_usec;
+qemu_del_timer(s->update_timer);
+qemu_del_timer(s->update_timer2);
+
+if (!((s->cmos_data[RTC_REG_C] & (REG_C_UF | REG_C_AF)) ==
+(REG_C_UF | REG_C_AF)) && !(s->cmos_data[RTC_REG_B] & REG_B_SET)) {
+s->use_timer = 1;
+guest_usec = get_guest_rtc_us(s) % USEC_PER_SEC;
+if (guest_usec >= (USEC_PER_SEC - 244)) {
+/* RTC is in update cycle when enabling UIE */
+s->cmos_data[RTC_REG_A] |= REG_A_UIP;
+next_update_time = (USEC_PER_SEC - guest_usec) * NS_PER_USEC;
+expire_time = qemu_get_clock_ns(rtc_clock) + next_update_time;
+qemu_mod_timer(s->update_timer2, expire_time);
+} else {
+next_update_time = (USEC_PER_SEC - guest_usec - 244) * NS_PER_USEC;
+expire_time = qemu_get_clock_ns(rtc_clock) + next_update_time;
+s->next_update_time = expire_time;
+qemu_mod_timer(s->update_timer, expire_time);
+}
+} else {
+s->use_timer = 0;
+}
+}
+
+static void rtc_update_timer(void *opaque)
+{
+RTCState *s = opaque;
+
+if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) {
+s->cmos_data[RTC_REG_A] |= REG_A_UIP;
+qemu_mod_timer(s->update_timer2, s->next_update_time + 244000UL);
+}
+}
+
+static void rtc_update_timer2(void *opaque)
+{
+RTCState *s = opaque;
+
+if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) {
+s->cmos_data[RTC_REG_C] |= REG_C_UF;
+s->cmos_data[RTC_REG_A] &= ~REG_A_UIP;
+s->cmos_data[RTC_REG_C] |= REG_C_IRQF;
+qemu_irq_raise(s->irq);
+}
+check_update_timer(s);
+}
+
 static void rtc_set_offset(RTCState *s, int32_t start_usec)
 {
 struct tm *tm = &s->current_tm;
@@ -283,13 +341,14 @@ static void cmos_ioport_write(void *opaque, uint32_t 
addr, uint32_t data)
 rtc_calibrate_time(s);
 rtc_set_offset(s, 50);
 s->cmos_data[RTC_REG_A] &= ~REG_A_UIP;
+check_update_timer(s);
 divider_reset = 0;
 }
 }
 /* UIP bit is read only */
 s->cmos_data[RTC_REG_A] = (data & ~REG_A_UIP) |
 (s->cmos_data[RTC_REG_A] & REG_A_UIP);
-rtc_timer_update(s, qemu_get_clock_ns(rtc_clock));
+periodic_timer_update(s, qemu_get_clock_ns(rtc_clock));
 break;
 case RTC_REG_B:
 if (data & REG_B_SET) {
@@ -315,7 +374,8 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, 
uint32_t data)
 }
 }
 s->cmos_data[RTC_REG_B] = data;
-rtc_timer_update(s, qemu_get_clock_ns(rtc_clock));
+periodic_timer_update(s, qemu_get_clock_ns(rtc_clock));
+check_update_timer(s);
 break;
 case RTC_REG_C:
 case RTC_REG_D:
@@ -445,7 +505,7 @@ static uint32_t cmos_ioport_read(void *opaque, uint32_t 
addr)
 break;
 case RTC_REG_A:
 ret = s->cmos_data[s->cmos_index];
-if (update_in_progress(s)) {
+if ((s->use_timer == 0) && update_in_progress(s)) {
 ret |= REG_A_UIP;
 }
 break;
@@ -453,6 +513,12 @@ static uint32_t cmos_ioport_read(void *

[PATCH v3 3/7] RTC: Add UIP(update in progress) check logic

2012-03-01 Thread Zhang, Yang Z

The UIP(update in progress) is set when RTC is updating. And the update 
cycle begins 244us later after UIP is set. And it is cleared when update end.
.

Signed-off-by: Yang Zhang 
---
 hw/mc146818rtc.c |   18 ++
 1 files changed, 18 insertions(+), 0 deletions(-)

diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c
index 82a5b8a..6ebb8f6 100644
--- a/hw/mc146818rtc.c
+++ b/hw/mc146818rtc.c
@@ -377,6 +377,21 @@ static void rtc_calibrate_time(RTCState *s)
 s->current_tm = *ret;
 }

+static int update_in_progress(RTCState *s)
+{
+int64_t guest_usec;
+
+if (s->cmos_data[RTC_REG_B] & REG_B_SET) {
+return 0;
+}
+guest_usec = get_guest_rtc_us(s);
+/* UIP bit will be set at last 244us of every second. */
+if ((guest_usec % USEC_PER_SEC) >= (USEC_PER_SEC - 244)) {
+return 1;
+}
+return 0;
+}
+
 static uint32_t cmos_ioport_read(void *opaque, uint32_t addr)
 {
 RTCState *s = opaque;
@@ -402,6 +417,9 @@ static uint32_t cmos_ioport_read(void *opaque, uint32_t 
addr)
 break;
 case RTC_REG_A:
 ret = s->cmos_data[s->cmos_index];
+if (update_in_progress(s)) {
+ret |= REG_A_UIP;
+}
 break;
 case RTC_REG_C:
 ret = s->cmos_data[s->cmos_index];
--
1.7.1
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3 2/7] RTC: Update the RTC clock only when reading it

2012-03-01 Thread Zhang, Yang Z

There has no need to use two periodic timer to update RTC time.
In this patch, we only update it when guest reading it.

Signed-off-by: Yang Zhang 
---
 hw/mc146818rtc.c |  207 +-
 1 files changed, 66 insertions(+), 141 deletions(-)

diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c
index 9b49cbc..82a5b8a 100644
--- a/hw/mc146818rtc.c
+++ b/hw/mc146818rtc.c
@@ -44,6 +44,9 @@
 # define DPRINTF_C(format, ...)  do { } while (0)
 #endif

+#define USEC_PER_SEC100L
+#define NS_PER_USEC 1000L
+
 #define RTC_REINJECT_ON_ACK_COUNT 20

 #define RTC_SECONDS 0
@@ -85,6 +88,8 @@ typedef struct RTCState {
 uint8_t cmos_data[128];
 uint8_t cmos_index;
 struct tm current_tm;
+int64_t offset_sec;
+int32_t offset_usec;
 int32_t base_year;
 qemu_irq irq;
 qemu_irq sqw_irq;
@@ -92,21 +97,29 @@ typedef struct RTCState {
 /* periodic timer */
 QEMUTimer *periodic_timer;
 int64_t next_periodic_time;
-/* second update */
-int64_t next_second_time;
 uint16_t irq_reinject_on_ack_count;
 uint32_t irq_coalesced;
 uint32_t period;
 QEMUTimer *coalesced_timer;
-QEMUTimer *second_timer;
-QEMUTimer *second_timer2;
 Notifier clock_reset_notifier;
 LostTickPolicy lost_tick_policy;
 Notifier suspend_notifier;
 } RTCState;

 static void rtc_set_time(RTCState *s);
-static void rtc_copy_date(RTCState *s);
+static void rtc_calibrate_time(RTCState *s);
+static void rtc_set_cmos(RTCState *s);
+
+static uint64_t get_guest_rtc_us(RTCState *s)
+{
+int64_t host_usec, offset_usec, guest_usec;
+
+host_usec = qemu_get_clock_ns(host_clock) / NS_PER_USEC;
+offset_usec = s->offset_sec * USEC_PER_SEC + s->offset_usec;
+guest_usec = host_usec + offset_usec;
+
+return guest_usec;
+}

 #ifdef TARGET_I386
 static void rtc_coalesced_timer_update(RTCState *s)
@@ -207,6 +220,20 @@ static void rtc_periodic_timer(void *opaque)
 }
 }

+static void rtc_set_offset(RTCState *s)
+{
+struct tm *tm = &s->current_tm;
+int64_t host_usec, guest_sec, guest_usec;
+
+host_usec = qemu_get_clock_ns(host_clock) / NS_PER_USEC;
+
+guest_sec = mktimegm(tm);
+guest_usec = guest_sec * USEC_PER_SEC;
+
+s->offset_sec = (guest_usec - host_usec) / USEC_PER_SEC;
+s->offset_usec = (guest_usec - host_usec) % USEC_PER_SEC;
+}
+
 static void cmos_ioport_write(void *opaque, uint32_t addr, uint32_t data)
 {
 RTCState *s = opaque;
@@ -233,6 +260,7 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, 
uint32_t data)
 /* if in set mode, do not update the time */
 if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) {
 rtc_set_time(s);
+rtc_set_offset(s);
 }
 break;
 case RTC_REG_A:
@@ -243,6 +271,11 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, 
uint32_t data)
 break;
 case RTC_REG_B:
 if (data & REG_B_SET) {
+/* update cmos to when the rtc was stopping */
+if (!(s->cmos_data[RTC_REG_B] & REG_B_SET)) {
+rtc_calibrate_time(s);
+rtc_set_cmos(s);
+}
 /* set mode: reset UIP mode */
 s->cmos_data[RTC_REG_A] &= ~REG_A_UIP;
 data &= ~REG_B_UIE;
@@ -250,6 +283,7 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, 
uint32_t data)
 /* if disabling set mode, update the time */
 if (s->cmos_data[RTC_REG_B] & REG_B_SET) {
 rtc_set_time(s);
+rtc_set_offset(s);
 }
 }
 s->cmos_data[RTC_REG_B] = data;
@@ -305,7 +339,7 @@ static void rtc_set_time(RTCState *s)
 rtc_change_mon_event(tm);
 }

-static void rtc_copy_date(RTCState *s)
+static void rtc_set_cmos(RTCState *s)
 {
 const struct tm *tm = &s->current_tm;
 int year;
@@ -331,122 +365,16 @@ static void rtc_copy_date(RTCState *s)
 s->cmos_data[RTC_YEAR] = rtc_to_bcd(s, year);
 }

-/* month is between 0 and 11. */
-static int get_days_in_month(int month, int year)
-{
-static const int days_tab[12] = {
-31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
-};
-int d;
-if ((unsigned )month >= 12)
-return 31;
-d = days_tab[month];
-if (month == 1) {
-if ((year % 4) == 0 && ((year % 100) != 0 || (year % 400) == 0))
-d++;
-}
-return d;
-}
-
-/* update 'tm' to the next second */
-static void rtc_next_second(struct tm *tm)
+static void rtc_calibrate_time(RTCState *s)
 {
-int days_in_month;
-
-tm->tm_sec++;
-if ((unsigned)tm->tm_sec >= 60) {
-tm->tm_sec = 0;
-tm->tm_min++;
-if ((unsigned)tm->tm_min >= 60) {
-tm->tm_min = 0;
-tm->tm_hour++;
-if ((unsigned)tm->tm_hour >= 24) {
-tm->tm_hour = 0;
-

[PATCH v3 1/7] RTC: Remove the logic to update time format when DM bit changed

2012-03-01 Thread Zhang, Yang Z

Change DM(date mode) and 24/12 control bit don't affect the internal
registers. It only indicates what format is using for those registers. So
we don't need to update time format when it is modified.

Signed-off-by: Yang Zhang 
---
 hw/mc146818rtc.c |   10 +-
 1 files changed, 1 insertions(+), 9 deletions(-)

diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c
index a46fdfc..9b49cbc 100644
--- a/hw/mc146818rtc.c
+++ b/hw/mc146818rtc.c
@@ -252,15 +252,7 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, 
uint32_t data)
 rtc_set_time(s);
 }
 }
-if (((s->cmos_data[RTC_REG_B] ^ data) & (REG_B_DM | REG_B_24H)) &&
-!(data & REG_B_SET)) {
-/* If the time format has changed and not in set mode,
-   update the registers immediately. */
-s->cmos_data[RTC_REG_B] = data;
-rtc_copy_date(s);
-} else {
-s->cmos_data[RTC_REG_B] = data;
-}
+s->cmos_data[RTC_REG_B] = data;
 rtc_timer_update(s, qemu_get_clock_ns(rtc_clock));
 break;
 case RTC_REG_C:
--
1.7.1
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3 0/7] RTC: New logic to emulate RTC

2012-03-01 Thread Zhang, Yang Z

Changes in v3:
Rebase to latest head.
Remove the logic to update time format when DM bit changed.
Allow to migrate from old version.
Solve the async when reading UF and UIP

Changes in v2:
Add UIP check logic.
Add logic that next second tick will occur in exactly 500ms later after reset 
divider

Current RTC emulation uses periodic timer(2 timers per second) to update RTC 
clock. And it will stop CPU staying at deep C-state for long period. Our 
experience shows the Pkg C6 residency reduced 6% when running 64 idle guest.
The following patch stop the two periodic timer and only updating RTC clock 
when guest try to read it.
--- 
Yang Zhang (7):
RTC: Remove the logic to update time format when DM bit changed
RTC: Update the RTC clock only when reading it
RTC: Add UIP(update in progress) check logic
RTC: Set internal millisecond register to 500ms when reset divider
RTC: Add RTC update-ended interrupt support
RTC: Add alarm support
RTC: Allow to migrate from old version

hw/mc146818rtc.c |  427 
++---
1 files changed, 274 insertions(+), 153 deletions(-)

best regards
yang

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

can't restore a guest saved by an older qemu

2012-03-01 Thread Ren, Yongjie

Hi Avi,
I can't restore a guest saved by an older qemu. I wonder if this issue is 
introduced by some latest commits on the file 'hw/i8254.c'.
The new commit: b5ed4b6f  (on Mar 1)I tried to restore a guest using this 
qemu.
The old commit: 9d636ae7  (on Feb 11)I saved a guest using this qemu.

I got some error when restoring a guest saved by an old qemu now.
[root@vt-nhm9 jay]# qemu-system-x86_64 -smp 4 -m 1024 -hda rhel6u1.qcow 
-incoming "exec:dd if=save.img"
Could not open option rom 'vapic.bin': No such file or directory
qemu-system-x86_64: pci_add_option_rom: failed to find romfile "pxe-rtl8139.rom"
VNC server running on `::1:5900'
Unknown ramblock ":00:03.0/rtl8139.rom", cannot accept migration
qemu: warning: error while loading state for instance 0x0 of device 'ram'
load of migration failed
dd: writing to `standard output': Broken pipe
193+0 records in
192+0 records out
98304 bytes (98 kB) copied, 0.00713571 s, 13.8 MB/s

A good one is something like the following.
[root@vt-nhm9 jay]# ~/qemu-kvm/x86_64-softmmu/qemu-system-x86_64 -smp 4 -m 1024 
-hda rhel6u1.qcow -incoming "exec:dd if=save1.img"
VNC server running on `::1:5900'
781430+1 records in
781430+1 records out
400092473 bytes (400 MB) copied, 1.03926 s, 385 MB/s


Best Regards,
 Yongjie Ren  (Jay)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 2/4] KVM: Avoid checking huge page mappings in get_dirty_log()

2012-03-01 Thread Takuya Yoshikawa

Takuya Yoshikawa  wrote:

> Takuya Yoshikawa  wrote:
> 
> > +   while (mask) {
> > +   rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
> > +   __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
> >  
> > -   return write_protected;
> > +   /* clear the first set bit */
> > +   mask &= mask - 1;
> > +   }
> >  }
> 
> while (mask) {
>   fsbit = __ffs(mask);
>   gfn_offset += fsbit;
>   mask >>= fsbit + 1;
> 
>   rmapp = &slot->rmap[gfn_offset];
>   __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
> }
> 
> Seems to be better: BSF friendly.

I was wrong.

"and < shift" and bsf did not show any difference for this kind of loop
on my box.  So we should keep the original implementation.

Takuya
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [PATCH 3/4] net: split hostname and service by last colon

2012-03-01 Thread Amos Kong


On 24/02/12 17:29, Kevin Wolf wrote:

Am 10.02.2012 07:27, schrieb Amos Kong:

IPv6 address contains colons, parse will be wrong.

 [2312::8274]:5200

Signed-off-by: Amos Kong
---
  net.c |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net.c b/net.c
index f63014c..9e1ef9e 100644
--- a/net.c
+++ b/net.c
@@ -84,7 +84,7 @@ static int get_str_sep(char *buf, int buf_size, const char 
**pp, int sep)
  const char *p, *p1;
  int len;
  p = *pp;
-p1 = strchr(p, sep);
+p1 = strrchr(p, sep);
  if (!p1)
  return -1;
  len = p1 - p;


And what if the port isn't specified? I think you would erroneously
interpret the last part of the IP address as port.


IPv6 address have paired colons, need more precision check.

--
Amos.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [PATCH 1/4] Use getaddrinfo for migration

2012-03-01 Thread Amos Kong


On 24/02/12 17:08, Kevin Wolf wrote:

Am 10.02.2012 07:27, schrieb Amos Kong:

This allows us to use ipv4/ipv6 for migration addresses.
Once there, it also uses /etc/services names (it came free).

Signed-off-by: Juan Quintela
Signed-off-by: Amos Kong
---
  migration-tcp.c |   60 ---
  net.c   |  108 +++
  qemu_socket.h   |3 ++
  3 files changed, 127 insertions(+), 44 deletions(-)


This patch is making too many changes at once:

1. Move code around
2. Remove duplicated code between client and server
3. Replace parse_host_port() with an open-coded version



4. Modify the open-coded parse_host_port() to use getaddrinfo instead of
inet_aton/gethostbyname (Why can't we just change parse_host_port? Are
there valid reasons to use the old implementation? Which?)


tcp_common_start() needs to use the address list which is got by 
getaddrinfo(),


But I agree with verifying host/port by getaddrinfo() in parse_host_port.



This makes it rather hard to review.


diff --git a/migration-tcp.c b/migration-tcp.c
index 35a5781..644bb8f 100644
--- a/migration-tcp.c
+++ b/migration-tcp.c
@@ -81,43 +81,27 @@ static void tcp_wait_for_connect(void *opaque)

  int tcp_start_outgoing_migration(MigrationState *s, const char *host_port)
  {
-struct sockaddr_in addr;
  int ret;
-
-ret = parse_host_port(&addr, host_port);
-if (ret<  0) {
-return ret;
-}
+int fd;

  s->get_error = socket_errno;
  s->write = socket_write;
  s->close = tcp_close;

-s->fd = qemu_socket(PF_INET, SOCK_STREAM, 0);
-if (s->fd == -1) {
-DPRINTF("Unable to open socket");
-return -socket_error();
-}
-
-socket_set_nonblock(s->fd);
-
-do {
-ret = connect(s->fd, (struct sockaddr *)&addr, sizeof(addr));
-if (ret == -1) {
-ret = -socket_error();
-}
-if (ret == -EINPROGRESS || ret == -EWOULDBLOCK) {
-qemu_set_fd_handler2(s->fd, NULL, NULL, tcp_wait_for_connect, s);
-return 0;
-}
-} while (ret == -EINTR);
-
-if (ret<  0) {
+ret = tcp_client_start(host_port,&fd);
+s->fd = fd;
+if (ret == -EINPROGRESS || ret == -EWOULDBLOCK) {
+DPRINTF("connect in progress");
+qemu_set_fd_handler2(s->fd, NULL, NULL, tcp_wait_for_connect, s);
+} else if (ret<  0) {
  DPRINTF("connect failed\n");
-migrate_fd_error(s);
+if (ret != -EINVAL) {
+migrate_fd_error(s);
+}


This looks odd. It is probably needed to keep the old behaviour where a
failed parse_host_port() wouldn't call migrate_fd_error(). Is this
really required? Maybe I'm missing something, but the caller can't know
which failure case we had and if migrate_fd_error() has been called.


getaddrinfo() is used in tcp_common_start(), -EINVAL will be returned
when getaddrinfo() doesn't return 0.



  return ret;
+} else {
+migrate_fd_connect(s);
  }
-migrate_fd_connect(s);
  return 0;
  }

@@ -157,28 +141,16 @@ out2:

  int tcp_start_incoming_migration(const char *host_port)
  {
-struct sockaddr_in addr;
-int val;
+int ret;
  int s;

  DPRINTF("Attempting to start an incoming migration\n");

-if (parse_host_port(&addr, host_port)<  0) {
-fprintf(stderr, "invalid host/port combination: %s\n", host_port);
-return -EINVAL;
-}
-
-s = qemu_socket(PF_INET, SOCK_STREAM, 0);
-if (s == -1) {
-return -socket_error();
+ret = tcp_server_start(host_port,&s);
+if (ret<  0) {
+return ret;
  }

-val = 1;
-setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (const char *)&val, sizeof(val));
-
-if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
-goto err;
-}
  if (listen(s, 1) == -1) {
  goto err;
  }
diff --git a/net.c b/net.c
index c34474f..f63014c 100644
--- a/net.c
+++ b/net.c
@@ -99,6 +99,114 @@ static int get_str_sep(char *buf, int buf_size, const char 
**pp, int sep)
  return 0;
  }

+static int tcp_server_bind(int fd, struct addrinfo *rp)
+{
+int ret;
+int val = 1;
+
+/* allow fast reuse */
+setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (const char *)&val, sizeof(val));
+
+ret = bind(fd, rp->ai_addr, rp->ai_addrlen);
+
+if (ret == -1) {
+ret = -socket_error();
+}
+return ret;
+
+}
+
+static int tcp_client_connect(int fd, struct addrinfo *rp)
+{
+int ret;
+
+do {
+ret = connect(fd, rp->ai_addr, rp->ai_addrlen);
+if (ret == -1) {
+ret = -socket_error();
+}
+} while (ret == -EINTR);
+
+return ret;
+}
+
+
+static int tcp_start_common(const char *str, int *fd, bool server)
+{
+char hostname[512];
+const char *service;
+const char *name;
+struct addrinfo hints;
+struct addrinfo *result, *rp;
+int s;
+int sfd;
+int ret = -EINVAL;
+
+*fd = -1;
+se

Re: [Qemu-devel] [PATCH 1/4] Use getaddrinfo for migration

2012-03-01 Thread Amos Kong


On 24/02/12 17:34, Kevin Wolf wrote:

Am 10.02.2012 07:27, schrieb Amos Kong:

This allows us to use ipv4/ipv6 for migration addresses.
Once there, it also uses /etc/services names (it came free).

Signed-off-by: Juan Quintela
Signed-off-by: Amos Kong
---
  migration-tcp.c |   60 ---
  net.c   |  108 +++
  qemu_socket.h   |3 ++
  3 files changed, 127 insertions(+), 44 deletions(-)



@@ -157,28 +141,16 @@ out2:

  int tcp_start_incoming_migration(const char *host_port)
  {
-struct sockaddr_in addr;
-int val;
+int ret;
  int s;

  DPRINTF("Attempting to start an incoming migration\n");

-if (parse_host_port(&addr, host_port)<  0) {
-fprintf(stderr, "invalid host/port combination: %s\n", host_port);
-return -EINVAL;
-}


Oh, and this case doesn't print an error message any more now.


The check work is done in tcp_start_common()

tcp_start_incoming_migration()
 -> tcp_client_start()
 -> tcp_start_common()

--
Amos.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 2/4] KVM: Avoid checking huge page mappings in get_dirty_log()

2012-03-01 Thread Takuya Yoshikawa

Takuya Yoshikawa  wrote:

> + while (mask) {
> + rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
> + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
>  
> - return write_protected;
> + /* clear the first set bit */
> + mask &= mask - 1;
> + }
>  }

while (mask) {
fsbit = __ffs(mask);
gfn_offset += fsbit;
mask >>= fsbit + 1;

rmapp = &slot->rmap[gfn_offset];
__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
}

Seems to be better: BSF friendly.

I will update this function and post the result with dirty-log-perf output!

Takuya
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Bug 42829] KVM Guest with virtio network driver loses network connectivity

2012-03-01 Thread bugzilla-daemon

https://bugzilla.kernel.org/show_bug.cgi?id=42829





--- Comment #6 from Steve   2012-03-02 01:05:35 ---
My results:
===

git bisect log:
---
git bisect start
# good: [2c53b436a30867eb6b47dd7bab23ba638d1fb0d2] Linux 3.0-rc3
git bisect good 2c53b436a30867eb6b47dd7bab23ba638d1fb0d2
# bad: [b0af8dfdd67699e25083478c63eedef2e72ebd85] Linux 3.0-rc5
git bisect bad b0af8dfdd67699e25083478c63eedef2e72ebd85
# bad: [c01ad4081939f91ebd7277e8e731fd90ceb3e632] Merge branch 'for-linus' of
git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input
git bisect bad c01ad4081939f91ebd7277e8e731fd90ceb3e632
# bad: [f4ef084226f82ca923bf0a2658bb2876bd215ec1] Merge branch
'fbdev-fixes-for-linus' of
git://git.kernel.org/pub/scm/linux/kernel/git/lethal/fbdev-3.x
git bisect bad f4ef084226f82ca923bf0a2658bb2876bd215ec1
# bad: [bd5dc17be87b3a3073d50b23802647db3ae3fa8e] uts: make default hostname
configurable, rather than always using "(none)"
git bisect bad bd5dc17be87b3a3073d50b23802647db3ae3fa8e
# bad: [f39e8409955fad210a9a7169cc53c4c18daaef3a] Merge branch 'drm-fixes' of
git://git.kernel.org/pub/scm/linux/kernel/git/airlied/drm-2.6
git bisect bad f39e8409955fad210a9a7169cc53c4c18daaef3a
# bad: [81eb3dd8438802138ac9ce12428632f35562c060] Merge branch 'for-linus' of
git://neil.brown.name/md
git bisect bad 81eb3dd8438802138ac9ce12428632f35562c060
# good: [9864c0053d3da4c5731ac8a6c4835179310bd40a] md: Using poll  /proc/mdstat
can monitor the events of adding a spare disks
git bisect good 9864c0053d3da4c5731ac8a6c4835179310bd40a
# bad: [9b2dc8b665932a8e681a7ab3237f60475e75e161] md/raid5: fix
raid5_set_bi_hw_segments
git bisect bad 9b2dc8b665932a8e681a7ab3237f60475e75e161
# bad: [27d5ea04d08bea37bf651090e5f3c573d2390df8] md/bitmap: use proper
accessor macro
git bisect bad 27d5ea04d08bea37bf651090e5f3c573d2390df8
# bad: [01393f3d5836b7d62e925e6f4658a7eb22b83a11] md: check ->hot_remove_disk
when removing disk
git bisect bad 01393f3d5836b7d62e925e6f4658a7eb22b83a11


git bisect message:
---
01393f3d5836b7d62e925e6f4658a7eb22b83a11 is the first bad commit
commit 01393f3d5836b7d62e925e6f4658a7eb22b83a11
Author: Namhyung Kim 
Date:   Thu Jun 9 11:42:54 2011 +1000

md: check ->hot_remove_disk when removing disk

Check pers->hot_remove_disk instead of pers->hot_add_disk in slot_store()
during disk removal. The linear personality only has ->hot_add_disk and
no ->hot_remove_disk, so that removing disk in the array resulted to
following kernel bug:

$ sudo mdadm --create /dev/md0 --level=linear --raid-devices=4
/dev/loop[0-3]
$ echo none | sudo tee /sys/block/md0/md/dev-loop2/slot
 BUG: unable to handle kernel NULL pointer dereference at   (null)
 IP: [<  (null)>]   (null)
 PGD c9f5d067 PUD 8575a067 PMD 0
 Oops: 0010 [#1] SMP
 CPU 2
 Modules linked in: linear loop bridge stp llc kvm_intel kvm asus_atk0110
sr_mod cdrom sg

 Pid: 10450, comm: tee Not tainted 3.0.0-rc1-leonard+ #173 System
manufacturer System Product Name/P5G41TD-M PRO
 RIP: 0010:[<>]  [<  (null)>]   (null)
 RSP: 0018:880085757df0  EFLAGS: 00010282
 RAX: a00168e0 RBX: 8800d1431800 RCX: 006e
 RDX: 0001 RSI: 0002 RDI: 88008543c000
 RBP: 880085757e48 R08: 0002 R09: 000a
 R10:  R11: 88008543c2e0 R12: 
 R13: 8800b4641000 R14: 0005 R15: 
 FS:  7fe8c9e05700() GS:88011fa0()
knlGS:
 CS:  0010 DS:  ES:  CR0: 8005003b
 CR2:  CR3: b4502000 CR4: 000406e0
 DR0:  DR1:  DR2: 
 DR3:  DR6: 0ff0 DR7: 0400
 Process tee (pid: 10450, threadinfo 880085756000, task
8800c9f08000)
 Stack:
  8138496a 8800b4641000 88008543c268 
  8800b4641000 88008543c000 8800d1431868 81a78a90
  8800b4641000 88008543c000 8800d1431800 880085757e98
 Call Trace:
  [] ? slot_store+0xaa/0x265
  [] rdev_attr_store+0x89/0xa8
  [] sysfs_write_file+0x108/0x144
  [] vfs_write+0xb1/0x10d
  [] ? trace_hardirqs_on_caller+0x111/0x135
  [] sys_write+0x4d/0x77
  [] system_call_fastpath+0x16/0x1b
 Code:  Bad RIP value.
 RIP  [<  (null)>]   (null)
  RSP 
 CR2: 
 ---[ end trace ba5fc64319a826fb ]---

Signed-off-by: Namhyung Kim 
Cc: sta...@kernel.org
Signed-off-by: NeilBrown 

:04 04 e3d19f53113f5bb5faef422958607e4f0131d235
eab9913143e1c027df693bf1fa57475da77bd36e M  drivers


How could I help now ?

Thank you for your time.

-- 
Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail bec

Re: [Qemu-devel] [PATCH]qemu: deal with guest paniced event

2012-03-01 Thread Wen Congyang

At 03/02/2012 12:51 AM, Luiz Capitulino Wrote:
> On Mon, 27 Feb 2012 11:05:58 +0800
> Wen Congyang  wrote:
> 
>> When the host knows the guest is paniced, it will set
>> exit_reason to KVM_EXIT_GUEST_PANIC. So if qemu receive
>> this exit_reason, we can send a event to tell management
>> application that the guest is paniced.
>>
>> Signed-off-by: Wen Congyang 
>> ---
>>  kvm-all.c |3 +++
>>  linux-headers/linux/kvm.h |1 +
>>  monitor.c |3 +++
>>  monitor.h |1 +
>>  4 files changed, 8 insertions(+), 0 deletions(-)
>>
>> diff --git a/kvm-all.c b/kvm-all.c
>> index c4babda..ae428ab 100644
>> --- a/kvm-all.c
>> +++ b/kvm-all.c
>> @@ -1190,6 +1190,9 @@ int kvm_cpu_exec(CPUState *env)
>>  (uint64_t)run->hw.hardware_exit_reason);
>>  ret = -1;
>>  break;
>> +case KVM_EXIT_GUEST_PANIC:
>> +monitor_protocol_event(QEVENT_GUEST_PANICED, NULL);
>> +break;
> 
> The event alone is not enough, because the mngt app may miss it (eg. the panic
> happens before the mngt app connected to qemu).
> 
> A simple way to solve this would be to also add a new RunState called
> guest-panic and make the transition to it (besides sending the event).
> 
> A more general way would be to model this after -drive's werror/rerror 
> options,
> say guest-error=report|ignore|stop. When guest-error=stop, the mngt app will
> get a STOP event and can check the VM runstate to know if it's guest-panic.

OK, I will fix it.

Thanks
Wen Congyang

> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [PULL] kvm updates

2012-03-01 Thread Andreas Färber

Am 01.03.2012 13:47, schrieb Avi Kivity:
> Avi Kivity (1):
>   pc-bios: update kvmvapic.bin
> 
> Gleb Natapov (1):
>   kvm: Synchronize cpu state in kvm_arch_stop_on_emulation_error()
> 
> Jan Kiszka (10):
>   kvm: Set cpu_single_env only once
>   Remove useless casts from cpu iterators
>   Process pending work while waiting for initial kick-off in TCG mode
>   Allow to use pause_all_vcpus from VCPU context
>   target-i386: Add infrastructure for reporting TPR MMIO accesses
>   kvmvapic: Add option ROM
>   kvmvapic: Introduce TPR access optimization for Windows guests
>   kvmvapic: Simplify mp/up_set_tpr
>   optionsrom: Reserve space for checksum
>   kvmvapic: Use optionrom helpers

What about my kvmclock kvm_enabled() fix? Should I rather submit that as
part of my qom-cpu queue? It depends on it.

Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Bug 42829] KVM Guest with virtio network driver loses network connectivity

2012-03-01 Thread bugzilla-daemon

https://bugzilla.kernel.org/show_bug.cgi?id=42829


Steve  changed:

   What|Removed |Added

 Kernel Version|3.2-rc3 |v3.0-rc5




-- 
Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail because: ---
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Bug 42829] KVM Guest with virtio network driver loses network connectivity

2012-03-01 Thread bugzilla-daemon

https://bugzilla.kernel.org/show_bug.cgi?id=42829





--- Comment #5 from Steve   2012-03-01 23:04:37 ---
Results of my test:
===

In all test cases host configuration is the same:
-
kernel: latest 3.3-rc5+
compiler: gcc (4.7.0-pre 20120225 rev. 184573)
glibc-2.14.1-r2
bridge-utils-1.5
iproute2-3.1.0
ethtool-3.2

other system stuff: all latest from their git repos
util-linux, net-tools, kmod, udev, seabios, qemu-kvm

In all test cases guest configuration except kernel is the same:

compiler: gcc 4.6.2
glibc-2.14.1-r2
iproute2-3.1.0
ethtool-3.2

other system stuff: all latest from their git repos
util-linux, net-tools, kmod, udev


1) guest kernel =v2.6.39
   - works fine more than 5min
2) guest kernel =v3.0
   - virtio network goes down in less then 1min of test
3) guest kernel latest v3.3-rc5+
   - virtio network goes down in less then 1min of test


I continue with tracking in kernel tags after v2.6.39:

4) guest kernel =v3.0-rc1
   - works fine more than 5min

5) guest kernel =v3.0-rc5
   - virtio network goes down in less then 1min of test

6) guest kernel =v3.0-rc3
   - works fine more than 5min

I shortly look at =v3.0-rc5, problem could be somewhere in this tag.

Later I would like to locate problematic commit
with git bisecting and after I report my results.

-- 
Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail because: ---
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware

2012-03-01 Thread John Fastabend

On 3/1/2012 5:36 AM, Jamal Hadi Salim wrote:
> On Wed, 2012-02-29 at 10:19 -0800, John Fastabend wrote:
> 
>>>
>>> I want to see a unified API so that user space control applications (RSTP, 
>>> TRILL?)
>>> can use one set of netlink calls for both software bridge and hardware 
>>> offloaded
>>> bridges.  Does this proposal meet that requirement?
>>>
> 
> I dont see any issues with those requirements being met.
> 
>> Jamal, so why do "They have to be different calls"? I'm not so sure 
>> anymore...
>> moving to RTM_FDB_XXXENTRY saved some refactoring in the bridge module but 
>> that
>> is just cosmetic.
> 
> I may not want to use the s/ware bridge i.e I may want to use h/ware
> bridge. I may want to use both. So there are 3 variations there. You
> need at least 1.5 bits to represent them if you are going to use the
> same interface. There may be features in either h/ware but not in
> s/ware and vice-versa. 
> A single interface with flags which say this applies to hware:sware:both
> would be good, but it may be harder to achieve - thats why i suggested
> they be different.
> 
> cheers,
> jamal
> 

Hmm so I think what I'll do is this...

 both: ndm_flags = 0 
 sw  : ndm_flags = NTF_SW_FDB
 hw  : ndm_flags = NTF_HW_FDB

Then current tools will work with embedded bridges and software bridges
with the interesting case being when a port supporting an offloaded FDB
is attached to a SW bridge. Doing both in this case seems to be a reasonable
default to me.

The tricky bit will be pulling the message handlers out of the ./net/bridge
code so that we don't have to always load the bridge module to add entries
to a macvlan for example. I need to look at a few other things today but
I'll code up a patch for this tomorrow.

.John
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware

2012-03-01 Thread John Fastabend

On 3/1/2012 6:14 AM, Michael S. Tsirkin wrote:
> On Wed, Feb 29, 2012 at 09:25:56AM -0800, John Fastabend wrote:
>> Agreed. I think adding some ndo_ops for bridging offloads here would
>> work. For example the DSA infrastructure and/or macvlan devices might
>> need this. Along the lines of extending this RFC,
>>
>> [RFC] hardware bridging support for DSA switches
>> http://patchwork.ozlabs.org/patch/16578/
>>
>>
>> .John
> 
> One place where this might not work well would be
> macvtap which is not a network device so it doesn't have
> its own address, instead it inherits one from macvlan.
> 

But is macvtap really doing any forwarding or implementing any
RX filters? Took a quick scan and it looks like the forwarding
logic is all in the macvlan code paths. In this case I suspect
if we enable macvlan then any device built on top of it would
work.

.John
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] Added functionality that allows dynamically add and remove device specific reset functions

2012-03-01 Thread Bjorn Helgaas

On Thu, Mar 1, 2012 at 10:18 AM,   wrote:
> From b4cf24d5987475862de799c78773f13f25ed2af8 Mon Sep 17 00:00:00 2001
> From: Tadeusz Struk 
> Date: Tue, 17 Jan 2012 16:45:46 +
> Subject: [PATCH] Added functionality that allows dynamically add and remove
>  device specific reset functions

The subject line (after removing "[PATCH"]) becomes part of the
permanent changelog, so please choose one that follows the typical
style for the file.  A good trick is to use "git log --oneline
drivers/pci/pci.h" and make yours look like the others.

> I have a use case where I need to cleanup resource allocated for Virtual
> Functions after a guest OS that used it crashed. This cleanup needs to
> be done before the VF is being FLRed. The only possible way to do this
> seems to be by using pci_dev_specific_reset() function. Unfortunately
> this function only works for devices defined in a static table in the
> drivers/pci/quirks.c file. This patch changes it so that specific reset
> functions can be added and deleted dynamically.
>
> Signed-off-by: Tadeusz Struk 
>
> ---
>  drivers/pci/pci.h    |   19 ++-
>  drivers/pci/quirks.c |   63 
> ++
>  2 files changed, 71 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 1009a5e..10929d8 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -312,18 +312,35 @@ static inline resource_size_t 
> pci_resource_alignment(struct pci_dev *dev,
>  extern void pci_enable_acs(struct pci_dev *dev);
>
>  struct pci_dev_reset_methods {
> +       struct list_head list;
>        u16 vendor;
>        u16 device;
>        int (*reset)(struct pci_dev *dev, int probe);
>  };
>
>  #ifdef CONFIG_PCI_QUIRKS
> -extern int pci_dev_specific_reset(struct pci_dev *dev, int probe);
> +extern int
> +pci_dev_specific_reset(struct pci_dev *dev, int probe);
> +extern int
> +pci_dev_specific_reset_add(const struct pci_dev_reset_methods *reset_method);
> +extern int
> +pci_dev_specific_reset_remove(const struct pci_dev_reset_methods 
> *reset_method);

Please keep the existing code style, namely, function return type,
name, and arguments all on the same line (broken as needed to stay in
80 columns).  The "extern" is superfluous but most of the rest of the
file does use it, so this patch should, too.  It would be nice if you
added a second patch to remove *all* the "externs" in this file.

> +
>  #else
>  static inline int pci_dev_specific_reset(struct pci_dev *dev, int probe)
>  {
>        return -ENOTTY;
>  }
> +int
> +pci_dev_specific_reset_add(const struct pci_dev_reset_methods *reset_method)

Function name on same line as return type (several more instances below).

> +{
> +       return 0;
> +}
> +int
> +pci_dev_specific_reset_remove(const struct pci_dev_reset_methods 
> *reset_method)
> +{
> +       return 0;
> +}
>  #endif
>
>  #endif /* DRIVERS_PCI_H */
> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
> index 6476547..963e527 100644
> --- a/drivers/pci/quirks.c
> +++ b/drivers/pci/quirks.c
> @@ -3070,26 +3070,69 @@ static int reset_intel_82599_sfp_virtfn(struct 
> pci_dev *dev, int probe)
>  }
>
>  #define PCI_DEVICE_ID_INTEL_82599_SFP_VF   0x10ed
> +static struct pci_dev_reset_methods intel_82599_sfp_vf_reset = {
> +       .vendor = PCI_VENDOR_ID_INTEL,
> +       .device = PCI_DEVICE_ID_INTEL_82599_SFP_VF,
> +       .reset = reset_intel_82599_sfp_virtfn,
> +       .list = LIST_HEAD_INIT(intel_82599_sfp_vf_reset.list)

I don't think you need to initialize the .list member here.  You
should be able to do that when linking the method into the list.

> +} ;
>
> -static const struct pci_dev_reset_methods pci_dev_reset_methods[] = {
> -       { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82599_SFP_VF,
> -                reset_intel_82599_sfp_virtfn },
> -       { PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
> -               reset_intel_generic_dev },
> -       { 0 }
> -};
> +static struct pci_dev_reset_methods intel_generic_reset = {
> +       .vendor = PCI_VENDOR_ID_INTEL,
> +       .device = PCI_ANY_ID,
> +       .reset = reset_intel_generic_dev,
> +       .list = LIST_HEAD_INIT(intel_generic_reset.list)
> +} ;
> +
> +static DEFINE_SPINLOCK(reset_list_lock);
> +static LIST_HEAD(reset_list);
> +
> +
> +static int __init pci_dev_specific_reset_init(void)
> +{
> +       pci_dev_specific_reset_add(&intel_82599_sfp_vf_reset);
> +       pci_dev_specific_reset_add(&intel_generic_reset);

I think this would be better as:

for (i = 0; i < ARRAY_SIZE(pci_dev_reset_methods); i++)
pci_dev_specific_reset_add(&pci_dev_reset_methods[i]);

That way, new statically included methods could be added without
touching this function.  Also, I don't think the
pci_dev_reset_methods[] definition would need to be changed at all for
this patch (though I think you could remove the trailing "{ 0 }"
sentinel).

> +       return 0;
> +}
> +
> +late_initcall(pci_dev_specific_reset_init);
> +
> +int
> +pci_dev_specific_re

[PATCH] Fix warning

2012-03-01 Thread tadeusz . struk

>From 268c5427305d59c1f6f6c1ce8047f6e32a7edcac Mon Sep 17 00:00:00 2001
From: Tadeusz Struk 
Date: Mon, 14 Feb 2011 14:38:18 +
Subject: [PATCH] Fixed warning

This patch fixes the following warning.
# virsh start fedora16-64
kernel: [  133.324565] pci-stub :02:01.1: claimed by stub
kernel: [  134.163769] pci-stub :02:01.1: enabling device ( -> 0002)
kernel: [  164.282679] [ cut here ]
kernel: [  164.282685] WARNING: at drivers/pci/search.c:46 
pci_find_upstream_pcie_bridge+0x87/0x9f()
kernel: [  164.282687] Hardware name: SandyBridge Platform
kernel: [  164.282689] Modules linked in: sha512_generic sha256_generic 
icp_qa_al(O) nfs fscache auth_rpcgss nfs_acl mga drm ip6table_filter ip6_tables 
ebtable_nat ebtables lockd ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 
nf_defrag_ipv4 xt_state nf_conntrack xt_CHECKSUM iptable_mangle tun bridge stp 
llc sunrpc btrfs zlib_deflate libcrc32c virtio_net kvm_intel kvm uinput 
matroxfb_base matroxfb_DAC1064 matroxfb_accel matroxfb_Ti3026 matroxfb_g450 
g450_pll matroxfb_misc e1000e iTCO_wdt iTCO_vendor_support igb microcode 
i2c_i801 shpchp serio_raw i2c_core pcspkr dca [last unloaded: scsi_wait_scan]
kernel: [  164.282724] Pid: 1233, comm: qemu-kvm Tainted: G   O 3.2.5 
#10
kernel: [  164.282726] Call Trace:
kernel: [  164.282732]  [] warn_slowpath_common+0x83/0x9b
kernel: [  164.282735]  [] warn_slowpath_null+0x1a/0x1c
kernel: [  164.282737]  [] 
pci_find_upstream_pcie_bridge+0x87/0x9f
kernel: [  164.282741]  [] domain_context_mapping+0x50/0xe6
kernel: [  164.282744]  [] domain_add_dev_info+0x44/0xe3
kernel: [  164.282747]  [] 
intel_iommu_attach_device+0x14f/0x15c
kernel: [  164.282750]  [] iommu_attach_device+0x1c/0x1e
kernel: [  164.282764]  [] kvm_assign_device+0x4a/0x114 [kvm]
kernel: [  164.282773]  [] 
kvm_vm_ioctl_assigned_device+0x434/0xb25 [kvm]
kernel: [  164.282777]  [] ? __do_fault+0x351/0x38b
kernel: [  164.282781]  [] ? arch_local_irq_save+0x15/0x1b
kernel: [  164.282784]  [] ? 
_raw_spin_unlock_irqrestore+0x17/0x19
kernel: [  164.282787]  [] ? pci_conf1_read+0xe1/0xee
kernel: [  164.282794]  [] kvm_vm_ioctl+0x377/0x3ac [kvm]
kernel: [  164.282797]  [] ? pci_read_config+0xa2/0x1bd
kernel: [  164.282801]  [] ? virt_to_head_page+0xe/0x31
kernel: [  164.282804]  [] do_vfs_ioctl+0x45d/0x49e
kernel: [  164.282808]  [] ? fsnotify_access+0x5f/0x67
kernel: [  164.282811]  [] sys_ioctl+0x56/0x7b
kernel: [  164.282814]  [] system_call_fastpath+0x16/0x1b
kernel: [  164.282816] ---[ end trace 6a834ec5ac21cba8 ]---

Signed-off-by: Tadeusz Struk 

---
 drivers/pci/search.c |7 +--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/search.c b/drivers/pci/search.c
index 9d75dc8..7847c6b 100644
--- a/drivers/pci/search.c
+++ b/drivers/pci/search.c
@@ -26,6 +26,7 @@ struct pci_dev *
 pci_find_upstream_pcie_bridge(struct pci_dev *pdev)
 {
struct pci_dev *tmp = NULL;
+   struct pci_dev *vf = pdev;
 
if (pci_is_pcie(pdev))
return NULL;
@@ -40,8 +41,10 @@ pci_find_upstream_pcie_bridge(struct pci_dev *pdev)
}
/* PCI device should connect to a PCIe bridge */
if (pdev->pcie_type != PCI_EXP_TYPE_PCI_BRIDGE) {
-   /* Busted hardware? */
-   WARN_ON_ONCE(1);
+   if (!vf->is_virtfn) {
+   /* Busted hardware? */
+   WARN_ON_ONCE(1);
+   }
return NULL;
}
return pdev;
-- 
1.7.7.6

--
Intel Shannon Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
Business address: Dromore House, East Park, Shannon, Co. Clare

This e-mail and any attachments may contain confidential material for the sole 
use of the intended recipient(s). Any review or distribution by others is 
strictly prohibited. If you are not the intended recipient, please contact the 
sender and delete all copies.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Added functionality that allows dynamically add and remove device specific reset functions

2012-03-01 Thread tadeusz . struk

>From b4cf24d5987475862de799c78773f13f25ed2af8 Mon Sep 17 00:00:00 2001
From: Tadeusz Struk 
Date: Tue, 17 Jan 2012 16:45:46 +
Subject: [PATCH] Added functionality that allows dynamically add and remove
 device specific reset functions

I have a use case where I need to cleanup resource allocated for Virtual
Functions after a guest OS that used it crashed. This cleanup needs to
be done before the VF is being FLRed. The only possible way to do this
seems to be by using pci_dev_specific_reset() function. Unfortunately
this function only works for devices defined in a static table in the
drivers/pci/quirks.c file. This patch changes it so that specific reset
functions can be added and deleted dynamically.

Signed-off-by: Tadeusz Struk 

---
 drivers/pci/pci.h|   19 ++-
 drivers/pci/quirks.c |   63 ++
 2 files changed, 71 insertions(+), 11 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 1009a5e..10929d8 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -312,18 +312,35 @@ static inline resource_size_t 
pci_resource_alignment(struct pci_dev *dev,
 extern void pci_enable_acs(struct pci_dev *dev);
 
 struct pci_dev_reset_methods {
+   struct list_head list;
u16 vendor;
u16 device;
int (*reset)(struct pci_dev *dev, int probe);
 };
 
 #ifdef CONFIG_PCI_QUIRKS
-extern int pci_dev_specific_reset(struct pci_dev *dev, int probe);
+extern int
+pci_dev_specific_reset(struct pci_dev *dev, int probe);
+extern int
+pci_dev_specific_reset_add(const struct pci_dev_reset_methods *reset_method);
+extern int
+pci_dev_specific_reset_remove(const struct pci_dev_reset_methods 
*reset_method);
+
 #else
 static inline int pci_dev_specific_reset(struct pci_dev *dev, int probe)
 {
return -ENOTTY;
 }
+int
+pci_dev_specific_reset_add(const struct pci_dev_reset_methods *reset_method)
+{
+   return 0;
+}
+int
+pci_dev_specific_reset_remove(const struct pci_dev_reset_methods *reset_method)
+{
+   return 0;
+}
 #endif
 
 #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 6476547..963e527 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3070,26 +3070,69 @@ static int reset_intel_82599_sfp_virtfn(struct pci_dev 
*dev, int probe)
 }
 
 #define PCI_DEVICE_ID_INTEL_82599_SFP_VF   0x10ed
+static struct pci_dev_reset_methods intel_82599_sfp_vf_reset = {
+   .vendor = PCI_VENDOR_ID_INTEL,
+   .device = PCI_DEVICE_ID_INTEL_82599_SFP_VF,
+   .reset = reset_intel_82599_sfp_virtfn,
+   .list = LIST_HEAD_INIT(intel_82599_sfp_vf_reset.list)
+} ;
 
-static const struct pci_dev_reset_methods pci_dev_reset_methods[] = {
-   { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82599_SFP_VF,
-reset_intel_82599_sfp_virtfn },
-   { PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
-   reset_intel_generic_dev },
-   { 0 }
-};
+static struct pci_dev_reset_methods intel_generic_reset = {
+   .vendor = PCI_VENDOR_ID_INTEL,
+   .device = PCI_ANY_ID,
+   .reset = reset_intel_generic_dev,
+   .list = LIST_HEAD_INIT(intel_generic_reset.list)
+} ;
+
+static DEFINE_SPINLOCK(reset_list_lock);
+static LIST_HEAD(reset_list);
+
+
+static int __init pci_dev_specific_reset_init(void)
+{
+   pci_dev_specific_reset_add(&intel_82599_sfp_vf_reset);
+   pci_dev_specific_reset_add(&intel_generic_reset);
+   return 0;
+}
+
+late_initcall(pci_dev_specific_reset_init);
+
+int
+pci_dev_specific_reset_add(const struct pci_dev_reset_methods *reset_method)
+{
+   if (reset_method && reset_method->reset) {
+   spin_lock(&reset_list_lock);
+   list_add((struct list_head *)&reset_method->list,
+&reset_list);
+   spin_unlock(&reset_list_lock);
+   return 0;
+   }
+   return -EINVAL;
+}
+EXPORT_SYMBOL(pci_dev_specific_reset_add);
+
+int
+pci_dev_specific_reset_remove(const struct pci_dev_reset_methods *reset_method)
+{
+   if (reset_method) {
+   spin_lock(&reset_list_lock);
+   list_del((struct list_head *)(&reset_method->list));
+   spin_unlock(&reset_list_lock);
+   return 0;
+   }
+   return -EINVAL;
+}
+EXPORT_SYMBOL(pci_dev_specific_reset_remove);
 
 int pci_dev_specific_reset(struct pci_dev *dev, int probe)
 {
const struct pci_dev_reset_methods *i;
-
-   for (i = pci_dev_reset_methods; i->reset; i++) {
+   list_for_each_entry(i, &reset_list, list) {
if ((i->vendor == dev->vendor ||
 i->vendor == (u16)PCI_ANY_ID) &&
(i->device == dev->device ||
 i->device == (u16)PCI_ANY_ID))
return i->reset(dev, probe);
}
-
return -ENOTTY;
 }
-- 
1.7.7.6

--
Intel Shannon Limited
Registered in Ireland
Registered Office: Collin

Re: [Qemu-devel] [PULL] Urgent memory fix for kvm with unaligned memory slots

2012-03-01 Thread Avi Kivity

On 03/01/2012 07:08 PM, Eric Blake wrote:
> On 03/01/2012 10:03 AM, Avi Kivity wrote:
> >>>
> >>> -ram = memory_region_get_ram_ptr(mr) + section->offset_within_region;
> >>> +ram = memory_region_get_ram_ptr(mr) + section->offset_within_region
> >>> + delta;
> >>
> >> Am I crazy, or does this look wrong?
> > 
> > Could be both.  Why do you thing it is wrong?
>
> Line wrapping makes it look like we are adding two lines, one line
> ending in 'section->offset_within_region', and the next line starting
> with 'delta;', which is a syntax error.
>
> But without line wrapping, we are adding just one line with
> 'offset_within_region + delta;' at the end of that line.
>

Ah, of course.  I just copy/pasted this into thunderbird, this is not a
proper patch but a pull request.  Sorry about the confusion.

Bobby: so it does indeed look wrong.  This says nothing about your
sanity though, for that consult a qualified professional instead of
asking on the mailing list.

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [PULL] Urgent memory fix for kvm with unaligned memory slots

2012-03-01 Thread Bobby Powers

On Thu, Mar 1, 2012 at 12:08 PM, Eric Blake  wrote:
> On 03/01/2012 10:03 AM, Avi Kivity wrote:

 -    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region;
 +    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region
 + delta;
>>>
>>> Am I crazy, or does this look wrong?
>>
>> Could be both.  Why do you thing it is wrong?
>
> Line wrapping makes it look like we are adding two lines, one line
> ending in 'section->offset_within_region', and the next line starting
> with 'delta;', which is a syntax error.

yea, patch line wrapping was making my eyes see things.  okay, sorry
for the noise.

>
> But without line wrapping, we are adding just one line with
> 'offset_within_region + delta;' at the end of that line.
>
> --
> Eric Blake   ebl...@redhat.com    +1-919-301-3266
> Libvirt virtualization library http://libvirt.org
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [PULL] Urgent memory fix for kvm with unaligned memory slots

2012-03-01 Thread Eric Blake

On 03/01/2012 10:03 AM, Avi Kivity wrote:
>>>
>>> -ram = memory_region_get_ram_ptr(mr) + section->offset_within_region;
>>> +ram = memory_region_get_ram_ptr(mr) + section->offset_within_region
>>> + delta;
>>
>> Am I crazy, or does this look wrong?
> 
> Could be both.  Why do you thing it is wrong?

Line wrapping makes it look like we are adding two lines, one line
ending in 'section->offset_within_region', and the next line starting
with 'delta;', which is a syntax error.

But without line wrapping, we are adding just one line with
'offset_within_region + delta;' at the end of that line.

-- 
Eric Blake   ebl...@redhat.com+1-919-301-3266
Libvirt virtualization library http://libvirt.org

signature.asc
Description: OpenPGP digital signature

Re: [PULL] Urgent memory fix for kvm with unaligned memory slots

2012-03-01 Thread Avi Kivity

On 03/01/2012 06:51 PM, Bobby Powers wrote:
> > /* kvm works in page size chunks, but the function may be called
> >with sub-page size and unaligned start address. */
> > -size = TARGET_PAGE_ALIGN(size);
> > -start_addr = TARGET_PAGE_ALIGN(start_addr);
> > +delta = TARGET_PAGE_ALIGN(size) - size;
> > +if (delta > size) {
> > +return;
> > +}
> > +start_addr += delta;
> > +size -= delta;
> > +size &= TARGET_PAGE_MASK;
> > +if (!size || (start_addr & ~TARGET_PAGE_MASK)) {
> > +return;
> > +}
> >
> > if (!memory_region_is_ram(mr)) {
> > return;
> > }
> >
> > -ram = memory_region_get_ram_ptr(mr) + section->offset_within_region;
> > +ram = memory_region_get_ram_ptr(mr) + section->offset_within_region
> > + delta;
>
> Am I crazy, or does this look wrong?

Could be both.  Why do you thing it is wrong?

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [PATCH]qemu: deal with guest paniced event

2012-03-01 Thread Luiz Capitulino

On Mon, 27 Feb 2012 11:05:58 +0800
Wen Congyang  wrote:

> When the host knows the guest is paniced, it will set
> exit_reason to KVM_EXIT_GUEST_PANIC. So if qemu receive
> this exit_reason, we can send a event to tell management
> application that the guest is paniced.
> 
> Signed-off-by: Wen Congyang 
> ---
>  kvm-all.c |3 +++
>  linux-headers/linux/kvm.h |1 +
>  monitor.c |3 +++
>  monitor.h |1 +
>  4 files changed, 8 insertions(+), 0 deletions(-)
> 
> diff --git a/kvm-all.c b/kvm-all.c
> index c4babda..ae428ab 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -1190,6 +1190,9 @@ int kvm_cpu_exec(CPUState *env)
>  (uint64_t)run->hw.hardware_exit_reason);
>  ret = -1;
>  break;
> +case KVM_EXIT_GUEST_PANIC:
> +monitor_protocol_event(QEVENT_GUEST_PANICED, NULL);
> +break;

The event alone is not enough, because the mngt app may miss it (eg. the panic
happens before the mngt app connected to qemu).

A simple way to solve this would be to also add a new RunState called
guest-panic and make the transition to it (besides sending the event).

A more general way would be to model this after -drive's werror/rerror options,
say guest-error=report|ignore|stop. When guest-error=stop, the mngt app will
get a STOP event and can check the VM runstate to know if it's guest-panic.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PULL] Urgent memory fix for kvm with unaligned memory slots

2012-03-01 Thread Bobby Powers

On Thu, Mar 1, 2012 at 8:40 AM, Avi Kivity  wrote:
> The memory core may generate RAM memory regions that are not page
> aligned, but the kvm code is not prepared to handle them well and will
> abort under certain conditions.  This patch fixes the problem.
>
> Please pull from:
>
>  git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git memory/urgent
>
> 
> Avi Kivity (1):
>      kvm: fix unaligned slots
>
>  kvm-all.c |   15 ---
>  1 files changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/kvm-all.c b/kvm-all.c
> index c4babda..4b7a4ae 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -541,17 +541,26 @@ static void kvm_set_phys_mem(MemoryRegionSection
> *section, bool add)
>     target_phys_addr_t start_addr = section->offset_within_address_space;
>     ram_addr_t size = section->size;
>     void *ram = NULL;
> +    unsigned delta;
>
>     /* kvm works in page size chunks, but the function may be called
>        with sub-page size and unaligned start address. */
> -    size = TARGET_PAGE_ALIGN(size);
> -    start_addr = TARGET_PAGE_ALIGN(start_addr);
> +    delta = TARGET_PAGE_ALIGN(size) - size;
> +    if (delta > size) {
> +        return;
> +    }
> +    start_addr += delta;
> +    size -= delta;
> +    size &= TARGET_PAGE_MASK;
> +    if (!size || (start_addr & ~TARGET_PAGE_MASK)) {
> +        return;
> +    }
>
>     if (!memory_region_is_ram(mr)) {
>         return;
>     }
>
> -    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region;
> +    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region
> + delta;

Am I crazy, or does this look wrong?

>
>     while (1) {
>         mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr +
> size);
>
> --
> error compiling committee.c: too many arguments to function
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4] KVM: Resize kvm_io_range array dynamically

2012-03-01 Thread Alex Williamson

On Thu, 2012-03-01 at 12:14 +0200, Sasha Levin wrote:
> On Thu, Mar 1, 2012 at 9:01 AM, Amos Kong  wrote:
> > This patch makes the kvm_io_range array can be resized dynamically.
> > Set an upper bounds for kvm_io_range to limit userspace.
> > 1000 is a very large limit and not bloat the typical user.
> 
> Whats the reason for making everything dynamic? Memory savings there
> aren't that significant.

We're currently looking at about 14k for these arrays with 300 entries
since we have two of them.  If we're going to double it or triple it to
handle the maximum use case, why impose that on the typical VM?  It may
not be multiple megabytes, but I wouldn't say it's insignificant either.

> 
> If you want to make it more efficient just define:
> static struct kvm_io_bus io_bus[2];
> 
> somewhere in kvm_main.c, and just switch between them when you need to
> do insertion and removal of devices. You get the benefit of zero slub
> usage, no allocations in any of the paths, and much simpler logic.

It's updated via rcu.  The change Amos is proposing is fairly trivial;
allocate the necessary size and memcpy instead of memdup.  Maybe it can
be optimized further, but this seems like a step in the right direction
to handle worst case use and, if anything, benefit the typical user too.
Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Bug 42829] KVM Guest with virtio network driver loses network connectivity

2012-03-01 Thread bugzilla-daemon

https://bugzilla.kernel.org/show_bug.cgi?id=42829





--- Comment #4 from Steve   2012-03-01 15:22:53 ---
Thank you for information.

On host I used last updated:

kernel from:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

qemu-kvm from:
git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git

seabios from:
git://git.seabios.org/seabios.git

used compiler, libs & other stuff:
gcc-4.7.0 (feb 2012)
glibc-2.14.1-r2
bridge-utils-1.5
iproute2-3.1.0
ethtool-3.2

On guest I can proceed this:

1) test case one - kernel before https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail because: ---
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

live migration between amd fam15h-fam10h

2012-03-01 Thread Vasilis Liaskovitis

Hi,

I am getting a frozen guest when migrating from an Opteron 6274 host (amd 
fam15h) to
an Opteron 6174 host (amd fam10h). The live migration completes succesfully, but
the guest is frozen: vcn screen is still there, but no input is possible and
no kernel output is seen. Trying "c" on the qemu-monitor does not help.
I am using "-cpu Opteron_G3" which I assumed would be ok for both host cpus.

In the opposite direction (migrating from an amd fam10h host to an amdfam15h
host) the guest continues to run on the destination. However, on most of these
successfull live migrations, I notice a "clocksource unstable" message on the
guest kernel (using the default kvm-clock clocksource) e.g.
Clocksource tsc unstable (delta = -1500533439 ns)
Same situation (guest runs on destination with clocksource unstable message)
happens when migrating between fam15h hosts (I have not tried between fam10h
hosts)

Changing the clocksource (tsc, acpi_pm, hpet) does not solve the issue.
Also tried with "-cpu kvm64" with same result.

qemu-kvm version: 0.15.1, 1.0 or qemu-kvm/master
Host kernel: 3.0.15 (on both hosts)
Guest kernel: 3.0.6 or 3.2

this is the qemu-kvm command line used on the source host: 

"
kvm -enable-kvm -m 1024 -smp 1 -cpu Opteron_G3,check -drive \
file=/opt/test.img,if=none,id=drive-virtio-disk1,format=raw,cache=writethrough,boot=on
-device 
virtio-blk-pci,bus=pci.0,addr=0x5,drive=drive-virtio-disk1,id=virtio-disk1
-monitor stdio -vnc 0.0.0.0:6 -vga std -chardev pty,id=charserial0 -device
isa-serial,chardev=charserial0,id=serial0 -usb -device usb-tablet,id=input0
"

The destination host has the same command line with an added "-incoming
tcp:". I have mainly tested this with non-shared storage (but also shared
storage has the same result). Migration is triggered with "migrate -b 
tcp:destip:"

Do the TSC microarchitecture changes in amdfam15h (see AMD SW optimiization
guide for fam15h, 47414 Rev 3.02 Appendix E) affect pvclock stability on
migration in same family or across families?

cpuid information follows in case it's helpful.

6274 host:

 eax ineax  ebx  ecx  edx
 000d 68747541 444d4163 69746e65
0001 00600f12 02100800 1e98220b 178bfbff
0002    
0003    
0004    
0005 0040 0040 0003 
0006   0001 
0007    
0008    
0009    
000a    
000b    
000c    
000d    
8000 801e 68747541 444d4163 69746e65
8001 00600f12 3000 01c9bfff 2fd3fbff
8002 20444d41 6574704f 286e6f72 20294d54
8003 636f7250 6f737365 32362072 20203437
8004 20202020 20202020 20202020 00202020
8005 ff20ff18 ff20ff30 10040140 40020140
8006 6400 64004200 08008140 0060e140
8007    03d9
8008 3030  500f 
8009    
800a 0001 0001  14ff
800b    
800c    
800d    
800e    
800f    
8010    
8011    
8012    
8013    
8014    
8015    
8016    
8017    
8018    
8019 f020f018 6400  
801a 0003   
801b 00ff   
801c  80032013 00010200 800f
801d    
801e 0022 0101 0100 

Vendor ID: "AuthenticAMD"; CPUID level 13

AMD-specific functions
Version 00600f12:
Family: 15 Model: 1 []

Standard feature flags 178bfbff:
Floating Point Unit
Virtual Mode Extensions
Debugging Extensions
Page Size Extensions
Time Stamp Counter (with RDTSC and CR4 disable bit)
Model Specific Registers with RDMSR & WRMSR
PAE - Page Address Extensions
Machine Check Exception
COMPXCHG8B Instruction
APIC
SYSCALL/SYSRET or SYSENTER/SYSEXIT instructions
MTRR - Memory Type Range Registers
Global paging extension
Machine Check Architecture
Conditional Move Instruction
PAT - Page Attribute Table
PSE-36 - Page Size Extensions
19 - reserved
MMX instructions
FXSAVE/FXRSTOR
25 - reserved
26 - reserved
28 - reserved
Generation: 15 Model: 1
Extended feature flags 2fd3fbff:
Floating Point Unit
Virtual Mode Extensions
Debugging Extensions
Page Size Ext

[Bug 42829] KVM Guest with virtio network driver loses network connectivity

2012-03-01 Thread bugzilla-daemon

https://bugzilla.kernel.org/show_bug.cgi?id=42829


Kurk  changed:

   What|Removed |Added

 CC||k...@shiftmail.org




--- Comment #3 from Kurk   2012-03-01 14:16:36 ---
Hello,
I am pretty sure I read about a very similar bug report recently, dunno if that
was on kvm or qemu mailing list, or in a bugzilla entry like this.
The guy said that there was a reproducer: running a network test between two
affected virtual machines. Maybe that was netperf, not sure. Would reproduce
the bug in a short time.
He said kernel 3.0 (GUEST!) was the first he tested to have this problem, while
much older guest kernels did not have this problem.

I suggest Steve tries this reproducer, and tell if it works in his case. If
that works, debugging would be much faster.

-- 
Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail because: ---
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware

2012-03-01 Thread Michael S. Tsirkin

On Wed, Feb 29, 2012 at 09:25:56AM -0800, John Fastabend wrote:
> Agreed. I think adding some ndo_ops for bridging offloads here would
> work. For example the DSA infrastructure and/or macvlan devices might
> need this. Along the lines of extending this RFC,
> 
> [RFC] hardware bridging support for DSA switches
> http://patchwork.ozlabs.org/patch/16578/
> 
> 
> .John

One place where this might not work well would be
macvtap which is not a network device so it doesn't have
its own address, instead it inherits one from macvlan.

-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PULL] Urgent memory fix for kvm with unaligned memory slots

2012-03-01 Thread Avi Kivity

The memory core may generate RAM memory regions that are not page
aligned, but the kvm code is not prepared to handle them well and will
abort under certain conditions.  This patch fixes the problem.

Please pull from:

  git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git memory/urgent


Avi Kivity (1):
  kvm: fix unaligned slots

 kvm-all.c |   15 ---
 1 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/kvm-all.c b/kvm-all.c
index c4babda..4b7a4ae 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -541,17 +541,26 @@ static void kvm_set_phys_mem(MemoryRegionSection
*section, bool add)
 target_phys_addr_t start_addr = section->offset_within_address_space;
 ram_addr_t size = section->size;
 void *ram = NULL;
+unsigned delta;
 
 /* kvm works in page size chunks, but the function may be called
with sub-page size and unaligned start address. */
-size = TARGET_PAGE_ALIGN(size);
-start_addr = TARGET_PAGE_ALIGN(start_addr);
+delta = TARGET_PAGE_ALIGN(size) - size;
+if (delta > size) {
+return;
+}
+start_addr += delta;
+size -= delta;
+size &= TARGET_PAGE_MASK;
+if (!size || (start_addr & ~TARGET_PAGE_MASK)) {
+return;
+}
 
 if (!memory_region_is_ram(mr)) {
 return;
 }
 
-ram = memory_region_get_ram_ptr(mr) + section->offset_within_region;
+ram = memory_region_get_ram_ptr(mr) + section->offset_within_region
+ delta;
 
 while (1) {
 mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr +
size);

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] Add -netdev to man page

2012-03-01 Thread Avi Kivity

On 03/01/2012 02:51 PM, Miroslav Rezanina wrote:
> There's missing -netdev description in the man page for qemu. As this is
> recommended way to create network backend, lack of documentation can 
> discourage
> its usage.
>
> -net option is preserved but marked as obsolete way.

Please post to qemu-de...@nongnu.org instead.

>  "socket],id=str[,option][,option][,...]\n", QEMU_ARCH_ALL)
>  STEXI
> +@item -netdev user|tap|vde|socket,id=@var{str}[,option][,option][,...]
> +
> +Create a new network backend to the guest. 

"for the guest"?  But they're actually not for the guest, at least not
directly.

> Network's id 

"The backend's id" or "network backend identifiers"

> can be used with
> +the -device option to "plug" a particular network device into the network 
> +backend, e.g.

"to connect a backend with a device"?

> +
> +@example
> +-netdev user,id=mynet -device e1000,netdev=mynet
> +@end example
> +
> +You can use following types of backend:
> +
> +@table @option
> +
> +@item -netdev user
> +
> +User networking is default network backend. This backend does not require 
> root 
> +priviledges, does not allow ICMP trafic and host is not directly accessible 
> +from the host or the external network.
> +
> +Valid options are:
> +
> +@table @option
> +
> +@item restrict=y|yes|n|no
> +If this options is enabled, the guest will be isolated, i.e. it will not be
> +able to contact the host and no guest IP packets will be routed over the host
> +to the outside. This option does not affect explicitly set forwarding rule.

"rules"

> +
> +@item net=@var{addr}[/@var{mask}]
> +Set IP network address the guest will see. Optionally specify the netmask,

"the IP"

> +either in the form a.b.c.d or as number of valid top-most bits. Default is
> +10.0.2.0/8.
> +
> +@item host=@var{addr}
> +Specify the guest-visible address of the host. Default is the 2nd IP in the
> +guest network, i.e. x.x.x.2.
> +
> +@item hostname=@var{name}
> +Specifies the client hostname reported by the builtin DHCP server.
> +
> +@item dhcpstart=@var{addr}
> +Specify the first of the 16 IPs the built-in DHCP server can assign. Default
> +is the 16th to 31st IP in the guest network, i.e. x.x.x.16 to x.x.x.31.
> +
> +@item dns=@var{addr}
> +Specify the guest-visible address of the virtual nameserver. The address must
> +be different from the host address. Default is the 3rd IP in the guest 
> network,
> +i.e. x.x.x.3.
> +
> +@item tftp=@var{dir}
> +When using the user mode network stack, activate a built-in TFTP
> +server. The files in @var{dir} will be exposed as the root of a TFTP server.
> +The TFTP client on the guest must be configured in binary mode (use the 
> command
> +@code{bin} of the Unix TFTP client).
> +
> +@item bootfile=@var{file}
> +When using the user mode network stack, broadcast @var{file} as the BOOTP
> +filename. In conjunction with @option{tftp}, this can be used to network boot
> +a guest from a local directory.
> +
> +Example (using pxelinux):
> +@example
> +qemu -hda linux.img -boot n -netdev 
> user,id=netid,tftp=/path/to/tftp/files,bootfile=/pxelinux.0
> +@end example
> +
> +@item smb=@var{dir}[,smbserver=@var{addr}]
> +When using the user mode network stack, activate a built-in SMB
> +server so that Windows OSes can access to the host files in @file{@var{dir}}
> +transparently. The IP address of the SMB server can be set to @var{addr}. By
> +default the 4th IP in the guest network is used, i.e. x.x.x.4.
> +
> +In the guest Windows OS, the line:
> +@example
> +10.0.2.4 smbserver
> +@end example
> +must 

"may"?

> be added in the file @file{C:\WINDOWS\LMHOSTS} (for windows 9x/Me)
> +or @file{C:\WINNT\SYSTEM32\DRIVERS\ETC\LMHOSTS} (Windows NT/2000).
> +
> +Then @file{@var{dir}} can be accessed in @file{\\smbserver\qemu}.
> +
> +Note that a SAMBA server must be installed on the host OS in
> +@file{/usr/sbin/smbd}. QEMU was tested successfully with smbd versions from
> +Red Hat 9, Fedora Core 3 and OpenSUSE 11.x.
> +
> +@item 
> hostfwd=[tcp|udp]:[@var{hostaddr}]:@var{hostport}-[@var{guestaddr}]:@var{guestport}
> +Redirect incoming TCP or UDP connections to the host port @var{hostport} to
> +the guest IP address @var{guestaddr} on guest port @var{guestport}. If
> +@var{guestaddr} is not specified, its value is x.x.x.15 (default first 
> address
> +given by the built-in DHCP server). By specifying @var{hostaddr}, the rule 
> can
> +be bound to a specific host interface. If no connection type is set, TCP is
> +used. This option can be given multiple times.
> +
> +For example, to redirect host X11 connection from screen 1 to guest
> +screen 0, use the following:
> +
> +@example
> +# on the host
> +qemu -netdev user,id=netid,hostfwd=tcp:127.0.0.1:6001-:6000 [...]
> +# this host xterm should open in the guest X11 server
> +xterm -display :1
> +@end example
> +
> +To redirect telnet connections from host port  to telnet port on
> +the guest, use the following:
> +
> +@example
> +# on the host
> +qemu -netdev user,id=netid,

Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware

2012-03-01 Thread Jamal Hadi Salim

On Wed, 2012-02-29 at 10:19 -0800, John Fastabend wrote:

> > 
> > I want to see a unified API so that user space control applications (RSTP, 
> > TRILL?)
> > can use one set of netlink calls for both software bridge and hardware 
> > offloaded
> > bridges.  Does this proposal meet that requirement?
> > 

I dont see any issues with those requirements being met.

> Jamal, so why do "They have to be different calls"? I'm not so sure anymore...
> moving to RTM_FDB_XXXENTRY saved some refactoring in the bridge module but 
> that
> is just cosmetic.

I may not want to use the s/ware bridge i.e I may want to use h/ware
bridge. I may want to use both. So there are 3 variations there. You
need at least 1.5 bits to represent them if you are going to use the
same interface. There may be features in either h/ware but not in
s/ware and vice-versa. 
A single interface with flags which say this applies to hware:sware:both
would be good, but it may be harder to achieve - thats why i suggested
they be different.

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware

2012-03-01 Thread Jamal Hadi Salim

On Wed, 2012-02-29 at 09:25 -0800, John Fastabend wrote:

> Well I think NETLINK_ROUTE is the most correct type to use in this
> case. Per netlink.h its for routing and device hooks.
> 
> #define NETLINK_ROUTE   0   /* Routing/device hook
>   */
> 
> And NETLINK_ROUTE msg_types use the RTM_* prefix. The _*NEIGH postfix
> were merely a copy from the SW BRIDGE code paths. How about,
> 
> PF_BRIDGE:RTM_FDB_NEWENTRY
> PF_BRIDGE:RTM_FDB_DELENTRY
> PF_BRIDGE:RTM_FDB_GETENTRY

OK, I guess ;->

> And a new group RTNLGRP_FDB. 

Nod.

> Also using NETLINK_ROUTE gives the correct
> rtnl locking semantics for free.

makes sense.


> Agreed. I think adding some ndo_ops for bridging offloads here would
> work. For example the DSA infrastructure and/or macvlan devices might
> need this. Along the lines of extending this RFC,
> 
> [RFC] hardware bridging support for DSA switches
> http://patchwork.ozlabs.org/patch/16578/

Certainly - thats one approach that is reasonable.
Where is Lennert? ;-> I changed his email address to one that i am 
familiar with.

cheers,
jamal


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Add -netdev to man page

2012-03-01 Thread Miroslav Rezanina

There's missing -netdev description in the man page for qemu. As this is
recommended way to create network backend, lack of documentation can discourage
its usage.

-net option is preserved but marked as obsolete way.

Signed-off-by: Miroslav Rezanina 
---
 qemu-options.hx |  261 +++
 1 files changed, 261 insertions(+), 0 deletions(-)

diff --git a/qemu-options.hx b/qemu-options.hx
index b129996..6e7835a 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1292,6 +1292,267 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
 #endif
 "socket],id=str[,option][,option][,...]\n", QEMU_ARCH_ALL)
 STEXI
+@item -netdev user|tap|vde|socket,id=@var{str}[,option][,option][,...]
+
+Create a new network backend to the guest. Network's id can be used with
+the -device option to "plug" a particular network device into the network 
+backend, e.g.
+
+@example
+-netdev user,id=mynet -device e1000,netdev=mynet
+@end example
+
+You can use following types of backend:
+
+@table @option
+
+@item -netdev user
+
+User networking is default network backend. This backend does not require root 
+priviledges, does not allow ICMP trafic and host is not directly accessible 
+from the host or the external network.
+
+Valid options are:
+
+@table @option
+
+@item restrict=y|yes|n|no
+If this options is enabled, the guest will be isolated, i.e. it will not be
+able to contact the host and no guest IP packets will be routed over the host
+to the outside. This option does not affect explicitly set forwarding rule.
+
+@item net=@var{addr}[/@var{mask}]
+Set IP network address the guest will see. Optionally specify the netmask,
+either in the form a.b.c.d or as number of valid top-most bits. Default is
+10.0.2.0/8.
+
+@item host=@var{addr}
+Specify the guest-visible address of the host. Default is the 2nd IP in the
+guest network, i.e. x.x.x.2.
+
+@item hostname=@var{name}
+Specifies the client hostname reported by the builtin DHCP server.
+
+@item dhcpstart=@var{addr}
+Specify the first of the 16 IPs the built-in DHCP server can assign. Default
+is the 16th to 31st IP in the guest network, i.e. x.x.x.16 to x.x.x.31.
+
+@item dns=@var{addr}
+Specify the guest-visible address of the virtual nameserver. The address must
+be different from the host address. Default is the 3rd IP in the guest network,
+i.e. x.x.x.3.
+
+@item tftp=@var{dir}
+When using the user mode network stack, activate a built-in TFTP
+server. The files in @var{dir} will be exposed as the root of a TFTP server.
+The TFTP client on the guest must be configured in binary mode (use the command
+@code{bin} of the Unix TFTP client).
+
+@item bootfile=@var{file}
+When using the user mode network stack, broadcast @var{file} as the BOOTP
+filename. In conjunction with @option{tftp}, this can be used to network boot
+a guest from a local directory.
+
+Example (using pxelinux):
+@example
+qemu -hda linux.img -boot n -netdev 
user,id=netid,tftp=/path/to/tftp/files,bootfile=/pxelinux.0
+@end example
+
+@item smb=@var{dir}[,smbserver=@var{addr}]
+When using the user mode network stack, activate a built-in SMB
+server so that Windows OSes can access to the host files in @file{@var{dir}}
+transparently. The IP address of the SMB server can be set to @var{addr}. By
+default the 4th IP in the guest network is used, i.e. x.x.x.4.
+
+In the guest Windows OS, the line:
+@example
+10.0.2.4 smbserver
+@end example
+must be added in the file @file{C:\WINDOWS\LMHOSTS} (for windows 9x/Me)
+or @file{C:\WINNT\SYSTEM32\DRIVERS\ETC\LMHOSTS} (Windows NT/2000).
+
+Then @file{@var{dir}} can be accessed in @file{\\smbserver\qemu}.
+
+Note that a SAMBA server must be installed on the host OS in
+@file{/usr/sbin/smbd}. QEMU was tested successfully with smbd versions from
+Red Hat 9, Fedora Core 3 and OpenSUSE 11.x.
+
+@item 
hostfwd=[tcp|udp]:[@var{hostaddr}]:@var{hostport}-[@var{guestaddr}]:@var{guestport}
+Redirect incoming TCP or UDP connections to the host port @var{hostport} to
+the guest IP address @var{guestaddr} on guest port @var{guestport}. If
+@var{guestaddr} is not specified, its value is x.x.x.15 (default first address
+given by the built-in DHCP server). By specifying @var{hostaddr}, the rule can
+be bound to a specific host interface. If no connection type is set, TCP is
+used. This option can be given multiple times.
+
+For example, to redirect host X11 connection from screen 1 to guest
+screen 0, use the following:
+
+@example
+# on the host
+qemu -netdev user,id=netid,hostfwd=tcp:127.0.0.1:6001-:6000 [...]
+# this host xterm should open in the guest X11 server
+xterm -display :1
+@end example
+
+To redirect telnet connections from host port  to telnet port on
+the guest, use the following:
+
+@example
+# on the host
+qemu -netdev user,id=netid,hostfwd=tcp:::23 [...]
+telnet localhost 
+@end example
+
+Then when you use on the host @code{telnet localhost }, you
+connect to the guest telnet server.
+
+@item guestfwd=[tcp]:@var{s

[PULL] kvm updates

2012-03-01 Thread Avi Kivity

This batch of updates is mostly Jan's rework of qemu-kvm's TPR
optimization for 32-bit Windows, making Windows XP much faster with kvm
enabled on older Intel and any AMD hardware.  Any similarities to the
original hack are purely coincidental.

Please pull from:

  git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git uq/master


Avi Kivity (1):
  pc-bios: update kvmvapic.bin

Gleb Natapov (1):
  kvm: Synchronize cpu state in kvm_arch_stop_on_emulation_error()

Jan Kiszka (10):
  kvm: Set cpu_single_env only once
  Remove useless casts from cpu iterators
  Process pending work while waiting for initial kick-off in TCG mode
  Allow to use pause_all_vcpus from VCPU context
  target-i386: Add infrastructure for reporting TPR MMIO accesses
  kvmvapic: Add option ROM
  kvmvapic: Introduce TPR access optimization for Windows guests
  kvmvapic: Simplify mp/up_set_tpr
  optionsrom: Reserve space for checksum
  kvmvapic: Use optionrom helpers

 .gitignore|1 +
 Makefile  |2 +-
 Makefile.target   |3 +-
 cpu-all.h |3 +-
 cpus.c|   26 ++-
 hw/apic.c |  126 ++-
 hw/apic.h |2 +
 hw/apic_common.c  |   69 -
 hw/apic_internal.h|   27 ++
 hw/kvm/apic.c |   32 ++
 hw/kvmvapic.c |  805
+
 hw/mc146818rtc.c  |5 +-
 kvm-all.c |5 -
 pc-bios/kvmvapic.bin  |  Bin 0 -> 9216 bytes
 pc-bios/optionrom/Makefile|2 +-
 pc-bios/optionrom/kvmvapic.S  |  335 +
 pc-bios/optionrom/optionrom.h |3 +-
 target-i386/cpu.h |   10 +
 target-i386/helper.c  |   16 +
 target-i386/kvm.c |   26 ++-
 20 files changed, 1468 insertions(+), 30 deletions(-)
 create mode 100644 hw/kvmvapic.c
 create mode 100755 pc-bios/kvmvapic.bin
 create mode 100644 pc-bios/optionrom/kvmvapic.S

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor

2012-03-01 Thread Stefano Stabellini

On Thu, 1 Mar 2012, Russell King - ARM Linux wrote:
> On Thu, Mar 01, 2012 at 10:27:02AM +, Dave Martin wrote:
> > So, where there's a compelling reason to inline these things, we can use
> > the existing techniques if we're alert to the risks.  But in cases where
> > there isn't a compelling reason, aren't we just inviting fragility
> > unnecessarily?
> 
> The practical experience from the kernel suggests that there isn't a
> problem - that's not to say that future versions of gcc won't become
> a problem, and that the compiler guys may refuse to fix it.
> 
> I think it's a feature that we should be pressing gcc guys for - it's
> fairly fundamental to any programming which requires interfaces that
> require certain args in certain registers, or receive results in
> certain registers.
> 
> The options over this are basically:
> 1. refusing to upgrade to any version of gcc which does not allow
>registers-in-asm
> 2. doing the store-to-memory reload-in-asm thing
> 3. hand-coding veneers for every call to marshall the registers
> 
> Each of those has its down sides, but I suspect with (1), it may be
> possible to have enough people applying pressure to the compiler guys
> that they finally see sense on this matter.

I tend to have a very practical approach about this sort of issues, so I
am tempted to go with 1) if you agree.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Bug 42829] KVM Guest with virtio network driver loses network connectivity

2012-03-01 Thread bugzilla-daemon

https://bugzilla.kernel.org/show_bug.cgi?id=42829


Steve  changed:

   What|Removed |Added

 Kernel Version|3.2 |3.2-rc3




--- Comment #2 from Steve   2012-03-01 10:46:09 ---
Rusty, could you have a look at, please ?

Thank you for your time.

-- 
Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail because: ---
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor

2012-03-01 Thread Russell King - ARM Linux

On Thu, Mar 01, 2012 at 10:27:02AM +, Dave Martin wrote:
> So, where there's a compelling reason to inline these things, we can use
> the existing techniques if we're alert to the risks.  But in cases where
> there isn't a compelling reason, aren't we just inviting fragility
> unnecessarily?

The practical experience from the kernel suggests that there isn't a
problem - that's not to say that future versions of gcc won't become
a problem, and that the compiler guys may refuse to fix it.

I think it's a feature that we should be pressing gcc guys for - it's
fairly fundamental to any programming which requires interfaces that
require certain args in certain registers, or receive results in
certain registers.

The options over this are basically:
1. refusing to upgrade to any version of gcc which does not allow
   registers-in-asm
2. doing the store-to-memory reload-in-asm thing
3. hand-coding veneers for every call to marshall the registers

Each of those has its down sides, but I suspect with (1), it may be
possible to have enough people applying pressure to the compiler guys
that they finally see sense on this matter.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 4/4] KVM: Remove unused dirty_bitmap_head and nr_dirty_pages

2012-03-01 Thread Takuya Yoshikawa

Now that we do neither double buffering nor heuristic selection of the
write protection method these are not needed anymore.

Note: some drivers have their own implementation of set_bit_le() and
making it generic needs a bit of work; so we use test_and_set_bit_le()
and will later replace it with generic set_bit_le().

Signed-off-by: Takuya Yoshikawa 
---
 include/linux/kvm_host.h |2 --
 virt/kvm/kvm_main.c  |   14 +-
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 355e445..73c7d76 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -177,8 +177,6 @@ struct kvm_memory_slot {
unsigned long flags;
unsigned long *rmap;
unsigned long *dirty_bitmap;
-   unsigned long *dirty_bitmap_head;
-   unsigned long nr_dirty_pages;
struct kvm_arch_memory_slot arch;
unsigned long userspace_addr;
int user_alloc;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e4431ad..27a1083 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -522,12 +522,11 @@ static void kvm_destroy_dirty_bitmap(struct 
kvm_memory_slot *memslot)
return;
 
if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
-   vfree(memslot->dirty_bitmap_head);
+   vfree(memslot->dirty_bitmap);
else
-   kfree(memslot->dirty_bitmap_head);
+   kfree(memslot->dirty_bitmap);
 
memslot->dirty_bitmap = NULL;
-   memslot->dirty_bitmap_head = NULL;
 }
 
 /*
@@ -611,8 +610,7 @@ static int kvm_vm_release(struct inode *inode, struct file 
*filp)
 
 /*
  * Allocation size is twice as large as the actual dirty bitmap size.
- * This makes it possible to do double buffering: see x86's
- * kvm_vm_ioctl_get_dirty_log().
+ * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
  */
 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
@@ -627,8 +625,6 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot 
*memslot)
if (!memslot->dirty_bitmap)
return -ENOMEM;
 
-   memslot->dirty_bitmap_head = memslot->dirty_bitmap;
-   memslot->nr_dirty_pages = 0;
 #endif /* !CONFIG_S390 */
return 0;
 }
@@ -1476,8 +1472,8 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct 
kvm_memory_slot *memslot,
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
 
-   if (!test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap))
-   memslot->nr_dirty_pages++;
+   /* TODO: introduce set_bit_le() and use it */
+   test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap);
}
 }
 
-- 
1.7.5.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 3/4] KVM: Switch to srcu-less get_dirty_log()

2012-03-01 Thread Takuya Yoshikawa

We have seen some problems of the current implementation of
get_dirty_log() which uses synchronize_srcu_expedited() for updating
dirty bitmaps; e.g. it is noticeable that this sometimes gives us ms
order of latency when we use VGA displays.

Furthermore the recent discussion on the following thread
"srcu: Implement call_srcu()"
http://lkml.org/lkml/2012/1/31/211
also motivated us to implement get_dirty_log() without SRCU.

This patch achieves this goal without sacrificing the performance of
both VGA and live migration: in practice the new code is much faster
than the old one unless we have too many dirty pages.

Implementation:

The key part of the implementation is the use of xchg() operation for
clearing dirty bits atomically.  Since this allows us to update only
BITS_PER_LONG pages at once, we need to iterate over the dirty bitmap
until every dirty bit is cleared again for the next call.

Although some people may worry about the problem of using the atomic
memory instruction many times to the concurrently accessible bitmap,
it is usually accessed with mmu_lock held and we rarely see concurrent
accesses: so what we need to care about is the pure xchg() overheads.

Another point to note is that we do not use for_each_set_bit() to check
which ones in each BITS_PER_LONG pages are actually dirty.  Instead we
simply use __ffs() in a loop.  This is much faster than repeatedly call
find_next_bit().

Performance:

The dirty-log-perf unit test showed nice improvements, some times faster
than before, when the number of dirty pages was below 8K.  For other
cases we saw a bit of regression but still enough fast compared to the
processing of these dirty pages in the userspace.

For real workloads, both VGA and live migration, we have observed pure
improvements: when the guest was reading a file, we originally saw a few
ms of latency, but with the new method the latency was less than 200us.

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/x86.c |  116 +++
 1 files changed, 43 insertions(+), 73 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3bc1922..0748bab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3041,55 +3041,32 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 }
 
 /**
- * write_protect_slot - write protect a slot for dirty logging
- * @kvm: the kvm instance
- * @memslot: the slot we protect
- * @dirty_bitmap: the bitmap indicating which pages are dirty
- * @nr_dirty_pages: the number of dirty pages
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
  *
- * We have two ways to find all sptes to protect:
- * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and
- *checks ones that have a spte mapping a page in the slot.
- * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
+ * We need to keep it in mind that VCPU threads can write to the bitmap
+ * concurrently.  So, to avoid losing data, we keep the following order for
+ * each bit:
  *
- * Generally speaking, if there are not so many dirty pages compared to the
- * number of shadow pages, we should use the latter.
+ *   1. Take a snapshot of the bit and clear it if needed.
+ *   2. Write protect the corresponding page.
+ *   3. Flush TLB's if needed.
+ *   4. Copy the snapshot to the userspace.
  *
- * Note that letting others write into a page marked dirty in the old bitmap
- * by using the remaining tlb entry is not a problem.  That page will become
- * write protected again when we flush the tlb and then be reported dirty to
- * the user space by copying the old bitmap.
+ * Between 2 and 3, the guest may write to the page using the remaining TLB
+ * entry.  This is not a problem because the page will be reported dirty at
+ * step 4 using the snapshot taken before and step 3 ensures that successive
+ * writes will be logged for the next call.
  */
-static void write_protect_slot(struct kvm *kvm,
-  struct kvm_memory_slot *memslot,
-  unsigned long *dirty_bitmap,
-  unsigned long nr_dirty_pages)
-{
-   spin_lock(&kvm->mmu_lock);
-
-   /* Not many dirty pages compared to # of shadow pages. */
-   if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
-   gfn_t offset;
-
-   for_each_set_bit(offset, dirty_bitmap, memslot->npages)
-   kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, 
1);
-
-   kvm_flush_remote_tlbs(kvm);
-   } else
-   kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-
-   spin_unlock(&kvm->mmu_lock);
-}
-
-/*
- * Get (and clear) the dirty memory log for a memory slot.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
- struct kvm_dirty_log *log)
+int kvm_vm_ioctl_get_dirty_log(stru

Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor

2012-03-01 Thread Dave Martin

On Thu, Mar 01, 2012 at 10:10:29AM +, Russell King - ARM Linux wrote:
> On Wed, Feb 29, 2012 at 12:58:26PM +, Dave Martin wrote:
> > On Wed, Feb 29, 2012 at 09:56:02AM +, Ian Campbell wrote:
> > > On Wed, 2012-02-29 at 09:34 +, Dave Martin wrote:
> > > > On Tue, Feb 28, 2012 at 12:28:29PM +, Stefano Stabellini wrote:
> > > 
> > > > > I don't have a very strong opinion on which register we should use, 
> > > > > but
> > > > > I would like to avoid r7 if it is already actively used by gcc.
> > > > 
> > > > But there is no framepointer for Thumb-2 code (?)
> > > 
> > > Peter Maydell suggested there was:
> > > > r7 is (used by gcc as) the Thumb frame pointer; I don't know if this
> > > > makes it worth avoiding in this context.
> > > 
> > > Sounds like it might be a gcc-ism, possibly a non-default option?
> > > 
> > > Anyway, I think r12 will be fine for our purposes so the point is rather
> > > moot.
> > 
> > Just had a chat with some tools guys -- apparently, when passing register
> > arguments to gcc inline asms there really isn't a guarantee that those
> > variables will be in the expected registers on entry to the inline asm.
> 
> The best you can do is:
> 
>   register unsigned int foo asm("r7") = value;
> 
>   asm("blah %0" : : "r" (foo));
> 
> and ensure that your assembly checks that %0 _is_ r7 and produces a build
> error if not.  See the __asmeq() macro in asm/system.h to find out how to
> do that.
> 
> This feature has been missing from ARM GCC for quite a long time - it's
> used extensively on x86 GCC, where they have one register class per
> register, so they can do stuff like:
> 
>   asm("blah %0" : : "a" (value));
> 
> and be guaranteed that %0 will be eax.
> 
> > If you need a specific register, this means that you must set up that
> > register explicitly inside the asm if you want a guarantee that the
> > code will work:
> > 
> > asm volatile (
> > "movw   r12, %[hvc_num]\n\t"
> > ...
> > "hvc#0"
> > :: [hvc_num] "i" (NUMBER) : "r12"
> > );
> > 
> > Of course, if you need to set up more than about 5 or 6 registers in
> > this way, the doubled register footprint means that the compiler will
> > have to start spilling stuff to the stack.
> 
> No it won't - it will barf instead - think about it.  If you're clobbering
> r0 - r5, but need to pass in six values in registers, gcc can't use r0-r5
> for that, so it must use the remaining registers.  It gets rather unhappy
> with that, and starts erroring out (iirc 'too many reloads' or some similar
> error.)  I've been there.

You're right about that -- I didn't pursue my line of thought to the end,
there.  I have see the behaviour you describe.

> If you want to do it that way, your only option is to store them to memory
> and pass the address of the block into the assembly, and reload them there.
> Which is extremely sucky and inefficient.
> 
> Practically, the register variable plus asm() does seem to work, and seems
> to work for virtually all gcc versions out there (there have been the odd
> buggy version, but those bugs appear to get fixed.)

That is inconvenient for us, but it's a not a bug.  The ability for asm
contraints to be able to gcc to put things in specific registers (as with
the gcc "abcd" constraints for i386) would be nice, but as you point out,
this capability is simply not supported by gcc right now for ARM -- the
compiler guys seem to be pretty opposed to it, so I can't say I anticiapte
this being supported in the near future.

So, where there's a compelling reason to inline these things, we can use
the existing techniques if we're alert to the risks.  But in cases where
there isn't a compelling reason, aren't we just inviting fragility
unnecessarily?

Cheers
---Dave
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/4] KVM: Avoid checking huge page mappings in get_dirty_log()

2012-03-01 Thread Takuya Yoshikawa

Dropped such mappings when we enabled dirty logging and we will never
create new ones until we stop the logging.

For this we introduce a new function which can be used to write protect
a range of PT level pages: although we do not need to care about a range
of pages at this point, the following patch will need this feature to
optimize the write protection of many pages.

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/include/asm/kvm_host.h |5 ++-
 arch/x86/kvm/mmu.c  |   40 +-
 arch/x86/kvm/x86.c  |8 ++
 3 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 74c9edf..935cbcc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -712,8 +712,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
-  struct kvm_memory_slot *slot);
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+struct kvm_memory_slot *slot,
+gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 67857bd..be8a529 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1037,27 +1037,47 @@ static int __rmap_write_protect(struct kvm *kvm, 
unsigned long *rmapp, int level
return write_protected;
 }
 
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
-  struct kvm_memory_slot *slot)
+/**
+ * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
+ * @kvm: kvm instance
+ * @slot: slot to protect
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should protect
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+struct kvm_memory_slot *slot,
+gfn_t gfn_offset, unsigned long mask)
 {
unsigned long *rmapp;
-   int i, write_protected = 0;
 
-   for (i = PT_PAGE_TABLE_LEVEL;
-i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-   rmapp = __gfn_to_rmap(gfn, i, slot);
-   write_protected |= __rmap_write_protect(kvm, rmapp, i);
-   }
+   while (mask) {
+   rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
+   __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
 
-   return write_protected;
+   /* clear the first set bit */
+   mask &= mask - 1;
+   }
 }
 
 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
struct kvm_memory_slot *slot;
+   unsigned long *rmapp;
+   int i;
+   int write_protected = 0;
 
slot = gfn_to_memslot(kvm, gfn);
-   return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
+
+   for (i = PT_PAGE_TABLE_LEVEL;
+i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+   rmapp = __gfn_to_rmap(gfn, i, slot);
+   write_protected |= __rmap_write_protect(kvm, rmapp, i);
+   }
+
+   return write_protected;
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c9d99e5..3bc1922 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3069,13 +3069,11 @@ static void write_protect_slot(struct kvm *kvm,
 
/* Not many dirty pages compared to # of shadow pages. */
if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
-   unsigned long gfn_offset;
+   gfn_t offset;
 
-   for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
-   unsigned long gfn = memslot->base_gfn + gfn_offset;
+   for_each_set_bit(offset, dirty_bitmap, memslot->npages)
+   kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, 
1);
 
-   kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
-   }
kvm_flush_remote_tlbs(kvm);
} else
kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-- 
1.7.5.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/4] KVM: MMU: Split the main body of rmap_write_protect() off from others

2012-03-01 Thread Takuya Yoshikawa

We will use this in the following patch to implement another function
which needs to write protect pages using the rmap information.

Note that there is a small change in debug printing for large pages:
we do not differentiate them from others to avoid duplicating code.

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/mmu.c |   53 ++-
 1 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff053ca..67857bd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1010,42 +1010,43 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
 }
 
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
-  struct kvm_memory_slot *slot)
+static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int 
level)
 {
-   unsigned long *rmapp;
-   u64 *spte;
-   int i, write_protected = 0;
+   u64 *spte = NULL;
+   int write_protected = 0;
 
-   rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
-   spte = rmap_next(rmapp, NULL);
-   while (spte) {
+   while ((spte = rmap_next(rmapp, spte))) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
-   if (is_writable_pte(*spte)) {
+
+   if (!is_writable_pte(*spte))
+   continue;
+
+   if (level == PT_PAGE_TABLE_LEVEL) {
mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
-   write_protected = 1;
+   } else {
+   BUG_ON(!is_large_pte(*spte));
+   drop_spte(kvm, spte);
+   --kvm->stat.lpages;
+   spte = NULL;
}
-   spte = rmap_next(rmapp, spte);
+
+   write_protected = 1;
}
 
-   /* check for huge page mappings */
-   for (i = PT_DIRECTORY_LEVEL;
+   return write_protected;
+}
+
+int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
+  struct kvm_memory_slot *slot)
+{
+   unsigned long *rmapp;
+   int i, write_protected = 0;
+
+   for (i = PT_PAGE_TABLE_LEVEL;
 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
rmapp = __gfn_to_rmap(gfn, i, slot);
-   spte = rmap_next(rmapp, NULL);
-   while (spte) {
-   BUG_ON(!(*spte & PT_PRESENT_MASK));
-   BUG_ON(!is_large_pte(*spte));
-   pgprintk("rmap_write_protect(large): spte %p %llx 
%lld\n", spte, *spte, gfn);
-   if (is_writable_pte(*spte)) {
-   drop_spte(kvm, spte);
-   --kvm->stat.lpages;
-   spte = NULL;
-   write_protected = 1;
-   }
-   spte = rmap_next(rmapp, spte);
-   }
+   write_protected |= __rmap_write_protect(kvm, rmapp, i);
}
 
return write_protected;
-- 
1.7.5.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/4] KVM: srcu-less dirty logging -v2

2012-03-01 Thread Takuya Yoshikawa

v2: changed to protect masked pages

Live migration gets a bit faster than v1.

Takuya


=== from v1

This patch series is the result of the integration of my dirty logging
optimization work, including preparation for the new GET_DIRTY_LOG API,
and the attempt to get rid of controversial synchronize_srcu_expedited().

1 - KVM: MMU: Split the main body of rmap_write_protect() off from others
2 - KVM: Avoid checking huge page mappings in get_dirty_log()
3 - KVM: Switch to srcu-less get_dirty_log()
4 - KVM: Remove unused dirty_bitmap_head and nr_dirty_pages

Although there are still some remaining tasks, the test result obtained
looks very promising.


Remaining tasks:

- Implement set_bit_le() for mark_page_dirty()

  Some drivers are using their own implementation of it and a bit of
  work is needed to make it generic.  I want to do this separately
  later because it cannot be done within kvm tree.

- Stop allocating extra dirty bitmap buffer area

  According to Peter, mmu_notifier has become preemptible.  If we can
  change mmu_lock from spin_lock to mutex_lock, as Avi said before, this
  would be staightforward because we can use __put_user() right after
  xchg() with the mmu_lock held.


Test results:

1. dirty-log-perf unit test (on Sandy Bridge core-i3 32-bit host)

With some changes added since the previous post, the performance was
much improved: now even when every page in the slot is dirty, the number
is reasonably close to the original one.  For others, needless to say,
we have achieved very nice improvement.

- kvm.git next
average(ns)stdev ns/pagepages

 147018.677604.9147018.61
 158080.282211.9 79040.12
 127555.680619.8 31888.94
 108865.678499.3 13608.28
 114707.843508.6  7169.2   16
  76679.037659.8  2396.2   32
  59159.820417.1   924.3   64
  60418.219405.7   472.0  128
  76267.021450.5   297.9  256
 113182.022684.9   221.0  512
 930344.2   153766.5   908.5   1K
 939098.2   163800.3   458.5   2K
 996813.477921.0   243.3   4K
1113232.6   107782.6   135.8   8K
1241206.482282.575.7  16K
1529526.4   116388.246.6  32K
2147538.4   227375.932.7  64K
3309619.479356.825.2 128K
6016951.8   549873.422.9 256K

- kvm.git next + srcu-less series
average(ns)stdev ns/pagepagesimprovement(%)

  14086.0 3532.3 14086.01 944
  13303.6 3317.7  6651.821088
  13455.6 3315.2  3363.94 848
  14125.8 3435.4  1765.78 671
  15322.4 3690.1   957.6   16 649
  17026.6 4037.2   532.0   32 350
  21258.6 4852.3   332.1   64 178
  33845.614115.8   264.4  128  79
  37893.0  681.8   148.0  256 101
  61707.4 1057.6   120.5  512  83
  88861.4 2131.086.7   1K 947
 151315.6 6490.573.8   2K 521
 290579.6 8523.070.9   4K 243
 518231.020412.663.2   8K 115
2271171.412064.9   138.6  16K -45
3375866.214743.3   103.0  32K -55
4408395.610720.067.2  64K -51
5915336.226538.145.1 128K -44
8497356.416441.032.4 256K -29

Note that when the number of dirty pages was large, we spent less than
100ns for getting one dirty page information: see ns/page column.

As Avi noted before, this is much faster than the userspace send one
page to the destination node.

Furthermore, with the already proposed new GET_DIRTY_LOG API, we will
be able to restrict the area from which we get the log and will not need
to care about ms order of latency observed for very large number of dirty
pages.

2. real workloads (on Xeon W3520 64-bit host)

I traced kvm_vm_ioctl_get_dirty_log() during heavy VGA updates and
during live migration.

2.1. VGA: guest was doing "x11perf -rect1 -rect10 -rect100 -rect500"

As can be guessed from the result of dirty-log-perf, we observed very
nice improvement.

- kvm.git next
For heavy updates: 100us to 300us.
Worst: 300us

- kvm.git next + srcu-less series
For heavy updates: 3us to 10us.
Worst: 50us.

2.2. live migration: guest was doing "dd if=/path/to/a/file of=/dev/null"

The improvement was significant again.

- kvm.git next
For heavy updates: 1ms to 3ms

- kvm.git next + srcu-less series
For heavy updates: 50us to 300us

Probably we gained a lot from the locality of WWS.


Takuya

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4] KVM: Resize kvm_io_range array dynamically

2012-03-01 Thread Sasha Levin

On Thu, Mar 1, 2012 at 9:01 AM, Amos Kong  wrote:
> This patch makes the kvm_io_range array can be resized dynamically.
> Set an upper bounds for kvm_io_range to limit userspace.
> 1000 is a very large limit and not bloat the typical user.

Whats the reason for making everything dynamic? Memory savings there
aren't that significant.

If you want to make it more efficient just define:
static struct kvm_io_bus io_bus[2];

somewhere in kvm_main.c, and just switch between them when you need to
do insertion and removal of devices. You get the benefit of zero slub
usage, no allocations in any of the paths, and much simpler logic.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor

2012-03-01 Thread Russell King - ARM Linux

On Wed, Feb 29, 2012 at 02:44:24PM +, Ian Campbell wrote:
> > If you need a specific register, this means that you must set up that
> > register explicitly inside the asm if you want a guarantee that the
> > code will work:
> > 
> > asm volatile (
> > "movw   r12, %[hvc_num]\n\t"
> 
> Is gcc (or gas?) smart enough to optimise this away if it turns out that
> %[hvc_num] == r12?

No, and it won't do, because %[hvc_num] is specified in these operands:

> > ...
> > "hvc#0"
> > :: [hvc_num] "i" (NUMBER) : "r12"

to be an integer, not a register.

> How are system calls implemented on the userspace side? I confess I
> don't know what the ARM syscall ABI looks like -- is it all registers or
> is some of it on the stack? It sounds like the solution ought to be
> pretty similar though.

All registers.  We have a few which take a pointer to an in memory array,
but those are for some old multiplexed syscalls.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor

2012-03-01 Thread Russell King - ARM Linux

On Wed, Feb 29, 2012 at 12:58:26PM +, Dave Martin wrote:
> On Wed, Feb 29, 2012 at 09:56:02AM +, Ian Campbell wrote:
> > On Wed, 2012-02-29 at 09:34 +, Dave Martin wrote:
> > > On Tue, Feb 28, 2012 at 12:28:29PM +, Stefano Stabellini wrote:
> > 
> > > > I don't have a very strong opinion on which register we should use, but
> > > > I would like to avoid r7 if it is already actively used by gcc.
> > > 
> > > But there is no framepointer for Thumb-2 code (?)
> > 
> > Peter Maydell suggested there was:
> > > r7 is (used by gcc as) the Thumb frame pointer; I don't know if this
> > > makes it worth avoiding in this context.
> > 
> > Sounds like it might be a gcc-ism, possibly a non-default option?
> > 
> > Anyway, I think r12 will be fine for our purposes so the point is rather
> > moot.
> 
> Just had a chat with some tools guys -- apparently, when passing register
> arguments to gcc inline asms there really isn't a guarantee that those
> variables will be in the expected registers on entry to the inline asm.

The best you can do is:

register unsigned int foo asm("r7") = value;

asm("blah %0" : : "r" (foo));

and ensure that your assembly checks that %0 _is_ r7 and produces a build
error if not.  See the __asmeq() macro in asm/system.h to find out how to
do that.

This feature has been missing from ARM GCC for quite a long time - it's
used extensively on x86 GCC, where they have one register class per
register, so they can do stuff like:

asm("blah %0" : : "a" (value));

and be guaranteed that %0 will be eax.

> If you need a specific register, this means that you must set up that
> register explicitly inside the asm if you want a guarantee that the
> code will work:
> 
>   asm volatile (
>   "movw   r12, %[hvc_num]\n\t"
>   ...
>   "hvc#0"
>   :: [hvc_num] "i" (NUMBER) : "r12"
>   );
> 
> Of course, if you need to set up more than about 5 or 6 registers in
> this way, the doubled register footprint means that the compiler will
> have to start spilling stuff to the stack.

No it won't - it will barf instead - think about it.  If you're clobbering
r0 - r5, but need to pass in six values in registers, gcc can't use r0-r5
for that, so it must use the remaining registers.  It gets rather unhappy
with that, and starts erroring out (iirc 'too many reloads' or some similar
error.)  I've been there.

If you want to do it that way, your only option is to store them to memory
and pass the address of the block into the assembly, and reload them there.
Which is extremely sucky and inefficient.

Practically, the register variable plus asm() does seem to work, and seems
to work for virtually all gcc versions out there (there have been the odd
buggy version, but those bugs appear to get fixed.)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: x86: kvmclock: abstract save/restore sched_clock_state

2012-03-01 Thread Thomas Gleixner

On Tue, 7 Feb 2012, Marcelo Tosatti wrote:
> 
> Upon resume from hibernation, CPU 0's hvclock area contains the old
> values for system_time and tsc_timestamp. It is necessary for the
> hypervisor to update these values with uptodate ones before the CPU uses
> them.
> 
> Abstract TSC's save/restore sched_clock_state functions and use
> restore_state to write to KVM_SYSTEM_TIME MSR, forcing an update.
> 
> Fixes suspend-to-disk with kvmclock.
> 
> Signed-off-by: Marcelo Tosatti 
> 
> diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
> index 15d9915..c91e8b9 100644
> --- a/arch/x86/include/asm/tsc.h
> +++ b/arch/x86/include/asm/tsc.h
> @@ -61,7 +61,7 @@ extern void check_tsc_sync_source(int cpu);
>  extern void check_tsc_sync_target(void);
>  
>  extern int notsc_setup(char *);
> -extern void save_sched_clock_state(void);
> -extern void restore_sched_clock_state(void);
> +extern void tsc_save_sched_clock_state(void);
> +extern void tsc_restore_sched_clock_state(void);
>  
>  #endif /* _ASM_X86_TSC_H */
> diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
> index 5d0afac..baaca8d 100644
> --- a/arch/x86/include/asm/x86_init.h
> +++ b/arch/x86/include/asm/x86_init.h
> @@ -162,6 +162,8 @@ struct x86_cpuinit_ops {
>   * @is_untracked_pat_range   exclude from PAT logic
>   * @nmi_init enable NMI on cpus
>   * @i8042_detect pre-detect if i8042 controller exists
> + * @save_sched_clock_state:  save state for sched_clock() on suspend
> + * @restore_sched_clock_state:   restore state for sched_clock() on 
> resume
>   */
>  struct x86_platform_ops {
>   unsigned long (*calibrate_tsc)(void);
> @@ -173,6 +175,8 @@ struct x86_platform_ops {
>   void (*nmi_init)(void);
>   unsigned char (*get_nmi_reason)(void);
>   int (*i8042_detect)(void);
> + void (*save_sched_clock_state)(void);
> + void (*restore_sched_clock_state)(void);
>  };
>  
>  struct pci_dev;
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index ca4e735..57e6b78 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -136,6 +136,15 @@ int kvm_register_clock(char *txt)
>   return ret;
>  }
>  
> +void kvm_save_sched_clock_state(void)

static ?

> +{
> +}
> +
> +void kvm_restore_sched_clock_state(void)

Ditto

> +{
> + kvm_register_clock("primary cpu clock, resume");
> +}
> +

Otherwise: Reviewed-by: Thomas Gleixner 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor

2012-03-01 Thread Dave Martin

On Wed, Feb 29, 2012 at 02:52:38PM +, Stefano Stabellini wrote:
> On Wed, 29 Feb 2012, Dave Martin wrote:
> > On Wed, Feb 29, 2012 at 09:56:02AM +, Ian Campbell wrote:
> > > On Wed, 2012-02-29 at 09:34 +, Dave Martin wrote:
> > > > On Tue, Feb 28, 2012 at 12:28:29PM +, Stefano Stabellini wrote:
> > > 
> > > > > I don't have a very strong opinion on which register we should use, 
> > > > > but
> > > > > I would like to avoid r7 if it is already actively used by gcc.
> > > > 
> > > > But there is no framepointer for Thumb-2 code (?)
> > > 
> > > Peter Maydell suggested there was:
> > > > r7 is (used by gcc as) the Thumb frame pointer; I don't know if this
> > > > makes it worth avoiding in this context.
> > > 
> > > Sounds like it might be a gcc-ism, possibly a non-default option?
> > > 
> > > Anyway, I think r12 will be fine for our purposes so the point is rather
> > > moot.
> > 
> > Just had a chat with some tools guys -- apparently, when passing register
> > arguments to gcc inline asms there really isn't a guarantee that those
> > variables will be in the expected registers on entry to the inline asm.
> > 
> > If gcc reorders other function calls or other code around the inline asm
> > (which it can do, except under certain controlled situations), then
> > intervening code can clobber any registers in general.
> > 
> > Or, to summarise another way, there is no way to control which register
> > is used to pass something to an inline asm in general (often we get away
> > with this, and there are a lot of inline asms in the kernel that assume
> > it works, but the more you inline the more likely you are to get nasty
> > surprises).  There is no workaroud, except on some architectures where
> > special asm constraints allow specific individual registers to be
> > specified for operands (i386 for example).
> > 
> > If you need a specific register, this means that you must set up that
> > register explicitly inside the asm if you want a guarantee that the
> > code will work:
> > 
> > asm volatile (
> > "movw   r12, %[hvc_num]\n\t"
> > ...
> > "hvc#0"
> > :: [hvc_num] "i" (NUMBER) : "r12"
> > );
> > 
> 
> OK, we can arrange the hypercall code to be like that.
> Also with your patch series it would be "_hvc" because of the .macro,
> right?

Yes, but I would avoid making too many assumptions about the final form
of that patch -- it looks like there's significant work to do there,
since I made some unsafe assumptions about how the tools work...

We might end up with a magic #define after all.

> > This is the kind of problem which goes away when out-of-lining the
> > hvc wrapper behind a C function interface, since the ABI then provides
> > guarantees about how values are mershaled into and out of that code.
> 
> Do you mean implementing the entire HYPERVISOR_example_op in assembly
> and calling it from C?
> Because I guess that gcc would still be free to mess with the registers
> between the C function entry point and any inline assembly code.

gcc can arrange for the relevant things to be already in r0-r3 and the
relevant stack slots before branching to a function just as for inline
asm.  The only differences are that the compiler cannot choose which
registers to use, and the branch cannot be optimised away by the compiler
(the CPU may be able to optimise the branch away at runtime of course,
but that's another story...)

What libc appears to do is wrap each syscall in a separate function.
This means that it's not necessary to shuffle all the arguments by
one position when invoking the actual syscall.  (The generic "syscall"
function does of course need to shuffle the arguments so as to
displace the syscall number from the first argument to r7 --
but that's hard to avoid without inlining.)

For example:

00090b50 :
   90b50:   e52d7004push{r7}; (str r7, [sp, #-4]!)
   90b54:   e59f7010ldr r7, [pc, #16]   ; 90b6c 
   90b58:   ef00svc 0x
   90b5c:   e49d7004pop {r7}; (ldr r7, [sp], #4)
   90b60:   e3700a01cmn r0, #4096   ; 0x1000
...

Syscalls with more than 4 args still need to load the extra ones
from the stack, of course:

00090090 :
   90090:   e92d0090push{r4, r7}
   90094:   e59d4008ldr r4, [sp, #8]
   90098:   e59f7010ldr r7, [pc, #16]   ; 900b0 

   9009c:   ef00svc 0x
...


I don't know whether that makes sense for a hypervisor... it partly
depends on how many different hypercalls there are.

By all means implement it both ways and measure the performance
difference, if possible.

Cheers
---Dave
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH V4] Quirk for IVB graphics FLR errata

2012-03-01 Thread Hao, Xudong

For IvyBridge Mobile platform, a system hang may occur if a FLR(Function Level
 Reset) is asserted to internal graphics.

This quirk patch is workaround for the IVB FLR errata issue.
We are disabling the FLR reset handshake between the PCH and CPU display, then 
manually powering down the panel power sequencing and resetting the PCH display.

Changes from v3:
- add X86 configuration to avoid compile problem in other architecture.

Signed-off-by: Xudong Hao 
Signed-off-by: Kay, Allen M 
Reviewed-by: Xiantao Zhang 
---
 drivers/pci/quirks.c |   59 ++
 1 files changed, 59 insertions(+), 0 deletions(-)

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6476547..63e8b70 
100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3069,11 +3069,70 @@ static int reset_intel_82599_sfp_virtfn(struct pci_dev 
*dev, int probe)
return 0;
 }
 
+#ifdef CONFIG_X86
+
+#include "../gpu/drm/i915/i915_reg.h"
+#define MSG_CTL0x45010
+
+static const int op_timeout = 10;  /* set timeout 10 seconds */
+static int reset_ivb_igd(struct pci_dev *dev, int probe) {
+   u8 *mmio_base;
+   u32 val;
+   cycles_t cyc_op_timeout = tsc_khz*op_timeout*1000;
+
+   if (probe)
+   return 0;
+
+   mmio_base = ioremap_nocache(pci_resource_start(dev, 0),
+pci_resource_len(dev, 0));
+   if (!mmio_base)
+   return -ENOMEM;
+
+   /* Work Around */
+   *((u32 *)(mmio_base + MSG_CTL)) = 0x0002;
+   /* Clobbering SOUTH_CHICKEN2 register is fine only if the next
+* driver loaded sets the right bits. However, this's a reset and
+* the bits have been set by i915 previously, so we clobber
+* SOUTH_CHICKEN2 register directly here.
+*/
+   *((u32 *)(mmio_base + SOUTH_CHICKEN2)) = 0x0005;
+   val = *((u32 *)(mmio_base + PCH_PP_CONTROL)) & 0xfffe;
+   *((u32 *)(mmio_base + PCH_PP_CONTROL)) = val;
+   do {
+   cycles_t start_time = get_cycles();
+   while (1) {
+   val = *((u32 *)(mmio_base + PCH_PP_STATUS));
+   if (((val & 0x8000) == 0)
+   && ((val & 0x3000) == 0))
+   break;
+   if (cyc_op_timeout < (get_cycles() - start_time))
+   break;
+   cpu_relax();
+   }
+   } while (0);
+   *((u32 *)(mmio_base + 0xd0100)) = 0x0002;
+
+   iounmap(pci_resource_start(dev, 0));
+   return 0;
+}
+#else
+static int reset_ivb_igd(struct pci_dev *dev, int probe) { }
+#endif /* CONFIG_X86 */
+
 #define PCI_DEVICE_ID_INTEL_82599_SFP_VF   0x10ed
+#define PCI_DEVICE_ID_INTEL_IVB_M_VGA  0x0156
+#define PCI_DEVICE_ID_INTEL_IVB_M2_VGA 0x0166
 
 static const struct pci_dev_reset_methods pci_dev_reset_methods[] = {
{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82599_SFP_VF,
 reset_intel_82599_sfp_virtfn },
+   { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_M_VGA,
+   reset_ivb_igd },
+   { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_M2_VGA,
+   reset_ivb_igd },
{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
reset_intel_generic_dev },
{ 0 }
--
1.6.0.rc1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH-WIP 01/13] xen/arm: use r12 to pass the hypercall number to the hypervisor

2012-03-01 Thread Dave Martin

On Wed, Feb 29, 2012 at 02:44:24PM +, Ian Campbell wrote:
> On Wed, 2012-02-29 at 12:58 +, Dave Martin wrote:
> > On Wed, Feb 29, 2012 at 09:56:02AM +, Ian Campbell wrote:
> > > On Wed, 2012-02-29 at 09:34 +, Dave Martin wrote:
> > > > On Tue, Feb 28, 2012 at 12:28:29PM +, Stefano Stabellini wrote:
> > > 
> > > > > I don't have a very strong opinion on which register we should use, 
> > > > > but
> > > > > I would like to avoid r7 if it is already actively used by gcc.
> > > > 
> > > > But there is no framepointer for Thumb-2 code (?)
> > > 
> > > Peter Maydell suggested there was:
> > > > r7 is (used by gcc as) the Thumb frame pointer; I don't know if this
> > > > makes it worth avoiding in this context.
> > > 
> > > Sounds like it might be a gcc-ism, possibly a non-default option?
> > > 
> > > Anyway, I think r12 will be fine for our purposes so the point is rather
> > > moot.
> > 
> > Just had a chat with some tools guys -- apparently, when passing register
> > arguments to gcc inline asms there really isn't a guarantee that those
> > variables will be in the expected registers on entry to the inline asm.
> > 
> > If gcc reorders other function calls or other code around the inline asm
> > (which it can do, except under certain controlled situations), then
> > intervening code can clobber any registers in general.
> > 
> > Or, to summarise another way, there is no way to control which register
> > is used to pass something to an inline asm in general (often we get away
> > with this, and there are a lot of inline asms in the kernel that assume
> > it works, but the more you inline the more likely you are to get nasty
> > surprises).  There is no workaroud, except on some architectures where
> > special asm constraints allow specific individual registers to be
> > specified for operands (i386 for example).
> 
> I had assumed I just couldn't find the right syntax. Useful to know that
> I couldn't find it because it doesn't exist!
> 
> > If you need a specific register, this means that you must set up that
> > register explicitly inside the asm if you want a guarantee that the
> > code will work:
> > 
> > asm volatile (
> > "movw   r12, %[hvc_num]\n\t"
> 
> Is gcc (or gas?) smart enough to optimise this away if it turns out that
> %[hvc_num] == r12?

No, unfortunately.  Except for the information defined by the constraints,
the inline asm block is completely opaque to the compiler (except for
pasting in operands -- which is a string operation done with no knowledge
of what the text means for the assembler).

> 
> > ...
> > "hvc#0"
> > :: [hvc_num] "i" (NUMBER) : "r12"
> > );
> > 
> > Of course, if you need to set up more than about 5 or 6 registers in
> > this way, the doubled register footprint means that the compiler will
> > have to start spilling stuff to the stack.
> > 
> > 
> > This is the kind of problem which goes away when out-of-lining the
> > hvc wrapper behind a C function interface, since the ABI then provides
> > guarantees about how values are mershaled into and out of that code.
> 
> I don't think anything would stop gcc from clobbering an argument
> register right on function entry (e..g it might move r0 to r8 and
> clobber r0, for whatever reason), so that they are no longer where you
> expect them to be when you hit the asm. Unlikely perhaps but no more so
> than the other issues you've raised?
>   
> Or did you mean out-of-line as in "written in a .S file" as well as out
> of line?

Yes.  Some toolchains have a concept of out-of-line assembler functions
in a .c file, but gcc doesn't -- the asm is always inline in its
immediate context, even if the containing function won't be inlined.

However, the compiler would have to be applying pretty creative
optimizations to break cases cases where an inlinable function contains,
say, nothing except for declarations, the asm() and a return statement.

I feel that the kernel implicitly relies on such things working in too
many places for breakage of that assumption to go unnoticed.

> 
> > Notwithstanding the above, even if we do make theoretically unsound
> > (but often true) assumptions about inline asms, ARM will be no worse
> > than other arches in this respect.
> 
> This is true.
> 
> > Other than serving as a reminder that inline asm is a deep can of
> > worms, this doesn't really give us a neat solution...
> 
> How are system calls implemented on the userspace side? I confess I
> don't know what the ARM syscall ABI looks like -- is it all registers or
> is some of it on the stack? It sounds like the solution ought to be
> pretty similar though.

I _believe_ it's now out of line in most cases.

I'm not sure I totally understand it all, though:

http://www.eglibc.org/cgi-bin/viewvc.cgi/trunk/ports/sysdeps/unix/sysv/linux/arm/eabi/

There is an internal inline syscall wrapper INTERNAL_SYSCALL_RAW(), but
I can't see where it is used.  For Thumb code it actually

57 matches

Mail list logo