From 9a6e2dcfdda31275296c2a55ae10ec9ee5265459 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 10 May 2017 16:32:55 +0800 Subject: mc146818rtc: update periodic timer only if it is needed Currently, the timer is updated whenever RegA or RegB is written even if the periodic timer related configuration is not changed This patch optimizes it slightly to make the update happen only if its period or enable-status is changed, also later patches are depend on this optimization Signed-off-by: Xiao Guangrong Message-Id: <20170510083259.3900-2-xiaoguangrong@tencent.com> Signed-off-by: Paolo Bonzini --- hw/timer/mc146818rtc.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index 93de3e1cc5..7d78391b62 100644 --- a/hw/timer/mc146818rtc.c +++ b/hw/timer/mc146818rtc.c @@ -391,6 +391,7 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { RTCState *s = opaque; + bool update_periodic_timer; if ((addr & 1) == 0) { s->cmos_index = data & 0x7f; @@ -423,6 +424,8 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, } break; case RTC_REG_A: + update_periodic_timer = (s->cmos_data[RTC_REG_A] ^ data) & 0x0f; + if ((data & 0x60) == 0x60) { if (rtc_running(s)) { rtc_update_time(s); @@ -445,10 +448,17 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, /* UIP bit is read only */ s->cmos_data[RTC_REG_A] = (data & ~REG_A_UIP) | (s->cmos_data[RTC_REG_A] & REG_A_UIP); - periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + + if (update_periodic_timer) { + periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + } + check_update_timer(s); break; case RTC_REG_B: + update_periodic_timer = (s->cmos_data[RTC_REG_B] ^ data) + & REG_B_PIE; + if (data & REG_B_SET) { /* update cmos to when the rtc was stopping */ if (rtc_running(s)) { @@ -475,7 +485,11 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, qemu_irq_lower(s->irq); } s->cmos_data[RTC_REG_B] = data; - periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + + if (update_periodic_timer) { + periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + } + check_update_timer(s); break; case RTC_REG_C: -- cgit v1.2.3-55-g7522 From 369b41359af46bded5799c9ef8be2b641d92e043 Mon Sep 17 00:00:00 2001 From: Tai Yunfang Date: Wed, 10 May 2017 16:32:56 +0800 Subject: mc146818rtc: precisely count the clock for periodic timer There are two issues in current code: 1) If the period is changed by re-configuring RegA, the coalesced irq will be scaled to reflect the new period, however, it calculates the new interrupt number like this: s->irq_coalesced = (s->irq_coalesced * s->period) / period; There are some clocks will be lost if they are not enough to be squeezed to a single new period that will cause the VM clock slower In order to fix the issue, we calculate the interrupt window based on the precise clock rather than period, then the clocks lost during period is scaled can be compensated properly 2) If periodic_timer_update() is called due to RegA reconfiguration, i.e, the period is updated, current time is not the start point for the next periodic timer, instead, which should start from the last interrupt, otherwise, the clock in VM will become slow This patch takes the clocks from last interrupt to current clock into account and compensates the clocks for the next interrupt, especially if a complete interrupt was lost in this window, the time can be caught up by LOST_TICK_POLICY_SLEW Signed-off-by: Tai Yunfang Signed-off-by: Xiao Guangrong Message-Id: <20170510083259.3900-3-xiaoguangrong@tencent.com> Signed-off-by: Paolo Bonzini --- hw/timer/mc146818rtc.c | 120 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 97 insertions(+), 23 deletions(-) diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index 7d78391b62..aeb60cc3e3 100644 --- a/hw/timer/mc146818rtc.c +++ b/hw/timer/mc146818rtc.c @@ -146,31 +146,100 @@ static void rtc_coalesced_timer(void *opaque) } #endif -/* handle periodic timer */ -static void periodic_timer_update(RTCState *s, int64_t current_time) +static uint32_t rtc_periodic_clock_ticks(RTCState *s) { - int period_code, period; - int64_t cur_clock, next_irq_clock; + int period_code; + + if (!(s->cmos_data[RTC_REG_B] & REG_B_PIE)) { + return 0; + } period_code = s->cmos_data[RTC_REG_A] & 0x0f; - if (period_code != 0 - && (s->cmos_data[RTC_REG_B] & REG_B_PIE)) { - if (period_code <= 2) - period_code += 7; - /* period in 32 Khz cycles */ - period = 1 << (period_code - 1); -#ifdef TARGET_I386 - if (period != s->period) { - s->irq_coalesced = (s->irq_coalesced * s->period) / period; - DPRINTF_C("cmos: coalesced irqs scaled to %d\n", s->irq_coalesced); - } - s->period = period; -#endif + if (!period_code) { + return 0; + } + + if (period_code <= 2) { + period_code += 7; + } + + /* period in 32 Khz cycles */ + return 1 << (period_code - 1); +} + +/* + * handle periodic timer. @old_period indicates the periodic timer update + * is just due to period adjustment. + */ +static void +periodic_timer_update(RTCState *s, int64_t current_time, uint32_t old_period) +{ + uint32_t period; + int64_t cur_clock, next_irq_clock, lost_clock = 0; + + period = rtc_periodic_clock_ticks(s); + + if (period) { /* compute 32 khz clock */ cur_clock = muldiv64(current_time, RTC_CLOCK_RATE, NANOSECONDS_PER_SECOND); - next_irq_clock = (cur_clock & ~(period - 1)) + period; + /* + * if the periodic timer's update is due to period re-configuration, + * we should count the clock since last interrupt. + */ + if (old_period) { + int64_t last_periodic_clock, next_periodic_clock; + + next_periodic_clock = muldiv64(s->next_periodic_time, + RTC_CLOCK_RATE, NANOSECONDS_PER_SECOND); + last_periodic_clock = next_periodic_clock - old_period; + lost_clock = cur_clock - last_periodic_clock; + assert(lost_clock >= 0); + } + +#ifdef TARGET_I386 + /* + * s->irq_coalesced can change for two reasons: + * + * a) if one or more periodic timer interrupts have been lost, + * lost_clock will be more that a period. + * + * b) when the period may be reconfigured, we expect the OS to + * treat delayed tick as the new period. So, when switching + * from a shorter to a longer period, scale down the missing, + * because the OS will treat past delayed ticks as longer + * (leftovers are put back into lost_clock). When switching + * to a shorter period, scale up the missing ticks since the + * OS handler will treat past delayed ticks as shorter. + */ + if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) { + uint32_t old_irq_coalesced = s->irq_coalesced; + + s->period = period; + lost_clock += old_irq_coalesced * old_period; + s->irq_coalesced = lost_clock / s->period; + lost_clock %= s->period; + if (old_irq_coalesced != s->irq_coalesced || + old_period != s->period) { + DPRINTF_C("cmos: coalesced irqs scaled from %d to %d, " + "period scaled from %d to %d\n", old_irq_coalesced, + s->irq_coalesced, old_period, s->period); + rtc_coalesced_timer_update(s); + } + } else +#endif + { + /* + * no way to compensate the interrupt if LOST_TICK_POLICY_SLEW + * is not used, we should make the time progress anyway. + */ + lost_clock = MIN(lost_clock, period); + } + + assert(lost_clock >= 0 && lost_clock <= period); + + next_irq_clock = cur_clock + period - lost_clock; s->next_periodic_time = muldiv64(next_irq_clock, NANOSECONDS_PER_SECOND, RTC_CLOCK_RATE) + 1; timer_mod(s->periodic_timer, s->next_periodic_time); @@ -186,7 +255,7 @@ static void rtc_periodic_timer(void *opaque) { RTCState *s = opaque; - periodic_timer_update(s, s->next_periodic_time); + periodic_timer_update(s, s->next_periodic_time, 0); s->cmos_data[RTC_REG_C] |= REG_C_PF; if (s->cmos_data[RTC_REG_B] & REG_B_PIE) { s->cmos_data[RTC_REG_C] |= REG_C_IRQF; @@ -391,6 +460,7 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { RTCState *s = opaque; + uint32_t old_period; bool update_periodic_timer; if ((addr & 1) == 0) { @@ -425,6 +495,7 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, break; case RTC_REG_A: update_periodic_timer = (s->cmos_data[RTC_REG_A] ^ data) & 0x0f; + old_period = rtc_periodic_clock_ticks(s); if ((data & 0x60) == 0x60) { if (rtc_running(s)) { @@ -450,7 +521,8 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, (s->cmos_data[RTC_REG_A] & REG_A_UIP); if (update_periodic_timer) { - periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + periodic_timer_update(s, qemu_clock_get_ns(rtc_clock), + old_period); } check_update_timer(s); @@ -458,6 +530,7 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, case RTC_REG_B: update_periodic_timer = (s->cmos_data[RTC_REG_B] ^ data) & REG_B_PIE; + old_period = rtc_periodic_clock_ticks(s); if (data & REG_B_SET) { /* update cmos to when the rtc was stopping */ @@ -487,7 +560,8 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, s->cmos_data[RTC_REG_B] = data; if (update_periodic_timer) { - periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + periodic_timer_update(s, qemu_clock_get_ns(rtc_clock), + old_period); } check_update_timer(s); @@ -757,7 +831,7 @@ static int rtc_post_load(void *opaque, int version_id) uint64_t now = qemu_clock_get_ns(rtc_clock); if (now < s->next_periodic_time || now > (s->next_periodic_time + get_max_clock_jump())) { - periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + periodic_timer_update(s, qemu_clock_get_ns(rtc_clock), 0); } } @@ -822,7 +896,7 @@ static void rtc_notify_clock_reset(Notifier *notifier, void *data) int64_t now = *(int64_t *)data; rtc_set_date_from_host(ISA_DEVICE(s)); - periodic_timer_update(s, now); + periodic_timer_update(s, now, 0); check_update_timer(s); #ifdef TARGET_I386 if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) { -- cgit v1.2.3-55-g7522 From 4aa70a0e9cd0c0332a8369df8c4f6d8e22fafe23 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 10 May 2017 16:32:57 +0800 Subject: mc146818rtc: ensure LOST_TICK_POLICY_SLEW is only enabled on TARGET_I386 Any tick policy specified on other platforms rather on TARGET_I386 will fall back to LOST_TICK_POLICY_DISCARD silently, this patch makes sure only TARGET_I386 can enable LOST_TICK_POLICY_SLEW After that, we can enable LOST_TICK_POLICY_SLEW in the common code which need not use '#ifdef TARGET_I386' to make these code be x86 specific anymore Signed-off-by: Xiao Guangrong Message-Id: <20170510083259.3900-4-xiaoguangrong@tencent.com> Signed-off-by: Paolo Bonzini --- hw/timer/mc146818rtc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index aeb60cc3e3..4870a72015 100644 --- a/hw/timer/mc146818rtc.c +++ b/hw/timer/mc146818rtc.c @@ -974,19 +974,19 @@ static void rtc_realizefn(DeviceState *dev, Error **errp) rtc_set_date_from_host(isadev); -#ifdef TARGET_I386 switch (s->lost_tick_policy) { +#ifdef TARGET_I386 case LOST_TICK_POLICY_SLEW: s->coalesced_timer = timer_new_ns(rtc_clock, rtc_coalesced_timer, s); break; +#endif case LOST_TICK_POLICY_DISCARD: break; default: error_setg(errp, "Invalid lost tick policy."); return; } -#endif s->periodic_timer = timer_new_ns(rtc_clock, rtc_periodic_timer, s); s->update_timer = timer_new_ns(rtc_clock, rtc_update_timer, s); -- cgit v1.2.3-55-g7522 From 388ad5d2969b70242a385031caadef46328bf940 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 10 May 2017 16:32:58 +0800 Subject: mc146818rtc: drop unnecessary '#ifdef TARGET_I386' If the code purely depends on LOST_TICK_POLICY_SLEW, we can simply drop '#ifdef TARGET_I386' as only x86 can enable this tick policy Signed-off-by: Xiao Guangrong Message-Id: <20170510083259.3900-5-xiaoguangrong@tencent.com> Signed-off-by: Paolo Bonzini --- hw/timer/mc146818rtc.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index 4870a72015..f9d6181d58 100644 --- a/hw/timer/mc146818rtc.c +++ b/hw/timer/mc146818rtc.c @@ -112,7 +112,6 @@ static uint64_t get_guest_rtc_ns(RTCState *s) guest_clock - s->last_update + s->offset; } -#ifdef TARGET_I386 static void rtc_coalesced_timer_update(RTCState *s) { if (s->irq_coalesced == 0) { @@ -126,6 +125,7 @@ static void rtc_coalesced_timer_update(RTCState *s) } } +#ifdef TARGET_I386 static void rtc_coalesced_timer(void *opaque) { RTCState *s = opaque; @@ -198,7 +198,6 @@ periodic_timer_update(RTCState *s, int64_t current_time, uint32_t old_period) assert(lost_clock >= 0); } -#ifdef TARGET_I386 /* * s->irq_coalesced can change for two reasons: * @@ -227,9 +226,7 @@ periodic_timer_update(RTCState *s, int64_t current_time, uint32_t old_period) s->irq_coalesced, old_period, s->period); rtc_coalesced_timer_update(s); } - } else -#endif - { + } else { /* * no way to compensate the interrupt if LOST_TICK_POLICY_SLEW * is not used, we should make the time progress anyway. @@ -244,9 +241,7 @@ periodic_timer_update(RTCState *s, int64_t current_time, uint32_t old_period) RTC_CLOCK_RATE) + 1; timer_mod(s->periodic_timer, s->next_periodic_time); } else { -#ifdef TARGET_I386 s->irq_coalesced = 0; -#endif timer_del(s->periodic_timer); } } @@ -835,13 +830,11 @@ static int rtc_post_load(void *opaque, int version_id) } } -#ifdef TARGET_I386 if (version_id >= 2) { if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) { rtc_coalesced_timer_update(s); } } -#endif return 0; } @@ -898,11 +891,10 @@ static void rtc_notify_clock_reset(Notifier *notifier, void *data) rtc_set_date_from_host(ISA_DEVICE(s)); periodic_timer_update(s, now, 0); check_update_timer(s); -#ifdef TARGET_I386 + if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) { rtc_coalesced_timer_update(s); } -#endif } /* set CMOS shutdown status register (index 0xF) as S3_resume(0xFE) @@ -923,12 +915,10 @@ static void rtc_reset(void *opaque) qemu_irq_lower(s->irq); -#ifdef TARGET_I386 if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) { s->irq_coalesced = 0; s->irq_reinject_on_ack_count = 0; } -#endif } static const MemoryRegionOps cmos_ops = { -- cgit v1.2.3-55-g7522 From e0c8b950d17a57343926ed937af10e8903b0d6cc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 10 May 2017 16:32:59 +0800 Subject: mc146818rtc: embrace all x86 specific code Introduce a function, rtc_policy_slew_deliver_irq(), which delivers irq if LOST_TICK_POLICY_SLEW is used, as which is only supported on x86, other platforms call it will trigger a assert After that, we can move the x86 specific code to the common place Signed-off-by: Xiao Guangrong Message-Id: <20170510083259.3900-6-xiaoguangrong@tencent.com> Signed-off-by: Paolo Bonzini --- hw/timer/mc146818rtc.c | 60 ++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index f9d6181d58..542cd09bc1 100644 --- a/hw/timer/mc146818rtc.c +++ b/hw/timer/mc146818rtc.c @@ -125,17 +125,34 @@ static void rtc_coalesced_timer_update(RTCState *s) } } +static QLIST_HEAD(, RTCState) rtc_devices = + QLIST_HEAD_INITIALIZER(rtc_devices); + #ifdef TARGET_I386 +void qmp_rtc_reset_reinjection(Error **errp) +{ + RTCState *s; + + QLIST_FOREACH(s, &rtc_devices, link) { + s->irq_coalesced = 0; + } +} + +static bool rtc_policy_slew_deliver_irq(RTCState *s) +{ + apic_reset_irq_delivered(); + qemu_irq_raise(s->irq); + return apic_get_irq_delivered(); +} + static void rtc_coalesced_timer(void *opaque) { RTCState *s = opaque; if (s->irq_coalesced != 0) { - apic_reset_irq_delivered(); s->cmos_data[RTC_REG_C] |= 0xc0; DPRINTF_C("cmos: injecting from timer\n"); - qemu_irq_raise(s->irq); - if (apic_get_irq_delivered()) { + if (rtc_policy_slew_deliver_irq(s)) { s->irq_coalesced--; DPRINTF_C("cmos: coalesced irqs decreased to %d\n", s->irq_coalesced); @@ -144,6 +161,12 @@ static void rtc_coalesced_timer(void *opaque) rtc_coalesced_timer_update(s); } +#else +static bool rtc_policy_slew_deliver_irq(RTCState *s) +{ + assert(0); + return false; +} #endif static uint32_t rtc_periodic_clock_ticks(RTCState *s) @@ -254,21 +277,17 @@ static void rtc_periodic_timer(void *opaque) s->cmos_data[RTC_REG_C] |= REG_C_PF; if (s->cmos_data[RTC_REG_B] & REG_B_PIE) { s->cmos_data[RTC_REG_C] |= REG_C_IRQF; -#ifdef TARGET_I386 if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) { if (s->irq_reinject_on_ack_count >= RTC_REINJECT_ON_ACK_COUNT) - s->irq_reinject_on_ack_count = 0; - apic_reset_irq_delivered(); - qemu_irq_raise(s->irq); - if (!apic_get_irq_delivered()) { + s->irq_reinject_on_ack_count = 0; + if (!rtc_policy_slew_deliver_irq(s)) { s->irq_coalesced++; rtc_coalesced_timer_update(s); DPRINTF_C("cmos: coalesced irqs increased to %d\n", s->irq_coalesced); } } else -#endif - qemu_irq_raise(s->irq); + qemu_irq_raise(s->irq); } } @@ -612,20 +631,6 @@ static void rtc_get_time(RTCState *s, struct tm *tm) rtc_from_bcd(s, s->cmos_data[RTC_CENTURY]) * 100 - 1900; } -static QLIST_HEAD(, RTCState) rtc_devices = - QLIST_HEAD_INITIALIZER(rtc_devices); - -#ifdef TARGET_I386 -void qmp_rtc_reset_reinjection(Error **errp) -{ - RTCState *s; - - QLIST_FOREACH(s, &rtc_devices, link) { - s->irq_coalesced = 0; - } -} -#endif - static void rtc_set_time(RTCState *s) { struct tm tm; @@ -745,22 +750,19 @@ static uint64_t cmos_ioport_read(void *opaque, hwaddr addr, if (ret & (REG_C_UF | REG_C_AF)) { check_update_timer(s); } -#ifdef TARGET_I386 + if(s->irq_coalesced && (s->cmos_data[RTC_REG_B] & REG_B_PIE) && s->irq_reinject_on_ack_count < RTC_REINJECT_ON_ACK_COUNT) { s->irq_reinject_on_ack_count++; s->cmos_data[RTC_REG_C] |= REG_C_IRQF | REG_C_PF; - apic_reset_irq_delivered(); DPRINTF_C("cmos: injecting on ack\n"); - qemu_irq_raise(s->irq); - if (apic_get_irq_delivered()) { + if (rtc_policy_slew_deliver_irq(s)) { s->irq_coalesced--; DPRINTF_C("cmos: coalesced irqs decreased to %d\n", s->irq_coalesced); } } -#endif break; default: ret = s->cmos_data[s->cmos_index]; -- cgit v1.2.3-55-g7522 From bd618eab7641693f0838da52c5af5c8050f831d3 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 27 May 2017 10:53:01 +0800 Subject: qtest: add rtc periodic timer test It tests the accuracy of rtc periodic timer which is recently improved & fixed by commit 7ffcb539a3 ("mc146818rtc: precisely count the clock for periodic timer", 2017-05-19). Signed-off-by: Xiao Guangrong Message-Id: <20170527025301.23499-1-xiaoguangrong@tencent.com> Signed-off-by: Paolo Bonzini --- hw/timer/mc146818rtc.c | 15 +++--------- include/hw/timer/mc146818rtc_regs.h | 20 +++++++++++++++ tests/rtc-test.c | 49 +++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 12 deletions(-) diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index 542cd09bc1..1b8d3d7d4c 100644 --- a/hw/timer/mc146818rtc.c +++ b/hw/timer/mc146818rtc.c @@ -120,7 +120,7 @@ static void rtc_coalesced_timer_update(RTCState *s) /* divide each RTC interval to 2 - 8 smaller intervals */ int c = MIN(s->irq_coalesced, 7) + 1; int64_t next_clock = qemu_clock_get_ns(rtc_clock) + - muldiv64(s->period / c, NANOSECONDS_PER_SECOND, RTC_CLOCK_RATE); + periodic_clock_to_ns(s->period / c); timer_mod(s->coalesced_timer, next_clock); } } @@ -178,16 +178,8 @@ static uint32_t rtc_periodic_clock_ticks(RTCState *s) } period_code = s->cmos_data[RTC_REG_A] & 0x0f; - if (!period_code) { - return 0; - } - - if (period_code <= 2) { - period_code += 7; - } - /* period in 32 Khz cycles */ - return 1 << (period_code - 1); + return periodic_period_to_clock(period_code); } /* @@ -260,8 +252,7 @@ periodic_timer_update(RTCState *s, int64_t current_time, uint32_t old_period) assert(lost_clock >= 0 && lost_clock <= period); next_irq_clock = cur_clock + period - lost_clock; - s->next_periodic_time = muldiv64(next_irq_clock, NANOSECONDS_PER_SECOND, - RTC_CLOCK_RATE) + 1; + s->next_periodic_time = periodic_clock_to_ns(next_irq_clock) + 1; timer_mod(s->periodic_timer, s->next_periodic_time); } else { s->irq_coalesced = 0; diff --git a/include/hw/timer/mc146818rtc_regs.h b/include/hw/timer/mc146818rtc_regs.h index 6ede6c832e..c62f17bf2d 100644 --- a/include/hw/timer/mc146818rtc_regs.h +++ b/include/hw/timer/mc146818rtc_regs.h @@ -65,4 +65,24 @@ #define REG_C_AF 0x20 #define REG_C_MASK 0x70 +static inline uint32_t periodic_period_to_clock(int period_code) +{ + if (!period_code) { + return 0; + } + + if (period_code <= 2) { + period_code += 7; + } + /* period in 32 Khz cycles */ + return 1 << (period_code - 1); +} + +#define RTC_CLOCK_RATE 32768 + +static inline int64_t periodic_clock_to_ns(int64_t clocks) +{ + return muldiv64(clocks, NANOSECONDS_PER_SECOND, RTC_CLOCK_RATE); +} + #endif diff --git a/tests/rtc-test.c b/tests/rtc-test.c index a086efd120..e78f701afb 100644 --- a/tests/rtc-test.c +++ b/tests/rtc-test.c @@ -14,6 +14,7 @@ #include "qemu/osdep.h" #include "libqtest.h" +#include "qemu/timer.h" #include "hw/timer/mc146818rtc_regs.h" static uint8_t base = 0x70; @@ -542,6 +543,52 @@ static void register_b_set_flag(void) g_assert_cmpint(cmos_read(RTC_CENTURY), ==, 0x20); } +#define RTC_PERIOD_CODE1 13 /* 8 Hz */ +#define RTC_PERIOD_CODE2 15 /* 2 Hz */ + +#define RTC_PERIOD_TEST_NR 50 + +static uint64_t wait_periodic_interrupt(uint64_t real_time) +{ + while (!get_irq(RTC_ISA_IRQ)) { + real_time = clock_step_next(); + } + + g_assert((cmos_read(RTC_REG_C) & REG_C_PF) != 0); + return real_time; +} + +static void periodic_timer(void) +{ + int i; + uint64_t period_clocks, period_time, start_time, real_time; + + /* disable all interrupts. */ + cmos_write(RTC_REG_B, cmos_read(RTC_REG_B) & + ~(REG_B_PIE | REG_B_AIE | REG_B_UIE)); + cmos_write(RTC_REG_A, RTC_PERIOD_CODE1); + /* enable periodic interrupt after properly configure the period. */ + cmos_write(RTC_REG_B, cmos_read(RTC_REG_B) | REG_B_PIE); + + start_time = real_time = clock_step_next(); + + for (i = 0; i < RTC_PERIOD_TEST_NR; i++) { + cmos_write(RTC_REG_A, RTC_PERIOD_CODE1); + real_time = wait_periodic_interrupt(real_time); + cmos_write(RTC_REG_A, RTC_PERIOD_CODE2); + real_time = wait_periodic_interrupt(real_time); + } + + period_clocks = periodic_period_to_clock(RTC_PERIOD_CODE1) + + periodic_period_to_clock(RTC_PERIOD_CODE2); + period_clocks *= RTC_PERIOD_TEST_NR; + period_time = periodic_clock_to_ns(period_clocks); + + real_time -= start_time; + g_assert_cmpint(ABS((int64_t)(real_time - period_time)), <=, + NANOSECONDS_PER_SECOND * 0.5); +} + int main(int argc, char **argv) { QTestState *s = NULL; @@ -564,6 +611,8 @@ int main(int argc, char **argv) qtest_add_func("/rtc/set-year/1980", set_year_1980); qtest_add_func("/rtc/misc/register_b_set_flag", register_b_set_flag); qtest_add_func("/rtc/misc/fuzz-registers", fuzz_registers); + qtest_add_func("/rtc/periodic/interrupt", periodic_timer); + ret = g_test_run(); if (s) { -- cgit v1.2.3-55-g7522 From 9ba35d0b865361f9838f830672ab229a569024eb Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 May 2017 14:00:42 +0800 Subject: kvm: irqchip: trace changes on msi add/remove It'll be nice to know which virq belongs to which device/vector when adding msi routes, so adding two more parameters for the add trace. Meanwhile, releasing virq has no tracing before. Add one for it. Signed-off-by: Peter Xu Message-Id: <1494309644-18743-2-git-send-email-peterx@redhat.com> Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Paolo Bonzini --- kvm-all.c | 4 +++- trace-events | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 494b9256aa..1b9fe23490 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1144,6 +1144,7 @@ void kvm_irqchip_release_virq(KVMState *s, int virq) } clear_gsi(s, virq); kvm_arch_release_virq_post(virq); + trace_kvm_irqchip_release_virq(virq); } static unsigned int kvm_hash_msi(uint32_t data) @@ -1287,7 +1288,8 @@ int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) return -EINVAL; } - trace_kvm_irqchip_add_msi_route(virq); + trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A", + vector, virq); kvm_add_routing_entry(s, &kroute); kvm_arch_add_msi_route_post(&kroute, vector, dev); diff --git a/trace-events b/trace-events index d7a4d94168..b496be94d4 100644 --- a/trace-events +++ b/trace-events @@ -62,8 +62,9 @@ kvm_device_ioctl(int fd, int type, void *arg) "dev fd %d, type 0x%x, arg %p" kvm_failed_reg_get(uint64_t id, const char *msg) "Warning: Unable to retrieve ONEREG %" PRIu64 " from KVM: %s" kvm_failed_reg_set(uint64_t id, const char *msg) "Warning: Unable to set ONEREG %" PRIu64 " to KVM: %s" kvm_irqchip_commit_routes(void) "" -kvm_irqchip_add_msi_route(int virq) "Adding MSI route virq=%d" +kvm_irqchip_add_msi_route(char *name, int vector, int virq) "dev %s vector %d virq %d" kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d" +kvm_irqchip_release_virq(int virq) "virq %d" # TCG related tracing (mostly disabled by default) # cpu-exec.c -- cgit v1.2.3-55-g7522 From 993b1f4b2ceb7b09a7153aa01d03bdf95972e61d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 May 2017 14:00:43 +0800 Subject: msix: trace control bit write op Meanwhile, abstract a function to detect msix masked bit. Signed-off-by: Peter Xu Message-Id: <1494309644-18743-3-git-send-email-peterx@redhat.com> Acked-by: Michael S. Tsirkin Reviewed-by: Michael S. Tsirkin Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Paolo Bonzini --- hw/pci/msix.c | 11 +++++++++-- hw/pci/trace-events | 3 +++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/hw/pci/msix.c b/hw/pci/msix.c index bb54e8b0ac..fc5fe511b3 100644 --- a/hw/pci/msix.c +++ b/hw/pci/msix.c @@ -22,6 +22,7 @@ #include "hw/xen/xen.h" #include "qemu/range.h" #include "qapi/error.h" +#include "trace.h" #define MSIX_CAP_LENGTH 12 @@ -130,10 +131,14 @@ static void msix_handle_mask_update(PCIDevice *dev, int vector, bool was_masked) } } +static bool msix_masked(PCIDevice *dev) +{ + return dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & MSIX_MASKALL_MASK; +} + static void msix_update_function_masked(PCIDevice *dev) { - dev->msix_function_masked = !msix_enabled(dev) || - (dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & MSIX_MASKALL_MASK); + dev->msix_function_masked = !msix_enabled(dev) || msix_masked(dev); } /* Handle MSI-X capability config write. */ @@ -148,6 +153,8 @@ void msix_write_config(PCIDevice *dev, uint32_t addr, return; } + trace_msix_write_config(dev->name, msix_enabled(dev), msix_masked(dev)); + was_masked = dev->msix_function_masked; msix_update_function_masked(dev); diff --git a/hw/pci/trace-events b/hw/pci/trace-events index 2b9cf24405..83c8f5ace7 100644 --- a/hw/pci/trace-events +++ b/hw/pci/trace-events @@ -7,3 +7,6 @@ pci_update_mappings_add(void *d, uint32_t bus, uint32_t slot, uint32_t func, int # hw/pci/pci_host.c pci_cfg_read(const char *dev, unsigned devid, unsigned fnid, unsigned offs, unsigned val) "%s %02u:%u @0x%x -> 0x%x" pci_cfg_write(const char *dev, unsigned devid, unsigned fnid, unsigned offs, unsigned val) "%s %02u:%u @0x%x <- 0x%x" + +# hw/pci/msix.c +msix_write_config(char *name, bool enabled, bool masked) "dev %s enabled %d masked %d" -- cgit v1.2.3-55-g7522 From fd563564222f308e1d86847efdec8555fb472536 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 May 2017 14:00:44 +0800 Subject: kvm: irqchip: skip update msi when disabled It's possible that one device kept its irqfd/virq there even when MSI/MSIX was disabled globally for that device. One example is virtio-net-pci (see commit f1d0f15a6 and virtio_pci_vq_vector_mask()). It is used as a fast path to avoid allocate/release irqfd/virq frequently when guest enables/disables MSIX. However, this fast path brought a problem to msi_route_list, that the device MSIRouteEntry is still dangling there even if MSIX disabled - then we cannot know which message to fetch, even if we can, the messages are meaningless. In this case, we can just simply ignore this entry. It's safe, since when MSIX is enabled again, we'll rebuild them no matter what. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1448813 Signed-off-by: Peter Xu Message-Id: <1494309644-18743-4-git-send-email-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- target/i386/kvm.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 49b6115eae..9087677d00 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -43,6 +43,7 @@ #include "standard-headers/asm-x86/hyperv.h" #include "hw/pci/pci.h" #include "hw/pci/msi.h" +#include "hw/pci/msix.h" #include "migration/blocker.h" #include "exec/memattrs.h" #include "trace.h" @@ -3510,12 +3511,17 @@ static void kvm_update_msi_routes_all(void *private, bool global, int cnt = 0; MSIRouteEntry *entry; MSIMessage msg; + PCIDevice *dev; + /* TODO: explicit route update */ QLIST_FOREACH(entry, &msi_route_list, list) { cnt++; - msg = pci_get_msi_message(entry->dev, entry->vector); - kvm_irqchip_update_msi_route(kvm_state, entry->virq, - msg, entry->dev); + dev = entry->dev; + if (!msix_enabled(dev) && !msi_enabled(dev)) { + continue; + } + msg = pci_get_msi_message(dev, entry->vector); + kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev); } kvm_irqchip_commit_routes(kvm_state); trace_kvm_x86_update_msi_routes(cnt); -- cgit v1.2.3-55-g7522 From 7e6478e7d4f2c4b607069bf488d57089a9d3244b Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 9 May 2017 12:04:52 -0700 Subject: Check the return value of fcntl in qemu_set_cloexec Assert that the return value is not an error. This issue was found by Coverity. CID: 1374831 Signed-off-by: Stefano Stabellini CC: groug@kaod.org CC: pbonzini@redhat.com CC: Eric Blake Message-Id: <1494356693-13190-2-git-send-email-sstabellini@kernel.org> Signed-off-by: Paolo Bonzini --- util/oslib-posix.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/util/oslib-posix.c b/util/oslib-posix.c index 7e28c161b2..048d40d9de 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -182,7 +182,9 @@ void qemu_set_cloexec(int fd) { int f; f = fcntl(fd, F_GETFD); - fcntl(fd, F_SETFD, f | FD_CLOEXEC); + assert(f != -1); + f = fcntl(fd, F_SETFD, f | FD_CLOEXEC); + assert(f != -1); } /* -- cgit v1.2.3-55-g7522 From f250a42ddaee042ad2eb02022a3ebd18fcf987de Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 16 May 2017 12:45:29 +0300 Subject: nbd: strict nbd_wr_syncv nbd_wr_syncv is called either from coroutine or from client negotiation code, when socket is in blocking mode. So, -EAGAIN is impossible. Furthermore, EAGAIN is confusing, as, what to read/write again? With EAGAIN as a return code we don't know how much data is already read or written by the function, so in case of EAGAIN the whole communication is broken. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20170516094533.6160-2-vsementsov@virtuozzo.com> Signed-off-by: Paolo Bonzini --- nbd/common.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/nbd/common.c b/nbd/common.c index dccbb8e9de..4db45b3ede 100644 --- a/nbd/common.c +++ b/nbd/common.c @@ -20,6 +20,10 @@ #include "qapi/error.h" #include "nbd-internal.h" +/* nbd_wr_syncv + * The function may be called from coroutine or from non-coroutine context. + * When called from non-coroutine context @ioc must be in blocking mode. + */ ssize_t nbd_wr_syncv(QIOChannel *ioc, struct iovec *iov, size_t niov, @@ -42,11 +46,8 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, len = qio_channel_writev(ioc, local_iov, nlocal_iov, &local_err); } if (len == QIO_CHANNEL_ERR_BLOCK) { - if (qemu_in_coroutine()) { - qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT); - } else { - return -EAGAIN; - } + assert(qemu_in_coroutine()); + qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT); continue; } if (len < 0) { -- cgit v1.2.3-55-g7522 From f5d406fe86bb28da85824b6581e58980cc1a07f3 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 16 May 2017 12:45:30 +0300 Subject: nbd: read_sync and friends: return 0 on success functions read_sync, drop_sync, write_sync, and also nbd_negotiate_write, nbd_negotiate_read, nbd_negotiate_drop_sync returns number of processed bytes. But what this number can be, except requested number of bytes? Actually, underlying nbd_wr_syncv function returns a value >= 0 and != requested_bytes only on eof on read operation. So, firstly, it is impossible on write (let's add an assert) and on read it actually means, that communication is broken (except nbd_receive_reply, see below). Most of callers operate like this: if (func(..., size) != size) { /* error path */ } , i.e.: 1. They are not interested in partial success 2. Extra duplications in code (especially bad are duplications of magic numbers) 3. User doesn't see actual error message, as return code is lost. (this patch doesn't fix this point, but it makes fixing easier) Several callers handles ret >= 0 and != requested-size separately, by just returning EINVAL in this case. This patch makes read_sync and friends return EINVAL in this case, so final behavior is the same. And only one caller - nbd_receive_reply() does something not so obvious. It returns EINVAL for ret > 0 and != requested-size, like previous group, but for ret == 0 it returns 0. The only caller of nbd_receive_reply() - nbd_read_reply_entry() handles ret == 0 in the same way as ret < 0, so for now it doesn't matter. However, in following commits error path handling will be improved and we'll need to distinguish success from fail in this case too. So, this patch adds separate helper for this case - read_sync_eof. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20170516094533.6160-3-vsementsov@virtuozzo.com> Signed-off-by: Paolo Bonzini --- nbd/client.c | 63 ++++++++++++++++------------------------ nbd/nbd-internal.h | 34 +++++++++++++++++++--- nbd/server.c | 84 +++++++++++++++++++++--------------------------------- 3 files changed, 88 insertions(+), 93 deletions(-) diff --git a/nbd/client.c b/nbd/client.c index a58fb02cb4..6b74a628f1 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -86,9 +86,9 @@ static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); */ -/* Discard length bytes from channel. Return -errno on failure, or - * the amount of bytes consumed. */ -static ssize_t drop_sync(QIOChannel *ioc, size_t size) +/* Discard length bytes from channel. Return -errno on failure and 0 on + * success*/ +static int drop_sync(QIOChannel *ioc, size_t size) { ssize_t ret = 0; char small[1024]; @@ -96,14 +96,13 @@ static ssize_t drop_sync(QIOChannel *ioc, size_t size) buffer = sizeof(small) >= size ? small : g_malloc(MIN(65536, size)); while (size > 0) { - ssize_t count = read_sync(ioc, buffer, MIN(65536, size)); + ssize_t count = MIN(65536, size); + ret = read_sync(ioc, buffer, MIN(65536, size)); - if (count <= 0) { + if (ret < 0) { goto cleanup; } - assert(count <= size); size -= count; - ret += count; } cleanup: @@ -136,12 +135,12 @@ static int nbd_send_option_request(QIOChannel *ioc, uint32_t opt, stl_be_p(&req.option, opt); stl_be_p(&req.length, len); - if (write_sync(ioc, &req, sizeof(req)) != sizeof(req)) { + if (write_sync(ioc, &req, sizeof(req)) < 0) { error_setg(errp, "Failed to send option request header"); return -1; } - if (len && write_sync(ioc, (char *) data, len) != len) { + if (len && write_sync(ioc, (char *) data, len) < 0) { error_setg(errp, "Failed to send option request data"); return -1; } @@ -170,7 +169,7 @@ static int nbd_receive_option_reply(QIOChannel *ioc, uint32_t opt, nbd_opt_reply *reply, Error **errp) { QEMU_BUILD_BUG_ON(sizeof(*reply) != 20); - if (read_sync(ioc, reply, sizeof(*reply)) != sizeof(*reply)) { + if (read_sync(ioc, reply, sizeof(*reply)) < 0) { error_setg(errp, "failed to read option reply"); nbd_send_opt_abort(ioc); return -1; @@ -219,7 +218,7 @@ static int nbd_handle_reply_err(QIOChannel *ioc, nbd_opt_reply *reply, goto cleanup; } msg = g_malloc(reply->length + 1); - if (read_sync(ioc, msg, reply->length) != reply->length) { + if (read_sync(ioc, msg, reply->length) < 0) { error_setg(errp, "failed to read option error message"); goto cleanup; } @@ -321,7 +320,7 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, nbd_send_opt_abort(ioc); return -1; } - if (read_sync(ioc, &namelen, sizeof(namelen)) != sizeof(namelen)) { + if (read_sync(ioc, &namelen, sizeof(namelen)) < 0) { error_setg(errp, "failed to read option name length"); nbd_send_opt_abort(ioc); return -1; @@ -334,7 +333,7 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, return -1; } if (namelen != strlen(want)) { - if (drop_sync(ioc, len) != len) { + if (drop_sync(ioc, len) < 0) { error_setg(errp, "failed to skip export name with wrong length"); nbd_send_opt_abort(ioc); return -1; @@ -343,14 +342,14 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, } assert(namelen < sizeof(name)); - if (read_sync(ioc, name, namelen) != namelen) { + if (read_sync(ioc, name, namelen) < 0) { error_setg(errp, "failed to read export name"); nbd_send_opt_abort(ioc); return -1; } name[namelen] = '\0'; len -= namelen; - if (drop_sync(ioc, len) != len) { + if (drop_sync(ioc, len) < 0) { error_setg(errp, "failed to read export description"); nbd_send_opt_abort(ioc); return -1; @@ -477,7 +476,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, buf, 8) != 8) { + if (read_sync(ioc, buf, 8) < 0) { error_setg(errp, "Failed to read data"); goto fail; } @@ -503,7 +502,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, &magic, sizeof(magic)) != sizeof(magic)) { + if (read_sync(ioc, &magic, sizeof(magic)) < 0) { error_setg(errp, "Failed to read magic"); goto fail; } @@ -515,8 +514,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, uint16_t globalflags; bool fixedNewStyle = false; - if (read_sync(ioc, &globalflags, sizeof(globalflags)) != - sizeof(globalflags)) { + if (read_sync(ioc, &globalflags, sizeof(globalflags)) < 0) { error_setg(errp, "Failed to read server flags"); goto fail; } @@ -534,8 +532,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } /* client requested flags */ clientflags = cpu_to_be32(clientflags); - if (write_sync(ioc, &clientflags, sizeof(clientflags)) != - sizeof(clientflags)) { + if (write_sync(ioc, &clientflags, sizeof(clientflags)) < 0) { error_setg(errp, "Failed to send clientflags field"); goto fail; } @@ -573,13 +570,13 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } /* Read the response */ - if (read_sync(ioc, &s, sizeof(s)) != sizeof(s)) { + if (read_sync(ioc, &s, sizeof(s)) < 0) { error_setg(errp, "Failed to read export length"); goto fail; } *size = be64_to_cpu(s); - if (read_sync(ioc, flags, sizeof(*flags)) != sizeof(*flags)) { + if (read_sync(ioc, flags, sizeof(*flags)) < 0) { error_setg(errp, "Failed to read export flags"); goto fail; } @@ -596,14 +593,14 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, &s, sizeof(s)) != sizeof(s)) { + if (read_sync(ioc, &s, sizeof(s)) < 0) { error_setg(errp, "Failed to read export length"); goto fail; } *size = be64_to_cpu(s); TRACE("Size is %" PRIu64, *size); - if (read_sync(ioc, &oldflags, sizeof(oldflags)) != sizeof(oldflags)) { + if (read_sync(ioc, &oldflags, sizeof(oldflags)) < 0) { error_setg(errp, "Failed to read export flags"); goto fail; } @@ -619,7 +616,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } TRACE("Size is %" PRIu64 ", export flags %" PRIx16, *size, *flags); - if (zeroes && drop_sync(ioc, 124) != 124) { + if (zeroes && drop_sync(ioc, 124) < 0) { error_setg(errp, "Failed to read reserved block"); goto fail; } @@ -744,7 +741,6 @@ int nbd_disconnect(int fd) ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request) { uint8_t buf[NBD_REQUEST_SIZE]; - ssize_t ret; TRACE("Sending request to server: " "{ .from = %" PRIu64", .len = %" PRIu32 ", .handle = %" PRIu64 @@ -759,16 +755,7 @@ ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request) stq_be_p(buf + 16, request->from); stl_be_p(buf + 24, request->len); - ret = write_sync(ioc, buf, sizeof(buf)); - if (ret < 0) { - return ret; - } - - if (ret != sizeof(buf)) { - LOG("writing to socket failed"); - return -EINVAL; - } - return 0; + return write_sync(ioc, buf, sizeof(buf)); } ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) @@ -777,7 +764,7 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) uint32_t magic; ssize_t ret; - ret = read_sync(ioc, buf, sizeof(buf)); + ret = read_sync_eof(ioc, buf, sizeof(buf)); if (ret <= 0) { return ret; } diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index f43d990a05..e6bbc7c4b4 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -94,7 +94,13 @@ #define NBD_ENOSPC 28 #define NBD_ESHUTDOWN 108 -static inline ssize_t read_sync(QIOChannel *ioc, void *buffer, size_t size) +/* read_sync_eof + * Tries to read @size bytes from @ioc. Returns number of bytes actually read. + * May return a value >= 0 and < size only on EOF, i.e. when iteratively called + * qio_channel_readv() returns 0. So, there are no needs to call read_sync_eof + * iteratively. + */ +static inline ssize_t read_sync_eof(QIOChannel *ioc, void *buffer, size_t size) { struct iovec iov = { .iov_base = buffer, .iov_len = size }; /* Sockets are kept in blocking mode in the negotiation phase. After @@ -105,12 +111,32 @@ static inline ssize_t read_sync(QIOChannel *ioc, void *buffer, size_t size) return nbd_wr_syncv(ioc, &iov, 1, size, true); } -static inline ssize_t write_sync(QIOChannel *ioc, const void *buffer, - size_t size) +/* read_sync + * Reads @size bytes from @ioc. Returns 0 on success. + */ +static inline int read_sync(QIOChannel *ioc, void *buffer, size_t size) +{ + ssize_t ret = read_sync_eof(ioc, buffer, size); + + if (ret >= 0 && ret != size) { + ret = -EINVAL; + } + + return ret < 0 ? ret : 0; +} + +/* write_sync + * Writes @size bytes to @ioc. Returns 0 on success. + */ +static inline int write_sync(QIOChannel *ioc, const void *buffer, size_t size) { struct iovec iov = { .iov_base = (void *) buffer, .iov_len = size }; - return nbd_wr_syncv(ioc, &iov, 1, size, false); + ssize_t ret = nbd_wr_syncv(ioc, &iov, 1, size, false); + + assert(ret < 0 || ret == size); + + return ret < 0 ? ret : 0; } struct NBDTLSHandshakeData { diff --git a/nbd/server.c b/nbd/server.c index 924a1fe2db..1e1096c762 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -112,7 +112,7 @@ static gboolean nbd_negotiate_continue(QIOChannel *ioc, return TRUE; } -static ssize_t nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size) +static int nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size) { ssize_t ret; guint watch; @@ -130,8 +130,7 @@ static ssize_t nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size) } -static ssize_t nbd_negotiate_write(QIOChannel *ioc, const void *buffer, - size_t size) +static int nbd_negotiate_write(QIOChannel *ioc, const void *buffer, size_t size) { ssize_t ret; guint watch; @@ -148,24 +147,24 @@ static ssize_t nbd_negotiate_write(QIOChannel *ioc, const void *buffer, return ret; } -static ssize_t nbd_negotiate_drop_sync(QIOChannel *ioc, size_t size) +static int nbd_negotiate_drop_sync(QIOChannel *ioc, size_t size) { - ssize_t ret, dropped = size; + ssize_t ret; uint8_t *buffer = g_malloc(MIN(65536, size)); while (size > 0) { - ret = nbd_negotiate_read(ioc, buffer, MIN(65536, size)); + size_t count = MIN(65536, size); + ret = nbd_negotiate_read(ioc, buffer, count); if (ret < 0) { g_free(buffer); return ret; } - assert(ret <= size); - size -= ret; + size -= count; } g_free(buffer); - return dropped; + return 0; } /* Basic flow for negotiation @@ -206,22 +205,22 @@ static int nbd_negotiate_send_rep_len(QIOChannel *ioc, uint32_t type, type, opt, len); magic = cpu_to_be64(NBD_REP_MAGIC); - if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) { + if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) < 0) { LOG("write failed (rep magic)"); return -EINVAL; } opt = cpu_to_be32(opt); - if (nbd_negotiate_write(ioc, &opt, sizeof(opt)) != sizeof(opt)) { + if (nbd_negotiate_write(ioc, &opt, sizeof(opt)) < 0) { LOG("write failed (rep opt)"); return -EINVAL; } type = cpu_to_be32(type); - if (nbd_negotiate_write(ioc, &type, sizeof(type)) != sizeof(type)) { + if (nbd_negotiate_write(ioc, &type, sizeof(type)) < 0) { LOG("write failed (rep type)"); return -EINVAL; } len = cpu_to_be32(len); - if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) { + if (nbd_negotiate_write(ioc, &len, sizeof(len)) < 0) { LOG("write failed (rep data length)"); return -EINVAL; } @@ -256,7 +255,7 @@ nbd_negotiate_send_rep_err(QIOChannel *ioc, uint32_t type, if (ret < 0) { goto out; } - if (nbd_negotiate_write(ioc, msg, len) != len) { + if (nbd_negotiate_write(ioc, msg, len) < 0) { LOG("write failed (error message)"); ret = -EIO; } else { @@ -287,15 +286,15 @@ static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp) } len = cpu_to_be32(name_len); - if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) { + if (nbd_negotiate_write(ioc, &len, sizeof(len)) < 0) { LOG("write failed (name length)"); return -EINVAL; } - if (nbd_negotiate_write(ioc, name, name_len) != name_len) { + if (nbd_negotiate_write(ioc, name, name_len) < 0) { LOG("write failed (name buffer)"); return -EINVAL; } - if (nbd_negotiate_write(ioc, desc, desc_len) != desc_len) { + if (nbd_negotiate_write(ioc, desc, desc_len) < 0) { LOG("write failed (description buffer)"); return -EINVAL; } @@ -309,7 +308,7 @@ static int nbd_negotiate_handle_list(NBDClient *client, uint32_t length) NBDExport *exp; if (length) { - if (nbd_negotiate_drop_sync(client->ioc, length) != length) { + if (nbd_negotiate_drop_sync(client->ioc, length) < 0) { return -EIO; } return nbd_negotiate_send_rep_err(client->ioc, @@ -340,7 +339,7 @@ static int nbd_negotiate_handle_export_name(NBDClient *client, uint32_t length) LOG("Bad length received"); goto fail; } - if (nbd_negotiate_read(client->ioc, name, length) != length) { + if (nbd_negotiate_read(client->ioc, name, length) < 0) { LOG("read failed"); goto fail; } @@ -373,7 +372,7 @@ static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client, TRACE("Setting up TLS"); ioc = client->ioc; if (length) { - if (nbd_negotiate_drop_sync(ioc, length) != length) { + if (nbd_negotiate_drop_sync(ioc, length) < 0) { return NULL; } nbd_negotiate_send_rep_err(ioc, NBD_REP_ERR_INVALID, NBD_OPT_STARTTLS, @@ -437,8 +436,7 @@ static int nbd_negotiate_options(NBDClient *client) ... Rest of request */ - if (nbd_negotiate_read(client->ioc, &flags, sizeof(flags)) != - sizeof(flags)) { + if (nbd_negotiate_read(client->ioc, &flags, sizeof(flags)) < 0) { LOG("read failed"); return -EIO; } @@ -464,8 +462,7 @@ static int nbd_negotiate_options(NBDClient *client) uint32_t clientflags, length; uint64_t magic; - if (nbd_negotiate_read(client->ioc, &magic, sizeof(magic)) != - sizeof(magic)) { + if (nbd_negotiate_read(client->ioc, &magic, sizeof(magic)) < 0) { LOG("read failed"); return -EINVAL; } @@ -476,14 +473,14 @@ static int nbd_negotiate_options(NBDClient *client) } if (nbd_negotiate_read(client->ioc, &clientflags, - sizeof(clientflags)) != sizeof(clientflags)) { + sizeof(clientflags)) < 0) + { LOG("read failed"); return -EINVAL; } clientflags = be32_to_cpu(clientflags); - if (nbd_negotiate_read(client->ioc, &length, sizeof(length)) != - sizeof(length)) { + if (nbd_negotiate_read(client->ioc, &length, sizeof(length)) < 0) { LOG("read failed"); return -EINVAL; } @@ -513,7 +510,7 @@ static int nbd_negotiate_options(NBDClient *client) return -EINVAL; default: - if (nbd_negotiate_drop_sync(client->ioc, length) != length) { + if (nbd_negotiate_drop_sync(client->ioc, length) < 0) { return -EIO; } ret = nbd_negotiate_send_rep_err(client->ioc, @@ -551,7 +548,7 @@ static int nbd_negotiate_options(NBDClient *client) return nbd_negotiate_handle_export_name(client, length); case NBD_OPT_STARTTLS: - if (nbd_negotiate_drop_sync(client->ioc, length) != length) { + if (nbd_negotiate_drop_sync(client->ioc, length) < 0) { return -EIO; } if (client->tlscreds) { @@ -570,7 +567,7 @@ static int nbd_negotiate_options(NBDClient *client) } break; default: - if (nbd_negotiate_drop_sync(client->ioc, length) != length) { + if (nbd_negotiate_drop_sync(client->ioc, length) < 0) { return -EIO; } ret = nbd_negotiate_send_rep_err(client->ioc, @@ -659,12 +656,12 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) TRACE("TLS cannot be enabled with oldstyle protocol"); goto fail; } - if (nbd_negotiate_write(client->ioc, buf, sizeof(buf)) != sizeof(buf)) { + if (nbd_negotiate_write(client->ioc, buf, sizeof(buf)) < 0) { LOG("write failed"); goto fail; } } else { - if (nbd_negotiate_write(client->ioc, buf, 18) != 18) { + if (nbd_negotiate_write(client->ioc, buf, 18) < 0) { LOG("write failed"); goto fail; } @@ -679,7 +676,7 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) stq_be_p(buf + 18, client->exp->size); stw_be_p(buf + 26, client->exp->nbdflags | myflags); len = client->no_zeroes ? 10 : sizeof(buf) - 18; - if (nbd_negotiate_write(client->ioc, buf + 18, len) != len) { + if (nbd_negotiate_write(client->ioc, buf + 18, len) < 0) { LOG("write failed"); goto fail; } @@ -702,11 +699,6 @@ static ssize_t nbd_receive_request(QIOChannel *ioc, NBDRequest *request) return ret; } - if (ret != sizeof(buf)) { - LOG("read failed"); - return -EINVAL; - } - /* Request [ 0 .. 3] magic (NBD_REQUEST_MAGIC) [ 4 .. 5] flags (NBD_CMD_FLAG_FUA, ...) @@ -737,7 +729,6 @@ static ssize_t nbd_receive_request(QIOChannel *ioc, NBDRequest *request) static ssize_t nbd_send_reply(QIOChannel *ioc, NBDReply *reply) { uint8_t buf[NBD_REPLY_SIZE]; - ssize_t ret; reply->error = system_errno_to_nbd_errno(reply->error); @@ -754,16 +745,7 @@ static ssize_t nbd_send_reply(QIOChannel *ioc, NBDReply *reply) stl_be_p(buf + 4, reply->error); stq_be_p(buf + 8, reply->handle); - ret = write_sync(ioc, buf, sizeof(buf)); - if (ret < 0) { - return ret; - } - - if (ret != sizeof(buf)) { - LOG("writing to socket failed"); - return -EINVAL; - } - return 0; + return write_sync(ioc, buf, sizeof(buf)); } #define MAX_NBD_REQUESTS 16 @@ -1067,7 +1049,7 @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply, rc = nbd_send_reply(client->ioc, reply); if (rc >= 0) { ret = write_sync(client->ioc, req->data, len); - if (ret != len) { + if (ret < 0) { rc = -EIO; } } @@ -1141,7 +1123,7 @@ static ssize_t nbd_co_receive_request(NBDRequestData *req, if (request->type == NBD_CMD_WRITE) { TRACE("Reading %" PRIu32 " byte(s)", request->len); - if (read_sync(client->ioc, req->data, request->len) != request->len) { + if (read_sync(client->ioc, req->data, request->len) < 0) { LOG("reading from socket failed"); rc = -EIO; goto out; -- cgit v1.2.3-55-g7522 From f2609565369429bc1619d106b200106dba29290e Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 16 May 2017 12:45:31 +0300 Subject: nbd: add errp parameter to nbd_wr_syncv() Will be used in following patch to provide actual error message in some cases. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20170516094533.6160-4-vsementsov@virtuozzo.com> Signed-off-by: Paolo Bonzini --- block/nbd-client.c | 4 ++-- include/block/nbd.h | 3 ++- nbd/common.c | 12 +++++------- nbd/nbd-internal.h | 4 ++-- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index 1e2952fdae..538d95e031 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -136,7 +136,7 @@ static int nbd_co_send_request(BlockDriverState *bs, rc = nbd_send_request(s->ioc, request); if (rc >= 0) { ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, - false); + false, NULL); if (ret != request->len) { rc = -EIO; } @@ -165,7 +165,7 @@ static void nbd_co_receive_reply(NBDClientSession *s, } else { if (qiov && reply->error == 0) { ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, - true); + true, NULL); if (ret != request->len) { reply->error = EIO; } diff --git a/include/block/nbd.h b/include/block/nbd.h index 0ed077502e..9d385ea564 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -127,7 +127,8 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, struct iovec *iov, size_t niov, size_t length, - bool do_read); + bool do_read, + Error **errp); int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, QCryptoTLSCreds *tlscreds, const char *hostname, QIOChannel **outioc, diff --git a/nbd/common.c b/nbd/common.c index 4db45b3ede..bd81637ab9 100644 --- a/nbd/common.c +++ b/nbd/common.c @@ -28,10 +28,10 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, struct iovec *iov, size_t niov, size_t length, - bool do_read) + bool do_read, + Error **errp) { ssize_t done = 0; - Error *local_err = NULL; struct iovec *local_iov = g_new(struct iovec, niov); struct iovec *local_iov_head = local_iov; unsigned int nlocal_iov = niov; @@ -41,19 +41,17 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, while (nlocal_iov > 0) { ssize_t len; if (do_read) { - len = qio_channel_readv(ioc, local_iov, nlocal_iov, &local_err); + len = qio_channel_readv(ioc, local_iov, nlocal_iov, errp); } else { - len = qio_channel_writev(ioc, local_iov, nlocal_iov, &local_err); + len = qio_channel_writev(ioc, local_iov, nlocal_iov, errp); } if (len == QIO_CHANNEL_ERR_BLOCK) { + /* errp should not be set */ assert(qemu_in_coroutine()); qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT); continue; } if (len < 0) { - TRACE("I/O error: %s", error_get_pretty(local_err)); - error_free(local_err); - /* XXX handle Error objects */ done = -EIO; goto cleanup; } diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index e6bbc7c4b4..1d479fe135 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -108,7 +108,7 @@ static inline ssize_t read_sync_eof(QIOChannel *ioc, void *buffer, size_t size) * our request/reply. Synchronization is done with recv_coroutine, so * that this is coroutine-safe. */ - return nbd_wr_syncv(ioc, &iov, 1, size, true); + return nbd_wr_syncv(ioc, &iov, 1, size, true, NULL); } /* read_sync @@ -132,7 +132,7 @@ static inline int write_sync(QIOChannel *ioc, const void *buffer, size_t size) { struct iovec iov = { .iov_base = (void *) buffer, .iov_len = size }; - ssize_t ret = nbd_wr_syncv(ioc, &iov, 1, size, false); + ssize_t ret = nbd_wr_syncv(ioc, &iov, 1, size, false, NULL); assert(ret < 0 || ret == size); -- cgit v1.2.3-55-g7522 From e44ed99d1949315755bffb12a5a483ac66d4a976 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 16 May 2017 12:45:32 +0300 Subject: nbd: add errp to read_sync, write_sync and drop_sync There a lot of calls of these functions, which already have errp, which they are filling themselves. On the other hand, nbd_wr_syncv has errp parameter too, so it would be great to connect them. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20170516094533.6160-5-vsementsov@virtuozzo.com> Signed-off-by: Paolo Bonzini --- nbd/client.c | 76 +++++++++++++++++++++++++++--------------------------- nbd/nbd-internal.h | 16 +++++++----- nbd/server.c | 12 ++++----- 3 files changed, 54 insertions(+), 50 deletions(-) diff --git a/nbd/client.c b/nbd/client.c index 6b74a628f1..f102375504 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -88,7 +88,7 @@ static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); /* Discard length bytes from channel. Return -errno on failure and 0 on * success*/ -static int drop_sync(QIOChannel *ioc, size_t size) +static int drop_sync(QIOChannel *ioc, size_t size, Error **errp) { ssize_t ret = 0; char small[1024]; @@ -97,7 +97,7 @@ static int drop_sync(QIOChannel *ioc, size_t size) buffer = sizeof(small) >= size ? small : g_malloc(MIN(65536, size)); while (size > 0) { ssize_t count = MIN(65536, size); - ret = read_sync(ioc, buffer, MIN(65536, size)); + ret = read_sync(ioc, buffer, MIN(65536, size), errp); if (ret < 0) { goto cleanup; @@ -135,13 +135,13 @@ static int nbd_send_option_request(QIOChannel *ioc, uint32_t opt, stl_be_p(&req.option, opt); stl_be_p(&req.length, len); - if (write_sync(ioc, &req, sizeof(req)) < 0) { - error_setg(errp, "Failed to send option request header"); + if (write_sync(ioc, &req, sizeof(req), errp) < 0) { + error_prepend(errp, "Failed to send option request header"); return -1; } - if (len && write_sync(ioc, (char *) data, len) < 0) { - error_setg(errp, "Failed to send option request data"); + if (len && write_sync(ioc, (char *) data, len, errp) < 0) { + error_prepend(errp, "Failed to send option request data"); return -1; } @@ -169,8 +169,8 @@ static int nbd_receive_option_reply(QIOChannel *ioc, uint32_t opt, nbd_opt_reply *reply, Error **errp) { QEMU_BUILD_BUG_ON(sizeof(*reply) != 20); - if (read_sync(ioc, reply, sizeof(*reply)) < 0) { - error_setg(errp, "failed to read option reply"); + if (read_sync(ioc, reply, sizeof(*reply), errp) < 0) { + error_prepend(errp, "failed to read option reply"); nbd_send_opt_abort(ioc); return -1; } @@ -218,8 +218,8 @@ static int nbd_handle_reply_err(QIOChannel *ioc, nbd_opt_reply *reply, goto cleanup; } msg = g_malloc(reply->length + 1); - if (read_sync(ioc, msg, reply->length) < 0) { - error_setg(errp, "failed to read option error message"); + if (read_sync(ioc, msg, reply->length, errp) < 0) { + error_prepend(errp, "failed to read option error message"); goto cleanup; } msg[reply->length] = '\0'; @@ -320,8 +320,8 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, nbd_send_opt_abort(ioc); return -1; } - if (read_sync(ioc, &namelen, sizeof(namelen)) < 0) { - error_setg(errp, "failed to read option name length"); + if (read_sync(ioc, &namelen, sizeof(namelen), errp) < 0) { + error_prepend(errp, "failed to read option name length"); nbd_send_opt_abort(ioc); return -1; } @@ -333,8 +333,8 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, return -1; } if (namelen != strlen(want)) { - if (drop_sync(ioc, len) < 0) { - error_setg(errp, "failed to skip export name with wrong length"); + if (drop_sync(ioc, len, errp) < 0) { + error_prepend(errp, "failed to skip export name with wrong length"); nbd_send_opt_abort(ioc); return -1; } @@ -342,15 +342,15 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, } assert(namelen < sizeof(name)); - if (read_sync(ioc, name, namelen) < 0) { - error_setg(errp, "failed to read export name"); + if (read_sync(ioc, name, namelen, errp) < 0) { + error_prepend(errp, "failed to read export name"); nbd_send_opt_abort(ioc); return -1; } name[namelen] = '\0'; len -= namelen; - if (drop_sync(ioc, len) < 0) { - error_setg(errp, "failed to read export description"); + if (drop_sync(ioc, len, errp) < 0) { + error_prepend(errp, "failed to read export description"); nbd_send_opt_abort(ioc); return -1; } @@ -476,8 +476,8 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, buf, 8) < 0) { - error_setg(errp, "Failed to read data"); + if (read_sync(ioc, buf, 8, errp) < 0) { + error_prepend(errp, "Failed to read data"); goto fail; } @@ -502,8 +502,8 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, &magic, sizeof(magic)) < 0) { - error_setg(errp, "Failed to read magic"); + if (read_sync(ioc, &magic, sizeof(magic), errp) < 0) { + error_prepend(errp, "Failed to read magic"); goto fail; } magic = be64_to_cpu(magic); @@ -514,8 +514,8 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, uint16_t globalflags; bool fixedNewStyle = false; - if (read_sync(ioc, &globalflags, sizeof(globalflags)) < 0) { - error_setg(errp, "Failed to read server flags"); + if (read_sync(ioc, &globalflags, sizeof(globalflags), errp) < 0) { + error_prepend(errp, "Failed to read server flags"); goto fail; } globalflags = be16_to_cpu(globalflags); @@ -532,8 +532,8 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } /* client requested flags */ clientflags = cpu_to_be32(clientflags); - if (write_sync(ioc, &clientflags, sizeof(clientflags)) < 0) { - error_setg(errp, "Failed to send clientflags field"); + if (write_sync(ioc, &clientflags, sizeof(clientflags), errp) < 0) { + error_prepend(errp, "Failed to send clientflags field"); goto fail; } if (tlscreds) { @@ -570,14 +570,14 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } /* Read the response */ - if (read_sync(ioc, &s, sizeof(s)) < 0) { - error_setg(errp, "Failed to read export length"); + if (read_sync(ioc, &s, sizeof(s), errp) < 0) { + error_prepend(errp, "Failed to read export length"); goto fail; } *size = be64_to_cpu(s); - if (read_sync(ioc, flags, sizeof(*flags)) < 0) { - error_setg(errp, "Failed to read export flags"); + if (read_sync(ioc, flags, sizeof(*flags), errp) < 0) { + error_prepend(errp, "Failed to read export flags"); goto fail; } be16_to_cpus(flags); @@ -593,15 +593,15 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, &s, sizeof(s)) < 0) { - error_setg(errp, "Failed to read export length"); + if (read_sync(ioc, &s, sizeof(s), errp) < 0) { + error_prepend(errp, "Failed to read export length"); goto fail; } *size = be64_to_cpu(s); TRACE("Size is %" PRIu64, *size); - if (read_sync(ioc, &oldflags, sizeof(oldflags)) < 0) { - error_setg(errp, "Failed to read export flags"); + if (read_sync(ioc, &oldflags, sizeof(oldflags), errp) < 0) { + error_prepend(errp, "Failed to read export flags"); goto fail; } be32_to_cpus(&oldflags); @@ -616,8 +616,8 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } TRACE("Size is %" PRIu64 ", export flags %" PRIx16, *size, *flags); - if (zeroes && drop_sync(ioc, 124) < 0) { - error_setg(errp, "Failed to read reserved block"); + if (zeroes && drop_sync(ioc, 124, errp) < 0) { + error_prepend(errp, "Failed to read reserved block"); goto fail; } rc = 0; @@ -755,7 +755,7 @@ ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request) stq_be_p(buf + 16, request->from); stl_be_p(buf + 24, request->len); - return write_sync(ioc, buf, sizeof(buf)); + return write_sync(ioc, buf, sizeof(buf), NULL); } ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) @@ -764,7 +764,7 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) uint32_t magic; ssize_t ret; - ret = read_sync_eof(ioc, buf, sizeof(buf)); + ret = read_sync_eof(ioc, buf, sizeof(buf), NULL); if (ret <= 0) { return ret; } diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index 1d479fe135..d6071640a0 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -100,7 +100,8 @@ * qio_channel_readv() returns 0. So, there are no needs to call read_sync_eof * iteratively. */ -static inline ssize_t read_sync_eof(QIOChannel *ioc, void *buffer, size_t size) +static inline ssize_t read_sync_eof(QIOChannel *ioc, void *buffer, size_t size, + Error **errp) { struct iovec iov = { .iov_base = buffer, .iov_len = size }; /* Sockets are kept in blocking mode in the negotiation phase. After @@ -108,18 +109,20 @@ static inline ssize_t read_sync_eof(QIOChannel *ioc, void *buffer, size_t size) * our request/reply. Synchronization is done with recv_coroutine, so * that this is coroutine-safe. */ - return nbd_wr_syncv(ioc, &iov, 1, size, true, NULL); + return nbd_wr_syncv(ioc, &iov, 1, size, true, errp); } /* read_sync * Reads @size bytes from @ioc. Returns 0 on success. */ -static inline int read_sync(QIOChannel *ioc, void *buffer, size_t size) +static inline int read_sync(QIOChannel *ioc, void *buffer, size_t size, + Error **errp) { - ssize_t ret = read_sync_eof(ioc, buffer, size); + ssize_t ret = read_sync_eof(ioc, buffer, size, errp); if (ret >= 0 && ret != size) { ret = -EINVAL; + error_setg(errp, "End of file"); } return ret < 0 ? ret : 0; @@ -128,11 +131,12 @@ static inline int read_sync(QIOChannel *ioc, void *buffer, size_t size) /* write_sync * Writes @size bytes to @ioc. Returns 0 on success. */ -static inline int write_sync(QIOChannel *ioc, const void *buffer, size_t size) +static inline int write_sync(QIOChannel *ioc, const void *buffer, size_t size, + Error **errp) { struct iovec iov = { .iov_base = (void *) buffer, .iov_len = size }; - ssize_t ret = nbd_wr_syncv(ioc, &iov, 1, size, false, NULL); + ssize_t ret = nbd_wr_syncv(ioc, &iov, 1, size, false, errp); assert(ret < 0 || ret == size); diff --git a/nbd/server.c b/nbd/server.c index 1e1096c762..ee59e5d234 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -124,7 +124,7 @@ static int nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size) nbd_negotiate_continue, qemu_coroutine_self(), NULL); - ret = read_sync(ioc, buffer, size); + ret = read_sync(ioc, buffer, size, NULL); g_source_remove(watch); return ret; @@ -142,7 +142,7 @@ static int nbd_negotiate_write(QIOChannel *ioc, const void *buffer, size_t size) nbd_negotiate_continue, qemu_coroutine_self(), NULL); - ret = write_sync(ioc, buffer, size); + ret = write_sync(ioc, buffer, size, NULL); g_source_remove(watch); return ret; } @@ -694,7 +694,7 @@ static ssize_t nbd_receive_request(QIOChannel *ioc, NBDRequest *request) uint32_t magic; ssize_t ret; - ret = read_sync(ioc, buf, sizeof(buf)); + ret = read_sync(ioc, buf, sizeof(buf), NULL); if (ret < 0) { return ret; } @@ -745,7 +745,7 @@ static ssize_t nbd_send_reply(QIOChannel *ioc, NBDReply *reply) stl_be_p(buf + 4, reply->error); stq_be_p(buf + 8, reply->handle); - return write_sync(ioc, buf, sizeof(buf)); + return write_sync(ioc, buf, sizeof(buf), NULL); } #define MAX_NBD_REQUESTS 16 @@ -1048,7 +1048,7 @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply, qio_channel_set_cork(client->ioc, true); rc = nbd_send_reply(client->ioc, reply); if (rc >= 0) { - ret = write_sync(client->ioc, req->data, len); + ret = write_sync(client->ioc, req->data, len, NULL); if (ret < 0) { rc = -EIO; } @@ -1123,7 +1123,7 @@ static ssize_t nbd_co_receive_request(NBDRequestData *req, if (request->type == NBD_CMD_WRITE) { TRACE("Reading %" PRIu32 " byte(s)", request->len); - if (read_sync(client->ioc, req->data, request->len) < 0) { + if (read_sync(client->ioc, req->data, request->len, NULL) < 0) { LOG("reading from socket failed"); rc = -EIO; goto out; -- cgit v1.2.3-55-g7522 From be41c100c0d67f6072ddd4910c4b6f7d239f025f Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 26 May 2017 14:09:13 +0300 Subject: nbd/client.c: use errp instead of LOG Move to modern errp scheme from just LOGging errors. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20170526110913.89098-1-vsementsov@virtuozzo.com> Signed-off-by: Paolo Bonzini --- block/nbd-client.c | 7 ++++++- include/block/nbd.h | 5 +++-- nbd/client.c | 30 +++++++++++++++++------------- qemu-nbd.c | 3 ++- tests/qemu-iotests/083.out | 2 ++ 5 files changed, 30 insertions(+), 17 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index 538d95e031..09d955bc4d 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -28,6 +28,7 @@ */ #include "qemu/osdep.h" +#include "qapi/error.h" #include "nbd-client.h" #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) @@ -70,10 +71,14 @@ static coroutine_fn void nbd_read_reply_entry(void *opaque) NBDClientSession *s = opaque; uint64_t i; int ret; + Error *local_err = NULL; for (;;) { assert(s->reply.handle == 0); - ret = nbd_receive_reply(s->ioc, &s->reply); + ret = nbd_receive_reply(s->ioc, &s->reply, &local_err); + if (ret < 0) { + error_report_err(local_err); + } if (ret <= 0) { break; } diff --git a/include/block/nbd.h b/include/block/nbd.h index 9d385ea564..416257abca 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -133,9 +133,10 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, QCryptoTLSCreds *tlscreds, const char *hostname, QIOChannel **outioc, off_t *size, Error **errp); -int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size); +int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size, + Error **errp); ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request); -ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply); +ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply, Error **errp); int nbd_client(int fd); int nbd_disconnect(int fd); diff --git a/nbd/client.c b/nbd/client.c index f102375504..595d99ed30 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -627,11 +627,13 @@ fail: } #ifdef __linux__ -int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size) +int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size, + Error **errp) { unsigned long sectors = size / BDRV_SECTOR_SIZE; if (size / BDRV_SECTOR_SIZE != sectors) { - LOG("Export size %lld too large for 32-bit kernel", (long long) size); + error_setg(errp, "Export size %lld too large for 32-bit kernel", + (long long) size); return -E2BIG; } @@ -639,7 +641,7 @@ int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size) if (ioctl(fd, NBD_SET_SOCK, (unsigned long) sioc->fd) < 0) { int serrno = errno; - LOG("Failed to set NBD socket"); + error_setg(errp, "Failed to set NBD socket"); return -serrno; } @@ -647,7 +649,7 @@ int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size) if (ioctl(fd, NBD_SET_BLKSIZE, (unsigned long)BDRV_SECTOR_SIZE) < 0) { int serrno = errno; - LOG("Failed setting NBD block size"); + error_setg(errp, "Failed setting NBD block size"); return -serrno; } @@ -659,7 +661,7 @@ int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size) if (ioctl(fd, NBD_SET_SIZE_BLOCKS, sectors) < 0) { int serrno = errno; - LOG("Failed setting size (in blocks)"); + error_setg(errp, "Failed setting size (in blocks)"); return -serrno; } @@ -670,12 +672,12 @@ int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size) if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) { int serrno = errno; - LOG("Failed setting read-only attribute"); + error_setg(errp, "Failed setting read-only attribute"); return -serrno; } } else { int serrno = errno; - LOG("Failed setting flags"); + error_setg(errp, "Failed setting flags"); return -serrno; } } @@ -723,8 +725,10 @@ int nbd_disconnect(int fd) } #else -int nbd_init(int fd, QIOChannelSocket *ioc, uint16_t flags, off_t size) +int nbd_init(int fd, QIOChannelSocket *ioc, uint16_t flags, off_t size, + Error **errp) { + error_setg(errp, "nbd_init is only supported on Linux"); return -ENOTSUP; } @@ -758,19 +762,19 @@ ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request) return write_sync(ioc, buf, sizeof(buf), NULL); } -ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) +ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply, Error **errp) { uint8_t buf[NBD_REPLY_SIZE]; uint32_t magic; ssize_t ret; - ret = read_sync_eof(ioc, buf, sizeof(buf), NULL); + ret = read_sync_eof(ioc, buf, sizeof(buf), errp); if (ret <= 0) { return ret; } if (ret != sizeof(buf)) { - LOG("read failed"); + error_setg(errp, "read failed"); return -EINVAL; } @@ -788,7 +792,7 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) if (reply->error == ESHUTDOWN) { /* This works even on mingw which lacks a native ESHUTDOWN */ - LOG("server shutting down"); + error_setg(errp, "server shutting down"); return -EINVAL; } TRACE("Got reply: { magic = 0x%" PRIx32 ", .error = % " PRId32 @@ -796,7 +800,7 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) magic, reply->error, reply->handle); if (magic != NBD_REPLY_MAGIC) { - LOG("invalid magic (got 0x%" PRIx32 ")", magic); + error_setg(errp, "invalid magic (got 0x%" PRIx32 ")", magic); return -EINVAL; } return sizeof(buf); diff --git a/qemu-nbd.c b/qemu-nbd.c index b7ab86bfa7..f60842fd86 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -288,8 +288,9 @@ static void *nbd_client_thread(void *arg) goto out_socket; } - ret = nbd_init(fd, sioc, nbdflags, size); + ret = nbd_init(fd, sioc, nbdflags, size, &local_error); if (ret < 0) { + error_report_err(local_error); goto out_fd; } diff --git a/tests/qemu-iotests/083.out b/tests/qemu-iotests/083.out index 0c13888ba1..a24c6bfece 100644 --- a/tests/qemu-iotests/083.out +++ b/tests/qemu-iotests/083.out @@ -69,10 +69,12 @@ read failed: Input/output error === Check disconnect 4 reply === +read failed read failed: Input/output error === Check disconnect 8 reply === +read failed read failed: Input/output error === Check disconnect before data === -- cgit v1.2.3-55-g7522 From 003a0cf2cd1828a1141a874428571267b117f765 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 15 May 2017 16:50:57 +0800 Subject: exec: simplify phys_page_find() params It really only plays with the dispatchers, so the parameter list does not need that complexity. This helps for readability at least. Signed-off-by: Peter Xu Message-Id: <1494838260-30439-2-git-send-email-peterx@redhat.com> Reviewed-by: David Gibson Signed-off-by: Paolo Bonzini --- exec.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/exec.c b/exec.c index b1db12fe36..a93e209625 100644 --- a/exec.c +++ b/exec.c @@ -374,10 +374,11 @@ static inline bool section_covers_addr(const MemoryRegionSection *section, int128_getlo(section->size), addr); } -static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr, - Node *nodes, MemoryRegionSection *sections) +static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr) { - PhysPageEntry *p; + PhysPageEntry lp = d->phys_map, *p; + Node *nodes = d->map.nodes; + MemoryRegionSection *sections = d->map.sections; hwaddr index = addr >> TARGET_PAGE_BITS; int i; @@ -415,8 +416,7 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d, section_covers_addr(section, addr)) { update = false; } else { - section = phys_page_find(d->phys_map, addr, d->map.nodes, - d->map.sections); + section = phys_page_find(d, addr); update = true; } if (resolve_subpage && section->mr->subpage) { @@ -1285,8 +1285,7 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *secti subpage_t *subpage; hwaddr base = section->offset_within_address_space & TARGET_PAGE_MASK; - MemoryRegionSection *existing = phys_page_find(d->phys_map, base, - d->map.nodes, d->map.sections); + MemoryRegionSection *existing = phys_page_find(d, base); MemoryRegionSection subsection = { .offset_within_address_space = base, .size = int128_make64(TARGET_PAGE_SIZE), -- cgit v1.2.3-55-g7522 From 2cbe2de5454cf9af44b620b2b40d56361a12a45f Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Thu, 18 May 2017 18:28:08 +0800 Subject: virtio-scsi: Unset hotplug handler when unrealize This matches the qbus_set_hotplug_handler in realize, and it releases the final reference to the embedded VirtIODevice so that it is properly finalized. A use-after-free is fixed with this patch, indirectly: virtio_device_instance_finalize wasn't called at hot-unplug, and the vdev->listener would be a dangling pointer in the global and the per address space listener list. See also RHBZ 1449031. Cc: qemu-stable@nongnu.org Signed-off-by: Fam Zheng Message-Id: <20170518102808.30046-1-famz@redhat.com> Signed-off-by: Paolo Bonzini --- hw/scsi/virtio-scsi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 46a3e3f280..f46f06d055 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -918,6 +918,9 @@ void virtio_scsi_common_unrealize(DeviceState *dev, Error **errp) static void virtio_scsi_device_unrealize(DeviceState *dev, Error **errp) { + VirtIOSCSI *s = VIRTIO_SCSI(dev); + + qbus_set_hotplug_handler(BUS(&s->bus), NULL, &error_abort); virtio_scsi_common_unrealize(dev, errp); } -- cgit v1.2.3-55-g7522 From c8bc83a4dd29a9a33f5be81686bfe6e2e628097b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 May 2017 13:35:28 +0200 Subject: target/i386: enable A20 automatically in system management mode Ignore env->a20_mask when running in system management mode. Reported-by: Anthony Xu Signed-off-by: Paolo Bonzini Message-Id: <1494502528-12670-1-git-send-email-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- target/i386/arch_memory_mapping.c | 18 +++++++++-------- target/i386/cpu.h | 9 +++++++++ target/i386/helper.c | 42 +++++++++++++++++++++------------------ 3 files changed, 42 insertions(+), 27 deletions(-) diff --git a/target/i386/arch_memory_mapping.c b/target/i386/arch_memory_mapping.c index 826aee597b..647cff2829 100644 --- a/target/i386/arch_memory_mapping.c +++ b/target/i386/arch_memory_mapping.c @@ -272,25 +272,27 @@ void x86_cpu_get_memory_mapping(CPUState *cs, MemoryMappingList *list, { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + int32_t a20_mask; if (!cpu_paging_enabled(cs)) { /* paging is disabled */ return; } + a20_mask = x86_get_a20_mask(env); if (env->cr[4] & CR4_PAE_MASK) { #ifdef TARGET_X86_64 if (env->hflags & HF_LMA_MASK) { if (env->cr[4] & CR4_LA57_MASK) { hwaddr pml5e_addr; - pml5e_addr = (env->cr[3] & PLM4_ADDR_MASK) & env->a20_mask; - walk_pml5e(list, cs->as, pml5e_addr, env->a20_mask); + pml5e_addr = (env->cr[3] & PLM4_ADDR_MASK) & a20_mask; + walk_pml5e(list, cs->as, pml5e_addr, a20_mask); } else { hwaddr pml4e_addr; - pml4e_addr = (env->cr[3] & PLM4_ADDR_MASK) & env->a20_mask; - walk_pml4e(list, cs->as, pml4e_addr, env->a20_mask, + pml4e_addr = (env->cr[3] & PLM4_ADDR_MASK) & a20_mask; + walk_pml4e(list, cs->as, pml4e_addr, a20_mask, 0xffffULL << 48); } } else @@ -298,16 +300,16 @@ void x86_cpu_get_memory_mapping(CPUState *cs, MemoryMappingList *list, { hwaddr pdpe_addr; - pdpe_addr = (env->cr[3] & ~0x1f) & env->a20_mask; - walk_pdpe2(list, cs->as, pdpe_addr, env->a20_mask); + pdpe_addr = (env->cr[3] & ~0x1f) & a20_mask; + walk_pdpe2(list, cs->as, pdpe_addr, a20_mask); } } else { hwaddr pde_addr; bool pse; - pde_addr = (env->cr[3] & ~0xfff) & env->a20_mask; + pde_addr = (env->cr[3] & ~0xfff) & a20_mask; pse = !!(env->cr[4] & CR4_PSE_MASK); - walk_pde2(list, cs->as, pde_addr, env->a20_mask, pse); + walk_pde2(list, cs->as, pde_addr, a20_mask, pse); } } diff --git a/target/i386/cpu.h b/target/i386/cpu.h index cfe825f0a4..0facb354b5 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1625,6 +1625,15 @@ static inline MemTxAttrs cpu_get_mem_attrs(CPUX86State *env) return ((MemTxAttrs) { .secure = (env->hflags & HF_SMM_MASK) != 0 }); } +static inline int32_t x86_get_a20_mask(CPUX86State *env) +{ + if (env->hflags & HF_SMM_MASK) { + return -1; + } else { + return env->a20_mask; + } +} + /* fpu_helper.c */ void cpu_set_mxcsr(CPUX86State *env, uint32_t val); void cpu_set_fpuc(CPUX86State *env, uint16_t val); diff --git a/target/i386/helper.c b/target/i386/helper.c index ee7eff2f6f..3850c56701 100644 --- a/target/i386/helper.c +++ b/target/i386/helper.c @@ -724,6 +724,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; uint64_t ptep, pte; + int32_t a20_mask; target_ulong pde_addr, pte_addr; int error_code = 0; int is_dirty, prot, page_size, is_write, is_user; @@ -739,6 +740,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, #endif is_write = is_write1 & 1; + a20_mask = x86_get_a20_mask(env); if (!(env->cr[0] & CR0_PG_MASK)) { pte = addr; #ifdef TARGET_X86_64 @@ -777,7 +779,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, if (la57) { pml5e_addr = ((env->cr[3] & ~0xfff) + - (((addr >> 48) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 48) & 0x1ff) << 3)) & a20_mask; pml5e = x86_ldq_phys(cs, pml5e_addr); if (!(pml5e & PG_PRESENT_MASK)) { goto do_fault; @@ -796,7 +798,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, } pml4e_addr = ((pml5e & PG_ADDRESS_MASK) + - (((addr >> 39) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 39) & 0x1ff) << 3)) & a20_mask; pml4e = x86_ldq_phys(cs, pml4e_addr); if (!(pml4e & PG_PRESENT_MASK)) { goto do_fault; @@ -810,7 +812,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, } ptep &= pml4e ^ PG_NX_MASK; pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) & - env->a20_mask; + a20_mask; pdpe = x86_ldq_phys(cs, pdpe_addr); if (!(pdpe & PG_PRESENT_MASK)) { goto do_fault; @@ -835,7 +837,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, { /* XXX: load them when cr3 is loaded ? */ pdpe_addr = ((env->cr[3] & ~0x1f) + ((addr >> 27) & 0x18)) & - env->a20_mask; + a20_mask; pdpe = x86_ldq_phys(cs, pdpe_addr); if (!(pdpe & PG_PRESENT_MASK)) { goto do_fault; @@ -848,7 +850,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, } pde_addr = ((pdpe & PG_ADDRESS_MASK) + (((addr >> 21) & 0x1ff) << 3)) & - env->a20_mask; + a20_mask; pde = x86_ldq_phys(cs, pde_addr); if (!(pde & PG_PRESENT_MASK)) { goto do_fault; @@ -870,7 +872,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, x86_stl_phys_notdirty(cs, pde_addr, pde); } pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) & - env->a20_mask; + a20_mask; pte = x86_ldq_phys(cs, pte_addr); if (!(pte & PG_PRESENT_MASK)) { goto do_fault; @@ -886,7 +888,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, /* page directory entry */ pde_addr = ((env->cr[3] & ~0xfff) + ((addr >> 20) & 0xffc)) & - env->a20_mask; + a20_mask; pde = x86_ldl_phys(cs, pde_addr); if (!(pde & PG_PRESENT_MASK)) { goto do_fault; @@ -913,7 +915,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, /* page directory entry */ pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) & - env->a20_mask; + a20_mask; pte = x86_ldl_phys(cs, pte_addr); if (!(pte & PG_PRESENT_MASK)) { goto do_fault; @@ -992,7 +994,7 @@ do_check_protect_pse36: } do_mapping: - pte = pte & env->a20_mask; + pte = pte & a20_mask; /* align to page_size */ pte &= PG_ADDRESS_MASK & ~(page_size - 1); @@ -1039,11 +1041,13 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) CPUX86State *env = &cpu->env; target_ulong pde_addr, pte_addr; uint64_t pte; + int32_t a20_mask; uint32_t page_offset; int page_size; + a20_mask = x86_get_a20_mask(env); if (!(env->cr[0] & CR0_PG_MASK)) { - pte = addr & env->a20_mask; + pte = addr & a20_mask; page_size = 4096; } else if (env->cr[4] & CR4_PAE_MASK) { target_ulong pdpe_addr; @@ -1064,7 +1068,7 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) if (la57) { pml5e_addr = ((env->cr[3] & ~0xfff) + - (((addr >> 48) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 48) & 0x1ff) << 3)) & a20_mask; pml5e = x86_ldq_phys(cs, pml5e_addr); if (!(pml5e & PG_PRESENT_MASK)) { return -1; @@ -1074,13 +1078,13 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) } pml4e_addr = ((pml5e & PG_ADDRESS_MASK) + - (((addr >> 39) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 39) & 0x1ff) << 3)) & a20_mask; pml4e = x86_ldq_phys(cs, pml4e_addr); if (!(pml4e & PG_PRESENT_MASK)) { return -1; } pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + - (((addr >> 30) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 30) & 0x1ff) << 3)) & a20_mask; pdpe = x86_ldq_phys(cs, pdpe_addr); if (!(pdpe & PG_PRESENT_MASK)) { return -1; @@ -1095,14 +1099,14 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) #endif { pdpe_addr = ((env->cr[3] & ~0x1f) + ((addr >> 27) & 0x18)) & - env->a20_mask; + a20_mask; pdpe = x86_ldq_phys(cs, pdpe_addr); if (!(pdpe & PG_PRESENT_MASK)) return -1; } pde_addr = ((pdpe & PG_ADDRESS_MASK) + - (((addr >> 21) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 21) & 0x1ff) << 3)) & a20_mask; pde = x86_ldq_phys(cs, pde_addr); if (!(pde & PG_PRESENT_MASK)) { return -1; @@ -1114,7 +1118,7 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) } else { /* 4 KB page */ pte_addr = ((pde & PG_ADDRESS_MASK) + - (((addr >> 12) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 12) & 0x1ff) << 3)) & a20_mask; page_size = 4096; pte = x86_ldq_phys(cs, pte_addr); } @@ -1125,7 +1129,7 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) uint32_t pde; /* page directory entry */ - pde_addr = ((env->cr[3] & ~0xfff) + ((addr >> 20) & 0xffc)) & env->a20_mask; + pde_addr = ((env->cr[3] & ~0xfff) + ((addr >> 20) & 0xffc)) & a20_mask; pde = x86_ldl_phys(cs, pde_addr); if (!(pde & PG_PRESENT_MASK)) return -1; @@ -1134,14 +1138,14 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) page_size = 4096 * 1024; } else { /* page directory entry */ - pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) & env->a20_mask; + pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) & a20_mask; pte = x86_ldl_phys(cs, pte_addr); if (!(pte & PG_PRESENT_MASK)) { return -1; } page_size = 4096; } - pte = pte & env->a20_mask; + pte = pte & a20_mask; } #ifdef TARGET_X86_64 -- cgit v1.2.3-55-g7522 From f8c45c6550b9ff1e1f0b92709ff3213a79870879 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 1 Mar 2017 10:34:48 +0100 Subject: target/i386: use multiple CPU AddressSpaces This speeds up SMM switches. Later on it may remove the need to take the BQL, and it may also allow to reuse code between TCG and KVM. Signed-off-by: Paolo Bonzini --- target/i386/cpu.c | 15 +++++++++----- target/i386/cpu.h | 11 +++++++++- target/i386/helper.c | 54 ++++++++++++++++++++++++------------------------ target/i386/machine.c | 4 ---- target/i386/smm_helper.c | 18 ---------------- 5 files changed, 47 insertions(+), 55 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index ffb5267162..3f832a6a94 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3239,7 +3239,7 @@ static void x86_cpu_machine_done(Notifier *n, void *unused) cpu->smram = g_new(MemoryRegion, 1); memory_region_init_alias(cpu->smram, OBJECT(cpu), "smram", smram, 0, 1ull << 32); - memory_region_set_enabled(cpu->smram, false); + memory_region_set_enabled(cpu->smram, true); memory_region_add_subregion_overlap(cpu->cpu_as_root, 0, cpu->smram, 1); } } @@ -3619,7 +3619,9 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) #ifndef CONFIG_USER_ONLY if (tcg_enabled()) { - AddressSpace *newas = g_new(AddressSpace, 1); + AddressSpace *as_normal = address_space_init_shareable(cs->memory, + "cpu-memory"); + AddressSpace *as_smm = g_new(AddressSpace, 1); cpu->cpu_as_mem = g_new(MemoryRegion, 1); cpu->cpu_as_root = g_new(MemoryRegion, 1); @@ -3635,9 +3637,11 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) get_system_memory(), 0, ~0ull); memory_region_add_subregion_overlap(cpu->cpu_as_root, 0, cpu->cpu_as_mem, 0); memory_region_set_enabled(cpu->cpu_as_mem, true); - address_space_init(newas, cpu->cpu_as_root, "CPU"); - cs->num_ases = 1; - cpu_address_space_init(cs, newas, 0); + address_space_init(as_smm, cpu->cpu_as_root, "CPU"); + + cs->num_ases = 2; + cpu_address_space_init(cs, as_normal, 0); + cpu_address_space_init(cs, as_smm, 1); /* ... SMRAM with higher priority, linked from /machine/smram. */ cpu->machine_done.notify = x86_cpu_machine_done; @@ -4053,6 +4057,7 @@ static void x86_cpu_common_class_init(ObjectClass *oc, void *data) #ifdef CONFIG_USER_ONLY cc->handle_mmu_fault = x86_cpu_handle_mmu_fault; #else + cc->asidx_from_attrs = x86_asidx_from_attrs; cc->get_memory_mapping = x86_cpu_get_memory_mapping; cc->get_phys_page_debug = x86_cpu_get_phys_page_debug; cc->write_elf64_note = x86_cpu_write_elf64_note; diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 0facb354b5..de0551f775 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1451,6 +1451,16 @@ int x86_cpu_handle_mmu_fault(CPUState *cpu, vaddr addr, void x86_cpu_set_a20(X86CPU *cpu, int a20_state); #ifndef CONFIG_USER_ONLY +static inline int x86_asidx_from_attrs(CPUState *cs, MemTxAttrs attrs) +{ + return !!attrs.secure; +} + +static inline AddressSpace *cpu_addressspace(CPUState *cs, MemTxAttrs attrs) +{ + return cpu_get_address_space(cs, cpu_asidx_from_attrs(cs, attrs)); +} + uint8_t x86_ldub_phys(CPUState *cs, hwaddr addr); uint32_t x86_lduw_phys(CPUState *cs, hwaddr addr); uint32_t x86_ldl_phys(CPUState *cs, hwaddr addr); @@ -1653,7 +1663,6 @@ void do_interrupt_x86_hardirq(CPUX86State *env, int intno, int is_hw); /* smm_helper.c */ void do_smm_enter(X86CPU *cpu); -void cpu_smm_update(X86CPU *cpu); /* apic.c */ void cpu_report_tpr_access(CPUX86State *env, TPRAccess access); diff --git a/target/i386/helper.c b/target/i386/helper.c index 3850c56701..ef0505949a 100644 --- a/target/i386/helper.c +++ b/target/i386/helper.c @@ -1403,89 +1403,89 @@ uint8_t x86_ldub_phys(CPUState *cs, hwaddr addr) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - return address_space_ldub(cs->as, addr, - cpu_get_mem_attrs(env), - NULL); + return address_space_ldub(as, addr, attrs, NULL); } uint32_t x86_lduw_phys(CPUState *cs, hwaddr addr) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - return address_space_lduw(cs->as, addr, - cpu_get_mem_attrs(env), - NULL); + return address_space_lduw(as, addr, attrs, NULL); } uint32_t x86_ldl_phys(CPUState *cs, hwaddr addr) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - return address_space_ldl(cs->as, addr, - cpu_get_mem_attrs(env), - NULL); + return address_space_ldl(as, addr, attrs, NULL); } uint64_t x86_ldq_phys(CPUState *cs, hwaddr addr) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - return address_space_ldq(cs->as, addr, - cpu_get_mem_attrs(env), - NULL); + return address_space_ldq(as, addr, attrs, NULL); } void x86_stb_phys(CPUState *cs, hwaddr addr, uint8_t val) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - address_space_stb(cs->as, addr, val, - cpu_get_mem_attrs(env), - NULL); + address_space_stb(as, addr, val, attrs, NULL); } void x86_stl_phys_notdirty(CPUState *cs, hwaddr addr, uint32_t val) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - address_space_stl_notdirty(cs->as, addr, val, - cpu_get_mem_attrs(env), - NULL); + address_space_stl_notdirty(as, addr, val, attrs, NULL); } void x86_stw_phys(CPUState *cs, hwaddr addr, uint32_t val) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - address_space_stw(cs->as, addr, val, - cpu_get_mem_attrs(env), - NULL); + address_space_stw(as, addr, val, attrs, NULL); } void x86_stl_phys(CPUState *cs, hwaddr addr, uint32_t val) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - address_space_stl(cs->as, addr, val, - cpu_get_mem_attrs(env), - NULL); + address_space_stl(as, addr, val, attrs, NULL); } void x86_stq_phys(CPUState *cs, hwaddr addr, uint64_t val) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - address_space_stq(cs->as, addr, val, - cpu_get_mem_attrs(env), - NULL); + address_space_stq(as, addr, val, attrs, NULL); } #endif diff --git a/target/i386/machine.c b/target/i386/machine.c index 3cb272948e..8c7a822e9f 100644 --- a/target/i386/machine.c +++ b/target/i386/machine.c @@ -274,10 +274,6 @@ static int cpu_post_load(void *opaque, int version_id) cpu_x86_update_dr7(env, dr7); } tlb_flush(cs); - - if (tcg_enabled()) { - cpu_smm_update(cpu); - } return 0; } diff --git a/target/i386/smm_helper.c b/target/i386/smm_helper.c index f051a77c4a..90621e5977 100644 --- a/target/i386/smm_helper.c +++ b/target/i386/smm_helper.c @@ -43,19 +43,6 @@ void helper_rsm(CPUX86State *env) #define SMM_REVISION_ID 0x00020000 #endif -/* Called with iothread lock taken */ -void cpu_smm_update(X86CPU *cpu) -{ - CPUX86State *env = &cpu->env; - bool smm_enabled = (env->hflags & HF_SMM_MASK); - - g_assert(qemu_mutex_iothread_locked()); - - if (cpu->smram) { - memory_region_set_enabled(cpu->smram, smm_enabled); - } -} - void do_smm_enter(X86CPU *cpu) { CPUX86State *env = &cpu->env; @@ -73,7 +60,6 @@ void do_smm_enter(X86CPU *cpu) } else { env->hflags2 |= HF2_NMI_MASK; } - cpu_smm_update(cpu); sm_state = env->smbase + 0x8000; @@ -338,10 +324,6 @@ void helper_rsm(CPUX86State *env) env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK; env->hflags &= ~HF_SMM_MASK; - qemu_mutex_lock_iothread(); - cpu_smm_update(cpu); - qemu_mutex_unlock_iothread(); - qemu_log_mask(CPU_LOG_INT, "SMM: after RSM\n"); log_cpu_state_mask(CPU_LOG_INT, CPU(cpu), CPU_DUMP_CCOP); } -- cgit v1.2.3-55-g7522 From 5b003a40bb1ab14d0398e91f03393d3c6b9577cd Mon Sep 17 00:00:00 2001 From: Mihail Abakumov Date: Fri, 19 May 2017 12:36:15 +0300 Subject: i386: fix read/write cr with icount option Running Windows with icount causes a crash in instruction of write cr. This patch fixes it. Reading and writing cr cause an icount read because there are called cpu_get_apic_tpr and cpu_set_apic_tpr functions. So, there is need gen_io_start()/gen_io_end() calls. Signed-off-by: Mihail Abakumov Message-Id: Signed-off-by: Paolo Bonzini --- target/i386/translate.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/target/i386/translate.c b/target/i386/translate.c index 674ec96d5a..ed3b896db4 100644 --- a/target/i386/translate.c +++ b/target/i386/translate.c @@ -7939,14 +7939,26 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_update_cc_op(s); gen_jmp_im(pc_start - s->cs_base); if (b & 2) { + if (s->tb->cflags & CF_USE_ICOUNT) { + gen_io_start(); + } gen_op_mov_v_reg(ot, cpu_T0, rm); gen_helper_write_crN(cpu_env, tcg_const_i32(reg), cpu_T0); + if (s->tb->cflags & CF_USE_ICOUNT) { + gen_io_end(); + } gen_jmp_im(s->pc - s->cs_base); gen_eob(s); } else { + if (s->tb->cflags & CF_USE_ICOUNT) { + gen_io_start(); + } gen_helper_read_crN(cpu_T0, cpu_env, tcg_const_i32(reg)); gen_op_mov_reg_v(ot, rm, cpu_T0); + if (s->tb->cflags & CF_USE_ICOUNT) { + gen_io_end(); + } } break; default: -- cgit v1.2.3-55-g7522 From ad9579aaa16d5b385922d49edac2c96c79bcfb62 Mon Sep 17 00:00:00 2001 From: Daniel P. Berrange Date: Thu, 25 May 2017 16:53:00 +0100 Subject: sockets: improve error reporting if UNIX socket path is too long The 'struct sockaddr_un' only allows 108 bytes for the socket path. If the user supplies a path, QEMU uses snprintf() to silently truncate it when too long. This is undesirable because the user will then be unable to connect to the path they asked for. If the user doesn't supply a path, QEMU builds one based on TMPDIR, but if that leads to an overlong path, it mistakenly uses error_setg_errno() with a stale errno value, because snprintf() does not set errno on truncation. In solving this the code needed some refactoring to ensure we don't pass 'un.sun_path' directly to any APIs which expect NUL-terminated strings, because the path is not required to be terminated. Signed-off-by: Daniel P. Berrange Message-Id: <20170525155300.22743-1-berrange@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Paolo Bonzini --- util/qemu-sockets.c | 68 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c index b39ae74fe0..82290cb687 100644 --- a/util/qemu-sockets.c +++ b/util/qemu-sockets.c @@ -845,6 +845,8 @@ static int unix_listen_saddr(UnixSocketAddress *saddr, { struct sockaddr_un un; int sock, fd; + char *pathbuf = NULL; + const char *path; sock = qemu_socket(PF_UNIX, SOCK_STREAM, 0); if (sock < 0) { @@ -852,20 +854,22 @@ static int unix_listen_saddr(UnixSocketAddress *saddr, return -1; } - memset(&un, 0, sizeof(un)); - un.sun_family = AF_UNIX; - if (saddr->path && strlen(saddr->path)) { - snprintf(un.sun_path, sizeof(un.sun_path), "%s", saddr->path); + if (saddr->path && saddr->path[0]) { + path = saddr->path; } else { const char *tmpdir = getenv("TMPDIR"); tmpdir = tmpdir ? tmpdir : "/tmp"; - if (snprintf(un.sun_path, sizeof(un.sun_path), "%s/qemu-socket-XXXXXX", - tmpdir) >= sizeof(un.sun_path)) { - error_setg_errno(errp, errno, - "TMPDIR environment variable (%s) too large", tmpdir); - goto err; - } + path = pathbuf = g_strdup_printf("%s/qemu-socket-XXXXXX", tmpdir); + } + if (strlen(path) > sizeof(un.sun_path)) { + error_setg(errp, "UNIX socket path '%s' is too long", path); + error_append_hint(errp, "Path must be less than %zu bytes\n", + sizeof(un.sun_path)); + goto err; + } + + if (pathbuf != NULL) { /* * This dummy fd usage silences the mktemp() unsecure warning. * Using mkstemp() doesn't make things more secure here @@ -873,24 +877,25 @@ static int unix_listen_saddr(UnixSocketAddress *saddr, * to unlink first and thus re-open the race window. The * worst case possible is bind() failing, i.e. a DoS attack. */ - fd = mkstemp(un.sun_path); + fd = mkstemp(pathbuf); if (fd < 0) { error_setg_errno(errp, errno, - "Failed to make a temporary socket name in %s", tmpdir); + "Failed to make a temporary socket %s", pathbuf); goto err; } close(fd); - if (update_addr) { - g_free(saddr->path); - saddr->path = g_strdup(un.sun_path); - } } - if (unlink(un.sun_path) < 0 && errno != ENOENT) { + if (unlink(path) < 0 && errno != ENOENT) { error_setg_errno(errp, errno, - "Failed to unlink socket %s", un.sun_path); + "Failed to unlink socket %s", path); goto err; } + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + strncpy(un.sun_path, path, sizeof(un.sun_path)); + if (bind(sock, (struct sockaddr*) &un, sizeof(un)) < 0) { error_setg_errno(errp, errno, "Failed to bind socket to %s", un.sun_path); goto err; @@ -900,9 +905,16 @@ static int unix_listen_saddr(UnixSocketAddress *saddr, goto err; } + if (update_addr && pathbuf) { + g_free(saddr->path); + saddr->path = pathbuf; + } else { + g_free(pathbuf); + } return sock; err: + g_free(pathbuf); closesocket(sock); return -1; } @@ -932,9 +944,16 @@ static int unix_connect_saddr(UnixSocketAddress *saddr, qemu_set_nonblock(sock); } + if (strlen(saddr->path) > sizeof(un.sun_path)) { + error_setg(errp, "UNIX socket path '%s' is too long", saddr->path); + error_append_hint(errp, "Path must be less than %zu bytes\n", + sizeof(un.sun_path)); + goto err; + } + memset(&un, 0, sizeof(un)); un.sun_family = AF_UNIX; - snprintf(un.sun_path, sizeof(un.sun_path), "%s", saddr->path); + strncpy(un.sun_path, saddr->path, sizeof(un.sun_path)); /* connect to peer */ do { @@ -956,13 +975,18 @@ static int unix_connect_saddr(UnixSocketAddress *saddr, } if (rc < 0) { - error_setg_errno(errp, -rc, "Failed to connect socket"); - close(sock); - sock = -1; + error_setg_errno(errp, -rc, "Failed to connect socket %s", + saddr->path); + goto err; } g_free(connect_state); return sock; + + err: + close(sock); + g_free(connect_state); + return -1; } #else -- cgit v1.2.3-55-g7522 From df8ad9f128c15aa0a0ebc7b24e9a22c9775b67af Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 26 May 2017 22:04:21 -0500 Subject: nbd: Fully initialize client in case of failed negotiation If a non-NBD client connects to qemu-nbd, we would end up with a SIGSEGV in nbd_client_put() because we were trying to unregister the client's association to the export, even though we skipped inserting the client into that list. Easy trigger in two terminals: $ qemu-nbd -p 30001 --format=raw file $ nmap 127.0.0.1 -p 30001 nmap claims that it thinks it connected to a pago-services1 server (which probably means nmap could be updated to learn the NBD protocol and give a more accurate diagnosis of the open port - but that's not our problem), then terminates immediately, so our call to nbd_negotiate() fails. The fix is to reorder nbd_co_client_start() to ensure that all initialization occurs before we ever try talking to a client in nbd_negotiate(), so that the teardown sequence on negotiation failure doesn't fault while dereferencing a half-initialized object. While debugging this, I also noticed that nbd_update_server_watch() called by nbd_client_closed() was still adding a channel to accept the next client, even when the state was no longer RUNNING. That is fixed by making nbd_can_accept() pay attention to the current state. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1451614 Signed-off-by: Eric Blake Message-Id: <20170527030421.28366-1-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- nbd/server.c | 8 +++----- qemu-nbd.c | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index ee59e5d234..49b55f6ede 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1358,16 +1358,14 @@ static coroutine_fn void nbd_co_client_start(void *opaque) if (exp) { nbd_export_get(exp); + QTAILQ_INSERT_TAIL(&exp->clients, client, next); } + qemu_co_mutex_init(&client->send_lock); + if (nbd_negotiate(data)) { client_close(client); goto out; } - qemu_co_mutex_init(&client->send_lock); - - if (exp) { - QTAILQ_INSERT_TAIL(&exp->clients, client, next); - } nbd_client_receive_next_request(client); diff --git a/qemu-nbd.c b/qemu-nbd.c index f60842fd86..651f85ecc1 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -325,7 +325,7 @@ out: static int nbd_can_accept(void) { - return nb_fds < shared; + return state == RUNNING && nb_fds < shared; } static void nbd_export_closed(NBDExport *exp) -- cgit v1.2.3-55-g7522 From e2b6c1712e08bc5feafb44fdc65ab81ef2630b4b Mon Sep 17 00:00:00 2001 From: Denis Plotnikov Date: Mon, 29 May 2017 13:49:04 +0300 Subject: kvmclock: update system_time_msr address forcibly Do an update of system_time_msr address every time before reading the value of tsc_timestamp from guest's kvmclock page. There is no other code paths which ensure that qemu has an up-to-date value of system_time_msr. So, force this update on guest's tsc_timestamp reading. This bug causes effect on those nested setups which turn off TPR access interception for L2 guests and that access being intercepted by L0 doesn't show up in L1. Linux bootstrap initiate kvmclock before APIC initializing causing TPR access. That's why on L1 guests, having TPR interception turned on for L2, the effect of the bug is not revealed. This patch fixes this problem by making sure it knows the correct system_time_msr address every time it is needed. Signed-off-by: Denis Plotnikov Message-Id: <1496054944-25623-1-git-send-email-dplotnikov@virtuozzo.com> Signed-off-by: Paolo Bonzini --- hw/i386/kvm/clock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index 13eca374cd..363d1b5743 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -19,6 +19,7 @@ #include "qemu/host-utils.h" #include "sysemu/sysemu.h" #include "sysemu/kvm.h" +#include "sysemu/hw_accel.h" #include "kvm_i386.h" #include "hw/sysbus.h" #include "hw/kvm/clock.h" @@ -69,6 +70,8 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s) uint64_t nsec_hi; uint64_t nsec; + cpu_synchronize_state(cpu); + if (!(env->system_time_msr & 1ULL)) { /* KVM clock not active */ return 0; -- cgit v1.2.3-55-g7522 From 7e018385103cd7a571b9ea0d6f994af6b1129fe7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 May 2017 14:37:15 +0200 Subject: linuxboot_dma: compile for i486 The ROM uses the cmovne instruction, which is new in Pentium Pro and does not work when running QEMU with "-cpu 486". Avoid producing that instruction. Suggested-by: Richard W.M. Jones Suggested-by: Thomas Huth Reported-by: Rob Landley Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini --- pc-bios/linuxboot_dma.bin | Bin 1536 -> 1536 bytes pc-bios/optionrom/Makefile | 1 + 2 files changed, 1 insertion(+) diff --git a/pc-bios/linuxboot_dma.bin b/pc-bios/linuxboot_dma.bin index 218d3ab4a2..d176f62797 100644 Binary files a/pc-bios/linuxboot_dma.bin and b/pc-bios/linuxboot_dma.bin differ diff --git a/pc-bios/optionrom/Makefile b/pc-bios/optionrom/Makefile index fa53d9e58e..a9a9e5e7eb 100644 --- a/pc-bios/optionrom/Makefile +++ b/pc-bios/optionrom/Makefile @@ -13,6 +13,7 @@ $(call set-vpath, $(SRC_PATH)/pc-bios/optionrom) ifeq ($(lastword $(filter -O%, -O0 $(CFLAGS))),-O0) override CFLAGS += -O2 endif +override CFLAGS += -march=i486 # Drop -fstack-protector and the like QEMU_CFLAGS := $(filter -W%, $(QEMU_CFLAGS)) $(CFLAGS_NOPIE) -ffreestanding -- cgit v1.2.3-55-g7522 From c25a67f0c3d0c86231f9653267a222c4effa706f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 May 2017 14:56:37 +0200 Subject: edu: fix memory leak on msi_broken platforms If msi_init fails, the thread has already been created and the mutex/condvar are not destroyed. Initialize everything only after the point where pci_edu_realize cannot fail. Reported-by: Markus Armbruster Cc: Peter Xu Signed-off-by: Paolo Bonzini --- hw/misc/edu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hw/misc/edu.c b/hw/misc/edu.c index 401039c100..01acacf142 100644 --- a/hw/misc/edu.c +++ b/hw/misc/edu.c @@ -343,6 +343,12 @@ static void pci_edu_realize(PCIDevice *pdev, Error **errp) EduState *edu = DO_UPCAST(EduState, pdev, pdev); uint8_t *pci_conf = pdev->config; + pci_config_set_interrupt_pin(pci_conf, 1); + + if (msi_init(pdev, 0, 1, true, false, errp)) { + return; + } + timer_init_ms(&edu->dma_timer, QEMU_CLOCK_VIRTUAL, edu_dma_timer, edu); qemu_mutex_init(&edu->thr_mutex); @@ -350,12 +356,6 @@ static void pci_edu_realize(PCIDevice *pdev, Error **errp) qemu_thread_create(&edu->thread, "edu", edu_fact_thread, edu, QEMU_THREAD_JOINABLE); - pci_config_set_interrupt_pin(pci_conf, 1); - - if (msi_init(pdev, 0, 1, true, false, errp)) { - return; - } - memory_region_init_io(&edu->mmio, OBJECT(edu), &edu_mmio_ops, edu, "edu-mmio", 1 << 20); pci_register_bar(pdev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &edu->mmio); -- cgit v1.2.3-55-g7522 From d45fc087c26674eedda9314b9aaefd8e061bf104 Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Thu, 1 Jun 2017 10:56:04 +0200 Subject: i386/kvm: do not zero out segment flags if segment is unusable or not present This is a fix for the problem [1], where VMCB.CPL was set to 0 and interrupt was taken on userspace stack. The root cause lies in the specific AMD CPU behaviour which manifests itself as unusable segment attributes on SYSRET[2]. Here in this patch flags are not touched even segment is unusable or is not present, therefore CPL (which is stored in DPL field) should not be lost and will be successfully restored on kvm/svm kernel side. Also current patch should not break desired behavior described in this commit: 4cae9c97967a ("target-i386: kvm: clear unusable segments' flags in migration") since present bit will be dropped if segment is unusable or is not present. This is the second part of the whole fix of the corresponding problem [1], first part is related to kvm/svm kernel side and does exactly the same: segment attributes are not zeroed out. [1] Message id: CAJrWOzD6Xq==b-zYCDdFLgSRMPM-NkNuTSDFEtX=7MreT45i7Q@mail.gmail.com [2] Message id: 5d120f358612d73fc909f5bfa47e7bd082db0af0.1429841474.git.luto@kernel.org Signed-off-by: Roman Pen Signed-off-by: Mikhail Sennikovskii Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michael Chapman Cc: qemu-devel@nongnu.org Message-Id: <20170601085604.12980-1-roman.penyaev@profitbricks.com> Signed-off-by: Paolo Bonzini --- target/i386/kvm.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 9087677d00..5936d2761f 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -1301,18 +1301,14 @@ static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs) lhs->selector = rhs->selector; lhs->base = rhs->base; lhs->limit = rhs->limit; - if (rhs->unusable) { - lhs->flags = 0; - } else { - lhs->flags = (rhs->type << DESC_TYPE_SHIFT) | - (rhs->present * DESC_P_MASK) | - (rhs->dpl << DESC_DPL_SHIFT) | - (rhs->db << DESC_B_SHIFT) | - (rhs->s * DESC_S_MASK) | - (rhs->l << DESC_L_SHIFT) | - (rhs->g * DESC_G_MASK) | - (rhs->avl * DESC_AVL_MASK); - } + lhs->flags = (rhs->type << DESC_TYPE_SHIFT) | + ((rhs->present && !rhs->unusable) * DESC_P_MASK) | + (rhs->dpl << DESC_DPL_SHIFT) | + (rhs->db << DESC_B_SHIFT) | + (rhs->s * DESC_S_MASK) | + (rhs->l << DESC_L_SHIFT) | + (rhs->g * DESC_G_MASK) | + (rhs->avl * DESC_AVL_MASK); } static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set) -- cgit v1.2.3-55-g7522 From b8158192fadb3e346372456c25cbbc4be584a85c Mon Sep 17 00:00:00 2001 From: Abdallah Bouassida Date: Thu, 1 Jun 2017 11:33:15 +0200 Subject: target/i386: Add GDB XML description for SSE registers Add an XML description for SSE registers (XMM+MXCSR) for both X86 and X86-64 architectures in the GDB stub: - configure: Define gdb_xml_files for the X86 targets (32 and 64bit). - gdb-xml/i386-32bit-sse.xml & gdb-xml/i386-64bit-sse.xml: The XML files that contain a description of the XMM + MXCSR registers. - gdb-xml/i386-32bit.xml & gdb-xml/i386-64bit.xml: wrappers that include the XML file of the core registers and the other XML file of the SSE registers. - target/i386/cpu.c: Modify the gdb_core_xml_file to the new XML wrapper, modify the gdb_num_core_regs to fit the registers number defined in each XML file. Signed-off-by: Abdallah Bouassida Signed-off-by: Paolo Bonzini --- configure | 4 ++-- gdb-xml/i386-32bit-sse.xml | 52 ++++++++++++++++++++++++++++++++++++++++ gdb-xml/i386-32bit.xml | 14 +++++++++++ gdb-xml/i386-64bit-sse.xml | 60 ++++++++++++++++++++++++++++++++++++++++++++++ gdb-xml/i386-64bit.xml | 14 +++++++++++ target/i386/cpu.c | 8 +++---- 6 files changed, 146 insertions(+), 6 deletions(-) create mode 100644 gdb-xml/i386-32bit-sse.xml create mode 100644 gdb-xml/i386-32bit.xml create mode 100644 gdb-xml/i386-64bit-sse.xml create mode 100644 gdb-xml/i386-64bit.xml diff --git a/configure b/configure index 13e040d28c..71f5612a65 100755 --- a/configure +++ b/configure @@ -6027,11 +6027,11 @@ TARGET_ABI_DIR="" case "$target_name" in i386) - gdb_xml_files="i386-32bit-core.xml" + gdb_xml_files="i386-32bit.xml i386-32bit-core.xml i386-32bit-sse.xml" ;; x86_64) TARGET_BASE_ARCH=i386 - gdb_xml_files="i386-64bit-core.xml" + gdb_xml_files="i386-64bit.xml i386-64bit-core.xml i386-64bit-sse.xml" ;; alpha) mttcg="yes" diff --git a/gdb-xml/i386-32bit-sse.xml b/gdb-xml/i386-32bit-sse.xml new file mode 100644 index 0000000000..57678473d6 --- /dev/null +++ b/gdb-xml/i386-32bit-sse.xml @@ -0,0 +1,52 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/gdb-xml/i386-32bit.xml b/gdb-xml/i386-32bit.xml new file mode 100644 index 0000000000..956fc7f45f --- /dev/null +++ b/gdb-xml/i386-32bit.xml @@ -0,0 +1,14 @@ + + + + + + + + + + diff --git a/gdb-xml/i386-64bit-sse.xml b/gdb-xml/i386-64bit-sse.xml new file mode 100644 index 0000000000..e86efc9ce5 --- /dev/null +++ b/gdb-xml/i386-64bit-sse.xml @@ -0,0 +1,60 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/gdb-xml/i386-64bit.xml b/gdb-xml/i386-64bit.xml new file mode 100644 index 0000000000..0b2f00ccbe --- /dev/null +++ b/gdb-xml/i386-64bit.xml @@ -0,0 +1,14 @@ + + + + + + + + + + diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 3f832a6a94..b2b1d20cee 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -4068,11 +4068,11 @@ static void x86_cpu_common_class_init(ObjectClass *oc, void *data) #endif cc->gdb_arch_name = x86_gdb_arch_name; #ifdef TARGET_X86_64 - cc->gdb_core_xml_file = "i386-64bit-core.xml"; - cc->gdb_num_core_regs = 40; + cc->gdb_core_xml_file = "i386-64bit.xml"; + cc->gdb_num_core_regs = 57; #else - cc->gdb_core_xml_file = "i386-32bit-core.xml"; - cc->gdb_num_core_regs = 32; + cc->gdb_core_xml_file = "i386-32bit.xml"; + cc->gdb_num_core_regs = 41; #endif #ifndef CONFIG_USER_ONLY cc->debug_excp_handler = breakpoint_handler; -- cgit v1.2.3-55-g7522 From 6bdcc018a6ed760b9dfe43539124e420aed83092 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 1 Jun 2017 12:44:56 +0200 Subject: nbd: make it thread-safe, fix qcow2 over nbd NBD is not thread safe, because it accesses s->in_flight without a CoMutex. Fixing this will be required for multiqueue. CoQueue doesn't have spurious wakeups but, when another coroutine can run between qemu_co_queue_next's wakeup and qemu_co_queue_wait's re-locking of the mutex, the wait condition can become false and a loop is necessary. In fact, it turns out that the loop is necessary even without this multi-threaded scenario. A particular sequence of coroutine wakeups is happening ~80% of the time when starting a guest with qcow2 image served over NBD (i.e. qemu-nbd --format=raw, and QEMU's -drive option has -format=qcow2). This patch fixes that issue too. Signed-off-by: Paolo Bonzini --- block/nbd-client.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index 09d955bc4d..87d19c7253 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -119,6 +119,10 @@ static int nbd_co_send_request(BlockDriverState *bs, int rc, ret, i; qemu_co_mutex_lock(&s->send_mutex); + while (s->in_flight == MAX_NBD_REQUESTS) { + qemu_co_queue_wait(&s->free_sema, &s->send_mutex); + } + s->in_flight++; for (i = 0; i < MAX_NBD_REQUESTS; i++) { if (s->recv_coroutine[i] == NULL) { @@ -181,20 +185,6 @@ static void nbd_co_receive_reply(NBDClientSession *s, } } -static void nbd_coroutine_start(NBDClientSession *s, - NBDRequest *request) -{ - /* Poor man semaphore. The free_sema is locked when no other request - * can be accepted, and unlocked after receiving one reply. */ - if (s->in_flight == MAX_NBD_REQUESTS) { - qemu_co_queue_wait(&s->free_sema, NULL); - assert(s->in_flight < MAX_NBD_REQUESTS); - } - s->in_flight++; - - /* s->recv_coroutine[i] is set as soon as we get the send_lock. */ -} - static void nbd_coroutine_end(BlockDriverState *bs, NBDRequest *request) { @@ -202,13 +192,16 @@ static void nbd_coroutine_end(BlockDriverState *bs, int i = HANDLE_TO_INDEX(s, request->handle); s->recv_coroutine[i] = NULL; - s->in_flight--; - qemu_co_queue_next(&s->free_sema); /* Kick the read_reply_co to get the next reply. */ if (s->read_reply_co) { aio_co_wake(s->read_reply_co); } + + qemu_co_mutex_lock(&s->send_mutex); + s->in_flight--; + qemu_co_queue_next(&s->free_sema); + qemu_co_mutex_unlock(&s->send_mutex); } int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, @@ -226,7 +219,6 @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, assert(bytes <= NBD_MAX_BUFFER_SIZE); assert(!flags); - nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; @@ -256,7 +248,6 @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, assert(bytes <= NBD_MAX_BUFFER_SIZE); - nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, qiov); if (ret < 0) { reply.error = -ret; @@ -291,7 +282,6 @@ int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, request.flags |= NBD_CMD_FLAG_NO_HOLE; } - nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; @@ -316,7 +306,6 @@ int nbd_client_co_flush(BlockDriverState *bs) request.from = 0; request.len = 0; - nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; @@ -342,7 +331,6 @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) return 0; } - nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; -- cgit v1.2.3-55-g7522 From d870cfdea5b5fc7934cacc9786f185d741eab308 Mon Sep 17 00:00:00 2001 From: Gonglei Date: Thu, 1 Jun 2017 19:35:15 +0800 Subject: kvm: don't register smram_listener when smm is off If the user set disable smm by '-machine smm=off', we should not register smram_listener so that we can avoid waster memory in kvm since the added sencond address space. Meanwhile we should assign value of the global kvm_state before invoking the kvm_arch_init(), because pc_machine_is_smm_enabled() may use it by kvm_has_mm(). Signed-off-by: Gonglei Message-Id: <1496316915-121196-1-git-send-email-arei.gonglei@huawei.com> Signed-off-by: Paolo Bonzini --- kvm-all.c | 4 ++-- target/i386/kvm.c | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 1b9fe23490..44b3cf43cc 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1748,6 +1748,8 @@ static int kvm_init(MachineState *ms) kvm_ioeventfd_any_length_allowed = (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0); + kvm_state = s; + ret = kvm_arch_init(ms, s); if (ret < 0) { goto err; @@ -1757,8 +1759,6 @@ static int kvm_init(MachineState *ms) kvm_irqchip_create(ms, s); } - kvm_state = s; - if (kvm_eventfds_allowed) { s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 5936d2761f..ee36502789 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -1255,7 +1255,9 @@ int kvm_arch_init(MachineState *ms, KVMState *s) } } - if (kvm_check_extension(s, KVM_CAP_X86_SMM)) { + if (kvm_check_extension(s, KVM_CAP_X86_SMM) && + object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE) && + pc_machine_is_smm_enabled(PC_MACHINE(ms))) { smram_machine_done.notify = register_smram_listener; qemu_add_machine_init_done_notifier(&smram_machine_done); } -- cgit v1.2.3-55-g7522 From 90bb0c04214545beb75044a2742f711335103269 Mon Sep 17 00:00:00 2001 From: Felipe Franciosi Date: Fri, 19 May 2017 22:29:50 +0100 Subject: cpus: reset throttle_thread_scheduled after sleep Currently, the throttle_thread_scheduled flag is reset back to 0 before sleeping (as part of the throttling logic). Given that throttle_timer (well, any timer) may tick with a slight delay, it so happens that under heavy throttling (ie. close or on CPU_THROTTLE_PCT_MAX) the tick may schedule a further cpu_throttle_thread() work item after the flag reset, but before the previous sleep completed. This results on the vCPU thread sleeping continuously for potentially several seconds in a row. The chances of that happening can be drastically minimised by resetting the flag after the sleep. Signed-off-by: Felipe Franciosi Signed-off-by: Malcolm Crossley Message-Id: <1495229390-18909-1-git-send-email-felipe@nutanix.com> Acked-by: Jason J. Herne Signed-off-by: Paolo Bonzini --- cpus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpus.c b/cpus.c index 6398439946..14bb8d552e 100644 --- a/cpus.c +++ b/cpus.c @@ -677,9 +677,9 @@ static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque) sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS); qemu_mutex_unlock_iothread(); - atomic_set(&cpu->throttle_thread_scheduled, 0); g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */ qemu_mutex_lock_iothread(); + atomic_set(&cpu->throttle_thread_scheduled, 0); } static void cpu_throttle_timer_tick(void *opaque) -- cgit v1.2.3-55-g7522 From ac06724a715864942e2b5e28f92d5d5421f0a0b0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 16:46:26 +0200 Subject: docs: create config/, devel/ and spin/ subdirectories Developer documentation should be its own manual. As a start, move all developer-oriented files to a separate directory. Also move non-text files to their own directories: docs/config/ for QEMU -readconfig input, and docs/spin/ for formal models to be used with the SPIN model checker. Reviewed-by: Daniel P. Berrange Signed-off-by: Paolo Bonzini --- docs/aio_notify.promela | 93 --- docs/aio_notify_accept.promela | 152 ---- docs/aio_notify_bug.promela | 140 ---- docs/atomics.txt | 388 ---------- docs/bitmaps.md | 505 ------------- docs/blkdebug.txt | 162 ----- docs/blkverify.txt | 69 -- docs/build-system.txt | 512 ------------- docs/config/ich9-ehci-uhci.cfg | 37 + docs/config/mach-virt-graphical.cfg | 281 ++++++++ docs/config/mach-virt-serial.cfg | 243 +++++++ docs/config/q35-emulated.cfg | 288 ++++++++ docs/config/q35-virtio-graphical.cfg | 248 +++++++ docs/config/q35-virtio-serial.cfg | 193 +++++ docs/devel/atomics.txt | 388 ++++++++++ docs/devel/bitmaps.md | 505 +++++++++++++ docs/devel/blkdebug.txt | 162 +++++ docs/devel/blkverify.txt | 69 ++ docs/devel/build-system.txt | 512 +++++++++++++ docs/devel/lockcnt.txt | 277 +++++++ docs/devel/memory.txt | 316 ++++++++ docs/devel/migration.txt | 555 ++++++++++++++ docs/devel/multi-thread-tcg.txt | 350 +++++++++ docs/devel/multiple-iothreads.txt | 137 ++++ docs/devel/qapi-code-gen.txt | 1310 ++++++++++++++++++++++++++++++++++ docs/devel/rcu.txt | 390 ++++++++++ docs/devel/tracing.txt | 442 ++++++++++++ docs/devel/virtio-migration.txt | 108 +++ docs/devel/writing-qmp-commands.txt | 607 ++++++++++++++++ docs/ich9-ehci-uhci.cfg | 37 - docs/lockcnt.txt | 277 ------- docs/mach-virt-graphical.cfg | 281 -------- docs/mach-virt-serial.cfg | 243 ------- docs/memory.txt | 316 -------- docs/migration.txt | 555 -------------- docs/multi-thread-tcg.txt | 350 --------- docs/multiple-iothreads.txt | 137 ---- docs/q35-emulated.cfg | 288 -------- docs/q35-virtio-graphical.cfg | 248 ------- docs/q35-virtio-serial.cfg | 193 ----- docs/qapi-code-gen.txt | 1310 ---------------------------------- docs/rcu.txt | 390 ---------- docs/spin/aio_notify.promela | 93 +++ docs/spin/aio_notify_accept.promela | 152 ++++ docs/spin/aio_notify_bug.promela | 140 ++++ docs/spin/tcg-exclusive.promela | 225 ++++++ docs/spin/win32-qemu-event.promela | 98 +++ docs/tcg-exclusive.promela | 225 ------ docs/tracing.txt | 442 ------------ docs/virtio-migration.txt | 108 --- docs/win32-qemu-event.promela | 98 --- docs/writing-qmp-commands.txt | 607 ---------------- 52 files changed, 8126 insertions(+), 8126 deletions(-) delete mode 100644 docs/aio_notify.promela delete mode 100644 docs/aio_notify_accept.promela delete mode 100644 docs/aio_notify_bug.promela delete mode 100644 docs/atomics.txt delete mode 100644 docs/bitmaps.md delete mode 100644 docs/blkdebug.txt delete mode 100644 docs/blkverify.txt delete mode 100644 docs/build-system.txt create mode 100644 docs/config/ich9-ehci-uhci.cfg create mode 100644 docs/config/mach-virt-graphical.cfg create mode 100644 docs/config/mach-virt-serial.cfg create mode 100644 docs/config/q35-emulated.cfg create mode 100644 docs/config/q35-virtio-graphical.cfg create mode 100644 docs/config/q35-virtio-serial.cfg create mode 100644 docs/devel/atomics.txt create mode 100644 docs/devel/bitmaps.md create mode 100644 docs/devel/blkdebug.txt create mode 100644 docs/devel/blkverify.txt create mode 100644 docs/devel/build-system.txt create mode 100644 docs/devel/lockcnt.txt create mode 100644 docs/devel/memory.txt create mode 100644 docs/devel/migration.txt create mode 100644 docs/devel/multi-thread-tcg.txt create mode 100644 docs/devel/multiple-iothreads.txt create mode 100644 docs/devel/qapi-code-gen.txt create mode 100644 docs/devel/rcu.txt create mode 100644 docs/devel/tracing.txt create mode 100644 docs/devel/virtio-migration.txt create mode 100644 docs/devel/writing-qmp-commands.txt delete mode 100644 docs/ich9-ehci-uhci.cfg delete mode 100644 docs/lockcnt.txt delete mode 100644 docs/mach-virt-graphical.cfg delete mode 100644 docs/mach-virt-serial.cfg delete mode 100644 docs/memory.txt delete mode 100644 docs/migration.txt delete mode 100644 docs/multi-thread-tcg.txt delete mode 100644 docs/multiple-iothreads.txt delete mode 100644 docs/q35-emulated.cfg delete mode 100644 docs/q35-virtio-graphical.cfg delete mode 100644 docs/q35-virtio-serial.cfg delete mode 100644 docs/qapi-code-gen.txt delete mode 100644 docs/rcu.txt create mode 100644 docs/spin/aio_notify.promela create mode 100644 docs/spin/aio_notify_accept.promela create mode 100644 docs/spin/aio_notify_bug.promela create mode 100644 docs/spin/tcg-exclusive.promela create mode 100644 docs/spin/win32-qemu-event.promela delete mode 100644 docs/tcg-exclusive.promela delete mode 100644 docs/tracing.txt delete mode 100644 docs/virtio-migration.txt delete mode 100644 docs/win32-qemu-event.promela delete mode 100644 docs/writing-qmp-commands.txt diff --git a/docs/aio_notify.promela b/docs/aio_notify.promela deleted file mode 100644 index fccc7ee1c3..0000000000 --- a/docs/aio_notify.promela +++ /dev/null @@ -1,93 +0,0 @@ -/* - * This model describes the interaction between ctx->notify_me - * and aio_notify(). - * - * Author: Paolo Bonzini - * - * This file is in the public domain. If you really want a license, - * the WTFPL will do. - * - * To simulate it: - * spin -p docs/aio_notify.promela - * - * To verify it: - * spin -a docs/aio_notify.promela - * gcc -O2 pan.c - * ./a.out -a - * - * To verify it (with a bug planted in the model): - * spin -a -DBUG docs/aio_notify.promela - * gcc -O2 pan.c - * ./a.out -a - */ - -#define MAX 4 -#define LAST (1 << (MAX - 1)) -#define FINAL ((LAST << 1) - 1) - -bool notify_me; -bool event; - -int req; -int done; - -active proctype waiter() -{ - int fetch; - - do - :: true -> { - notify_me++; - - if -#ifndef BUG - :: (req > 0) -> skip; -#endif - :: else -> - // Wait for a nudge from the other side - do - :: event == 1 -> { event = 0; break; } - od; - fi; - - notify_me--; - - atomic { fetch = req; req = 0; } - done = done | fetch; - } - od -} - -active proctype notifier() -{ - int next = 1; - - do - :: next <= LAST -> { - // generate a request - req = req | next; - next = next << 1; - - // aio_notify - if - :: notify_me == 1 -> event = 1; - :: else -> printf("Skipped event_notifier_set\n"); skip; - fi; - - // Test both synchronous and asynchronous delivery - if - :: 1 -> do - :: req == 0 -> break; - od; - :: 1 -> skip; - fi; - } - od; -} - -never { /* [] done < FINAL */ -accept_init: - do - :: done < FINAL -> skip; - od; -} diff --git a/docs/aio_notify_accept.promela b/docs/aio_notify_accept.promela deleted file mode 100644 index 9cef2c955d..0000000000 --- a/docs/aio_notify_accept.promela +++ /dev/null @@ -1,152 +0,0 @@ -/* - * This model describes the interaction between ctx->notified - * and ctx->notifier. - * - * Author: Paolo Bonzini - * - * This file is in the public domain. If you really want a license, - * the WTFPL will do. - * - * To verify the buggy version: - * spin -a -DBUG1 docs/aio_notify_bug.promela - * gcc -O2 pan.c - * ./a.out -a -f - * (or -DBUG2) - * - * To verify the fixed version: - * spin -a docs/aio_notify_bug.promela - * gcc -O2 pan.c - * ./a.out -a -f - * - * Add -DCHECK_REQ to test an alternative invariant and the - * "notify_me" optimization. - */ - -int notify_me; -bool notified; -bool event; -bool req; -bool notifier_done; - -#ifdef CHECK_REQ -#define USE_NOTIFY_ME 1 -#else -#define USE_NOTIFY_ME 0 -#endif - -#ifdef BUG -#error Please define BUG1 or BUG2 instead. -#endif - -active proctype notifier() -{ - do - :: true -> { - req = 1; - if - :: !USE_NOTIFY_ME || notify_me -> -#if defined BUG1 - /* CHECK_REQ does not detect this bug! */ - notified = 1; - event = 1; -#elif defined BUG2 - if - :: !notified -> event = 1; - :: else -> skip; - fi; - notified = 1; -#else - event = 1; - notified = 1; -#endif - :: else -> skip; - fi - } - :: true -> break; - od; - notifier_done = 1; -} - -#define AIO_POLL \ - notify_me++; \ - if \ - :: !req -> { \ - if \ - :: event -> skip; \ - fi; \ - } \ - :: else -> skip; \ - fi; \ - notify_me--; \ - \ - atomic { old = notified; notified = 0; } \ - if \ - :: old -> event = 0; \ - :: else -> skip; \ - fi; \ - \ - req = 0; - -active proctype waiter() -{ - bool old; - - do - :: true -> AIO_POLL; - od; -} - -/* Same as waiter(), but disappears after a while. */ -active proctype temporary_waiter() -{ - bool old; - - do - :: true -> AIO_POLL; - :: true -> break; - od; -} - -#ifdef CHECK_REQ -never { - do - :: req -> goto accept_if_req_not_eventually_false; - :: true -> skip; - od; - -accept_if_req_not_eventually_false: - if - :: req -> goto accept_if_req_not_eventually_false; - fi; - assert(0); -} - -#else -/* There must be infinitely many transitions of event as long - * as the notifier does not exit. - * - * If event stayed always true, the waiters would be busy looping. - * If event stayed always false, the waiters would be sleeping - * forever. - */ -never { - do - :: !event -> goto accept_if_event_not_eventually_true; - :: event -> goto accept_if_event_not_eventually_false; - :: true -> skip; - od; - -accept_if_event_not_eventually_true: - if - :: !event && notifier_done -> do :: true -> skip; od; - :: !event && !notifier_done -> goto accept_if_event_not_eventually_true; - fi; - assert(0); - -accept_if_event_not_eventually_false: - if - :: event -> goto accept_if_event_not_eventually_false; - fi; - assert(0); -} -#endif diff --git a/docs/aio_notify_bug.promela b/docs/aio_notify_bug.promela deleted file mode 100644 index b3bfca1ca4..0000000000 --- a/docs/aio_notify_bug.promela +++ /dev/null @@ -1,140 +0,0 @@ -/* - * This model describes a bug in aio_notify. If ctx->notifier is - * cleared too late, a wakeup could be lost. - * - * Author: Paolo Bonzini - * - * This file is in the public domain. If you really want a license, - * the WTFPL will do. - * - * To verify the buggy version: - * spin -a -DBUG docs/aio_notify_bug.promela - * gcc -O2 pan.c - * ./a.out -a -f - * - * To verify the fixed version: - * spin -a docs/aio_notify_bug.promela - * gcc -O2 pan.c - * ./a.out -a -f - * - * Add -DCHECK_REQ to test an alternative invariant and the - * "notify_me" optimization. - */ - -int notify_me; -bool event; -bool req; -bool notifier_done; - -#ifdef CHECK_REQ -#define USE_NOTIFY_ME 1 -#else -#define USE_NOTIFY_ME 0 -#endif - -active proctype notifier() -{ - do - :: true -> { - req = 1; - if - :: !USE_NOTIFY_ME || notify_me -> event = 1; - :: else -> skip; - fi - } - :: true -> break; - od; - notifier_done = 1; -} - -#ifdef BUG -#define AIO_POLL \ - notify_me++; \ - if \ - :: !req -> { \ - if \ - :: event -> skip; \ - fi; \ - } \ - :: else -> skip; \ - fi; \ - notify_me--; \ - \ - req = 0; \ - event = 0; -#else -#define AIO_POLL \ - notify_me++; \ - if \ - :: !req -> { \ - if \ - :: event -> skip; \ - fi; \ - } \ - :: else -> skip; \ - fi; \ - notify_me--; \ - \ - event = 0; \ - req = 0; -#endif - -active proctype waiter() -{ - do - :: true -> AIO_POLL; - od; -} - -/* Same as waiter(), but disappears after a while. */ -active proctype temporary_waiter() -{ - do - :: true -> AIO_POLL; - :: true -> break; - od; -} - -#ifdef CHECK_REQ -never { - do - :: req -> goto accept_if_req_not_eventually_false; - :: true -> skip; - od; - -accept_if_req_not_eventually_false: - if - :: req -> goto accept_if_req_not_eventually_false; - fi; - assert(0); -} - -#else -/* There must be infinitely many transitions of event as long - * as the notifier does not exit. - * - * If event stayed always true, the waiters would be busy looping. - * If event stayed always false, the waiters would be sleeping - * forever. - */ -never { - do - :: !event -> goto accept_if_event_not_eventually_true; - :: event -> goto accept_if_event_not_eventually_false; - :: true -> skip; - od; - -accept_if_event_not_eventually_true: - if - :: !event && notifier_done -> do :: true -> skip; od; - :: !event && !notifier_done -> goto accept_if_event_not_eventually_true; - fi; - assert(0); - -accept_if_event_not_eventually_false: - if - :: event -> goto accept_if_event_not_eventually_false; - fi; - assert(0); -} -#endif diff --git a/docs/atomics.txt b/docs/atomics.txt deleted file mode 100644 index 3ef5d85b1b..0000000000 --- a/docs/atomics.txt +++ /dev/null @@ -1,388 +0,0 @@ -CPUs perform independent memory operations effectively in random order. -but this can be a problem for CPU-CPU interaction (including interactions -between QEMU and the guest). Multi-threaded programs use various tools -to instruct the compiler and the CPU to restrict the order to something -that is consistent with the expectations of the programmer. - -The most basic tool is locking. Mutexes, condition variables and -semaphores are used in QEMU, and should be the default approach to -synchronization. Anything else is considerably harder, but it's -also justified more often than one would like. The two tools that -are provided by qemu/atomic.h are memory barriers and atomic operations. - -Macros defined by qemu/atomic.h fall in three camps: - -- compiler barriers: barrier(); - -- weak atomic access and manual memory barriers: atomic_read(), - atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_mb_acquire(), - smp_mb_release(), smp_read_barrier_depends(); - -- sequentially consistent atomic access: everything else. - - -COMPILER MEMORY BARRIER -======================= - -barrier() prevents the compiler from moving the memory accesses either -side of it to the other side. The compiler barrier has no direct effect -on the CPU, which may then reorder things however it wishes. - -barrier() is mostly used within qemu/atomic.h itself. On some -architectures, CPU guarantees are strong enough that blocking compiler -optimizations already ensures the correct order of execution. In this -case, qemu/atomic.h will reduce stronger memory barriers to simple -compiler barriers. - -Still, barrier() can be useful when writing code that can be interrupted -by signal handlers. - - -SEQUENTIALLY CONSISTENT ATOMIC ACCESS -===================================== - -Most of the operations in the qemu/atomic.h header ensure *sequential -consistency*, where "the result of any execution is the same as if the -operations of all the processors were executed in some sequential order, -and the operations of each individual processor appear in this sequence -in the order specified by its program". - -qemu/atomic.h provides the following set of atomic read-modify-write -operations: - - void atomic_inc(ptr) - void atomic_dec(ptr) - void atomic_add(ptr, val) - void atomic_sub(ptr, val) - void atomic_and(ptr, val) - void atomic_or(ptr, val) - - typeof(*ptr) atomic_fetch_inc(ptr) - typeof(*ptr) atomic_fetch_dec(ptr) - typeof(*ptr) atomic_fetch_add(ptr, val) - typeof(*ptr) atomic_fetch_sub(ptr, val) - typeof(*ptr) atomic_fetch_and(ptr, val) - typeof(*ptr) atomic_fetch_or(ptr, val) - typeof(*ptr) atomic_xchg(ptr, val) - typeof(*ptr) atomic_cmpxchg(ptr, old, new) - -all of which return the old value of *ptr. These operations are -polymorphic; they operate on any type that is as wide as an int. - -Sequentially consistent loads and stores can be done using: - - atomic_fetch_add(ptr, 0) for loads - atomic_xchg(ptr, val) for stores - -However, they are quite expensive on some platforms, notably POWER and -ARM. Therefore, qemu/atomic.h provides two primitives with slightly -weaker constraints: - - typeof(*ptr) atomic_mb_read(ptr) - void atomic_mb_set(ptr, val) - -The semantics of these primitives map to Java volatile variables, -and are strongly related to memory barriers as used in the Linux -kernel (see below). - -As long as you use atomic_mb_read and atomic_mb_set, accesses cannot -be reordered with each other, and it is also not possible to reorder -"normal" accesses around them. - -However, and this is the important difference between -atomic_mb_read/atomic_mb_set and sequential consistency, it is important -for both threads to access the same volatile variable. It is not the -case that everything visible to thread A when it writes volatile field f -becomes visible to thread B after it reads volatile field g. The store -and load have to "match" (i.e., be performed on the same volatile -field) to achieve the right semantics. - - -These operations operate on any type that is as wide as an int or smaller. - - -WEAK ATOMIC ACCESS AND MANUAL MEMORY BARRIERS -============================================= - -Compared to sequentially consistent atomic access, programming with -weaker consistency models can be considerably more complicated. -In general, if the algorithm you are writing includes both writes -and reads on the same side, it is generally simpler to use sequentially -consistent primitives. - -When using this model, variables are accessed with atomic_read() and -atomic_set(), and restrictions to the ordering of accesses is enforced -using the memory barrier macros: smp_rmb(), smp_wmb(), smp_mb(), -smp_mb_acquire(), smp_mb_release(), smp_read_barrier_depends(). - -atomic_read() and atomic_set() prevents the compiler from using -optimizations that might otherwise optimize accesses out of existence -on the one hand, or that might create unsolicited accesses on the other. -In general this should not have any effect, because the same compiler -barriers are already implied by memory barriers. However, it is useful -to do so, because it tells readers which variables are shared with -other threads, and which are local to the current thread or protected -by other, more mundane means. - -Memory barriers control the order of references to shared memory. -They come in six kinds: - -- smp_rmb() guarantees that all the LOAD operations specified before - the barrier will appear to happen before all the LOAD operations - specified after the barrier with respect to the other components of - the system. - - In other words, smp_rmb() puts a partial ordering on loads, but is not - required to have any effect on stores. - -- smp_wmb() guarantees that all the STORE operations specified before - the barrier will appear to happen before all the STORE operations - specified after the barrier with respect to the other components of - the system. - - In other words, smp_wmb() puts a partial ordering on stores, but is not - required to have any effect on loads. - -- smp_mb_acquire() guarantees that all the LOAD operations specified before - the barrier will appear to happen before all the LOAD or STORE operations - specified after the barrier with respect to the other components of - the system. - -- smp_mb_release() guarantees that all the STORE operations specified *after* - the barrier will appear to happen after all the LOAD or STORE operations - specified *before* the barrier with respect to the other components of - the system. - -- smp_mb() guarantees that all the LOAD and STORE operations specified - before the barrier will appear to happen before all the LOAD and - STORE operations specified after the barrier with respect to the other - components of the system. - - smp_mb() puts a partial ordering on both loads and stores. It is - stronger than both a read and a write memory barrier; it implies both - smp_mb_acquire() and smp_mb_release(), but it also prevents STOREs - coming before the barrier from overtaking LOADs coming after the - barrier and vice versa. - -- smp_read_barrier_depends() is a weaker kind of read barrier. On - most processors, whenever two loads are performed such that the - second depends on the result of the first (e.g., the first load - retrieves the address to which the second load will be directed), - the processor will guarantee that the first LOAD will appear to happen - before the second with respect to the other components of the system. - However, this is not always true---for example, it was not true on - Alpha processors. Whenever this kind of access happens to shared - memory (that is not protected by a lock), a read barrier is needed, - and smp_read_barrier_depends() can be used instead of smp_rmb(). - - Note that the first load really has to have a _data_ dependency and not - a control dependency. If the address for the second load is dependent - on the first load, but the dependency is through a conditional rather - than actually loading the address itself, then it's a _control_ - dependency and a full read barrier or better is required. - - -This is the set of barriers that is required *between* two atomic_read() -and atomic_set() operations to achieve sequential consistency: - - | 2nd operation | - |-----------------------------------------------| - 1st operation | (after last) | atomic_read | atomic_set | - ---------------+----------------+-------------+----------------| - (before first) | | none | smp_mb_release | - ---------------+----------------+-------------+----------------| - atomic_read | smp_mb_acquire | smp_rmb | ** | - ---------------+----------------+-------------+----------------| - atomic_set | none | smp_mb()*** | smp_wmb() | - ---------------+----------------+-------------+----------------| - - * Or smp_read_barrier_depends(). - - ** This requires a load-store barrier. This is achieved by - either smp_mb_acquire() or smp_mb_release(). - - *** This requires a store-load barrier. On most machines, the only - way to achieve this is a full barrier. - - -You can see that the two possible definitions of atomic_mb_read() -and atomic_mb_set() are the following: - - 1) atomic_mb_read(p) = atomic_read(p); smp_mb_acquire() - atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v); smp_mb() - - 2) atomic_mb_read(p) = smp_mb() atomic_read(p); smp_mb_acquire() - atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v); - -Usually the former is used, because smp_mb() is expensive and a program -normally has more reads than writes. Therefore it makes more sense to -make atomic_mb_set() the more expensive operation. - -There are two common cases in which atomic_mb_read and atomic_mb_set -generate too many memory barriers, and thus it can be useful to manually -place barriers instead: - -- when a data structure has one thread that is always a writer - and one thread that is always a reader, manual placement of - memory barriers makes the write side faster. Furthermore, - correctness is easy to check for in this case using the "pairing" - trick that is explained below: - - thread 1 thread 1 - ------------------------- ------------------------ - (other writes) - smp_mb_release() - atomic_mb_set(&a, x) atomic_set(&a, x) - smp_wmb() - atomic_mb_set(&b, y) atomic_set(&b, y) - - => - thread 2 thread 2 - ------------------------- ------------------------ - y = atomic_mb_read(&b) y = atomic_read(&b) - smp_rmb() - x = atomic_mb_read(&a) x = atomic_read(&a) - smp_mb_acquire() - - Note that the barrier between the stores in thread 1, and between - the loads in thread 2, has been optimized here to a write or a - read memory barrier respectively. On some architectures, notably - ARMv7, smp_mb_acquire and smp_mb_release are just as expensive as - smp_mb, but smp_rmb and/or smp_wmb are more efficient. - -- sometimes, a thread is accessing many variables that are otherwise - unrelated to each other (for example because, apart from the current - thread, exactly one other thread will read or write each of these - variables). In this case, it is possible to "hoist" the implicit - barriers provided by atomic_mb_read() and atomic_mb_set() outside - a loop. For example, the above definition atomic_mb_read() gives - the following transformation: - - n = 0; n = 0; - for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++) - n += atomic_mb_read(&a[i]); n += atomic_read(&a[i]); - smp_mb_acquire(); - - Similarly, atomic_mb_set() can be transformed as follows: - smp_mb(): - - smp_mb_release(); - for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++) - atomic_mb_set(&a[i], false); atomic_set(&a[i], false); - smp_mb(); - - -The two tricks can be combined. In this case, splitting a loop in -two lets you hoist the barriers out of the loops _and_ eliminate the -expensive smp_mb(): - - smp_mb_release(); - for (i = 0; i < 10; i++) { => for (i = 0; i < 10; i++) - atomic_mb_set(&a[i], false); atomic_set(&a[i], false); - atomic_mb_set(&b[i], false); smb_wmb(); - } for (i = 0; i < 10; i++) - atomic_set(&a[i], false); - smp_mb(); - - The other thread can still use atomic_mb_read()/atomic_mb_set() - - -Memory barrier pairing ----------------------- - -A useful rule of thumb is that memory barriers should always, or almost -always, be paired with another barrier. In the case of QEMU, however, -note that the other barrier may actually be in a driver that runs in -the guest! - -For the purposes of pairing, smp_read_barrier_depends() and smp_rmb() -both count as read barriers. A read barrier shall pair with a write -barrier or a full barrier; a write barrier shall pair with a read -barrier or a full barrier. A full barrier can pair with anything. -For example: - - thread 1 thread 2 - =============== =============== - a = 1; - smp_wmb(); - b = 2; x = b; - smp_rmb(); - y = a; - -Note that the "writing" thread is accessing the variables in the -opposite order as the "reading" thread. This is expected: stores -before the write barrier will normally match the loads after the -read barrier, and vice versa. The same is true for more than 2 -access and for data dependency barriers: - - thread 1 thread 2 - =============== =============== - b[2] = 1; - smp_wmb(); - x->i = 2; - smp_wmb(); - a = x; x = a; - smp_read_barrier_depends(); - y = x->i; - smp_read_barrier_depends(); - z = b[y]; - -smp_wmb() also pairs with atomic_mb_read() and smp_mb_acquire(). -and smp_rmb() also pairs with atomic_mb_set() and smp_mb_release(). - - -COMPARISON WITH LINUX KERNEL MEMORY BARRIERS -============================================ - -Here is a list of differences between Linux kernel atomic operations -and memory barriers, and the equivalents in QEMU: - -- atomic operations in Linux are always on a 32-bit int type and - use a boxed atomic_t type; atomic operations in QEMU are polymorphic - and use normal C types. - -- Originally, atomic_read and atomic_set in Linux gave no guarantee - at all. Linux 4.1 updated them to implement volatile - semantics via ACCESS_ONCE (or the more recent READ/WRITE_ONCE). - - QEMU's atomic_read/set implement, if the compiler supports it, C11 - atomic relaxed semantics, and volatile semantics otherwise. - Both semantics prevent the compiler from doing certain transformations; - the difference is that atomic accesses are guaranteed to be atomic, - while volatile accesses aren't. Thus, in the volatile case we just cross - our fingers hoping that the compiler will generate atomic accesses, - since we assume the variables passed are machine-word sized and - properly aligned. - No barriers are implied by atomic_read/set in either Linux or QEMU. - -- atomic read-modify-write operations in Linux are of three kinds: - - atomic_OP returns void - atomic_OP_return returns new value of the variable - atomic_fetch_OP returns the old value of the variable - atomic_cmpxchg returns the old value of the variable - - In QEMU, the second kind does not exist. Currently Linux has - atomic_fetch_or only. QEMU provides and, or, inc, dec, add, sub. - -- different atomic read-modify-write operations in Linux imply - a different set of memory barriers; in QEMU, all of them enforce - sequential consistency, which means they imply full memory barriers - before and after the operation. - -- Linux does not have an equivalent of atomic_mb_set(). In particular, - note that smp_store_mb() is a little weaker than atomic_mb_set(). - atomic_mb_read() compiles to the same instructions as Linux's - smp_load_acquire(), but this should be treated as an implementation - detail. QEMU does have atomic_load_acquire() and atomic_store_release() - macros, but for now they are only used within atomic.h. This may - change in the future. - - -SOURCES -======= - -* Documentation/memory-barriers.txt from the Linux kernel - -* "The JSR-133 Cookbook for Compiler Writers", available at - http://g.oswego.edu/dl/jmm/cookbook.html diff --git a/docs/bitmaps.md b/docs/bitmaps.md deleted file mode 100644 index a2e8d51163..0000000000 --- a/docs/bitmaps.md +++ /dev/null @@ -1,505 +0,0 @@ - - -# Dirty Bitmaps and Incremental Backup - -* Dirty Bitmaps are objects that track which data needs to be backed up for the - next incremental backup. - -* Dirty bitmaps can be created at any time and attached to any node - (not just complete drives.) - -## Dirty Bitmap Names - -* A dirty bitmap's name is unique to the node, but bitmaps attached to different - nodes can share the same name. - -* Dirty bitmaps created for internal use by QEMU may be anonymous and have no - name, but any user-created bitmaps may not be. There can be any number of - anonymous bitmaps per node. - -* The name of a user-created bitmap must not be empty (""). - -## Bitmap Modes - -* A Bitmap can be "frozen," which means that it is currently in-use by a backup - operation and cannot be deleted, renamed, written to, reset, - etc. - -* The normal operating mode for a bitmap is "active." - -## Basic QMP Usage - -### Supported Commands ### - -* block-dirty-bitmap-add -* block-dirty-bitmap-remove -* block-dirty-bitmap-clear - -### Creation - -* To create a new bitmap, enabled, on the drive with id=drive0: - -```json -{ "execute": "block-dirty-bitmap-add", - "arguments": { - "node": "drive0", - "name": "bitmap0" - } -} -``` - -* This bitmap will have a default granularity that matches the cluster size of - its associated drive, if available, clamped to between [4KiB, 64KiB]. - The current default for qcow2 is 64KiB. - -* To create a new bitmap that tracks changes in 32KiB segments: - -```json -{ "execute": "block-dirty-bitmap-add", - "arguments": { - "node": "drive0", - "name": "bitmap0", - "granularity": 32768 - } -} -``` - -### Deletion - -* Bitmaps that are frozen cannot be deleted. - -* Deleting the bitmap does not impact any other bitmaps attached to the same - node, nor does it affect any backups already created from this node. - -* Because bitmaps are only unique to the node to which they are attached, - you must specify the node/drive name here, too. - -```json -{ "execute": "block-dirty-bitmap-remove", - "arguments": { - "node": "drive0", - "name": "bitmap0" - } -} -``` - -### Resetting - -* Resetting a bitmap will clear all information it holds. - -* An incremental backup created from an empty bitmap will copy no data, - as if nothing has changed. - -```json -{ "execute": "block-dirty-bitmap-clear", - "arguments": { - "node": "drive0", - "name": "bitmap0" - } -} -``` - -## Transactions - -### Justification - -Bitmaps can be safely modified when the VM is paused or halted by using -the basic QMP commands. For instance, you might perform the following actions: - -1. Boot the VM in a paused state. -2. Create a full drive backup of drive0. -3. Create a new bitmap attached to drive0. -4. Resume execution of the VM. -5. Incremental backups are ready to be created. - -At this point, the bitmap and drive backup would be correctly in sync, -and incremental backups made from this point forward would be correctly aligned -to the full drive backup. - -This is not particularly useful if we decide we want to start incremental -backups after the VM has been running for a while, for which we will need to -perform actions such as the following: - -1. Boot the VM and begin execution. -2. Using a single transaction, perform the following operations: - * Create bitmap0. - * Create a full drive backup of drive0. -3. Incremental backups are now ready to be created. - -### Supported Bitmap Transactions - -* block-dirty-bitmap-add -* block-dirty-bitmap-clear - -The usages are identical to their respective QMP commands, but see below -for examples. - -### Example: New Incremental Backup - -As outlined in the justification, perhaps we want to create a new incremental -backup chain attached to a drive. - -```json -{ "execute": "transaction", - "arguments": { - "actions": [ - {"type": "block-dirty-bitmap-add", - "data": {"node": "drive0", "name": "bitmap0"} }, - {"type": "drive-backup", - "data": {"device": "drive0", "target": "/path/to/full_backup.img", - "sync": "full", "format": "qcow2"} } - ] - } -} -``` - -### Example: New Incremental Backup Anchor Point - -Maybe we just want to create a new full backup with an existing bitmap and -want to reset the bitmap to track the new chain. - -```json -{ "execute": "transaction", - "arguments": { - "actions": [ - {"type": "block-dirty-bitmap-clear", - "data": {"node": "drive0", "name": "bitmap0"} }, - {"type": "drive-backup", - "data": {"device": "drive0", "target": "/path/to/new_full_backup.img", - "sync": "full", "format": "qcow2"} } - ] - } -} -``` - -## Incremental Backups - -The star of the show. - -**Nota Bene!** Only incremental backups of entire drives are supported for now. -So despite the fact that you can attach a bitmap to any arbitrary node, they are -only currently useful when attached to the root node. This is because -drive-backup only supports drives/devices instead of arbitrary nodes. - -### Example: First Incremental Backup - -1. Create a full backup and sync it to the dirty bitmap, as in the transactional -examples above; or with the VM offline, manually create a full copy and then -create a new bitmap before the VM begins execution. - - * Let's assume the full backup is named 'full_backup.img'. - * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'. - -2. Create a destination image for the incremental backup that utilizes the -full backup as a backing image. - - * Let's assume it is named 'incremental.0.img'. - - ```sh - # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 - ``` - -3. Issue the incremental backup command: - - ```json - { "execute": "drive-backup", - "arguments": { - "device": "drive0", - "bitmap": "bitmap0", - "target": "incremental.0.img", - "format": "qcow2", - "sync": "incremental", - "mode": "existing" - } - } - ``` - -### Example: Second Incremental Backup - -1. Create a new destination image for the incremental backup that points to the - previous one, e.g.: 'incremental.1.img' - - ```sh - # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2 - ``` - -2. Issue a new incremental backup command. The only difference here is that we - have changed the target image below. - - ```json - { "execute": "drive-backup", - "arguments": { - "device": "drive0", - "bitmap": "bitmap0", - "target": "incremental.1.img", - "format": "qcow2", - "sync": "incremental", - "mode": "existing" - } - } - ``` - -## Errors - -* In the event of an error that occurs after a backup job is successfully - launched, either by a direct QMP command or a QMP transaction, the user - will receive a BLOCK_JOB_COMPLETE event with a failure message, accompanied - by a BLOCK_JOB_ERROR event. - -* In the case of an event being cancelled, the user will receive a - BLOCK_JOB_CANCELLED event instead of a pair of COMPLETE and ERROR events. - -* In either case, the incremental backup data contained within the bitmap is - safely rolled back, and the data within the bitmap is not lost. The image - file created for the failed attempt can be safely deleted. - -* Once the underlying problem is fixed (e.g. more storage space is freed up), - you can simply retry the incremental backup command with the same bitmap. - -### Example - -1. Create a target image: - - ```sh - # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 - ``` - -2. Attempt to create an incremental backup via QMP: - - ```json - { "execute": "drive-backup", - "arguments": { - "device": "drive0", - "bitmap": "bitmap0", - "target": "incremental.0.img", - "format": "qcow2", - "sync": "incremental", - "mode": "existing" - } - } - ``` - -3. Receive an event notifying us of failure: - - ```json - { "timestamp": { "seconds": 1424709442, "microseconds": 844524 }, - "data": { "speed": 0, "offset": 0, "len": 67108864, - "error": "No space left on device", - "device": "drive1", "type": "backup" }, - "event": "BLOCK_JOB_COMPLETED" } - ``` - -4. Delete the failed incremental, and re-create the image. - - ```sh - # rm incremental.0.img - # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 - ``` - -5. Retry the command after fixing the underlying problem, - such as freeing up space on the backup volume: - - ```json - { "execute": "drive-backup", - "arguments": { - "device": "drive0", - "bitmap": "bitmap0", - "target": "incremental.0.img", - "format": "qcow2", - "sync": "incremental", - "mode": "existing" - } - } - ``` - -6. Receive confirmation that the job completed successfully: - - ```json - { "timestamp": { "seconds": 1424709668, "microseconds": 526525 }, - "data": { "device": "drive1", "type": "backup", - "speed": 0, "len": 67108864, "offset": 67108864}, - "event": "BLOCK_JOB_COMPLETED" } - ``` - -### Partial Transactional Failures - -* Sometimes, a transaction will succeed in launching and return success, - but then later the backup jobs themselves may fail. It is possible that - a management application may have to deal with a partial backup failure - after a successful transaction. - -* If multiple backup jobs are specified in a single transaction, when one of - them fails, it will not interact with the other backup jobs in any way. - -* The job(s) that succeeded will clear the dirty bitmap associated with the - operation, but the job(s) that failed will not. It is not "safe" to delete - any incremental backups that were created successfully in this scenario, - even though others failed. - -#### Example - -* QMP example highlighting two backup jobs: - - ```json - { "execute": "transaction", - "arguments": { - "actions": [ - { "type": "drive-backup", - "data": { "device": "drive0", "bitmap": "bitmap0", - "format": "qcow2", "mode": "existing", - "sync": "incremental", "target": "d0-incr-1.qcow2" } }, - { "type": "drive-backup", - "data": { "device": "drive1", "bitmap": "bitmap1", - "format": "qcow2", "mode": "existing", - "sync": "incremental", "target": "d1-incr-1.qcow2" } }, - ] - } - } - ``` - -* QMP example response, highlighting one success and one failure: - * Acknowledgement that the Transaction was accepted and jobs were launched: - ```json - { "return": {} } - ``` - - * Later, QEMU sends notice that the first job was completed: - ```json - { "timestamp": { "seconds": 1447192343, "microseconds": 615698 }, - "data": { "device": "drive0", "type": "backup", - "speed": 0, "len": 67108864, "offset": 67108864 }, - "event": "BLOCK_JOB_COMPLETED" - } - ``` - - * Later yet, QEMU sends notice that the second job has failed: - ```json - { "timestamp": { "seconds": 1447192399, "microseconds": 683015 }, - "data": { "device": "drive1", "action": "report", - "operation": "read" }, - "event": "BLOCK_JOB_ERROR" } - ``` - - ```json - { "timestamp": { "seconds": 1447192399, "microseconds": 685853 }, - "data": { "speed": 0, "offset": 0, "len": 67108864, - "error": "Input/output error", - "device": "drive1", "type": "backup" }, - "event": "BLOCK_JOB_COMPLETED" } - -* In the above example, "d0-incr-1.qcow2" is valid and must be kept, - but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide - incremental backup of all drives at a point-in-time is to be made, - new backups for both drives will need to be made, taking into account - that a new incremental backup for drive0 needs to be based on top of - "d0-incr-1.qcow2." - -### Grouped Completion Mode - -* While jobs launched by transactions normally complete or fail on their own, - it is possible to instruct them to complete or fail together as a group. - -* QMP transactions take an optional properties structure that can affect - the semantics of the transaction. - -* The "completion-mode" transaction property can be either "individual" - which is the default, legacy behavior described above, or "grouped," - a new behavior detailed below. - -* Delayed Completion: In grouped completion mode, no jobs will report - success until all jobs are ready to report success. - -* Grouped failure: If any job fails in grouped completion mode, all remaining - jobs will be cancelled. Any incremental backups will restore their dirty - bitmap objects as if no backup command was ever issued. - - * Regardless of if QEMU reports a particular incremental backup job as - CANCELLED or as an ERROR, the in-memory bitmap will be restored. - -#### Example - -* Here's the same example scenario from above with the new property: - - ```json - { "execute": "transaction", - "arguments": { - "actions": [ - { "type": "drive-backup", - "data": { "device": "drive0", "bitmap": "bitmap0", - "format": "qcow2", "mode": "existing", - "sync": "incremental", "target": "d0-incr-1.qcow2" } }, - { "type": "drive-backup", - "data": { "device": "drive1", "bitmap": "bitmap1", - "format": "qcow2", "mode": "existing", - "sync": "incremental", "target": "d1-incr-1.qcow2" } }, - ], - "properties": { - "completion-mode": "grouped" - } - } - } - ``` - -* QMP example response, highlighting a failure for drive2: - * Acknowledgement that the Transaction was accepted and jobs were launched: - ```json - { "return": {} } - ``` - - * Later, QEMU sends notice that the second job has errored out, - but that the first job was also cancelled: - ```json - { "timestamp": { "seconds": 1447193702, "microseconds": 632377 }, - "data": { "device": "drive1", "action": "report", - "operation": "read" }, - "event": "BLOCK_JOB_ERROR" } - ``` - - ```json - { "timestamp": { "seconds": 1447193702, "microseconds": 640074 }, - "data": { "speed": 0, "offset": 0, "len": 67108864, - "error": "Input/output error", - "device": "drive1", "type": "backup" }, - "event": "BLOCK_JOB_COMPLETED" } - ``` - - ```json - { "timestamp": { "seconds": 1447193702, "microseconds": 640163 }, - "data": { "device": "drive0", "type": "backup", "speed": 0, - "len": 67108864, "offset": 16777216 }, - "event": "BLOCK_JOB_CANCELLED" } - ``` - - diff --git a/docs/blkdebug.txt b/docs/blkdebug.txt deleted file mode 100644 index 43d8e8f9c6..0000000000 --- a/docs/blkdebug.txt +++ /dev/null @@ -1,162 +0,0 @@ -Block I/O error injection using blkdebug ----------------------------------------- -Copyright (C) 2014-2015 Red Hat Inc - -This work is licensed under the terms of the GNU GPL, version 2 or later. See -the COPYING file in the top-level directory. - -The blkdebug block driver is a rule-based error injection engine. It can be -used to exercise error code paths in block drivers including ENOSPC (out of -space) and EIO. - -This document gives an overview of the features available in blkdebug. - -Background ----------- -Block drivers have many error code paths that handle I/O errors. Image formats -are especially complex since metadata I/O errors during cluster allocation or -while updating tables happen halfway through request processing and require -discipline to keep image files consistent. - -Error injection allows test cases to trigger I/O errors at specific points. -This way, all error paths can be tested to make sure they are correct. - -Rules ------ -The blkdebug block driver takes a list of "rules" that tell the error injection -engine when to fail an I/O request. - -Each I/O request is evaluated against the rules. If a rule matches the request -then its "action" is executed. - -Rules can be placed in a configuration file; the configuration file -follows the same .ini-like format used by QEMU's -readconfig option, and -each section of the file represents a rule. - -The following configuration file defines a single rule: - - $ cat blkdebug.conf - [inject-error] - event = "read_aio" - errno = "28" - -This rule fails all aio read requests with ENOSPC (28). Note that the errno -value depends on the host. On Linux, see -/usr/include/asm-generic/errno-base.h for errno values. - -Invoke QEMU as follows: - - $ qemu-system-x86_64 - -drive if=none,cache=none,file=blkdebug:blkdebug.conf:test.img,id=drive0 \ - -device virtio-blk-pci,drive=drive0,id=virtio-blk-pci0 - -Rules support the following attributes: - - event - which type of operation to match (e.g. read_aio, write_aio, - flush_to_os, flush_to_disk). See the "Events" section for - information on events. - - state - (optional) the engine must be in this state number in order for this - rule to match. See the "State transitions" section for information - on states. - - errno - the numeric errno value to return when a request matches this rule. - The errno values depend on the host since the numeric values are not - standarized in the POSIX specification. - - sector - (optional) a sector number that the request must overlap in order to - match this rule - - once - (optional, default "off") only execute this action on the first - matching request - - immediately - (optional, default "off") return a NULL BlockAIOCB - pointer and fail without an errno instead. This - exercises the code path where BlockAIOCB fails and the - caller's BlockCompletionFunc is not invoked. - -Events ------- -Block drivers provide information about the type of I/O request they are about -to make so rules can match specific types of requests. For example, the qcow2 -block driver tells blkdebug when it accesses the L1 table so rules can match -only L1 table accesses and not other metadata or guest data requests. - -The core events are: - - read_aio - guest data read - - write_aio - guest data write - - flush_to_os - write out unwritten block driver state (e.g. cached metadata) - - flush_to_disk - flush the host block device's disk cache - -See qapi/block-core.json:BlkdebugEvent for the full list of events. -You may need to grep block driver source code to understand the -meaning of specific events. - -State transitions ------------------ -There are cases where more power is needed to match a particular I/O request in -a longer sequence of requests. For example: - - write_aio - flush_to_disk - write_aio - -How do we match the 2nd write_aio but not the first? This is where state -transitions come in. - -The error injection engine has an integer called the "state" that always starts -initialized to 1. The state integer is internal to blkdebug and cannot be -observed from outside but rules can interact with it for powerful matching -behavior. - -Rules can be conditional on the current state and they can transition to a new -state. - -When a rule's "state" attribute is non-zero then the current state must equal -the attribute in order for the rule to match. - -For example, to match the 2nd write_aio: - - [set-state] - event = "write_aio" - state = "1" - new_state = "2" - - [inject-error] - event = "write_aio" - state = "2" - errno = "5" - -The first write_aio request matches the set-state rule and transitions from -state 1 to state 2. Once state 2 has been entered, the set-state rule no -longer matches since it requires state 1. But the inject-error rule now -matches the next write_aio request and injects EIO (5). - -State transition rules support the following attributes: - - event - which type of operation to match (e.g. read_aio, write_aio, - flush_to_os, flush_to_disk). See the "Events" section for - information on events. - - state - (optional) the engine must be in this state number in order for this - rule to match - - new_state - transition to this state number - -Suspend and resume ------------------- -Exercising code paths in block drivers may require specific ordering amongst -concurrent requests. The "breakpoint" feature allows requests to be halted on -a blkdebug event and resumed later. This makes it possible to achieve -deterministic ordering when multiple requests are in flight. - -Breakpoints on blkdebug events are associated with a user-defined "tag" string. -This tag serves as an identifier by which the request can be resumed at a later -point. - -See the qemu-io(1) break, resume, remove_break, and wait_break commands for -details. diff --git a/docs/blkverify.txt b/docs/blkverify.txt deleted file mode 100644 index d556dc4e6d..0000000000 --- a/docs/blkverify.txt +++ /dev/null @@ -1,69 +0,0 @@ -= Block driver correctness testing with blkverify = - -== Introduction == - -This document describes how to use the blkverify protocol to test that a block -driver is operating correctly. - -It is difficult to test and debug block drivers against real guests. Often -processes inside the guest will crash because corrupt sectors were read as part -of the executable. Other times obscure errors are raised by a program inside -the guest. These issues are extremely hard to trace back to bugs in the block -driver. - -Blkverify solves this problem by catching data corruption inside QEMU the first -time bad data is read and reporting the disk sector that is corrupted. - -== How it works == - -The blkverify protocol has two child block devices, the "test" device and the -"raw" device. Read/write operations are mirrored to both devices so their -state should always be in sync. - -The "raw" device is a raw image, a flat file, that has identical starting -contents to the "test" image. The idea is that the "raw" device will handle -read/write operations correctly and not corrupt data. It can be used as a -reference for comparison against the "test" device. - -After a mirrored read operation completes, blkverify will compare the data and -raise an error if it is not identical. This makes it possible to catch the -first instance where corrupt data is read. - -== Example == - -Imagine raw.img has 0xcd repeated throughout its first sector: - - $ ./qemu-io -c 'read -v 0 512' raw.img - 00000000: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ - 00000010: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ - [...] - 000001e0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ - 000001f0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ - read 512/512 bytes at offset 0 - 512.000000 bytes, 1 ops; 0.0000 sec (97.656 MiB/sec and 200000.0000 ops/sec) - -And test.img is corrupt, its first sector is zeroed when it shouldn't be: - - $ ./qemu-io -c 'read -v 0 512' test.img - 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ - 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ - [...] - 000001e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ - 000001f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ - read 512/512 bytes at offset 0 - 512.000000 bytes, 1 ops; 0.0000 sec (81.380 MiB/sec and 166666.6667 ops/sec) - -This error is caught by blkverify: - - $ ./qemu-io -c 'read 0 512' blkverify:a.img:b.img - blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0 - -A more realistic scenario is verifying the installation of a guest OS: - - $ ./qemu-img create raw.img 16G - $ ./qemu-img create -f qcow2 test.qcow2 16G - $ x86_64-softmmu/qemu-system-x86_64 -cdrom debian.iso \ - -drive file=blkverify:raw.img:test.qcow2 - -If the installation is aborted when blkverify detects corruption, use qemu-io -to explore the contents of the disk image at the sector in question. diff --git a/docs/build-system.txt b/docs/build-system.txt deleted file mode 100644 index 2af1e668c5..0000000000 --- a/docs/build-system.txt +++ /dev/null @@ -1,512 +0,0 @@ - The QEMU build system architecture - ================================== - -This document aims to help developers understand the architecture of the -QEMU build system. As with projects using GNU autotools, the QEMU build -system has two stages, first the developer runs the "configure" script -to determine the local build environment characteristics, then they run -"make" to build the project. There is about where the similarities with -GNU autotools end, so try to forget what you know about them. - - -Stage 1: configure -================== - -The QEMU configure script is written directly in shell, and should be -compatible with any POSIX shell, hence it uses #!/bin/sh. An important -implication of this is that it is important to avoid using bash-isms on -development platforms where bash is the primary host. - -In contrast to autoconf scripts, QEMU's configure is expected to be -silent while it is checking for features. It will only display output -when an error occurs, or to show the final feature enablement summary -on completion. - -Adding new checks to the configure script usually comprises the -following tasks: - - - Initialize one or more variables with the default feature state. - - Ideally features should auto-detect whether they are present, - so try to avoid hardcoding the initial state to either enabled - or disabled, as that forces the user to pass a --enable-XXX - / --disable-XXX flag on every invocation of configure. - - - Add support to the command line arg parser to handle any new - --enable-XXX / --disable-XXX flags required by the feature XXX. - - - Add information to the help output message to report on the new - feature flag. - - - Add code to perform the actual feature check. As noted above, try to - be fully dynamic in checking enablement/disablement. - - - Add code to print out the feature status in the configure summary - upon completion. - - - Add any new makefile variables to $config_host_mak on completion. - - -Taking (a simplified version of) the probe for gnutls from configure, -we have the following pieces: - - # Initial variable state - gnutls="" - - ..snip.. - - # Configure flag processing - --disable-gnutls) gnutls="no" - ;; - --enable-gnutls) gnutls="yes" - ;; - - ..snip.. - - # Help output feature message - gnutls GNUTLS cryptography support - - ..snip.. - - # Test for gnutls - if test "$gnutls" != "no"; then - if ! $pkg_config --exists "gnutls"; then - gnutls_cflags=`$pkg_config --cflags gnutls` - gnutls_libs=`$pkg_config --libs gnutls` - libs_softmmu="$gnutls_libs $libs_softmmu" - libs_tools="$gnutls_libs $libs_tools" - QEMU_CFLAGS="$QEMU_CFLAGS $gnutls_cflags" - gnutls="yes" - elif test "$gnutls" = "yes"; then - feature_not_found "gnutls" "Install gnutls devel" - else - gnutls="no" - fi - fi - - ..snip.. - - # Completion feature summary - echo "GNUTLS support $gnutls" - - ..snip.. - - # Define make variables - if test "$gnutls" = "yes" ; then - echo "CONFIG_GNUTLS=y" >> $config_host_mak - fi - - -Helper functions ----------------- - -The configure script provides a variety of helper functions to assist -developers in checking for system features: - - - do_cc $ARGS... - - Attempt to run the system C compiler passing it $ARGS... - - - do_cxx $ARGS... - - Attempt to run the system C++ compiler passing it $ARGS... - - - compile_object $CFLAGS - - Attempt to compile a test program with the system C compiler using - $CFLAGS. The test program must have been previously written to a file - called $TMPC. - - - compile_prog $CFLAGS $LDFLAGS - - Attempt to compile a test program with the system C compiler using - $CFLAGS and link it with the system linker using $LDFLAGS. The test - program must have been previously written to a file called $TMPC. - - - has $COMMAND - - Determine if $COMMAND exists in the current environment, either as a - shell builtin, or executable binary, returning 0 on success. - - - path_of $COMMAND - - Return the fully qualified path of $COMMAND, printing it to stdout, - and returning 0 on success. - - - check_define $NAME - - Determine if the macro $NAME is defined by the system C compiler - - - check_include $NAME - - Determine if the include $NAME file is available to the system C - compiler - - - write_c_skeleton - - Write a minimal C program main() function to the temporary file - indicated by $TMPC - - - feature_not_found $NAME $REMEDY - - Print a message to stderr that the feature $NAME was not available - on the system, suggesting the user try $REMEDY to address the - problem. - - - error_exit $MESSAGE $MORE... - - Print $MESSAGE to stderr, followed by $MORE... and then exit from the - configure script with non-zero status - - - query_pkg_config $ARGS... - - Run pkg-config passing it $ARGS. If QEMU is doing a static build, - then --static will be automatically added to $ARGS - - -Stage 2: makefiles -================== - -The use of GNU make is required with the QEMU build system. - -Although the source code is spread across multiple subdirectories, the -build system should be considered largely non-recursive in nature, in -contrast to common practices seen with automake. There is some recursive -invocation of make, but this is related to the things being built, -rather than the source directory structure. - -QEMU currently supports both VPATH and non-VPATH builds, so there are -three general ways to invoke configure & perform a build. - - - VPATH, build artifacts outside of QEMU source tree entirely - - cd ../ - mkdir build - cd build - ../qemu/configure - make - - - VPATH, build artifacts in a subdir of QEMU source tree - - mkdir build - cd build - ../configure - make - - - non-VPATH, build artifacts everywhere - - ./configure - make - -The QEMU maintainers generally recommend that a VPATH build is used by -developers. Patches to QEMU are expected to ensure VPATH build still -works. - - -Module structure ----------------- - -There are a number of key outputs of the QEMU build system: - - - Tools - qemu-img, qemu-nbd, qga (guest agent), etc - - System emulators - qemu-system-$ARCH - - Userspace emulators - qemu-$ARCH - - Unit tests - -The source code is highly modularized, split across many files to -facilitate building of all of these components with as little duplicated -compilation as possible. There can be considered to be two distinct -groups of files, those which are independent of the QEMU emulation -target and those which are dependent on the QEMU emulation target. - -In the target-independent set lives various general purpose helper code, -such as error handling infrastructure, standard data structures, -platform portability wrapper functions, etc. This code can be compiled -once only and the .o files linked into all output binaries. - -In the target-dependent set lives CPU emulation, device emulation and -much glue code. This sometimes also has to be compiled multiple times, -once for each target being built. - -The utility code that is used by all binaries is built into a -static archive called libqemuutil.a, which is then linked to all the -binaries. In order to provide hooks that are only needed by some of the -binaries, code in libqemuutil.a may depend on other functions that are -not fully implemented by all QEMU binaries. To deal with this there is a -second library called libqemustub.a which provides dummy stubs for all -these functions. These will get lazy linked into the binary if the real -implementation is not present. In this way, the libqemustub.a static -library can be thought of as a portable implementation of the weak -symbols concept. All binaries should link to both libqemuutil.a and -libqemustub.a. e.g. - - qemu-img$(EXESUF): qemu-img.o ..snip.. libqemuutil.a libqemustub.a - - -Windows platform portability ----------------------------- - -On Windows, all binaries have the suffix '.exe', so all Makefile rules -which create binaries must include the $(EXESUF) variable on the binary -name. e.g. - - qemu-img$(EXESUF): qemu-img.o ..snip.. - -This expands to '.exe' on Windows, or '' on other platforms. - -A further complication for the system emulator binaries is that -two separate binaries need to be generated. - -The main binary (e.g. qemu-system-x86_64.exe) is linked against the -Windows console runtime subsystem. These are expected to be run from a -command prompt window, and so will print stderr to the console that -launched them. - -The second binary generated has a 'w' on the end of its name (e.g. -qemu-system-x86_64w.exe) and is linked against the Windows graphical -runtime subsystem. These are expected to be run directly from the -desktop and will open up a dedicated console window for stderr output. - -The Makefile.target will generate the binary for the graphical subsystem -first, and then use objcopy to relink it against the console subsystem -to generate the second binary. - - -Object variable naming ----------------------- - -The QEMU convention is to define variables to list different groups of -object files. These are named with the convention $PREFIX-obj-y. For -example the libqemuutil.a file will be linked with all objects listed -in a variable 'util-obj-y'. So, for example, util/Makefile.obj will -contain a set of definitions looking like - - util-obj-y += bitmap.o bitops.o hbitmap.o - util-obj-y += fifo8.o - util-obj-y += acl.o - util-obj-y += error.o qemu-error.o - -When there is an object file which needs to be conditionally built based -on some characteristic of the host system, the configure script will -define a variable for the conditional. For example, on Windows it will -define $(CONFIG_POSIX) with a value of 'n' and $(CONFIG_WIN32) with a -value of 'y'. It is now possible to use the config variables when -listing object files. For example, - - util-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o - util-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o - -On Windows this expands to - - util-obj-y += oslib-win32.o qemu-thread-win32.o - util-obj-n += oslib-posix.o qemu-thread-posix.o - -Since libqemutil.a links in $(util-obj-y), the POSIX specific files -listed against $(util-obj-n) are ignored on the Windows platform builds. - - -CFLAGS / LDFLAGS / LIBS handling --------------------------------- - -There are many different binaries being built with differing purposes, -and some of them might even be 3rd party libraries pulled in via git -submodules. As such the use of the global CFLAGS variable is generally -avoided in QEMU, since it would apply to too many build targets. - -Flags that are needed by any QEMU code (i.e. everything *except* GIT -submodule projects) are put in $(QEMU_CFLAGS) variable. For linker -flags the $(LIBS) variable is sometimes used, but a couple of more -targeted variables are preferred. $(libs_softmmu) is used for -libraries that must be linked to system emulator targets, $(LIBS_TOOLS) -is used for tools like qemu-img, qemu-nbd, etc and $(LIBS_QGA) is used -for the QEMU guest agent. There is currently no specific variable for -the userspace emulator targets as the global $(LIBS), or more targeted -variables shown below, are sufficient. - -In addition to these variables, it is possible to provide cflags and -libs against individual source code files, by defining variables of the -form $FILENAME-cflags and $FILENAME-libs. For example, the curl block -driver needs to link to the libcurl library, so block/Makefile defines -some variables: - - curl.o-cflags := $(CURL_CFLAGS) - curl.o-libs := $(CURL_LIBS) - -The scope is a little different between the two variables. The libs get -used when linking any target binary that includes the curl.o object -file, while the cflags get used when compiling the curl.c file only. - - -Statically defined files ------------------------- - -The following key files are statically defined in the source tree, with -the rules needed to build QEMU. Their behaviour is influenced by a -number of dynamically created files listed later. - -- Makefile - -The main entry point used when invoking make to build all the components -of QEMU. The default 'all' target will naturally result in the build of -every component. The various tools and helper binaries are built -directly via a non-recursive set of rules. - -Each system/userspace emulation target needs to have a slightly -different set of make rules / variables. Thus, make will be recursively -invoked for each of the emulation targets. - -The recursive invocation will end up processing the toplevel -Makefile.target file (more on that later). - - -- */Makefile.objs - -Since the source code is spread across multiple directories, the rules -for each file are similarly modularized. Thus each subdirectory -containing .c files will usually also contain a Makefile.objs file. -These files are not directly invoked by a recursive make, but instead -they are imported by the top level Makefile and/or Makefile.target - -Each Makefile.objs usually just declares a set of variables listing the -.o files that need building from the source files in the directory. They -will also define any custom linker or compiler flags. For example in -block/Makefile.objs - - block-obj-$(CONFIG_LIBISCSI) += iscsi.o - block-obj-$(CONFIG_CURL) += curl.o - - ..snip... - - iscsi.o-cflags := $(LIBISCSI_CFLAGS) - iscsi.o-libs := $(LIBISCSI_LIBS) - curl.o-cflags := $(CURL_CFLAGS) - curl.o-libs := $(CURL_LIBS) - -If there are any rules defined in the Makefile.objs file, they should -all use $(obj) as a prefix to the target, e.g. - - $(obj)/generated-tcg-tracers.h: $(obj)/generated-tcg-tracers.h-timestamp - - -- Makefile.target - -This file provides the entry point used to build each individual system -or userspace emulator target. Each enabled target has its own -subdirectory. For example if configure is run with the argument -'--target-list=x86_64-softmmu', then a sub-directory 'x86_64-softmu' -will be created, containing a 'Makefile' which symlinks back to -Makefile.target - -So when the recursive '$(MAKE) -C x86_64-softmmu' is invoked, it ends up -using Makefile.target for the build rules. - - -- rules.mak - -This file provides the generic helper rules for invoking build tools, in -particular the compiler and linker. This also contains the magic (hairy) -'unnest-vars' function which is used to merge the variable definitions -from all Makefile.objs in the source tree down into the main Makefile -context. - - -- default-configs/*.mak - -The files under default-configs/ control what emulated hardware is built -into each QEMU system and userspace emulator targets. They merely -contain a long list of config variable definitions. For example, -default-configs/x86_64-softmmu.mak has: - - include pci.mak - include sound.mak - include usb.mak - CONFIG_QXL=$(CONFIG_SPICE) - CONFIG_VGA_ISA=y - CONFIG_VGA_CIRRUS=y - CONFIG_VMWARE_VGA=y - CONFIG_VIRTIO_VGA=y - ...snip... - -These files rarely need changing unless new devices / hardware need to -be enabled for a particular system/userspace emulation target - - -- tests/Makefile - -Rules for building the unit tests. This file is included directly by the -top level Makefile, so anything defined in this file will influence the -entire build system. Care needs to be taken when writing rules for tests -to ensure they only apply to the unit test execution / build. - -- tests/docker/Makefile.include - -Rules for Docker tests. Like tests/Makefile, this file is included -directly by the top level Makefile, anything defined in this file will -influence the entire build system. - -- po/Makefile - -Rules for building and installing the binary message catalogs from the -text .po file sources. This almost never needs changing for any reason. - - -Dynamically created files -------------------------- - -The following files are generated dynamically by configure in order to -control the behaviour of the statically defined makefiles. This avoids -the need for QEMU makefiles to go through any pre-processing as seen -with autotools, where Makefile.am generates Makefile.in which generates -Makefile. - - -- config-host.mak - -When configure has determined the characteristics of the build host it -will write a long list of variables to config-host.mak file. This -provides the various install directories, compiler / linker flags and a -variety of CONFIG_* variables related to optionally enabled features. -This is imported by the top level Makefile in order to tailor the build -output. - -The variables defined here are those which are applicable to all QEMU -build outputs. Variables which are potentially different for each -emulator target are defined by the next file... - -It is also used as a dependency checking mechanism. If make sees that -the modification timestamp on configure is newer than that on -config-host.mak, then configure will be re-run. - - -- config-host.h - -The config-host.h file is used by source code to determine what features -are enabled. It is generated from the contents of config-host.mak using -the scripts/create_config program. This extracts all the CONFIG_* variables, -most of the HOST_* variables and a few other misc variables from -config-host.mak, formatting them as C preprocessor macros. - - -- $TARGET-NAME/config-target.mak - -TARGET-NAME is the name of a system or userspace emulator, for example, -x86_64-softmmu denotes the system emulator for the x86_64 architecture. -This file contains the variables which need to vary on a per-target -basis. For example, it will indicate whether KVM or Xen are enabled for -the target and any other potential custom libraries needed for linking -the target. - - -- $TARGET-NAME/config-devices.mak - -TARGET-NAME is again the name of a system or userspace emulator. The -config-devices.mak file is automatically generated by make using the -scripts/make_device_config.sh program, feeding it the -default-configs/$TARGET-NAME file as input. - - -- $TARGET-NAME/Makefile - -This is the entrypoint used when make recurses to build a single system -or userspace emulator target. It is merely a symlink back to the -Makefile.target in the top level. diff --git a/docs/config/ich9-ehci-uhci.cfg b/docs/config/ich9-ehci-uhci.cfg new file mode 100644 index 0000000000..a0e9b96f4d --- /dev/null +++ b/docs/config/ich9-ehci-uhci.cfg @@ -0,0 +1,37 @@ +########################################################################### +# +# You can pass this file directly to qemu using the -readconfig +# command line switch. +# +# This config file creates a EHCI adapter with companion UHCI +# controllers as multifunction device in PCI slot "1d". +# +# Specify "bus=ehci.0" when creating usb devices to hook them up +# there. +# + +[device "ehci"] + driver = "ich9-usb-ehci1" + addr = "1d.7" + multifunction = "on" + +[device "uhci-1"] + driver = "ich9-usb-uhci1" + addr = "1d.0" + multifunction = "on" + masterbus = "ehci.0" + firstport = "0" + +[device "uhci-2"] + driver = "ich9-usb-uhci2" + addr = "1d.1" + multifunction = "on" + masterbus = "ehci.0" + firstport = "2" + +[device "uhci-3"] + driver = "ich9-usb-uhci3" + addr = "1d.2" + multifunction = "on" + masterbus = "ehci.0" + firstport = "4" diff --git a/docs/config/mach-virt-graphical.cfg b/docs/config/mach-virt-graphical.cfg new file mode 100644 index 0000000000..0fdf6846dd --- /dev/null +++ b/docs/config/mach-virt-graphical.cfg @@ -0,0 +1,281 @@ +# mach-virt - VirtIO guest (graphical console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-aarch64 \ +# -nodefaults \ +# -readconfig mach-virt-graphical.cfg \ +# -cpu host +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of VirtIO devices +# tailored towards optimal performance with modern guests, +# and will be accessed through a graphical console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals, +# such as the PL011 UART, plus a PCI Express Root Bus; the +# user will then have to explicitly add further devices. +# +# The PCI Express Root Bus shows up in the guest as: +# +# 00:00.0 Host bridge +# +# This configuration file adds a number of other useful +# devices, more specifically: +# +# 00:01.0 Display controller +# 00.1c.* PCI bridge (PCI Express Root Ports) +# 01:00.0 SCSI storage controller +# 02:00.0 Ethernet controller +# 03:00.0 USB controller +# +# More information about these devices is available below. + + +# Machine options +# ========================================================= +# +# We use the virt machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. +# +# Unfortunately, there is no way to configure the CPU model +# in this file, so it will have to be provided on the +# command line, but we can configure the guest to use the +# same GIC version as the host. + +[machine] + type = "virt" + accel = "kvm" + gic-version = "host" + +[memory] + size = "1024" + + +# Firmware configuration +# ========================================================= +# +# There are two parts to the firmware: a read-only image +# containing the executable code, which is shared between +# guests, and a read/write variable store that is owned +# by one specific guest, exclusively, and is used to +# record information such as the UEFI boot order. +# +# For any new guest, its permanent, private variable store +# should initially be copied from the template file +# provided along with the firmware binary. +# +# Depending on the OS distribution you're using on the +# host, the name of the package containing the firmware +# binary and variable store template, as well as the paths +# to the files themselves, will be different. For example: +# +# Fedora +# edk2-aarch64 (pkg) +# /usr/share/edk2/aarch64/QEMU_EFI-pflash.raw (bin) +# /usr/share/edk2/aarch64/vars-template-pflash.raw (var) +# +# RHEL +# AAVMF (pkg) +# /usr/share/AAVMF/AAVMF_CODE.fd (bin) +# /usr/share/AAVMF/AAVMF_VARS.fd (var) +# +# Debian/Ubuntu +# qemu-efi (pkg) +# /usr/share/AAVMF/AAVMF_CODE.fd (bin) +# /usr/share/AAVMF/AAVMF_VARS.fd (var) + +[drive "uefi-binary"] + file = "/usr/share/AAVMF/AAVMF_CODE.fd" # CHANGE ME + format = "raw" + if = "pflash" + unit = "0" + readonly = "on" + +[drive "uefi-varstore"] + file = "guest_VARS.fd" # CHANGE ME + format = "raw" + if = "pflash" + unit = "1" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We create eight PCI Express Root Ports, and we plug them +# all into separate functions of the same slot. Some of +# them will be used by devices, the rest will remain +# available for hotplug. + +[device "pcie.1"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + multifunction = "on" + +[device "pcie.2"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "pcie.3"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "pcie.4"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +[device "pcie.5"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.4" + port = "5" + chassis = "5" + +[device "pcie.6"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.5" + port = "6" + chassis = "6" + +[device "pcie.7"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.6" + port = "7" + chassis = "7" + +[device "pcie.8"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.7" + port = "8" + chassis = "8" + + +# SCSI storage controller (and storage) +# ========================================================= +# +# We use virtio-scsi here so that we can (hot)plug a large +# number of disks without running into issues; a SCSI disk, +# backed by a qcow2 disk image on the host's filesystem, is +# attached to it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been succesfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest you can safely comment out +# all relevant sections below. + +[device "scsi"] + driver = "virtio-scsi-pci" + bus = "pcie.1" + addr = "00.0" + +[device "scsi-disk"] + driver = "scsi-hd" + bus = "scsi.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "scsi-optical-disk"] + driver = "scsi-cd" + bus = "scsi.0" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# Ethernet controller +# ========================================================= +# +# We use virtio-net for improved performance over emulated +# hardware; on the host side, we take advantage of user +# networking so that the QEMU process doesn't require any +# additional privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "virtio-net-pci" + netdev = "hostnet" + bus = "pcie.2" + addr = "00.0" + + +# USB controller (and input devices) +# ========================================================= +# +# We add a virtualization-friendly USB 3.0 controller and +# a USB keyboard / USB tablet combo so that graphical +# guests can be controlled appropriately. + +[device "usb"] + driver = "nec-usb-xhci" + bus = "pcie.3" + addr = "00.0" + +[device "keyboard"] + driver = "usb-kbd" + bus = "usb.0" + +[device "tablet"] + driver = "usb-tablet" + bus = "usb.0" + + +# Display controller +# ========================================================= +# +# We use virtio-gpu because the legacy VGA framebuffer is +# very troublesome on aarch64, and virtio-gpu is the only +# video device that doesn't implement it. +# +# If you're running the guest on a remote, potentially +# headless host, you will probably want to append something +# like +# +# -display vnc=127.0.0.1:0 +# +# to the command line in order to prevent QEMU from +# creating a graphical display window on the host and +# enable remote access instead. + +[device "video"] + driver = "virtio-gpu" + bus = "pcie.0" + addr = "01.0" diff --git a/docs/config/mach-virt-serial.cfg b/docs/config/mach-virt-serial.cfg new file mode 100644 index 0000000000..aee9f1c5a1 --- /dev/null +++ b/docs/config/mach-virt-serial.cfg @@ -0,0 +1,243 @@ +# mach-virt - VirtIO guest (serial console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-aarch64 \ +# -nodefaults \ +# -readconfig mach-virt-serial.cfg \ +# -display none -serial mon:stdio \ +# -cpu host +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of VirtIO devices +# tailored towards optimal performance with modern guests, +# and will be accessed through the serial console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals, +# such as the PL011 UART, plus a PCI Express Root Bus; the +# user will then have to explicitly add further devices. +# +# The PCI Express Root Bus shows up in the guest as: +# +# 00:00.0 Host bridge +# +# This configuration file adds a number of other useful +# devices, more specifically: +# +# 00.1c.* PCI bridge (PCI Express Root Ports) +# 01:00.0 SCSI storage controller +# 02:00.0 Ethernet controller +# +# More information about these devices is available below. +# +# We use '-display none' to prevent QEMU from creating a +# graphical display window, which would serve no use in +# this specific configuration, and '-serial mon:stdio' to +# multiplex the guest's serial console and the QEMU monitor +# to the host's stdio; use 'Ctrl+A h' to learn how to +# switch between the two and more. + + +# Machine options +# ========================================================= +# +# We use the virt machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. +# +# Unfortunately, there is no way to configure the CPU model +# in this file, so it will have to be provided on the +# command line, but we can configure the guest to use the +# same GIC version as the host. + +[machine] + type = "virt" + accel = "kvm" + gic-version = "host" + +[memory] + size = "1024" + + +# Firmware configuration +# ========================================================= +# +# There are two parts to the firmware: a read-only image +# containing the executable code, which is shared between +# guests, and a read/write variable store that is owned +# by one specific guest, exclusively, and is used to +# record information such as the UEFI boot order. +# +# For any new guest, its permanent, private variable store +# should initially be copied from the template file +# provided along with the firmware binary. +# +# Depending on the OS distribution you're using on the +# host, the name of the package containing the firmware +# binary and variable store template, as well as the paths +# to the files themselves, will be different. For example: +# +# Fedora +# edk2-aarch64 (pkg) +# /usr/share/edk2/aarch64/QEMU_EFI-pflash.raw (bin) +# /usr/share/edk2/aarch64/vars-template-pflash.raw (var) +# +# RHEL +# AAVMF (pkg) +# /usr/share/AAVMF/AAVMF_CODE.fd (bin) +# /usr/share/AAVMF/AAVMF_VARS.fd (var) +# +# Debian/Ubuntu +# qemu-efi (pkg) +# /usr/share/AAVMF/AAVMF_CODE.fd (bin) +# /usr/share/AAVMF/AAVMF_VARS.fd (var) + +[drive "uefi-binary"] + file = "/usr/share/AAVMF/AAVMF_CODE.fd" # CHANGE ME + format = "raw" + if = "pflash" + unit = "0" + readonly = "on" + +[drive "uefi-varstore"] + file = "guest_VARS.fd" # CHANGE ME + format = "raw" + if = "pflash" + unit = "1" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We create eight PCI Express Root Ports, and we plug them +# all into separate functions of the same slot. Some of +# them will be used by devices, the rest will remain +# available for hotplug. + +[device "pcie.1"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + multifunction = "on" + +[device "pcie.2"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "pcie.3"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "pcie.4"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +[device "pcie.5"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.4" + port = "5" + chassis = "5" + +[device "pcie.6"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.5" + port = "6" + chassis = "6" + +[device "pcie.7"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.6" + port = "7" + chassis = "7" + +[device "pcie.8"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.7" + port = "8" + chassis = "8" + + +# SCSI storage controller (and storage) +# ========================================================= +# +# We use virtio-scsi here so that we can (hot)plug a large +# number of disks without running into issues; a SCSI disk, +# backed by a qcow2 disk image on the host's filesystem, is +# attached to it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been succesfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest you can safely comment out +# all relevant sections below. + +[device "scsi"] + driver = "virtio-scsi-pci" + bus = "pcie.1" + addr = "00.0" + +[device "scsi-disk"] + driver = "scsi-hd" + bus = "scsi.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "scsi-optical-disk"] + driver = "scsi-cd" + bus = "scsi.0" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# Ethernet controller +# ========================================================= +# +# We use virtio-net for improved performance over emulated +# hardware; on the host side, we take advantage of user +# networking so that the QEMU process doesn't require any +# additional privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "virtio-net-pci" + netdev = "hostnet" + bus = "pcie.2" + addr = "00.0" diff --git a/docs/config/q35-emulated.cfg b/docs/config/q35-emulated.cfg new file mode 100644 index 0000000000..c6416d6545 --- /dev/null +++ b/docs/config/q35-emulated.cfg @@ -0,0 +1,288 @@ +# q35 - Emulated guest (graphical console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-x86_64 \ +# -nodefaults \ +# -readconfig q35-emulated.cfg +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of emulated devices that +# closely resembles that of a physical machine, and will be +# accessed through a graphical console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals +# plus a small selection of core PCI devices and +# controllers; the user will then have to explicitly add +# further devices. +# +# The core PCI devices show up in the guest as: +# +# 00:00.0 Host bridge +# 00:1f.0 ISA bridge / LPC +# 00:1f.2 SATA (AHCI) controller +# 00:1f.3 SMBus controller +# +# This configuration file adds a number of devices that +# are pretty much guaranteed to be present in every single +# physical machine based on q35, more specifically: +# +# 00:01.0 VGA compatible controller +# 00:19.0 Ethernet controller +# 00:1a.* USB controller (#2) +# 00:1b.0 Audio device +# 00:1c.* PCI bridge (PCI Express Root Ports) +# 00:1d.* USB Controller (#1) +# 00:1e.0 PCI bridge (legacy PCI bridge) +# +# More information about these devices is available below. + + +# Machine options +# ========================================================= +# +# We use the q35 machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. +# +# Unfortunately, there is no way to configure the CPU model +# in this file, so it will have to be provided on the +# command line. + +[machine] + type = "q35" + accel = "kvm" + +[memory] + size = "1024" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We add four PCI Express Root Ports, all sharing the same +# slot on the PCI Express Root Bus. These ports support +# hotplug. + +[device "ich9-pcie-port-1"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + +[device "ich9-pcie-port-2"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "ich9-pcie-port-3"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "ich9-pcie-port-4"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + + +# PCI bridge (legacy PCI bridge) +# ========================================================= +# +# This bridge can be used to build an independent topology +# for legacy PCI devices. PCI Express devices should be +# plugged into PCI Express slots instead, so ideally there +# will be no devices connected to this bridge. + +[device "ich9-pci-bridge"] + driver = "i82801b11-bridge" + bus = "pcie.0" + addr = "1e.0" + + +# SATA storage +# ========================================================= +# +# An implicit SATA controller is created automatically for +# every single q35 guest; here we create a disk, backed by +# a qcow2 disk image on the host's filesystem, and attach +# it to that controller so that the guest can use it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been succesfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest you can safely comment out +# all relevant sections below. + +[device "sata-disk"] + driver = "ide-hd" + bus = "ide.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "sata-optical-disk"] + driver = "ide-cd" + bus = "ide.1" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# USB controller (#1) +# ========================================================= +# +# EHCI controller + UHCI companion controllers. + +[device "ich9-ehci-1"] + driver = "ich9-usb-ehci1" + multifunction = "on" + bus = "pcie.0" + addr = "1d.7" + +[device "ich9-uhci-1"] + driver = "ich9-usb-uhci1" + multifunction = "on" + bus = "pcie.0" + addr = "1d.0" + masterbus = "ich9-ehci-1.0" + firstport = "0" + +[device "ich9-uhci-2"] + driver = "ich9-usb-uhci2" + multifunction = "on" + bus = "pcie.0" + addr = "1d.1" + masterbus = "ich9-ehci-1.0" + firstport = "2" + +[device "ich9-uhci-3"] + driver = "ich9-usb-uhci3" + multifunction = "on" + bus = "pcie.0" + addr = "1d.2" + masterbus = "ich9-ehci-1.0" + firstport = "4" + + +# USB controller (#2) +# ========================================================= +# +# EHCI controller + UHCI companion controllers. + +[device "ich9-ehci-2"] + driver = "ich9-usb-ehci2" + multifunction = "on" + bus = "pcie.0" + addr = "1a.7" + +[device "ich9-uhci-4"] + driver = "ich9-usb-uhci4" + multifunction = "on" + bus = "pcie.0" + addr = "1a.0" + masterbus = "ich9-ehci-2.0" + firstport = "0" + +[device "ich9-uhci-5"] + driver = "ich9-usb-uhci5" + multifunction = "on" + bus = "pcie.0" + addr = "1a.1" + masterbus = "ich9-ehci-2.0" + firstport = "2" + +[device "ich9-uhci-6"] + driver = "ich9-usb-uhci6" + multifunction = "on" + bus = "pcie.0" + addr = "1a.2" + masterbus = "ich9-ehci-2.0" + firstport = "4" + + +# Ethernet controller +# ========================================================= +# +# We add a Gigabit Ethernet interface to the guest; on the +# host side, we take advantage of user networking so that +# the QEMU process doesn't require any additional +# privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "e1000" + netdev = "hostnet" + bus = "pcie.0" + addr = "19.0" + + +# VGA compatible controller +# ========================================================= +# +# We use stdvga instead of Cirrus as it supports more video +# modes and is closer to what actual hardware looks like. +# +# If you're running the guest on a remote, potentially +# headless host, you will probably want to append something +# like +# +# -display vnc=127.0.0.1:0 +# +# to the command line in order to prevent QEMU from +# creating a graphical display window on the host and +# enable remote access instead. + +[device "video"] + driver = "VGA" + bus = "pcie.0" + addr = "01.0" + + +# Audio device +# ========================================================= +# +# The sound card is a legacy PCI device that is plugged +# directly into the PCI Express Root Bus. + +[device "ich9-hda-audio"] + driver = "ich9-intel-hda" + bus = "pcie.0" + addr = "1b.0" + +[device "ich9-hda-duplex"] + driver = "hda-duplex" + bus = "ich9-hda-audio.0" + cad = "0" diff --git a/docs/config/q35-virtio-graphical.cfg b/docs/config/q35-virtio-graphical.cfg new file mode 100644 index 0000000000..28bde2fc57 --- /dev/null +++ b/docs/config/q35-virtio-graphical.cfg @@ -0,0 +1,248 @@ +# q35 - VirtIO guest (graphical console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-x86_64 \ +# -nodefaults \ +# -readconfig q35-virtio-graphical.cfg +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of VirtIO devices +# tailored towards optimal performance with modern guests, +# and will be accessed through a graphical console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals +# plus a small selection of core PCI devices and +# controllers; the user will then have to explicitly add +# further devices. +# +# The core PCI devices show up in the guest as: +# +# 00:00.0 Host bridge +# 00:1f.0 ISA bridge / LPC +# 00:1f.2 SATA (AHCI) controller +# 00:1f.3 SMBus controller +# +# This configuration file adds a number of other useful +# devices, more specifically: +# +# 00:01.0 VGA compatible controller +# 00:1b.0 Audio device +# 00.1c.* PCI bridge (PCI Express Root Ports) +# 01:00.0 SCSI storage controller +# 02:00.0 Ethernet controller +# 03:00.0 USB controller +# +# More information about these devices is available below. + + +# Machine options +# ========================================================= +# +# We use the q35 machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. + +[machine] + type = "q35" + accel = "kvm" + +[memory] + size = "1024" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We create eight PCI Express Root Ports, and we plug them +# all into separate functions of the same slot. Some of +# them will be used by devices, the rest will remain +# available for hotplug. + +[device "pcie.1"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + multifunction = "on" + +[device "pcie.2"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "pcie.3"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "pcie.4"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +[device "pcie.5"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.4" + port = "5" + chassis = "5" + +[device "pcie.6"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.5" + port = "6" + chassis = "6" + +[device "pcie.7"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.6" + port = "7" + chassis = "7" + +[device "pcie.8"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.7" + port = "8" + chassis = "8" + + +# SCSI storage controller (and storage) +# ========================================================= +# +# We use virtio-scsi here so that we can (hot)plug a large +# number of disks without running into issues; a SCSI disk, +# backed by a qcow2 disk image on the host's filesystem, is +# attached to it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been succesfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest you can safely comment out +# all relevant sections below. + +[device "scsi"] + driver = "virtio-scsi-pci" + bus = "pcie.1" + addr = "00.0" + +[device "scsi-disk"] + driver = "scsi-hd" + bus = "scsi.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "scsi-optical-disk"] + driver = "scsi-cd" + bus = "scsi.0" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# Ethernet controller +# ========================================================= +# +# We use virtio-net for improved performance over emulated +# hardware; on the host side, we take advantage of user +# networking so that the QEMU process doesn't require any +# additional privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "virtio-net-pci" + netdev = "hostnet" + bus = "pcie.2" + addr = "00.0" + + +# USB controller (and input devices) +# ========================================================= +# +# We add a virtualization-friendly USB 3.0 controller and +# a USB tablet so that graphical guests can be controlled +# appropriately. A USB keyboard is not needed, as q35 +# guests get a PS/2 one added automatically. + +[device "usb"] + driver = "nec-usb-xhci" + bus = "pcie.3" + addr = "00.0" + +[device "tablet"] + driver = "usb-tablet" + bus = "usb.0" + + +# VGA compatible controller +# ========================================================= +# +# We plug the QXL video card directly into the PCI Express +# Root Bus as it is a legacy PCI device; this way, we can +# reduce the number of PCI Express controllers in the +# guest. +# +# If you're running the guest on a remote, potentially +# headless host, you will probably want to append something +# like +# +# -display vnc=127.0.0.1:0 +# +# to the command line in order to prevent QEMU from +# creating a graphical display window on the host and +# enable remote access instead. + +[device "video"] + driver = "qxl-vga" + bus = "pcie.0" + addr = "01.0" + + +# Audio device +# ========================================================= +# +# Like the video card, the sound card is a legacy PCI +# device and as such can be plugged directly into the PCI +# Express Root Bus. + +[device "sound"] + driver = "ich9-intel-hda" + bus = "pcie.0" + addr = "1b.0" + +[device "duplex"] + driver = "hda-duplex" + bus = "sound.0" + cad = "0" diff --git a/docs/config/q35-virtio-serial.cfg b/docs/config/q35-virtio-serial.cfg new file mode 100644 index 0000000000..c33c9cc07a --- /dev/null +++ b/docs/config/q35-virtio-serial.cfg @@ -0,0 +1,193 @@ +# q35 - VirtIO guest (serial console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-x86_64 \ +# -nodefaults \ +# -readconfig q35-virtio-serial.cfg \ +# -display none -serial mon:stdio +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of VirtIO devices +# tailored towards optimal performance with modern guests, +# and will be accessed through the serial console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals +# plus a small selection of core PCI devices and +# controllers; the user will then have to explicitly add +# further devices. +# +# The core PCI devices show up in the guest as: +# +# 00:00.0 Host bridge +# 00:1f.0 ISA bridge / LPC +# 00:1f.2 SATA (AHCI) controller +# 00:1f.3 SMBus controller +# +# This configuration file adds a number of other useful +# devices, more specifically: +# +# 00.1c.* PCI bridge (PCI Express Root Ports) +# 01:00.0 SCSI storage controller +# 02:00.0 Ethernet controller +# +# More information about these devices is available below. +# +# We use '-display none' to prevent QEMU from creating a +# graphical display window, which would serve no use in +# this specific configuration, and '-serial mon:stdio' to +# multiplex the guest's serial console and the QEMU monitor +# to the host's stdio; use 'Ctrl+A h' to learn how to +# switch between the two and more. + + +# Machine options +# ========================================================= +# +# We use the q35 machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. + +[machine] + type = "q35" + accel = "kvm" + +[memory] + size = "1024" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We create eight PCI Express Root Ports, and we plug them +# all into separate functions of the same slot. Some of +# them will be used by devices, the rest will remain +# available for hotplug. + +[device "pcie.1"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + multifunction = "on" + +[device "pcie.2"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "pcie.3"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "pcie.4"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +[device "pcie.5"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.4" + port = "5" + chassis = "5" + +[device "pcie.6"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.5" + port = "6" + chassis = "6" + +[device "pcie.7"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.6" + port = "7" + chassis = "7" + +[device "pcie.8"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.7" + port = "8" + chassis = "8" + + +# SCSI storage controller (and storage) +# ========================================================= +# +# We use virtio-scsi here so that we can (hot)plug a large +# number of disks without running into issues; a SCSI disk, +# backed by a qcow2 disk image on the host's filesystem, is +# attached to it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been succesfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest you can safely comment out +# all relevant sections below. + +[device "scsi"] + driver = "virtio-scsi-pci" + bus = "pcie.1" + addr = "00.0" + +[device "scsi-disk"] + driver = "scsi-hd" + bus = "scsi.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "scsi-optical-disk"] + driver = "scsi-cd" + bus = "scsi.0" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# Ethernet controller +# ========================================================= +# +# We use virtio-net for improved performance over emulated +# hardware; on the host side, we take advantage of user +# networking so that the QEMU process doesn't require any +# additional privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "virtio-net-pci" + netdev = "hostnet" + bus = "pcie.2" + addr = "00.0" diff --git a/docs/devel/atomics.txt b/docs/devel/atomics.txt new file mode 100644 index 0000000000..3ef5d85b1b --- /dev/null +++ b/docs/devel/atomics.txt @@ -0,0 +1,388 @@ +CPUs perform independent memory operations effectively in random order. +but this can be a problem for CPU-CPU interaction (including interactions +between QEMU and the guest). Multi-threaded programs use various tools +to instruct the compiler and the CPU to restrict the order to something +that is consistent with the expectations of the programmer. + +The most basic tool is locking. Mutexes, condition variables and +semaphores are used in QEMU, and should be the default approach to +synchronization. Anything else is considerably harder, but it's +also justified more often than one would like. The two tools that +are provided by qemu/atomic.h are memory barriers and atomic operations. + +Macros defined by qemu/atomic.h fall in three camps: + +- compiler barriers: barrier(); + +- weak atomic access and manual memory barriers: atomic_read(), + atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_mb_acquire(), + smp_mb_release(), smp_read_barrier_depends(); + +- sequentially consistent atomic access: everything else. + + +COMPILER MEMORY BARRIER +======================= + +barrier() prevents the compiler from moving the memory accesses either +side of it to the other side. The compiler barrier has no direct effect +on the CPU, which may then reorder things however it wishes. + +barrier() is mostly used within qemu/atomic.h itself. On some +architectures, CPU guarantees are strong enough that blocking compiler +optimizations already ensures the correct order of execution. In this +case, qemu/atomic.h will reduce stronger memory barriers to simple +compiler barriers. + +Still, barrier() can be useful when writing code that can be interrupted +by signal handlers. + + +SEQUENTIALLY CONSISTENT ATOMIC ACCESS +===================================== + +Most of the operations in the qemu/atomic.h header ensure *sequential +consistency*, where "the result of any execution is the same as if the +operations of all the processors were executed in some sequential order, +and the operations of each individual processor appear in this sequence +in the order specified by its program". + +qemu/atomic.h provides the following set of atomic read-modify-write +operations: + + void atomic_inc(ptr) + void atomic_dec(ptr) + void atomic_add(ptr, val) + void atomic_sub(ptr, val) + void atomic_and(ptr, val) + void atomic_or(ptr, val) + + typeof(*ptr) atomic_fetch_inc(ptr) + typeof(*ptr) atomic_fetch_dec(ptr) + typeof(*ptr) atomic_fetch_add(ptr, val) + typeof(*ptr) atomic_fetch_sub(ptr, val) + typeof(*ptr) atomic_fetch_and(ptr, val) + typeof(*ptr) atomic_fetch_or(ptr, val) + typeof(*ptr) atomic_xchg(ptr, val) + typeof(*ptr) atomic_cmpxchg(ptr, old, new) + +all of which return the old value of *ptr. These operations are +polymorphic; they operate on any type that is as wide as an int. + +Sequentially consistent loads and stores can be done using: + + atomic_fetch_add(ptr, 0) for loads + atomic_xchg(ptr, val) for stores + +However, they are quite expensive on some platforms, notably POWER and +ARM. Therefore, qemu/atomic.h provides two primitives with slightly +weaker constraints: + + typeof(*ptr) atomic_mb_read(ptr) + void atomic_mb_set(ptr, val) + +The semantics of these primitives map to Java volatile variables, +and are strongly related to memory barriers as used in the Linux +kernel (see below). + +As long as you use atomic_mb_read and atomic_mb_set, accesses cannot +be reordered with each other, and it is also not possible to reorder +"normal" accesses around them. + +However, and this is the important difference between +atomic_mb_read/atomic_mb_set and sequential consistency, it is important +for both threads to access the same volatile variable. It is not the +case that everything visible to thread A when it writes volatile field f +becomes visible to thread B after it reads volatile field g. The store +and load have to "match" (i.e., be performed on the same volatile +field) to achieve the right semantics. + + +These operations operate on any type that is as wide as an int or smaller. + + +WEAK ATOMIC ACCESS AND MANUAL MEMORY BARRIERS +============================================= + +Compared to sequentially consistent atomic access, programming with +weaker consistency models can be considerably more complicated. +In general, if the algorithm you are writing includes both writes +and reads on the same side, it is generally simpler to use sequentially +consistent primitives. + +When using this model, variables are accessed with atomic_read() and +atomic_set(), and restrictions to the ordering of accesses is enforced +using the memory barrier macros: smp_rmb(), smp_wmb(), smp_mb(), +smp_mb_acquire(), smp_mb_release(), smp_read_barrier_depends(). + +atomic_read() and atomic_set() prevents the compiler from using +optimizations that might otherwise optimize accesses out of existence +on the one hand, or that might create unsolicited accesses on the other. +In general this should not have any effect, because the same compiler +barriers are already implied by memory barriers. However, it is useful +to do so, because it tells readers which variables are shared with +other threads, and which are local to the current thread or protected +by other, more mundane means. + +Memory barriers control the order of references to shared memory. +They come in six kinds: + +- smp_rmb() guarantees that all the LOAD operations specified before + the barrier will appear to happen before all the LOAD operations + specified after the barrier with respect to the other components of + the system. + + In other words, smp_rmb() puts a partial ordering on loads, but is not + required to have any effect on stores. + +- smp_wmb() guarantees that all the STORE operations specified before + the barrier will appear to happen before all the STORE operations + specified after the barrier with respect to the other components of + the system. + + In other words, smp_wmb() puts a partial ordering on stores, but is not + required to have any effect on loads. + +- smp_mb_acquire() guarantees that all the LOAD operations specified before + the barrier will appear to happen before all the LOAD or STORE operations + specified after the barrier with respect to the other components of + the system. + +- smp_mb_release() guarantees that all the STORE operations specified *after* + the barrier will appear to happen after all the LOAD or STORE operations + specified *before* the barrier with respect to the other components of + the system. + +- smp_mb() guarantees that all the LOAD and STORE operations specified + before the barrier will appear to happen before all the LOAD and + STORE operations specified after the barrier with respect to the other + components of the system. + + smp_mb() puts a partial ordering on both loads and stores. It is + stronger than both a read and a write memory barrier; it implies both + smp_mb_acquire() and smp_mb_release(), but it also prevents STOREs + coming before the barrier from overtaking LOADs coming after the + barrier and vice versa. + +- smp_read_barrier_depends() is a weaker kind of read barrier. On + most processors, whenever two loads are performed such that the + second depends on the result of the first (e.g., the first load + retrieves the address to which the second load will be directed), + the processor will guarantee that the first LOAD will appear to happen + before the second with respect to the other components of the system. + However, this is not always true---for example, it was not true on + Alpha processors. Whenever this kind of access happens to shared + memory (that is not protected by a lock), a read barrier is needed, + and smp_read_barrier_depends() can be used instead of smp_rmb(). + + Note that the first load really has to have a _data_ dependency and not + a control dependency. If the address for the second load is dependent + on the first load, but the dependency is through a conditional rather + than actually loading the address itself, then it's a _control_ + dependency and a full read barrier or better is required. + + +This is the set of barriers that is required *between* two atomic_read() +and atomic_set() operations to achieve sequential consistency: + + | 2nd operation | + |-----------------------------------------------| + 1st operation | (after last) | atomic_read | atomic_set | + ---------------+----------------+-------------+----------------| + (before first) | | none | smp_mb_release | + ---------------+----------------+-------------+----------------| + atomic_read | smp_mb_acquire | smp_rmb | ** | + ---------------+----------------+-------------+----------------| + atomic_set | none | smp_mb()*** | smp_wmb() | + ---------------+----------------+-------------+----------------| + + * Or smp_read_barrier_depends(). + + ** This requires a load-store barrier. This is achieved by + either smp_mb_acquire() or smp_mb_release(). + + *** This requires a store-load barrier. On most machines, the only + way to achieve this is a full barrier. + + +You can see that the two possible definitions of atomic_mb_read() +and atomic_mb_set() are the following: + + 1) atomic_mb_read(p) = atomic_read(p); smp_mb_acquire() + atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v); smp_mb() + + 2) atomic_mb_read(p) = smp_mb() atomic_read(p); smp_mb_acquire() + atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v); + +Usually the former is used, because smp_mb() is expensive and a program +normally has more reads than writes. Therefore it makes more sense to +make atomic_mb_set() the more expensive operation. + +There are two common cases in which atomic_mb_read and atomic_mb_set +generate too many memory barriers, and thus it can be useful to manually +place barriers instead: + +- when a data structure has one thread that is always a writer + and one thread that is always a reader, manual placement of + memory barriers makes the write side faster. Furthermore, + correctness is easy to check for in this case using the "pairing" + trick that is explained below: + + thread 1 thread 1 + ------------------------- ------------------------ + (other writes) + smp_mb_release() + atomic_mb_set(&a, x) atomic_set(&a, x) + smp_wmb() + atomic_mb_set(&b, y) atomic_set(&b, y) + + => + thread 2 thread 2 + ------------------------- ------------------------ + y = atomic_mb_read(&b) y = atomic_read(&b) + smp_rmb() + x = atomic_mb_read(&a) x = atomic_read(&a) + smp_mb_acquire() + + Note that the barrier between the stores in thread 1, and between + the loads in thread 2, has been optimized here to a write or a + read memory barrier respectively. On some architectures, notably + ARMv7, smp_mb_acquire and smp_mb_release are just as expensive as + smp_mb, but smp_rmb and/or smp_wmb are more efficient. + +- sometimes, a thread is accessing many variables that are otherwise + unrelated to each other (for example because, apart from the current + thread, exactly one other thread will read or write each of these + variables). In this case, it is possible to "hoist" the implicit + barriers provided by atomic_mb_read() and atomic_mb_set() outside + a loop. For example, the above definition atomic_mb_read() gives + the following transformation: + + n = 0; n = 0; + for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++) + n += atomic_mb_read(&a[i]); n += atomic_read(&a[i]); + smp_mb_acquire(); + + Similarly, atomic_mb_set() can be transformed as follows: + smp_mb(): + + smp_mb_release(); + for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++) + atomic_mb_set(&a[i], false); atomic_set(&a[i], false); + smp_mb(); + + +The two tricks can be combined. In this case, splitting a loop in +two lets you hoist the barriers out of the loops _and_ eliminate the +expensive smp_mb(): + + smp_mb_release(); + for (i = 0; i < 10; i++) { => for (i = 0; i < 10; i++) + atomic_mb_set(&a[i], false); atomic_set(&a[i], false); + atomic_mb_set(&b[i], false); smb_wmb(); + } for (i = 0; i < 10; i++) + atomic_set(&a[i], false); + smp_mb(); + + The other thread can still use atomic_mb_read()/atomic_mb_set() + + +Memory barrier pairing +---------------------- + +A useful rule of thumb is that memory barriers should always, or almost +always, be paired with another barrier. In the case of QEMU, however, +note that the other barrier may actually be in a driver that runs in +the guest! + +For the purposes of pairing, smp_read_barrier_depends() and smp_rmb() +both count as read barriers. A read barrier shall pair with a write +barrier or a full barrier; a write barrier shall pair with a read +barrier or a full barrier. A full barrier can pair with anything. +For example: + + thread 1 thread 2 + =============== =============== + a = 1; + smp_wmb(); + b = 2; x = b; + smp_rmb(); + y = a; + +Note that the "writing" thread is accessing the variables in the +opposite order as the "reading" thread. This is expected: stores +before the write barrier will normally match the loads after the +read barrier, and vice versa. The same is true for more than 2 +access and for data dependency barriers: + + thread 1 thread 2 + =============== =============== + b[2] = 1; + smp_wmb(); + x->i = 2; + smp_wmb(); + a = x; x = a; + smp_read_barrier_depends(); + y = x->i; + smp_read_barrier_depends(); + z = b[y]; + +smp_wmb() also pairs with atomic_mb_read() and smp_mb_acquire(). +and smp_rmb() also pairs with atomic_mb_set() and smp_mb_release(). + + +COMPARISON WITH LINUX KERNEL MEMORY BARRIERS +============================================ + +Here is a list of differences between Linux kernel atomic operations +and memory barriers, and the equivalents in QEMU: + +- atomic operations in Linux are always on a 32-bit int type and + use a boxed atomic_t type; atomic operations in QEMU are polymorphic + and use normal C types. + +- Originally, atomic_read and atomic_set in Linux gave no guarantee + at all. Linux 4.1 updated them to implement volatile + semantics via ACCESS_ONCE (or the more recent READ/WRITE_ONCE). + + QEMU's atomic_read/set implement, if the compiler supports it, C11 + atomic relaxed semantics, and volatile semantics otherwise. + Both semantics prevent the compiler from doing certain transformations; + the difference is that atomic accesses are guaranteed to be atomic, + while volatile accesses aren't. Thus, in the volatile case we just cross + our fingers hoping that the compiler will generate atomic accesses, + since we assume the variables passed are machine-word sized and + properly aligned. + No barriers are implied by atomic_read/set in either Linux or QEMU. + +- atomic read-modify-write operations in Linux are of three kinds: + + atomic_OP returns void + atomic_OP_return returns new value of the variable + atomic_fetch_OP returns the old value of the variable + atomic_cmpxchg returns the old value of the variable + + In QEMU, the second kind does not exist. Currently Linux has + atomic_fetch_or only. QEMU provides and, or, inc, dec, add, sub. + +- different atomic read-modify-write operations in Linux imply + a different set of memory barriers; in QEMU, all of them enforce + sequential consistency, which means they imply full memory barriers + before and after the operation. + +- Linux does not have an equivalent of atomic_mb_set(). In particular, + note that smp_store_mb() is a little weaker than atomic_mb_set(). + atomic_mb_read() compiles to the same instructions as Linux's + smp_load_acquire(), but this should be treated as an implementation + detail. QEMU does have atomic_load_acquire() and atomic_store_release() + macros, but for now they are only used within atomic.h. This may + change in the future. + + +SOURCES +======= + +* Documentation/memory-barriers.txt from the Linux kernel + +* "The JSR-133 Cookbook for Compiler Writers", available at + http://g.oswego.edu/dl/jmm/cookbook.html diff --git a/docs/devel/bitmaps.md b/docs/devel/bitmaps.md new file mode 100644 index 0000000000..a2e8d51163 --- /dev/null +++ b/docs/devel/bitmaps.md @@ -0,0 +1,505 @@ + + +# Dirty Bitmaps and Incremental Backup + +* Dirty Bitmaps are objects that track which data needs to be backed up for the + next incremental backup. + +* Dirty bitmaps can be created at any time and attached to any node + (not just complete drives.) + +## Dirty Bitmap Names + +* A dirty bitmap's name is unique to the node, but bitmaps attached to different + nodes can share the same name. + +* Dirty bitmaps created for internal use by QEMU may be anonymous and have no + name, but any user-created bitmaps may not be. There can be any number of + anonymous bitmaps per node. + +* The name of a user-created bitmap must not be empty (""). + +## Bitmap Modes + +* A Bitmap can be "frozen," which means that it is currently in-use by a backup + operation and cannot be deleted, renamed, written to, reset, + etc. + +* The normal operating mode for a bitmap is "active." + +## Basic QMP Usage + +### Supported Commands ### + +* block-dirty-bitmap-add +* block-dirty-bitmap-remove +* block-dirty-bitmap-clear + +### Creation + +* To create a new bitmap, enabled, on the drive with id=drive0: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +* This bitmap will have a default granularity that matches the cluster size of + its associated drive, if available, clamped to between [4KiB, 64KiB]. + The current default for qcow2 is 64KiB. + +* To create a new bitmap that tracks changes in 32KiB segments: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0", + "granularity": 32768 + } +} +``` + +### Deletion + +* Bitmaps that are frozen cannot be deleted. + +* Deleting the bitmap does not impact any other bitmaps attached to the same + node, nor does it affect any backups already created from this node. + +* Because bitmaps are only unique to the node to which they are attached, + you must specify the node/drive name here, too. + +```json +{ "execute": "block-dirty-bitmap-remove", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +### Resetting + +* Resetting a bitmap will clear all information it holds. + +* An incremental backup created from an empty bitmap will copy no data, + as if nothing has changed. + +```json +{ "execute": "block-dirty-bitmap-clear", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +## Transactions + +### Justification + +Bitmaps can be safely modified when the VM is paused or halted by using +the basic QMP commands. For instance, you might perform the following actions: + +1. Boot the VM in a paused state. +2. Create a full drive backup of drive0. +3. Create a new bitmap attached to drive0. +4. Resume execution of the VM. +5. Incremental backups are ready to be created. + +At this point, the bitmap and drive backup would be correctly in sync, +and incremental backups made from this point forward would be correctly aligned +to the full drive backup. + +This is not particularly useful if we decide we want to start incremental +backups after the VM has been running for a while, for which we will need to +perform actions such as the following: + +1. Boot the VM and begin execution. +2. Using a single transaction, perform the following operations: + * Create bitmap0. + * Create a full drive backup of drive0. +3. Incremental backups are now ready to be created. + +### Supported Bitmap Transactions + +* block-dirty-bitmap-add +* block-dirty-bitmap-clear + +The usages are identical to their respective QMP commands, but see below +for examples. + +### Example: New Incremental Backup + +As outlined in the justification, perhaps we want to create a new incremental +backup chain attached to a drive. + +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-add", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +### Example: New Incremental Backup Anchor Point + +Maybe we just want to create a new full backup with an existing bitmap and +want to reset the bitmap to track the new chain. + +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-clear", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/new_full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +## Incremental Backups + +The star of the show. + +**Nota Bene!** Only incremental backups of entire drives are supported for now. +So despite the fact that you can attach a bitmap to any arbitrary node, they are +only currently useful when attached to the root node. This is because +drive-backup only supports drives/devices instead of arbitrary nodes. + +### Example: First Incremental Backup + +1. Create a full backup and sync it to the dirty bitmap, as in the transactional +examples above; or with the VM offline, manually create a full copy and then +create a new bitmap before the VM begins execution. + + * Let's assume the full backup is named 'full_backup.img'. + * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'. + +2. Create a destination image for the incremental backup that utilizes the +full backup as a backing image. + + * Let's assume it is named 'incremental.0.img'. + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +3. Issue the incremental backup command: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +### Example: Second Incremental Backup + +1. Create a new destination image for the incremental backup that points to the + previous one, e.g.: 'incremental.1.img' + + ```sh + # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2 + ``` + +2. Issue a new incremental backup command. The only difference here is that we + have changed the target image below. + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.1.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +## Errors + +* In the event of an error that occurs after a backup job is successfully + launched, either by a direct QMP command or a QMP transaction, the user + will receive a BLOCK_JOB_COMPLETE event with a failure message, accompanied + by a BLOCK_JOB_ERROR event. + +* In the case of an event being cancelled, the user will receive a + BLOCK_JOB_CANCELLED event instead of a pair of COMPLETE and ERROR events. + +* In either case, the incremental backup data contained within the bitmap is + safely rolled back, and the data within the bitmap is not lost. The image + file created for the failed attempt can be safely deleted. + +* Once the underlying problem is fixed (e.g. more storage space is freed up), + you can simply retry the incremental backup command with the same bitmap. + +### Example + +1. Create a target image: + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +2. Attempt to create an incremental backup via QMP: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +3. Receive an event notifying us of failure: + + ```json + { "timestamp": { "seconds": 1424709442, "microseconds": 844524 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "No space left on device", + "device": "drive1", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +4. Delete the failed incremental, and re-create the image. + + ```sh + # rm incremental.0.img + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +5. Retry the command after fixing the underlying problem, + such as freeing up space on the backup volume: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +6. Receive confirmation that the job completed successfully: + + ```json + { "timestamp": { "seconds": 1424709668, "microseconds": 526525 }, + "data": { "device": "drive1", "type": "backup", + "speed": 0, "len": 67108864, "offset": 67108864}, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +### Partial Transactional Failures + +* Sometimes, a transaction will succeed in launching and return success, + but then later the backup jobs themselves may fail. It is possible that + a management application may have to deal with a partial backup failure + after a successful transaction. + +* If multiple backup jobs are specified in a single transaction, when one of + them fails, it will not interact with the other backup jobs in any way. + +* The job(s) that succeeded will clear the dirty bitmap associated with the + operation, but the job(s) that failed will not. It is not "safe" to delete + any incremental backups that were created successfully in this scenario, + even though others failed. + +#### Example + +* QMP example highlighting two backup jobs: + + ```json + { "execute": "transaction", + "arguments": { + "actions": [ + { "type": "drive-backup", + "data": { "device": "drive0", "bitmap": "bitmap0", + "format": "qcow2", "mode": "existing", + "sync": "incremental", "target": "d0-incr-1.qcow2" } }, + { "type": "drive-backup", + "data": { "device": "drive1", "bitmap": "bitmap1", + "format": "qcow2", "mode": "existing", + "sync": "incremental", "target": "d1-incr-1.qcow2" } }, + ] + } + } + ``` + +* QMP example response, highlighting one success and one failure: + * Acknowledgement that the Transaction was accepted and jobs were launched: + ```json + { "return": {} } + ``` + + * Later, QEMU sends notice that the first job was completed: + ```json + { "timestamp": { "seconds": 1447192343, "microseconds": 615698 }, + "data": { "device": "drive0", "type": "backup", + "speed": 0, "len": 67108864, "offset": 67108864 }, + "event": "BLOCK_JOB_COMPLETED" + } + ``` + + * Later yet, QEMU sends notice that the second job has failed: + ```json + { "timestamp": { "seconds": 1447192399, "microseconds": 683015 }, + "data": { "device": "drive1", "action": "report", + "operation": "read" }, + "event": "BLOCK_JOB_ERROR" } + ``` + + ```json + { "timestamp": { "seconds": 1447192399, "microseconds": 685853 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "Input/output error", + "device": "drive1", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + +* In the above example, "d0-incr-1.qcow2" is valid and must be kept, + but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide + incremental backup of all drives at a point-in-time is to be made, + new backups for both drives will need to be made, taking into account + that a new incremental backup for drive0 needs to be based on top of + "d0-incr-1.qcow2." + +### Grouped Completion Mode + +* While jobs launched by transactions normally complete or fail on their own, + it is possible to instruct them to complete or fail together as a group. + +* QMP transactions take an optional properties structure that can affect + the semantics of the transaction. + +* The "completion-mode" transaction property can be either "individual" + which is the default, legacy behavior described above, or "grouped," + a new behavior detailed below. + +* Delayed Completion: In grouped completion mode, no jobs will report + success until all jobs are ready to report success. + +* Grouped failure: If any job fails in grouped completion mode, all remaining + jobs will be cancelled. Any incremental backups will restore their dirty + bitmap objects as if no backup command was ever issued. + + * Regardless of if QEMU reports a particular incremental backup job as + CANCELLED or as an ERROR, the in-memory bitmap will be restored. + +#### Example + +* Here's the same example scenario from above with the new property: + + ```json + { "execute": "transaction", + "arguments": { + "actions": [ + { "type": "drive-backup", + "data": { "device": "drive0", "bitmap": "bitmap0", + "format": "qcow2", "mode": "existing", + "sync": "incremental", "target": "d0-incr-1.qcow2" } }, + { "type": "drive-backup", + "data": { "device": "drive1", "bitmap": "bitmap1", + "format": "qcow2", "mode": "existing", + "sync": "incremental", "target": "d1-incr-1.qcow2" } }, + ], + "properties": { + "completion-mode": "grouped" + } + } + } + ``` + +* QMP example response, highlighting a failure for drive2: + * Acknowledgement that the Transaction was accepted and jobs were launched: + ```json + { "return": {} } + ``` + + * Later, QEMU sends notice that the second job has errored out, + but that the first job was also cancelled: + ```json + { "timestamp": { "seconds": 1447193702, "microseconds": 632377 }, + "data": { "device": "drive1", "action": "report", + "operation": "read" }, + "event": "BLOCK_JOB_ERROR" } + ``` + + ```json + { "timestamp": { "seconds": 1447193702, "microseconds": 640074 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "Input/output error", + "device": "drive1", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + ``` + + ```json + { "timestamp": { "seconds": 1447193702, "microseconds": 640163 }, + "data": { "device": "drive0", "type": "backup", "speed": 0, + "len": 67108864, "offset": 16777216 }, + "event": "BLOCK_JOB_CANCELLED" } + ``` + + diff --git a/docs/devel/blkdebug.txt b/docs/devel/blkdebug.txt new file mode 100644 index 0000000000..43d8e8f9c6 --- /dev/null +++ b/docs/devel/blkdebug.txt @@ -0,0 +1,162 @@ +Block I/O error injection using blkdebug +---------------------------------------- +Copyright (C) 2014-2015 Red Hat Inc + +This work is licensed under the terms of the GNU GPL, version 2 or later. See +the COPYING file in the top-level directory. + +The blkdebug block driver is a rule-based error injection engine. It can be +used to exercise error code paths in block drivers including ENOSPC (out of +space) and EIO. + +This document gives an overview of the features available in blkdebug. + +Background +---------- +Block drivers have many error code paths that handle I/O errors. Image formats +are especially complex since metadata I/O errors during cluster allocation or +while updating tables happen halfway through request processing and require +discipline to keep image files consistent. + +Error injection allows test cases to trigger I/O errors at specific points. +This way, all error paths can be tested to make sure they are correct. + +Rules +----- +The blkdebug block driver takes a list of "rules" that tell the error injection +engine when to fail an I/O request. + +Each I/O request is evaluated against the rules. If a rule matches the request +then its "action" is executed. + +Rules can be placed in a configuration file; the configuration file +follows the same .ini-like format used by QEMU's -readconfig option, and +each section of the file represents a rule. + +The following configuration file defines a single rule: + + $ cat blkdebug.conf + [inject-error] + event = "read_aio" + errno = "28" + +This rule fails all aio read requests with ENOSPC (28). Note that the errno +value depends on the host. On Linux, see +/usr/include/asm-generic/errno-base.h for errno values. + +Invoke QEMU as follows: + + $ qemu-system-x86_64 + -drive if=none,cache=none,file=blkdebug:blkdebug.conf:test.img,id=drive0 \ + -device virtio-blk-pci,drive=drive0,id=virtio-blk-pci0 + +Rules support the following attributes: + + event - which type of operation to match (e.g. read_aio, write_aio, + flush_to_os, flush_to_disk). See the "Events" section for + information on events. + + state - (optional) the engine must be in this state number in order for this + rule to match. See the "State transitions" section for information + on states. + + errno - the numeric errno value to return when a request matches this rule. + The errno values depend on the host since the numeric values are not + standarized in the POSIX specification. + + sector - (optional) a sector number that the request must overlap in order to + match this rule + + once - (optional, default "off") only execute this action on the first + matching request + + immediately - (optional, default "off") return a NULL BlockAIOCB + pointer and fail without an errno instead. This + exercises the code path where BlockAIOCB fails and the + caller's BlockCompletionFunc is not invoked. + +Events +------ +Block drivers provide information about the type of I/O request they are about +to make so rules can match specific types of requests. For example, the qcow2 +block driver tells blkdebug when it accesses the L1 table so rules can match +only L1 table accesses and not other metadata or guest data requests. + +The core events are: + + read_aio - guest data read + + write_aio - guest data write + + flush_to_os - write out unwritten block driver state (e.g. cached metadata) + + flush_to_disk - flush the host block device's disk cache + +See qapi/block-core.json:BlkdebugEvent for the full list of events. +You may need to grep block driver source code to understand the +meaning of specific events. + +State transitions +----------------- +There are cases where more power is needed to match a particular I/O request in +a longer sequence of requests. For example: + + write_aio + flush_to_disk + write_aio + +How do we match the 2nd write_aio but not the first? This is where state +transitions come in. + +The error injection engine has an integer called the "state" that always starts +initialized to 1. The state integer is internal to blkdebug and cannot be +observed from outside but rules can interact with it for powerful matching +behavior. + +Rules can be conditional on the current state and they can transition to a new +state. + +When a rule's "state" attribute is non-zero then the current state must equal +the attribute in order for the rule to match. + +For example, to match the 2nd write_aio: + + [set-state] + event = "write_aio" + state = "1" + new_state = "2" + + [inject-error] + event = "write_aio" + state = "2" + errno = "5" + +The first write_aio request matches the set-state rule and transitions from +state 1 to state 2. Once state 2 has been entered, the set-state rule no +longer matches since it requires state 1. But the inject-error rule now +matches the next write_aio request and injects EIO (5). + +State transition rules support the following attributes: + + event - which type of operation to match (e.g. read_aio, write_aio, + flush_to_os, flush_to_disk). See the "Events" section for + information on events. + + state - (optional) the engine must be in this state number in order for this + rule to match + + new_state - transition to this state number + +Suspend and resume +------------------ +Exercising code paths in block drivers may require specific ordering amongst +concurrent requests. The "breakpoint" feature allows requests to be halted on +a blkdebug event and resumed later. This makes it possible to achieve +deterministic ordering when multiple requests are in flight. + +Breakpoints on blkdebug events are associated with a user-defined "tag" string. +This tag serves as an identifier by which the request can be resumed at a later +point. + +See the qemu-io(1) break, resume, remove_break, and wait_break commands for +details. diff --git a/docs/devel/blkverify.txt b/docs/devel/blkverify.txt new file mode 100644 index 0000000000..d556dc4e6d --- /dev/null +++ b/docs/devel/blkverify.txt @@ -0,0 +1,69 @@ += Block driver correctness testing with blkverify = + +== Introduction == + +This document describes how to use the blkverify protocol to test that a block +driver is operating correctly. + +It is difficult to test and debug block drivers against real guests. Often +processes inside the guest will crash because corrupt sectors were read as part +of the executable. Other times obscure errors are raised by a program inside +the guest. These issues are extremely hard to trace back to bugs in the block +driver. + +Blkverify solves this problem by catching data corruption inside QEMU the first +time bad data is read and reporting the disk sector that is corrupted. + +== How it works == + +The blkverify protocol has two child block devices, the "test" device and the +"raw" device. Read/write operations are mirrored to both devices so their +state should always be in sync. + +The "raw" device is a raw image, a flat file, that has identical starting +contents to the "test" image. The idea is that the "raw" device will handle +read/write operations correctly and not corrupt data. It can be used as a +reference for comparison against the "test" device. + +After a mirrored read operation completes, blkverify will compare the data and +raise an error if it is not identical. This makes it possible to catch the +first instance where corrupt data is read. + +== Example == + +Imagine raw.img has 0xcd repeated throughout its first sector: + + $ ./qemu-io -c 'read -v 0 512' raw.img + 00000000: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + 00000010: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + [...] + 000001e0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + 000001f0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + read 512/512 bytes at offset 0 + 512.000000 bytes, 1 ops; 0.0000 sec (97.656 MiB/sec and 200000.0000 ops/sec) + +And test.img is corrupt, its first sector is zeroed when it shouldn't be: + + $ ./qemu-io -c 'read -v 0 512' test.img + 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + [...] + 000001e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 000001f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + read 512/512 bytes at offset 0 + 512.000000 bytes, 1 ops; 0.0000 sec (81.380 MiB/sec and 166666.6667 ops/sec) + +This error is caught by blkverify: + + $ ./qemu-io -c 'read 0 512' blkverify:a.img:b.img + blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0 + +A more realistic scenario is verifying the installation of a guest OS: + + $ ./qemu-img create raw.img 16G + $ ./qemu-img create -f qcow2 test.qcow2 16G + $ x86_64-softmmu/qemu-system-x86_64 -cdrom debian.iso \ + -drive file=blkverify:raw.img:test.qcow2 + +If the installation is aborted when blkverify detects corruption, use qemu-io +to explore the contents of the disk image at the sector in question. diff --git a/docs/devel/build-system.txt b/docs/devel/build-system.txt new file mode 100644 index 0000000000..2af1e668c5 --- /dev/null +++ b/docs/devel/build-system.txt @@ -0,0 +1,512 @@ + The QEMU build system architecture + ================================== + +This document aims to help developers understand the architecture of the +QEMU build system. As with projects using GNU autotools, the QEMU build +system has two stages, first the developer runs the "configure" script +to determine the local build environment characteristics, then they run +"make" to build the project. There is about where the similarities with +GNU autotools end, so try to forget what you know about them. + + +Stage 1: configure +================== + +The QEMU configure script is written directly in shell, and should be +compatible with any POSIX shell, hence it uses #!/bin/sh. An important +implication of this is that it is important to avoid using bash-isms on +development platforms where bash is the primary host. + +In contrast to autoconf scripts, QEMU's configure is expected to be +silent while it is checking for features. It will only display output +when an error occurs, or to show the final feature enablement summary +on completion. + +Adding new checks to the configure script usually comprises the +following tasks: + + - Initialize one or more variables with the default feature state. + + Ideally features should auto-detect whether they are present, + so try to avoid hardcoding the initial state to either enabled + or disabled, as that forces the user to pass a --enable-XXX + / --disable-XXX flag on every invocation of configure. + + - Add support to the command line arg parser to handle any new + --enable-XXX / --disable-XXX flags required by the feature XXX. + + - Add information to the help output message to report on the new + feature flag. + + - Add code to perform the actual feature check. As noted above, try to + be fully dynamic in checking enablement/disablement. + + - Add code to print out the feature status in the configure summary + upon completion. + + - Add any new makefile variables to $config_host_mak on completion. + + +Taking (a simplified version of) the probe for gnutls from configure, +we have the following pieces: + + # Initial variable state + gnutls="" + + ..snip.. + + # Configure flag processing + --disable-gnutls) gnutls="no" + ;; + --enable-gnutls) gnutls="yes" + ;; + + ..snip.. + + # Help output feature message + gnutls GNUTLS cryptography support + + ..snip.. + + # Test for gnutls + if test "$gnutls" != "no"; then + if ! $pkg_config --exists "gnutls"; then + gnutls_cflags=`$pkg_config --cflags gnutls` + gnutls_libs=`$pkg_config --libs gnutls` + libs_softmmu="$gnutls_libs $libs_softmmu" + libs_tools="$gnutls_libs $libs_tools" + QEMU_CFLAGS="$QEMU_CFLAGS $gnutls_cflags" + gnutls="yes" + elif test "$gnutls" = "yes"; then + feature_not_found "gnutls" "Install gnutls devel" + else + gnutls="no" + fi + fi + + ..snip.. + + # Completion feature summary + echo "GNUTLS support $gnutls" + + ..snip.. + + # Define make variables + if test "$gnutls" = "yes" ; then + echo "CONFIG_GNUTLS=y" >> $config_host_mak + fi + + +Helper functions +---------------- + +The configure script provides a variety of helper functions to assist +developers in checking for system features: + + - do_cc $ARGS... + + Attempt to run the system C compiler passing it $ARGS... + + - do_cxx $ARGS... + + Attempt to run the system C++ compiler passing it $ARGS... + + - compile_object $CFLAGS + + Attempt to compile a test program with the system C compiler using + $CFLAGS. The test program must have been previously written to a file + called $TMPC. + + - compile_prog $CFLAGS $LDFLAGS + + Attempt to compile a test program with the system C compiler using + $CFLAGS and link it with the system linker using $LDFLAGS. The test + program must have been previously written to a file called $TMPC. + + - has $COMMAND + + Determine if $COMMAND exists in the current environment, either as a + shell builtin, or executable binary, returning 0 on success. + + - path_of $COMMAND + + Return the fully qualified path of $COMMAND, printing it to stdout, + and returning 0 on success. + + - check_define $NAME + + Determine if the macro $NAME is defined by the system C compiler + + - check_include $NAME + + Determine if the include $NAME file is available to the system C + compiler + + - write_c_skeleton + + Write a minimal C program main() function to the temporary file + indicated by $TMPC + + - feature_not_found $NAME $REMEDY + + Print a message to stderr that the feature $NAME was not available + on the system, suggesting the user try $REMEDY to address the + problem. + + - error_exit $MESSAGE $MORE... + + Print $MESSAGE to stderr, followed by $MORE... and then exit from the + configure script with non-zero status + + - query_pkg_config $ARGS... + + Run pkg-config passing it $ARGS. If QEMU is doing a static build, + then --static will be automatically added to $ARGS + + +Stage 2: makefiles +================== + +The use of GNU make is required with the QEMU build system. + +Although the source code is spread across multiple subdirectories, the +build system should be considered largely non-recursive in nature, in +contrast to common practices seen with automake. There is some recursive +invocation of make, but this is related to the things being built, +rather than the source directory structure. + +QEMU currently supports both VPATH and non-VPATH builds, so there are +three general ways to invoke configure & perform a build. + + - VPATH, build artifacts outside of QEMU source tree entirely + + cd ../ + mkdir build + cd build + ../qemu/configure + make + + - VPATH, build artifacts in a subdir of QEMU source tree + + mkdir build + cd build + ../configure + make + + - non-VPATH, build artifacts everywhere + + ./configure + make + +The QEMU maintainers generally recommend that a VPATH build is used by +developers. Patches to QEMU are expected to ensure VPATH build still +works. + + +Module structure +---------------- + +There are a number of key outputs of the QEMU build system: + + - Tools - qemu-img, qemu-nbd, qga (guest agent), etc + - System emulators - qemu-system-$ARCH + - Userspace emulators - qemu-$ARCH + - Unit tests + +The source code is highly modularized, split across many files to +facilitate building of all of these components with as little duplicated +compilation as possible. There can be considered to be two distinct +groups of files, those which are independent of the QEMU emulation +target and those which are dependent on the QEMU emulation target. + +In the target-independent set lives various general purpose helper code, +such as error handling infrastructure, standard data structures, +platform portability wrapper functions, etc. This code can be compiled +once only and the .o files linked into all output binaries. + +In the target-dependent set lives CPU emulation, device emulation and +much glue code. This sometimes also has to be compiled multiple times, +once for each target being built. + +The utility code that is used by all binaries is built into a +static archive called libqemuutil.a, which is then linked to all the +binaries. In order to provide hooks that are only needed by some of the +binaries, code in libqemuutil.a may depend on other functions that are +not fully implemented by all QEMU binaries. To deal with this there is a +second library called libqemustub.a which provides dummy stubs for all +these functions. These will get lazy linked into the binary if the real +implementation is not present. In this way, the libqemustub.a static +library can be thought of as a portable implementation of the weak +symbols concept. All binaries should link to both libqemuutil.a and +libqemustub.a. e.g. + + qemu-img$(EXESUF): qemu-img.o ..snip.. libqemuutil.a libqemustub.a + + +Windows platform portability +---------------------------- + +On Windows, all binaries have the suffix '.exe', so all Makefile rules +which create binaries must include the $(EXESUF) variable on the binary +name. e.g. + + qemu-img$(EXESUF): qemu-img.o ..snip.. + +This expands to '.exe' on Windows, or '' on other platforms. + +A further complication for the system emulator binaries is that +two separate binaries need to be generated. + +The main binary (e.g. qemu-system-x86_64.exe) is linked against the +Windows console runtime subsystem. These are expected to be run from a +command prompt window, and so will print stderr to the console that +launched them. + +The second binary generated has a 'w' on the end of its name (e.g. +qemu-system-x86_64w.exe) and is linked against the Windows graphical +runtime subsystem. These are expected to be run directly from the +desktop and will open up a dedicated console window for stderr output. + +The Makefile.target will generate the binary for the graphical subsystem +first, and then use objcopy to relink it against the console subsystem +to generate the second binary. + + +Object variable naming +---------------------- + +The QEMU convention is to define variables to list different groups of +object files. These are named with the convention $PREFIX-obj-y. For +example the libqemuutil.a file will be linked with all objects listed +in a variable 'util-obj-y'. So, for example, util/Makefile.obj will +contain a set of definitions looking like + + util-obj-y += bitmap.o bitops.o hbitmap.o + util-obj-y += fifo8.o + util-obj-y += acl.o + util-obj-y += error.o qemu-error.o + +When there is an object file which needs to be conditionally built based +on some characteristic of the host system, the configure script will +define a variable for the conditional. For example, on Windows it will +define $(CONFIG_POSIX) with a value of 'n' and $(CONFIG_WIN32) with a +value of 'y'. It is now possible to use the config variables when +listing object files. For example, + + util-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o + util-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o + +On Windows this expands to + + util-obj-y += oslib-win32.o qemu-thread-win32.o + util-obj-n += oslib-posix.o qemu-thread-posix.o + +Since libqemutil.a links in $(util-obj-y), the POSIX specific files +listed against $(util-obj-n) are ignored on the Windows platform builds. + + +CFLAGS / LDFLAGS / LIBS handling +-------------------------------- + +There are many different binaries being built with differing purposes, +and some of them might even be 3rd party libraries pulled in via git +submodules. As such the use of the global CFLAGS variable is generally +avoided in QEMU, since it would apply to too many build targets. + +Flags that are needed by any QEMU code (i.e. everything *except* GIT +submodule projects) are put in $(QEMU_CFLAGS) variable. For linker +flags the $(LIBS) variable is sometimes used, but a couple of more +targeted variables are preferred. $(libs_softmmu) is used for +libraries that must be linked to system emulator targets, $(LIBS_TOOLS) +is used for tools like qemu-img, qemu-nbd, etc and $(LIBS_QGA) is used +for the QEMU guest agent. There is currently no specific variable for +the userspace emulator targets as the global $(LIBS), or more targeted +variables shown below, are sufficient. + +In addition to these variables, it is possible to provide cflags and +libs against individual source code files, by defining variables of the +form $FILENAME-cflags and $FILENAME-libs. For example, the curl block +driver needs to link to the libcurl library, so block/Makefile defines +some variables: + + curl.o-cflags := $(CURL_CFLAGS) + curl.o-libs := $(CURL_LIBS) + +The scope is a little different between the two variables. The libs get +used when linking any target binary that includes the curl.o object +file, while the cflags get used when compiling the curl.c file only. + + +Statically defined files +------------------------ + +The following key files are statically defined in the source tree, with +the rules needed to build QEMU. Their behaviour is influenced by a +number of dynamically created files listed later. + +- Makefile + +The main entry point used when invoking make to build all the components +of QEMU. The default 'all' target will naturally result in the build of +every component. The various tools and helper binaries are built +directly via a non-recursive set of rules. + +Each system/userspace emulation target needs to have a slightly +different set of make rules / variables. Thus, make will be recursively +invoked for each of the emulation targets. + +The recursive invocation will end up processing the toplevel +Makefile.target file (more on that later). + + +- */Makefile.objs + +Since the source code is spread across multiple directories, the rules +for each file are similarly modularized. Thus each subdirectory +containing .c files will usually also contain a Makefile.objs file. +These files are not directly invoked by a recursive make, but instead +they are imported by the top level Makefile and/or Makefile.target + +Each Makefile.objs usually just declares a set of variables listing the +.o files that need building from the source files in the directory. They +will also define any custom linker or compiler flags. For example in +block/Makefile.objs + + block-obj-$(CONFIG_LIBISCSI) += iscsi.o + block-obj-$(CONFIG_CURL) += curl.o + + ..snip... + + iscsi.o-cflags := $(LIBISCSI_CFLAGS) + iscsi.o-libs := $(LIBISCSI_LIBS) + curl.o-cflags := $(CURL_CFLAGS) + curl.o-libs := $(CURL_LIBS) + +If there are any rules defined in the Makefile.objs file, they should +all use $(obj) as a prefix to the target, e.g. + + $(obj)/generated-tcg-tracers.h: $(obj)/generated-tcg-tracers.h-timestamp + + +- Makefile.target + +This file provides the entry point used to build each individual system +or userspace emulator target. Each enabled target has its own +subdirectory. For example if configure is run with the argument +'--target-list=x86_64-softmmu', then a sub-directory 'x86_64-softmu' +will be created, containing a 'Makefile' which symlinks back to +Makefile.target + +So when the recursive '$(MAKE) -C x86_64-softmmu' is invoked, it ends up +using Makefile.target for the build rules. + + +- rules.mak + +This file provides the generic helper rules for invoking build tools, in +particular the compiler and linker. This also contains the magic (hairy) +'unnest-vars' function which is used to merge the variable definitions +from all Makefile.objs in the source tree down into the main Makefile +context. + + +- default-configs/*.mak + +The files under default-configs/ control what emulated hardware is built +into each QEMU system and userspace emulator targets. They merely +contain a long list of config variable definitions. For example, +default-configs/x86_64-softmmu.mak has: + + include pci.mak + include sound.mak + include usb.mak + CONFIG_QXL=$(CONFIG_SPICE) + CONFIG_VGA_ISA=y + CONFIG_VGA_CIRRUS=y + CONFIG_VMWARE_VGA=y + CONFIG_VIRTIO_VGA=y + ...snip... + +These files rarely need changing unless new devices / hardware need to +be enabled for a particular system/userspace emulation target + + +- tests/Makefile + +Rules for building the unit tests. This file is included directly by the +top level Makefile, so anything defined in this file will influence the +entire build system. Care needs to be taken when writing rules for tests +to ensure they only apply to the unit test execution / build. + +- tests/docker/Makefile.include + +Rules for Docker tests. Like tests/Makefile, this file is included +directly by the top level Makefile, anything defined in this file will +influence the entire build system. + +- po/Makefile + +Rules for building and installing the binary message catalogs from the +text .po file sources. This almost never needs changing for any reason. + + +Dynamically created files +------------------------- + +The following files are generated dynamically by configure in order to +control the behaviour of the statically defined makefiles. This avoids +the need for QEMU makefiles to go through any pre-processing as seen +with autotools, where Makefile.am generates Makefile.in which generates +Makefile. + + +- config-host.mak + +When configure has determined the characteristics of the build host it +will write a long list of variables to config-host.mak file. This +provides the various install directories, compiler / linker flags and a +variety of CONFIG_* variables related to optionally enabled features. +This is imported by the top level Makefile in order to tailor the build +output. + +The variables defined here are those which are applicable to all QEMU +build outputs. Variables which are potentially different for each +emulator target are defined by the next file... + +It is also used as a dependency checking mechanism. If make sees that +the modification timestamp on configure is newer than that on +config-host.mak, then configure will be re-run. + + +- config-host.h + +The config-host.h file is used by source code to determine what features +are enabled. It is generated from the contents of config-host.mak using +the scripts/create_config program. This extracts all the CONFIG_* variables, +most of the HOST_* variables and a few other misc variables from +config-host.mak, formatting them as C preprocessor macros. + + +- $TARGET-NAME/config-target.mak + +TARGET-NAME is the name of a system or userspace emulator, for example, +x86_64-softmmu denotes the system emulator for the x86_64 architecture. +This file contains the variables which need to vary on a per-target +basis. For example, it will indicate whether KVM or Xen are enabled for +the target and any other potential custom libraries needed for linking +the target. + + +- $TARGET-NAME/config-devices.mak + +TARGET-NAME is again the name of a system or userspace emulator. The +config-devices.mak file is automatically generated by make using the +scripts/make_device_config.sh program, feeding it the +default-configs/$TARGET-NAME file as input. + + +- $TARGET-NAME/Makefile + +This is the entrypoint used when make recurses to build a single system +or userspace emulator target. It is merely a symlink back to the +Makefile.target in the top level. diff --git a/docs/devel/lockcnt.txt b/docs/devel/lockcnt.txt new file mode 100644 index 0000000000..2a79b3205b --- /dev/null +++ b/docs/devel/lockcnt.txt @@ -0,0 +1,277 @@ +DOCUMENTATION FOR LOCKED COUNTERS (aka QemuLockCnt) +=================================================== + +QEMU often uses reference counts to track data structures that are being +accessed and should not be freed. For example, a loop that invoke +callbacks like this is not safe: + + QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { + if (ioh->revents & G_IO_OUT) { + ioh->fd_write(ioh->opaque); + } + } + +QLIST_FOREACH_SAFE protects against deletion of the current node (ioh) +by stashing away its "next" pointer. However, ioh->fd_write could +actually delete the next node from the list. The simplest way to +avoid this is to mark the node as deleted, and remove it from the +list in the above loop: + + QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { + if (ioh->deleted) { + QLIST_REMOVE(ioh, next); + g_free(ioh); + } else { + if (ioh->revents & G_IO_OUT) { + ioh->fd_write(ioh->opaque); + } + } + } + +If however this loop must also be reentrant, i.e. it is possible that +ioh->fd_write invokes the loop again, some kind of counting is needed: + + walking_handlers++; + QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { + if (ioh->deleted) { + if (walking_handlers == 1) { + QLIST_REMOVE(ioh, next); + g_free(ioh); + } + } else { + if (ioh->revents & G_IO_OUT) { + ioh->fd_write(ioh->opaque); + } + } + } + walking_handlers--; + +One may think of using the RCU primitives, rcu_read_lock() and +rcu_read_unlock(); effectively, the RCU nesting count would take +the place of the walking_handlers global variable. Indeed, +reference counting and RCU have similar purposes, but their usage in +general is complementary: + +- reference counting is fine-grained and limited to a single data + structure; RCU delays reclamation of *all* RCU-protected data + structures; + +- reference counting works even in the presence of code that keeps + a reference for a long time; RCU critical sections in principle + should be kept short; + +- reference counting is often applied to code that is not thread-safe + but is reentrant; in fact, usage of reference counting in QEMU predates + the introduction of threads by many years. RCU is generally used to + protect readers from other threads freeing memory after concurrent + modifications to a data structure. + +- reclaiming data can be done by a separate thread in the case of RCU; + this can improve performance, but also delay reclamation undesirably. + With reference counting, reclamation is deterministic. + +This file documents QemuLockCnt, an abstraction for using reference +counting in code that has to be both thread-safe and reentrant. + + +QemuLockCnt concepts +-------------------- + +A QemuLockCnt comprises both a counter and a mutex; it has primitives +to increment and decrement the counter, and to take and release the +mutex. The counter notes how many visits to the data structures are +taking place (the visits could be from different threads, or there could +be multiple reentrant visits from the same thread). The basic rules +governing the counter/mutex pair then are the following: + +- Data protected by the QemuLockCnt must not be freed unless the + counter is zero and the mutex is taken. + +- A new visit cannot be started while the counter is zero and the + mutex is taken. + +Most of the time, the mutex protects all writes to the data structure, +not just frees, though there could be cases where this is not necessary. + +Reads, instead, can be done without taking the mutex, as long as the +readers and writers use the same macros that are used for RCU, for +example atomic_rcu_read, atomic_rcu_set, QLIST_FOREACH_RCU, etc. This is +because the reads are done outside a lock and a set or QLIST_INSERT_HEAD +can happen concurrently with the read. The RCU API ensures that the +processor and the compiler see all required memory barriers. + +This could be implemented simply by protecting the counter with the +mutex, for example: + + // (1) + qemu_mutex_lock(&walking_handlers_mutex); + walking_handlers++; + qemu_mutex_unlock(&walking_handlers_mutex); + + ... + + // (2) + qemu_mutex_lock(&walking_handlers_mutex); + if (--walking_handlers == 0) { + QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { + if (ioh->deleted) { + QLIST_REMOVE(ioh, next); + g_free(ioh); + } + } + } + qemu_mutex_unlock(&walking_handlers_mutex); + +Here, no frees can happen in the code represented by the ellipsis. +If another thread is executing critical section (2), that part of +the code cannot be entered, because the thread will not be able +to increment the walking_handlers variable. And of course +during the visit any other thread will see a nonzero value for +walking_handlers, as in the single-threaded code. + +Note that it is possible for multiple concurrent accesses to delay +the cleanup arbitrarily; in other words, for the walking_handlers +counter to never become zero. For this reason, this technique is +more easily applicable if concurrent access to the structure is rare. + +However, critical sections are easy to forget since you have to do +them for each modification of the counter. QemuLockCnt ensures that +all modifications of the counter take the lock appropriately, and it +can also be more efficient in two ways: + +- it avoids taking the lock for many operations (for example + incrementing the counter while it is non-zero); + +- on some platforms, one can implement QemuLockCnt to hold the lock + and the mutex in a single word, making the fast path no more expensive + than simply managing a counter using atomic operations (see + docs/atomics.txt). This can be very helpful if concurrent access to + the data structure is expected to be rare. + + +Using the same mutex for frees and writes can still incur some small +inefficiencies; for example, a visit can never start if the counter is +zero and the mutex is taken---even if the mutex is taken by a write, +which in principle need not block a visit of the data structure. +However, these are usually not a problem if any of the following +assumptions are valid: + +- concurrent access is possible but rare + +- writes are rare + +- writes are frequent, but this kind of write (e.g. appending to a + list) has a very small critical section. + +For example, QEMU uses QemuLockCnt to manage an AioContext's list of +bottom halves and file descriptor handlers. Modifications to the list +of file descriptor handlers are rare. Creation of a new bottom half is +frequent and can happen on a fast path; however: 1) it is almost never +concurrent with a visit to the list of bottom halves; 2) it only has +three instructions in the critical path, two assignments and a smp_wmb(). + + +QemuLockCnt API +--------------- + +The QemuLockCnt API is described in include/qemu/thread.h. + + +QemuLockCnt usage +----------------- + +This section explains the typical usage patterns for QemuLockCnt functions. + +Setting a variable to a non-NULL value can be done between +qemu_lockcnt_lock and qemu_lockcnt_unlock: + + qemu_lockcnt_lock(&xyz_lockcnt); + if (!xyz) { + new_xyz = g_new(XYZ, 1); + ... + atomic_rcu_set(&xyz, new_xyz); + } + qemu_lockcnt_unlock(&xyz_lockcnt); + +Accessing the value can be done between qemu_lockcnt_inc and +qemu_lockcnt_dec: + + qemu_lockcnt_inc(&xyz_lockcnt); + if (xyz) { + XYZ *p = atomic_rcu_read(&xyz); + ... + /* Accesses can now be done through "p". */ + } + qemu_lockcnt_dec(&xyz_lockcnt); + +Freeing the object can similarly use qemu_lockcnt_lock and +qemu_lockcnt_unlock, but you also need to ensure that the count +is zero (i.e. there is no concurrent visit). Because qemu_lockcnt_inc +takes the QemuLockCnt's lock, the count cannot become non-zero while +the object is being freed. Freeing an object looks like this: + + qemu_lockcnt_lock(&xyz_lockcnt); + if (!qemu_lockcnt_count(&xyz_lockcnt)) { + g_free(xyz); + xyz = NULL; + } + qemu_lockcnt_unlock(&xyz_lockcnt); + +If an object has to be freed right after a visit, you can combine +the decrement, the locking and the check on count as follows: + + qemu_lockcnt_inc(&xyz_lockcnt); + if (xyz) { + XYZ *p = atomic_rcu_read(&xyz); + ... + /* Accesses can now be done through "p". */ + } + if (qemu_lockcnt_dec_and_lock(&xyz_lockcnt)) { + g_free(xyz); + xyz = NULL; + qemu_lockcnt_unlock(&xyz_lockcnt); + } + +QemuLockCnt can also be used to access a list as follows: + + qemu_lockcnt_inc(&io_handlers_lockcnt); + QLIST_FOREACH_RCU(ioh, &io_handlers, pioh) { + if (ioh->revents & G_IO_OUT) { + ioh->fd_write(ioh->opaque); + } + } + + if (qemu_lockcnt_dec_and_lock(&io_handlers_lockcnt)) { + QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { + if (ioh->deleted) { + QLIST_REMOVE(ioh, next); + g_free(ioh); + } + } + qemu_lockcnt_unlock(&io_handlers_lockcnt); + } + +Again, the RCU primitives are used because new items can be added to the +list during the walk. QLIST_FOREACH_RCU ensures that the processor and +the compiler see the appropriate memory barriers. + +An alternative pattern uses qemu_lockcnt_dec_if_lock: + + qemu_lockcnt_inc(&io_handlers_lockcnt); + QLIST_FOREACH_SAFE_RCU(ioh, &io_handlers, next, pioh) { + if (ioh->deleted) { + if (qemu_lockcnt_dec_if_lock(&io_handlers_lockcnt)) { + QLIST_REMOVE(ioh, next); + g_free(ioh); + qemu_lockcnt_inc_and_unlock(&io_handlers_lockcnt); + } + } else { + if (ioh->revents & G_IO_OUT) { + ioh->fd_write(ioh->opaque); + } + } + } + qemu_lockcnt_dec(&io_handlers_lockcnt); + +Here you can use qemu_lockcnt_dec instead of qemu_lockcnt_dec_and_lock, +because there is no special task to do if the count goes from 1 to 0. diff --git a/docs/devel/memory.txt b/docs/devel/memory.txt new file mode 100644 index 0000000000..811b1bd3c5 --- /dev/null +++ b/docs/devel/memory.txt @@ -0,0 +1,316 @@ +The memory API +============== + +The memory API models the memory and I/O buses and controllers of a QEMU +machine. It attempts to allow modelling of: + + - ordinary RAM + - memory-mapped I/O (MMIO) + - memory controllers that can dynamically reroute physical memory regions + to different destinations + +The memory model provides support for + + - tracking RAM changes by the guest + - setting up coalesced memory for kvm + - setting up ioeventfd regions for kvm + +Memory is modelled as an acyclic graph of MemoryRegion objects. Sinks +(leaves) are RAM and MMIO regions, while other nodes represent +buses, memory controllers, and memory regions that have been rerouted. + +In addition to MemoryRegion objects, the memory API provides AddressSpace +objects for every root and possibly for intermediate MemoryRegions too. +These represent memory as seen from the CPU or a device's viewpoint. + +Types of regions +---------------- + +There are multiple types of memory regions (all represented by a single C type +MemoryRegion): + +- RAM: a RAM region is simply a range of host memory that can be made available + to the guest. + You typically initialize these with memory_region_init_ram(). Some special + purposes require the variants memory_region_init_resizeable_ram(), + memory_region_init_ram_from_file(), or memory_region_init_ram_ptr(). + +- MMIO: a range of guest memory that is implemented by host callbacks; + each read or write causes a callback to be called on the host. + You initialize these with memory_region_init_io(), passing it a + MemoryRegionOps structure describing the callbacks. + +- ROM: a ROM memory region works like RAM for reads (directly accessing + a region of host memory), and forbids writes. You initialize these with + memory_region_init_rom(). + +- ROM device: a ROM device memory region works like RAM for reads + (directly accessing a region of host memory), but like MMIO for + writes (invoking a callback). You initialize these with + memory_region_init_rom_device(). + +- IOMMU region: an IOMMU region translates addresses of accesses made to it + and forwards them to some other target memory region. As the name suggests, + these are only needed for modelling an IOMMU, not for simple devices. + You initialize these with memory_region_init_iommu(). + +- container: a container simply includes other memory regions, each at + a different offset. Containers are useful for grouping several regions + into one unit. For example, a PCI BAR may be composed of a RAM region + and an MMIO region. + + A container's subregions are usually non-overlapping. In some cases it is + useful to have overlapping regions; for example a memory controller that + can overlay a subregion of RAM with MMIO or ROM, or a PCI controller + that does not prevent card from claiming overlapping BARs. + + You initialize a pure container with memory_region_init(). + +- alias: a subsection of another region. Aliases allow a region to be + split apart into discontiguous regions. Examples of uses are memory banks + used when the guest address space is smaller than the amount of RAM + addressed, or a memory controller that splits main memory to expose a "PCI + hole". Aliases may point to any type of region, including other aliases, + but an alias may not point back to itself, directly or indirectly. + You initialize these with memory_region_init_alias(). + +- reservation region: a reservation region is primarily for debugging. + It claims I/O space that is not supposed to be handled by QEMU itself. + The typical use is to track parts of the address space which will be + handled by the host kernel when KVM is enabled. + You initialize these with memory_region_init_reservation(), or by + passing a NULL callback parameter to memory_region_init_io(). + +It is valid to add subregions to a region which is not a pure container +(that is, to an MMIO, RAM or ROM region). This means that the region +will act like a container, except that any addresses within the container's +region which are not claimed by any subregion are handled by the +container itself (ie by its MMIO callbacks or RAM backing). However +it is generally possible to achieve the same effect with a pure container +one of whose subregions is a low priority "background" region covering +the whole address range; this is often clearer and is preferred. +Subregions cannot be added to an alias region. + +Region names +------------ + +Regions are assigned names by the constructor. For most regions these are +only used for debugging purposes, but RAM regions also use the name to identify +live migration sections. This means that RAM region names need to have ABI +stability. + +Region lifecycle +---------------- + +A region is created by one of the memory_region_init*() functions and +attached to an object, which acts as its owner or parent. QEMU ensures +that the owner object remains alive as long as the region is visible to +the guest, or as long as the region is in use by a virtual CPU or another +device. For example, the owner object will not die between an +address_space_map operation and the corresponding address_space_unmap. + +After creation, a region can be added to an address space or a +container with memory_region_add_subregion(), and removed using +memory_region_del_subregion(). + +Various region attributes (read-only, dirty logging, coalesced mmio, +ioeventfd) can be changed during the region lifecycle. They take effect +as soon as the region is made visible. This can be immediately, later, +or never. + +Destruction of a memory region happens automatically when the owner +object dies. + +If however the memory region is part of a dynamically allocated data +structure, you should call object_unparent() to destroy the memory region +before the data structure is freed. For an example see VFIOMSIXInfo +and VFIOQuirk in hw/vfio/pci.c. + +You must not destroy a memory region as long as it may be in use by a +device or CPU. In order to do this, as a general rule do not create or +destroy memory regions dynamically during a device's lifetime, and only +call object_unparent() in the memory region owner's instance_finalize +callback. The dynamically allocated data structure that contains the +memory region then should obviously be freed in the instance_finalize +callback as well. + +If you break this rule, the following situation can happen: + +- the memory region's owner had a reference taken via memory_region_ref + (for example by address_space_map) + +- the region is unparented, and has no owner anymore + +- when address_space_unmap is called, the reference to the memory region's + owner is leaked. + + +There is an exception to the above rule: it is okay to call +object_unparent at any time for an alias or a container region. It is +therefore also okay to create or destroy alias and container regions +dynamically during a device's lifetime. + +This exceptional usage is valid because aliases and containers only help +QEMU building the guest's memory map; they are never accessed directly. +memory_region_ref and memory_region_unref are never called on aliases +or containers, and the above situation then cannot happen. Exploiting +this exception is rarely necessary, and therefore it is discouraged, +but nevertheless it is used in a few places. + +For regions that "have no owner" (NULL is passed at creation time), the +machine object is actually used as the owner. Since instance_finalize is +never called for the machine object, you must never call object_unparent +on regions that have no owner, unless they are aliases or containers. + + +Overlapping regions and priority +-------------------------------- +Usually, regions may not overlap each other; a memory address decodes into +exactly one target. In some cases it is useful to allow regions to overlap, +and sometimes to control which of an overlapping regions is visible to the +guest. This is done with memory_region_add_subregion_overlap(), which +allows the region to overlap any other region in the same container, and +specifies a priority that allows the core to decide which of two regions at +the same address are visible (highest wins). +Priority values are signed, and the default value is zero. This means that +you can use memory_region_add_subregion_overlap() both to specify a region +that must sit 'above' any others (with a positive priority) and also a +background region that sits 'below' others (with a negative priority). + +If the higher priority region in an overlap is a container or alias, then +the lower priority region will appear in any "holes" that the higher priority +region has left by not mapping subregions to that area of its address range. +(This applies recursively -- if the subregions are themselves containers or +aliases that leave holes then the lower priority region will appear in these +holes too.) + +For example, suppose we have a container A of size 0x8000 with two subregions +B and C. B is a container mapped at 0x2000, size 0x4000, priority 2; C is +an MMIO region mapped at 0x0, size 0x6000, priority 1. B currently has two +of its own subregions: D of size 0x1000 at offset 0 and E of size 0x1000 at +offset 0x2000. As a diagram: + + 0 1000 2000 3000 4000 5000 6000 7000 8000 + |------|------|------|------|------|------|------|------| + A: [ ] + C: [CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC] + B: [ ] + D: [DDDDD] + E: [EEEEE] + +The regions that will be seen within this address range then are: + [CCCCCCCCCCCC][DDDDD][CCCCC][EEEEE][CCCCC] + +Since B has higher priority than C, its subregions appear in the flat map +even where they overlap with C. In ranges where B has not mapped anything +C's region appears. + +If B had provided its own MMIO operations (ie it was not a pure container) +then these would be used for any addresses in its range not handled by +D or E, and the result would be: + [CCCCCCCCCCCC][DDDDD][BBBBB][EEEEE][BBBBB] + +Priority values are local to a container, because the priorities of two +regions are only compared when they are both children of the same container. +This means that the device in charge of the container (typically modelling +a bus or a memory controller) can use them to manage the interaction of +its child regions without any side effects on other parts of the system. +In the example above, the priorities of D and E are unimportant because +they do not overlap each other. It is the relative priority of B and C +that causes D and E to appear on top of C: D and E's priorities are never +compared against the priority of C. + +Visibility +---------- +The memory core uses the following rules to select a memory region when the +guest accesses an address: + +- all direct subregions of the root region are matched against the address, in + descending priority order + - if the address lies outside the region offset/size, the subregion is + discarded + - if the subregion is a leaf (RAM or MMIO), the search terminates, returning + this leaf region + - if the subregion is a container, the same algorithm is used within the + subregion (after the address is adjusted by the subregion offset) + - if the subregion is an alias, the search is continued at the alias target + (after the address is adjusted by the subregion offset and alias offset) + - if a recursive search within a container or alias subregion does not + find a match (because of a "hole" in the container's coverage of its + address range), then if this is a container with its own MMIO or RAM + backing the search terminates, returning the container itself. Otherwise + we continue with the next subregion in priority order +- if none of the subregions match the address then the search terminates + with no match found + +Example memory map +------------------ + +system_memory: container@0-2^48-1 + | + +---- lomem: alias@0-0xdfffffff ---> #ram (0-0xdfffffff) + | + +---- himem: alias@0x100000000-0x11fffffff ---> #ram (0xe0000000-0xffffffff) + | + +---- vga-window: alias@0xa0000-0xbffff ---> #pci (0xa0000-0xbffff) + | (prio 1) + | + +---- pci-hole: alias@0xe0000000-0xffffffff ---> #pci (0xe0000000-0xffffffff) + +pci (0-2^32-1) + | + +--- vga-area: container@0xa0000-0xbffff + | | + | +--- alias@0x00000-0x7fff ---> #vram (0x010000-0x017fff) + | | + | +--- alias@0x08000-0xffff ---> #vram (0x020000-0x027fff) + | + +---- vram: ram@0xe1000000-0xe1ffffff + | + +---- vga-mmio: mmio@0xe2000000-0xe200ffff + +ram: ram@0x00000000-0xffffffff + +This is a (simplified) PC memory map. The 4GB RAM block is mapped into the +system address space via two aliases: "lomem" is a 1:1 mapping of the first +3.5GB; "himem" maps the last 0.5GB at address 4GB. This leaves 0.5GB for the +so-called PCI hole, that allows a 32-bit PCI bus to exist in a system with +4GB of memory. + +The memory controller diverts addresses in the range 640K-768K to the PCI +address space. This is modelled using the "vga-window" alias, mapped at a +higher priority so it obscures the RAM at the same addresses. The vga window +can be removed by programming the memory controller; this is modelled by +removing the alias and exposing the RAM underneath. + +The pci address space is not a direct child of the system address space, since +we only want parts of it to be visible (we accomplish this using aliases). +It has two subregions: vga-area models the legacy vga window and is occupied +by two 32K memory banks pointing at two sections of the framebuffer. +In addition the vram is mapped as a BAR at address e1000000, and an additional +BAR containing MMIO registers is mapped after it. + +Note that if the guest maps a BAR outside the PCI hole, it would not be +visible as the pci-hole alias clips it to a 0.5GB range. + +MMIO Operations +--------------- + +MMIO regions are provided with ->read() and ->write() callbacks; in addition +various constraints can be supplied to control how these callbacks are called: + + - .valid.min_access_size, .valid.max_access_size define the access sizes + (in bytes) which the device accepts; accesses outside this range will + have device and bus specific behaviour (ignored, or machine check) + - .valid.unaligned specifies that the *device being modelled* supports + unaligned accesses; if false, unaligned accesses will invoke the + appropriate bus or CPU specific behaviour. + - .impl.min_access_size, .impl.max_access_size define the access sizes + (in bytes) supported by the *implementation*; other access sizes will be + emulated using the ones available. For example a 4-byte write will be + emulated using four 1-byte writes, if .impl.max_access_size = 1. + - .impl.unaligned specifies that the *implementation* supports unaligned + accesses; if false, unaligned accesses will be emulated by two aligned + accesses. + - .old_mmio eases the porting of code that was formerly using + cpu_register_io_memory(). It should not be used in new code. diff --git a/docs/devel/migration.txt b/docs/devel/migration.txt new file mode 100644 index 0000000000..1b940a829b --- /dev/null +++ b/docs/devel/migration.txt @@ -0,0 +1,555 @@ += Migration = + +QEMU has code to load/save the state of the guest that it is running. +These are two complementary operations. Saving the state just does +that, saves the state for each device that the guest is running. +Restoring a guest is just the opposite operation: we need to load the +state of each device. + +For this to work, QEMU has to be launched with the same arguments the +two times. I.e. it can only restore the state in one guest that has +the same devices that the one it was saved (this last requirement can +be relaxed a bit, but for now we can consider that configuration has +to be exactly the same). + +Once that we are able to save/restore a guest, a new functionality is +requested: migration. This means that QEMU is able to start in one +machine and being "migrated" to another machine. I.e. being moved to +another machine. + +Next was the "live migration" functionality. This is important +because some guests run with a lot of state (specially RAM), and it +can take a while to move all state from one machine to another. Live +migration allows the guest to continue running while the state is +transferred. Only while the last part of the state is transferred has +the guest to be stopped. Typically the time that the guest is +unresponsive during live migration is the low hundred of milliseconds +(notice that this depends on a lot of things). + +=== Types of migration === + +Now that we have talked about live migration, there are several ways +to do migration: + +- tcp migration: do the migration using tcp sockets +- unix migration: do the migration using unix sockets +- exec migration: do the migration using the stdin/stdout through a process. +- fd migration: do the migration using an file descriptor that is + passed to QEMU. QEMU doesn't care how this file descriptor is opened. + +All these four migration protocols use the same infrastructure to +save/restore state devices. This infrastructure is shared with the +savevm/loadvm functionality. + +=== State Live Migration === + +This is used for RAM and block devices. It is not yet ported to vmstate. + + +=== What is the common infrastructure === + +QEMU uses a QEMUFile abstraction to be able to do migration. Any type +of migration that wants to use QEMU infrastructure has to create a +QEMUFile with: + +QEMUFile *qemu_fopen_ops(void *opaque, + QEMUFilePutBufferFunc *put_buffer, + QEMUFileGetBufferFunc *get_buffer, + QEMUFileCloseFunc *close); + +The functions have the following functionality: + +This function writes a chunk of data to a file at the given position. +The pos argument can be ignored if the file is only used for +streaming. The handler should try to write all of the data it can. + +typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf, + int64_t pos, int size); + +Read a chunk of data from a file at the given position. The pos argument +can be ignored if the file is only be used for streaming. The number of +bytes actually read should be returned. + +typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf, + int64_t pos, int size); + +Close a file and return an error code. + +typedef int (QEMUFileCloseFunc)(void *opaque); + +You can use any internal state that you need using the opaque void * +pointer that is passed to all functions. + +The important functions for us are put_buffer()/get_buffer() that +allow to write/read a buffer into the QEMUFile. + +=== How to save the state of one device === + +The state of a device is saved using intermediate buffers. There are +some helper functions to assist this saving. + +There is a new concept that we have to explain here: device state +version. When we migrate a device, we save/load the state as a series +of fields. Some times, due to bugs or new functionality, we need to +change the state to store more/different information. We use the +version to identify each time that we do a change. Each version is +associated with a series of fields saved. The save_state always saves +the state as the newer version. But load_state sometimes is able to +load state from an older version. + +=== Legacy way === + +This way is going to disappear as soon as all current users are ported to VMSTATE. + +Each device has to register two functions, one to save the state and +another to load the state back. + +int register_savevm(DeviceState *dev, + const char *idstr, + int instance_id, + int version_id, + SaveStateHandler *save_state, + LoadStateHandler *load_state, + void *opaque); + +typedef void SaveStateHandler(QEMUFile *f, void *opaque); +typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id); + +The important functions for the device state format are the save_state +and load_state. Notice that load_state receives a version_id +parameter to know what state format is receiving. save_state doesn't +have a version_id parameter because it always uses the latest version. + +=== VMState === + +The legacy way of saving/loading state of the device had the problem +that we have to maintain two functions in sync. If we did one change +in one of them and not in the other, we would get a failed migration. + +VMState changed the way that state is saved/loaded. Instead of using +a function to save the state and another to load it, it was changed to +a declarative way of what the state consisted of. Now VMState is able +to interpret that definition to be able to load/save the state. As +the state is declared only once, it can't go out of sync in the +save/load functions. + +An example (from hw/input/pckbd.c) + +static const VMStateDescription vmstate_kbd = { + .name = "pckbd", + .version_id = 3, + .minimum_version_id = 3, + .fields = (VMStateField[]) { + VMSTATE_UINT8(write_cmd, KBDState), + VMSTATE_UINT8(status, KBDState), + VMSTATE_UINT8(mode, KBDState), + VMSTATE_UINT8(pending, KBDState), + VMSTATE_END_OF_LIST() + } +}; + +We are declaring the state with name "pckbd". +The version_id is 3, and the fields are 4 uint8_t in a KBDState structure. +We registered this with: + + vmstate_register(NULL, 0, &vmstate_kbd, s); + +Note: talk about how vmstate <-> qdev interact, and what the instance ids mean. + +You can search for VMSTATE_* macros for lots of types used in QEMU in +include/hw/hw.h. + +=== More about versions === + +Version numbers are intended for major incompatible changes to the +migration of a device, and using them breaks backwards-migration +compatibility; in general most changes can be made by adding Subsections +(see below) or _TEST macros (see below) which won't break compatibility. + +You can see that there are several version fields: + +- version_id: the maximum version_id supported by VMState for that device. +- minimum_version_id: the minimum version_id that VMState is able to understand + for that device. +- minimum_version_id_old: For devices that were not able to port to vmstate, we can + assign a function that knows how to read this old state. This field is + ignored if there is no load_state_old handler. + +So, VMState is able to read versions from minimum_version_id to +version_id. And the function load_state_old() (if present) is able to +load state from minimum_version_id_old to minimum_version_id. This +function is deprecated and will be removed when no more users are left. + +Saving state will always create a section with the 'version_id' value +and thus can't be loaded by any older QEMU. + +=== Massaging functions === + +Sometimes, it is not enough to be able to save the state directly +from one structure, we need to fill the correct values there. One +example is when we are using kvm. Before saving the cpu state, we +need to ask kvm to copy to QEMU the state that it is using. And the +opposite when we are loading the state, we need a way to tell kvm to +load the state for the cpu that we have just loaded from the QEMUFile. + +The functions to do that are inside a vmstate definition, and are called: + +- int (*pre_load)(void *opaque); + + This function is called before we load the state of one device. + +- int (*post_load)(void *opaque, int version_id); + + This function is called after we load the state of one device. + +- void (*pre_save)(void *opaque); + + This function is called before we save the state of one device. + +Example: You can look at hpet.c, that uses the three function to + massage the state that is transferred. + +If you use memory API functions that update memory layout outside +initialization (i.e., in response to a guest action), this is a strong +indication that you need to call these functions in a post_load callback. +Examples of such memory API functions are: + + - memory_region_add_subregion() + - memory_region_del_subregion() + - memory_region_set_readonly() + - memory_region_set_enabled() + - memory_region_set_address() + - memory_region_set_alias_offset() + +=== Subsections === + +The use of version_id allows to be able to migrate from older versions +to newer versions of a device. But not the other way around. This +makes very complicated to fix bugs in stable branches. If we need to +add anything to the state to fix a bug, we have to disable migration +to older versions that don't have that bug-fix (i.e. a new field). + +But sometimes, that bug-fix is only needed sometimes, not always. For +instance, if the device is in the middle of a DMA operation, it is +using a specific functionality, .... + +It is impossible to create a way to make migration from any version to +any other version to work. But we can do better than only allowing +migration from older versions to newer ones. For that fields that are +only needed sometimes, we add the idea of subsections. A subsection +is "like" a device vmstate, but with a particularity, it has a Boolean +function that tells if that values are needed to be sent or not. If +this functions returns false, the subsection is not sent. + +On the receiving side, if we found a subsection for a device that we +don't understand, we just fail the migration. If we understand all +the subsections, then we load the state with success. + +One important note is that the post_load() function is called "after" +loading all subsections, because a newer subsection could change same +value that it uses. + +Example: + +static bool ide_drive_pio_state_needed(void *opaque) +{ + IDEState *s = opaque; + + return ((s->status & DRQ_STAT) != 0) + || (s->bus->error_status & BM_STATUS_PIO_RETRY); +} + +const VMStateDescription vmstate_ide_drive_pio_state = { + .name = "ide_drive/pio_state", + .version_id = 1, + .minimum_version_id = 1, + .pre_save = ide_drive_pio_pre_save, + .post_load = ide_drive_pio_post_load, + .needed = ide_drive_pio_state_needed, + .fields = (VMStateField[]) { + VMSTATE_INT32(req_nb_sectors, IDEState), + VMSTATE_VARRAY_INT32(io_buffer, IDEState, io_buffer_total_len, 1, + vmstate_info_uint8, uint8_t), + VMSTATE_INT32(cur_io_buffer_offset, IDEState), + VMSTATE_INT32(cur_io_buffer_len, IDEState), + VMSTATE_UINT8(end_transfer_fn_idx, IDEState), + VMSTATE_INT32(elementary_transfer_size, IDEState), + VMSTATE_INT32(packet_transfer_size, IDEState), + VMSTATE_END_OF_LIST() + } +}; + +const VMStateDescription vmstate_ide_drive = { + .name = "ide_drive", + .version_id = 3, + .minimum_version_id = 0, + .post_load = ide_drive_post_load, + .fields = (VMStateField[]) { + .... several fields .... + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription*[]) { + &vmstate_ide_drive_pio_state, + NULL + } +}; + +Here we have a subsection for the pio state. We only need to +save/send this state when we are in the middle of a pio operation +(that is what ide_drive_pio_state_needed() checks). If DRQ_STAT is +not enabled, the values on that fields are garbage and don't need to +be sent. + +Using a condition function that checks a 'property' to determine whether +to send a subsection allows backwards migration compatibility when +new subsections are added. + +For example; + a) Add a new property using DEFINE_PROP_BOOL - e.g. support-foo and + default it to true. + b) Add an entry to the HW_COMPAT_ for the previous version + that sets the property to false. + c) Add a static bool support_foo function that tests the property. + d) Add a subsection with a .needed set to the support_foo function + e) (potentially) Add a pre_load that sets up a default value for 'foo' + to be used if the subsection isn't loaded. + +Now that subsection will not be generated when using an older +machine type and the migration stream will be accepted by older +QEMU versions. pre-load functions can be used to initialise state +on the newer version so that they default to suitable values +when loading streams created by older QEMU versions that do not +generate the subsection. + +In some cases subsections are added for data that had been accidentally +omitted by earlier versions; if the missing data causes the migration +process to succeed but the guest to behave badly then it may be better +to send the subsection and cause the migration to explicitly fail +with the unknown subsection error. If the bad behaviour only happens +with certain data values, making the subsection conditional on +the data value (rather than the machine type) allows migrations to succeed +in most cases. In general the preference is to tie the subsection to +the machine type, and allow reliable migrations, unless the behaviour +from omission of the subsection is really bad. + += Not sending existing elements = + +Sometimes members of the VMState are no longer needed; + removing them will break migration compatibility + making them version dependent and bumping the version will break backwards + migration compatibility. + +The best way is to: + a) Add a new property/compatibility/function in the same way for subsections + above. + b) replace the VMSTATE macro with the _TEST version of the macro, e.g.: + VMSTATE_UINT32(foo, barstruct) + becomes + VMSTATE_UINT32_TEST(foo, barstruct, pre_version_baz) + + Sometime in the future when we no longer care about the ancient +versions these can be killed off. + += Return path = + +In most migration scenarios there is only a single data path that runs +from the source VM to the destination, typically along a single fd (although +possibly with another fd or similar for some fast way of throwing pages across). + +However, some uses need two way communication; in particular the Postcopy +destination needs to be able to request pages on demand from the source. + +For these scenarios there is a 'return path' from the destination to the source; +qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return +path. + + Source side + Forward path - written by migration thread + Return path - opened by main thread, read by return-path thread + + Destination side + Forward path - read by main thread + Return path - opened by main thread, written by main thread AND postcopy + thread (protected by rp_mutex) + += Postcopy = +'Postcopy' migration is a way to deal with migrations that refuse to converge +(or take too long to converge) its plus side is that there is an upper bound on +the amount of migration traffic and time it takes, the down side is that during +the postcopy phase, a failure of *either* side or the network connection causes +the guest to be lost. + +In postcopy the destination CPUs are started before all the memory has been +transferred, and accesses to pages that are yet to be transferred cause +a fault that's translated by QEMU into a request to the source QEMU. + +Postcopy can be combined with precopy (i.e. normal migration) so that if precopy +doesn't finish in a given time the switch is made to postcopy. + +=== Enabling postcopy === + +To enable postcopy, issue this command on the monitor prior to the +start of migration: + +migrate_set_capability postcopy-ram on + +The normal commands are then used to start a migration, which is still +started in precopy mode. Issuing: + +migrate_start_postcopy + +will now cause the transition from precopy to postcopy. +It can be issued immediately after migration is started or any +time later on. Issuing it after the end of a migration is harmless. + +Note: During the postcopy phase, the bandwidth limits set using +migrate_set_speed is ignored (to avoid delaying requested pages that +the destination is waiting for). + +=== Postcopy device transfer === + +Loading of device data may cause the device emulation to access guest RAM +that may trigger faults that have to be resolved by the source, as such +the migration stream has to be able to respond with page data *during* the +device load, and hence the device data has to be read from the stream completely +before the device load begins to free the stream up. This is achieved by +'packaging' the device data into a blob that's read in one go. + +Source behaviour + +Until postcopy is entered the migration stream is identical to normal +precopy, except for the addition of a 'postcopy advise' command at +the beginning, to tell the destination that postcopy might happen. +When postcopy starts the source sends the page discard data and then +forms the 'package' containing: + + Command: 'postcopy listen' + The device state + A series of sections, identical to the precopy streams device state stream + containing everything except postcopiable devices (i.e. RAM) + Command: 'postcopy run' + +The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the +contents are formatted in the same way as the main migration stream. + +During postcopy the source scans the list of dirty pages and sends them +to the destination without being requested (in much the same way as precopy), +however when a page request is received from the destination, the dirty page +scanning restarts from the requested location. This causes requested pages +to be sent quickly, and also causes pages directly after the requested page +to be sent quickly in the hope that those pages are likely to be used +by the destination soon. + +Destination behaviour + +Initially the destination looks the same as precopy, with a single thread +reading the migration stream; the 'postcopy advise' and 'discard' commands +are processed to change the way RAM is managed, but don't affect the stream +processing. + +------------------------------------------------------------------------------ + 1 2 3 4 5 6 7 +main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) +thread | | + | (page request) + | \___ + v \ +listen thread: --- page -- page -- page -- page -- page -- + + a b c +------------------------------------------------------------------------------ + +On receipt of CMD_PACKAGED (1) + All the data associated with the package - the ( ... ) section in the +diagram - is read into memory, and the main thread recurses into +qemu_loadvm_state_main to process the contents of the package (2) +which contains commands (3,6) and devices (4...) + +On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) +a new thread (a) is started that takes over servicing the migration stream, +while the main thread carries on loading the package. It loads normal +background page data (b) but if during a device load a fault happens (5) the +returned page (c) is loaded by the listen thread allowing the main threads +device load to carry on. + +The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination +CPUs start running. +At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour +and is no longer used by migration, while the listen thread carries +on servicing page data until the end of migration. + +=== Postcopy states === + +Postcopy moves through a series of states (see postcopy_state) from +ADVISE->DISCARD->LISTEN->RUNNING->END + + Advise: Set at the start of migration if postcopy is enabled, even + if it hasn't had the start command; here the destination + checks that its OS has the support needed for postcopy, and performs + setup to ensure the RAM mappings are suitable for later postcopy. + The destination will fail early in migration at this point if the + required OS support is not present. + (Triggered by reception of POSTCOPY_ADVISE command) + + Discard: Entered on receipt of the first 'discard' command; prior to + the first Discard being performed, hugepages are switched off + (using madvise) to ensure that no new huge pages are created + during the postcopy phase, and to cause any huge pages that + have discards on them to be broken. + + Listen: The first command in the package, POSTCOPY_LISTEN, switches + the destination state to Listen, and starts a new thread + (the 'listen thread') which takes over the job of receiving + pages off the migration stream, while the main thread carries + on processing the blob. With this thread able to process page + reception, the destination now 'sensitises' the RAM to detect + any access to missing pages (on Linux using the 'userfault' + system). + + Running: POSTCOPY_RUN causes the destination to synchronise all + state and start the CPUs and IO devices running. The main + thread now finishes processing the migration package and + now carries on as it would for normal precopy migration + (although it can't do the cleanup it would do as it + finishes a normal migration). + + End: The listen thread can now quit, and perform the cleanup of migration + state, the migration is now complete. + +=== Source side page maps === + +The source side keeps two bitmaps during postcopy; 'the migration bitmap' +and 'unsent map'. The 'migration bitmap' is basically the same as in +the precopy case, and holds a bit to indicate that page is 'dirty' - +i.e. needs sending. During the precopy phase this is updated as the CPU +dirties pages, however during postcopy the CPUs are stopped and nothing +should dirty anything any more. + +The 'unsent map' is used for the transition to postcopy. It is a bitmap that +has a bit cleared whenever a page is sent to the destination, however during +the transition to postcopy mode it is combined with the migration bitmap +to form a set of pages that: + a) Have been sent but then redirtied (which must be discarded) + b) Have not yet been sent - which also must be discarded to cause any + transparent huge pages built during precopy to be broken. + +Note that the contents of the unsentmap are sacrificed during the calculation +of the discard set and thus aren't valid once in postcopy. The dirtymap +is still valid and is used to ensure that no page is sent more than once. Any +request for a page that has already been sent is ignored. Duplicate requests +such as this can happen as a page is sent at about the same time the +destination accesses it. + +=== Postcopy with hugepages === + +Postcopy now works with hugetlbfs backed memory: + a) The linux kernel on the destination must support userfault on hugepages. + b) The huge-page configuration on the source and destination VMs must be + identical; i.e. RAMBlocks on both sides must use the same page size. + c) Note that -mem-path /dev/hugepages will fall back to allocating normal + RAM if it doesn't have enough hugepages, triggering (b) to fail. + Using -mem-prealloc enforces the allocation using hugepages. + d) Care should be taken with the size of hugepage used; postcopy with 2MB + hugepages works well, however 1GB hugepages are likely to be problematic + since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, + and until the full page is transferred the destination thread is blocked. diff --git a/docs/devel/multi-thread-tcg.txt b/docs/devel/multi-thread-tcg.txt new file mode 100644 index 0000000000..a99b4564c6 --- /dev/null +++ b/docs/devel/multi-thread-tcg.txt @@ -0,0 +1,350 @@ +Copyright (c) 2015-2016 Linaro Ltd. + +This work is licensed under the terms of the GNU GPL, version 2 or +later. See the COPYING file in the top-level directory. + +Introduction +============ + +This document outlines the design for multi-threaded TCG system-mode +emulation. The current user-mode emulation mirrors the thread +structure of the translated executable. Some of the work will be +applicable to both system and linux-user emulation. + +The original system-mode TCG implementation was single threaded and +dealt with multiple CPUs with simple round-robin scheduling. This +simplified a lot of things but became increasingly limited as systems +being emulated gained additional cores and per-core performance gains +for host systems started to level off. + +vCPU Scheduling +=============== + +We introduce a new running mode where each vCPU will run on its own +user-space thread. This will be enabled by default for all FE/BE +combinations that have had the required work done to support this +safely. + +In the general case of running translated code there should be no +inter-vCPU dependencies and all vCPUs should be able to run at full +speed. Synchronisation will only be required while accessing internal +shared data structures or when the emulated architecture requires a +coherent representation of the emulated machine state. + +Shared Data Structures +====================== + +Main Run Loop +------------- + +Even when there is no code being generated there are a number of +structures associated with the hot-path through the main run-loop. +These are associated with looking up the next translation block to +execute. These include: + + tb_jmp_cache (per-vCPU, cache of recent jumps) + tb_ctx.htable (global hash table, phys address->tb lookup) + +As TB linking only occurs when blocks are in the same page this code +is critical to performance as looking up the next TB to execute is the +most common reason to exit the generated code. + +DESIGN REQUIREMENT: Make access to lookup structures safe with +multiple reader/writer threads. Minimise any lock contention to do it. + +The hot-path avoids using locks where possible. The tb_jmp_cache is +updated with atomic accesses to ensure consistent results. The fall +back QHT based hash table is also designed for lockless lookups. Locks +are only taken when code generation is required or TranslationBlocks +have their block-to-block jumps patched. + +Global TCG State +---------------- + +We need to protect the entire code generation cycle including any post +generation patching of the translated code. This also implies a shared +translation buffer which contains code running on all cores. Any +execution path that comes to the main run loop will need to hold a +mutex for code generation. This also includes times when we need flush +code or entries from any shared lookups/caches. Structures held on a +per-vCPU basis won't need locking unless other vCPUs will need to +modify them. + +DESIGN REQUIREMENT: Add locking around all code generation and TB +patching. + +(Current solution) + +Mainly as part of the linux-user work all code generation is +serialised with a tb_lock(). For the SoftMMU tb_lock() also takes the +place of mmap_lock() in linux-user. + +Translation Blocks +------------------ + +Currently the whole system shares a single code generation buffer +which when full will force a flush of all translations and start from +scratch again. Some operations also force a full flush of translations +including: + + - debugging operations (breakpoint insertion/removal) + - some CPU helper functions + +This is done with the async_safe_run_on_cpu() mechanism to ensure all +vCPUs are quiescent when changes are being made to shared global +structures. + +More granular translation invalidation events are typically due +to a change of the state of a physical page: + + - code modification (self modify code, patching code) + - page changes (new page mapping in linux-user mode) + +While setting the invalid flag in a TranslationBlock will stop it +being used when looked up in the hot-path there are a number of other +book-keeping structures that need to be safely cleared. + +Any TranslationBlocks which have been patched to jump directly to the +now invalid blocks need the jump patches reversing so they will return +to the C code. + +There are a number of look-up caches that need to be properly updated +including the: + + - jump lookup cache + - the physical-to-tb lookup hash table + - the global page table + +The global page table (l1_map) which provides a multi-level look-up +for PageDesc structures which contain pointers to the start of a +linked list of all Translation Blocks in that page (see page_next). + +Both the jump patching and the page cache involve linked lists that +the invalidated TranslationBlock needs to be removed from. + +DESIGN REQUIREMENT: Safely handle invalidation of TBs + - safely patch/revert direct jumps + - remove central PageDesc lookup entries + - ensure lookup caches/hashes are safely updated + +(Current solution) + +The direct jump themselves are updated atomically by the TCG +tb_set_jmp_target() code. Modification to the linked lists that allow +searching for linked pages are done under the protect of the +tb_lock(). + +The global page table is protected by the tb_lock() in system-mode and +mmap_lock() in linux-user mode. + +The lookup caches are updated atomically and the lookup hash uses QHT +which is designed for concurrent safe lookup. + + +Memory maps and TLBs +-------------------- + +The memory handling code is fairly critical to the speed of memory +access in the emulated system. The SoftMMU code is designed so the +hot-path can be handled entirely within translated code. This is +handled with a per-vCPU TLB structure which once populated will allow +a series of accesses to the page to occur without exiting the +translated code. It is possible to set flags in the TLB address which +will ensure the slow-path is taken for each access. This can be done +to support: + + - Memory regions (dividing up access to PIO, MMIO and RAM) + - Dirty page tracking (for code gen, SMC detection, migration and display) + - Virtual TLB (for translating guest address->real address) + +When the TLB tables are updated by a vCPU thread other than their own +we need to ensure it is done in a safe way so no inconsistent state is +seen by the vCPU thread. + +Some operations require updating a number of vCPUs TLBs at the same +time in a synchronised manner. + +DESIGN REQUIREMENTS: + + - TLB Flush All/Page + - can be across-vCPUs + - cross vCPU TLB flush may need other vCPU brought to halt + - change may need to be visible to the calling vCPU immediately + - TLB Flag Update + - usually cross-vCPU + - want change to be visible as soon as possible + - TLB Update (update a CPUTLBEntry, via tlb_set_page_with_attrs) + - This is a per-vCPU table - by definition can't race + - updated by its own thread when the slow-path is forced + +(Current solution) + +We have updated cputlb.c to defer operations when a cross-vCPU +operation with async_run_on_cpu() which ensures each vCPU sees a +coherent state when it next runs its work (in a few instructions +time). + +A new set up operations (tlb_flush_*_all_cpus) take an additional flag +which when set will force synchronisation by setting the source vCPUs +work as "safe work" and exiting the cpu run loop. This ensure by the +time execution restarts all flush operations have completed. + +TLB flag updates are all done atomically and are also protected by the +tb_lock() which is used by the functions that update the TLB in bulk. + +(Known limitation) + +Not really a limitation but the wait mechanism is overly strict for +some architectures which only need flushes completed by a barrier +instruction. This could be a future optimisation. + +Emulated hardware state +----------------------- + +Currently thanks to KVM work any access to IO memory is automatically +protected by the global iothread mutex, also known as the BQL (Big +Qemu Lock). Any IO region that doesn't use global mutex is expected to +do its own locking. + +However IO memory isn't the only way emulated hardware state can be +modified. Some architectures have model specific registers that +trigger hardware emulation features. Generally any translation helper +that needs to update more than a single vCPUs of state should take the +BQL. + +As the BQL, or global iothread mutex is shared across the system we +push the use of the lock as far down into the TCG code as possible to +minimise contention. + +(Current solution) + +MMIO access automatically serialises hardware emulation by way of the +BQL. Currently ARM targets serialise all ARM_CP_IO register accesses +and also defer the reset/startup of vCPUs to the vCPU context by way +of async_run_on_cpu(). + +Updates to interrupt state are also protected by the BQL as they can +often be cross vCPU. + +Memory Consistency +================== + +Between emulated guests and host systems there are a range of memory +consistency models. Even emulating weakly ordered systems on strongly +ordered hosts needs to ensure things like store-after-load re-ordering +can be prevented when the guest wants to. + +Memory Barriers +--------------- + +Barriers (sometimes known as fences) provide a mechanism for software +to enforce a particular ordering of memory operations from the point +of view of external observers (e.g. another processor core). They can +apply to any memory operations as well as just loads or stores. + +The Linux kernel has an excellent write-up on the various forms of +memory barrier and the guarantees they can provide [1]. + +Barriers are often wrapped around synchronisation primitives to +provide explicit memory ordering semantics. However they can be used +by themselves to provide safe lockless access by ensuring for example +a change to a signal flag will only be visible once the changes to +payload are. + +DESIGN REQUIREMENT: Add a new tcg_memory_barrier op + +This would enforce a strong load/store ordering so all loads/stores +complete at the memory barrier. On single-core non-SMP strongly +ordered backends this could become a NOP. + +Aside from explicit standalone memory barrier instructions there are +also implicit memory ordering semantics which comes with each guest +memory access instruction. For example all x86 load/stores come with +fairly strong guarantees of sequential consistency where as ARM has +special variants of load/store instructions that imply acquire/release +semantics. + +In the case of a strongly ordered guest architecture being emulated on +a weakly ordered host the scope for a heavy performance impact is +quite high. + +DESIGN REQUIREMENTS: Be efficient with use of memory barriers + - host systems with stronger implied guarantees can skip some barriers + - merge consecutive barriers to the strongest one + +(Current solution) + +The system currently has a tcg_gen_mb() which will add memory barrier +operations if code generation is being done in a parallel context. The +tcg_optimize() function attempts to merge barriers up to their +strongest form before any load/store operations. The solution was +originally developed and tested for linux-user based systems. All +backends have been converted to emit fences when required. So far the +following front-ends have been updated to emit fences when required: + + - target-i386 + - target-arm + - target-aarch64 + - target-alpha + - target-mips + +Memory Control and Maintenance +------------------------------ + +This includes a class of instructions for controlling system cache +behaviour. While QEMU doesn't model cache behaviour these instructions +are often seen when code modification has taken place to ensure the +changes take effect. + +Synchronisation Primitives +-------------------------- + +There are two broad types of synchronisation primitives found in +modern ISAs: atomic instructions and exclusive regions. + +The first type offer a simple atomic instruction which will guarantee +some sort of test and conditional store will be truly atomic w.r.t. +other cores sharing access to the memory. The classic example is the +x86 cmpxchg instruction. + +The second type offer a pair of load/store instructions which offer a +guarantee that an region of memory has not been touched between the +load and store instructions. An example of this is ARM's ldrex/strex +pair where the strex instruction will return a flag indicating a +successful store only if no other CPU has accessed the memory region +since the ldrex. + +Traditionally TCG has generated a series of operations that work +because they are within the context of a single translation block so +will have completed before another CPU is scheduled. However with +the ability to have multiple threads running to emulate multiple CPUs +we will need to explicitly expose these semantics. + +DESIGN REQUIREMENTS: + - Support classic atomic instructions + - Support load/store exclusive (or load link/store conditional) pairs + - Generic enough infrastructure to support all guest architectures +CURRENT OPEN QUESTIONS: + - How problematic is the ABA problem in general? + +(Current solution) + +The TCG provides a number of atomic helpers (tcg_gen_atomic_*) which +can be used directly or combined to emulate other instructions like +ARM's ldrex/strex instructions. While they are susceptible to the ABA +problem so far common guests have not implemented patterns where +this may be a problem - typically presenting a locking ABI which +assumes cmpxchg like semantics. + +The code also includes a fall-back for cases where multi-threaded TCG +ops can't work (e.g. guest atomic width > host atomic width). In this +case an EXCP_ATOMIC exit occurs and the instruction is emulated with +an exclusive lock which ensures all emulation is serialised. + +While the atomic helpers look good enough for now there may be a need +to look at solutions that can more closely model the guest +architectures semantics. + +========== + +[1] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/plain/Documentation/memory-barriers.txt diff --git a/docs/devel/multiple-iothreads.txt b/docs/devel/multiple-iothreads.txt new file mode 100644 index 0000000000..e4d340bbb7 --- /dev/null +++ b/docs/devel/multiple-iothreads.txt @@ -0,0 +1,137 @@ +Copyright (c) 2014 Red Hat Inc. + +This work is licensed under the terms of the GNU GPL, version 2 or later. See +the COPYING file in the top-level directory. + + +This document explains the IOThread feature and how to write code that runs +outside the QEMU global mutex. + +The main loop and IOThreads +--------------------------- +QEMU is an event-driven program that can do several things at once using an +event loop. The VNC server and the QMP monitor are both processed from the +same event loop, which monitors their file descriptors until they become +readable and then invokes a callback. + +The default event loop is called the main loop (see main-loop.c). It is +possible to create additional event loop threads using -object +iothread,id=my-iothread. + +Side note: The main loop and IOThread are both event loops but their code is +not shared completely. Sometimes it is useful to remember that although they +are conceptually similar they are currently not interchangeable. + +Why IOThreads are useful +------------------------ +IOThreads allow the user to control the placement of work. The main loop is a +scalability bottleneck on hosts with many CPUs. Work can be spread across +several IOThreads instead of just one main loop. When set up correctly this +can improve I/O latency and reduce jitter seen by the guest. + +The main loop is also deeply associated with the QEMU global mutex, which is a +scalability bottleneck in itself. vCPU threads and the main loop use the QEMU +global mutex to serialize execution of QEMU code. This mutex is necessary +because a lot of QEMU's code historically was not thread-safe. + +The fact that all I/O processing is done in a single main loop and that the +QEMU global mutex is contended by all vCPU threads and the main loop explain +why it is desirable to place work into IOThreads. + +The experimental virtio-blk data-plane implementation has been benchmarked and +shows these effects: +ftp://public.dhe.ibm.com/linux/pdfs/KVM_Virtualized_IO_Performance_Paper.pdf + +How to program for IOThreads +---------------------------- +The main difference between legacy code and new code that can run in an +IOThread is dealing explicitly with the event loop object, AioContext +(see include/block/aio.h). Code that only works in the main loop +implicitly uses the main loop's AioContext. Code that supports running +in IOThreads must be aware of its AioContext. + +AioContext supports the following services: + * File descriptor monitoring (read/write/error on POSIX hosts) + * Event notifiers (inter-thread signalling) + * Timers + * Bottom Halves (BH) deferred callbacks + +There are several old APIs that use the main loop AioContext: + * LEGACY qemu_aio_set_fd_handler() - monitor a file descriptor + * LEGACY qemu_aio_set_event_notifier() - monitor an event notifier + * LEGACY timer_new_ms() - create a timer + * LEGACY qemu_bh_new() - create a BH + * LEGACY qemu_aio_wait() - run an event loop iteration + +Since they implicitly work on the main loop they cannot be used in code that +runs in an IOThread. They might cause a crash or deadlock if called from an +IOThread since the QEMU global mutex is not held. + +Instead, use the AioContext functions directly (see include/block/aio.h): + * aio_set_fd_handler() - monitor a file descriptor + * aio_set_event_notifier() - monitor an event notifier + * aio_timer_new() - create a timer + * aio_bh_new() - create a BH + * aio_poll() - run an event loop iteration + +The AioContext can be obtained from the IOThread using +iothread_get_aio_context() or for the main loop using qemu_get_aio_context(). +Code that takes an AioContext argument works both in IOThreads or the main +loop, depending on which AioContext instance the caller passes in. + +How to synchronize with an IOThread +----------------------------------- +AioContext is not thread-safe so some rules must be followed when using file +descriptors, event notifiers, timers, or BHs across threads: + +1. AioContext functions can always be called safely. They handle their +own locking internally. + +2. Other threads wishing to access the AioContext must use +aio_context_acquire()/aio_context_release() for mutual exclusion. Once the +context is acquired no other thread can access it or run event loop iterations +in this AioContext. + +aio_context_acquire()/aio_context_release() calls may be nested. This +means you can call them if you're not sure whether #2 applies. + +There is currently no lock ordering rule if a thread needs to acquire multiple +AioContexts simultaneously. Therefore, it is only safe for code holding the +QEMU global mutex to acquire other AioContexts. + +Side note: the best way to schedule a function call across threads is to call +aio_bh_schedule_oneshot(). No acquire/release or locking is needed. + +AioContext and the block layer +------------------------------ +The AioContext originates from the QEMU block layer, even though nowadays +AioContext is a generic event loop that can be used by any QEMU subsystem. + +The block layer has support for AioContext integrated. Each BlockDriverState +is associated with an AioContext using bdrv_set_aio_context() and +bdrv_get_aio_context(). This allows block layer code to process I/O inside the +right AioContext. Other subsystems may wish to follow a similar approach. + +Block layer code must therefore expect to run in an IOThread and avoid using +old APIs that implicitly use the main loop. See the "How to program for +IOThreads" above for information on how to do that. + +If main loop code such as a QMP function wishes to access a BlockDriverState +it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure +that callbacks in the IOThread do not run in parallel. + +Code running in the monitor typically needs to ensure that past +requests from the guest are completed. When a block device is running +in an IOThread, the IOThread can also process requests from the guest +(via ioeventfd). To achieve both objects, wrap the code between +bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained +section". The functions must be called between aio_context_acquire() +and aio_context_release(). You can freely release and re-acquire the +AioContext within a drained section. + +Long-running jobs (usually in the form of coroutines) are best scheduled in +the BlockDriverState's AioContext to avoid the need to acquire/release around +each bdrv_*() call. The functions bdrv_add/remove_aio_context_notifier, +or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends, +can be used to get a notification whenever bdrv_set_aio_context() moves a +BlockDriverState to a different AioContext. diff --git a/docs/devel/qapi-code-gen.txt b/docs/devel/qapi-code-gen.txt new file mode 100644 index 0000000000..52e3874efe --- /dev/null +++ b/docs/devel/qapi-code-gen.txt @@ -0,0 +1,1310 @@ += How to use the QAPI code generator = + +Copyright IBM Corp. 2011 +Copyright (C) 2012-2016 Red Hat, Inc. + +This work is licensed under the terms of the GNU GPL, version 2 or +later. See the COPYING file in the top-level directory. + +== Introduction == + +QAPI is a native C API within QEMU which provides management-level +functionality to internal and external users. For external +users/processes, this interface is made available by a JSON-based wire +format for the QEMU Monitor Protocol (QMP) for controlling qemu, as +well as the QEMU Guest Agent (QGA) for communicating with the guest. +The remainder of this document uses "Client JSON Protocol" when +referring to the wire contents of a QMP or QGA connection. + +To map Client JSON Protocol interfaces to the native C QAPI +implementations, a JSON-based schema is used to define types and +function signatures, and a set of scripts is used to generate types, +signatures, and marshaling/dispatch code. This document will describe +how the schemas, scripts, and resulting code are used. + + +== QMP/Guest agent schema == + +A QAPI schema file is designed to be loosely based on JSON +(http://www.ietf.org/rfc/rfc7159.txt) with changes for quoting style +and the use of comments; a QAPI schema file is then parsed by a python +code generation program. A valid QAPI schema consists of a series of +top-level expressions, with no commas between them. Where +dictionaries (JSON objects) are used, they are parsed as python +OrderedDicts so that ordering is preserved (for predictable layout of +generated C structs and parameter lists). Ordering doesn't matter +between top-level expressions or the keys within an expression, but +does matter within dictionary values for 'data' and 'returns' members +of a single expression. QAPI schema input is written using 'single +quotes' instead of JSON's "double quotes" (in contrast, Client JSON +Protocol uses no comments, and while input accepts 'single quotes' as +an extension, output is strict JSON using only "double quotes"). As +in JSON, trailing commas are not permitted in arrays or dictionaries. +Input must be ASCII (although QMP supports full Unicode strings, the +QAPI parser does not). At present, there is no place where a QAPI +schema requires the use of JSON numbers or null. + + +=== Comments === + +Comments are allowed; anything between an unquoted # and the following +newline is ignored. + +A multi-line comment that starts and ends with a '##' line is a +documentation comment. These are parsed by the documentation +generator, which recognizes certain markup detailed below. + + +==== Documentation markup ==== + +Comment text starting with '=' is a section title: + + # = Section title + +Double the '=' for a subsection title: + + # == Subection title + +'|' denotes examples: + + # | Text of the example, may span + # | multiple lines + +'*' starts an itemized list: + + # * First item, may span + # multiple lines + # * Second item + +You can also use '-' instead of '*'. + +A decimal number followed by '.' starts a numbered list: + + # 1. First item, may span + # multiple lines + # 2. Second item + +The actual number doesn't matter. You could even use '*' instead of +'2.' for the second item. + +Lists can't be nested. Blank lines are currently not supported within +lists. + +Additional whitespace between the initial '#' and the comment text is +permitted. + +*foo* and _foo_ are for strong and emphasis styles respectively (they +do not work over multiple lines). @foo is used to reference a name in +the schema. + +Example: + +## +# = Section +# == Subsection +# +# Some text foo with *strong* and _emphasis_ +# 1. with a list +# 2. like that +# +# And some code: +# | $ echo foo +# | -> do this +# | <- get that +# +## + + +==== Expression documentation ==== + +Each expression that isn't an include directive may be preceded by a +documentation block. Such blocks are called expression documentation +blocks. + +When documentation is required (see pragma 'doc-required'), expression +documentation blocks are mandatory. + +The documentation block consists of a first line naming the +expression, an optional overview, a description of each argument (for +commands and events) or member (for structs, unions and alternates), +and optional tagged sections. + +FIXME: the parser accepts these things in almost any order. + +Extensions added after the expression was first released carry a +'(since x.y.z)' comment. + +A tagged section starts with one of the following words: +"Note:"/"Notes:", "Since:", "Example"/"Examples", "Returns:", "TODO:". +The section ends with the start of a new section. + +A 'Since: x.y.z' tagged section lists the release that introduced the +expression. + +For example: + +## +# @BlockStats: +# +# Statistics of a virtual block device or a block backing device. +# +# @device: If the stats are for a virtual block device, the name +# corresponding to the virtual block device. +# +# @node-name: The node name of the device. (since 2.3) +# +# ... more members ... +# +# Since: 0.14.0 +## +{ 'struct': 'BlockStats', + 'data': {'*device': 'str', '*node-name': 'str', + ... more members ... } } + +## +# @query-blockstats: +# +# Query the @BlockStats for all virtual block devices. +# +# @query-nodes: If true, the command will query all the +# block nodes ... explain, explain ... (since 2.3) +# +# Returns: A list of @BlockStats for each virtual block devices. +# +# Since: 0.14.0 +# +# Example: +# +# -> { "execute": "query-blockstats" } +# <- { +# ... lots of output ... +# } +# +## +{ 'command': 'query-blockstats', + 'data': { '*query-nodes': 'bool' }, + 'returns': ['BlockStats'] } + +==== Free-form documentation ==== + +A documentation block that isn't an expression documentation block is +a free-form documentation block. These may be used to provide +additional text and structuring content. + + +=== Schema overview === + +The schema sets up a series of types, as well as commands and events +that will use those types. Forward references are allowed: the parser +scans in two passes, where the first pass learns all type names, and +the second validates the schema and generates the code. This allows +the definition of complex structs that can have mutually recursive +types, and allows for indefinite nesting of Client JSON Protocol that +satisfies the schema. A type name should not be defined more than +once. It is permissible for the schema to contain additional types +not used by any commands or events in the Client JSON Protocol, for +the side effect of generated C code used internally. + +There are eight top-level expressions recognized by the parser: +'include', 'pragma', 'command', 'struct', 'enum', 'union', +'alternate', and 'event'. There are several groups of types: simple +types (a number of built-in types, such as 'int' and 'str'; as well as +enumerations), complex types (structs and two flavors of unions), and +alternate types (a choice between other types). The 'command' and +'event' expressions can refer to existing types by name, or list an +anonymous type as a dictionary. Listing a type name inside an array +refers to a single-dimension array of that type; multi-dimension +arrays are not directly supported (although an array of a complex +struct that contains an array member is possible). + +All names must begin with a letter, and contain only ASCII letters, +digits, hyphen, and underscore. There are two exceptions: enum values +may start with a digit, and names that are downstream extensions (see +section Downstream extensions) start with underscore. + +Names beginning with 'q_' are reserved for the generator, which uses +them for munging QMP names that resemble C keywords or other +problematic strings. For example, a member named "default" in qapi +becomes "q_default" in the generated C code. + +Types, commands, and events share a common namespace. Therefore, +generally speaking, type definitions should always use CamelCase for +user-defined type names, while built-in types are lowercase. + +Type names ending with 'Kind' or 'List' are reserved for the +generator, which uses them for implicit union enums and array types, +respectively. + +Command names, and member names within a type, should be all lower +case with words separated by a hyphen. However, some existing older +commands and complex types use underscore; when extending such +expressions, consistency is preferred over blindly avoiding +underscore. + +Event names should be ALL_CAPS with words separated by underscore. + +Member names starting with 'has-' or 'has_' are reserved for the +generator, which uses them for tracking optional members. + +Any name (command, event, type, member, or enum value) beginning with +"x-" is marked experimental, and may be withdrawn or changed +incompatibly in a future release. + +Pragma 'name-case-whitelist' lets you violate the rules on use of +upper and lower case. Use for new code is strongly discouraged. + +In the rest of this document, usage lines are given for each +expression type, with literal strings written in lower case and +placeholders written in capitals. If a literal string includes a +prefix of '*', that key/value pair can be omitted from the expression. +For example, a usage statement that includes '*base':STRUCT-NAME +means that an expression has an optional key 'base', which if present +must have a value that forms a struct name. + + +=== Built-in Types === + +The following types are predefined, and map to C as follows: + + Schema C JSON + str char * any JSON string, UTF-8 + number double any JSON number + int int64_t a JSON number without fractional part + that fits into the C integer type + int8 int8_t likewise + int16 int16_t likewise + int32 int32_t likewise + int64 int64_t likewise + uint8 uint8_t likewise + uint16 uint16_t likewise + uint32 uint32_t likewise + uint64 uint64_t likewise + size uint64_t like uint64_t, except StringInputVisitor + accepts size suffixes + bool bool JSON true or false + any QObject * any JSON value + QType QType JSON string matching enum QType values + + +=== Include directives === + +Usage: { 'include': STRING } + +The QAPI schema definitions can be modularized using the 'include' directive: + + { 'include': 'path/to/file.json' } + +The directive is evaluated recursively, and include paths are relative to the +file using the directive. Multiple includes of the same file are +idempotent. No other keys should appear in the expression, and the include +value should be a string. + +As a matter of style, it is a good idea to have all files be +self-contained, but at the moment, nothing prevents an included file +from making a forward reference to a type that is only introduced by +an outer file. The parser may be made stricter in the future to +prevent incomplete include files. + + +=== Pragma directives === + +Usage: { 'pragma': DICT } + +The pragma directive lets you control optional generator behavior. +The dictionary's entries are pragma names and values. + +Pragma's scope is currently the complete schema. Setting the same +pragma to different values in parts of the schema doesn't work. + +Pragma 'doc-required' takes a boolean value. If true, documentation +is required. Default is false. + +Pragma 'returns-whitelist' takes a list of command names that may +violate the rules on permitted return types. Default is none. + +Pragma 'name-case-whitelist' takes a list of names that may violate +rules on use of upper- vs. lower-case letters. Default is none. + + +=== Struct types === + +Usage: { 'struct': STRING, 'data': DICT, '*base': STRUCT-NAME } + +A struct is a dictionary containing a single 'data' key whose value is +a dictionary; the dictionary may be empty. This corresponds to a +struct in C or an Object in JSON. Each value of the 'data' dictionary +must be the name of a type, or a one-element array containing a type +name. An example of a struct is: + + { 'struct': 'MyType', + 'data': { 'member1': 'str', 'member2': 'int', '*member3': 'str' } } + +The use of '*' as a prefix to the name means the member is optional in +the corresponding JSON protocol usage. + +The default initialization value of an optional argument should not be changed +between versions of QEMU unless the new default maintains backward +compatibility to the user-visible behavior of the old default. + +With proper documentation, this policy still allows some flexibility; for +example, documenting that a default of 0 picks an optimal buffer size allows +one release to declare the optimal size at 512 while another release declares +the optimal size at 4096 - the user-visible behavior is not the bytes used by +the buffer, but the fact that the buffer was optimal size. + +On input structures (only mentioned in the 'data' side of a command), changing +from mandatory to optional is safe (older clients will supply the option, and +newer clients can benefit from the default); changing from optional to +mandatory is backwards incompatible (older clients may be omitting the option, +and must continue to work). + +On output structures (only mentioned in the 'returns' side of a command), +changing from mandatory to optional is in general unsafe (older clients may be +expecting the member, and could crash if it is missing), although it +can be done if the only way that the optional argument will be omitted +is when it is triggered by the presence of a new input flag to the +command that older clients don't know to send. Changing from optional +to mandatory is safe. + +A structure that is used in both input and output of various commands +must consider the backwards compatibility constraints of both directions +of use. + +A struct definition can specify another struct as its base. +In this case, the members of the base type are included as top-level members +of the new struct's dictionary in the Client JSON Protocol wire +format. An example definition is: + + { 'struct': 'BlockdevOptionsGenericFormat', 'data': { 'file': 'str' } } + { 'struct': 'BlockdevOptionsGenericCOWFormat', + 'base': 'BlockdevOptionsGenericFormat', + 'data': { '*backing': 'str' } } + +An example BlockdevOptionsGenericCOWFormat object on the wire could use +both members like this: + + { "file": "/some/place/my-image", + "backing": "/some/place/my-backing-file" } + + +=== Enumeration types === + +Usage: { 'enum': STRING, 'data': ARRAY-OF-STRING } + { 'enum': STRING, '*prefix': STRING, 'data': ARRAY-OF-STRING } + +An enumeration type is a dictionary containing a single 'data' key +whose value is a list of strings. An example enumeration is: + + { 'enum': 'MyEnum', 'data': [ 'value1', 'value2', 'value3' ] } + +Nothing prevents an empty enumeration, although it is probably not +useful. The list of strings should be lower case; if an enum name +represents multiple words, use '-' between words. The string 'max' is +not allowed as an enum value, and values should not be repeated. + +The enum constants will be named by using a heuristic to turn the +type name into a set of underscore separated words. For the example +above, 'MyEnum' will turn into 'MY_ENUM' giving a constant name +of 'MY_ENUM_VALUE1' for the first value. If the default heuristic +does not result in a desirable name, the optional 'prefix' member +can be used when defining the enum. + +The enumeration values are passed as strings over the Client JSON +Protocol, but are encoded as C enum integral values in generated code. +While the C code starts numbering at 0, it is better to use explicit +comparisons to enum values than implicit comparisons to 0; the C code +will also include a generated enum member ending in _MAX for tracking +the size of the enum, useful when using common functions for +converting between strings and enum values. Since the wire format +always passes by name, it is acceptable to reorder or add new +enumeration members in any location without breaking clients of Client +JSON Protocol; however, removing enum values would break +compatibility. For any struct that has a member that will only contain +a finite set of string values, using an enum type for that member is +better than open-coding the member to be type 'str'. + + +=== Union types === + +Usage: { 'union': STRING, 'data': DICT } +or: { 'union': STRING, 'data': DICT, 'base': STRUCT-NAME-OR-DICT, + 'discriminator': ENUM-MEMBER-OF-BASE } + +Union types are used to let the user choose between several different +variants for an object. There are two flavors: simple (no +discriminator or base), and flat (both discriminator and base). A union +type is defined using a data dictionary as explained in the following +paragraphs. The data dictionary for either type of union must not +be empty. + +A simple union type defines a mapping from automatic discriminator +values to data types like in this example: + + { 'struct': 'BlockdevOptionsFile', 'data': { 'filename': 'str' } } + { 'struct': 'BlockdevOptionsQcow2', + 'data': { 'backing': 'str', '*lazy-refcounts': 'bool' } } + + { 'union': 'BlockdevOptionsSimple', + 'data': { 'file': 'BlockdevOptionsFile', + 'qcow2': 'BlockdevOptionsQcow2' } } + +In the Client JSON Protocol, a simple union is represented by a +dictionary that contains the 'type' member as a discriminator, and a +'data' member that is of the specified data type corresponding to the +discriminator value, as in these examples: + + { "type": "file", "data": { "filename": "/some/place/my-image" } } + { "type": "qcow2", "data": { "backing": "/some/place/my-image", + "lazy-refcounts": true } } + +The generated C code uses a struct containing a union. Additionally, +an implicit C enum 'NameKind' is created, corresponding to the union +'Name', for accessing the various branches of the union. No branch of +the union can be named 'max', as this would collide with the implicit +enum. The value for each branch can be of any type. + +A flat union definition avoids nesting on the wire, and specifies a +set of common members that occur in all variants of the union. The +'base' key must specify either a type name (the type must be a +struct, not a union), or a dictionary representing an anonymous type. +All branches of the union must be complex types, and the top-level +members of the union dictionary on the wire will be combination of +members from both the base type and the appropriate branch type (when +merging two dictionaries, there must be no keys in common). The +'discriminator' member must be the name of a non-optional enum-typed +member of the base struct. + +The following example enhances the above simple union example by +adding an optional common member 'read-only', renaming the +discriminator to something more applicable than the simple union's +default of 'type', and reducing the number of {} required on the wire: + + { 'enum': 'BlockdevDriver', 'data': [ 'file', 'qcow2' ] } + { 'union': 'BlockdevOptions', + 'base': { 'driver': 'BlockdevDriver', '*read-only': 'bool' }, + 'discriminator': 'driver', + 'data': { 'file': 'BlockdevOptionsFile', + 'qcow2': 'BlockdevOptionsQcow2' } } + +Resulting in these JSON objects: + + { "driver": "file", "read-only": true, + "filename": "/some/place/my-image" } + { "driver": "qcow2", "read-only": false, + "backing": "/some/place/my-image", "lazy-refcounts": true } + +Notice that in a flat union, the discriminator name is controlled by +the user, but because it must map to a base member with enum type, the +code generator can ensure that branches exist for all values of the +enum (although the order of the keys need not match the declaration of +the enum). In the resulting generated C data types, a flat union is +represented as a struct with the base members included directly, and +then a union of structures for each branch of the struct. + +A simple union can always be re-written as a flat union where the base +class has a single member named 'type', and where each branch of the +union has a struct with a single member named 'data'. That is, + + { 'union': 'Simple', 'data': { 'one': 'str', 'two': 'int' } } + +is identical on the wire to: + + { 'enum': 'Enum', 'data': ['one', 'two'] } + { 'struct': 'Branch1', 'data': { 'data': 'str' } } + { 'struct': 'Branch2', 'data': { 'data': 'int' } } + { 'union': 'Flat': 'base': { 'type': 'Enum' }, 'discriminator': 'type', + 'data': { 'one': 'Branch1', 'two': 'Branch2' } } + + +=== Alternate types === + +Usage: { 'alternate': STRING, 'data': DICT } + +An alternate type is one that allows a choice between two or more JSON +data types (string, integer, number, or object, but currently not +array) on the wire. The definition is similar to a simple union type, +where each branch of the union names a QAPI type. For example: + + { 'alternate': 'BlockdevRef', + 'data': { 'definition': 'BlockdevOptions', + 'reference': 'str' } } + +Unlike a union, the discriminator string is never passed on the wire +for the Client JSON Protocol. Instead, the value's JSON type serves +as an implicit discriminator, which in turn means that an alternate +can only express a choice between types represented differently in +JSON. If a branch is typed as the 'bool' built-in, the alternate +accepts true and false; if it is typed as any of the various numeric +built-ins, it accepts a JSON number; if it is typed as a 'str' +built-in or named enum type, it accepts a JSON string; and if it is +typed as a complex type (struct or union), it accepts a JSON object. +Two different complex types, for instance, aren't permitted, because +both are represented as a JSON object. + +The example alternate declaration above allows using both of the +following example objects: + + { "file": "my_existing_block_device_id" } + { "file": { "driver": "file", + "read-only": false, + "filename": "/tmp/mydisk.qcow2" } } + + +=== Commands === + +Usage: { 'command': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT, + '*returns': TYPE-NAME, '*boxed': true, + '*gen': false, '*success-response': false } + +Commands are defined by using a dictionary containing several members, +where three members are most common. The 'command' member is a +mandatory string, and determines the "execute" value passed in a +Client JSON Protocol command exchange. + +The 'data' argument maps to the "arguments" dictionary passed in as +part of a Client JSON Protocol command. The 'data' member is optional +and defaults to {} (an empty dictionary). If present, it must be the +string name of a complex type, or a dictionary that declares an +anonymous type with the same semantics as a 'struct' expression. + +The 'returns' member describes what will appear in the "return" member +of a Client JSON Protocol reply on successful completion of a command. +The member is optional from the command declaration; if absent, the +"return" member will be an empty dictionary. If 'returns' is present, +it must be the string name of a complex or built-in type, a +one-element array containing the name of a complex or built-in type. +To return anything else, you have to list the command in pragma +'returns-whitelist'. If you do this, the command cannot be extended +to return additional information in the future. Use of +'returns-whitelist' for new commands is strongly discouraged. + +All commands in Client JSON Protocol use a dictionary to report +failure, with no way to specify that in QAPI. Where the error return +is different than the usual GenericError class in order to help the +client react differently to certain error conditions, it is worth +documenting this in the comments before the command declaration. + +Some example commands: + + { 'command': 'my-first-command', + 'data': { 'arg1': 'str', '*arg2': 'str' } } + { 'struct': 'MyType', 'data': { '*value': 'str' } } + { 'command': 'my-second-command', + 'returns': [ 'MyType' ] } + +which would validate this Client JSON Protocol transaction: + + => { "execute": "my-first-command", + "arguments": { "arg1": "hello" } } + <= { "return": { } } + => { "execute": "my-second-command" } + <= { "return": [ { "value": "one" }, { } ] } + +The generator emits a prototype for the user's function implementing +the command. Normally, 'data' is a dictionary for an anonymous type, +or names a struct type (possibly empty, but not a union), and its +members are passed as separate arguments to this function. If the +command definition includes a key 'boxed' with the boolean value true, +then 'data' is instead the name of any non-empty complex type +(struct, union, or alternate), and a pointer to that QAPI type is +passed as a single argument. + +The generator also emits a marshalling function that extracts +arguments for the user's function out of an input QDict, calls the +user's function, and if it succeeded, builds an output QObject from +its return value. + +In rare cases, QAPI cannot express a type-safe representation of a +corresponding Client JSON Protocol command. You then have to suppress +generation of a marshalling function by including a key 'gen' with +boolean value false, and instead write your own function. Please try +to avoid adding new commands that rely on this, and instead use +type-safe unions. For an example of this usage: + + { 'command': 'netdev_add', + 'data': {'type': 'str', 'id': 'str'}, + 'gen': false } + +Normally, the QAPI schema is used to describe synchronous exchanges, +where a response is expected. But in some cases, the action of a +command is expected to change state in a way that a successful +response is not possible (although the command will still return a +normal dictionary error on failure). When a successful reply is not +possible, the command expression should include the optional key +'success-response' with boolean value false. So far, only QGA makes +use of this member. + + +=== Events === + +Usage: { 'event': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT, + '*boxed': true } + +Events are defined with the keyword 'event'. It is not allowed to +name an event 'MAX', since the generator also produces a C enumeration +of all event names with a generated _MAX value at the end. When +'data' is also specified, additional info will be included in the +event, with similar semantics to a 'struct' expression. Finally there +will be C API generated in qapi-event.h; when called by QEMU code, a +message with timestamp will be emitted on the wire. + +An example event is: + +{ 'event': 'EVENT_C', + 'data': { '*a': 'int', 'b': 'str' } } + +Resulting in this JSON object: + +{ "event": "EVENT_C", + "data": { "b": "test string" }, + "timestamp": { "seconds": 1267020223, "microseconds": 435656 } } + +The generator emits a function to send the event. Normally, 'data' is +a dictionary for an anonymous type, or names a struct type (possibly +empty, but not a union), and its members are passed as separate +arguments to this function. If the event definition includes a key +'boxed' with the boolean value true, then 'data' is instead the name of +any non-empty complex type (struct, union, or alternate), and a +pointer to that QAPI type is passed as a single argument. + + +=== Downstream extensions === + +QAPI schema names that are externally visible, say in the Client JSON +Protocol, need to be managed with care. Names starting with a +downstream prefix of the form __RFQDN_ are reserved for the downstream +who controls the valid, reverse fully qualified domain name RFQDN. +RFQDN may only contain ASCII letters, digits, hyphen and period. + +Example: Red Hat, Inc. controls redhat.com, and may therefore add a +downstream command __com.redhat_drive-mirror. + + +== Client JSON Protocol introspection == + +Clients of a Client JSON Protocol commonly need to figure out what +exactly the server (QEMU) supports. + +For this purpose, QMP provides introspection via command +query-qmp-schema. QGA currently doesn't support introspection. + +While Client JSON Protocol wire compatibility should be maintained +between qemu versions, we cannot make the same guarantees for +introspection stability. For example, one version of qemu may provide +a non-variant optional member of a struct, and a later version rework +the member to instead be non-optional and associated with a variant. +Likewise, one version of qemu may list a member with open-ended type +'str', and a later version could convert it to a finite set of strings +via an enum type; or a member may be converted from a specific type to +an alternate that represents a choice between the original type and +something else. + +query-qmp-schema returns a JSON array of SchemaInfo objects. These +objects together describe the wire ABI, as defined in the QAPI schema. +There is no specified order to the SchemaInfo objects returned; a +client must search for a particular name throughout the entire array +to learn more about that name, but is at least guaranteed that there +will be no collisions between type, command, and event names. + +However, the SchemaInfo can't reflect all the rules and restrictions +that apply to QMP. It's interface introspection (figuring out what's +there), not interface specification. The specification is in the QAPI +schema. To understand how QMP is to be used, you need to study the +QAPI schema. + +Like any other command, query-qmp-schema is itself defined in the QAPI +schema, along with the SchemaInfo type. This text attempts to give an +overview how things work. For details you need to consult the QAPI +schema. + +SchemaInfo objects have common members "name" and "meta-type", and +additional variant members depending on the value of meta-type. + +Each SchemaInfo object describes a wire ABI entity of a certain +meta-type: a command, event or one of several kinds of type. + +SchemaInfo for commands and events have the same name as in the QAPI +schema. + +Command and event names are part of the wire ABI, but type names are +not. Therefore, the SchemaInfo for types have auto-generated +meaningless names. For readability, the examples in this section use +meaningful type names instead. + +To examine a type, start with a command or event using it, then follow +references by name. + +QAPI schema definitions not reachable that way are omitted. + +The SchemaInfo for a command has meta-type "command", and variant +members "arg-type" and "ret-type". On the wire, the "arguments" +member of a client's "execute" command must conform to the object type +named by "arg-type". The "return" member that the server passes in a +success response conforms to the type named by "ret-type". + +If the command takes no arguments, "arg-type" names an object type +without members. Likewise, if the command returns nothing, "ret-type" +names an object type without members. + +Example: the SchemaInfo for command query-qmp-schema + + { "name": "query-qmp-schema", "meta-type": "command", + "arg-type": "q_empty", "ret-type": "SchemaInfoList" } + + Type "q_empty" is an automatic object type without members, and type + "SchemaInfoList" is the array of SchemaInfo type. + +The SchemaInfo for an event has meta-type "event", and variant member +"arg-type". On the wire, a "data" member that the server passes in an +event conforms to the object type named by "arg-type". + +If the event carries no additional information, "arg-type" names an +object type without members. The event may not have a data member on +the wire then. + +Each command or event defined with dictionary-valued 'data' in the +QAPI schema implicitly defines an object type. + +Example: the SchemaInfo for EVENT_C from section Events + + { "name": "EVENT_C", "meta-type": "event", + "arg-type": "q_obj-EVENT_C-arg" } + + Type "q_obj-EVENT_C-arg" is an implicitly defined object type with + the two members from the event's definition. + +The SchemaInfo for struct and union types has meta-type "object". + +The SchemaInfo for a struct type has variant member "members". + +The SchemaInfo for a union type additionally has variant members "tag" +and "variants". + +"members" is a JSON array describing the object's common members, if +any. Each element is a JSON object with members "name" (the member's +name), "type" (the name of its type), and optionally "default". The +member is optional if "default" is present. Currently, "default" can +only have value null. Other values are reserved for future +extensions. The "members" array is in no particular order; clients +must search the entire object when learning whether a particular +member is supported. + +Example: the SchemaInfo for MyType from section Struct types + + { "name": "MyType", "meta-type": "object", + "members": [ + { "name": "member1", "type": "str" }, + { "name": "member2", "type": "int" }, + { "name": "member3", "type": "str", "default": null } ] } + +"tag" is the name of the common member serving as type tag. +"variants" is a JSON array describing the object's variant members. +Each element is a JSON object with members "case" (the value of type +tag this element applies to) and "type" (the name of an object type +that provides the variant members for this type tag value). The +"variants" array is in no particular order, and is not guaranteed to +list cases in the same order as the corresponding "tag" enum type. + +Example: the SchemaInfo for flat union BlockdevOptions from section +Union types + + { "name": "BlockdevOptions", "meta-type": "object", + "members": [ + { "name": "driver", "type": "BlockdevDriver" }, + { "name": "read-only", "type": "bool", "default": null } ], + "tag": "driver", + "variants": [ + { "case": "file", "type": "BlockdevOptionsFile" }, + { "case": "qcow2", "type": "BlockdevOptionsQcow2" } ] } + +Note that base types are "flattened": its members are included in the +"members" array. + +A simple union implicitly defines an enumeration type for its implicit +discriminator (called "type" on the wire, see section Union types). + +A simple union implicitly defines an object type for each of its +variants. + +Example: the SchemaInfo for simple union BlockdevOptionsSimple from section +Union types + + { "name": "BlockdevOptionsSimple", "meta-type": "object", + "members": [ + { "name": "type", "type": "BlockdevOptionsSimpleKind" } ], + "tag": "type", + "variants": [ + { "case": "file", "type": "q_obj-BlockdevOptionsFile-wrapper" }, + { "case": "qcow2", "type": "q_obj-BlockdevOptionsQcow2-wrapper" } ] } + + Enumeration type "BlockdevOptionsSimpleKind" and the object types + "q_obj-BlockdevOptionsFile-wrapper", "q_obj-BlockdevOptionsQcow2-wrapper" + are implicitly defined. + +The SchemaInfo for an alternate type has meta-type "alternate", and +variant member "members". "members" is a JSON array. Each element is +a JSON object with member "type", which names a type. Values of the +alternate type conform to exactly one of its member types. There is +no guarantee on the order in which "members" will be listed. + +Example: the SchemaInfo for BlockdevRef from section Alternate types + + { "name": "BlockdevRef", "meta-type": "alternate", + "members": [ + { "type": "BlockdevOptions" }, + { "type": "str" } ] } + +The SchemaInfo for an array type has meta-type "array", and variant +member "element-type", which names the array's element type. Array +types are implicitly defined. For convenience, the array's name may +resemble the element type; however, clients should examine member +"element-type" instead of making assumptions based on parsing member +"name". + +Example: the SchemaInfo for ['str'] + + { "name": "[str]", "meta-type": "array", + "element-type": "str" } + +The SchemaInfo for an enumeration type has meta-type "enum" and +variant member "values". The values are listed in no particular +order; clients must search the entire enum when learning whether a +particular value is supported. + +Example: the SchemaInfo for MyEnum from section Enumeration types + + { "name": "MyEnum", "meta-type": "enum", + "values": [ "value1", "value2", "value3" ] } + +The SchemaInfo for a built-in type has the same name as the type in +the QAPI schema (see section Built-in Types), with one exception +detailed below. It has variant member "json-type" that shows how +values of this type are encoded on the wire. + +Example: the SchemaInfo for str + + { "name": "str", "meta-type": "builtin", "json-type": "string" } + +The QAPI schema supports a number of integer types that only differ in +how they map to C. They are identical as far as SchemaInfo is +concerned. Therefore, they get all mapped to a single type "int" in +SchemaInfo. + +As explained above, type names are not part of the wire ABI. Not even +the names of built-in types. Clients should examine member +"json-type" instead of hard-coding names of built-in types. + + +== Code generation == + +Schemas are fed into five scripts to generate all the code/files that, +paired with the core QAPI libraries, comprise everything required to +take JSON commands read in by a Client JSON Protocol server, unmarshal +the arguments into the underlying C types, call into the corresponding +C function, map the response back to a Client JSON Protocol response +to be returned to the user, and introspect the commands. + +As an example, we'll use the following schema, which describes a +single complex user-defined type, along with command which takes a +list of that type as a parameter, and returns a single element of that +type. The user is responsible for writing the implementation of +qmp_my_command(); everything else is produced by the generator. + + $ cat example-schema.json + { 'struct': 'UserDefOne', + 'data': { 'integer': 'int', '*string': 'str' } } + + { 'command': 'my-command', + 'data': { 'arg1': ['UserDefOne'] }, + 'returns': 'UserDefOne' } + + { 'event': 'MY_EVENT' } + +For a more thorough look at generated code, the testsuite includes +tests/qapi-schema/qapi-schema-tests.json that covers more examples of +what the generator will accept, and compiles the resulting C code as +part of 'make check-unit'. + +=== scripts/qapi-types.py === + +Used to generate the C types defined by a schema, along with +supporting code. The following files are created: + +$(prefix)qapi-types.h - C types corresponding to types defined in + the schema you pass in +$(prefix)qapi-types.c - Cleanup functions for the above C types + +The $(prefix) is an optional parameter used as a namespace to keep the +generated code from one schema/code-generation separated from others so code +can be generated/used from multiple schemas without clobbering previously +created code. + +Example: + + $ python scripts/qapi-types.py --output-dir="qapi-generated" \ + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qapi-types.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QAPI_TYPES_H + #define EXAMPLE_QAPI_TYPES_H + +[Built-in types omitted...] + + typedef struct UserDefOne UserDefOne; + + typedef struct UserDefOneList UserDefOneList; + + struct UserDefOne { + int64_t integer; + bool has_string; + char *string; + }; + + void qapi_free_UserDefOne(UserDefOne *obj); + + struct UserDefOneList { + UserDefOneList *next; + UserDefOne *value; + }; + + void qapi_free_UserDefOneList(UserDefOneList *obj); + + #endif + $ cat qapi-generated/example-qapi-types.c +[Uninteresting stuff omitted...] + + void qapi_free_UserDefOne(UserDefOne *obj) + { + Visitor *v; + + if (!obj) { + return; + } + + v = qapi_dealloc_visitor_new(); + visit_type_UserDefOne(v, NULL, &obj, NULL); + visit_free(v); + } + + void qapi_free_UserDefOneList(UserDefOneList *obj) + { + Visitor *v; + + if (!obj) { + return; + } + + v = qapi_dealloc_visitor_new(); + visit_type_UserDefOneList(v, NULL, &obj, NULL); + visit_free(v); + } + +=== scripts/qapi-visit.py === + +Used to generate the visitor functions used to walk through and +convert between a native QAPI C data structure and some other format +(such as QObject); the generated functions are named visit_type_FOO() +and visit_type_FOO_members(). + +The following files are generated: + +$(prefix)qapi-visit.c: visitor function for a particular C type, used + to automagically convert QObjects into the + corresponding C type and vice-versa, as well + as for deallocating memory for an existing C + type + +$(prefix)qapi-visit.h: declarations for previously mentioned visitor + functions + +Example: + + $ python scripts/qapi-visit.py --output-dir="qapi-generated" + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qapi-visit.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QAPI_VISIT_H + #define EXAMPLE_QAPI_VISIT_H + +[Visitors for built-in types omitted...] + + void visit_type_UserDefOne_members(Visitor *v, UserDefOne *obj, Error **errp); + void visit_type_UserDefOne(Visitor *v, const char *name, UserDefOne **obj, Error **errp); + void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp); + + #endif + $ cat qapi-generated/example-qapi-visit.c +[Uninteresting stuff omitted...] + + void visit_type_UserDefOne_members(Visitor *v, UserDefOne *obj, Error **errp) + { + Error *err = NULL; + + visit_type_int(v, "integer", &obj->integer, &err); + if (err) { + goto out; + } + if (visit_optional(v, "string", &obj->has_string)) { + visit_type_str(v, "string", &obj->string, &err); + if (err) { + goto out; + } + } + + out: + error_propagate(errp, err); + } + + void visit_type_UserDefOne(Visitor *v, const char *name, UserDefOne **obj, Error **errp) + { + Error *err = NULL; + + visit_start_struct(v, name, (void **)obj, sizeof(UserDefOne), &err); + if (err) { + goto out; + } + if (!*obj) { + goto out_obj; + } + visit_type_UserDefOne_members(v, *obj, &err); + if (err) { + goto out_obj; + } + visit_check_struct(v, &err); + out_obj: + visit_end_struct(v, (void **)obj); + if (err && visit_is_input(v)) { + qapi_free_UserDefOne(*obj); + *obj = NULL; + } + out: + error_propagate(errp, err); + } + + void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp) + { + Error *err = NULL; + UserDefOneList *tail; + size_t size = sizeof(**obj); + + visit_start_list(v, name, (GenericList **)obj, size, &err); + if (err) { + goto out; + } + + for (tail = *obj; tail; + tail = (UserDefOneList *)visit_next_list(v, (GenericList *)tail, size)) { + visit_type_UserDefOne(v, NULL, &tail->value, &err); + if (err) { + break; + } + } + + visit_end_list(v, (void **)obj); + if (err && visit_is_input(v)) { + qapi_free_UserDefOneList(*obj); + *obj = NULL; + } + out: + error_propagate(errp, err); + } + +=== scripts/qapi-commands.py === + +Used to generate the marshaling/dispatch functions for the commands +defined in the schema. The generated code implements +qmp_marshal_COMMAND() (registered automatically), and declares +qmp_COMMAND() that the user must implement. The following files are +generated: + +$(prefix)qmp-marshal.c: command marshal/dispatch functions for each + QMP command defined in the schema. Functions + generated by qapi-visit.py are used to + convert QObjects received from the wire into + function parameters, and uses the same + visitor functions to convert native C return + values to QObjects from transmission back + over the wire. + +$(prefix)qmp-commands.h: Function prototypes for the QMP commands + specified in the schema. + +Example: + + $ python scripts/qapi-commands.py --output-dir="qapi-generated" + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qmp-commands.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QMP_COMMANDS_H + #define EXAMPLE_QMP_COMMANDS_H + + #include "example-qapi-types.h" + #include "qapi/qmp/qdict.h" + #include "qapi/error.h" + + UserDefOne *qmp_my_command(UserDefOneList *arg1, Error **errp); + + #endif + $ cat qapi-generated/example-qmp-marshal.c +[Uninteresting stuff omitted...] + + static void qmp_marshal_output_UserDefOne(UserDefOne *ret_in, QObject **ret_out, Error **errp) + { + Error *err = NULL; + Visitor *v; + + v = qobject_output_visitor_new(ret_out); + visit_type_UserDefOne(v, "unused", &ret_in, &err); + if (!err) { + visit_complete(v, ret_out); + } + error_propagate(errp, err); + visit_free(v); + v = qapi_dealloc_visitor_new(); + visit_type_UserDefOne(v, "unused", &ret_in, NULL); + visit_free(v); + } + + static void qmp_marshal_my_command(QDict *args, QObject **ret, Error **errp) + { + Error *err = NULL; + UserDefOne *retval; + Visitor *v; + UserDefOneList *arg1 = NULL; + + v = qobject_input_visitor_new(QOBJECT(args)); + visit_start_struct(v, NULL, NULL, 0, &err); + if (err) { + goto out; + } + visit_type_UserDefOneList(v, "arg1", &arg1, &err); + if (!err) { + visit_check_struct(v, &err); + } + visit_end_struct(v, NULL); + if (err) { + goto out; + } + + retval = qmp_my_command(arg1, &err); + if (err) { + goto out; + } + + qmp_marshal_output_UserDefOne(retval, ret, &err); + + out: + error_propagate(errp, err); + visit_free(v); + v = qapi_dealloc_visitor_new(); + visit_start_struct(v, NULL, NULL, 0, NULL); + visit_type_UserDefOneList(v, "arg1", &arg1, NULL); + visit_end_struct(v, NULL); + visit_free(v); + } + + static void qmp_init_marshal(void) + { + qmp_register_command("my-command", qmp_marshal_my_command, QCO_NO_OPTIONS); + } + + qapi_init(qmp_init_marshal); + +=== scripts/qapi-event.py === + +Used to generate the event-related C code defined by a schema, with +implementations for qapi_event_send_FOO(). The following files are +created: + +$(prefix)qapi-event.h - Function prototypes for each event type, plus an + enumeration of all event names +$(prefix)qapi-event.c - Implementation of functions to send an event + +Example: + + $ python scripts/qapi-event.py --output-dir="qapi-generated" + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qapi-event.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QAPI_EVENT_H + #define EXAMPLE_QAPI_EVENT_H + + #include "qapi/error.h" + #include "qapi/qmp/qdict.h" + #include "example-qapi-types.h" + + + void qapi_event_send_my_event(Error **errp); + + typedef enum example_QAPIEvent { + EXAMPLE_QAPI_EVENT_MY_EVENT = 0, + EXAMPLE_QAPI_EVENT__MAX = 1, + } example_QAPIEvent; + + extern const char *const example_QAPIEvent_lookup[]; + + #endif + $ cat qapi-generated/example-qapi-event.c +[Uninteresting stuff omitted...] + + void qapi_event_send_my_event(Error **errp) + { + QDict *qmp; + Error *err = NULL; + QMPEventFuncEmit emit; + emit = qmp_event_get_func_emit(); + if (!emit) { + return; + } + + qmp = qmp_event_build_dict("MY_EVENT"); + + emit(EXAMPLE_QAPI_EVENT_MY_EVENT, qmp, &err); + + error_propagate(errp, err); + QDECREF(qmp); + } + + const char *const example_QAPIEvent_lookup[] = { + [EXAMPLE_QAPI_EVENT_MY_EVENT] = "MY_EVENT", + [EXAMPLE_QAPI_EVENT__MAX] = NULL, + }; + +=== scripts/qapi-introspect.py === + +Used to generate the introspection C code for a schema. The following +files are created: + +$(prefix)qmp-introspect.c - Defines a string holding a JSON + description of the schema. +$(prefix)qmp-introspect.h - Declares the above string. + +Example: + + $ python scripts/qapi-introspect.py --output-dir="qapi-generated" + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qmp-introspect.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QMP_INTROSPECT_H + #define EXAMPLE_QMP_INTROSPECT_H + + extern const char example_qmp_schema_json[]; + + #endif + $ cat qapi-generated/example-qmp-introspect.c +[Uninteresting stuff omitted...] + + const char example_qmp_schema_json[] = "[" + "{\"arg-type\": \"0\", \"meta-type\": \"event\", \"name\": \"MY_EVENT\"}, " + "{\"arg-type\": \"1\", \"meta-type\": \"command\", \"name\": \"my-command\", \"ret-type\": \"2\"}, " + "{\"members\": [], \"meta-type\": \"object\", \"name\": \"0\"}, " + "{\"members\": [{\"name\": \"arg1\", \"type\": \"[2]\"}], \"meta-type\": \"object\", \"name\": \"1\"}, " + "{\"members\": [{\"name\": \"integer\", \"type\": \"int\"}, {\"default\": null, \"name\": \"string\", \"type\": \"str\"}], \"meta-type\": \"object\", \"name\": \"2\"}, " + "{\"element-type\": \"2\", \"meta-type\": \"array\", \"name\": \"[2]\"}, " + "{\"json-type\": \"int\", \"meta-type\": \"builtin\", \"name\": \"int\"}, " + "{\"json-type\": \"string\", \"meta-type\": \"builtin\", \"name\": \"str\"}]"; diff --git a/docs/devel/rcu.txt b/docs/devel/rcu.txt new file mode 100644 index 0000000000..c84e7f42b2 --- /dev/null +++ b/docs/devel/rcu.txt @@ -0,0 +1,390 @@ +Using RCU (Read-Copy-Update) for synchronization +================================================ + +Read-copy update (RCU) is a synchronization mechanism that is used to +protect read-mostly data structures. RCU is very efficient and scalable +on the read side (it is wait-free), and thus can make the read paths +extremely fast. + +RCU supports concurrency between a single writer and multiple readers, +thus it is not used alone. Typically, the write-side will use a lock to +serialize multiple updates, but other approaches are possible (e.g., +restricting updates to a single task). In QEMU, when a lock is used, +this will often be the "iothread mutex", also known as the "big QEMU +lock" (BQL). Also, restricting updates to a single task is done in +QEMU using the "bottom half" API. + +RCU is fundamentally a "wait-to-finish" mechanism. The read side marks +sections of code with "critical sections", and the update side will wait +for the execution of all *currently running* critical sections before +proceeding, or before asynchronously executing a callback. + +The key point here is that only the currently running critical sections +are waited for; critical sections that are started _after_ the beginning +of the wait do not extend the wait, despite running concurrently with +the updater. This is the reason why RCU is more scalable than, +for example, reader-writer locks. It is so much more scalable that +the system will have a single instance of the RCU mechanism; a single +mechanism can be used for an arbitrary number of "things", without +having to worry about things such as contention or deadlocks. + +How is this possible? The basic idea is to split updates in two phases, +"removal" and "reclamation". During removal, we ensure that subsequent +readers will not be able to get a reference to the old data. After +removal has completed, a critical section will not be able to access +the old data. Therefore, critical sections that begin after removal +do not matter; as soon as all previous critical sections have finished, +there cannot be any readers who hold references to the data structure, +and these can now be safely reclaimed (e.g., freed or unref'ed). + +Here is a picture: + + thread 1 thread 2 thread 3 + ------------------- ------------------------ ------------------- + enter RCU crit.sec. + | finish removal phase + | begin wait + | | enter RCU crit.sec. + exit RCU crit.sec | | + complete wait | + begin reclamation phase | + exit RCU crit.sec. + + +Note how thread 3 is still executing its critical section when thread 2 +starts reclaiming data. This is possible, because the old version of the +data structure was not accessible at the time thread 3 began executing +that critical section. + + +RCU API +======= + +The core RCU API is small: + + void rcu_read_lock(void); + + Used by a reader to inform the reclaimer that the reader is + entering an RCU read-side critical section. + + void rcu_read_unlock(void); + + Used by a reader to inform the reclaimer that the reader is + exiting an RCU read-side critical section. Note that RCU + read-side critical sections may be nested and/or overlapping. + + void synchronize_rcu(void); + + Blocks until all pre-existing RCU read-side critical sections + on all threads have completed. This marks the end of the removal + phase and the beginning of reclamation phase. + + Note that it would be valid for another update to come while + synchronize_rcu is running. Because of this, it is better that + the updater releases any locks it may hold before calling + synchronize_rcu. If this is not possible (for example, because + the updater is protected by the BQL), you can use call_rcu. + + void call_rcu1(struct rcu_head * head, + void (*func)(struct rcu_head *head)); + + This function invokes func(head) after all pre-existing RCU + read-side critical sections on all threads have completed. This + marks the end of the removal phase, with func taking care + asynchronously of the reclamation phase. + + The foo struct needs to have an rcu_head structure added, + perhaps as follows: + + struct foo { + struct rcu_head rcu; + int a; + char b; + long c; + }; + + so that the reclaimer function can fetch the struct foo address + and free it: + + call_rcu1(&foo.rcu, foo_reclaim); + + void foo_reclaim(struct rcu_head *rp) + { + struct foo *fp = container_of(rp, struct foo, rcu); + g_free(fp); + } + + For the common case where the rcu_head member is the first of the + struct, you can use the following macro. + + void call_rcu(T *p, + void (*func)(T *p), + field-name); + void g_free_rcu(T *p, + field-name); + + call_rcu1 is typically used through these macro, in the common case + where the "struct rcu_head" is the first field in the struct. If + the callback function is g_free, in particular, g_free_rcu can be + used. In the above case, one could have written simply: + + g_free_rcu(&foo, rcu); + + typeof(*p) atomic_rcu_read(p); + + atomic_rcu_read() is similar to atomic_mb_read(), but it makes + some assumptions on the code that calls it. This allows a more + optimized implementation. + + atomic_rcu_read assumes that whenever a single RCU critical + section reads multiple shared data, these reads are either + data-dependent or need no ordering. This is almost always the + case when using RCU, because read-side critical sections typically + navigate one or more pointers (the pointers that are changed on + every update) until reaching a data structure of interest, + and then read from there. + + RCU read-side critical sections must use atomic_rcu_read() to + read data, unless concurrent writes are prevented by another + synchronization mechanism. + + Furthermore, RCU read-side critical sections should traverse the + data structure in a single direction, opposite to the direction + in which the updater initializes it. + + void atomic_rcu_set(p, typeof(*p) v); + + atomic_rcu_set() is also similar to atomic_mb_set(), and it also + makes assumptions on the code that calls it in order to allow a more + optimized implementation. + + In particular, atomic_rcu_set() suffices for synchronization + with readers, if the updater never mutates a field within a + data item that is already accessible to readers. This is the + case when initializing a new copy of the RCU-protected data + structure; just ensure that initialization of *p is carried out + before atomic_rcu_set() makes the data item visible to readers. + If this rule is observed, writes will happen in the opposite + order as reads in the RCU read-side critical sections (or if + there is just one update), and there will be no need for other + synchronization mechanism to coordinate the accesses. + +The following APIs must be used before RCU is used in a thread: + + void rcu_register_thread(void); + + Mark a thread as taking part in the RCU mechanism. Such a thread + will have to report quiescent points regularly, either manually + or through the QemuCond/QemuSemaphore/QemuEvent APIs. + + void rcu_unregister_thread(void); + + Mark a thread as not taking part anymore in the RCU mechanism. + It is not a problem if such a thread reports quiescent points, + either manually or by using the QemuCond/QemuSemaphore/QemuEvent + APIs. + +Note that these APIs are relatively heavyweight, and should _not_ be +nested. + + +DIFFERENCES WITH LINUX +====================== + +- Waiting on a mutex is possible, though discouraged, within an RCU critical + section. This is because spinlocks are rarely (if ever) used in userspace + programming; not allowing this would prevent upgrading an RCU read-side + critical section to become an updater. + +- atomic_rcu_read and atomic_rcu_set replace rcu_dereference and + rcu_assign_pointer. They take a _pointer_ to the variable being accessed. + +- call_rcu is a macro that has an extra argument (the name of the first + field in the struct, which must be a struct rcu_head), and expects the + type of the callback's argument to be the type of the first argument. + call_rcu1 is the same as Linux's call_rcu. + + +RCU PATTERNS +============ + +Many patterns using read-writer locks translate directly to RCU, with +the advantages of higher scalability and deadlock immunity. + +In general, RCU can be used whenever it is possible to create a new +"version" of a data structure every time the updater runs. This may +sound like a very strict restriction, however: + +- the updater does not mean "everything that writes to a data structure", + but rather "everything that involves a reclamation step". See the + array example below + +- in some cases, creating a new version of a data structure may actually + be very cheap. For example, modifying the "next" pointer of a singly + linked list is effectively creating a new version of the list. + +Here are some frequently-used RCU idioms that are worth noting. + + +RCU list processing +------------------- + +TBD (not yet used in QEMU) + + +RCU reference counting +---------------------- + +Because grace periods are not allowed to complete while there is an RCU +read-side critical section in progress, the RCU read-side primitives +may be used as a restricted reference-counting mechanism. For example, +consider the following code fragment: + + rcu_read_lock(); + p = atomic_rcu_read(&foo); + /* do something with p. */ + rcu_read_unlock(); + +The RCU read-side critical section ensures that the value of "p" remains +valid until after the rcu_read_unlock(). In some sense, it is acquiring +a reference to p that is later released when the critical section ends. +The write side looks simply like this (with appropriate locking): + + qemu_mutex_lock(&foo_mutex); + old = foo; + atomic_rcu_set(&foo, new); + qemu_mutex_unlock(&foo_mutex); + synchronize_rcu(); + free(old); + +If the processing cannot be done purely within the critical section, it +is possible to combine this idiom with a "real" reference count: + + rcu_read_lock(); + p = atomic_rcu_read(&foo); + foo_ref(p); + rcu_read_unlock(); + /* do something with p. */ + foo_unref(p); + +The write side can be like this: + + qemu_mutex_lock(&foo_mutex); + old = foo; + atomic_rcu_set(&foo, new); + qemu_mutex_unlock(&foo_mutex); + synchronize_rcu(); + foo_unref(old); + +or with call_rcu: + + qemu_mutex_lock(&foo_mutex); + old = foo; + atomic_rcu_set(&foo, new); + qemu_mutex_unlock(&foo_mutex); + call_rcu(foo_unref, old, rcu); + +In both cases, the write side only performs removal. Reclamation +happens when the last reference to a "foo" object is dropped. +Using synchronize_rcu() is undesirably expensive, because the +last reference may be dropped on the read side. Hence you can +use call_rcu() instead: + + foo_unref(struct foo *p) { + if (atomic_fetch_dec(&p->refcount) == 1) { + call_rcu(foo_destroy, p, rcu); + } + } + + +Note that the same idioms would be possible with reader/writer +locks: + + read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock); + p = foo; p = foo; + /* do something with p. */ foo = new; + read_unlock(&foo_rwlock); free(p); + write_mutex_unlock(&foo_rwlock); + free(p); + + ------------------------------------------------------------------ + + read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock); + p = foo; old = foo; + foo_ref(p); foo = new; + read_unlock(&foo_rwlock); foo_unref(old); + /* do something with p. */ write_mutex_unlock(&foo_rwlock); + read_lock(&foo_rwlock); + foo_unref(p); + read_unlock(&foo_rwlock); + +foo_unref could use a mechanism such as bottom halves to move deallocation +out of the write-side critical section. + + +RCU resizable arrays +-------------------- + +Resizable arrays can be used with RCU. The expensive RCU synchronization +(or call_rcu) only needs to take place when the array is resized. +The two items to take care of are: + +- ensuring that the old version of the array is available between removal + and reclamation; + +- avoiding mismatches in the read side between the array data and the + array size. + +The first problem is avoided simply by not using realloc. Instead, +each resize will allocate a new array and copy the old data into it. +The second problem would arise if the size and the data pointers were +two members of a larger struct: + + struct mystuff { + ... + int data_size; + int data_alloc; + T *data; + ... + }; + +Instead, we store the size of the array with the array itself: + + struct arr { + int size; + int alloc; + T data[]; + }; + struct arr *global_array; + + read side: + rcu_read_lock(); + struct arr *array = atomic_rcu_read(&global_array); + x = i < array->size ? array->data[i] : -1; + rcu_read_unlock(); + return x; + + write side (running under a lock): + if (global_array->size == global_array->alloc) { + /* Creating a new version. */ + new_array = g_malloc(sizeof(struct arr) + + global_array->alloc * 2 * sizeof(T)); + new_array->size = global_array->size; + new_array->alloc = global_array->alloc * 2; + memcpy(new_array->data, global_array->data, + global_array->alloc * sizeof(T)); + + /* Removal phase. */ + old_array = global_array; + atomic_rcu_set(&new_array->data, new_array); + synchronize_rcu(); + + /* Reclamation phase. */ + free(old_array); + } + + +SOURCES +======= + +* Documentation/RCU/ from the Linux kernel diff --git a/docs/devel/tracing.txt b/docs/devel/tracing.txt new file mode 100644 index 0000000000..8c0029beca --- /dev/null +++ b/docs/devel/tracing.txt @@ -0,0 +1,442 @@ += Tracing = + +== Introduction == + +This document describes the tracing infrastructure in QEMU and how to use it +for debugging, profiling, and observing execution. + +== Quickstart == + +1. Build with the 'simple' trace backend: + + ./configure --enable-trace-backends=simple + make + +2. Create a file with the events you want to trace: + + echo bdrv_aio_readv > /tmp/events + echo bdrv_aio_writev >> /tmp/events + +3. Run the virtual machine to produce a trace file: + + qemu -trace events=/tmp/events ... # your normal QEMU invocation + +4. Pretty-print the binary trace file: + + ./scripts/simpletrace.py trace-events-all trace-* # Override * with QEMU + +== Trace events == + +=== Sub-directory setup === + +Each directory in the source tree can declare a set of static trace events +in a local "trace-events" file. All directories which contain "trace-events" +files must be listed in the "trace-events-subdirs" make variable in the top +level Makefile.objs. During build, the "trace-events" file in each listed +subdirectory will be processed by the "tracetool" script to generate code for +the trace events. + +The individual "trace-events" files are merged into a "trace-events-all" file, +which is also installed into "/usr/share/qemu" with the name "trace-events". +This merged file is to be used by the "simpletrace.py" script to later analyse +traces in the simpletrace data format. + +In the sub-directory the following files will be automatically generated + + - trace.c - the trace event state declarations + - trace.h - the trace event enums and probe functions + - trace-dtrace.h - DTrace event probe specification + - trace-dtrace.dtrace - DTrace event probe helper declaration + - trace-dtrace.o - binary DTrace provider (generated by dtrace) + - trace-ust.h - UST event probe helper declarations + +Source files in the sub-directory should #include the local 'trace.h' file, +without any sub-directory path prefix. eg io/channel-buffer.c would do + + #include "trace.h" + +To access the 'io/trace.h' file. While it is possible to include a trace.h +file from outside a source files' own sub-directory, this is discouraged in +general. It is strongly preferred that all events be declared directly in +the sub-directory that uses them. The only exception is where there are some +shared trace events defined in the top level directory trace-events file. +The top level directory generates trace files with a filename prefix of +"trace-root" instead of just "trace". This is to avoid ambiguity between +a trace.h in the current directory, vs the top level directory. + +=== Using trace events === + +Trace events are invoked directly from source code like this: + + #include "trace.h" /* needed for trace event prototype */ + + void *qemu_vmalloc(size_t size) + { + void *ptr; + size_t align = QEMU_VMALLOC_ALIGN; + + if (size < align) { + align = getpagesize(); + } + ptr = qemu_memalign(align, size); + trace_qemu_vmalloc(size, ptr); + return ptr; + } + +=== Declaring trace events === + +The "tracetool" script produces the trace.h header file which is included by +every source file that uses trace events. Since many source files include +trace.h, it uses a minimum of types and other header files included to keep the +namespace clean and compile times and dependencies down. + +Trace events should use types as follows: + + * Use stdint.h types for fixed-size types. Most offsets and guest memory + addresses are best represented with uint32_t or uint64_t. Use fixed-size + types over primitive types whose size may change depending on the host + (32-bit versus 64-bit) so trace events don't truncate values or break + the build. + + * Use void * for pointers to structs or for arrays. The trace.h header + cannot include all user-defined struct declarations and it is therefore + necessary to use void * for pointers to structs. + + * For everything else, use primitive scalar types (char, int, long) with the + appropriate signedness. + +Format strings should reflect the types defined in the trace event. Take +special care to use PRId64 and PRIu64 for int64_t and uint64_t types, +respectively. This ensures portability between 32- and 64-bit platforms. + +Each event declaration will start with the event name, then its arguments, +finally a format string for pretty-printing. For example: + + qemu_vmalloc(size_t size, void *ptr) "size %zu ptr %p" + qemu_vfree(void *ptr) "ptr %p" + + +=== Hints for adding new trace events === + +1. Trace state changes in the code. Interesting points in the code usually + involve a state change like starting, stopping, allocating, freeing. State + changes are good trace events because they can be used to understand the + execution of the system. + +2. Trace guest operations. Guest I/O accesses like reading device registers + are good trace events because they can be used to understand guest + interactions. + +3. Use correlator fields so the context of an individual line of trace output + can be understood. For example, trace the pointer returned by malloc and + used as an argument to free. This way mallocs and frees can be matched up. + Trace events with no context are not very useful. + +4. Name trace events after their function. If there are multiple trace events + in one function, append a unique distinguisher at the end of the name. + +== Generic interface and monitor commands == + +You can programmatically query and control the state of trace events through a +backend-agnostic interface provided by the header "trace/control.h". + +Note that some of the backends do not provide an implementation for some parts +of this interface, in which case QEMU will just print a warning (please refer to +header "trace/control.h" to see which routines are backend-dependent). + +The state of events can also be queried and modified through monitor commands: + +* info trace-events + View available trace events and their state. State 1 means enabled, state 0 + means disabled. + +* trace-event NAME on|off + Enable/disable a given trace event or a group of events (using wildcards). + +The "-trace events=" command line argument can be used to enable the +events listed in from the very beginning of the program. This file must +contain one event name per line. + +If a line in the "-trace events=" file begins with a '-', the trace event +will be disabled instead of enabled. This is useful when a wildcard was used +to enable an entire family of events but one noisy event needs to be disabled. + +Wildcard matching is supported in both the monitor command "trace-event" and the +events list file. That means you can enable/disable the events having a common +prefix in a batch. For example, virtio-blk trace events could be enabled using +the following monitor command: + + trace-event virtio_blk_* on + +== Trace backends == + +The "tracetool" script automates tedious trace event code generation and also +keeps the trace event declarations independent of the trace backend. The trace +events are not tightly coupled to a specific trace backend, such as LTTng or +SystemTap. Support for trace backends can be added by extending the "tracetool" +script. + +The trace backends are chosen at configure time: + + ./configure --enable-trace-backends=simple + +For a list of supported trace backends, try ./configure --help or see below. +If multiple backends are enabled, the trace is sent to them all. + +If no backends are explicitly selected, configure will default to the +"log" backend. + +The following subsections describe the supported trace backends. + +=== Nop === + +The "nop" backend generates empty trace event functions so that the compiler +can optimize out trace events completely. This imposes no performance +penalty. + +Note that regardless of the selected trace backend, events with the "disable" +property will be generated with the "nop" backend. + +=== Log === + +The "log" backend sends trace events directly to standard error. This +effectively turns trace events into debug printfs. + +This is the simplest backend and can be used together with existing code that +uses DPRINTF(). + +=== Simpletrace === + +The "simple" backend supports common use cases and comes as part of the QEMU +source tree. It may not be as powerful as platform-specific or third-party +trace backends but it is portable. This is the recommended trace backend +unless you have specific needs for more advanced backends. + +=== Ftrace === + +The "ftrace" backend writes trace data to ftrace marker. This effectively +sends trace events to ftrace ring buffer, and you can compare qemu trace +data and kernel(especially kvm.ko when using KVM) trace data. + +if you use KVM, enable kvm events in ftrace: + + # echo 1 > /sys/kernel/debug/tracing/events/kvm/enable + +After running qemu by root user, you can get the trace: + + # cat /sys/kernel/debug/tracing/trace + +Restriction: "ftrace" backend is restricted to Linux only. + +=== Syslog === + +The "syslog" backend sends trace events using the POSIX syslog API. The log +is opened specifying the LOG_DAEMON facility and LOG_PID option (so events +are tagged with the pid of the particular QEMU process that generated +them). All events are logged at LOG_INFO level. + +NOTE: syslog may squash duplicate consecutive trace events and apply rate + limiting. + +Restriction: "syslog" backend is restricted to POSIX compliant OS. + +==== Monitor commands ==== + +* trace-file on|off|flush|set + Enable/disable/flush the trace file or set the trace file name. + +==== Analyzing trace files ==== + +The "simple" backend produces binary trace files that can be formatted with the +simpletrace.py script. The script takes the "trace-events-all" file and the +binary trace: + + ./scripts/simpletrace.py trace-events-all trace-12345 + +You must ensure that the same "trace-events-all" file was used to build QEMU, +otherwise trace event declarations may have changed and output will not be +consistent. + +=== LTTng Userspace Tracer === + +The "ust" backend uses the LTTng Userspace Tracer library. There are no +monitor commands built into QEMU, instead UST utilities should be used to list, +enable/disable, and dump traces. + +Package lttng-tools is required for userspace tracing. You must ensure that the +current user belongs to the "tracing" group, or manually launch the +lttng-sessiond daemon for the current user prior to running any instance of +QEMU. + +While running an instrumented QEMU, LTTng should be able to list all available +events: + + lttng list -u + +Create tracing session: + + lttng create mysession + +Enable events: + + lttng enable-event qemu:g_malloc -u + +Where the events can either be a comma-separated list of events, or "-a" to +enable all tracepoint events. Start and stop tracing as needed: + + lttng start + lttng stop + +View the trace: + + lttng view + +Destroy tracing session: + + lttng destroy + +Babeltrace can be used at any later time to view the trace: + + babeltrace $HOME/lttng-traces/mysession--