On Fri, Jul 26, 2024 at 01:28:17PM +0100, David Woodhouse wrote:
> diff --git a/include/uapi/linux/vmclock-abi.h 
> b/include/uapi/linux/vmclock-abi.h
> new file mode 100644
> index 000000000000..7b1b4759363c
> --- /dev/null
> +++ b/include/uapi/linux/vmclock-abi.h
> @@ -0,0 +1,187 @@
> +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
> +
> +/*
> + * This structure provides a vDSO-style clock to VM guests, exposing the
> + * relationship (or lack thereof) between the CPU clock (TSC, timebase, arch
> + * counter, etc.) and real time. It is designed to address the problem of
> + * live migration, which other clock enlightenments do not.
> + *
> + * When a guest is live migrated, this affects the clock in two ways.
> + *
> + * First, even between identical hosts the actual frequency of the underlying
> + * counter will change within the tolerances of its specification (typically
> + * ±50PPM, or 4 seconds a day). This frequency also varies over time on the
> + * same host, but can be tracked by NTP as it generally varies slowly. With
> + * live migration there is a step change in the frequency, with no warning.
> + *
> + * Second, there may be a step change in the value of the counter itself, as
> + * its accuracy is limited by the precision of the NTP synchronization on the
> + * source and destination hosts.
> + *
> + * So any calibration (NTP, PTP, etc.) which the guest has done on the source
> + * host before migration is invalid, and needs to be redone on the new host.
> + *
> + * In its most basic mode, this structure provides only an indication to the
> + * guest that live migration has occurred. This allows the guest to know that
> + * its clock is invalid and take remedial action. For applications that need
> + * reliable accurate timestamps (e.g. distributed databases), the structure
> + * can be mapped all the way to userspace. This allows the application to see
> + * directly for itself that the clock is disrupted and take appropriate
> + * action, even when using a vDSO-style method to get the time instead of a
> + * system call.
> + *
> + * In its more advanced mode, this structure can also be used to expose the
> + * precise relationship of the CPU counter to real time, as calibrated by the
> + * host. This means that userspace applications can have accurate time
> + * immediately after live migration, rather than having to pause operations
> + * and wait for NTP to recover. This mode does, of course, rely on the
> + * counter being reliable and consistent across CPUs.
> + *
> + * Note that this must be true UTC, never with smeared leap seconds. If a
> + * guest wishes to construct a smeared clock, it can do so. Presenting a
> + * smeared clock through this interface would be problematic because it
> + * actually messes with the apparent counter *period*. A linear smearing
> + * of 1 ms per second would effectively tweak the counter period by 1000PPM
> + * at the start/end of the smearing period, while a sinusoidal smear would
> + * basically be impossible to represent.
> + *
> + * This structure is offered with the intent that it be adopted into the
> + * nascent virtio-rtc standard, as a virtio-rtc that does not address the live
> + * migration problem seems a little less than fit for purpose. For that
> + * reason, certain fields use precisely the same numeric definitions as in
> + * the virtio-rtc proposal. The structure can also be exposed through an ACPI
> + * device with the CID "VMCLOCK", modelled on the "VMGENID" device except for
> + * the fact that it uses a real _CRS to convey the address of the structure
> + * (which should be a full page, to allow for mapping directly to userspace).
> + */
> +
> +#ifndef __VMCLOCK_ABI_H__
> +#define __VMCLOCK_ABI_H__
> +
> +#ifdef __KERNEL__
> +#include <linux/types.h>
> +#else
> +#include <stdint.h>
> +#endif
> +
> +struct vmclock_abi {
> +     /* CONSTANT FIELDS */
> +     uint32_t magic;
> +#define VMCLOCK_MAGIC        0x4b4c4356 /* "VCLK" */
> +     uint32_t size;          /* Size of region containing this structure */
> +     uint16_t version;       /* 1 */
> +     uint8_t counter_id; /* Matches VIRTIO_RTC_COUNTER_xxx except INVALID */
> +#define VMCLOCK_COUNTER_ARM_VCNT     0
> +#define VMCLOCK_COUNTER_X86_TSC              1
> +#define VMCLOCK_COUNTER_INVALID              0xff
> +     uint8_t time_type; /* Matches VIRTIO_RTC_TYPE_xxx */
> +#define VMCLOCK_TIME_UTC                     0       /* Since 1970-01-01 00:00:00z */
> +#define VMCLOCK_TIME_TAI                     1       /* Since 1970-01-01 00:00:00z */
> +#define VMCLOCK_TIME_MONOTONIC                       2       /* Since undefined epoch */
> +#define VMCLOCK_TIME_INVALID_SMEARED         3       /* Not supported */
> +#define VMCLOCK_TIME_INVALID_MAYBE_SMEARED   4       /* Not supported */
> +
> +     /* NON-CONSTANT FIELDS PROTECTED BY SEQCOUNT LOCK */
> +     uint32_t seq_count;     /* Low bit means an update is in progress */
> +     /*
> +      * This field changes to another non-repeating value when the CPU
> +      * counter is disrupted, for example on live migration. This lets
> +      * the guest know that it should discard any calibration it has
> +      * performed of the counter against external sources (NTP/PTP/etc.).
> +      */
> +     uint64_t disruption_marker;
> +     uint64_t flags;
> +     /* Indicates that the tai_offset_sec field is valid */
> +#define VMCLOCK_FLAG_TAI_OFFSET_VALID                (1 << 0)
> +     /*
> +      * Optionally used to notify guests of pending maintenance events.
> +      * A guest which provides latency-sensitive services may wish to
> +      * remove itself from service if an event is coming up. Two flags
> +      * indicate the approximate imminence of the event.
> +      */
> +#define VMCLOCK_FLAG_DISRUPTION_SOON         (1 << 1) /* About a day */
> +#define VMCLOCK_FLAG_DISRUPTION_IMMINENT     (1 << 2) /* About an hour */
> +#define VMCLOCK_FLAG_PERIOD_ESTERROR_VALID   (1 << 3)
> +#define VMCLOCK_FLAG_PERIOD_MAXERROR_VALID   (1 << 4)
> +#define VMCLOCK_FLAG_TIME_ESTERROR_VALID     (1 << 5)
> +#define VMCLOCK_FLAG_TIME_MAXERROR_VALID     (1 << 6)
> +     /*
> +      * If the MONOTONIC flag is set then (other than leap seconds) it is
> +      * guaranteed that the time calculated according to this structure at
> +      * any given moment shall never appear to be later than the time
> +      * calculated via the structure at any *later* moment.
> +      *
> +      * In particular, a timestamp based on a counter reading taken
> +      * immediately after setting the low bit of seq_count (and the
> +      * associated memory barrier), using the previously-valid time and
> +      * period fields, shall never be later than a timestamp based on
> +      * a counter reading taken immediately before *clearing* the low
> +      * bit again after the update, using the about-to-be-valid fields.
> +      */
> +#define VMCLOCK_FLAG_TIME_MONOTONIC          (1 << 7)
> +
> +     uint8_t pad[2];
> +     uint8_t clock_status;
> +#define VMCLOCK_STATUS_UNKNOWN               0
> +#define VMCLOCK_STATUS_INITIALIZING  1
> +#define VMCLOCK_STATUS_SYNCHRONIZED  2
> +#define VMCLOCK_STATUS_FREERUNNING   3
> +#define VMCLOCK_STATUS_UNRELIABLE    4
> +
> +     /*
> +      * The time exposed through this device is never smeared. This field
> +      * corresponds to the 'subtype' field in virtio-rtc, which indicates
> +      * the smearing method. However in this case it provides a *hint* to
> +      * the guest operating system, such that *if* the guest OS wants to
> +      * provide its users with an alternative clock which does not follow
> +      * UTC, it may do so in a fashion consistent with the other systems
> +      * in the nearby environment.
> +      */
> +     uint8_t leap_second_smearing_hint; /* Matches VIRTIO_RTC_SUBTYPE_xxx */
> +#define VMCLOCK_SMEARING_STRICT              0
> +#define VMCLOCK_SMEARING_NOON_LINEAR 1
> +#define VMCLOCK_SMEARING_UTC_SLS     2
> +     int16_t tai_offset_sec;
> +     uint8_t leap_indicator;
> +     /*
> +      * This field is based on the VIRTIO_RTC_LEAP_xxx values as
> +      * defined in the current draft of virtio-rtc, but since smearing
> +      * cannot be used with the shared memory device, some values are
> +      * not used.
> +      *
> +      * The _POST_POS and _POST_NEG values allow the guest to perform
> +      * its own smearing during the day or so after a leap second when
> +      * such smearing may need to continue being applied for a leap
> +      * second which is now theoretically "historical".
> +      */
> +#define VMCLOCK_LEAP_NONE    0x00    /* No known nearby leap second */
> +#define VMCLOCK_LEAP_PRE_POS 0x01    /* Positive leap second at EOM */
> +#define VMCLOCK_LEAP_PRE_NEG 0x02    /* Negative leap second at EOM */
> +#define VMCLOCK_LEAP_POS     0x03    /* Set during 23:59:60 second */
> +#define VMCLOCK_LEAP_POST_POS        0x04
> +#define VMCLOCK_LEAP_POST_NEG        0x05
> +
> +     /* Bit shift for counter_period_frac_sec and its error rate */
> +     uint8_t counter_period_shift;
> +     /*
> +      * Paired values of counter and UTC at a given point in time.
> +      */
> +     uint64_t counter_value;
> +     /*
> +      * Counter period, and error margin of same. The unit of these
> +      * fields is 1/2^(64 + counter_period_shift) of a second.
> +      */
> +     uint64_t counter_period_frac_sec;
> +     uint64_t counter_period_esterror_rate_frac_sec;
> +     uint64_t counter_period_maxerror_rate_frac_sec;
> +
> +     /*
> +      * Time according to time_type field above.
> +      */
> +     uint64_t time_sec;              /* Seconds since time_type epoch */
> +     uint64_t time_frac_sec;         /* Units of 1/2^64 of a second */
> +     uint64_t time_esterror_nanosec;
> +     uint64_t time_maxerror_nanosec;
> +};
> +
> +#endif /*  __VMCLOCK_ABI_H__ */

For purposes of virtio, should we label all the fields here
__le?


> -- 
> 2.44.0
> 
> 



Reply via email to