This patch adds support for storing the TPM's persistent state. The TPM creates state of varying size, depending, for example, on how many keys are loaded into it at a certain time. The worst-case sizes of the different blobs the TPM can write have been pre-calculated, and these values are used to determine the minimum size of the Qcow2 image. It needs to be 63kb. 'qemu-... -tpm ?' shows this number when this backend driver is available.
The layout of the TPM's persistent data in the block storage is as follows: The first sector (512 bytes) holds a primitive directory for the different types of blobs that the TPM can write. This directory holds a revision number, a checksum over its content, the number of entries, and the entries themselves. Each entry is described by its absolute offset, its maximum size, the number of currently valid bytes (the blobs inflate and deflate) and the type of blob it holds (see below for the types). typedef struct BSDir { uint16_t rev; uint32_t checksum; uint32_t num_entries; BSEntry entries[BS_DIR_MAX_NUM_ENTRIES]; } __attribute__((packed)) BSDir; The blobs' worst-case sizes have been calculated, and according to these sizes the blobs are written at certain offsets into the block storage. Their offsets are all aligned to sectors (512 byte boundaries). The TPM provides three different blobs that are written into the storage: - volatile state - permanent state - save state The 'save state' is written when the VM suspends (ACPI S3) and read when it resumes. This is done in concert with the BIOS, where the BIOS needs to send a command to the TPM upon resume (TPM_Startup(ST_STATE)), while the OS issues the command TPM_SaveState(). The 'permanent state' is written when the TPM receives a command that alters its permanent state, e.g., when a key is loaded into the TPM that is expected to be there upon reboot of the machine / VM. Volatile state is written when the frontend triggers it to do so, i.e., when the VM's state is written out while taking a snapshot, during migration, or during suspension to disk (as in 'virsh save'). This state serves to resume at the point where the TPM previously stopped, but there is no need for it after a machine reboot, for example. Tricky parts here are related to encrypted storage, where certain operations need to be deferred since the key for the storage only becomes available much later than the time that the backend is instantiated.
The backend also checks the validity of the block storage. If the Qcow2 image is not encrypted and the directory checksum is found to be bad, the block storage directory will be initialized. If the Qcow2 image is encrypted, initialization will only be done if the directory is found to be all 0s. If the directory cannot be checksummed correctly but is not all 0s, it is assumed that the user provided a wrong key. In this case I do not exit qemu, but black out the TPM interface (it returns 0xff in all memory locations) due to a presumed fatal error and let the VM run (without TPM functionality). Signed-off-by: Stefan Berger <stef...@linux.vnet.ibm.com> --- hw/tpm_builtin.c | 685 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 684 insertions(+), 1 deletion(-) Index: qemu-git/hw/tpm_builtin.c =================================================================== --- qemu-git.orig/hw/tpm_builtin.c +++ qemu-git/hw/tpm_builtin.c @@ -44,6 +44,33 @@ #define PERMSTATE_TYPE 'P' #define VOLASTATE_TYPE 'V' +#define ALIGN(VAL, SIZE) \ + ( ( (VAL) + (SIZE) - 1 ) & ~( (SIZE) - 1 ) ) + + +#define DIRECTORY_SIZE BDRV_SECTOR_SIZE + +#define PERMSTATE_DISK_OFFSET ALIGN(DIRECTORY_SIZE, BDRV_SECTOR_SIZE) +#define PERMSTATE_DISK_SPACE \ + ALIGN(tpmlib_get_prop(TPMPROP_TPM_MAX_NV_SPACE),\ + BDRV_SECTOR_SIZE) +#define SAVESTATE_DISK_OFFSET (PERMSTATE_DISK_OFFSET + PERMSTATE_DISK_SPACE) +#define SAVESTATE_DISK_SPACE \ + ALIGN(tpmlib_get_prop(TPMPROP_TPM_MAX_SAVESTATE_SPACE),\ + BDRV_SECTOR_SIZE) +#define VOLASTATE_DISK_OFFSET (SAVESTATE_DISK_OFFSET + SAVESTATE_DISK_SPACE) +#define VOLASTATE_DISK_SPACE \ + ALIGN(tpmlib_get_prop(TPMPROP_TPM_MAX_VOLATILESTATE_SPACE),\ + BDRV_SECTOR_SIZE) + +# define MINIMUM_BS_SIZE ALIGN(ALIGN(VOLASTATE_DISK_OFFSET +\ + VOLASTATE_DISK_SPACE, \ + BDRV_SECTOR_SIZE), \ + 1024) + +#define MINIMUM_BS_SIZE_KB (int)(MINIMUM_BS_SIZE / 1024) + + /* data structures */ typedef struct ThreadParams { @@ -53,6 +80,37 @@ typedef struct 
ThreadParams { } ThreadParams; +enum BSEntryType { + BS_ENTRY_PERMSTATE, + BS_ENTRY_SAVESTATE, + BS_ENTRY_VOLASTATE, + + BS_ENTRY_LAST, +}; + + +typedef struct BSEntry { + enum BSEntryType type; + uint64_t offset; + uint32_t space; + uint32_t blobsize; +} __attribute__((packed)) BSEntry; + + +#define BS_DIR_MAX_NUM_ENTRIES 3 /* permanent, volatile savestate */ + +typedef struct BSDir { + uint16_t rev; + uint32_t checksum; + uint32_t num_entries; + BSEntry entries[BS_DIR_MAX_NUM_ENTRIES]; +} __attribute__((packed)) BSDir; + + +#define BS_DIR_REV1 1 + +#define BS_DIR_REV_CURRENT BS_DIR_REV1 + /* local variables */ static QemuThread thread; @@ -71,6 +129,7 @@ static bool had_startup_error = false; static bool need_read_volatile = false; static ThreadParams tpm_thread_params; +static BlockDriverState *bs; /* locality of the command being executed by libtpms */ static uint8_t g_locty; @@ -82,6 +141,10 @@ static const unsigned char tpm_std_fatal static char dev_description[80]; +static void adjust_data_layout(BlockDriverState *bs, BSDir *dir); + + + static int tpmlib_get_prop(enum TPMLIB_TPMProperty prop) { int result; @@ -94,7 +157,6 @@ static int tpmlib_get_prop(enum TPMLIB_T } -#if defined DEBUG_TPM || defined DEBUG_TPM_SR static unsigned int memsum(const unsigned char *buf, int len) { int res = 0, i; @@ -105,7 +167,628 @@ static unsigned int memsum(const unsigne return res; } + + +/************************************************ + Block Storage interaction + ***********************************************/ +static int find_bs_entry_idx(BSDir *dir, enum BSEntryType type) +{ + unsigned int c; + + for (c = 0; c < dir->num_entries; c++) { + if (dir->entries[c].type == type) { + return c; + } + } + + return -ENOENT; +} + + +static unsigned int sizeof_bsdir(BSDir *dir) +{ + return offsetof(BSDir, entries) + + dir->num_entries * sizeof(BSEntry); +} + + +static uint32_t calc_dir_checksum(BSDir *dir) +{ + uint16_t checksum, orig; + + orig = dir->checksum; + dir->checksum = 
0; + + checksum = crc32(0, (unsigned char *)dir, sizeof_bsdir(dir)); + + dir->checksum = orig; + + return checksum; +} + + +static void dir_be_to_cpu(BSDir *dir) +{ + unsigned int c; + + be16_to_cpus(&dir->rev); + be32_to_cpus(&dir->checksum); + be32_to_cpus(&dir->num_entries); + + for (c = 0; c < dir->num_entries && c < BS_DIR_MAX_NUM_ENTRIES; c++) { + be32_to_cpus(&dir->entries[c].type); + be64_to_cpus(&dir->entries[c].offset); + be32_to_cpus(&dir->entries[c].space); + be32_to_cpus(&dir->entries[c].blobsize); + } +} + + +static void dir_cpu_to_be(BSDir *dir) +{ + unsigned int c; + + for (c = 0; c < dir->num_entries && c < BS_DIR_MAX_NUM_ENTRIES; c++) { + dir->entries[c].type = cpu_to_be32(dir->entries[c].type); + dir->entries[c].offset = cpu_to_be64(dir->entries[c].offset); + dir->entries[c].space = cpu_to_be32(dir->entries[c].space); + dir->entries[c].blobsize = cpu_to_be32(dir->entries[c].blobsize); + } + + dir->rev = cpu_to_be16(dir->rev); + dir->checksum = cpu_to_be32(dir->checksum); + dir->num_entries = cpu_to_be32(dir->num_entries); +} + + +static bool is_valid_bsdir(BSDir *dir) +{ + if (dir->rev != BS_DIR_REV_CURRENT || + dir->num_entries > BS_DIR_MAX_NUM_ENTRIES) { + return false; + } + return (dir->checksum == calc_dir_checksum(dir)); +} + + +static int create_blank_dir(BlockDriverState *bs) +{ + uint8_t buf[BDRV_SECTOR_SIZE]; + BSDir *dir; + + memset(buf, 0x0, sizeof(buf)); + + dir = (BSDir *)buf; + dir->rev = BS_DIR_REV_CURRENT; + dir->num_entries = 0; + + dir->checksum = calc_dir_checksum(dir); + + dir_cpu_to_be(dir); + + if (bdrv_write(bs, 0, buf, 1) < 0) { + return -EIO; + } + + return 0; +} + + +/** + * Validate the block storage doing some basic tests. That's + * all that can be done at this point since we don't have the + * key yet in case it is encrypted. 
+ */ +static int check_bs(BlockDriverState *bs) +{ + int64_t len; + char buf[20]; + + if (!bs) { + fprintf(stderr, "Need a block driver for this vTPM type.\n"); + goto err_exit; + } + + len = bdrv_getlength(bs); + if (len < MINIMUM_BS_SIZE) { + fprintf(stderr, "Required size for vTPM backing store is %dkb\n", + MINIMUM_BS_SIZE_KB); + goto err_exit; + } + + bdrv_get_format(bs, buf, sizeof(buf)); + if (strcmp(buf, "qcow2")) { + fprintf(stderr, "vTPM backing store must be of type qcow2\n"); + goto err_exit; + } + + return 0; + + err_exit: + fprintf(stderr, + "Create the drive using 'qemu-img create -f qcow2 " + "<filename> %dk'\n", MINIMUM_BS_SIZE_KB); + return -EFAULT; +} + + +static uint32_t get_bs_entry_type_space(enum BSEntryType type) +{ + switch (type) { + case BS_ENTRY_PERMSTATE: + return PERMSTATE_DISK_SPACE; + case BS_ENTRY_SAVESTATE: + return SAVESTATE_DISK_SPACE; + case BS_ENTRY_VOLASTATE: + return VOLASTATE_DISK_SPACE; + default: + assert(false); + } +} + + +/* + * Startup the block storage: read the directory and check whether its + * checksum is valid. If the checksum is not valid then + * + * - if the block storage is not encrypted initialize it assuming it's + * been freshly created or corrupted + * + * - if the block storage is encrypted + * - check whether it's been freshly created (expecting a 0 sum of the + * directory; seems to work with any key) and initialize it in that case + * - otherwise, if there are some unreadable data, assume that + * the wrong key was given and mark it as a starup error. We log it + * but won't exit() here. 
+ */ +static int startup_bs(BlockDriverState *bs) +{ + uint8_t buf[BDRV_SECTOR_SIZE]; + BSDir *dir; + + if (bdrv_read(bs, 0, buf, 1) < 0) { + return -EIO; + } + + dir = (BSDir *)buf; + + dir_be_to_cpu(dir); + + if (!is_valid_bsdir(dir)) { + /* if it's encrypted and has something else than null-content, + we assume to have the wrong key */ + if (bdrv_is_encrypted(bs)) { + if (memsum(buf, sizeof(buf)) != 0) { + fprintf(stderr, + "vTPM block storage directory is not valid. " + "Assuming the key is wrong.\n"); + had_startup_error = true; + return 1; + } + } +#ifdef DEBUG_TPM + fprintf(stderr, "*** tpm: Blanking the storage directory.\n"); #endif + return create_blank_dir(bs); + } + + /* not that we can read the dir, make sure the data are layed out + * correctly. + */ + adjust_data_layout(bs, dir); + + return 0; +} + + +static int create_bs_entry(BlockDriverState *bs, + BSDir *dir, + enum BSEntryType type, + uint32_t blobsize) +{ + uint8_t buf[BDRV_SECTOR_SIZE]; + uint32_t idx = dir->num_entries++; + unsigned int bsdir_size; + + dir->entries[idx].offset = (idx == 0) + ? 
ALIGN(DIRECTORY_SIZE, BDRV_SECTOR_SIZE) + : dir->entries[idx-1].offset + ALIGN(dir->entries[idx-1].space, + BDRV_SECTOR_SIZE); + + dir->entries[idx].type = type; + + dir->entries[idx].space = get_bs_entry_type_space(type); + dir->entries[idx].blobsize = blobsize; + + dir->checksum = calc_dir_checksum(dir); + + bsdir_size = sizeof_bsdir(dir); + + dir_cpu_to_be(dir); + + assert(dir->entries[idx].space >= blobsize); + + memset(buf, 0x0, sizeof(buf)); + memcpy(buf, dir, bsdir_size); + + if (bdrv_write(bs, 0, buf, 1) < 0) { + idx = -EIO; + } + + dir_be_to_cpu(dir); + + return idx; +} + + +static int get_bs_entry(BlockDriverState *bs, + enum BSEntryType type, + BSEntry *entry) +{ + uint8_t buf[BDRV_SECTOR_SIZE]; + BSDir *dir; + int idx; + + if (bdrv_read(bs, 0, buf, 1) < 0) { + return -EIO; + } + + dir = (BSDir *)buf; + + dir_be_to_cpu(dir); + + assert(is_valid_bsdir(dir)); + + if ((idx = find_bs_entry_idx(dir, type)) < 0) { + if ((idx = create_bs_entry(bs, dir, type, 0)) < 0) { + return -EIO; + } + } + + memcpy(entry, &dir->entries[idx], sizeof(*entry)); + + return 0; +} + + +static int set_bs_entry_size(BlockDriverState *bs, + enum BSEntryType type, + BSEntry *entry, + uint32_t blobsize) +{ + uint8_t buf[BDRV_SECTOR_SIZE]; + BSDir *dir; + int idx; + + if (bdrv_read(bs, 0, buf, 1) < 0) { + return -EIO; + } + + dir = (BSDir *)buf; + + dir_be_to_cpu(dir); + + assert(is_valid_bsdir(dir)); + + if ((idx = find_bs_entry_idx(dir, type)) < 0) { + if ((idx = create_bs_entry(bs, dir, type, 0)) < 0) { + return -EIO; + } + } + + assert(blobsize <= dir->entries[idx].space); + dir->entries[idx].blobsize = blobsize; + + dir->checksum = calc_dir_checksum(dir); + + dir_cpu_to_be(dir); + + if (bdrv_write(bs, 0, buf, 1) < 0) { + return -EIO; + } + + dir_be_to_cpu(dir); + + memcpy(entry, &dir->entries[idx], sizeof(*entry)); + + return 0; +} + + +static int load_sized_data_from_bs(BlockDriverState *bs, + enum BSEntryType be, + TPMSizedBuffer *tsb) +{ + BSEntry entry; + int n; + + if ((n = 
get_bs_entry(bs, be, &entry)) < 0) { + return n; + } + +#if defined DEBUG_TPM || defined DEBUG_TPM_SR + fprintf(stderr,"load: be-type: %d, offset: %6ld, size: %5d\n", + be, entry.offset, entry.blobsize); +#endif + + if (entry.blobsize == 0) { + return 0; + } + + tsb->buffer = qemu_malloc(entry.blobsize); + if (!tsb->buffer) { + return -ENOMEM; + } + + tsb->size = entry.blobsize; + + if (bdrv_pread(bs, entry.offset, tsb->buffer, tsb->size) != tsb->size) { + clear_sized_buffer(tsb); + fprintf(stderr,"tpm: Error while reading sized data!\n"); + return -EIO; + } + + return 0; +} + + +static int load_tpm_permanent_state_from_bs(BlockDriverState *bs, + TPMSizedBuffer *tsb) +{ + return load_sized_data_from_bs(bs, BS_ENTRY_PERMSTATE, tsb); +} + + +static int load_tpm_savestate_from_bs(BlockDriverState *bs, + TPMSizedBuffer *tsb) +{ + return load_sized_data_from_bs(bs, BS_ENTRY_SAVESTATE, tsb); +} + + +static int load_tpm_volatile_state_from_bs(BlockDriverState *bs, + TPMSizedBuffer *tsb) +{ + return load_sized_data_from_bs(bs, BS_ENTRY_VOLASTATE, tsb); +} + + +static int save_sized_data_to_bs(BlockDriverState *bs, + enum BSEntryType be, + uint8_t *data, uint32_t data_len) +{ + BSEntry entry; + int n; + + if ((n = set_bs_entry_size(bs, be, &entry, data_len)) < 0) { + return n; + } + + if (data_len > 0) { + if (bdrv_pwrite(bs, entry.offset, data, data_len) != data_len) { + return -EIO; + } + } + + return 0; +} + + +/* Write the TPM's state to block storage */ +static int sync_permanent_state_to_disk(BlockDriverState *bs) +{ + int rc = 0; + + if (permanent_state.size) { + rc = save_sized_data_to_bs(bs, BS_ENTRY_PERMSTATE, + permanent_state.buffer, + permanent_state.size); + } + + return rc; +} + + +static int sync_savestate_to_disk(BlockDriverState *bs) +{ + return save_sized_data_to_bs(bs, BS_ENTRY_SAVESTATE, + save_state.buffer, save_state.size); +} + + +static int sync_volatile_state_to_disk(BlockDriverState *bs) +{ + return save_sized_data_to_bs(bs, BS_ENTRY_VOLASTATE, + 
volatile_state.buffer, volatile_state.size); +} + + +/* + * Write a given type of state, identified by the char, to block + * storage. If anything goes wrong, set the had_fatal_error variable + */ +static int write_state_to_bs(char what) +{ + int rc = 0; + + qemu_mutex_lock(&state_mutex); + + switch (what) { + case PERMSTATE_TYPE: + rc = sync_permanent_state_to_disk(bs); + break; + case SAVESTATE_TYPE: + rc = sync_savestate_to_disk(bs); + break; + case VOLASTATE_TYPE: + rc = sync_volatile_state_to_disk(bs); + break; + default: + assert(false); + } + + if (rc) { + fprintf(stderr,"tpm: Error while writing TPM state to bs. " + "Setting fatal error."); + had_fatal_error = true; + } + + qemu_mutex_unlock(&state_mutex); + + return rc; +} + + +/* + * Write the 'savestate' or 'permanent state' in the + * global buffer to disk. The requester tells us what + * to writy by a single byte in the pipe. If anything + * goes wrong, we'll set the had_fatal_error flag. + * We sync with the requester using signals on a + * condition. + */ +static void fulfill_sync_to_bs_request(void *opaque) +{ + char buf[10]; + int c, n; + + while ((n = read(pipefd[0], buf, sizeof(buf))) > 0) { + for (c = 0; c < n; c++) { + write_state_to_bs(buf[c]); + } + } + + qemu_cond_signal(&bs_write_result_cond); +} + + +/* + * Request that either savestate or permanent state be written + * to the disk. Call this function with the state_mutex held. + * It will synchronize with the sync_to_bs function that does + * the work. In case a previous fatal error occurred, nothing + * will be done. 
+ */ +static bool request_sync_to_bs(char what) +{ + char cmd[1] = { what }; + + if (had_fatal_error) { + return had_fatal_error; + } + + if (write(pipefd[1], cmd, 1) != 1) { + had_fatal_error = true; + return true; + } + + if (tpm_initialized) { + qemu_cond_wait(&bs_write_result_cond, &state_mutex); + } else { + /* during initialization: defer the write */ +#if defined DEBUG_TPM || defined DEBUG_TPM_SR + fprintf(stderr,"tpm: deferred write/sync since not in thread\n"); +#endif + } + + return had_fatal_error; +} + + +static void load_tpm_state_from_bs(BlockDriverState *bs) +{ + load_tpm_permanent_state_from_bs(bs, &permanent_state); + load_tpm_savestate_from_bs(bs, &save_state); + + if (need_read_volatile) { + clear_sized_buffer(&volatile_state); + load_tpm_volatile_state_from_bs(bs, &volatile_state); + need_read_volatile = false; + } +} + + +/* + * Adjust the layout of the data. This may be necessary if + * we migrated to a TPM implementation with different blob + * sizes that would altogether fit onto the disk but write into + * the next blob due to their current layout. + * + * At this point we must already have passed the size check of + * the drive and know it is big enough. For encrypted storage + * we must have the (right!) key at this point. 
+ */ +static void adjust_data_layout(BlockDriverState *bs, BSDir *dir) +{ + unsigned int c; + int rc; + uint64_t exp_offset = ALIGN(DIRECTORY_SIZE, BDRV_SECTOR_SIZE); + bool need_move = false; + TPMSizedBuffer tsb[BS_ENTRY_LAST]; + enum BSEntryType type; + + /* check all entries' space and offsets */ + for (c = 0; c < dir->num_entries; c++) { + if (dir->entries[c].offset != exp_offset) { + fprintf(stdout, + "entry[%d].type = %d, offset = %ld, expected = %ld\n", + c, + dir->entries[c].type, + dir->entries[c].offset, + exp_offset); + need_move = true; + break; + } + if (dir->entries[c].space != + get_bs_entry_type_space(dir->entries[c].type)) { + fprintf(stdout, + "entry[%d].type = %d, space = %d, expected = %d\n", + c, + dir->entries[c].type, + dir->entries[c].space, + get_bs_entry_type_space(dir->entries[c].type)); + need_move = true; + break; + } + exp_offset = ALIGN(exp_offset + dir->entries[c].space, + BDRV_SECTOR_SIZE); + } + +#if defined DEBUG_TPM || defined DEBUG_TPM_SR + fprintf(stderr," Data move necessary: %s\n", + (need_move) ? "true" : "false"); +#endif + + if (need_move) { + /* + * read data + * create blank dir + * write all data back + */ + + for (type = 0; type < BS_ENTRY_LAST; type++) { + tsb[type].size = 0; + tsb[type].buffer = NULL; + + load_sized_data_from_bs(bs, type, &tsb[type]); + } + + create_blank_dir(bs); + dir = NULL; + + for (type = 0; type < BS_ENTRY_LAST; type++) { + rc = save_sized_data_to_bs(bs, type, + tsb[type].buffer, + tsb[type].size); + if (rc) { + had_fatal_error = true; + break; + } + clear_sized_buffer(&tsb[type]); + } + } +} /** Index: qemu-git/configure =================================================================== --- qemu-git.orig/configure +++ qemu-git/configure @@ -3340,6 +3340,9 @@ if test "$linux" = "yes" && test "$tpm" fi if test "$has_tpm" = "1"; then + if test -r /usr/include/libtpms/tpm_library.h ; then + echo "CONFIG_TPM_BUILTIN=y" >> $config_target_mak + fi echo "CONFIG_TPM=y" >> $config_host_mak fi fi