[PATCH v4 03/12] mm: balloon: use general non-lru movable page feature
Now, VM has a feature to migrate non-lru movable pages so balloon doesn't need custom migration hooks in migrate.c and compaction.c. Instead, this patch implements page->mapping->a_ops->{isolate|migrate|putback} functions. With that, we could remove hooks for ballooning in general migration functions and make balloon compaction simple. Cc: virtualization@lists.linux-foundation.org Cc: Rafael Aquini Cc: Konstantin Khlebnikov Signed-off-by: Gioh Kim Signed-off-by: Minchan Kim --- drivers/virtio/virtio_balloon.c| 52 +++-- include/linux/balloon_compaction.h | 51 ++--- include/uapi/linux/magic.h | 1 + mm/balloon_compaction.c| 94 +++--- mm/compaction.c| 7 --- mm/migrate.c | 19 +--- mm/vmscan.c| 2 +- 7 files changed, 83 insertions(+), 143 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 7b6d74f0c72f..04fc63b4a735 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -30,6 +30,7 @@ #include #include #include +#include /* * Balloon device works in 4K page units. 
So each page is pointed to by @@ -45,6 +46,10 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES; module_param(oom_pages, int, S_IRUSR | S_IWUSR); MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); +#ifdef CONFIG_BALLOON_COMPACTION +static struct vfsmount *balloon_mnt; +#endif + struct virtio_balloon { struct virtio_device *vdev; struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; @@ -486,6 +491,24 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, return MIGRATEPAGE_SUCCESS; } + +static struct dentry *balloon_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static const struct dentry_operations ops = { + .d_dname = simple_dname, + }; + + return mount_pseudo(fs_type, "balloon-kvm:", NULL, &ops, + BALLOON_KVM_MAGIC); +} + +static struct file_system_type balloon_fs = { + .name = "balloon-kvm", + .mount = balloon_mount, + .kill_sb= kill_anon_super, +}; + #endif /* CONFIG_BALLOON_COMPACTION */ static int virtballoon_probe(struct virtio_device *vdev) @@ -515,9 +538,6 @@ static int virtballoon_probe(struct virtio_device *vdev) vb->vdev = vdev; balloon_devinfo_init(&vb->vb_dev_info); -#ifdef CONFIG_BALLOON_COMPACTION - vb->vb_dev_info.migratepage = virtballoon_migratepage; -#endif err = init_vqs(vb); if (err) @@ -527,13 +547,33 @@ static int virtballoon_probe(struct virtio_device *vdev) vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY; err = register_oom_notifier(&vb->nb); if (err < 0) - goto out_oom_notify; + goto out_del_vqs; + +#ifdef CONFIG_BALLOON_COMPACTION + balloon_mnt = kern_mount(&balloon_fs); + if (IS_ERR(balloon_mnt)) { + err = PTR_ERR(balloon_mnt); + unregister_oom_notifier(&vb->nb); + goto out_del_vqs; + } + + vb->vb_dev_info.migratepage = virtballoon_migratepage; + vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb); + if (IS_ERR(vb->vb_dev_info.inode)) { + err = PTR_ERR(vb->vb_dev_info.inode); + kern_unmount(balloon_mnt); + unregister_oom_notifier(&vb->nb); + 
vb->vb_dev_info.inode = NULL; + goto out_del_vqs; + } + vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops; +#endif virtio_device_ready(vdev); return 0; -out_oom_notify: +out_del_vqs: vdev->config->del_vqs(vdev); out_free_vb: kfree(vb); @@ -567,6 +607,8 @@ static void virtballoon_remove(struct virtio_device *vdev) cancel_work_sync(&vb->update_balloon_stats_work); remove_common(vb); + if (vb->vb_dev_info.inode) + iput(vb->vb_dev_info.inode); kfree(vb); } diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 9b0a15d06a4f..79542b2698ec 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -48,6 +48,7 @@ #include #include #include +#include /* * Balloon device information descriptor. @@ -62,6 +63,7 @@ struct balloon_dev_info { struct list_head pages; /* Pages enqueued & handled to Host */ int (*migratepage)(struct balloon_dev_info *, struct page *newpage, struct page *page, enum migrate_mode mode); + struct inode *inode; }; extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info); @@ -73,45 +75,19 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) spin_loc
[PATCH v4 02/12] mm: migrate: support non-lru movable page migration
We have allowed migration for only LRU pages until now and it was enough to make high-order pages. But recently, embedded system(e.g., webOS, android) uses lots of non-movable pages(e.g., zram, GPU memory) so we have seen several reports about troubles of small high-order allocation. For fixing the problem, there were several efforts (e.g., enhance compaction algorithm, SLUB fallback to 0-order page, reserved memory, vmalloc and so on) but if there are lots of non-movable pages in system, their solutions are void in the long run. So, this patch is to support facility to change non-movable pages with movable. For the feature, this patch introduces functions related to migration to address_space_operations as well as some page flags. If a driver wants to make own pages movable, it should define three functions which are function pointers of struct address_space_operations. 1. bool (*isolate_page) (struct page *page, isolate_mode_t mode); What VM expects on isolate_page function of driver is to return *true* if driver isolates page successfully. On returning true, VM marks the page as PG_isolated so concurrent isolation in several CPUs skip the page for isolation. If a driver cannot isolate the page, it should return *false*. Once page is successfully isolated, VM uses page.lru fields so driver shouldn't expect to preserve values in those fields. 2. int (*migratepage) (struct address_space *mapping, struct page *newpage, struct page *oldpage, enum migrate_mode); After isolation, VM calls migratepage of driver with isolated page. The function of migratepage is to move content of the old page to new page and set up fields of struct page newpage. Keep in mind that you should clear PG_movable of oldpage via __ClearPageMovable under page_lock if you migrated the oldpage successfully and return MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver can return -EAGAIN. 
On -EAGAIN, VM will retry page migration in a short time because VM interprets -EAGAIN as "temporary migration failure". On returning any error except -EAGAIN, VM will give up the page migration without retrying in this time. Driver shouldn't touch the page.lru field that VM is using in these functions. 3. void (*putback_page)(struct page *); If migration fails on isolated page, VM should return the isolated page to the driver so VM calls driver's putback_page with migration failed page. In this function, driver should put the isolated page back to the own data structure. 4. non-lru movable page flags There are two page flags for supporting non-lru movable page. * PG_movable Driver should use the below function to make page movable under page_lock. void __SetPageMovable(struct page *page, struct address_space *mapping) It needs argument of address_space for registering migration family functions which will be called by VM. Exactly speaking, PG_movable is not a real flag of struct page. Rather, VM reuses page->mapping's lower bits to represent it. #define PAGE_MAPPING_MOVABLE 0x2 page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; so driver shouldn't access page->mapping directly. Instead, driver should use page_mapping which masks off the low two bits of page->mapping so it can get right struct address_space. For testing of non-lru movable page, VM supports __PageMovable function. However, it doesn't guarantee to identify non-lru movable page because page->mapping field is unified with other variables in struct page. As well, if driver releases the page after isolation by VM, page->mapping doesn't have stable value although it has PAGE_MAPPING_MOVABLE (Look at __ClearPageMovable). But __PageMovable is cheap to catch whether page is LRU or non-lru movable once the page has been isolated. Because LRU pages can never have PAGE_MAPPING_MOVABLE in page->mapping. 
It is also good for just peeking to test non-lru movable pages before more expensive checking with lock_page in pfn scanning to select victim. For guaranteeing non-lru movable page, VM provides PageMovable function. Unlike __PageMovable, the PageMovable function validates page->mapping and mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden destroying of page->mapping. Driver using __SetPageMovable should clear the flag via __ClearPageMovable under page_lock before releasing the page. * PG_isolated To prevent concurrent isolation among several CPUs, VM marks isolated page as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru movable page, it can skip it. Driver doesn't need to manipulate the flag because VM will set/clear it automatically. Keep in mind that if driver sees PG_isolated page, it means the page has been isolated by VM so it shouldn't touch page.lru field. PG_isolated is an alias of the PG_reclaim flag so driver shouldn't use the flag for its own purpose. Cc: Rik van Riel Cc: Vlastimil Babka Cc: Joonsoo Kim Cc: Mel Gorman Cc: Hugh Dickins Cc: Rafael Aquini Cc: virtualiz
[PATCH v4 00/13] Support non-lru page migration
Recently, I got many reports about performance degradation in embedded system(Android mobile phone, webOS TV and so on) and easy fork fail. The problem was fragmentation caused by zram and GPU driver mainly. With memory pressure, their pages were spread out all of pageblock and it cannot be migrated with current compaction algorithm which supports only LRU pages. In the end, compaction cannot work well so reclaimer shrinks all of working set pages. It made system very slow and even to fail to fork easily which requires order-[2 or 3] allocations. Another pain point is that they cannot use CMA memory space so when OOM kill happens, I can see many free pages in CMA area, which is not memory efficient. In our product which has big CMA memory, it reclaims zones too excessively to allocate GPU and zram page although there are lots of free space in CMA so system becomes very slow easily. To solve these problems, this patch tries to add facility to migrate non-lru pages via introducing new functions and page flags to help migration. struct address_space_operations { .. .. bool (*isolate_page)(struct page *, isolate_mode_t); void (*putback_page)(struct page *); .. } new page flags PG_movable PG_isolated For details, please read description in "mm: migrate: support non-lru movable page migration". Originally, Gioh Kim had tried to support this feature but he moved so I took over the work. I took many code from his work and changed a little bit and Konstantin Khlebnikov helped Gioh a lot so he deserves much credit, too. Thanks, Gioh and Konstantin! This patchset consists of five parts. 1. clean up migration mm: use put_page to free page instead of putback_lru_page 2. add non-lru page migration feature mm: migrate: support non-lru movable page migration 3. rework KVM memory-ballooning mm: balloon: use general non-lru movable page feature 4. 
zsmalloc refactoring for preparing page migration zsmalloc: keep max_object in size_class zsmalloc: use bit_spin_lock zsmalloc: use accessor zsmalloc: factor page chain functionality out zsmalloc: introduce zspage structure zsmalloc: separate free_zspage from putback_zspage zsmalloc: use freeobj for index 5. zsmalloc page migration zsmalloc: page migration support zram: use __GFP_MOVABLE for memory allocation * From v3 * rebase on mmotm-2016-04-06-20-40 * fix swap_info deadlock - Chulmin * race without page_lock - Vlastimil * no use page._mapcount for potential user-mapped page driver - Vlastimil * fix and enhance doc/description - Vlastimil * use page->mapping lower bits to represent PG_movable * make driver side's rule simple. * From v2 * rebase on mmotm-2016-03-29-15-54-16 * check PageMovable before lock_page - Joonsoo * check PageMovable before PageIsolated checking - Joonsoo * add more description about rule * From v1 * rebase on v4.5-mmotm-2016-03-17-15-04 * reordering patches to merge clean-up patches first * add Acked-by/Reviewed-by from Vlastimil and Sergey * use each own mount model instead of reusing anon_inode_fs - Al Viro * small changes - YiPing, Gioh Cc: Vlastimil Babka Cc: dri-de...@lists.freedesktop.org Cc: Hugh Dickins Cc: John Einar Reitan Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Konstantin Khlebnikov Cc: Mel Gorman Cc: Naoya Horiguchi Cc: Rafael Aquini Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: virtualization@lists.linux-foundation.org Cc: Gioh Kim Cc: Chan Gyun Jeong Cc: Sangseok Lee Cc: Kyeongdon Kim Cc: Chulmin Kim Minchan Kim (12): mm: use put_page to free page instead of putback_lru_page mm: migrate: support non-lru movable page migration mm: balloon: use general non-lru movable page feature zsmalloc: keep max_object in size_class zsmalloc: use bit_spin_lock zsmalloc: use accessor zsmalloc: factor page chain functionality out zsmalloc: introduce zspage structure zsmalloc: separate free_zspage from putback_zspage zsmalloc: use freeobj for 
index zsmalloc: page migration support zram: use __GFP_MOVABLE for memory allocation Documentation/filesystems/Locking |4 + Documentation/filesystems/vfs.txt | 11 + Documentation/vm/page_migration| 107 +++- drivers/block/zram/zram_drv.c |3 +- drivers/virtio/virtio_balloon.c| 52 +- include/linux/balloon_compaction.h | 51 +- include/linux/fs.h |2 + include/linux/ksm.h|3 +- include/linux/migrate.h|5 + include/linux/mm.h |1 + include/linux/page-flags.h | 23 +- include/uapi/linux/magic.h |2 + mm/balloon_compaction.c| 94 +-- mm/compaction.c| 40 +- mm/ksm.c |4 +- mm/migrate.c | 287 +++-- mm/page_alloc.c|4 +- mm/util.c |6 +- mm/vmscan.c
Re: [PATCH] vhost_net: stop polling socket during rx processing
On Tue, Apr 26, 2016 at 03:35:53AM -0400, Jason Wang wrote: > We don't stop polling socket during rx processing, this will lead > to unnecessary wakeups from under layer net devices (e.g. > sock_def_readable() from tun). Rx will be slowed down in this > way. This patch avoids this by stopping polling of the socket during rx > processing. A small drawback is that this introduces some overheads in > light load case because of the extra start/stop polling, but single > netperf TCP_RR does not notice any change. In a super heavy load case, > e.g using pktgen to inject packet to guest, we get about ~17% > improvement on pps: > > before: ~137 pkt/s > after: ~150 pkt/s > > Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin There is one other possible enhancement: we actually have the wait queue lock taken in _wake_up, but we give it up only to take it again in the handler. It would be nicer to just remove the entry when we wake the vhost thread. Re-add it if required. I think that something like the below would give you the necessary API. Pls feel free to use it if you are going to implement a patch on top doing this - that's not a reason not to include this simple patch though. ---> wait: add API to drop a wait_queue_t entry from wake up handler A wake up handler might want to remove its own wait queue entry to avoid future wakeups. In particular, vhost has such a need. As wait queue lock is already taken, all we need is an API to remove the entry without wait_queue_head_t which isn't currently accessible to wake up handlers. Signed-off-by: Michael S. 
Tsirkin --- diff --git a/include/linux/wait.h b/include/linux/wait.h index 27d7a0a..9c6604b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -191,11 +191,17 @@ __add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait) } static inline void -__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) +__remove_wait_queue_entry(wait_queue_t *old) { list_del(&old->task_list); } +static inline void +__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) +{ + __remove_wait_queue_entry(old); +} + typedef int wait_bit_action_f(struct wait_bit_key *, int mode); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [RFC PATCH V2 1/2] vhost: convert pre sorted vhost memory array to interval tree
On Fri, Mar 25, 2016 at 10:34:33AM +0800, Jason Wang wrote: > Current pre-sorted memory region array has some limitations for future > device IOTLB conversion: > > 1) need extra work for adding and removing a single region, and it's >expected to be slow because of sorting or memory re-allocation. > 2) need extra work of removing a large range which may intersect >several regions with different size. > 3) need trick for a replacement policy like LRU > > To overcome the above shortcomings, this patch convert it to interval > tree which can easily address the above issue with almost no extra > work. > > The patch could be used for: > > - Extend the current API and only let the userspace to send diffs of > memory table. > - Simplify Device IOTLB implementation. Does this affect performance at all? > > Signed-off-by: Jason Wang > --- > drivers/vhost/net.c | 8 +-- > drivers/vhost/vhost.c | 182 > -- > drivers/vhost/vhost.h | 27 ++-- > 3 files changed, 128 insertions(+), 89 deletions(-) > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index 9eda69e..481db96 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -968,20 +968,20 @@ static long vhost_net_reset_owner(struct vhost_net *n) > struct socket *tx_sock = NULL; > struct socket *rx_sock = NULL; > long err; > - struct vhost_memory *memory; > + struct vhost_umem *umem; > > mutex_lock(&n->dev.mutex); > err = vhost_dev_check_owner(&n->dev); > if (err) > goto done; > - memory = vhost_dev_reset_owner_prepare(); > - if (!memory) { > + umem = vhost_dev_reset_owner_prepare(); > + if (!umem) { > err = -ENOMEM; > goto done; > } > vhost_net_stop(n, &tx_sock, &rx_sock); > vhost_net_flush(n); > - vhost_dev_reset_owner(&n->dev, memory); > + vhost_dev_reset_owner(&n->dev, umem); > vhost_net_vq_reset(n); > done: > mutex_unlock(&n->dev.mutex); > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index ad2146a..32c35a9 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -27,6 +27,7 
@@ > #include > #include > #include > +#include > > #include "vhost.h" > > @@ -42,6 +43,10 @@ enum { > #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num]) > #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num]) > > +INTERVAL_TREE_DEFINE(struct vhost_umem_node, > + rb, __u64, __subtree_last, > + START, LAST, , vhost_umem_interval_tree); > + > #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY > static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq) > { > @@ -275,7 +280,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, > vq->call_ctx = NULL; > vq->call = NULL; > vq->log_ctx = NULL; > - vq->memory = NULL; > + vq->umem = NULL; > vq->is_le = virtio_legacy_is_little_endian(); > vhost_vq_reset_user_be(vq); > } > @@ -381,7 +386,7 @@ void vhost_dev_init(struct vhost_dev *dev, > mutex_init(&dev->mutex); > dev->log_ctx = NULL; > dev->log_file = NULL; > - dev->memory = NULL; > + dev->umem = NULL; > dev->mm = NULL; > spin_lock_init(&dev->work_lock); > INIT_LIST_HEAD(&dev->work_list); > @@ -486,27 +491,36 @@ err_mm: > } > EXPORT_SYMBOL_GPL(vhost_dev_set_owner); > > -struct vhost_memory *vhost_dev_reset_owner_prepare(void) > +static void *vhost_kvzalloc(unsigned long size) > +{ > + void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); > + > + if (!n) > + n = vzalloc(size); > + return n; > +} > + > +struct vhost_umem *vhost_dev_reset_owner_prepare(void) > { > - return kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL); > + return vhost_kvzalloc(sizeof(struct vhost_umem)); > } > EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare); > > /* Caller should have device mutex */ > -void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_memory > *memory) > +void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem) > { > int i; > > vhost_dev_cleanup(dev, true); > > /* Restore memory to default empty mapping. 
*/ > - memory->nregions = 0; > - dev->memory = memory; > + INIT_LIST_HEAD(&umem->umem_list); > + dev->umem = umem; > /* We don't need VQ locks below since vhost_dev_cleanup makes sure >* VQs aren't running. >*/ > for (i = 0; i < dev->nvqs; ++i) > - dev->vqs[i]->memory = memory; > + dev->vqs[i]->umem = umem; > } > EXPORT_SYMBOL_GPL(vhost_dev_reset_owner); > > @@ -523,6 +537,21 @@ void vhost_dev_stop(struct vhost_dev *dev) > } > EXPORT_SYMBOL_GPL(vhost_dev_stop); > > +static void vhost_umem_clean(struct vhost_umem *umem) > +{ > + struct vhost_umem_node *node, *tmp; > + > + if
Re: [RFC PATCH V2 2/2] vhost: device IOTLB API
On Fri, Mar 25, 2016 at 10:34:34AM +0800, Jason Wang wrote: > This patch tries to implement an device IOTLB for vhost. This could be > used with for co-operation with userspace(qemu) implementation of DMA > remapping. > > The idea is simple. When vhost meets an IOTLB miss, it will request > the assistance of userspace to do the translation, this is done > through: > > - Fill the translation request in a preset userspace address (This > address is set through ioctl VHOST_SET_IOTLB_REQUEST_ENTRY). > - Notify userspace through eventfd (This eventfd was set through ioctl > VHOST_SET_IOTLB_FD). Why use an eventfd for this? We use them for interrupts because that happens to be what kvm wants, but here - why don't we just add a generic support for reading out events on the vhost fd itself? > - device IOTLB were started and stopped through VHOST_RUN_IOTLB ioctl > > When userspace finishes the translation, it will update the vhost > IOTLB through VHOST_UPDATE_IOTLB ioctl. Userspace is also in charge of > snooping the IOTLB invalidation of IOMMU IOTLB and use > VHOST_UPDATE_IOTLB to invalidate the possible entry in vhost. There's one problem here, and that is that VQs still do not undergo translation. In theory VQ could be mapped in such a way that it's not contigious in userspace memory. > Signed-off-by: Jason Wang What limits amount of entries that kernel keeps around? Do we want at least a mod parameter for this? 
> --- > drivers/vhost/net.c| 6 +- > drivers/vhost/vhost.c | 301 > +++-- > drivers/vhost/vhost.h | 17 ++- > fs/eventfd.c | 3 +- > include/uapi/linux/vhost.h | 35 ++ > 5 files changed, 320 insertions(+), 42 deletions(-) > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index 481db96..7cbdeed 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -334,7 +334,7 @@ static void handle_tx(struct vhost_net *net) > head = vhost_get_vq_desc(vq, vq->iov, >ARRAY_SIZE(vq->iov), >&out, &in, > - NULL, NULL); > + NULL, NULL, VHOST_ACCESS_RO); > /* On error, stop handling until the next kick. */ > if (unlikely(head < 0)) > break; > @@ -470,7 +470,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, > } > r = vhost_get_vq_desc(vq, vq->iov + seg, > ARRAY_SIZE(vq->iov) - seg, &out, > - &in, log, log_num); > + &in, log, log_num, VHOST_ACCESS_WO); > if (unlikely(r < 0)) > goto err; > > @@ -1083,7 +1083,7 @@ static long vhost_net_ioctl(struct file *f, unsigned > int ioctl, > r = vhost_dev_ioctl(&n->dev, ioctl, argp); > if (r == -ENOIOCTLCMD) > r = vhost_vring_ioctl(&n->dev, ioctl, argp); > - else > + else if (ioctl != VHOST_UPDATE_IOTLB) > vhost_net_flush(n); > mutex_unlock(&n->dev.mutex); > return r; > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 32c35a9..1dd43e8 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -280,6 +280,10 @@ static void vhost_vq_reset(struct vhost_dev *dev, > vq->call_ctx = NULL; > vq->call = NULL; > vq->log_ctx = NULL; > + vq->iotlb_call = NULL; > + vq->iotlb_call_ctx = NULL; > + vq->iotlb_request = NULL; > + vq->pending_request.flags.type = VHOST_IOTLB_INVALIDATE; > vq->umem = NULL; > vq->is_le = virtio_legacy_is_little_endian(); > vhost_vq_reset_user_be(vq); > @@ -387,8 +391,10 @@ void vhost_dev_init(struct vhost_dev *dev, > dev->log_ctx = NULL; > dev->log_file = NULL; > dev->umem = NULL; > + dev->iotlb = NULL; > dev->mm = NULL; > spin_lock_init(&dev->work_lock); > + 
spin_lock_init(&dev->iotlb_lock); > INIT_LIST_HEAD(&dev->work_list); > dev->worker = NULL; > > @@ -537,6 +543,15 @@ void vhost_dev_stop(struct vhost_dev *dev) > } > EXPORT_SYMBOL_GPL(vhost_dev_stop); > > +static void vhost_umem_free(struct vhost_umem *umem, > + struct vhost_umem_node *node) > +{ > + vhost_umem_interval_tree_remove(node, &umem->umem_tree); > + list_del(&node->link); > + kfree(node); > + umem->numem--; > +} > + > static void vhost_umem_clean(struct vhost_umem *umem) > { > struct vhost_umem_node *node, *tmp; > @@ -544,11 +559,9 @@ static void vhost_umem_clean(struct vhost_umem *umem) > if (!umem) > return; > > - list_for_each_entry_safe(node, tmp, &umem->umem_list, link) { > - vhost_umem_interval_tree_remove(node, &umem->umem_tree); > - list_del
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
> > On some systems, including Xen and any system with a physical device > > that speaks virtio behind a physical IOMMU, we must use the DMA API > > for virtio DMA to work at all. > > > > Add a feature bit to detect that: VIRTIO_F_IOMMU_PLATFORM. > > > > If not there, we preserve historic behavior and bypass the DMA > > API unless within Xen guest. This is actually required for > > systems, including SPARC and PPC64, where virtio-pci devices are > > enumerated as though they are behind an IOMMU, but the virtio host > > ignores the IOMMU, so we must either pretend that the IOMMU isn't > > there or somehow map everything as the identity. > > > > Re: non-virtio devices. > > > > It turns out that on old QEMU hosts, only emulated devices which were > > part of QEMU use the IOMMU. Should we want to bypass the IOMMU for such > > devices *only*, it would be rather easy to detect them by looking at > > subsystem vendor and device ID. Thus, no new interfaces are required > > except for virtio which always uses the same subsystem vendor and device ID. Apologies for dropping this thread; I've been travelling. But seriously, NO! I understand why you want to see this as a virtio-specific issue, but it isn't. And we don't *want* it to be. In the guest, drivers SHALL use the DMA API. And the DMA API SHALL do the right thing for each device according to its needs. So any information passed from qemu to the guest should be directed at the platform IOMMU code (or handled by qemu-detection quirks in the guest, if we must). It is *not* acceptable for the virtio drivers in the guest to just eschew the DMA API completely, triggered by some device-specific flag. The qemu implementation is, of course, monolithic. In qemu the fact that virtio doesn't get translated by the emulated IOMMU *is* actually down to code in the virtio implementation. I get that. But then again, it's not just virtio. *Any* device which we emulate for the guest could have that same issue, and appear as untranslated. 
(And assigned PCI devices currently do). Let's think about the parallel with a system-on-chip. Let's say we have a peripheral which got included, but which was wired up such that it bypasses the IOMMU and gets to do direct physical DMA. Is that a feature of that specific peripheral? Do we hack its drivers to make the distinction between this incarnation, and a normal discrete version of the same device? No! It's a feature of the *system* and needs to be conveyed to the OS IOMMU code to do the right thing. Not to the driver. In my opinion, adding the VIRTIO_F_IOMMU_PLATFORM feature bit is absolutely the wrong thing to do. What we *should* do is a patchset in the guest which both fixes virtio drivers to *always* use the DMA API, and fixes the DMA API to DTRT at the same time — by detecting qemu and installing no-op DMA ops for the appropriate devices, perhaps. Then we can look at giving qemu a way to properly indicate which devices it actually does DMA mapping for, so we can remove those heuristic assumptions. But that flag does *not* live in the virtio host←→guest ABI. -- David WoodhouseOpen Source Technology Centre david.woodho...@intel.com Intel Corporation smime.p7s Description: S/MIME cryptographic signature ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
On Wed, Apr 27, 2016 at 01:18:21PM +0100, David Woodhouse wrote: > > > > On some systems, including Xen and any system with a physical device > > > that speaks virtio behind a physical IOMMU, we must use the DMA API > > > for virtio DMA to work at all. > > > > > > Add a feature bit to detect that: VIRTIO_F_IOMMU_PLATFORM. > > > > > > If not there, we preserve historic behavior and bypass the DMA > > > API unless within Xen guest. This is actually required for > > > systems, including SPARC and PPC64, where virtio-pci devices are > > > enumerated as though they are behind an IOMMU, but the virtio host > > > ignores the IOMMU, so we must either pretend that the IOMMU isn't > > > there or somehow map everything as the identity. > > > > > > Re: non-virtio devices. > > > > > > It turns out that on old QEMU hosts, only emulated devices which were > > > part of QEMU use the IOMMU. Should we want to bypass the IOMMU for such > > > devices *only*, it would be rather easy to detect them by looking at > > > subsystem vendor and device ID. Thus, no new interfaces are required > > > except for virtio which always uses the same subsystem vendor and device > > > ID. > > Apologies for dropping this thread; I've been travelling. > > But seriously, NO! > > I understand why you want to see this as a virtio-specific issue, but > it isn't. And we don't *want* it to be. > > In the guest, drivers SHALL use the DMA API. And the DMA API SHALL do > the right thing for each device according to its needs. > > So any information passed from qemu to the guest should be directed at > the platform IOMMU code (or handled by qemu-detection quirks in the > guest, if we must). > > It is *not* acceptable for the virtio drivers in the guest to just > eschew the DMA API completely, triggered by some device-specific flag. > > The qemu implementation is, of course, monolithic. 
In qemu the fact > that virtio doesn't get translated by the emulated IOMMU *is* actually > down to code in the virtio implementation. I get that. > > But then again, it's not just virtio. *Any* device which we emulate for > the guest could have that same issue, and appear as untranslated. (And > assigned PCI devices currently do). > > Let's think about the parallel with a system-on-chip. Let's say we have > a peripheral which got included, but which was wired up such that it > bypasses the IOMMU and gets to do direct physical DMA. Is that a > feature of that specific peripheral? Do we hack its drivers to make the > distinction between this incarnation, and a normal discrete version of > the same device? No! It's a feature of the *system* One correction: it's a feature of the device in the system. There could be a mix of devices bypassing and not bypassing the IOMMU. > and needs to be > conveyed to the OS IOMMU code to do the right thing. Not to the driver. > > In my opinion, adding the VIRTIO_F_IOMMU_PLATFORM feature bit is > absolutely the wrong thing to do. > > What we *should* do is a patchset in the guest which both fixes virtio > drivers to *always* use the DMA API, and fixes the DMA API to DTRT at > the same time — by detecting qemu and installing no-op DMA ops for the > appropriate devices, perhaps. Sounds good. And a way to detect appropriate devices could be by looking at the feature flag, perhaps? > Then we can look at giving qemu a way to properly indicate which > devices it actually does DMA mapping for, so we can remove those > heuristic assumptions. > > But that flag does *not* live in the virtio host←→guest ABI. > > -- > David WoodhouseOpen Source Technology Centre > david.woodho...@intel.com Intel Corporation > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
On Wed, Apr 27, 2016 at 04:23:32PM +0200, Joerg Roedel wrote: > On Wed, Apr 27, 2016 at 04:37:04PM +0300, Michael S. Tsirkin wrote: > > One correction: it's a feature of the device in the system. > > There could be a mix of devices bypassing and not > > bypassing the IOMMU. > > No, it really is not. A device can't chose to bypass the IOMMU. But the > IOMMU can chose to let the device bypass. QEMU can choose to bypass IOMMU for one device and not the other. IOMMU in QEMU isn't involved when it's bypassed. > So any fix here belongs > into the platform/iommu code too and not into some driver. Fine but this is beside the point. Almost all virtio devices bypass IOMMU and what this patch does is create a way to detect devices that don't. This code can maybe go into platform. > > Sounds good. And a way to detect appropriate devices could > > be by looking at the feature flag, perhaps? > > Again, no! The way to detect that is to look into the iommu description > structures provided by the firmware. They provide everything necessary > to tell the iommu code which devices are not translated. > > > > Joerg It would be easy if they did but they don't do this on all systems. In particular the idea for firmware interface was clearly that a given bus either is connected through IOMMU or bypassing it. Whether virtio bypasses the iommu is unrelated to the bus it's on. -- MST ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
On Wed, Apr 27, 2016 at 07:31:43AM -0700, Andy Lutomirski wrote: > On Wed, Apr 27, 2016 at 7:23 AM, Joerg Roedel wrote: > > On Wed, Apr 27, 2016 at 04:37:04PM +0300, Michael S. Tsirkin wrote: > >> One correction: it's a feature of the device in the system. > >> There could be a mix of devices bypassing and not > >> bypassing the IOMMU. > > > > No, it really is not. A device can't chose to bypass the IOMMU. But the > > IOMMU can chose to let the device bypass. So any fix here belongs > > into the platform/iommu code too and not into some driver. > > > >> Sounds good. And a way to detect appropriate devices could > >> be by looking at the feature flag, perhaps? > > > > Again, no! The way to detect that is to look into the iommu description > > structures provided by the firmware. They provide everything necessary > > to tell the iommu code which devices are not translated. > > > > Except on PPC and SPARC. As far as I know, those are the only > problematic platforms. > > Is it too late to *disable* QEMU's q35-iommu thingy until it can be > fixed to report correct data in the DMAR tables? > > --Andy Meaning virtio or assigned devices? For virtio - it's way too late since these are working configurations. For assigned devices - they don't work on x86 so it doesn't have to be disabled, it's safe to ignore. -- MST ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
On Wed, Apr 27, 2016 at 7:23 AM, Joerg Roedel wrote: > On Wed, Apr 27, 2016 at 04:37:04PM +0300, Michael S. Tsirkin wrote: >> One correction: it's a feature of the device in the system. >> There could be a mix of devices bypassing and not >> bypassing the IOMMU. > > No, it really is not. A device can't chose to bypass the IOMMU. But the > IOMMU can chose to let the device bypass. So any fix here belongs > into the platform/iommu code too and not into some driver. > >> Sounds good. And a way to detect appropriate devices could >> be by looking at the feature flag, perhaps? > > Again, no! The way to detect that is to look into the iommu description > structures provided by the firmware. They provide everything necessary > to tell the iommu code which devices are not translated. > Except on PPC and SPARC. As far as I know, those are the only problematic platforms. Is it too late to *disable* QEMU's q35-iommu thingy until it can be fixed to report correct data in the DMAR tables? --Andy ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
On Wed, Apr 27, 2016 at 7:38 AM, Michael S. Tsirkin wrote: > On Wed, Apr 27, 2016 at 07:31:43AM -0700, Andy Lutomirski wrote: >> On Wed, Apr 27, 2016 at 7:23 AM, Joerg Roedel wrote: >> > On Wed, Apr 27, 2016 at 04:37:04PM +0300, Michael S. Tsirkin wrote: >> >> One correction: it's a feature of the device in the system. >> >> There could be a mix of devices bypassing and not >> >> bypassing the IOMMU. >> > >> > No, it really is not. A device can't chose to bypass the IOMMU. But the >> > IOMMU can chose to let the device bypass. So any fix here belongs >> > into the platform/iommu code too and not into some driver. >> > >> >> Sounds good. And a way to detect appropriate devices could >> >> be by looking at the feature flag, perhaps? >> > >> > Again, no! The way to detect that is to look into the iommu description >> > structures provided by the firmware. They provide everything necessary >> > to tell the iommu code which devices are not translated. >> > >> >> Except on PPC and SPARC. As far as I know, those are the only >> problematic platforms. >> >> Is it too late to *disable* QEMU's q35-iommu thingy until it can be >> fixed to report correct data in the DMAR tables? >> >> --Andy > > Meaning virtio or assigned devices? > For virtio - it's way too late since these are working configurations. > For assigned devices - they don't work on x86 so it doesn't have > to be disabled, it's safe to ignore. I mean actually prevent QEMU from running in q35-iommu mode with any virtio devices attached or maybe even turn off q35-iommu mode entirely [1]. Doesn't it require that the user literally pass the word "experimental" into QEMU right now? It did at some point IIRC. The reason I'm asking is that, other than q35-iommu, QEMU's virtio devices *don't* bypass the IOMMU except on PPC and SPARC, simply because there is no other configuration AFAICT that has virtio and an IOMMU. 
So maybe the right solution is to fix q35-iommu to use DMAR correctly (thus breaking q35-iommu users with older guest kernels, which hopefully don't actually exist) and to come up with a PPC- and SPARC-specific solution, or maybe OpenFirmware-specific solution, to handle PPC and SPARC down the road. [1] I'm pretty sure I emailed the QEMU list before q35-iommu ever showed up in a release asking the QEMU team to please not do that until this issue was resolved. Sadly, that email was ignored :( --Andy ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
On Wed, Apr 27, 2016 at 07:43:07AM -0700, Andy Lutomirski wrote: > On Wed, Apr 27, 2016 at 7:38 AM, Michael S. Tsirkin wrote: > > On Wed, Apr 27, 2016 at 07:31:43AM -0700, Andy Lutomirski wrote: > >> On Wed, Apr 27, 2016 at 7:23 AM, Joerg Roedel wrote: > >> > On Wed, Apr 27, 2016 at 04:37:04PM +0300, Michael S. Tsirkin wrote: > >> >> One correction: it's a feature of the device in the system. > >> >> There could be a mix of devices bypassing and not > >> >> bypassing the IOMMU. > >> > > >> > No, it really is not. A device can't chose to bypass the IOMMU. But the > >> > IOMMU can chose to let the device bypass. So any fix here belongs > >> > into the platform/iommu code too and not into some driver. > >> > > >> >> Sounds good. And a way to detect appropriate devices could > >> >> be by looking at the feature flag, perhaps? > >> > > >> > Again, no! The way to detect that is to look into the iommu description > >> > structures provided by the firmware. They provide everything necessary > >> > to tell the iommu code which devices are not translated. > >> > > >> > >> Except on PPC and SPARC. As far as I know, those are the only > >> problematic platforms. > >> > >> Is it too late to *disable* QEMU's q35-iommu thingy until it can be > >> fixed to report correct data in the DMAR tables? > >> > >> --Andy > > > > Meaning virtio or assigned devices? > > For virtio - it's way too late since these are working configurations. > > For assigned devices - they don't work on x86 so it doesn't have > > to be disabled, it's safe to ignore. > > I mean actually prevent QEMU from running in q35-iommu mode with any > virtio devices attached or maybe even turn off q35-iommu mode entirely > [1]. Doesn't it require that the user literally pass the word > "experimental" into QEMU right now? It did at some point IIRC. 
> > The reason I'm asking is that, other than q35-iommu, QEMU's virtio > devices *don't* bypass the IOMMU except on PPC and SPARC, simply > because there is no other configuration AFAICT that has virtio and and > IOMMU. So maybe the right solution is to fix q35-iommu to use DMAR > correctly (thus breaking q35-iommu users with older guest kernels, > which hopefully don't actually exist) and to come up with a PPC- and > SPARC-specific solution, or maybe OpenFirmware-specific solution, to > handle PPC and SPARC down the road. > > [1] I'm pretty sure I emailed the QEMU list before q35-iommu ever > showed up in a release asking the QEMU team to please not do that > until this issue was resolved. Sadly, that email was ignored :( > > --Andy Sorry, I didn't make myself clear. Point is, QEMU is not the only virtio implementation out there. So we can't know no virtio implementations have an IOMMU as long as linux supports this IOMMU. virtio always used physical addresses since it was born and if it changes that it must do this in a way that does not break existing users. -- MST ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
On Wed, Apr 27, 2016 at 04:56:32PM +0200, Joerg Roedel wrote: > On Wed, Apr 27, 2016 at 05:34:30PM +0300, Michael S. Tsirkin wrote: > > On Wed, Apr 27, 2016 at 04:23:32PM +0200, Joerg Roedel wrote: > > > QEMU can choose to bypass IOMMU for one device and not the other. > > IOMMU in QEMU isn't involved when it's bypassed. > > And it is QEMU's task to tell the OS, right? And the correct way to do > this is via the firmware ACPI tables. Going forward, this might be reasonable. Of course it didn't in the past and no one cared because virtio devices used physical addresses. We have to keep these setups working. > > Fine but this is beside the point. Almost all virtio devices > > bypass IOMMU and what this patch does is create a way > > to detect devices that don't. This code can maybe go into > > platform. > > Again, the way to detect this is in platform code must not be device > specific. This is what the DMAR and IVRS tables on x86 are for. > > When there is no way to do this in the firmware (or there is no firmware > at all), we have to do a quirk in the platform code for it. > > > > Joerg I really don't get it. There's exactly one device that works now and needs the work-around and so that we need to support, and that is virtio. It happens to have exactly the same issue on all platforms. Why would we want to work hard to build platform-specific solutions to a problem that can be solved in 5 lines of generic code? -- MST ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
On Wed, Apr 27, 2016 at 04:58:51PM +0200, Joerg Roedel wrote: > On Wed, Apr 27, 2016 at 05:54:57PM +0300, Michael S. Tsirkin wrote: > > Point is, QEMU is not the only virtio implementation out there. > > So we can't know no virtio implementations have an IOMMU as long as > > linux supports this IOMMU. > > virtio always used physical addresses since it was born and if it > > changes that it must do this in a way that does not break existing > > users. > > FWIW, virtio in qemu can continue to just use physical addresses. But > qemu needs to advertise that fact correctly to the OS in the DMAR table. > This way old kernels (where virtio does not use DMA-API) will also > continue to work on the fixed qemu. > > > > Joerg It's not clear it can do this since DMAR tables seem to assume a given slot is either bypassing IOMMU or going through it. QEMU allows reusing same slot for virtio and non-virtio devices. Besides, this patch is not about it, it's a flag for QEMU to tell guest that it can trust DMAR tables. -- MST ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
On Wed, Apr 27, 2016 at 7:54 AM, Michael S. Tsirkin wrote: > On Wed, Apr 27, 2016 at 07:43:07AM -0700, Andy Lutomirski wrote: >> On Wed, Apr 27, 2016 at 7:38 AM, Michael S. Tsirkin wrote: >> > On Wed, Apr 27, 2016 at 07:31:43AM -0700, Andy Lutomirski wrote: >> >> On Wed, Apr 27, 2016 at 7:23 AM, Joerg Roedel wrote: >> >> > On Wed, Apr 27, 2016 at 04:37:04PM +0300, Michael S. Tsirkin wrote: >> >> >> One correction: it's a feature of the device in the system. >> >> >> There could be a mix of devices bypassing and not >> >> >> bypassing the IOMMU. >> >> > >> >> > No, it really is not. A device can't chose to bypass the IOMMU. But the >> >> > IOMMU can chose to let the device bypass. So any fix here belongs >> >> > into the platform/iommu code too and not into some driver. >> >> > >> >> >> Sounds good. And a way to detect appropriate devices could >> >> >> be by looking at the feature flag, perhaps? >> >> > >> >> > Again, no! The way to detect that is to look into the iommu description >> >> > structures provided by the firmware. They provide everything necessary >> >> > to tell the iommu code which devices are not translated. >> >> > >> >> >> >> Except on PPC and SPARC. As far as I know, those are the only >> >> problematic platforms. >> >> >> >> Is it too late to *disable* QEMU's q35-iommu thingy until it can be >> >> fixed to report correct data in the DMAR tables? >> >> >> >> --Andy >> > >> > Meaning virtio or assigned devices? >> > For virtio - it's way too late since these are working configurations. >> > For assigned devices - they don't work on x86 so it doesn't have >> > to be disabled, it's safe to ignore. >> >> I mean actually prevent QEMU from running in q35-iommu mode with any >> virtio devices attached or maybe even turn off q35-iommu mode entirely >> [1]. Doesn't it require that the user literally pass the word >> "experimental" into QEMU right now? It did at some point IIRC. 
>> >> The reason I'm asking is that, other than q35-iommu, QEMU's virtio >> devices *don't* bypass the IOMMU except on PPC and SPARC, simply >> because there is no other configuration AFAICT that has virtio and and >> IOMMU. So maybe the right solution is to fix q35-iommu to use DMAR >> correctly (thus breaking q35-iommu users with older guest kernels, >> which hopefully don't actually exist) and to come up with a PPC- and >> SPARC-specific solution, or maybe OpenFirmware-specific solution, to >> handle PPC and SPARC down the road. >> >> [1] I'm pretty sure I emailed the QEMU list before q35-iommu ever >> showed up in a release asking the QEMU team to please not do that >> until this issue was resolved. Sadly, that email was ignored :( >> >> --Andy > > Sorry, I didn't make myself clear. > Point is, QEMU is not the only virtio implementation out there. > So we can't know no virtio implementations have an IOMMU as long as > linux supports this IOMMU. > virtio always used physical addresses since it was born and if it > changes that it must do this in a way that does not break existing > users. Is there any non-QEMU virtio implementation can provide an IOMMU-bypassing virtio device on a platform that has a nontrivial IOMMU? --Andy ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
On Wed, 2016-04-27 at 18:05 +0300, Michael S. Tsirkin wrote: > > I really don't get it. > > There's exactly one device that works now and needs the work-around and > so that we need to support, and that is virtio. It happens to have > exactly the same issue on all platforms. False. We have other devices which are currently *not* translated by the emulated IOMMU and which aren't going to be in the short term either. We also have other devices (emulated hardware NICs) to which precisely the same "we don't need protection" arguments apply, and which we *could* expose to the guest without an IOMMU translation if we really wanted to. It makes as much sense as exposing virtio without an IOMMU, going forward. > Why would we want to work hard to build platform-specific > solutions to a problem that can be solved in 5 lines of > generic code? Because it's a dirty hack in the *wrong* place. -- dwmw2 smime.p7s Description: S/MIME cryptographic signature ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
On Wed, Apr 27, 2016 at 04:15:35PM +0100, David Woodhouse wrote: > On Wed, 2016-04-27 at 18:05 +0300, Michael S. Tsirkin wrote: > > > > I really don't get it. > > > > There's exactly one device that works now and needs the work-around and > > so that we need to support, and that is virtio. It happens to have > > exactly the same issue on all platforms. > > False. We have other devices which are currently *not* translated by > the emulated IOMMU and which aren't going to be in the short term > either. > > We also have other devices (emulated hardware NICs) to which precisely > the same "we don't need protection" arguments apply, and which we > *could* expose to the guest without an IOMMU translation if we really > wanted to. It makes as much sense as exposing virtio without an IOMMU, > going forward. The reasons for virtio are mostly dealing legacy. We don't need protection is a separate issue that I'd rather drop for now. > > Why would we want to work hard to build platform-specific > > solutions to a problem that can be solved in 5 lines of > > generic code? > > Because it's a dirty hack in the *wrong* place. No one came up with a better one so far :( > -- > dwmw2 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 RFC] fixup! virtio: convert to use DMA api
On Wed, 2016-04-27 at 21:17 +0300, Michael S. Tsirkin wrote: > > > Because it's a dirty hack in the *wrong* place. > > No one came up with a better one so far :( Seriously? Take a look at drivers/iommu/intel-iommu.c. It has quirks for all kinds of shitty devices that have to be put in passthrough mode or otherwise excluded. We don't actually *need* it for the Intel IOMMU; all we need is for QEMU to stop lying in its DMAR tables. We *do* want the same kind of quirks in the relevant POWER and ARM IOMMU code in the kernel. Do that (hell, a simple quirk for all virtio devices will suffice, but NOT in the virtio driver) at the same moment you fix the virtio devices to use the DMA API. Job done. Some time *later* we can work on *refining* that quirk, and a way for QEMU to tell the guest (via something generic like fwcfg, maybe) that some devices are and aren't translated. Actually, I'm about to look at moving dma_ops into struct device and cleaning up the way we detect which IOMMU is attached, at device instantiation time. Perhaps I can shove the virtio-exception quirk in there while I'm at it... -- dwmw2 smime.p7s Description: S/MIME cryptographic signature ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v4 00/13] Support non-lru page migration
On Wed, 27 Apr 2016 16:48:13 +0900 Minchan Kim wrote: > Recently, I got many reports about perfermance degradation in embedded > system(Android mobile phone, webOS TV and so on) and easy fork fail. > > The problem was fragmentation caused by zram and GPU driver mainly. > With memory pressure, their pages were spread out all of pageblock and > it cannot be migrated with current compaction algorithm which supports > only LRU pages. In the end, compaction cannot work well so reclaimer > shrinks all of working set pages. It made system very slow and even to > fail to fork easily which requires order-[2 or 3] allocations. > > Other pain point is that they cannot use CMA memory space so when OOM > kill happens, I can see many free pages in CMA area, which is not > memory efficient. In our product which has big CMA memory, it reclaims > zones too exccessively to allocate GPU and zram page although there are > lots of free space in CMA so system becomes very slow easily. > > To solve these problem, this patch tries to add facility to migrate > non-lru pages via introducing new functions and page flags to help > migration. I'm seeing some rejects here against Mel's changes and our patch bandwidth is getting waaay way ahead of our review bandwidth. So I think I'll loadshed this patchset at this time, sorry. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v4 00/13] Support non-lru page migration
Hello Andrew, On Wed, Apr 27, 2016 at 01:20:35PM -0700, Andrew Morton wrote: > On Wed, 27 Apr 2016 16:48:13 +0900 Minchan Kim wrote: > > > Recently, I got many reports about perfermance degradation in embedded > > system(Android mobile phone, webOS TV and so on) and easy fork fail. > > > > The problem was fragmentation caused by zram and GPU driver mainly. > > With memory pressure, their pages were spread out all of pageblock and > > it cannot be migrated with current compaction algorithm which supports > > only LRU pages. In the end, compaction cannot work well so reclaimer > > shrinks all of working set pages. It made system very slow and even to > > fail to fork easily which requires order-[2 or 3] allocations. > > > > Other pain point is that they cannot use CMA memory space so when OOM > > kill happens, I can see many free pages in CMA area, which is not > > memory efficient. In our product which has big CMA memory, it reclaims > > zones too exccessively to allocate GPU and zram page although there are > > lots of free space in CMA so system becomes very slow easily. > > > > To solve these problem, this patch tries to add facility to migrate > > non-lru pages via introducing new functions and page flags to help > > migration. > > I'm seeing some rejects here against Mel's changes and our patch > bandwidth is getting waaay way ahead of our review bandwidth. So I > think I'll loadshed this patchset at this time, sorry. I expected the conflict with Mel's change in recent mmotm but didn't want to send patches against recent mmotm because it has several problems in compaction so my testing was really troublesome. I just picked patches from Hugh and Vlastimil and finally can test on it. Anyway, I will rebase my patches on recent mmotm, hoping you picked every patch on the compaction part, and respin after a few days. Thanks for letting me know your plan. 
___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] vhost_net: stop polling socket during rx processing
On 04/27/2016 07:28 PM, Michael S. Tsirkin wrote: > On Tue, Apr 26, 2016 at 03:35:53AM -0400, Jason Wang wrote: >> We don't stop polling socket during rx processing, this will lead >> unnecessary wakeups from under layer net devices (E.g >> sock_def_readable() form tun). Rx will be slowed down in this >> way. This patch avoids this by stop polling socket during rx >> processing. A small drawback is that this introduces some overheads in >> light load case because of the extra start/stop polling, but single >> netperf TCP_RR does not notice any change. In a super heavy load case, >> e.g using pktgen to inject packet to guest, we get about ~17% >> improvement on pps: >> >> before: ~137 pkt/s >> after: ~150 pkt/s >> >> Signed-off-by: Jason Wang > Acked-by: Michael S. Tsirkin > > There is one other possible enhancement: we actually have the wait queue > lock taken in _wake_up, but we give it up only to take it again in the > handler. > > It would be nicer to just remove the entry when we wake > the vhost thread. Re-add it if required. > I think that something like the below would give you the necessary API. > Pls feel free to use it if you are going to implement a patch on top > doing this - that's not a reason not to include this simple patch > though. Thanks, this looks useful, will give it a try. > > ---> > > wait: add API to drop a wait_queue_t entry from wake up handler > > A wake up handler might want to remove its own wait queue entry to avoid > future wakeups. In particular, vhost has such a need. As wait queue > lock is already taken, all we need is an API to remove the entry without > wait_queue_head_t which isn't currently accessible to wake up handlers. > > Signed-off-by: Michael S. 
Tsirkin > > --- > > diff --git a/include/linux/wait.h b/include/linux/wait.h > index 27d7a0a..9c6604b 100644 > --- a/include/linux/wait.h > +++ b/include/linux/wait.h > @@ -191,11 +191,17 @@ __add_wait_queue_tail_exclusive(wait_queue_head_t *q, > wait_queue_t *wait) > } > > static inline void > -__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) > +__remove_wait_queue_entry(wait_queue_t *old) > { > list_del(&old->task_list); > } > > +static inline void > +__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) > +{ > + __remove_wait_queue_entry(old); > +} > + > typedef int wait_bit_action_f(struct wait_bit_key *, int mode); > void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); > void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void > *key); ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [RFC PATCH V2 1/2] vhost: convert pre sorted vhost memory array to interval tree
On 04/27/2016 07:30 PM, Michael S. Tsirkin wrote: > On Fri, Mar 25, 2016 at 10:34:33AM +0800, Jason Wang wrote: >> > Current pre-sorted memory region array has some limitations for future >> > device IOTLB conversion: >> > >> > 1) need extra work for adding and removing a single region, and it's >> >expected to be slow because of sorting or memory re-allocation. >> > 2) need extra work of removing a large range which may intersect >> >several regions with different size. >> > 3) need trick for a replacement policy like LRU >> > >> > To overcome the above shortcomings, this patch convert it to interval >> > tree which can easily address the above issue with almost no extra >> > work. >> > >> > The patch could be used for: >> > >> > - Extend the current API and only let the userspace to send diffs of >> > memory table. >> > - Simplify Device IOTLB implementation. > Does this affect performance at all? > In pktgen test, no difference. Thanks ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [RFC PATCH V2 2/2] vhost: device IOTLB API
On 04/27/2016 07:45 PM, Michael S. Tsirkin wrote: > On Fri, Mar 25, 2016 at 10:34:34AM +0800, Jason Wang wrote: >> This patch tries to implement an device IOTLB for vhost. This could be >> used with for co-operation with userspace(qemu) implementation of DMA >> remapping. >> >> The idea is simple. When vhost meets an IOTLB miss, it will request >> the assistance of userspace to do the translation, this is done >> through: >> >> - Fill the translation request in a preset userspace address (This >> address is set through ioctl VHOST_SET_IOTLB_REQUEST_ENTRY). >> - Notify userspace through eventfd (This eventfd was set through ioctl >> VHOST_SET_IOTLB_FD). > Why use an eventfd for this? The aim is to implement the API all through ioctls. > We use them for interrupts because > that happens to be what kvm wants, but here - why don't we > just add a generic support for reading out events > on the vhost fd itself? I've considered this approach, but what's the advantages of this? I mean looks like all other ioctls could be done through vhost fd reading/writing too. > >> - device IOTLB were started and stopped through VHOST_RUN_IOTLB ioctl >> >> When userspace finishes the translation, it will update the vhost >> IOTLB through VHOST_UPDATE_IOTLB ioctl. Userspace is also in charge of >> snooping the IOTLB invalidation of IOMMU IOTLB and use >> VHOST_UPDATE_IOTLB to invalidate the possible entry in vhost. > There's one problem here, and that is that VQs still do not undergo > translation. In theory VQ could be mapped in such a way > that it's not contigious in userspace memory. I'm not sure I get the issue, current vhost API support setting desc_user_addr, used_user_addr and avail_user_addr independently. So looks ok? If not, looks not a problem to device IOTLB API itself. > > >> Signed-off-by: Jason Wang > What limits amount of entries that kernel keeps around? It depends on guest working set I think. 
Looking at http://dpdk.org/doc/guides/linux_gsg/sys_reqs.html: - For 2MB page size in guest, it suggests hugepages=1024 - For 1GB page size, it suggests a hugepages=4 So I choose 2048 to make sure it can cover this. > Do we want at least a mod parameter for this? Maybe. > >> --- >> drivers/vhost/net.c| 6 +- >> drivers/vhost/vhost.c | 301 >> +++-- >> drivers/vhost/vhost.h | 17 ++- >> fs/eventfd.c | 3 +- >> include/uapi/linux/vhost.h | 35 ++ >> 5 files changed, 320 insertions(+), 42 deletions(-) >> [...] >> +struct vhost_iotlb_entry { >> +__u64 iova; >> +__u64 size; >> +__u64 userspace_addr; > Alignment requirements? The API does not require any alignment. Will add a comment for this. > >> +struct { >> +#define VHOST_ACCESS_RO 0x1 >> +#define VHOST_ACCESS_WO 0x2 >> +#define VHOST_ACCESS_RW 0x3 >> +__u8 perm; >> +#define VHOST_IOTLB_MISS 1 >> +#define VHOST_IOTLB_UPDATE 2 >> +#define VHOST_IOTLB_INVALIDATE 3 >> +__u8 type; >> +#define VHOST_IOTLB_INVALID0x1 >> +#define VHOST_IOTLB_VALID 0x2 >> +__u8 valid; > why do we need this flag? Useless, will remove. > >> +__u8 u8_padding; >> +__u32 padding; >> +} flags; >> +}; >> + >> +struct vhost_vring_iotlb_entry { >> +unsigned int index; >> +__u64 userspace_addr; >> +}; >> + >> struct vhost_memory_region { >> __u64 guest_phys_addr; >> __u64 memory_size; /* bytes */ >> @@ -127,6 +153,15 @@ struct vhost_memory { >> /* Set eventfd to signal an error */ >> #define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct >> vhost_vring_file) >> >> +/* IOTLB */ >> +/* Specify an eventfd file descriptor to signle on IOTLB miss */ > typo Will fix it. 
> >> +#define VHOST_SET_VRING_IOTLB_CALL _IOW(VHOST_VIRTIO, 0x23, struct \ >> +vhost_vring_file) >> +#define VHOST_SET_VRING_IOTLB_REQUEST _IOW(VHOST_VIRTIO, 0x25, struct \ >> + vhost_vring_iotlb_entry) >> +#define VHOST_UPDATE_IOTLB _IOW(VHOST_VIRTIO, 0x24, struct >> vhost_iotlb_entry) >> +#define VHOST_RUN_IOTLB _IOW(VHOST_VIRTIO, 0x26, int) >> + > Is the assumption that userspace must dedicate a thread to running the iotlb? > I dislike this one. > Please support asynchronous APIs at least optionally to make > userspace make its own threading decisions. Nope, my qemu patches do not use a dedicated thread. This API is used to start or stop DMAR according to e.g. whether the guest enables DMAR for the Intel IOMMU. > >> /* VHOST_NET specific defines */ >> >> /* Attach virtio net ring to a raw socket, or tap device. > Don't we need a feature bit for this? Yes we need it. The feature bit is not considered in this patch and looks like it was still under discussion. After we finalize it, I will add. > Are we short on feature bits? If yes maybe it's time to add > something like PROTOCOL_FE