A couple of numbers...
Remapping a physical page took about 25-50% less time than copying
1500 bytes with memcpy().
Just after a reboot, i.e. with nothing in the cache, remapping was
up to 15 times faster.
CPU is Xeon with HT enabled:
cpu family : 15
model : 2
model name : Intel(R) Xeon(TM) CPU 2.40GHz
stepping : 7
cpu MHz : 800.384
1.
packet_mmap_test: 1000 remaps took 1495 usec.
packet_mmap_test: 1000 copyings took 1988 usec.
2.
packet_mmap_test: 1000 remaps took 1406 usec.
packet_mmap_test: 1000 copyings took 2613 usec.
3. And just after reboot, when there is nothing in cache:
packet_mmap_test: 1000 remaps took 1387 usec.
packet_mmap_test: 1000 copyings took 20173 usec.
4. Yet another "just after reboot":
packet_mmap_test: 1000 remaps took 1295 usec.
packet_mmap_test: 1000 copyings took 14889 usec.
The copying above is done using an arbitrary kernel virtual address
as the source address, which is advanced by PAGE_SIZE before each
memcpy().
On Thu, Jul 28, 2005 at 12:44:41PM +0400, Evgeniy Polyakov ([EMAIL PROTECTED])
wrote:
> Hello, developers.
>
> This cruft now works much better.
> Unfortunately I needed to add some scary PTE insults - you can find them in
> update_address().
> One big nitpick is that this module cannot be unloaded if the application
> does not close the socket - the socket is removed after the mapping is
> destroyed, so I need to grab an MM reference, but cannot drop it.
> It also uses flush_tlb() all over the place, because it is the only macro
> that can be used in modules - tlb_flush_page() and tlb_flush_one() are not
> exported. There is also a race on startup, when only one page (the control
> page) is mapped, but the (very simple) userspace may want to access
> data pages.
> The control page contains a set of control structures, one per mapped page,
> i.e. per mapped skb; each control structure holds the offset of skb->mac.raw
> in the page and a flags field.
>
> I would gladly listen to your comments.
> Thanks.
>
> Included files:
> af_tlb.[ch] - zero-copy sniffer implementation.
> tlb_test.c - simple userspace sniffer.
>
> af_tlb.c
> /*
> * af_tlb.c
> *
> * 2005 Copyright (c) Evgeniy Polyakov <[EMAIL PROTECTED]>
> * All rights reserved.
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> * the Free Software Foundation; either version 2 of the License, or
> * (at your option) any later version.
> *
> * This program is distributed in the hope that it will be useful,
> * but WITHOUT ANY WARRANTY; without even the implied warranty of
> * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> * GNU General Public License for more details.
> *
> * You should have received a copy of the GNU General Public License
> * along with this program; if not, write to the Free Software
> * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> */
>
>
> #include <linux/config.h>
> #include <linux/types.h>
> #include <linux/sched.h>
> #include <linux/mm.h>
> #include <linux/fcntl.h>
> #include <linux/socket.h>
> #include <linux/in.h>
> #include <linux/inet.h>
> #include <linux/netdevice.h>
> #include <linux/if_packet.h>
> #include <linux/wireless.h>
> #include <linux/kmod.h>
> #include <net/ip.h>
> #include <net/protocol.h>
> #include <linux/skbuff.h>
> #include <net/sock.h>
> #include <linux/errno.h>
> #include <linux/timer.h>
> #include <linux/module.h>
> #include <linux/moduleparam.h>
> #include <linux/init.h>
> #include <linux/workqueue.h>
>
> #include <linux/mempolicy.h>
> #include <linux/rmap.h>
> #include <linux/fs.h>
> #include <linux/shm.h>
> #include <linux/mm.h>
> #include <linux/mman.h>
> #include <linux/pagemap.h>
> #include <linux/swap.h>
> #include <linux/hugetlb.h>
> #include <linux/mman.h>
> #include <linux/slab.h>
> #include <linux/swapops.h>
>
> #include <asm/io.h>
> #include <asm/uaccess.h>
> #include <asm/tlb.h>
> #include <asm/tlbflush.h>
> #include <asm/pgtable.h>
> #include <asm/pgalloc.h>
> #include <asm/uaccess.h>
> #include <asm/cacheflush.h>
>
> #include "af_tlb.h"
>
> /* Interval in milliseconds (converted with msecs_to_jiffies()) used to arm po->next_free. */
> static unsigned int free_timeout = 10;
> module_param(free_timeout, uint, 0);
>
> /* One module-wide work item; packet_mmap() re-initialises it with the mapped socket as its data. */
> static void test_timer_func(void *data);
> static DECLARE_WORK(w, test_timer_func, NULL);
>
> /* Defined further down; used by test_timer_func(). */
> static void packet_free_skbs(struct packet_sock *po, int clear_last);
>
> /* View a generic socket pointer as the packet_sock it is the first member of. */
> static inline struct packet_sock *pkt_sk(struct sock *sk)
> {
> 	struct packet_sock *po = (struct packet_sock *)sk;
>
> 	return po;
> }
>
> /*
>  * sk_destruct callback: traps on leftover receive/send memory accounting and
>  * logs an attempt to destroy a socket that is not marked SOCK_DEAD.
>  */
> static void packet_sock_destruct(struct sock *sk)
> {
> 	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
> 	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
>
> 	if (sock_flag(sk, SOCK_DEAD))
> 		return;
>
> 	printk("Attempt to release alive packet socket: %p\n", sk);
> }
>
>
> /* Forward declaration: packet_create() installs this table, which is defined after the handlers. */
> static struct proto_ops packet_ops_spkt;
>
> /* Debug helper: log the sharing state, length and Ethernet addressing of @skb. */
> static void dump_skb(struct sk_buff *skb)
> {
> 	struct ethhdr *hdr;
> 	int n = 0;
>
> 	printk("shared=%d, cloned=%d, len=%4d: ", skb_shared(skb),
> 			skb_cloned(skb), skb->len);
>
> 	hdr = eth_hdr(skb);
> 	printk("MAC: proto=%04x, src=", hdr->h_proto);
>
> 	/* All but the last source byte are followed by a colon. */
> 	while (n < ETH_ALEN - 1)
> 		printk("%02x:", hdr->h_source[n++]);
> 	printk("%02x, dst=", hdr->h_source[n]);
>
> 	for (n = 0; n < ETH_ALEN - 1; n++)
> 		printk("%02x:", hdr->h_dest[n]);
> 	printk("%02x.\n", hdr->h_dest[n]);
> }
>
> /*
>  * Receive hook installed through po->prot_hook. Counts the frame, pushes the
>  * link-level header back, queues the skb on the socket receive queue and, if
>  * the socket is mapped, schedules the remapping work. Always returns 0.
>  */
> static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
> 		struct packet_type *pt)
> {
> 	struct sock *sk = pt->af_packet_priv;
> 	struct packet_sock *po = pkt_sk(sk);
> 	struct sockaddr_pkt *spkt;
> 	int err;
>
> 	po->total++;
>
> 	/* Looped-back frames are dropped right away. */
> 	if (skb->pkt_type == PACKET_LOOPBACK) {
> 		kfree_skb(skb);
> 		return 0;
> 	}
>
> 	skb = skb_share_check(skb, GFP_ATOMIC);
> 	if (skb == NULL)
> 		return 0;
>
> 	/* drop any routing info */
> 	dst_release(skb->dst);
> 	skb->dst = NULL;
>
> 	/*
> 	 * Yank back the headers (this relies on the device having set
> 	 * skb->mac.raw right). Incoming packets have the ll header pulled,
> 	 * so push it back; for outgoing ones skb->data == skb->mac.raw and
> 	 * the push is a noop.
> 	 */
> 	skb_push(skb, skb->data-skb->mac.raw);
>
> 	/* The SOCK_PACKET socket receives _all_ frames. */
> 	spkt = (struct sockaddr_pkt *)skb->cb;
> 	spkt->spkt_family = dev->type;
> 	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
> 	spkt->spkt_protocol = skb->protocol;
>
> 	err = sock_queue_rcv_skb(sk, skb);
> 	if (err)
> 		po->dropped++;
> 	else
> 		po->queued++;
>
> 	if (test_bit(PACKET_SOCKET_MAPPED, &po->flags))
> 		schedule_work(&w);
>
> 	/* An skb that could not be queued is freed here. */
> 	if (err)
> 		kfree_skb(skb);
>
> 	return 0;
> }
>
>
> /*
>  * Close a PACKET socket: remove the protocol hook, orphan the socket, wait
>  * for the remapping work, purge both skb queues and free the control page.
>  *
>  * @sock: socket being closed; nothing is done when sock->sk is NULL.
>  *
>  * Returns 0 always.
>  *
>  * The mm pointer is sampled before sock_put(), because that put may drop the
>  * last reference and free the memory @po points into.
>  * NOTE(review): po->tsk is a raw task pointer saved at creation time -
>  * confirm the task cannot go away before the socket is released.
>  */
> static int packet_release(struct socket *sock)
> {
> 	struct sock *sk = sock->sk;
> 	struct packet_sock *po;
> 	struct mm_struct *mm;
>
> 	if (!sk)
> 		return 0;
>
> 	po = pkt_sk(sk);
>
> 	sk_del_node_init(sk);
>
> 	if (test_bit(PACKET_SOCKET_RUNNING, &po->flags)) {
> 		dev_remove_pack(&po->prot_hook);
> 		clear_bit(PACKET_SOCKET_RUNNING, &po->flags);
> 		__sock_put(sk);
> 	}
>
> 	sock_orphan(sk);
> 	sock->sk = NULL;
>
> 	printk("%s: Waiting to workqueue.\n", __func__);
>
> 	cancel_delayed_work(&w);
> 	flush_scheduled_work();
>
> 	skb_queue_purge(&sk->sk_receive_queue);
> 	skb_queue_purge(&po->sk_free_queue);
>
> 	printk("%s: releasing page.\n", __func__);
>
> 	free_page(po->page);
>
> 	/* Last access through po: fetch what is still needed before the put. */
> 	mm = po->tsk->mm;
> 	sock_put(sk);
>
> 	/* The task may no longer have an mm; do not hand NULL to mmput(). */
> 	if (mm)
> 		mmput(mm);
>
> 	return 0;
> }
>
> /*
>  * Attach a packet hook: detach a hook that is already running, record the
>  * new protocol and device, then register the hook again unless protocol is 0
>  * or the device is down. A down device is reported through sk->sk_err.
>  * Always returns 0.
>  */
> static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
> {
> 	struct packet_sock *po = pkt_sk(sk);
>
> 	lock_sock(sk);
> 	spin_lock(&po->bind_lock);
>
> 	if (test_bit(PACKET_SOCKET_RUNNING, &po->flags)) {
> 		__sock_put(sk);
> 		clear_bit(PACKET_SOCKET_RUNNING, &po->flags);
> 		po->num = 0;
>
> 		/* The hook is removed with bind_lock released. */
> 		spin_unlock(&po->bind_lock);
> 		dev_remove_pack(&po->prot_hook);
> 		spin_lock(&po->bind_lock);
> 	}
>
> 	po->num = protocol;
> 	po->prot_hook.type = protocol;
> 	po->prot_hook.dev = dev;
>
> 	if (dev)
> 		po->ifindex = dev->ifindex;
> 	else
> 		po->ifindex = 0;
>
> 	if (protocol != 0) {
> 		/* No device at all, or a device that is up: register the hook. */
> 		if (!dev || (dev->flags & IFF_UP)) {
> 			dev_add_pack(&po->prot_hook);
> 			sock_hold(sk);
> 			set_bit(PACKET_SOCKET_RUNNING, &po->flags);
> 		} else {
> 			sk->sk_err = ENETDOWN;
> 			if (!sock_flag(sk, SOCK_DEAD))
> 				sk->sk_error_report(sk);
> 		}
> 	}
>
> 	spin_unlock(&po->bind_lock);
> 	release_sock(sk);
> 	return 0;
> }
>
> /*
>  * Bind the socket to the device named in uaddr->sa_data.
>  *
>  * @sock:     socket to bind.
>  * @uaddr:    generic sockaddr whose sa_data carries the interface name.
>  * @addr_len: must equal sizeof(struct sockaddr).
>  *
>  * Returns -EINVAL for a wrong address length, -ENODEV when no device of that
>  * name is found, otherwise the result of packet_do_bind().
>  */
> static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
> {
> 	struct sock *sk = sock->sk;
> 	char name[sizeof(uaddr->sa_data) + 1];
> 	struct net_device *dev;
> 	int err = -ENODEV;
>
> 	/* Validate the length before looking at the address bytes. */
> 	if (addr_len != sizeof(struct sockaddr))
> 		return -EINVAL;
>
> 	/*
> 	 * sa_data need not be NUL-terminated and strlcpy() scans its source
> 	 * up to the first NUL, so copy the fixed-size field and terminate
> 	 * the copy explicitly.
> 	 */
> 	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
> 	name[sizeof(uaddr->sa_data)] = '\0';
> 	printk( "%s: name=%s.\n", __func__, name);
>
> 	dev = dev_get_by_name(name);
> 	if (dev) {
> 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
> 		dev_put(dev);
> 	}
> 	return err;
> }
>
> /* Every ioctl command is handed straight to dev_ioctl(). */
> static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
> {
> 	void __user *uarg = (void __user *)arg;
>
> 	return dev_ioctl(cmd, uarg);
> }
>
> /* Protocol descriptor passed to sk_alloc() and proto_register(); obj_size is the size of struct packet_sock. */
> static struct proto packet_proto = {
> .name = "PACKET",
> .owner = THIS_MODULE,
> .obj_size = sizeof(struct packet_sock),
> };
>
> /*
>  * Set up the packet_sock specific part of a new socket: counters, free queue,
>  * bind lock, owning task, the control page and the receive hook.
>  * Returns 0 on success or -ENOMEM when the control page cannot be allocated.
>  */
> static int packet_sock_init(struct packet_sock *po, int protocol, struct sock *sk)
> {
> 	unsigned long ctl_page;
>
> 	skb_queue_head_init(&po->sk_free_queue);
> 	spin_lock_init(&po->bind_lock);
>
> 	po->flags = 0;
> 	po->last = 0;
> 	po->budget = 1;
> 	po->total = po->dropped = po->queued = 0;
> 	po->next_free = jiffies + msecs_to_jiffies(free_timeout);
> 	po->tsk = current;
>
> 	ctl_page = __get_free_page(GFP_KERNEL);
> 	po->page = ctl_page;
> 	if (!ctl_page)
> 		return -ENOMEM;
>
> 	/* The control page starts out all zeroes. */
> 	memset((void *)ctl_page, 0, PAGE_SIZE);
>
> 	po->num = protocol;
> 	po->prot_hook.func = packet_rcv_spkt;
> 	po->prot_hook.af_packet_priv = sk;
>
> 	/* NOTE(review): the value returned by get_task_mm() is discarded here. */
> 	get_task_mm(po->tsk);
>
> 	return 0;
> }
>
> /*
>  * Create a socket of this family. Requires CAP_NET_RAW and a socket type of
>  * SOCK_DGRAM, SOCK_RAW or SOCK_PACKET. A non-zero protocol registers the
>  * receive hook immediately.
>  * Returns 0, -EPERM, -ESOCKTNOSUPPORT, -ENOBUFS or the packet_sock_init() error.
>  */
> static int packet_create(struct socket *sock, int protocol)
> {
> 	struct packet_sock *po;
> 	struct sock *sk;
> 	int err;
>
> 	if (!capable(CAP_NET_RAW))
> 		return -EPERM;
>
> 	switch (sock->type) {
> 	case SOCK_DGRAM:
> 	case SOCK_RAW:
> 	case SOCK_PACKET:
> 		break;
> 	default:
> 		return -ESOCKTNOSUPPORT;
> 	}
>
> 	sock->state = SS_UNCONNECTED;
>
> 	sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
> 	if (!sk)
> 		return -ENOBUFS;
>
> 	sock->ops = &packet_ops_spkt;
> 	sock_init_data(sock, sk);
>
> 	sk->sk_family = PF_PACKET;
> 	sk->sk_destruct = packet_sock_destruct;
>
> 	po = pkt_sk(sk);
> 	err = packet_sock_init(po, protocol, sk);
> 	if (err) {
> 		sk_free(sk);
> 		return err;
> 	}
>
> 	if (protocol) {
> 		po->prot_hook.type = protocol;
> 		dev_add_pack(&po->prot_hook);
> 		sock_hold(sk);
> 		set_bit(PACKET_SOCKET_RUNNING, &po->flags);
> 	}
>
> 	return 0;
> }
>
> /*
>  * Scan the first po->budget control slots of the control page for one whose
>  * offset equals the in-page offset of skb->mac.raw.
>  * Returns the first matching slot, or NULL when none matches.
>  */
> static struct packet_shared *packet_find_shared_lazy(struct packet_sock *po,
> 		struct sk_buff *skb)
> {
> 	struct packet_shared *slots = (struct packet_shared *)po->page;
> 	u16 wanted = offset_in_page(skb->mac.raw);
> 	int idx;
>
> 	for (idx = 0; idx < po->budget; idx++) {
> 		if (slots[idx].offset == wanted)
> 			return &slots[idx];
> 	}
>
> 	return NULL;
> }
>
> /*
>  * Release skbs parked on po->sk_free_queue.
>  *
>  * @po:         socket whose free queue is drained.
>  * @clear_last: number of skbs to release unconditionally; beyond that, skbs
>  *              are released only while more than po->budget are parked.
>  *
>  * For every released skb the control slot with a matching offset (if any)
>  * loses its PACKET_MAPPED bit, one reference on the page holding
>  * skb->mac.raw is dropped and the skb is freed.
>  */
> static void packet_free_skbs(struct packet_sock *po, int clear_last)
> {
> 	struct sk_buff *skb;
> 	struct packet_shared *ps;
> 	struct page *page;
>
> 	while ((!skb_queue_empty(&po->sk_free_queue) &&
> 			po->free_queued > po->budget) || clear_last > 0) {
> 		spin_lock_bh(&po->sk_free_queue.lock);
> 		skb = __skb_dequeue(&po->sk_free_queue);
> 		if (skb)
> 			po->free_queued--;
> 		spin_unlock_bh(&po->sk_free_queue.lock);
>
> 		if (!skb)
> 			break;
>
> 		ps = packet_find_shared_lazy(po, skb);
> 		if (ps) {
> 			/* A slot that matches but is not marked mapped is unexpected: log it. */
> 			if (!test_bit(PACKET_MAPPED, &ps->flags))
> 				printk("%s: pos=%d, offset=%04x, flags=%08lx.\n",
> 					__func__, ps->pos, ps->offset, ps->flags);
> 			clear_bit(PACKET_MAPPED, &ps->flags);
> 		}
>
> 		page = virt_to_page(skb->mac.raw);
>
> 		put_page(page);
> 		/* NOTE(review): the page is inspected after the reference was dropped - confirm this is safe. */
> 		if (!page_count(page)) {
> 			ClearPageReserved(page);
> 		}
>
> 		kfree_skb(skb);
> 		clear_last--;
> 	}
> }
>
> /*
>  * Walk the page tables of vma->vm_mm down to the PTE for @addr and log the
>  * intermediate entries.
>  *
>  * @vma:  mapping whose mm is walked.
>  * @addr: user virtual address to look up.
>  *
>  * Returns the PTE obtained with pte_offset_map(); the caller has to release
>  * it with pte_unmap().
>  *
>  * NOTE(review): when the pmd is empty this only decrements nr_ptes and still
>  * goes on to pte_offset_map() - confirm this is intended.
>  */
> static inline pte_t *get_pte(struct vm_area_struct *vma, unsigned long addr)
> {
> 	pgd_t *pgd;
> 	pud_t *pud;
> 	pmd_t *pmd;
> 	pte_t *pte;
>
> 	pgd = pgd_offset(vma->vm_mm, addr);
> 	pud = pud_offset(pgd, addr);
> 	pmd = pmd_offset(pud, addr);
>
> 	if (pmd_none(*pmd))
> 		vma->vm_mm->nr_ptes--;
>
> 	pte = pte_offset_map(pmd, addr);
>
> 	printk("%s: addr=%08lx, pte=%p, %08lx, pmd=%p, pud=%p, pgd=%p, nr_pte=%ld.\n",
> 			__func__, addr, pte, pte_val(*pte), pmd, pud, pgd,
> 			vma->vm_mm->nr_ptes);
>
> 	return pte;
> }
>
> /*
>  * Clear the PTE that backs user address @addr in @vma.
>  *
>  * @vma:  mapping that contains @addr.
>  * @addr: user virtual address whose PTE is cleared.
>  * @pfn:  page frame about to be mapped at @addr; only used for the
>  *        diagnostic message printed here.
>  *
>  * TLB flushing is left to the caller.
>  */
> static inline void update_address(struct vm_area_struct *vma, unsigned long addr,
> 		unsigned long pfn)
> {
> 	pte_t *pte;
> 	struct page *page;
>
> 	pte = get_pte(vma, addr);
> 	page = pfn_to_page(pfn);
>
> 	printk("%s: pfn=%08lx, valid=%d, page=%p, res=%d, mapcount=%d.\n",
> 			__func__, pfn, pfn_valid(pfn), page,
> 			PageReserved(page), page_mapcount(page));
>
> 	/* The owning mm is vma->vm_mm, the same field get_pte() uses. */
> 	pte_clear(vma->vm_mm, addr, pte);
> 	pte_unmap(pte);
> }
>
> /*
>  * Work handler: move up to po->budget skbs from the socket receive queue
>  * into the user mapping.
>  *
>  * @data: the struct sock registered through INIT_WORK(); NULL is ignored.
>  *
>  * Nothing is done unless the socket is both running and mapped. For each skb
>  * the PTE of the target slot (page po->last + 1 of the VMA) is cleared, the
>  * page holding skb->mac.raw is remapped there, the control slot is updated
>  * and the skb is parked on po->sk_free_queue. The whole run happens with the
>  * owning task's mmap_sem held for writing.
>  */
> static void test_timer_func(void *data)
> {
> 	struct sock *sk = (struct sock *)data;
> 	struct packet_sock *po;
> 	struct packet_shared *ps;
> 	struct sk_buff *skb;
> 	unsigned long virt, start;
> 	int num = 0;
>
> 	if (!sk)
> 		return;
>
> 	po = pkt_sk(sk);
> 	if (!po || !po->tsk || !po->tsk->mm ||
> 			!test_bit(PACKET_SOCKET_RUNNING, &po->flags) ||
> 			!test_bit(PACKET_SOCKET_MAPPED, &po->flags))
> 		return;
>
> 	down_write(&po->tsk->mm->mmap_sem);
>
> 	printk("%s: free_queued=%d, qeued=%d [rmem=%d, max=%d], budget=%d, queued=%lu, dropped=%lu, total=%lu.\n",
> 			__func__, po->free_queued,
> 			skb_queue_len(&sk->sk_receive_queue),
> 			atomic_read(&sk->sk_rmem_alloc), sk->sk_rcvbuf,
> 			po->budget, po->queued, po->dropped, po->total);
>
> 	while (++num <= po->budget &&
> 			(skb = skb_dequeue(&sk->sk_receive_queue))) {
> 		virt = (unsigned long)skb->mac.raw;
> 		if (!virt)
> 			goto out;
>
> 		/* Page 0 of the VMA is the control page; data slots follow it. */
> 		start = po->vma->vm_start + PAGE_SIZE*(1+po->last);
> 		ps = &((struct packet_shared *)po->page)[po->last];
>
> 		printk("s=%08lx, p=%p, pos=%d, offset=%04x, flags=%08lx.\n",
> 				start, virt_to_page(virt), ps->pos, ps->offset, ps->flags);
>
> 		/* Disabled per-skb debug dump. */
> 		if (0) {
> 			printk("offset=%4lx, num=%2d, last=%2d, users=%1d, dataref=%1d: ",
> 					offset_in_page(virt), num, po->last,
> 					atomic_read(&skb->users),
> 					atomic_read(&skb_shinfo(skb)->dataref));
> 			dump_skb(skb);
> 		}
>
> 		/*
> 		 * This actually should not be flush_tlb(),
> 		 * but it is the only one call that can be used in modules.
> 		 * --zbr
> 		 */
> 		update_address(po->vma, start, __pa(virt) >> PAGE_SHIFT);
> 		__flush_tlb();
>
> 		SetPageReserved(virt_to_page(virt));
> 		get_page(virt_to_page(virt));
> 		if (remap_pfn_range(po->vma, start, __pa(virt) >> PAGE_SHIFT,
> 					PAGE_SIZE, po->vma->vm_page_prot)) {
> 			printk("Remapping error.\n");
> 			ClearPageReserved(virt_to_page(virt));
> 			goto out;
> 		}
>
> 		flush_dcache_page(virt_to_page(virt));
>
> 		/* The slot being reused still holds a mapped skb: release one parked skb. */
> 		if (test_bit(PACKET_MAPPED, &ps->flags))
> 			packet_free_skbs(po, 1);
>
> 		ps->offset = offset_in_page(virt);
> 		set_bit(PACKET_MAPPED, &ps->flags);
>
> 		if (++po->last == po->budget)
> 			po->last = 0;
>
> 		/* Debug: log the state of every page of the mapping. */
> 		start = po->vma->vm_start;
> 		while (start < po->vma->vm_end) {
> 			pte_t *pte = get_pte(po->vma, start);
>
> 			if (pte_present(*pte)) {
> 				struct page *page = NULL;
> 				unsigned long pfn = pte_pfn(*pte);
>
> 				if (pfn_valid(pfn)) {
> 					page = pfn_to_page(pfn);
>
> 					printk("s=%08lx, p=%p, r=%d, m=%d, pfn=%08lx.\n",
> 							start, page,
> 							PageReserved(page), page_mapcount(page), pfn);
> 				} else
> 					printk("p=NULL, pfn=%08lx.\n", pfn);
> 			} else {
> 				printk("pte=%p is not present.\n", pte);
> 			}
>
> 			/* get_pte() maps the PTE with pte_offset_map(); release it again. */
> 			pte_unmap(pte);
>
> 			start += PAGE_SIZE;
> 		}
>
> out:
> 		/*
> 		 * Actually here should be some smart algo, which will defer skb
> 		 * freeing until userspace "read" it, so userspace should provide
> 		 * some kind of callback, which will require write permissions to
> 		 * the area, so it should be split.
> 		 * Or better just to free it after some timeout, say 100 msec
> 		 * should be enough.
> 		 * --zbr
> 		 *
> 		 * Tricky algo is to place skbs into a new list, which will be
> 		 * traversed at some interval and skbs will be unlinked and freed.
> 		 * Actually, there is no need to lock this queue against freeing,
> 		 * since it happens synchronously, but if someday freeing becomes
> 		 * separate nothing will need to change.
> 		 * --zbr
> 		 */
> 		spin_lock_bh(&po->sk_free_queue.lock);
> 		po->free_queued++;
> 		__skb_queue_tail(&po->sk_free_queue, skb);
> 		spin_unlock_bh(&po->sk_free_queue.lock);
> 	}
> #if 0
> 	if (time_after(jiffies, po->next_free)) {
> 		po->next_free = jiffies + msecs_to_jiffies(free_timeout);
> 		packet_free_skbs(po, 0);
> 	}
> #endif
> 	printk("%s: UP: po->tsk->mm=%p.\n", __func__, po->tsk->mm);
> 	up_write(&po->tsk->mm->mmap_sem);
>
> 	printk("%s finished.\n", __func__);
> }
>
> /* vm_ops->open: only logs the sock behind the file that backs @vma. */
> static void packet_mm_open(struct vm_area_struct *vma)
> {
> 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
> 	struct sock *sk = SOCKET_I(inode)->sk;
>
> 	printk( "%s, sk=%p.\n", __func__, sk);
> }
>
> /*
>  * vm_ops->close: log the sock behind @vma and, if there is one, clear its
>  * PACKET_SOCKET_MAPPED bit while holding the mm's mmap_sem for writing.
>  * NOTE(review): confirm that no caller invokes ->close with mmap_sem
>  * already held, which would make the down_write() here block on itself.
>  */
> static void packet_mm_close(struct vm_area_struct *vma)
> {
> 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
> 	struct sock *sk = SOCKET_I(inode)->sk;
> 	struct packet_sock *po;
>
> 	printk( "%s, sk=%p.\n", __func__, sk);
>
> 	if (!sk)
> 		return;
>
> 	po = pkt_sk(sk);
> 	if (!po)
> 		return;
>
> 	down_write(&vma->vm_mm->mmap_sem);
> 	clear_bit(PACKET_SOCKET_MAPPED, &po->flags);
> 	up_write(&vma->vm_mm->mmap_sem);
> }
>
> /* VMA callbacks installed by packet_mmap(). */
> static struct vm_operations_struct packet_mmap_ops = {
> .open = packet_mm_open,
> .close = packet_mm_close,
> };
>
> /*
>  * Micro-benchmark: time 1000 single-page remaps against 1000 memcpy() calls
>  * of 1500 bytes and log both durations.
>  *
>  * @sock: unused.
>  * @vma:  mapping whose start address is the first remap target.
>  *
>  * Returns 0 on success or -ENOMEM when a scratch buffer cannot be allocated.
>  *
>  * NOTE(review): the remap loop advances by PAGE_SIZE 1000 times without
>  * checking vma->vm_end - confirm the mapping is large enough.
>  */
> static int packet_mmap_test(struct socket *sock, struct vm_area_struct *vma)
> {
> 	int i;
> 	struct timeval tv1, tv2;
> 	unsigned long start = vma->vm_start;
> 	u8 *data1, *data2;
>
> 	do_gettimeofday(&tv1);
> 	for (i=0; i<1000; i++) {
> 		update_address(vma, start, __pa(PAGE_OFFSET) >> PAGE_SHIFT);
> 		__flush_tlb();
>
> 		if (remap_pfn_range(vma, start,
> 					__pa(PAGE_OFFSET) >> PAGE_SHIFT,
> 					PAGE_SIZE,
> 					vma->vm_page_prot))
> 			break;
>
> 		start += PAGE_SIZE;
> 	}
> 	do_gettimeofday(&tv2);
>
> 	printk("%s: 1000 remaps took %lu usec.\n", __func__, (tv2.tv_sec -
> 				tv1.tv_sec)*1000000 + tv2.tv_usec - tv1.tv_usec);
>
> 	data1 = kmalloc(PAGE_SIZE, GFP_KERNEL);
> 	if (!data1)
> 		return -ENOMEM;
> 	data2 = kmalloc(PAGE_SIZE, GFP_KERNEL);
> 	if (!data2) {
> 		/* Free the buffer that was actually allocated. */
> 		kfree(data1);
> 		return -ENOMEM;
> 	}
>
> 	do_gettimeofday(&tv1);
> 	for (i=0; i<1000; i++) {
> 		memcpy(data1, data2, 1500);
> 	}
> 	do_gettimeofday(&tv2);
>
> 	printk("%s: 1000 copyings took %lu usec.\n", __func__, (tv2.tv_sec -
> 				tv1.tv_sec)*1000000 + tv2.tv_usec - tv1.tv_usec);
>
> 	kfree(data1);
> 	kfree(data2);
>
> 	return 0;
> }
>
> /*
>  * mmap handler: install the VMA callbacks, derive po->budget from the size
>  * of the mapping (one page is set aside for the control page), map the
>  * control page at the start of the VMA and mark the socket as mapped.
>  * Returns 0 on success or -EIO when the control page cannot be remapped.
>  */
> static int packet_mmap(struct file *file, struct socket *sock,
> 		struct vm_area_struct *vma)
> {
> 	struct sock *sk = sock->sk;
> 	struct packet_sock *po = pkt_sk(sk);
> 	unsigned long size = vma->vm_end - vma->vm_start;
> 	unsigned long ctl_pfn;
>
> 	vma->vm_ops = &packet_mmap_ops;
>
> 	//err = packet_mmap_test(sock, vma);
>
> 	lock_sock(sk);
> 	po->budget = (size - PAGE_SIZE) / PAGE_SIZE;
>
> 	ctl_pfn = __pa(po->page) >> PAGE_SHIFT;
>
> 	update_address(vma, vma->vm_start, ctl_pfn);
> 	__flush_tlb();
>
> 	SetPageReserved(virt_to_page(po->page));
> 	if (remap_pfn_range(vma, vma->vm_start, ctl_pfn,
> 				PAGE_SIZE, vma->vm_page_prot)) {
> 		ClearPageReserved(virt_to_page(po->page));
> 		release_sock(sk);
> 		return -EIO;
> 	}
>
> 	po->vma = vma;
> 	release_sock(sk);
>
> 	/* Bind the shared work item to this socket before flagging it as mapped. */
> 	INIT_WORK(&w, test_timer_func, sk);
> 	set_bit(PACKET_SOCKET_MAPPED, &po->flags);
>
> 	return 0;
> }
>
> /*
>  * poll handler: start from the datagram_poll() mask and report the socket as
>  * readable whenever po->free_queued is below po->total.
>  */
> static unsigned int packet_poll(struct file * file, struct socket *sock,
> 		poll_table *wait)
> {
> 	struct sock *sk = sock->sk;
> 	struct packet_sock *po = pkt_sk(sk);
> 	unsigned int mask = datagram_poll(file, sock, wait);
> 	int readable;
>
> 	spin_lock_bh(&sk->sk_receive_queue.lock);
> 	readable = po->free_queued < po->total;
> 	spin_unlock_bh(&sk->sk_receive_queue.lock);
>
> 	if (readable)
> 		mask |= POLLIN | POLLRDNORM;
>
> 	return mask;
> }
>
> /*
>  * Socket operations installed by packet_create(). Only release, bind, poll,
>  * ioctl and mmap have handlers of their own; every other entry is a
>  * sock_no_*() stub.
>  */
> static struct proto_ops packet_ops_spkt = {
> .family = PF_PACKET,
> .owner = THIS_MODULE,
> .release = packet_release,
> .bind = packet_bind,
> .connect = sock_no_connect,
> .socketpair = sock_no_socketpair,
> .accept = sock_no_accept,
> .getname = sock_no_getname,
> .poll = packet_poll,
> .ioctl = packet_ioctl,
> .listen = sock_no_listen,
> .shutdown = sock_no_shutdown,
> .setsockopt = sock_no_setsockopt,
> .getsockopt = sock_no_getsockopt,
> .sendmsg = sock_no_sendmsg,
> .recvmsg = sock_no_recvmsg,
> .mmap = packet_mmap,
> .sendpage = sock_no_sendpage,
> };
>
> /* Family descriptor handed to sock_register(); it claims PF_PACKET with packet_create() as the constructor. */
> static struct net_proto_family packet_family_ops = {
> .family = PF_PACKET,
> .create = packet_create,
> .owner = THIS_MODULE,
> };
>
> /* Module exit: unregister the PF_PACKET family first, then the proto. */
> static void __exit packet_exit(void)
> {
> sock_unregister(PF_PACKET);
> proto_unregister(&packet_proto);
> }
>
> /*
>  * Module init: register the proto and then the socket family.
>  *
>  * Returns 0 on success or the error from proto_register()/sock_register().
>  * If the family cannot be registered, the proto registration is undone so
>  * that a failed load leaves nothing behind.
>  */
> static int __init packet_init(void)
> {
> 	int rc;
>
> 	rc = proto_register(&packet_proto, 0);
> 	if (rc != 0)
> 		return rc;
>
> 	rc = sock_register(&packet_family_ops);
> 	if (rc != 0) {
> 		proto_unregister(&packet_proto);
> 		return rc;
> 	}
>
> 	printk("%s: initialized at %lu.\n", __func__, jiffies);
>
> 	return 0;
> }
>
> /* Module entry/exit points, licence and the PF_PACKET protocol-family alias. */
> module_init(packet_init);
> module_exit(packet_exit);
> MODULE_LICENSE("GPL");
> MODULE_ALIAS_NETPROTO(PF_PACKET);
>
> af_tlb.h
>
> /*
> * af_tlb.h
> *
> * 2005 Copyright (c) Evgeniy Polyakov <[EMAIL PROTECTED]>
> * All rights reserved.
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> * the Free Software Foundation; either version 2 of the License, or
> * (at your option) any later version.
> *
> * This program is distributed in the hope that it will be useful,
> * but WITHOUT ANY WARRANTY; without even the implied warranty of
> * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> * GNU General Public License for more details.
> *
> * You should have received a copy of the GNU General Public License
> * along with this program; if not, write to the Free Software
> * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> */
>
> #ifndef __AF_TLB_H
> #define __AF_TLB_H
>
> /* Bit numbers used in packet_shared.flags. */
> enum packet_shared_flags {
> /* Set by the kernel once a page has been mapped for the slot, cleared when its skb is released. */
> PACKET_MAPPED = 0,
> };
>
> /*
>  * One control slot in the control page; the kernel keeps an array of these
>  * at the start of the mapping, one per data page.
>  * NOTE(review): 'flags' is a long, so the layout depends on the word size of
>  * whoever compiles this header - confirm kernel and userspace agree.
>  */
> struct packet_shared {
> /* In-page offset of skb->mac.raw for the frame mapped in this slot. */
> __u16 offset;
> __u16 reserved;
> /* Only read for log messages in the code shown; never assigned there. */
> int pos;
> /* Bit field indexed by enum packet_shared_flags. */
> long flags;
> } __attribute__ ((packed));
>
> #ifdef __KERNEL__
>
> /* Bit numbers used in packet_sock.flags. */
> enum packet_flags {
> /* The protocol hook is currently registered with dev_add_pack(). */
> PACKET_SOCKET_RUNNING = 0,
> /* The socket has been mmap()ed; cleared again by the VMA close callback. */
> PACKET_SOCKET_MAPPED,
> };
>
> /* Per-socket state; 'sk' is the first member so a struct sock pointer can be cast to it. */
> struct packet_sock {
> struct sock sk;
> /* Receive hook registered with dev_add_pack(). */
> struct packet_type prot_hook;
> /* Held while the hook is (re)bound in packet_do_bind(). */
> spinlock_t bind_lock;
>
> /* Bit field indexed by enum packet_flags. */
> long flags;
> int ifindex;
> /* Protocol number the hook is bound to. */
> unsigned short num;
>
> /* User mapping set up by packet_mmap(). */
> struct vm_area_struct *vma;
>
> /* Task that created the socket ('current' at creation time). */
> struct task_struct *tsk;
>
> /* budget: number of data pages in the mapping; last: index of the next slot to fill. */
> int budget, last;
> /* Kernel address of the control page. */
> unsigned long page;
>
> /* skbs whose pages are mapped to userspace, waiting to be freed, and their count. */
> struct sk_buff_head sk_free_queue;
> int free_queued;
>
> /* Jiffies value set from free_timeout; only read inside disabled (#if 0) code. */
> unsigned long next_free;
>
> /* Receive statistics maintained by packet_rcv_spkt(). */
> unsigned long queued;
> unsigned long dropped;
> unsigned long total;
> };
>
> #endif /* __KERNEL__ */
>
> #endif /* __AF_TLB_H */
>
> tlb_test.c
> #include <sys/types.h>
> #include <sys/socket.h>
> #include <sys/mman.h>
> #include <sys/poll.h>
>
> #include <stdio.h>
> #include <string.h>
> #include <stdlib.h>
> #include <errno.h>
> #include <unistd.h>
>
> #include <netinet/in.h>
> #include <netinet/ip.h>
> #include <net/ethernet.h>
>
> #include <linux/if_ether.h>
> #include <linux/types.h>
>
> #include "af_tlb.h"
>
> /* NOTE(review): assumes 4 KiB pages - must match the page size of the running kernel. */
> #define PAGE_SIZE 4096
> /* One control page followed by 16 data pages. */
> static size_t mmap_size = 17*PAGE_SIZE;
>
> /* printf-style logging to stderr, flushed immediately. */
> #define ulog(f, a...) do { fprintf(stderr, f, ##a); fflush(stderr); } while (0)
> /* Expand an IPv4 address lvalue into its four bytes, in memory order. */
> #define NIPQUAD(addr) \
> 	((unsigned char *)&(addr))[0], \
> 	((unsigned char *)&(addr))[1], \
> 	((unsigned char *)&(addr))[2], \
> 	((unsigned char *)&(addr))[3]
>
> /* Set bit number @bit (0..31) in *@f. */
> static __inline__ void set_bit(int bit, uint32_t *f)
> {
> 	/* Shift an unsigned one: 1 << 31 on a signed int is undefined. */
> 	*f |= ((uint32_t)1 << bit);
> }
>
> /* Clear bit number @bit (0..31) in *@f. */
> static __inline__ void clear_bit(int bit, uint32_t *f)
> {
> 	/* Shift an unsigned one: 1 << 31 on a signed int is undefined. */
> 	*f &= ~((uint32_t)1 << bit);
> }
>
> /* Return 1 when bit number @bit is set in *@f, 0 otherwise. */
> static __inline__ int test_bit(int bit, uint32_t *f)
> {
> 	uint32_t shifted = *f >> bit;
>
> 	return (int)(shifted & 1);
> }
>
> /* Hex-dump @size bytes starting @offset bytes into @ptr, prefixed with @ptr itself. */
> static void dump_data(void *ptr, __u16 offset, int size)
> {
> 	unsigned char *bytes = (unsigned char *)ptr + offset;
> 	unsigned char *end = bytes + size;
>
> 	ulog("%p: ", ptr);
> 	while (bytes < end)
> 		ulog("%02x ", *bytes++);
> 	ulog("\n");
> }
>
> /*
>  * Decode and log the frame found @offset bytes into the page at @ptr.
>  *
>  * Only IPv4 and ARP frames are logged. For IPv4 carrying TCP, UDP or ICMP
>  * the protocol name and both addresses are printed; for everything else the
>  * first 16 bytes after the Ethernet header are hex-dumped.
>  *
>  * Returns -1 when the EtherType is neither IPv4 nor ARP, 0 otherwise.
>  */
> static int dump_network(void *ptr, __u16 offset)
> {
> 	unsigned char *frame = (unsigned char *)ptr + offset;
> 	struct ether_header *eth = (struct ether_header *)frame;
> 	struct iphdr *ip;
> 	char *proto;
> 	int i;
> 	unsigned short ether_type;
>
> 	ether_type = ntohs(eth->ether_type);
> 	if (ether_type != ETH_P_IP && ether_type != ETH_P_ARP)
> 		return -1;
>
> 	/* Print the host-order value, i.e. the one the ETH_P_* checks use. */
> 	ulog("MAC: proto=%04x, src=", ether_type);
> 	for (i=0; i<ETH_ALEN-1; ++i)
> 		ulog("%02x:", eth->ether_shost[i]);
> 	ulog("%02x, dst=", eth->ether_shost[ETH_ALEN-1]);
> 	for (i=0; i<ETH_ALEN-1; ++i)
> 		ulog("%02x:", eth->ether_dhost[i]);
> 	ulog("%02x. ", eth->ether_dhost[ETH_ALEN-1]);
>
> 	if (ether_type != ETH_P_IP) {
> 		dump_data(ptr, offset + sizeof(*eth), 16);
> 		return 0;
> 	}
>
> 	ip = (struct iphdr *)(frame + sizeof(*eth));
>
> 	switch (ip->protocol) {
> 	case IPPROTO_TCP:
> 		proto = "TCP ";
> 		break;
> 	case IPPROTO_UDP:
> 		proto = "UDP ";
> 		break;
> 	case IPPROTO_ICMP:
> 		proto = "ICMP";
> 		break;
> 	default:
> 		/* Unknown transport: dump the start of the IP header instead. */
> 		dump_data(ptr, offset + sizeof(*eth), 16);
> 		return 0;
> 	}
>
> 	ulog("%s: ", proto);
> 	ulog("%u.%u.%u.%u -> %u.%u.%u.%u.\n", NIPQUAD(ip->saddr),
> 			NIPQUAD(ip->daddr));
>
> 	return 0;
> }
>
> /*
>  * Simple sniffer: open a PF_PACKET socket, map the control page plus the
>  * data pages, bind to the interface named in argv[1] (default "eth0") and
>  * busy-poll the control slots, dumping every newly mapped frame.
>  *
>  * Stops after roughly 1000 frames have been dumped. Returns 0 in that case
>  * and a negative errno value when setup fails.
>  */
> int main(int argc, char *argv[])
> {
> 	struct sockaddr sa;
> 	int s, err, num, i, j;
> 	socklen_t len = sizeof(sa);
> 	void *mmap_ptr;
> 	struct packet_shared *ps, *ops;
> 	void *old_ps;
> 	struct pollfd pfd;
> 	const char *ifname = (argc > 1) ? argv[1] : "eth0";
>
> 	/*
> 	 * Copy at most what fits and always NUL-terminate; a fixed-size
> 	 * memcpy() would read past the end of a short name.
> 	 */
> 	memset(&sa, 0, sizeof(sa));
> 	sa.sa_family = AF_PACKET;
> 	snprintf(sa.sa_data, sizeof(sa.sa_data), "%s", ifname);
>
> 	/* Zeroed snapshot of the control page, used to detect slot changes. */
> 	old_ps = calloc(1, PAGE_SIZE);
> 	if (!old_ps) {
> 		ulog("Failed to allocate backup packet shared page.\n");
> 		return -ENOMEM;
> 	}
>
> 	s = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
> 	if (s == -1) {
> 		err = -errno;
> 		ulog("Failed to create PF_PACKET socket: %s [%d].\n",
> 				strerror(errno), errno);
> 		goto err_out_free_old_ps;
> 	}
>
> 	mmap_ptr = mmap(NULL, mmap_size, PROT_READ, MAP_SHARED, s, 0);
> 	if (mmap_ptr == MAP_FAILED) {
> 		err = -errno;
> 		ulog("Failed to map socket %d: %s [%d].\n", s, strerror(errno),
> 				errno);
> 		goto err_out_close;
> 	}
>
> 	if (bind(s, &sa, len) == -1) {
> 		err = -errno;
> 		ulog("Failed to bind socket %d to device %s: %s [%d].\n",
> 				s, sa.sa_data, strerror(errno), errno);
> 		goto err_out_unmap;
> 	}
>
> 	pfd.fd = s;
> 	pfd.events = POLLIN;
> 	pfd.revents = 0;
>
> 	num = (mmap_size - PAGE_SIZE) / PAGE_SIZE;
>
> 	j = 0;
> 	while (1) {
> 		/* Blocking in poll() is disabled for now; the loop busy-polls.
> 		err = poll(&pfd, 1, -1);
>
> 		if ((err == 0 || err == -1) && (errno != EINTR)) {
> 			err = -errno;
> 			break;
> 		}*/
>
> 		ps = (struct packet_shared *)mmap_ptr;
> 		ops = (struct packet_shared *)old_ps;
>
> 		for (i=0; i<num; ++i) {
> 			void *ptr = (char *)mmap_ptr + PAGE_SIZE*(i+1);
> 			/* test_bit() takes a uint32_t, the slot stores a long. */
> 			uint32_t flags = (uint32_t)ps->flags;
>
> 			if (test_bit(PACKET_MAPPED, &flags) &&
> 					ps->offset != ops->offset) {
> 				dump_network(ptr, ps->offset);
> 				if (++j > 1000) {
> 					/* Frame limit reached: a normal exit. */
> 					err = 0;
> 					goto err_out_unmap;
> 				}
> 			}
>
> 			/* Remember the slot so the same frame is not dumped twice. */
> 			*ops++ = *ps++;
> 		}
>
> 		pfd.events = POLLIN;
> 		pfd.revents = 0;
> 	}
>
> err_out_unmap:
> 	munmap(mmap_ptr, mmap_size);
> err_out_close:
> 	close(s);
> err_out_free_old_ps:
> 	free(old_ps);
>
> 	return err;
> }
>
> Makefile.
>
> obj-m := af_tlb.o
>
> # Kernel build tree of the running kernel; override to build against another one.
> KDIR := /lib/modules/`uname -r`/build
> #KDIR := /usr/local/src/linux-2.6
> PWD := $(shell pwd)
> # Warning flags for the userspace test program.
> UCFLAGS := -W -Wall
>
> # None of these targets produces a file of the same name, so a stray file
> # called "test" or "clean" must not make them look up to date.
> .PHONY: default test clean
>
> # Build the module through the kernel build system.
> default:
> 	$(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules
>
> # Build the userspace sniffer.
> test:
> 	gcc $(UCFLAGS) tlb_test.c -o tlb_test
>
> clean:
> 	$(MAKE) -C $(KDIR) SUBDIRS=$(PWD) clean
> 	@rm -f *~
>
> --
> Evgeniy Polyakov
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to [EMAIL PROTECTED]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Evgeniy Polyakov
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html