On Wed 30-11-16 16:45:30, Ross Zwisler wrote:
> Tracepoints are the standard way to capture debugging and tracing
> information in many parts of the kernel, including the XFS and ext4
> filesystems.  Create a tracepoint header for FS DAX and add the first DAX
> tracepoints to the PMD fault handler.  This allows the tracing for DAX to
> be done in the same way as the filesystem tracing so that developers can
> look at them together and get a coherent idea of what the system is doing.
> 
> I added both an entry and exit tracepoint because future patches will add
> tracepoints to child functions of dax_iomap_pmd_fault() like
> dax_pmd_load_hole() and dax_pmd_insert_mapping(). We want those messages to
> be wrapped by the parent function tracepoints so the code flow is more
> easily understood.  Having entry and exit tracepoints for faults also
> allows us to easily see what filesystems functions were called during the
> fault.  These filesystem functions get executed via iomap_begin() and
> iomap_end() calls, for example, and will have their own tracepoints.
> 
> For PMD faults we primarily want to understand the type of mapping, the
> fault flags, the faulting address and whether it fell back to 4k faults.
> If it fell back to 4k faults the tracepoints should let us understand why.
> 
> I named the new tracepoint header file "fs_dax.h" to allow for device DAX
> to have its own separate tracing header in the same directory at some
> point.
> 
> Here is an example output for these events from a successful PMD fault:
> 
> big-1441  [005] ....    32.582758: xfs_filemap_pmd_fault: dev 259:0 ino
> 0x1003
> 
> big-1441  [005] ....    32.582776: dax_pmd_fault: dev 259:0 ino 0x1003
> shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start
> 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400
> 
> big-1441  [005] ....    32.583292: dax_pmd_fault_done: dev 259:0 ino 0x1003
> shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start
> 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400 NOPAGE
> 
> Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> Suggested-by: Dave Chinner <da...@fromorbit.com>

Looks good. You can add:

Reviewed-by: Jan Kara <j...@suse.cz>

                                                                Honza

> ---
>  fs/dax.c                      | 30 ++++++++++++-------
>  include/linux/mm.h            | 25 ++++++++++++++++
>  include/trace/events/fs_dax.h | 68 
> +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 113 insertions(+), 10 deletions(-)
>  create mode 100644 include/trace/events/fs_dax.h
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index b14335c..4a99c2e 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -35,6 +35,9 @@
>  #include <linux/iomap.h>
>  #include "internal.h"
>  
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/fs_dax.h>
> +
>  /* We choose 4096 entries - same as per-zone page wait tables */
>  #define DAX_WAIT_TABLE_BITS 12
>  #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
> @@ -1311,6 +1314,16 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, 
> unsigned long address,
>       loff_t pos;
>       int error;
>  
> +     /*
> +      * Check whether offset isn't beyond end of file now. Caller is
> +      * supposed to hold locks serializing us with truncate / punch hole so
> +      * this is a reliable test.
> +      */
> +     pgoff = linear_page_index(vma, pmd_addr);
> +     max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
> +
> +     trace_dax_pmd_fault(inode, vma, address, flags, pgoff, max_pgoff, 0);
> +
>       /* Fall back to PTEs if we're going to COW */
>       if (write && !(vma->vm_flags & VM_SHARED))
>               goto fallback;
> @@ -1321,16 +1334,10 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, 
> unsigned long address,
>       if ((pmd_addr + PMD_SIZE) > vma->vm_end)
>               goto fallback;
>  
> -     /*
> -      * Check whether offset isn't beyond end of file now. Caller is
> -      * supposed to hold locks serializing us with truncate / punch hole so
> -      * this is a reliable test.
> -      */
> -     pgoff = linear_page_index(vma, pmd_addr);
> -     max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
> -
> -     if (pgoff > max_pgoff)
> -             return VM_FAULT_SIGBUS;
> +     if (pgoff > max_pgoff) {
> +             result = VM_FAULT_SIGBUS;
> +             goto out;
> +     }
>  
>       /* If the PMD would extend beyond the file size */
>       if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
> @@ -1401,6 +1408,9 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, 
> unsigned long address,
>               split_huge_pmd(vma, pmd, address);
>               count_vm_event(THP_FAULT_FALLBACK);
>       }
> +out:
> +     trace_dax_pmd_fault_done(inode, vma, address, flags, pgoff, max_pgoff,
> +                     result);
>       return result;
>  }
>  EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a5f52c0..30f416a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -281,6 +281,17 @@ extern pgprot_t protection_map[16];
>  #define FAULT_FLAG_REMOTE    0x80    /* faulting for non current tsk/mm */
>  #define FAULT_FLAG_INSTRUCTION  0x100        /* The fault was during an 
> instruction fetch */
>  
> +#define FAULT_FLAG_TRACE \
> +     { FAULT_FLAG_WRITE,             "WRITE" }, \
> +     { FAULT_FLAG_MKWRITE,           "MKWRITE" }, \
> +     { FAULT_FLAG_ALLOW_RETRY,       "ALLOW_RETRY" }, \
> +     { FAULT_FLAG_RETRY_NOWAIT,      "RETRY_NOWAIT" }, \
> +     { FAULT_FLAG_KILLABLE,          "KILLABLE" }, \
> +     { FAULT_FLAG_TRIED,             "TRIED" }, \
> +     { FAULT_FLAG_USER,              "USER" }, \
> +     { FAULT_FLAG_REMOTE,            "REMOTE" }, \
> +     { FAULT_FLAG_INSTRUCTION,       "INSTRUCTION" }
> +
>  /*
>   * vm_fault is filled by the the pagefault handler and passed to the vma's
>   * ->fault function. The vma's ->fault is responsible for returning a bitmask
> @@ -1107,6 +1118,20 @@ static inline void clear_page_pfmemalloc(struct page 
> *page)
>                        VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
>                        VM_FAULT_FALLBACK)
>  
> +#define VM_FAULT_RESULT_TRACE \
> +     { VM_FAULT_OOM,                 "OOM" }, \
> +     { VM_FAULT_SIGBUS,              "SIGBUS" }, \
> +     { VM_FAULT_MAJOR,               "MAJOR" }, \
> +     { VM_FAULT_WRITE,               "WRITE" }, \
> +     { VM_FAULT_HWPOISON,            "HWPOISON" }, \
> +     { VM_FAULT_HWPOISON_LARGE,      "HWPOISON_LARGE" }, \
> +     { VM_FAULT_SIGSEGV,             "SIGSEGV" }, \
> +     { VM_FAULT_NOPAGE,              "NOPAGE" }, \
> +     { VM_FAULT_LOCKED,              "LOCKED" }, \
> +     { VM_FAULT_RETRY,               "RETRY" }, \
> +     { VM_FAULT_FALLBACK,            "FALLBACK" }, \
> +     { VM_FAULT_DONE_COW,            "DONE_COW" }
> +
>  /* Encode hstate index for a hwpoisoned large page */
>  #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
>  #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
> diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
> new file mode 100644
> index 0000000..5acc016
> --- /dev/null
> +++ b/include/trace/events/fs_dax.h
> @@ -0,0 +1,68 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM fs_dax
> +
> +#if !defined(_TRACE_FS_DAX_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_FS_DAX_H
> +
> +#include <linux/tracepoint.h>
> +
> +DECLARE_EVENT_CLASS(dax_pmd_fault_class,
> +     TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
> +             unsigned long address, unsigned int flags, pgoff_t pgoff,
> +             pgoff_t max_pgoff, int result),
> +     TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result),
> +     TP_STRUCT__entry(
> +             __field(dev_t, dev)
> +             __field(unsigned long, ino)
> +             __field(unsigned long, vm_start)
> +             __field(unsigned long, vm_end)
> +             __field(unsigned long, vm_flags)
> +             __field(unsigned long, address)
> +             __field(unsigned int, flags)
> +             __field(pgoff_t, pgoff)
> +             __field(pgoff_t, max_pgoff)
> +             __field(int, result)
> +     ),
> +     TP_fast_assign(
> +             __entry->dev = inode->i_sb->s_dev;
> +             __entry->ino = inode->i_ino;
> +             __entry->vm_start = vma->vm_start;
> +             __entry->vm_end = vma->vm_end;
> +             __entry->vm_flags = vma->vm_flags;
> +             __entry->address = address;
> +             __entry->flags = flags;
> +             __entry->pgoff = pgoff;
> +             __entry->max_pgoff = max_pgoff;
> +             __entry->result = result;
> +     ),
> +     TP_printk("dev %d:%d ino %#lx %s %s address %#lx vm_start "
> +                     "%#lx vm_end %#lx pgoff %#lx max_pgoff %#lx %s",
> +             MAJOR(__entry->dev),
> +             MINOR(__entry->dev),
> +             __entry->ino,
> +             __entry->vm_flags & VM_SHARED ? "shared" : "private",
> +             __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE),
> +             __entry->address,
> +             __entry->vm_start,
> +             __entry->vm_end,
> +             __entry->pgoff,
> +             __entry->max_pgoff,
> +             __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE)
> +     )
> +)
> +
> +#define DEFINE_PMD_FAULT_EVENT(name) \
> +DEFINE_EVENT(dax_pmd_fault_class, name, \
> +     TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
> +             unsigned long address, unsigned int flags, pgoff_t pgoff, \
> +             pgoff_t max_pgoff, int result), \
> +     TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result))
> +
> +DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
> +DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
> +
> +
> +#endif /* _TRACE_FS_DAX_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> -- 
> 2.7.4
> 
-- 
Jan Kara <j...@suse.com>
SUSE Labs, CR
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

Reply via email to