[ndctl PATCH 3/7] ndctl, list: add option to filter namespace by mode
For example, "ndctl list -m dax" list all the namespaces that are hosting a device-dax region. Signed-off-by: Dan Williams --- ndctl/builtin-list.c | 33 - ndctl/libndctl.h.in |1 + 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/ndctl/builtin-list.c b/ndctl/builtin-list.c index caafec8b8f39..b09da3057519 100644 --- a/ndctl/builtin-list.c +++ b/ndctl/builtin-list.c @@ -30,6 +30,7 @@ static struct { const char *region; const char *type; const char *dimm; + const char *mode; const char *namespace; } param; @@ -43,6 +44,25 @@ do { \ VERSION, __func__, __LINE__, ##__VA_ARGS__); \ } while (0) +static enum ndctl_namespace_mode mode_to_type(const char *mode) +{ + if (!mode) + return -ENXIO; + + if (strcasecmp(param.mode, "memory") == 0) + return NDCTL_NS_MODE_MEMORY; + else if (strcasecmp(param.mode, "sector") == 0) + return NDCTL_NS_MODE_SAFE; + else if (strcasecmp(param.mode, "safe") == 0) + return NDCTL_NS_MODE_SAFE; + else if (strcasecmp(param.mode, "dax") == 0) + return NDCTL_NS_MODE_DAX; + else if (strcasecmp(param.mode, "raw") == 0) + return NDCTL_NS_MODE_RAW; + + return NDCTL_NS_MODE_UNKNOWN; +} + static struct json_object *list_namespaces(struct ndctl_region *region, struct json_object *container, struct json_object *jnamespaces, bool continue_array) @@ -50,6 +70,7 @@ static struct json_object *list_namespaces(struct ndctl_region *region, struct ndctl_namespace *ndns; ndctl_namespace_foreach(region, ndns) { + enum ndctl_namespace_mode mode = ndctl_namespace_get_mode(ndns); struct json_object *jndns; /* are we emitting namespaces? */ @@ -59,6 +80,9 @@ static struct json_object *list_namespaces(struct ndctl_region *region, if (!util_namespace_filter(ndns, param.namespace)) continue; + if (param.mode && mode_to_type(param.mode) != mode) + continue; + if (!list.idle && !util_namespace_active(ndns)) continue; @@ -197,7 +221,9 @@ int cmd_list(int argc, const char **argv, void *ctx) OPT_STRING('d', "dimm", ¶m.dimm, "dimm-id", "filter by dimm"), OPT_STRING('n', "namespace", ¶m.namespace, "namespace-id", - "filter by namespace"), + "filter by namespace id"), + OPT_STRING('m', "mode", ¶m.mode, "namespace-mode", + "filter by namespace mode"), OPT_STRING('t', "type", ¶m.type, "region-type", "filter by region-type"), OPT_BOOLEAN('B', "buses", &list.buses, "include bus info"), @@ -251,6 +277,11 @@ int cmd_list(int argc, const char **argv, void *ctx) type = ND_DEVICE_REGION_BLK; } + if (mode_to_type(param.mode) == NDCTL_NS_MODE_UNKNOWN) { + error("invalid mode: '%s'\n", param.mode); + return -EINVAL; + } + ndctl_bus_foreach(ctx, bus) { struct json_object *jbus = NULL; struct ndctl_region *region; diff --git a/ndctl/libndctl.h.in b/ndctl/libndctl.h.in index d45111775525..394d22e24c7a 100644 --- a/ndctl/libndctl.h.in +++ b/ndctl/libndctl.h.in @@ -475,6 +475,7 @@ enum ndctl_namespace_mode { NDCTL_NS_MODE_SAFE, NDCTL_NS_MODE_RAW, NDCTL_NS_MODE_DAX, + NDCTL_NS_MODE_UNKNOWN, }; enum ndctl_namespace_mode ndctl_namespace_get_mode( struct ndctl_namespace *ndns); ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[ndctl PATCH 2/7] ndctl, daxctl: move json helpers to be available across both utilities
--- ndctl/Makefile.am |2 +- test/Makefile.am |4 ++-- util/json.c |0 util/json.h |0 4 files changed, 3 insertions(+), 3 deletions(-) rename ndctl/util/json.c => util/json.c (100%) rename ndctl/util/json.h => util/json.h (100%) diff --git a/ndctl/Makefile.am b/ndctl/Makefile.am index f03647ae9d99..c563e9411cc3 100644 --- a/ndctl/Makefile.am +++ b/ndctl/Makefile.am @@ -10,7 +10,7 @@ ndctl_SOURCES = ndctl.c \ ../util/log.c \ builtin-list.c \ builtin-test.c \ - util/json.c + ../util/json.c if ENABLE_SMART ndctl_SOURCES += util/json-smart.c diff --git a/test/Makefile.am b/test/Makefile.am index 46a1acf98f0d..9e3ae1dab411 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -75,7 +75,7 @@ device_dax_SOURCES = \ dax-pmd.c \ core.c \ ../ndctl/builtin-xaction-namespace.c \ - ../ndctl/util/json.c + ../util/json.c device_dax_LDADD = \ $(LIBNDCTL_LIB) \ $(JSON_LIBS) \ @@ -85,7 +85,7 @@ multi_pmem_SOURCES = \ multi-pmem.c \ core.c \ ../ndctl/builtin-xaction-namespace.c \ - ../ndctl/util/json.c + ../util/json.c multi_pmem_LDADD = \ $(LIBNDCTL_LIB) \ $(JSON_LIBS) \ diff --git a/ndctl/util/json.c b/util/json.c similarity index 100% rename from ndctl/util/json.c rename to util/json.c diff --git a/ndctl/util/json.h b/util/json.h similarity index 100% rename from ndctl/util/json.h rename to util/json.h ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[ndctl PATCH 6/7] ndctl, {enable,disable}-region: filter by type
Allow the region to be selected by type. For example, disable all pmem: ndctl disable-region -t pmem all One case this is useful for is scripting the bring-up of individual namespaces rather than the default of all at once and in parallel. For example: * blacklist the nd_pmem module * disable all pmem regions * load the nd_pmem module * individually enable pmem regions Signed-off-by: Dan Williams --- ndctl/builtin-xable-region.c | 31 +++ 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/ndctl/builtin-xable-region.c b/ndctl/builtin-xable-region.c index 50cbdef5b339..0d46192fe6a4 100644 --- a/ndctl/builtin-xable-region.c +++ b/ndctl/builtin-xable-region.c @@ -6,11 +6,16 @@ #include #include -static const char *region_bus; +static struct { + const char *bus; + const char *type; +} param; static const struct option region_options[] = { - OPT_STRING('b', "bus", ®ion_bus, "bus-id", + OPT_STRING('b', "bus", ¶m.bus, "bus-id", " must be on a bus with an id/provider of "), + OPT_STRING('t', "type", ¶m.type, "region-type", + " must be of the specified type"), OPT_END(), }; @@ -33,6 +38,20 @@ static const char *parse_region_options(int argc, const char **argv, usage_with_options(u, region_options); return NULL; /* we won't return from usage_with_options() */ } + + if (param.type) { + if (strcmp(param.type, "pmem") == 0) + /* pass */; + else if (strcmp(param.type, "blk") == 0) + /* pass */; + else { + error("unknown region type '%s', should be 'pmem' or 'blk'\n", + param.type); + usage_with_options(u, region_options); + return NULL; + } + } + return argv[0]; } @@ -47,10 +66,14 @@ static int do_xable_region(const char *region_arg, goto out; ndctl_bus_foreach(ctx, bus) { - if (!util_bus_filter(bus, region_bus)) + if (!util_bus_filter(bus, param.bus)) continue; ndctl_region_foreach(bus, region) { + const char *type = ndctl_region_get_type_name(region); + + if (param.type && strcmp(param.type, type) != 0) + continue; if (!util_region_filter(region, region_arg)) continue; if (xable_fn(region) == 0) @@ -60,7 +83,7 @@ static int do_xable_region(const char *region_arg, rc = success; out: - region_bus = NULL; + param.bus = NULL; return rc; } ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[ndctl PATCH 7/7] ndctl, list: limit mappings when --dimm is specified
Single out the mapping for the requested dimm device when specified. Signed-off-by: Dan Williams --- ndctl/builtin-list.c |3 +++ 1 file changed, 3 insertions(+) diff --git a/ndctl/builtin-list.c b/ndctl/builtin-list.c index ed4edf6f0249..e8d007003b62 100644 --- a/ndctl/builtin-list.c +++ b/ndctl/builtin-list.c @@ -173,6 +173,9 @@ static struct json_object *region_to_json(struct ndctl_region *region) if (!list.dimms) break; + if (!util_dimm_filter(dimm, param.dimm)) + continue; + if (!list.idle && !ndctl_dimm_is_enabled(dimm)) continue; ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[ndctl PATCH 1/7] ndctl, daxctl: refactor main boilerplate for a new 'daxctl' utility
Prepare the generic command infrastructure for reuse with daxctl. Signed-off-by: Dan Williams --- Makefile.am |2 ndctl/Makefile.am |1 ndctl/builtin-bat.c |2 ndctl/builtin-create-nfit.c |2 ndctl/builtin-dimm.c | 14 ++- ndctl/builtin-list.c |2 ndctl/builtin-test.c |2 ndctl/builtin-xable-region.c |4 - ndctl/builtin-xaction-namespace.c |8 +- ndctl/builtin.h | 36 ndctl/ndctl.c | 160 + test/device-dax.c |2 util/help.c | 44 ++ util/main.c | 123 util/main.h | 10 ++ 15 files changed, 219 insertions(+), 193 deletions(-) rename ndctl/builtin-help.c => util/help.c (73%) create mode 100644 util/main.c create mode 100644 util/main.h diff --git a/Makefile.am b/Makefile.am index 9eb396639efe..01caca803540 100644 --- a/Makefile.am +++ b/Makefile.am @@ -64,6 +64,8 @@ libutil_a_SOURCES = \ util/parse-options.h \ util/usage.c \ util/size.c \ + util/main.c \ + util/help.c \ util/strbuf.c \ util/wrapper.c \ util/filter.c diff --git a/ndctl/Makefile.am b/ndctl/Makefile.am index 2d0d8eb40841..f03647ae9d99 100644 --- a/ndctl/Makefile.am +++ b/ndctl/Makefile.am @@ -10,7 +10,6 @@ ndctl_SOURCES = ndctl.c \ ../util/log.c \ builtin-list.c \ builtin-test.c \ - builtin-help.c \ util/json.c if ENABLE_SMART diff --git a/ndctl/builtin-bat.c b/ndctl/builtin-bat.c index 48b41dab6d92..5e14d39cfac5 100644 --- a/ndctl/builtin-bat.c +++ b/ndctl/builtin-bat.c @@ -4,7 +4,7 @@ #include #include -int cmd_bat(int argc, const char **argv, struct ndctl_ctx *ctx) +int cmd_bat(int argc, const char **argv, void *ctx) { int loglevel = LOG_DEBUG, i, rc; struct ndctl_test *test; diff --git a/ndctl/builtin-create-nfit.c b/ndctl/builtin-create-nfit.c index 780bb84580f5..ebcd7446d14f 100644 --- a/ndctl/builtin-create-nfit.c +++ b/ndctl/builtin-create-nfit.c @@ -164,7 +164,7 @@ static int write_nfit(struct nfit *nfit, const char *file, int force) } struct ndctl_ctx; -int cmd_create_nfit(int argc, const char **argv, struct ndctl_ctx *ctx) +int cmd_create_nfit(int argc, const char **argv, void *ctx) { int i, rc = -ENXIO, force = 0; const char * const u[] = { diff --git a/ndctl/builtin-dimm.c b/ndctl/builtin-dimm.c index 4c433d56dfe3..8a041cee8cab 100644 --- a/ndctl/builtin-dimm.c +++ b/ndctl/builtin-dimm.c @@ -772,7 +772,7 @@ static const struct option init_options[] = { OPT_END(), }; -static int dimm_action(int argc, const char **argv, struct ndctl_ctx *ctx, +static int dimm_action(int argc, const char **argv, void *ctx, int (*action)(struct ndctl_dimm *dimm, struct action_context *actx), const struct option *options, const char *usage) { @@ -874,7 +874,7 @@ static int dimm_action(int argc, const char **argv, struct ndctl_ctx *ctx, return count; } -int cmd_read_labels(int argc, const char **argv, struct ndctl_ctx *ctx) +int cmd_read_labels(int argc, const char **argv, void *ctx) { int count = dimm_action(argc, argv, ctx, action_read, read_options, "ndctl read-labels [..] [-o ]"); @@ -884,7 +884,7 @@ int cmd_read_labels(int argc, const char **argv, struct ndctl_ctx *ctx) return count >= 0 ? 0 : EXIT_FAILURE; } -int cmd_zero_labels(int argc, const char **argv, struct ndctl_ctx *ctx) +int cmd_zero_labels(int argc, const char **argv, void *ctx) { int count = dimm_action(argc, argv, ctx, action_zero, base_options, "ndctl zero-labels [..] []"); @@ -894,7 +894,7 @@ int cmd_zero_labels(int argc, const char **argv, struct ndctl_ctx *ctx) return count >= 0 ? 0 : EXIT_FAILURE; } -int cmd_init_labels(int argc, const char **argv, struct ndctl_ctx *ctx) +int cmd_init_labels(int argc, const char **argv, void *ctx) { int count = dimm_action(argc, argv, ctx, action_init, init_options, "ndctl init-labels [..] []"); @@ -904,7 +904,7 @@ int cmd_init_labels(int argc, const char **argv, struct ndctl_ctx *ctx) return count >= 0 ? 0 : EXIT_FAILURE; } -int cmd_check_labels(int argc, const char **argv, struct ndctl_ctx *ctx) +int cmd_check_labels(int argc, const char **argv, void *ctx) { int count = dimm_action(argc, argv, ctx, action_check, base_options, "ndctl check-labels [..] []"); @@ -914,7 +914,7 @@ int cmd_check_labels(int argc, const char **argv, struct ndctl_ctx *ctx) return count >= 0 ? 0 : EXIT_FAILURE; } -int cmd_disable_dimm(int argc, const char **argv, struct ndctl_ctx *ctx) +int
[ndctl PATCH 4/7] ndctl, list: add '--device-dax' option
Only include dax region and sub-device info when requested by this new option to ndctl list. If '--device-dax' is the only option then the listing will only include namespaces in device-dax mode. Signed-off-by: Dan Williams --- ndctl/builtin-list.c |7 ++- ndctl/builtin-xaction-namespace.c |2 +- util/json.c | 12 +++- util/json.h |2 +- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/ndctl/builtin-list.c b/ndctl/builtin-list.c index b09da3057519..ed4edf6f0249 100644 --- a/ndctl/builtin-list.c +++ b/ndctl/builtin-list.c @@ -23,6 +23,7 @@ static struct { bool namespaces; bool idle; bool health; + bool dax; } list; static struct { @@ -98,7 +99,7 @@ static struct json_object *list_namespaces(struct ndctl_region *region, jnamespaces); } - jndns = util_namespace_to_json(ndns, list.idle); + jndns = util_namespace_to_json(ndns, list.idle, list.dax); if (!jndns) { fail("\n"); continue; @@ -233,6 +234,8 @@ int cmd_list(int argc, const char **argv, void *ctx) "include region info"), OPT_BOOLEAN('N', "namespaces", &list.namespaces, "include namespace info (default)"), + OPT_BOOLEAN('X', "device-dax", &list.dax, + "include device-dax info"), OPT_BOOLEAN('i', "idle", &list.idle, "include idle devices"), OPT_END(), }; @@ -265,6 +268,8 @@ int cmd_list(int argc, const char **argv, void *ctx) list.buses = !!param.bus; list.regions = !!param.region; list.dimms = !!param.dimm; + if (list.dax && !param.mode) + param.mode = "dax"; } if (num_list_flags() == 0) diff --git a/ndctl/builtin-xaction-namespace.c b/ndctl/builtin-xaction-namespace.c index f7a4e5b74a16..2c4f85f5e4ac 100644 --- a/ndctl/builtin-xaction-namespace.c +++ b/ndctl/builtin-xaction-namespace.c @@ -352,7 +352,7 @@ static int setup_namespace(struct ndctl_region *region, error("%s: failed to enable\n", ndctl_namespace_get_devname(ndns)); } else { - struct json_object *jndns = util_namespace_to_json(ndns, 0); + struct json_object *jndns = util_namespace_to_json(ndns, 0, 0); if (jndns) printf("%s\n", json_object_to_json_string_ext(jndns, diff --git a/util/json.c b/util/json.c index 82e677c2c7a7..299c1b5782ee 100644 --- a/util/json.c +++ b/util/json.c @@ -139,7 +139,7 @@ static json_object *util_daxctl_region_to_json(struct daxctl_region *region, } struct json_object *util_namespace_to_json(struct ndctl_namespace *ndns, - bool include_idle) + bool include_idle, bool include_dax) { struct json_object *jndns = json_object_new_object(); unsigned long long size = ULLONG_MAX; @@ -231,10 +231,12 @@ struct json_object *util_namespace_to_json(struct ndctl_namespace *ndns, if (!jobj) goto err; json_object_object_add(jndns, "uuid", jobj); - dax_region = ndctl_dax_get_daxctl_region(dax); - jobj = util_daxctl_region_to_json(dax_region, include_idle); - if (jobj) - json_object_object_add(jndns, "daxdevs", jobj); + if (include_dax) { + dax_region = ndctl_dax_get_daxctl_region(dax); + jobj = util_daxctl_region_to_json(dax_region, include_idle); + if (jobj) + json_object_object_add(jndns, "daxdevs", jobj); + } } else if (ndctl_namespace_get_type(ndns) != ND_DEVICE_NAMESPACE_IO) { const char *name; diff --git a/util/json.h b/util/json.h index 7492e51621a7..b8fc00f57ea8 100644 --- a/util/json.h +++ b/util/json.h @@ -11,7 +11,7 @@ struct json_object *util_bus_to_json(struct ndctl_bus *bus); struct json_object *util_dimm_to_json(struct ndctl_dimm *dimm); struct json_object *util_mapping_to_json(struct ndctl_mapping *mapping); struct json_object *util_namespace_to_json(struct ndctl_namespace *ndns, - bool include_idle); + bool include_idle, bool include_dax); #ifdef HAVE_NDCTL_SMART struct json_object *util_dimm_health_to_json(struct ndctl_dimm *dimm); #else ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[ndctl PATCH 5/7] daxctl: add list command
Similar to "ndctl list", provide a utility for generically dumping all the device-dax regions and device instances in a system. Signed-off-by: Dan Williams --- Makefile.am|2 - builtin.h |0 configure.ac |1 daxctl/Makefile.am | 13 ++ daxctl/daxctl.c| 91 +++ daxctl/lib/Makefile.am |3 + daxctl/libdaxctl.h |1 daxctl/list.c | 112 ndctl.spec.in | 12 + test/device-dax.c |2 - test/multi-pmem.c |2 - util/filter.c | 21 + util/filter.h |6 ++- util/json.c| 113 +++- util/json.h|8 +++ 15 files changed, 360 insertions(+), 27 deletions(-) rename ndctl/builtin.h => builtin.h (100%) create mode 100644 daxctl/Makefile.am create mode 100644 daxctl/daxctl.c create mode 100644 daxctl/list.c diff --git a/Makefile.am b/Makefile.am index 01caca803540..06cd1b059160 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,7 +1,7 @@ include Makefile.am.in ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS} -SUBDIRS = . daxctl/lib ndctl/lib ndctl +SUBDIRS = . daxctl/lib ndctl/lib ndctl daxctl if ENABLE_DOCS SUBDIRS += Documentation endif diff --git a/ndctl/builtin.h b/builtin.h similarity index 100% rename from ndctl/builtin.h rename to builtin.h diff --git a/configure.ac b/configure.ac index 7b4af616cf2b..e79623ac1d82 100644 --- a/configure.ac +++ b/configure.ac @@ -263,6 +263,7 @@ AC_CONFIG_FILES([ daxctl/lib/Makefile ndctl/lib/Makefile ndctl/Makefile +daxctl/Makefile test/Makefile Documentation/Makefile ]) diff --git a/daxctl/Makefile.am b/daxctl/Makefile.am new file mode 100644 index ..9153c418cdaf --- /dev/null +++ b/daxctl/Makefile.am @@ -0,0 +1,13 @@ +include $(top_srcdir)/Makefile.am.in + +bin_PROGRAMS = daxctl + +daxctl_SOURCES =\ + daxctl.c \ + list.c \ + ../util/json.c + +daxctl_LDADD =\ + lib/libdaxctl.la \ + ../libutil.a \ + $(JSON_LIBS) diff --git a/daxctl/daxctl.c b/daxctl/daxctl.c new file mode 100644 index ..31d230a68756 --- /dev/null +++ b/daxctl/daxctl.c @@ -0,0 +1,91 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +const char daxctl_usage_string[] = "daxctl [--version] [--help] COMMAND [ARGS]"; +const char daxctl_more_info_string[] = + "See 'daxctl help COMMAND' for more information on a specific command.\n" + " daxctl --list-cmds to see all available commands"; + +static int cmd_version(int argc, const char **argv, void *ctx) +{ + printf("%s\n", VERSION); + return 0; +} + +static int cmd_help(int argc, const char **argv, void *ctx) +{ + const char * const builtin_help_subcommands[] = { + "list", NULL, + }; + struct option builtin_help_options[] = { + OPT_END(), + }; + const char *builtin_help_usage[] = { + "daxctl help [command]", + NULL + }; + + argc = parse_options_subcommand(argc, argv, builtin_help_options, + builtin_help_subcommands, builtin_help_usage, 0); + + if (!argv[0]) { + printf("\n usage: %s\n\n", daxctl_usage_string); + printf("\n %s\n\n", daxctl_more_info_string); + return 0; + } + + return help_show_man_page(argv[0], "daxctl", "DAXCTL_MAN_VIEWER"); +} + +int cmd_list(int argc, const char **argv, void *ctx); + +static struct cmd_struct commands[] = { + { "version", cmd_version }, + { "list", cmd_list }, + { "help", cmd_help }, +}; + +int main(int argc, const char **argv) +{ + struct daxctl_ctx *ctx; + int rc; + + /* Look for flags.. */ + argv++; + argc--; + main_handle_options(&argv, &argc, daxctl_usage_string, commands, + ARRAY_SIZE(commands)); + + if (argc > 0) { + if (!prefixcmp(argv[0], "--")) + argv[0] += 2; + } else { + /* The user didn't specify a command; give them help */ + printf("\n usage: %s\n\n", daxctl_usage_string); + printf("\n %s\n\n", daxctl_more_info_string); + goto out; + } + + rc = daxctl_new(&ctx); + if (rc) + goto out; + main_handle_internal_command(argc, argv, ctx, commands, + ARRAY_SIZE(commands)); + daxctl_unref(ctx); + fprintf(stderr, "Unknown command: '%s'\n", argv[0]); +out: + return 1; +} diff --git a/daxctl/lib/Makefile.am b/daxctl/lib/Makefile.am index 92783847266a..0167e3995b00 100644 --- a/daxctl/lib/Makefile.am +++ b/daxctl/lib/Makefile.am @@ -1
[ndctl PATCH 0/7] introduce 'daxctl list', and 'ndctl list' updates
* The 'ndctl list' command awkwardly prints out all the corresponding device-dax information when a namespace is in 'dax' mode. Conversely if someone is only interested in listing device-dax information they need to contend with libnvdimm data. Introduce a separate daxctl utility with its own 'list' command for this purpose, and make the listing of device-dax data through 'ndctl list' optional (new --device-dax option). * Enhance 'ndctl list' with the option to filter by namespace mode (new --mode option). * Allow 'ndctl {enable,disable}-region' to limit itself to regions matching a given type (blk or pmem). * Fix 'ndctl list' to trim region mapping data (i.e. the dimms in a region), when a specific dimm is indicated with --dimm. --- Dan Williams (7): ndctl, daxctl: refactor main boilerplate for a new 'daxctl' utility ndctl, daxctl: move json helpers to be available across both utilities ndctl, list: add option to filter namespace by mode ndctl, list: add '--device-dax' option daxctl: add list command ndctl, {enable,disable}-region: filter by type ndctl, list: limit mappings when --dimm is specified Makefile.am |4 + builtin.h | 31 +++ configure.ac |1 daxctl/Makefile.am| 13 +++ daxctl/daxctl.c | 91 + daxctl/lib/Makefile.am|3 + daxctl/libdaxctl.h|1 daxctl/list.c | 112 ++ ndctl.spec.in | 12 +++ ndctl/Makefile.am |3 - ndctl/builtin-bat.c |2 ndctl/builtin-create-nfit.c |2 ndctl/builtin-dimm.c | 14 ++- ndctl/builtin-list.c | 45 ++ ndctl/builtin-test.c |2 ndctl/builtin-xable-region.c | 35 +++- ndctl/builtin-xaction-namespace.c | 10 +- ndctl/builtin.h | 33 ndctl/libndctl.h.in |1 ndctl/ndctl.c | 160 + test/Makefile.am |4 - test/device-dax.c |4 - test/multi-pmem.c |2 util/filter.c | 21 + util/filter.h |6 + util/help.c | 44 ++ util/json.c | 121 ++-- util/json.h |8 ++ util/main.c | 123 util/main.h | 10 ++ 30 files changed, 671 insertions(+), 247 deletions(-) create mode 100644 builtin.h create mode 100644 daxctl/Makefile.am create mode 100644 daxctl/daxctl.c create mode 100644 daxctl/list.c delete mode 100644 ndctl/builtin.h rename ndctl/builtin-help.c => util/help.c rename ndctl/util/json.c => util/json.c rename ndctl/util/json.h => util/json.h create mode 100644 util/main.c create mode 100644 util/main.h ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[RFC PATCH v2 2/2] block: fix blk_get_backing_dev_info() crash, use bdev->bd_queue
The ->bd_queue member of struct block_device was added in commit 87192a2a49c4 ("vfs: cache request_queue in struct block_device") in v3.3. However, blk_get_backing_dev_info() has been using bdev_get_queue() and grabbing the request_queue through the gendisk since before the git era. At final __blkdev_put() time ->bd_disk is cleared while ->bd_queue is not. The queue remains valid until the final put of the parent disk. The following crash signature results from blk_get_backing_dev_info() trying to lookup the queue through ->bd_disk after the final put of the block device. Simply switch bdev_get_queue() to use ->bd_queue directly which is guaranteed to still be valid since the request_queue is alive as long as the inode corresponding to the bdev has not been destroyed. BUG: unable to handle kernel NULL pointer dereference at 0568 IP: blk_get_backing_dev_info+0x10/0x20 [..] Call Trace: __inode_attach_wb+0x3a7/0x5d0 __filemap_fdatawrite_range+0xf8/0x100 filemap_write_and_wait+0x40/0x90 fsync_bdev+0x54/0x60 ? bdget_disk+0x30/0x40 invalidate_partition+0x24/0x50 del_gendisk+0xfa/0x230 Cc: Jan Kara Cc: Jens Axboe Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Wei Fang Cc: Rabin Vincent Cc: Andi Kleen Signed-off-by: Dan Williams --- block/blk-core.c |4 ++-- include/linux/blkdev.h |6 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index e8713137b846..cfd6731dfed7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -108,8 +108,8 @@ void blk_queue_congestion_threshold(struct request_queue *q) * @bdev: device * * Locates the passed device's request queue and returns the address of its - * backing_dev_info. This function can only be called if @bdev is opened - * and the return value is never NULL. + * backing_dev_info. This function can be called until the final iput() + * of the bdev inode. */ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c47c358ba052..fd332da0fc38 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -841,7 +841,11 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { - return bdev->bd_disk->queue;/* this is never NULL */ + /* +* ->bd_queue is valid as long as there is a reference against +* the bdev inode. +*/ + return bdev->bd_queue; } /* ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[RFC PATCH v2 0/2] block: fix backing_dev_info lifetime
v1 of these changes [1] was a one line change to bdev_get_queue() to prevent a shutdown crash when del_gendisk() races the final __blkdev_put(). While it is known at del_gendisk() time that the queue is still alive, Jan Kara points to other paths [2] that are racing __blkdev_put() where the assumption that ->bd_queue, or inode->i_wb is valid does not hold. Fix that broken assumption, make it the case that if you have a live block_device, or block_device-inode that the corresponding queue and inode-write-back data is still valid. These changes survive a run of the libnvdimm unit test suite which puts some stress on the block_device shutdown path. --- Changes since v1 [1]: * Introduce "block: fix lifetime of request_queue / backing_dev_info relative to bdev" to keep the queue allocated and the inode attached for writeback until ->destroy_inode() time. * Rework the comments in "block: fix blk_get_backing_dev_info() crash, use bdev->bd_queue" to reflect the assumptions about the liveness of ->bd_queue. [1]: http://marc.info/?l=linux-block&m=148366637105761&w=2 [2]: http://www.spinics.net/lists/linux-fsdevel/msg105153.html --- Dan Williams (2): block: fix lifetime of request_queue / backing_dev_info relative to bdev block: fix blk_get_backing_dev_info() crash, use bdev->bd_queue block/blk-core.c |7 --- fs/block_dev.c | 25 +++-- include/linux/blkdev.h |6 +- include/linux/fs.h |1 + 4 files changed, 25 insertions(+), 14 deletions(-) ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[RFC PATCH v2 1/2] block: fix lifetime of request_queue / backing_dev_info relative to bdev
By definition the lifetime of a struct block_device is equal to the lifetime of its corresponding inode since they both live in struct bdev_inode. Up until the inode is destroyed it may be the target of delayed write-back requests. Issuing write-back to a block_device requires a lookup of the bdev backing_dev_info that is embedded in the request_queue. It follows that a struct request_queue must be alive as long as a corresponding block device inode is the active target of requests. Match the lifetimes of backing_dev_info and its usage in inode-write-back by arranging for the bdev to take a reference against its request_queue when the bdev is instantiated. Release the request_queue reference when the inode is freed. Cc: Jens Axboe Cc: Jeff Moyer Cc: Christoph Hellwig Reported-by: Jan Kara Signed-off-by: Dan Williams --- block/blk-core.c |3 ++- fs/block_dev.c | 25 +++-- include/linux/fs.h |1 + 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 14d7c0740dc0..e8713137b846 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -378,7 +378,8 @@ EXPORT_SYMBOL(blk_run_queue); void blk_put_queue(struct request_queue *q) { - kobject_put(&q->kobj); + if (q) + kobject_put(&q->kobj); } EXPORT_SYMBOL(blk_put_queue); diff --git a/fs/block_dev.c b/fs/block_dev.c index 899fa8ccc347..8f94b74bfc7d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -589,12 +589,22 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } +static void bdev_release(struct work_struct *w) +{ + struct block_device *bdev = container_of(w, typeof(*bdev), bd_release); + struct bdev_inode *bdi = container_of(bdev, typeof(*bdi), bdev); + + blk_put_queue(bdev->bd_queue); + kmem_cache_free(bdev_cachep, bdi); +} + static void bdev_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - struct bdev_inode *bdi = BDEV_I(inode); + struct block_device *bdev = &BDEV_I(inode)->bdev; - kmem_cache_free(bdev_cachep, bdi); + /* blk_put_queue needs process context */ + schedule_work(&bdev->bd_release); } static void bdev_destroy_inode(struct inode *inode) @@ -613,6 +623,7 @@ static void init_once(void *foo) #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif + INIT_WORK(&bdev->bd_release, bdev_release); inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); @@ -1268,6 +1279,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; + if (!blk_get_queue(disk->queue)) + goto out_clear; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; @@ -1288,7 +1301,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; - bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); put_disk(disk); @@ -1364,7 +1376,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; - bdev->bd_queue = NULL; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; @@ -1586,12 +1597,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) kill_bdev(bdev); bdev_write_inode(bdev); - /* -* Detaching bdev inode from its wb in __destroy_inode() -* is too late: the queue which embeds its bdi (along with -* root wb) can be gone as soon as we put_disk() below. -*/ - inode_detach_wb(bdev->bd_inode); } if (bdev->bd_contains == bdev) { if (disk->fops->release) diff --git a/include/linux/fs.h b/include/linux/fs.h index dc0478c07b2a..f084e4a2198b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -488,6 +488,7 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutexbd_fsfreeze_mutex; + struct work_struct bd_release; }; /* ___ Linux-nvdimm mailing list Linux-nvd
Re: Enabling peer to peer device transactions for PCIe devices
On 06/01/17 11:26 AM, Jason Gunthorpe wrote: > Make a generic API for all of this and you'd have my vote.. > > IMHO, you must support basic pinning semantics - that is necessary to > support generic short lived DMA (eg filesystem, etc). That hardware > can clearly do that if it can support ODP. I agree completely. What we want is for RDMA, O_DIRECT, etc to just work with special VMAs (ie. at least those backed with ZONE_DEVICE memory). Then GPU/NVME/DAX/whatever drivers can just hand these VMAs to userspace (using whatever interface is most appropriate) and userspace can do what it pleases with them. This makes _so_ much sense and actually largely already works today (as demonstrated by iopmem). Though, of course, there are many aspects that could still be improved like denying CPU access to special VMAs and having get_user_pages avoid pinning device memory, etc, etc. But all this would just be enhancements to how VMAs work and not be effected by the basic design described above. We experimented with GPU Direct and the peer memory patchset for IB and they were broken by design. They were just a very specific hack into the IB core and thus didn't help to support O_DIRECT or any other possible DMA user. And the invalidation thing was completely nuts. We had to pray an invalidation would never occur because, if it did, our application would just break. Logan ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [PATCH v5] x86: fix kaslr and memmap collision
On Thu, Jan 5, 2017 at 6:44 PM, Baoquan He wrote: > Add Kees to let him have a look at this too. > > On 01/05/17 at 05:21pm, Baoquan He wrote: >> On 01/04/17 at 11:29am, Dave Jiang wrote: >> > CONFIG_RANDOMIZE_BASE relocates the kernel to a random base address. >> > However it does not take into account the memmap= parameter passed in from >> > the kernel cmdline. This results in the kernel sometimes being put in >> > the middle of memmap. Teaching kaslr to not insert the kernel in >> > memmap defined regions. We will support up to 4 memmap regions. Any >> > additional regions will cause kaslr to disable. The mem_avoid set has >> > been augmented to add up to 4 unusable regions of memmaps provided by the >> > user to exclude those regions from the set of valid address range to insert >> > the uncompressed kernel image. The nn@ss ranges will be skipped by the >> > mem_avoid set since it indicates memory useable. >> > >> > Signed-off-by: Dave Jiang >> > --- >> > >> > v2: >> > Addressing comments from Ingo. >> > - Handle entire list of memmaps >> > v3: >> > Fix 32bit build issue >> > v4: >> > Addressing comments from Baoquan >> > - Not exclude nn@ss ranges >> > v5: >> > Addressing additional comments from Baoquan >> > - Update commit header and various coding style changes >> > >> > diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h >> > index e5612f3..59c2075 100644 >> > --- a/arch/x86/boot/boot.h >> > +++ b/arch/x86/boot/boot.h >> > @@ -332,7 +332,10 @@ int strncmp(const char *cs, const char *ct, size_t >> > count); >> > size_t strnlen(const char *s, size_t maxlen); >> > unsigned int atou(const char *s); >> > unsigned long long simple_strtoull(const char *cp, char **endp, unsigned >> > int base); >> > +unsigned long simple_strtoul(const char *cp, char **endp, unsigned int >> > base); >> > +long simple_strtol(const char *cp, char **endp, unsigned int base); >> > size_t strlen(const char *s); >> > +char *strchr(const char *s, int c); >> > >> > /* tty.c */ >> > void puts(const char *); >> > diff --git a/arch/x86/boot/compressed/kaslr.c >> > b/arch/x86/boot/compressed/kaslr.c >> > index a66854d..036b514 100644 >> > --- a/arch/x86/boot/compressed/kaslr.c >> > +++ b/arch/x86/boot/compressed/kaslr.c >> > @@ -11,6 +11,7 @@ >> > */ >> > #include "misc.h" >> > #include "error.h" >> > +#include "../boot.h" >> > >> > #include >> > #include >> > @@ -56,11 +57,16 @@ struct mem_vector { >> > unsigned long size; >> > }; >> > >> > +/* only supporting at most 4 unusable memmap regions with kaslr */ >> > +#define MAX_MEMMAP_REGIONS 4 >> > + >> > enum mem_avoid_index { >> > MEM_AVOID_ZO_RANGE = 0, >> > MEM_AVOID_INITRD, >> > MEM_AVOID_CMDLINE, >> > MEM_AVOID_BOOTPARAMS, >> > + MEM_AVOID_MEMMAP_BEGIN, >> > + MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1, >> > MEM_AVOID_MAX, >> > }; >> > >> > @@ -77,6 +83,121 @@ static bool mem_overlaps(struct mem_vector *one, >> > struct mem_vector *two) >> > return true; >> > } >> > >> > +/** >> > + * _memparse - parse a string with mem suffixes into a number >> > + * @ptr: Where parse begins >> > + * @retptr: (output) Optional pointer to next char after parse completes >> > + * >> > + * Parses a string into a number. The number stored at @ptr is >> > + * potentially suffixed with K, M, G, T, P, E. >> > + */ >> > +static unsigned long long _memparse(const char *ptr, char **retptr) >> > +{ >> > + char *endptr; /* local pointer to end of parsed string */ >> > + >> > + unsigned long long ret = simple_strtoull(ptr, &endptr, 0); >> > + >> > + switch (*endptr) { >> > + case 'E': >> > + case 'e': >> > + ret <<= 10; >> > + case 'P': >> > + case 'p': >> > + ret <<= 10; >> > + case 'T': >> > + case 't': >> > + ret <<= 10; >> > + case 'G': >> > + case 'g': >> > + ret <<= 10; >> > + case 'M': >> > + case 'm': >> > + ret <<= 10; >> > + case 'K': >> > + case 'k': >> > + ret <<= 10; >> > + endptr++; >> > + default: >> > + break; >> > + } >> > + >> > + if (retptr) >> > + *retptr = endptr; >> > + >> > + return ret; >> > +} >> > + >> > +static int >> > +parse_memmap(char *p, unsigned long long *start, unsigned long long *size) >> > +{ >> > + char *oldp; >> > + >> > + if (!p) >> > + return -EINVAL; >> > + >> > + /* we don't care about this option here */ >> > + if (!strncmp(p, "exactmap", 8)) >> > + return -EINVAL; >> > + >> > + oldp = p; >> > + *size = _memparse(p, &p); >> > + if (p == oldp) >> > + return -EINVAL; >> > + >> > + switch (*p) { >> > + case '@': >> > + /* skip this region, usable */ >> > + *start = 0; >> > + *size = 0; >> > + return 0; >> > + case '#': >> > + case '$': >> > + case '!': >> > + *start = _memparse(p + 1, &p); >> > + return 0; >> > + } >> > + >> > + retur
RE: Enabling peer to peer device transactions for PCIe devices
> -Original Message- > From: Jason Gunthorpe [mailto:jguntho...@obsidianresearch.com] > Sent: Friday, January 06, 2017 1:26 PM > To: Jerome Glisse > Cc: Sagalovitch, Serguei; Jerome Glisse; Deucher, Alexander; 'linux- > ker...@vger.kernel.org'; 'linux-r...@vger.kernel.org'; 'linux- > nvd...@lists.01.org'; 'linux-me...@vger.kernel.org'; 'dri- > de...@lists.freedesktop.org'; 'linux-...@vger.kernel.org'; Kuehling, Felix; > Blinzer, Paul; Koenig, Christian; Suthikulpanit, Suravee; Sander, Ben; > h...@infradead.org; Zhou, David(ChunMing); Yu, Qiang > Subject: Re: Enabling peer to peer device transactions for PCIe devices > > On Fri, Jan 06, 2017 at 12:37:22PM -0500, Jerome Glisse wrote: > > On Fri, Jan 06, 2017 at 11:56:30AM -0500, Serguei Sagalovitch wrote: > > > On 2017-01-05 08:58 PM, Jerome Glisse wrote: > > > > On Thu, Jan 05, 2017 at 05:30:34PM -0700, Jason Gunthorpe wrote: > > > > > On Thu, Jan 05, 2017 at 06:23:52PM -0500, Jerome Glisse wrote: > > > > > > > > > > > > I still don't understand what you driving at - you've said in both > > > > > > > cases a user VMA exists. > > > > > > In the former case no, there is no VMA directly but if you want one > than > > > > > > a device can provide one. But such VMA is useless as CPU access is > not > > > > > > expected. > > > > > I disagree it is useless, the VMA is going to be necessary to support > > > > > upcoming things like CAPI, you need it to support O_DIRECT from the > > > > > filesystem, DPDK, etc. This is why I am opposed to any model that is > > > > > not VMA based for setting up RDMA - that is shorted sighted and > does > > > > > not seem to reflect where the industry is going. > > > > > > > > > > So focus on having VMA backed by actual physical memory that > covers > > > > > your GPU objects and ask how do we wire up the '__user *' to the > DMA > > > > > API in the best way so the DMA API still has enough information to > > > > > setup IOMMUs and whatnot. > > > > I am talking about 2 different thing. Existing hardware and API where > you > > > > _do not_ have a vma and you do not need one. This is just > > > > > existing stuff. > > > > I do not understand why you assume that existing API doesn't need one. > > > I would say that a lot of __existing__ user level API and their support in > > > kernel (especially outside of graphics domain) assumes that we have vma > and > > > deal with __user * pointers. > > +1 > > > Well i am thinking to GPUDirect here. Some of GPUDirect use case do not > have > > vma (struct vm_area_struct) associated with them they directly apply to > GPU > > object that aren't expose to CPU. Yes some use case have vma for share > buffer. > > Lets stop talkind about GPU direct. Today we can't even make VMA > pointing at a PCI bar work properly in the kernel - lets start there > please. People can argue over other options once that is done. > > > For HMM plan is to restrict to ODP and either to replace ODP with HMM or > change > > ODP to not use get_user_pages_remote() but directly fetch informations > from > > CPU page table. Everything else stay as it is. I posted patchset to replace > > ODP with HMM in the past. > > Make a generic API for all of this and you'd have my vote.. > > IMHO, you must support basic pinning semantics - that is necessary to > support generic short lived DMA (eg filesystem, etc). That hardware > can clearly do that if it can support ODP. We would definitely like to have support for hardware that can't handle page faults gracefully. Alex ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH v4] libnvdimm: clear poison in mem map metadata
Clearing out the poison in the metadata block of the namespace before we use it. Range from start + 8k to pfn_sb->dataoff. Signed-off-by: Dave Jiang --- drivers/nvdimm/pfn_devs.c | 25 + 1 file changed, 25 insertions(+) diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index a2ac9e6..fa5ba33 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -527,11 +527,36 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn, .base_pfn = init_altmap_base(base), .reserve = init_altmap_reserve(base), }; + sector_t sector; + resource_size_t meta_start, meta_size; + long cleared; + unsigned int sz_align; memcpy(res, &nsio->res, sizeof(*res)); res->start += start_pad; res->end -= end_trunc; + meta_start = res->start + SZ_8K; + meta_size = offset; + + sector = meta_start >> 9; + sz_align = ALIGN(meta_size + (meta_start & (512 - 1)), 512); + + if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) { + if (!IS_ALIGNED(meta_start, 512) || + !IS_ALIGNED(meta_size, 512)) + return ERR_PTR(-EIO); + + cleared = nvdimm_clear_poison(&nd_pfn->dev, + meta_start, meta_size); + if (cleared <= 0) + return ERR_PTR(-EIO); + + badblocks_clear(&nsio->bb, sector, cleared >> 9); + if (cleared != meta_size) + return ERR_PTR(-EIO); + } + if (nd_pfn->mode == PFN_MODE_RAM) { if (offset < SZ_8K) return ERR_PTR(-EINVAL); ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: Enabling peer to peer device transactions for PCIe devices
On Fri, Jan 06, 2017 at 12:37:22PM -0500, Jerome Glisse wrote: > On Fri, Jan 06, 2017 at 11:56:30AM -0500, Serguei Sagalovitch wrote: > > On 2017-01-05 08:58 PM, Jerome Glisse wrote: > > > On Thu, Jan 05, 2017 at 05:30:34PM -0700, Jason Gunthorpe wrote: > > > > On Thu, Jan 05, 2017 at 06:23:52PM -0500, Jerome Glisse wrote: > > > > > > > > > > I still don't understand what you driving at - you've said in both > > > > > > cases a user VMA exists. > > > > > In the former case no, there is no VMA directly but if you want one > > > > > than > > > > > a device can provide one. But such VMA is useless as CPU access is not > > > > > expected. > > > > I disagree it is useless, the VMA is going to be necessary to support > > > > upcoming things like CAPI, you need it to support O_DIRECT from the > > > > filesystem, DPDK, etc. This is why I am opposed to any model that is > > > > not VMA based for setting up RDMA - that is shorted sighted and does > > > > not seem to reflect where the industry is going. > > > > > > > > So focus on having VMA backed by actual physical memory that covers > > > > your GPU objects and ask how do we wire up the '__user *' to the DMA > > > > API in the best way so the DMA API still has enough information to > > > > setup IOMMUs and whatnot. > > > I am talking about 2 different thing. Existing hardware and API where you > > > _do not_ have a vma and you do not need one. This is just > > > > existing stuff. > > I do not understand why you assume that existing API doesn't need one. > > I would say that a lot of __existing__ user level API and their support in > > kernel (especially outside of graphics domain) assumes that we have vma and > > deal with __user * pointers. +1 > Well i am thinking to GPUDirect here. Some of GPUDirect use case do not have > vma (struct vm_area_struct) associated with them they directly apply to GPU > object that aren't expose to CPU. Yes some use case have vma for share buffer. Lets stop talkind about GPU direct. Today we can't even make VMA pointing at a PCI bar work properly in the kernel - lets start there please. People can argue over other options once that is done. > For HMM plan is to restrict to ODP and either to replace ODP with HMM or > change > ODP to not use get_user_pages_remote() but directly fetch informations from > CPU page table. Everything else stay as it is. I posted patchset to replace > ODP with HMM in the past. Make a generic API for all of this and you'd have my vote.. IMHO, you must support basic pinning semantics - that is necessary to support generic short lived DMA (eg filesystem, etc). That hardware can clearly do that if it can support ODP. Jason ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [PATCH v2 0/4] Write protect DAX PMDs in *sync path
On Thu, Jan 05, 2017 at 05:27:34PM -0800, Andrew Morton wrote: > On Tue, 3 Jan 2017 17:13:49 -0700 Ross Zwisler > wrote: > > > On Thu, Dec 22, 2016 at 02:18:52PM -0700, Ross Zwisler wrote: > > > Currently dax_mapping_entry_mkclean() fails to clean and write protect the > > > pmd_t of a DAX PMD entry during an *sync operation. This can result in > > > data loss, as detailed in patch 4. > > > > > > You can find a working tree here: > > > > > > https://git.kernel.org/cgit/linux/kernel/git/zwisler/linux.git/log/?h=dax_pmd_clean_v2 > > > > > > This series applies cleanly to mmotm-2016-12-19-16-31. > > > > > > Changes since v1: > > > - Included Dan's patch to kill DAX support for UML. > > > - Instead of wrapping the DAX PMD code in dax_mapping_entry_mkclean() in > > >an #ifdef, we now create a stub for pmdp_huge_clear_flush() for the > > > case > > >when CONFIG_TRANSPARENT_HUGEPAGE isn't defined. (Dan & Jan) > > > > > > Dan Williams (1): > > > dax: kill uml support > > > > > > Ross Zwisler (3): > > > dax: add stub for pmdp_huge_clear_flush() > > > mm: add follow_pte_pmd() > > > dax: wrprotect pmd_t in dax_mapping_entry_mkclean > > > > > > fs/Kconfig| 2 +- > > > fs/dax.c | 49 > > > ++- > > > include/asm-generic/pgtable.h | 10 + > > > include/linux/mm.h| 4 ++-- > > > mm/memory.c | 41 > > > 5 files changed, 79 insertions(+), 27 deletions(-) > > > > Well, 0-day found another architecture that doesn't define pmd_pfn() et al., > > so we'll need some more fixes. (Thank you, 0-day, for the coverage!) > > > > I have to apologize, I didn't understand that Dan intended his "dax: kill > > uml > > support" patch to land in v4.11. I thought he intended it as a cleanup to > > my > > series, which really needs to land in v4.10. That's why I folded them > > together into this v2, along with the wrapper suggested by Jan. > > > > Andrew, does it work for you to just keep v1 of this series, and eventually > > send that to Linus for v4.10? > > > > https://lkml.org/lkml/2016/12/20/649 > > > > You've already pulled that one into -mm, and it does correctly solve the > > data > > loss issue. > > > > That would let us deal with getting rid of the #ifdef, blacklisting > > architectures and introducing the pmdp_huge_clear_flush() strub in a > > follow-on > > series for v4.11. > > I have mm-add-follow_pte_pmd.patch and > dax-wrprotect-pmd_t-in-dax_mapping_entry_mkclean.patch queued for 4.10. > Please (re)send any additional patches, indicating for each one > whether you believe it should also go into 4.10? The two patches that you already have queued are correct, and no additional patches are necessary for v4.10 for this issue. ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: Enabling peer to peer device transactions for PCIe devices
On Fri, Jan 06, 2017 at 11:56:30AM -0500, Serguei Sagalovitch wrote: > On 2017-01-05 08:58 PM, Jerome Glisse wrote: > > On Thu, Jan 05, 2017 at 05:30:34PM -0700, Jason Gunthorpe wrote: > > > On Thu, Jan 05, 2017 at 06:23:52PM -0500, Jerome Glisse wrote: > > > > > > > > I still don't understand what you driving at - you've said in both > > > > > cases a user VMA exists. > > > > In the former case no, there is no VMA directly but if you want one than > > > > a device can provide one. But such VMA is useless as CPU access is not > > > > expected. > > > I disagree it is useless, the VMA is going to be necessary to support > > > upcoming things like CAPI, you need it to support O_DIRECT from the > > > filesystem, DPDK, etc. This is why I am opposed to any model that is > > > not VMA based for setting up RDMA - that is shorted sighted and does > > > not seem to reflect where the industry is going. > > > > > > So focus on having VMA backed by actual physical memory that covers > > > your GPU objects and ask how do we wire up the '__user *' to the DMA > > > API in the best way so the DMA API still has enough information to > > > setup IOMMUs and whatnot. > > I am talking about 2 different thing. Existing hardware and API where you > > _do not_ have a vma and you do not need one. This is just existing stuff. > I do not understand why you assume that existing API doesn't need one. > I would say that a lot of __existing__ user level API and their support in > kernel (especially outside of graphics domain) assumes that we have vma and > deal with __user * pointers. Well i am thinking to GPUDirect here. Some of GPUDirect use case do not have vma (struct vm_area_struct) associated with them they directly apply to GPU object that aren't expose to CPU. Yes some use case have vma for share buffer. In the open source driver it is true that we have vma most often than not. > > Some close driver provide a functionality on top of this design. Question > > is do we want to do the same ? If yes and you insist on having a vma we > > could provide one but this is does not apply and is useless for where we > > are going with new hardware. > > > > With new hardware you just use malloc or mmap to allocate memory and then > > you use it directly with the device. Device driver can migrate any part of > > the process address space to device memory. In this scheme you have your > > usual VMAs but there is nothing special about them. > > Assuming that the whole device memory is CPU accessible and it looks > like the direction where we are going: > - You forgot about use case when we want or need to allocate memory > directly on device (why we need to migrate anything if not needed?). > - We may want to use CPU to access such memory on device to avoid > any unnecessary migration back. > - We may have more device memory than the system one. > E.g. if you have 12 GPUs w/64GB each it will already give us ~0.7 TB > not mentioning NVDIMM cards which could also be used as memory > storage for other device access. > - We also may want/need to share GPU memory between different > processes. Here i am talking about platform where GPU memory is not accessible at all by the CPU (because of PCIe restriction, think CPU atomic operation on IO memory). So i really distinguish between CAPI/CCIX and PCIe. Some platform will have CAPI/CCIX other wont. HMM apply mostly to the latter. Some of HMM functionalities are still usefull with CAPI/CCIX. Note that HMM do support allocation on GPU first. In current design this can happen when GPU is the first to access an unpopulated virtual address. For platform where GPU memory is accessible plan is either something like CDM (Coherent Device Memory) or rely on ZONE_DEVICE. So all GPU memory have struct page and those are like ordinary pages. CDM still wants some restrictions like avoiding CPU allocation to happen on GPU when there is memory pressure ... For all intent and purposes this will work transparently in respect to RDMA because we assume on those system that the RDMA is CAPI/CCIX and that it can peer to other device. > > Now when you try to do get_user_page() on any page that is inside the > > device it will fails because we do not allow any device memory to be pin. > > There is various reasons for that and they are not going away in any hw > > in the planing (so for next few years). > > > > Still we do want to support peer to peer mapping. Plan is to only do so > > with ODP capable hardware. Still we need to solve the IOMMU issue and > > it needs special handling inside the RDMA device. The way it works is > > that RDMA ask for a GPU page, GPU check if it has place inside its PCI > > bar to map this page for the device, this can fail. If it succeed then > > you need the IOMMU to let the RDMA device access the GPU PCI bar. > > > > So here we have 2 orthogonal problem. First one is how to make 2 drivers > > talks to each other to setup mapping to allow peer to peer But I would > > assume and
Re: Enabling peer to peer device transactions for PCIe devices
On 2017-01-05 08:58 PM, Jerome Glisse wrote: On Thu, Jan 05, 2017 at 05:30:34PM -0700, Jason Gunthorpe wrote: On Thu, Jan 05, 2017 at 06:23:52PM -0500, Jerome Glisse wrote: I still don't understand what you driving at - you've said in both cases a user VMA exists. In the former case no, there is no VMA directly but if you want one than a device can provide one. But such VMA is useless as CPU access is not expected. I disagree it is useless, the VMA is going to be necessary to support upcoming things like CAPI, you need it to support O_DIRECT from the filesystem, DPDK, etc. This is why I am opposed to any model that is not VMA based for setting up RDMA - that is shorted sighted and does not seem to reflect where the industry is going. So focus on having VMA backed by actual physical memory that covers your GPU objects and ask how do we wire up the '__user *' to the DMA API in the best way so the DMA API still has enough information to setup IOMMUs and whatnot. I am talking about 2 different thing. Existing hardware and API where you _do not_ have a vma and you do not need one. This is just existing stuff. I do not understand why you assume that existing API doesn't need one. I would say that a lot of __existing__ user level API and their support in kernel (especially outside of graphics domain) assumes that we have vma and deal with __user * pointers. Some close driver provide a functionality on top of this design. Question is do we want to do the same ? If yes and you insist on having a vma we could provide one but this is does not apply and is useless for where we are going with new hardware. With new hardware you just use malloc or mmap to allocate memory and then you use it directly with the device. Device driver can migrate any part of the process address space to device memory. In this scheme you have your usual VMAs but there is nothing special about them. Assuming that the whole device memory is CPU accessible and it looks like the direction where we are going: - You forgot about use case when we want or need to allocate memory directly on device (why we need to migrate anything if not needed?). - We may want to use CPU to access such memory on device to avoid any unnecessary migration back. - We may have more device memory than the system one. E.g. if you have 12 GPUs w/64GB each it will already give us ~0.7 TB not mentioning NVDIMM cards which could also be used as memory storage for other device access. - We also may want/need to share GPU memory between different processes. Now when you try to do get_user_page() on any page that is inside the device it will fails because we do not allow any device memory to be pin. There is various reasons for that and they are not going away in any hw in the planing (so for next few years). Still we do want to support peer to peer mapping. Plan is to only do so with ODP capable hardware. Still we need to solve the IOMMU issue and it needs special handling inside the RDMA device. The way it works is that RDMA ask for a GPU page, GPU check if it has place inside its PCI bar to map this page for the device, this can fail. If it succeed then you need the IOMMU to let the RDMA device access the GPU PCI bar. So here we have 2 orthogonal problem. First one is how to make 2 drivers talks to each other to setup mapping to allow peer to peer But I would assume and second is about IOMMU. I think that there is the third problem: A lot of existing user level API (MPI, IB Verbs, file i/o, etc.) deal with pointers to the buffers. Potentially it would be ideally to support use cases when those buffers are located in device memory avoiding any unnecessary migration / double-buffering. Currently a lot of infrastructure in kernel assumes that this is the user pointer and call "get_user_pages" to get s/g. What is your opinion how it should be changed to deal with cases when "buffer" is in device memory? ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: Enabling peer to peer device transactions for PCIe devices
Hello, I've been watching this thread not as a kernel developer, but as an user interested in doing peer-to-peer access between network card and GPU. I believe that merging raw direct access with vma overcomplicates things for our use case. We'll have a very large camera streaming data at high throughput (up to 100 Gbps) to the GPU, which will operate in soft real time mode and write back the results to a RDMA enabled network storage. The CPU will only arrange the connection between GPU and network card. Having things like paging or memory overcommit is possible, but they are not required and they might consistently decrease the quality of the data acquisition. I see my use case something likely to exist for others and a strong reason to split the implementation in two. 2017-01-05 16:01 GMT-03:00 Jason Gunthorpe : > On Thu, Jan 05, 2017 at 01:39:29PM -0500, Jerome Glisse wrote: > >> 1) peer-to-peer because of userspace specific API like NVidia GPU >> direct (AMD is pushing its own similar API i just can't remember >> marketing name). This does not happen through a vma, this happens >> through specific device driver call going through device specific >> ioctl on both side (GPU and RDMA). So both kernel driver are aware >> of each others. > > Today you can only do user-initiated RDMA operations in conjection > with a VMA. > > We'd need a really big and strong reason to create an entirely new > non-VMA based memory handle scheme for RDMA. > > So my inclination is to just completely push back on this idea. You > need a VMA to do RMA. > > GPUs need to create VMAs for the memory they want to RDMA from, even > if the VMA handle just causes SIGBUS for any CPU access. > > Jason > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm