[RFC PATCH v3 6/7] mm/damon/paddr: introduce DAMOS_MIGRATE_HOT action for promotion

2024-04-04 Thread Honggyu Kim
From: Hyeongtak Ji 

This patch introduces DAMOS_MIGRATE_HOT action, which is similar to
DAMOS_MIGRATE_COLD, but it is targeted to migrate hot pages.

It migrates pages inside the given region to the 'target_nid' NUMA node
in the sysfs.

Here is an example usage of this 'migrate_hot' action.

  $ cd /sys/kernel/mm/damon/admin/kdamonds/
  $ cat contexts//schemes//action
  migrate_hot
  $ echo 0 > contexts//schemes//target_nid
  $ echo commit > state
  $ numactl -p 2 ./hot_cold 500M 600M &
  $ numastat -c -p hot_cold

  Per-node process memory usage (in MBs)
  PID Node 0 Node 1 Node 2 Total
  --  -- -- -- -
  701 (hot_cold) 501  0601  1101

Signed-off-by: Hyeongtak Ji 
Signed-off-by: Honggyu Kim 
---
 include/linux/damon.h|  2 ++
 mm/damon/paddr.c | 12 ++--
 mm/damon/sysfs-schemes.c |  4 +++-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/include/linux/damon.h b/include/linux/damon.h
index df8671e69a70..934c95a7c042 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -105,6 +105,7 @@ struct damon_target {
  * @DAMOS_NOHUGEPAGE:  Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
  * @DAMOS_LRU_PRIO:Prioritize the region on its LRU lists.
  * @DAMOS_LRU_DEPRIO:  Deprioritize the region on its LRU lists.
+ * @DAMOS_MIGRATE_HOT:  Migrate for the given hot region.
  * @DAMOS_MIGRATE_COLD: Migrate for the given cold region.
  * @DAMOS_STAT:Do nothing but count the stat.
  * @NR_DAMOS_ACTIONS:  Total number of DAMOS actions
@@ -123,6 +124,7 @@ enum damos_action {
DAMOS_NOHUGEPAGE,
DAMOS_LRU_PRIO,
DAMOS_LRU_DEPRIO,
+   DAMOS_MIGRATE_HOT,
DAMOS_MIGRATE_COLD,
DAMOS_STAT, /* Do nothing but only record the stat */
NR_DAMOS_ACTIONS,
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index fe217a26f788..fd9d35b5cc83 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -229,6 +229,7 @@ static bool damos_pa_filter_out(struct damos *scheme, 
struct folio *folio)
 
 enum migration_mode {
MIG_PAGEOUT,
+   MIG_MIGRATE_HOT,
MIG_MIGRATE_COLD,
 };
 
@@ -375,8 +376,10 @@ static unsigned long damon_pa_migrate(struct damon_region 
*r, struct damos *s,
if (damos_pa_filter_out(s, folio))
goto put_folio;
 
-   folio_clear_referenced(folio);
-   folio_test_clear_young(folio);
+   if (mm != MIG_MIGRATE_HOT) {
+   folio_clear_referenced(folio);
+   folio_test_clear_young(folio);
+   }
if (!folio_isolate_lru(folio))
goto put_folio;
/*
@@ -394,6 +397,7 @@ static unsigned long damon_pa_migrate(struct damon_region 
*r, struct damos *s,
case MIG_PAGEOUT:
applied = reclaim_pages(&folio_list);
break;
+   case MIG_MIGRATE_HOT:
case MIG_MIGRATE_COLD:
applied = damon_pa_migrate_pages(&folio_list, mm,
 s->target_nid);
@@ -454,6 +458,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx 
*ctx,
return damon_pa_mark_accessed(r, scheme);
case DAMOS_LRU_DEPRIO:
return damon_pa_deactivate_pages(r, scheme);
+   case DAMOS_MIGRATE_HOT:
+   return damon_pa_migrate(r, scheme, MIG_MIGRATE_HOT);
case DAMOS_MIGRATE_COLD:
return damon_pa_migrate(r, scheme, MIG_MIGRATE_COLD);
case DAMOS_STAT:
@@ -476,6 +482,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
return damon_hot_score(context, r, scheme);
case DAMOS_LRU_DEPRIO:
return damon_cold_score(context, r, scheme);
+   case DAMOS_MIGRATE_HOT:
+   return damon_hot_score(context, r, scheme);
case DAMOS_MIGRATE_COLD:
return damon_cold_score(context, r, scheme);
default:
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 18b7d054c748..1d2f62aa79ca 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1406,6 +1406,7 @@ static const char * const damon_sysfs_damos_action_strs[] 
= {
"nohugepage",
"lru_prio",
"lru_deprio",
+   "migrate_hot",
"migrate_cold",
"stat",
 };
@@ -1660,7 +1661,8 @@ static ssize_t target_nid_store(struct kobject *kobj,
struct damon_sysfs_scheme, kobj);
int err = 0;
 
-if (scheme->action != DAMOS_MIGRATE_COLD)
+if (scheme->action != DAMOS_MIGRATE_HOT &&
+scheme->action != DAMOS_MIGRATE_COLD)
 return -EINVAL;
 
/* TODO: error handling for target_nid range. */
-- 
2.34.1




[RFC PATCH v3 7/7] mm/damon: Add "damon_migrate_{hot,cold}" vmstat

2024-04-04 Thread Honggyu Kim
This patch adds "damon_migrate_{hot,cold}" under node specific vmstat
counters at the following location.

  /sys/devices/system/node/node*/vmstat

The counted values are accumulated into the global vmstat, so this also
introduces the same counters at /proc/vmstat.

Signed-off-by: Honggyu Kim 
---
 include/linux/mmzone.h |  4 
 mm/damon/paddr.c   | 17 -
 mm/vmstat.c|  4 
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a497f189d988..0005372c5503 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -214,6 +214,10 @@ enum node_stat_item {
PGDEMOTE_KSWAPD,
PGDEMOTE_DIRECT,
PGDEMOTE_KHUGEPAGED,
+#ifdef CONFIG_DAMON_PADDR
+   DAMON_MIGRATE_HOT,
+   DAMON_MIGRATE_COLD,
+#endif
NR_VM_NODE_STAT_ITEMS
 };
 
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index fd9d35b5cc83..d559c242d151 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -235,10 +235,23 @@ enum migration_mode {
 
 static unsigned int migrate_folio_list(struct list_head *migrate_folios,
   struct pglist_data *pgdat,
+  enum migration_mode mm,
   int target_nid)
 {
unsigned int nr_succeeded;
nodemask_t allowed_mask = NODE_MASK_NONE;
+   enum node_stat_item node_stat;
+
+   switch (mm) {
+   case MIG_MIGRATE_HOT:
+   node_stat = DAMON_MIGRATE_HOT;
+   break;
+   case MIG_MIGRATE_COLD:
+   node_stat = DAMON_MIGRATE_COLD;
+   break;
+   default:
+   return 0;
+   }
 
struct migration_target_control mtc = {
/*
@@ -263,6 +276,8 @@ static unsigned int migrate_folio_list(struct list_head 
*migrate_folios,
  (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON,
  &nr_succeeded);
 
+   mod_node_page_state(pgdat, node_stat, nr_succeeded);
+
return nr_succeeded;
 }
 
@@ -302,7 +317,7 @@ static unsigned int damon_pa_migrate_folio_list(struct 
list_head *folio_list,
/* 'folio_list' is always empty here */
 
/* Migrate folios selected for migration */
-   nr_migrated += migrate_folio_list(&migrate_folios, pgdat, target_nid);
+   nr_migrated += migrate_folio_list(&migrate_folios, pgdat, mm, 
target_nid);
/* Folios that could not be migrated are still in @migrate_folios */
if (!list_empty(&migrate_folios)) {
/* Folios which weren't migrated go back on @folio_list */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db79935e4a54..be9ba89fede1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1252,6 +1252,10 @@ const char * const vmstat_text[] = {
"pgdemote_kswapd",
"pgdemote_direct",
"pgdemote_khugepaged",
+#ifdef CONFIG_DAMON_PADDR
+   "damon_migrate_hot",
+   "damon_migrate_cold",
+#endif
 
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
-- 
2.34.1




[RFC PATCH v3 5/7] mm/damon/paddr: introduce DAMOS_MIGRATE_COLD action for demotion

2024-04-04 Thread Honggyu Kim
This patch introduces DAMOS_MIGRATE_COLD action, which is similar to
DAMOS_PAGEOUT, but migrate folios to the given 'target_nid' in the sysfs
instead of swapping them out.

The 'target_nid' sysfs knob is created by this patch to inform the
migration target node ID.

Here is an example usage of this 'migrate_cold' action.

  $ cd /sys/kernel/mm/damon/admin/kdamonds/
  $ cat contexts//schemes//action
  migrate_cold
  $ echo 2 > contexts//schemes//target_nid
  $ echo commit > state
  $ numactl -p 0 ./hot_cold 500M 600M &
  $ numastat -c -p hot_cold

  Per-node process memory usage (in MBs)
  PID Node 0 Node 1 Node 2 Total
  --  -- -- -- -
  701 (hot_cold) 501  0601  1101

Since there are some common routines with pageout, many functions have
similar logic shared between pageout and migrate cold.

damon_pa_migrate_folio_list() is a minimized version of
shrink_folio_list(), but it's simplified only for demotion.

Signed-off-by: Honggyu Kim 
Signed-off-by: Hyeongtak Ji 
---
 include/linux/damon.h|   2 +
 mm/damon/paddr.c | 146 ++-
 mm/damon/sysfs-schemes.c |   4 ++
 3 files changed, 151 insertions(+), 1 deletion(-)

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 24ea33a03d5d..df8671e69a70 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -105,6 +105,7 @@ struct damon_target {
  * @DAMOS_NOHUGEPAGE:  Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
  * @DAMOS_LRU_PRIO:Prioritize the region on its LRU lists.
  * @DAMOS_LRU_DEPRIO:  Deprioritize the region on its LRU lists.
+ * @DAMOS_MIGRATE_COLD: Migrate for the given cold region.
  * @DAMOS_STAT:Do nothing but count the stat.
  * @NR_DAMOS_ACTIONS:  Total number of DAMOS actions
  *
@@ -122,6 +123,7 @@ enum damos_action {
DAMOS_NOHUGEPAGE,
DAMOS_LRU_PRIO,
DAMOS_LRU_DEPRIO,
+   DAMOS_MIGRATE_COLD,
DAMOS_STAT, /* Do nothing but only record the stat */
NR_DAMOS_ACTIONS,
 };
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 277a1c4d833c..fe217a26f788 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -12,6 +12,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #include "../internal.h"
 #include "ops-common.h"
@@ -226,8 +229,137 @@ static bool damos_pa_filter_out(struct damos *scheme, 
struct folio *folio)
 
 enum migration_mode {
MIG_PAGEOUT,
+   MIG_MIGRATE_COLD,
 };
 
+static unsigned int migrate_folio_list(struct list_head *migrate_folios,
+  struct pglist_data *pgdat,
+  int target_nid)
+{
+   unsigned int nr_succeeded;
+   nodemask_t allowed_mask = NODE_MASK_NONE;
+
+   struct migration_target_control mtc = {
+   /*
+* Allocate from 'node', or fail quickly and quietly.
+* When this happens, 'page' will likely just be discarded
+* instead of migrated.
+*/
+   .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | 
__GFP_NOWARN |
+   __GFP_NOMEMALLOC | GFP_NOWAIT,
+   .nid = target_nid,
+   .nmask = &allowed_mask
+   };
+
+   if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE)
+   return 0;
+
+   if (list_empty(migrate_folios))
+   return 0;
+
+   /* Migration ignores all cpuset and mempolicy settings */
+   migrate_pages(migrate_folios, alloc_migrate_folio, NULL,
+ (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON,
+ &nr_succeeded);
+
+   return nr_succeeded;
+}
+
+static unsigned int damon_pa_migrate_folio_list(struct list_head *folio_list,
+   struct pglist_data *pgdat,
+   enum migration_mode mm,
+   int target_nid)
+{
+   unsigned int nr_migrated = 0;
+   struct folio *folio;
+   LIST_HEAD(ret_folios);
+   LIST_HEAD(migrate_folios);
+
+   cond_resched();
+
+   while (!list_empty(folio_list)) {
+   struct folio *folio;
+
+   cond_resched();
+
+   folio = lru_to_folio(folio_list);
+   list_del(&folio->lru);
+
+   if (!folio_trylock(folio))
+   goto keep;
+
+   VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
+
+   /* Relocate its contents to another node. */
+   list_add(&folio->lru, &migrate_folios);
+   folio_unlock(folio);
+   continue;
+keep:
+   list_add(&folio->lru, &ret_folios);
+   VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+   }
+   /* 'folio_list' is always empty here */
+
+   /* Migrate folios selected for migration */
+   nr_migrated += migrate_folio_li

[RFC PATCH v3 3/7] mm/damon/sysfs-schemes: add target_nid on sysfs-schemes

2024-04-04 Thread Honggyu Kim
From: Hyeongtak Ji 

This patch adds target_nid under
  /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//

The 'target_nid' can be used as the destination node for DAMOS actions
such as DAMOS_MIGRATE_{HOT,COLD} in the follow up patches.

Signed-off-by: Hyeongtak Ji 
Signed-off-by: Honggyu Kim 
---
 include/linux/damon.h| 11 ++-
 mm/damon/core.c  |  5 -
 mm/damon/dbgfs.c |  2 +-
 mm/damon/lru_sort.c  |  3 ++-
 mm/damon/reclaim.c   |  3 ++-
 mm/damon/sysfs-schemes.c | 33 -
 6 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 5881e4ac30be..24ea33a03d5d 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -337,6 +337,7 @@ struct damos_access_pattern {
  * @apply_interval_us: The time between applying the @action.
  * @quota: Control the aggressiveness of this scheme.
  * @wmarks:Watermarks for automated (in)activation of this scheme.
+ * @target_nid:Destination node if @action is 
"migrate_{hot,cold}".
  * @filters:   Additional set of &struct damos_filter for &action.
  * @stat:  Statistics of this scheme.
  * @list:  List head for siblings.
@@ -352,6 +353,10 @@ struct damos_access_pattern {
  * monitoring context are inactive, DAMON stops monitoring either, and just
  * repeatedly checks the watermarks.
  *
+ * @target_nid is used to set the migration target node for migrate_hot or
+ * migrate_cold actions, which means it's only meaningful when @action is 
either
+ * "migrate_hot" or "migrate_cold".
+ *
  * Before applying the &action to a memory region, &struct damon_operations
  * implementation could check pages of the region and skip &action to respect
  * &filters
@@ -373,6 +378,9 @@ struct damos {
 /* public: */
struct damos_quota quota;
struct damos_watermarks wmarks;
+   union {
+   int target_nid;
+   };
struct list_head filters;
struct damos_stat stat;
struct list_head list;
@@ -677,7 +685,8 @@ struct damos *damon_new_scheme(struct damos_access_pattern 
*pattern,
enum damos_action action,
unsigned long apply_interval_us,
struct damos_quota *quota,
-   struct damos_watermarks *wmarks);
+   struct damos_watermarks *wmarks,
+   int target_nid);
 void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);
 void damon_destroy_scheme(struct damos *s);
 
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 5b325749fc12..7ff0259d9fa6 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -316,7 +316,8 @@ struct damos *damon_new_scheme(struct damos_access_pattern 
*pattern,
enum damos_action action,
unsigned long apply_interval_us,
struct damos_quota *quota,
-   struct damos_watermarks *wmarks)
+   struct damos_watermarks *wmarks,
+   int target_nid)
 {
struct damos *scheme;
 
@@ -341,6 +342,8 @@ struct damos *damon_new_scheme(struct damos_access_pattern 
*pattern,
scheme->wmarks = *wmarks;
scheme->wmarks.activated = true;
 
+   scheme->target_nid = target_nid;
+
return scheme;
 }
 
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 7dac24e69e3b..d04fdccfa65b 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -279,7 +279,7 @@ static struct damos **str_to_schemes(const char *str, 
ssize_t len,
 
pos += parsed;
scheme = damon_new_scheme(&pattern, action, 0, "a,
-   &wmarks);
+   &wmarks, NUMA_NO_NODE);
if (!scheme)
goto fail;
 
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 3de2916a65c3..3775f0f2743d 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -163,7 +163,8 @@ static struct damos *damon_lru_sort_new_scheme(
/* under the quota. */
"a,
/* (De)activate this according to the watermarks. */
-   &damon_lru_sort_wmarks);
+   &damon_lru_sort_wmarks,
+   NUMA_NO_NODE);
 }
 
 /* Create a DAMON-based operation scheme for hot memory regions */
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 66e190f0374a..84e6e96b5dcc 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -147,7 +147,8 @@ static struct damos *damon_reclaim_new_scheme(void)
/* under the quota. */
&damon_reclaim_quota,
/* (De)activate this according to the watermarks. */
-   &damon_reclaim_wmarks);
+   &damon_reclaim_wmarks,
+   NUMA_NO_N

[RFC PATCH v3 4/7] mm/migrate: add MR_DAMON to migrate_reason

2024-04-04 Thread Honggyu Kim
The current patch series introduces DAMON based migration across NUMA
nodes so it'd be better to have a new migrate_reason in trace events.

Signed-off-by: Honggyu Kim 
---
 include/linux/migrate_mode.h   | 1 +
 include/trace/events/migrate.h | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index f37cc03f9369..cec36b7e7ced 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -29,6 +29,7 @@ enum migrate_reason {
MR_CONTIG_RANGE,
MR_LONGTERM_PIN,
MR_DEMOTION,
+   MR_DAMON,
MR_TYPES
 };
 
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 0190ef725b43..cd01dd7b3640 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -22,7 +22,8 @@
EM( MR_NUMA_MISPLACED,  "numa_misplaced")   \
EM( MR_CONTIG_RANGE,"contig_range") \
EM( MR_LONGTERM_PIN,"longterm_pin") \
-   EMe(MR_DEMOTION,"demotion")
+   EM( MR_DEMOTION,"demotion") \
+   EMe(MR_DAMON,   "damon")
 
 /*
  * First define the enums in the above macros to be exported to userspace
-- 
2.34.1




Re: [PATCH] [v3] module: don't ignore sysfs_create_link() failures

2024-04-04 Thread Greg Kroah-Hartman
On Tue, Mar 26, 2024 at 03:57:18PM +0100, Arnd Bergmann wrote:
> From: Arnd Bergmann 
> 
> The sysfs_create_link() return code is marked as __must_check, but the
> module_add_driver() function tries hard to not care, by assigning the
> return code to a variable. When building with 'make W=1', gcc still
> warns because this variable is only assigned but not used:
> 
> drivers/base/module.c: In function 'module_add_driver':
> drivers/base/module.c:36:6: warning: variable 'no_warn' set but not used 
> [-Wunused-but-set-variable]
> 
> Rework the code to properly unwind and return the error code to the
> caller. My reading of the original code was that it tries to
> not fail when the links already exist, so keep ignoring -EEXIST
> errors.
> 
> Cc: Luis Chamberlain 
> Cc: linux-modu...@vger.kernel.org
> Cc: Greg Kroah-Hartman 
> Cc: "Rafael J. Wysocki" 
> Fixes: e17e0f51aeea ("Driver core: show drivers in /sys/module/")
> See-also: 4a7fb6363f2d ("add __must_check to device management code")
> Signed-off-by: Arnd Bergmann 
> ---
> v3: make error handling stricter, add unwinding,
>  fix build fail with CONFIG_MODULES=n
> v2: rework to actually handle the error. I have not tested the
> error handling beyond build testing, so please review carefully.
> ---
>  drivers/base/base.h   |  9 ++---
>  drivers/base/bus.c|  9 -
>  drivers/base/module.c | 42 +++---
>  3 files changed, 45 insertions(+), 15 deletions(-)

Reviewed-by: Greg Kroah-Hartman 



[RFC PATCH v3 1/7] mm/damon/paddr: refactor DAMOS_PAGEOUT with migration_mode

2024-04-04 Thread Honggyu Kim
This is a preparation patch that introduces migration modes.

The damon_pa_pageout is renamed to damon_pa_migrate and it receives an
extra argument for migration_mode.

No functional changes applied.

Signed-off-by: Honggyu Kim 
---
 mm/damon/paddr.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 081e2a325778..277a1c4d833c 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -224,7 +224,12 @@ static bool damos_pa_filter_out(struct damos *scheme, 
struct folio *folio)
return false;
 }
 
-static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
+enum migration_mode {
+   MIG_PAGEOUT,
+};
+
+static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
+ enum migration_mode mm)
 {
unsigned long addr, applied;
LIST_HEAD(folio_list);
@@ -249,7 +254,14 @@ static unsigned long damon_pa_pageout(struct damon_region 
*r, struct damos *s)
 put_folio:
folio_put(folio);
}
-   applied = reclaim_pages(&folio_list);
+   switch (mm) {
+   case MIG_PAGEOUT:
+   applied = reclaim_pages(&folio_list);
+   break;
+   default:
+   /* Unexpected migration mode. */
+   return 0;
+   }
cond_resched();
return applied * PAGE_SIZE;
 }
@@ -297,7 +309,7 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx 
*ctx,
 {
switch (scheme->action) {
case DAMOS_PAGEOUT:
-   return damon_pa_pageout(r, scheme);
+   return damon_pa_migrate(r, scheme, MIG_PAGEOUT);
case DAMOS_LRU_PRIO:
return damon_pa_mark_accessed(r, scheme);
case DAMOS_LRU_DEPRIO:
-- 
2.34.1




[RFC PATCH v3 2/7] mm: make alloc_demote_folio externally invokable for migration

2024-04-04 Thread Honggyu Kim
The alloc_demote_folio function can be used outside of vmscan.c, so it'd
be better to remove the static keyword from it.

This function can also be used for both demotion and promotion so it'd
be better to rename it from alloc_demote_folio to alloc_migrate_folio.

Signed-off-by: Honggyu Kim 
---
 mm/internal.h |  1 +
 mm/vmscan.c   | 10 +++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index f309a010d50f..c96ff9bc82d0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -866,6 +866,7 @@ extern unsigned long  __must_check vm_mmap_pgoff(struct 
file *, unsigned long,
 unsigned long, unsigned long);
 
 extern void set_pageblock_order(void);
+struct folio *alloc_migrate_folio(struct folio *src, unsigned long private);
 unsigned long reclaim_pages(struct list_head *folio_list);
 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *folio_list);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4255619a1a31..9e456cac03b4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -910,8 +910,7 @@ static void folio_check_dirty_writeback(struct folio *folio,
mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
 }
 
-static struct folio *alloc_demote_folio(struct folio *src,
-   unsigned long private)
+struct folio *alloc_migrate_folio(struct folio *src, unsigned long private)
 {
struct folio *dst;
nodemask_t *allowed_mask;
@@ -935,6 +934,11 @@ static struct folio *alloc_demote_folio(struct folio *src,
if (dst)
return dst;
 
+   /*
+* Allocation failed from the target node so try to allocate from
+* fallback nodes based on allowed_mask.
+* See fallback_alloc() at mm/slab.c.
+*/
mtc->gfp_mask &= ~__GFP_THISNODE;
mtc->nmask = allowed_mask;
 
@@ -973,7 +977,7 @@ static unsigned int demote_folio_list(struct list_head 
*demote_folios,
node_get_allowed_targets(pgdat, &allowed_mask);
 
/* Demotion ignores all cpuset and mempolicy settings */
-   migrate_pages(demote_folios, alloc_demote_folio, NULL,
+   migrate_pages(demote_folios, alloc_migrate_folio, NULL,
  (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
  &nr_succeeded);
 
-- 
2.34.1




[RFC PATCH v3 0/7] DAMON based tiered memory management for CXL memory

2024-04-04 Thread Honggyu Kim
There was an RFC IDEA "DAMOS-based Tiered-Memory Management" previously
posted at [1].

It says there is no implementation of the demote/promote DAMOS action
are made.  This RFC is about its implementation for physical address
space.


Changes from RFC v2:
  1. Rename DAMOS_{PROMOTE,DEMOTE} actions to DAMOS_MIGRATE_{HOT,COLD}.
  2. Create 'target_nid' to set the migration target node instead of
 depending on node distance based information.
  3. Instead of having page level access check in this patch series,
 delegate the job to a new DAMOS filter type YOUNG[2].
  4. Introduce vmstat counters "damon_migrate_{hot,cold}".
  5. Rebase from v6.7 to v6.8.

Changes from RFC:
  1. Move most of implementation from mm/vmscan.c to mm/damon/paddr.c.
  2. Simplify some functions of vmscan.c and used in paddr.c, but need
 to be reviewed more in depth.
  3. Refactor most functions for common usage for both promote and
 demote actions and introduce an enum migration_mode for its control.
  4. Add "target_nid" sysfs knob for migration destination node for both
 promote and demote actions.
  5. Move DAMOS_PROMOTE before DAMOS_DEMOTE and move then even above
 DAMOS_STAT.


Introduction


With the advent of CXL/PCIe attached DRAM, which will be called simply
as CXL memory in this cover letter, some systems are becoming more
heterogeneous having memory systems with different latency and bandwidth
characteristics.  They are usually handled as different NUMA nodes in
separate memory tiers and CXL memory is used as slow tiers because of
its protocol overhead compared to local DRAM.

In this kind of systems, we need to be careful placing memory pages on
proper NUMA nodes based on the memory access frequency.  Otherwise, some
frequently accessed pages might reside on slow tiers, causing unexpected
performance degradation.  Moreover, the memory access
patterns can be changed at runtime.

To handle this problem, we need a way to monitor the memory access
patterns and migrate pages based on their access temperature.  The
DAMON(Data Access MONitor) framework and its DAMOS(DAMON-based Operation
Schemes) can be useful features for monitoring and migrating pages.
DAMOS provides multiple actions based on DAMON monitoring results and it
can be used for proactive reclaim, which means swapping cold pages out
with DAMOS_PAGEOUT action, but it doesn't support migration actions such
as demotion and promotion between tiered memory nodes.

This series supports two new DAMOS actions; DAMOS_MIGRATE_HOT for
promotion from slow tiers and DAMOS_MIGRATE_COLD for demotion from fast
tiers.  This prevents hot pages from being stuck on slow tiers, which
makes performance degradation and cold pages can be proactively demoted
to slow tiers so that the system can increase the chance to allocate
more hot pages to fast tiers.

The DAMON provides various tuning knobs but we found that the proactive
demotion for cold pages is especially useful when the system is running
out of memory on its fast tier nodes.

Our evaluation result shows that it reduces the performance slowdown
compared to the default memory policy from 17~18% to 4~5% when the
system runs under high memory pressure on its fast tier DRAM nodes.


DAMON configuration
===

The specific DAMON configuration doesn't have to be in the scope of this
patch series, but some rough idea is better to be shared to explain the
evaluation result.

The DAMON provides many knobs for fine tuning but its configuration file
is generated by HMSDK[3].  It includes gen_config.py script that
generates a json file with the full config of DAMON knobs and it creates
multiple kdamonds for each NUMA node when the DAMON is enabled so that
it can run hot/cold based migration for tiered memory.


Evaluation Workload
===

The performance evaluation is done with redis[4], which is a widely used
in-memory database and the memory access patterns are generated via
YCSB[5].  We have measured two different workloads with zipfian and
latest distributions but their configs are slightly modified to make
memory usage higher and execution time longer for better evaluation.

The idea of evaluation using these migrate_{hot,cold} actions covers
system-wide memory management rather than partitioning hot/cold pages of
a single workload.  The default memory allocation policy creates pages
to the fast tier DRAM node first, then allocates newly created pages to
the slow tier CXL node when the DRAM node has insufficient free space.
Once the page allocation is done then those pages never move between
NUMA nodes.  It's not true when using numa balancing, but it is not the
scope of this DAMON based tiered memory management support.

If the working set of redis can be fit fully into the DRAM node, then
the redis will access the fast DRAM only.  Since the performance of DRAM
only is faster than partially accessing CXL memory in slow tiers, this
environment is not useful to evaluate t

Re: [PATCH fs/proc/bootconfig] remove redundant comments from /proc/bootconfig

2024-04-04 Thread Paul E. McKenney
On Fri, Apr 05, 2024 at 11:57:45AM +0900, Masami Hiramatsu wrote:
> On Fri, 5 Apr 2024 10:23:24 +0900
> Masami Hiramatsu (Google)  wrote:
> 
> > On Thu, 4 Apr 2024 10:43:14 -0700
> > "Paul E. McKenney"  wrote:
> > 
> > > On Thu, Apr 04, 2024 at 08:55:22AM +0900, Masami Hiramatsu wrote:
> > > > On Wed, 3 Apr 2024 12:16:28 -0700
> > > > "Paul E. McKenney"  wrote:
> > > > 
> > > > > commit 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to
> > > > > /proc/bootconfig") adds bootloader argument comments into 
> > > > > /proc/bootconfig.
> > > > > 
> > > > > /proc/bootconfig shows boot_command_line[] multiple times following
> > > > > every xbc key value pair, that's duplicated and not necessary.
> > > > > Remove redundant ones.
> > > > > 
> > > > > Output before and after the fix is like:
> > > > > key1 = value1
> > > > > *bootloader argument comments*
> > > > > key2 = value2
> > > > > *bootloader argument comments*
> > > > > key3 = value3
> > > > > *bootloader argument comments*
> > > > > ...
> > > > > 
> > > > > key1 = value1
> > > > > key2 = value2
> > > > > key3 = value3
> > > > > *bootloader argument comments*
> > > > > ...
> > > > > 
> > > > > Fixes: 717c7c894d4b ("fs/proc: Add boot loader arguments as comment 
> > > > > to /proc/bootconfig")
> > > > > Signed-off-by: Zhenhua Huang 
> > > > > Signed-off-by: Paul E. McKenney 
> > > > > Cc: Masami Hiramatsu 
> > > > > Cc: 
> > > > > Cc: 
> > > > 
> > > > OOps, good catch! Let me pick it.
> > > > 
> > > > Acked-by: Masami Hiramatsu (Google) 
> > > 
> > > Thank you, and I have applied your ack and pulled this into its own
> > > bootconfig.2024.04.04a.
> > > 
> > > My guess is that you will push this via your own tree, and so I will
> > > drop my copy as soon as yours hits -next.
> > 
> > Thanks! I would like to make PR this soon as bootconfig fixes for v6.9-rc2.
> 
> Hmm I found that this always shows the command line comment in
> /proc/bootconfig even without "bootconfig" option.
> I think that is easier for user-tools but changes the behavior and
> a bit redundant.
> 
> We should skip showing this original argument comment if bootconfig is
> not initialized (no "bootconfig" in cmdline) as it is now.

So something like this folded into that patch?



diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index e5635a6b127b0..7d2520378f5f2 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -63,7 +63,7 @@ static int __init copy_xbc_key_value_list(char *dst, size_t 
size)
dst += ret;
}
}
-   if (ret >= 0 && boot_command_line[0]) {
+   if (bootconfig_is_present() && ret >= 0 && boot_command_line[0]) {
ret = snprintf(dst, rest(dst, end), "# Parameters from 
bootloader:\n# %s\n",
   boot_command_line);
if (ret > 0)
diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index ca73940e26df8..ef70d1b381421 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -10,6 +10,7 @@
 #ifdef __KERNEL__
 #include 
 #include 
+int bootconfig_is_present(void);
 #else /* !__KERNEL__ */
 /*
  * NOTE: This is only for tools/bootconfig, because tools/bootconfig will
diff --git a/init/main.c b/init/main.c
index 2ca52474d0c30..720a669b1493d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1572,3 +1572,8 @@ static noinline void __init kernel_init_freeable(void)
 
integrity_load_keys();
 }
+
+int bootconfig_is_present(void)
+{
+   return bootconfig_found || IS_ENABLED(CONFIG_BOOT_CONFIG_FORCE);
+}



Give or take placement of the bootconfig_is_present() function's
declaration and definition.

Thanx, Paul

Thanx, Paul

> Thank you,
> 
> 
> > Thank you,
> > 
> > > 
> > >   Thanx, Paul
> > > 
> > > > Thank you!
> > > > 
> > > > > 
> > > > > diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
> > > > > index 902b326e1e560..e5635a6b127b0 100644
> > > > > --- a/fs/proc/bootconfig.c
> > > > > +++ b/fs/proc/bootconfig.c
> > > > > @@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char 
> > > > > *dst, size_t size)
> > > > >   break;
> > > > >   dst += ret;
> > > > >   }
> > > > > - if (ret >= 0 && boot_command_line[0]) {
> > > > > - ret = snprintf(dst, rest(dst, end), "# 
> > > > > Parameters from bootloader:\n# %s\n",
> > > > > -boot_command_line);
> > > > > - if (ret > 0)
> > > > > - dst += ret;
> > > > > - }
> > > > > + }
> > > > > + if (ret >= 0 && boot_command_line[0]) {
> > > > > + ret 

Re: [PATCH v10 08/14] x86/sgx: Add basic EPC reclamation flow for cgroup

2024-04-04 Thread Huang, Kai
On Thu, 2024-04-04 at 12:05 -0500, Haitao Huang wrote:
> > > -static inline int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg)
> > > +static inline int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg,  
> > > enum sgx_reclaim r)
> > 
> > Is the @r here intentional for shorter typing?
> > 
> 
> yes :-)
> Will speel out to make it consistent if that's the concern.

I kinda prefer the full name to match the CONFIG_CGROUP_SGX_EPC on case.  You
can put the 'enum sgx_reclaim reclaim' parameter into the new line if needed:

static inline int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg,
enum sgx_reclaim reclaim)
{
return 0;
}


Re: [PATCH v10 08/14] x86/sgx: Add basic EPC reclamation flow for cgroup

2024-04-04 Thread Huang, Kai
On Thu, 2024-04-04 at 12:05 -0500, Haitao Huang wrote:
> > Please also mention why "leaving asynchronous reclamation to later  
> > patch(es)" is
> > fine.  E.g., it won't break anything I suppose.
> > 
> 
> Right. Pages are still in the global list at the moment and only global  
> reclaiming is active until the "turn on" patch. Separating out is really  
> just for the purpose of review IMHO.

Sounds good to me.  Thanks.


Re: [PATCH fs/proc/bootconfig] remove redundant comments from /proc/bootconfig

2024-04-04 Thread Google
On Fri, 5 Apr 2024 10:23:24 +0900
Masami Hiramatsu (Google)  wrote:

> On Thu, 4 Apr 2024 10:43:14 -0700
> "Paul E. McKenney"  wrote:
> 
> > On Thu, Apr 04, 2024 at 08:55:22AM +0900, Masami Hiramatsu wrote:
> > > On Wed, 3 Apr 2024 12:16:28 -0700
> > > "Paul E. McKenney"  wrote:
> > > 
> > > > commit 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to
> > > > /proc/bootconfig") adds bootloader argument comments into 
> > > > /proc/bootconfig.
> > > > 
> > > > /proc/bootconfig shows boot_command_line[] multiple times following
> > > > every xbc key value pair, that's duplicated and not necessary.
> > > > Remove redundant ones.
> > > > 
> > > > Output before and after the fix is like:
> > > > key1 = value1
> > > > *bootloader argument comments*
> > > > key2 = value2
> > > > *bootloader argument comments*
> > > > key3 = value3
> > > > *bootloader argument comments*
> > > > ...
> > > > 
> > > > key1 = value1
> > > > key2 = value2
> > > > key3 = value3
> > > > *bootloader argument comments*
> > > > ...
> > > > 
> > > > Fixes: 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to 
> > > > /proc/bootconfig")
> > > > Signed-off-by: Zhenhua Huang 
> > > > Signed-off-by: Paul E. McKenney 
> > > > Cc: Masami Hiramatsu 
> > > > Cc: 
> > > > Cc: 
> > > 
> > > OOps, good catch! Let me pick it.
> > > 
> > > Acked-by: Masami Hiramatsu (Google) 
> > 
> > Thank you, and I have applied your ack and pulled this into its own
> > bootconfig.2024.04.04a.
> > 
> > My guess is that you will push this via your own tree, and so I will
> > drop my copy as soon as yours hits -next.
> 
> Thanks! I would like to make PR this soon as bootconfig fixes for v6.9-rc2.
> 

Hmm I found that this always shows the command line comment in
/proc/bootconfig even without "bootconfig" option.
I think that is easier for user-tools but changes the behavior and
is a bit redundant.

We should skip showing this original argument comment if bootconfig is
not initialized (no "bootconfig" in cmdline) as it is now.

Thank you,


> Thank you,
> 
> > 
> > Thanx, Paul
> > 
> > > Thank you!
> > > 
> > > > 
> > > > diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
> > > > index 902b326e1e560..e5635a6b127b0 100644
> > > > --- a/fs/proc/bootconfig.c
> > > > +++ b/fs/proc/bootconfig.c
> > > > @@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char 
> > > > *dst, size_t size)
> > > > break;
> > > > dst += ret;
> > > > }
> > > > -   if (ret >= 0 && boot_command_line[0]) {
> > > > -   ret = snprintf(dst, rest(dst, end), "# 
> > > > Parameters from bootloader:\n# %s\n",
> > > > -  boot_command_line);
> > > > -   if (ret > 0)
> > > > -   dst += ret;
> > > > -   }
> > > > +   }
> > > > +   if (ret >= 0 && boot_command_line[0]) {
> > > > +   ret = snprintf(dst, rest(dst, end), "# Parameters from 
> > > > bootloader:\n# %s\n",
> > > > +  boot_command_line);
> > > > +   if (ret > 0)
> > > > +   dst += ret;
> > > > }
> > > >  out:
> > > > kfree(key);
> > > 
> > > 
> > > -- 
> > > Masami Hiramatsu (Google) 
> 
> 
> -- 
> Masami Hiramatsu (Google) 


-- 
Masami Hiramatsu (Google) 



Re: [PATCH v10 05/14] x86/sgx: Implement basic EPC misc cgroup functionality

2024-04-04 Thread Huang, Kai
On Thu, 2024-04-04 at 20:24 -0500, Haitao Huang wrote:
> > Again, IMHO having CONFIG_CGROUP_SGX_EPC here is ugly, because it  
> > doesn't even
> > match the try_charge() above, which doesn't have the  
> > CONFIG_CGROUP_SGX_EPC.
> > 
> > If you add a wrapper in "epc_cgroup.h"
> > 
> Agree. but in sgx.h so sgx_epc_page struct is not exposed in epc_cgroup.h.

I am fine with any place that suits.


Re: [PATCH v10 05/14] x86/sgx: Implement basic EPC misc cgroup functionality

2024-04-04 Thread Haitao Huang

On Thu, 28 Mar 2024 07:53:45 -0500, Huang, Kai  wrote:




--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2022 Intel Corporation.


It's 2024 now.

And looks you need to use C style comment for /* Copyright ... */, after  
looking

at some other C files.


Ok, will update years and use C style.


+
+#include 
+#include 
+#include "epc_cgroup.h"
+
+/* The root SGX EPC cgroup */
+static struct sgx_cgroup sgx_cg_root;
+
+/**
+ * sgx_cgroup_try_charge() - try to charge cgroup for a single EPC page
+ *
+ * @sgx_cg:The EPC cgroup to be charged for the page.
+ * Return:
+ * * %0 - If successfully charged.
+ * * -errno - for failures.
+ */
+int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg)
+{
+   return misc_cg_try_charge(MISC_CG_RES_SGX_EPC, sgx_cg->cg, PAGE_SIZE);
+}
+
+/**
+ * sgx_cgroup_uncharge() - uncharge a cgroup for an EPC page
+ * @sgx_cg:The charged sgx cgroup
+ */
+void sgx_cgroup_uncharge(struct sgx_cgroup *sgx_cg)
+{
+   misc_cg_uncharge(MISC_CG_RES_SGX_EPC, sgx_cg->cg, PAGE_SIZE);
+}
+
+static void sgx_cgroup_free(struct misc_cg *cg)
+{
+   struct sgx_cgroup *sgx_cg;
+
+   sgx_cg = sgx_cgroup_from_misc_cg(cg);
+   if (!sgx_cg)
+   return;
+
+   kfree(sgx_cg);
+}
+
+static int sgx_cgroup_alloc(struct misc_cg *cg);


Again, this declaration can be removed if you move the below structure  
...



+
+const struct misc_res_ops sgx_cgroup_ops = {
+   .alloc = sgx_cgroup_alloc,
+   .free = sgx_cgroup_free,
+};
+
+static void sgx_cgroup_misc_init(struct misc_cg *cg, struct sgx_cgroup  
*sgx_cg)

+{
+   cg->res[MISC_CG_RES_SGX_EPC].priv = sgx_cg;
+   sgx_cg->cg = cg;
+}
+
+static int sgx_cgroup_alloc(struct misc_cg *cg)
+{
+   struct sgx_cgroup *sgx_cg;
+
+   sgx_cg = kzalloc(sizeof(*sgx_cg), GFP_KERNEL);
+   if (!sgx_cg)
+   return -ENOMEM;
+
+   sgx_cgroup_misc_init(cg, sgx_cg);
+
+   return 0;
+}


... here.



yes, thanks


+
+void sgx_cgroup_init(void)
+{
+   misc_cg_set_ops(MISC_CG_RES_SGX_EPC, &sgx_cgroup_ops);
+   sgx_cgroup_misc_init(misc_cg_root(), &sgx_cg_root);
+}
diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.h  
b/arch/x86/kernel/cpu/sgx/epc_cgroup.h

new file mode 100644
index ..8f794e23fad6
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2022 Intel Corporation. */
+#ifndef _SGX_EPC_CGROUP_H_
+#define _SGX_EPC_CGROUP_H_
+
+#include 
+#include 
+#include 
+
+#include "sgx.h"
+
+#ifndef CONFIG_CGROUP_SGX_EPC


Nit: add an empty line to make text more breathable.



ok


+#define MISC_CG_RES_SGX_EPC MISC_CG_RES_TYPES
+struct sgx_cgroup;
+
+static inline struct sgx_cgroup *sgx_get_current_cg(void)
+{
+   return NULL;
+}
+
+static inline void sgx_put_cg(struct sgx_cgroup *sgx_cg) { }
+
+static inline int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg)
+{
+   return 0;
+}
+
+static inline void sgx_cgroup_uncharge(struct sgx_cgroup *sgx_cg) { }
+
+static inline void sgx_cgroup_init(void) { }
+#else


Nit: I prefer two empty lines before and after the 'else'.



ok


+struct sgx_cgroup {
+   struct misc_cg *cg;
+};
+
+static inline struct sgx_cgroup *sgx_cgroup_from_misc_cg(struct  
misc_cg *cg)

+{
+   return (struct sgx_cgroup *)(cg->res[MISC_CG_RES_SGX_EPC].priv);
+}
+
+/**
+ * sgx_get_current_cg() - get the EPC cgroup of current process.
+ *
+ * Returned cgroup has its ref count increased by 1. Caller must call
+ * sgx_put_cg() to return the reference.
+ *
+ * Return: EPC cgroup to which the current task belongs to.
+ */
+static inline struct sgx_cgroup *sgx_get_current_cg(void)
+{
+   return sgx_cgroup_from_misc_cg(get_current_misc_cg());
+}


Again, I _think_ you need to check whether get_current_misc_cg() returns  
NULL?


Misc cgroup can be disabled by command line even it is on in the Kconfig.

I am not expert on cgroup, so could you check on this?



Good catch. Will add NULL check in sgx_cgroup_from_misc_cg()


+
+/**
+ * sgx_put_sgx_cg() - Put the EPC cgroup and reduce its ref count.
+ * @sgx_cg - EPC cgroup to put.
+ */
+static inline void sgx_put_cg(struct sgx_cgroup *sgx_cg)
+{
+   if (sgx_cg)
+   put_misc_cg(sgx_cg->cg);
+}
+
+int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg);
+void sgx_cgroup_uncharge(struct sgx_cgroup *sgx_cg);
+void sgx_cgroup_init(void);
+
+#endif
+
+#endif /* _SGX_EPC_CGROUP_H_ */
diff --git a/arch/x86/kernel/cpu/sgx/main.c  
b/arch/x86/kernel/cpu/sgx/main.c

index d219f14365d4..023af54c1beb 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -17,6 +18,7 @@
 #include "driver.h"
 #include "encl.h"
 #include "encls.h"
+#include "epc_cgroup.h"

 struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
 static int sgx_nr_epc

Re: [PATCH fs/proc/bootconfig] remove redundant comments from /proc/bootconfig

2024-04-04 Thread Google
On Thu, 4 Apr 2024 10:43:14 -0700
"Paul E. McKenney"  wrote:

> On Thu, Apr 04, 2024 at 08:55:22AM +0900, Masami Hiramatsu wrote:
> > On Wed, 3 Apr 2024 12:16:28 -0700
> > "Paul E. McKenney"  wrote:
> > 
> > > commit 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to
> > > /proc/bootconfig") adds bootloader argument comments into 
> > > /proc/bootconfig.
> > > 
> > > /proc/bootconfig shows boot_command_line[] multiple times following
> > > every xbc key value pair, that's duplicated and not necessary.
> > > Remove redundant ones.
> > > 
> > > Output before and after the fix is like:
> > > key1 = value1
> > > *bootloader argument comments*
> > > key2 = value2
> > > *bootloader argument comments*
> > > key3 = value3
> > > *bootloader argument comments*
> > > ...
> > > 
> > > key1 = value1
> > > key2 = value2
> > > key3 = value3
> > > *bootloader argument comments*
> > > ...
> > > 
> > > Fixes: 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to 
> > > /proc/bootconfig")
> > > Signed-off-by: Zhenhua Huang 
> > > Signed-off-by: Paul E. McKenney 
> > > Cc: Masami Hiramatsu 
> > > Cc: 
> > > Cc: 
> > 
> > OOps, good catch! Let me pick it.
> > 
> > Acked-by: Masami Hiramatsu (Google) 
> 
> Thank you, and I have applied your ack and pulled this into its own
> bootconfig.2024.04.04a.
> 
> My guess is that you will push this via your own tree, and so I will
> drop my copy as soon as yours hits -next.

Thanks! I would like to make PR this soon as bootconfig fixes for v6.9-rc2.

Thank you,

> 
>   Thanx, Paul
> 
> > Thank you!
> > 
> > > 
> > > diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
> > > index 902b326e1e560..e5635a6b127b0 100644
> > > --- a/fs/proc/bootconfig.c
> > > +++ b/fs/proc/bootconfig.c
> > > @@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, 
> > > size_t size)
> > >   break;
> > >   dst += ret;
> > >   }
> > > - if (ret >= 0 && boot_command_line[0]) {
> > > - ret = snprintf(dst, rest(dst, end), "# Parameters from 
> > > bootloader:\n# %s\n",
> > > -boot_command_line);
> > > - if (ret > 0)
> > > - dst += ret;
> > > - }
> > > + }
> > > + if (ret >= 0 && boot_command_line[0]) {
> > > + ret = snprintf(dst, rest(dst, end), "# Parameters from 
> > > bootloader:\n# %s\n",
> > > +boot_command_line);
> > > + if (ret > 0)
> > > + dst += ret;
> > >   }
> > >  out:
> > >   kfree(key);
> > 
> > 
> > -- 
> > Masami Hiramatsu (Google) 


-- 
Masami Hiramatsu (Google) 



Re: [PATCHv2 1/3] uprobe: Add uretprobe syscall to speed up return probe

2024-04-04 Thread Google
On Thu, 4 Apr 2024 18:11:09 +0200
Oleg Nesterov  wrote:

> On 04/05, Masami Hiramatsu wrote:
> >
> > Can we make this syscall and uprobe behavior clearer? As you said, if
> > the application use sigreturn or longjump, it may skip returns and
> > shadow stack entries are left in the kernel. In such cases, can uretprobe
> > detect it properly, or just crash the process (or process runs wrongly)?
> 
> Please see the comment in handle_trampoline(), it tries to detect this case.
> This patch should not make any difference.

I think you mean this loop will skip and discard the stacked return_instance
to find the valid one.


do {
/*
 * We should throw out the frames invalidated by longjmp().
 * If this chain is valid, then the next one should be alive
 * or NULL; the latter case means that nobody but ri->func
 * could hit this trampoline on return. TODO: sigaltstack().
 */
next = find_next_ret_chain(ri);
valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, 
regs);

instruction_pointer_set(regs, ri->orig_ret_vaddr);
do {
if (valid)
handle_uretprobe_chain(ri, regs);
ri = free_ret_instance(ri);
utask->depth--;
} while (ri != next);
} while (!valid);


I think this expects setjmp/longjmp as below

foo() { <- retprobe1
setjmp()
bar() { <- retprobe2
longjmp()
}
} <- return to trampoline

In this case, we need to skip retprobe2's instance.
My concern is, if we can not find an appropriate return instance, what happens?
e.g.

foo() { <-- retprobe1
   bar() { # sp is decremented
   sys_uretprobe() <-- ??
}
}

It seems sys_uretprobe() will handle retprobe1 at that point instead of
SIGILL.

Can we avoid this with below strict check?

if (ri->stack != regs->sp + expected_offset)
goto sigill;

expected_offset should be 16 (push * 3 - ret) on x64 if we ri->stack is the
regs->sp right after call.

Thank you,

-- 
Masami Hiramatsu (Google) 



[PATCH v11 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-04-04 Thread Ho-Ren (Jack) Chuang
The current implementation treats emulated memory devices, such as
CXL1.1 type3 memory, as normal DRAM when they are emulated as normal memory
(E820_TYPE_RAM). However, these emulated devices have different
characteristics than traditional DRAM, making it important to
distinguish them. Thus, we modify the tiered memory initialization process
to introduce a delay specifically for CPUless NUMA nodes. This delay
ensures that the memory tier initialization for these nodes is deferred
until HMAT information is obtained during the boot process. Finally,
demotion tables are recalculated at the end.

* late_initcall(memory_tier_late_init);
Some device drivers may have initialized memory tiers between
`memory_tier_init()` and `memory_tier_late_init()`, potentially bringing
online memory nodes and configuring memory tiers. They should be excluded
in the late init.

* Handle cases where there is no HMAT when creating memory tiers
There is a scenario where a CPUless node does not provide HMAT information.
If no HMAT is specified, it falls back to using the default DRAM tier.

* Introduce another new lock `default_dram_perf_lock` for adist calculation
In the current implementation, iterating through CPUlist nodes requires
holding the `memory_tier_lock`. However, `mt_calc_adistance()` will end up
trying to acquire the same lock, leading to a potential deadlock.
Therefore, we propose introducing a standalone `default_dram_perf_lock` to
protect `default_dram_perf_*`. This approach not only avoids deadlock
but also prevents holding a large lock simultaneously.

* Upgrade `set_node_memory_tier` to support additional cases, including
  default DRAM, late CPUless, and hot-plugged initializations.
To cover hot-plugged memory nodes, `mt_calc_adistance()` and
`mt_find_alloc_memory_type()` are moved into `set_node_memory_tier()` to
handle cases where memtype is not initialized and where HMAT information is
available.

* Introduce `default_memory_types` for those memory types that are not
  initialized by device drivers.
Because late initialized memory and default DRAM memory need to be managed,
a default memory type is created for storing all memory types that are
not initialized by device drivers and as a fallback.

Signed-off-by: Ho-Ren (Jack) Chuang 
Signed-off-by: Hao Xiang 
Reviewed-by: "Huang, Ying" 
---
 mm/memory-tiers.c | 94 +++
 1 file changed, 70 insertions(+), 24 deletions(-)

diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 516b144fd45a..6632102bd5c9 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -36,6 +36,11 @@ struct node_memory_type_map {
 
 static DEFINE_MUTEX(memory_tier_lock);
 static LIST_HEAD(memory_tiers);
+/*
+ * The list is used to store all memory types that are not created
+ * by a device driver.
+ */
+static LIST_HEAD(default_memory_types);
 static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
 struct memory_dev_type *default_dram_type;
 
@@ -108,6 +113,8 @@ static struct demotion_nodes *node_demotion __read_mostly;
 
 static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
 
+/* The lock is used to protect `default_dram_perf*` info and nid. */
+static DEFINE_MUTEX(default_dram_perf_lock);
 static bool default_dram_perf_error;
 static struct access_coordinate default_dram_perf;
 static int default_dram_perf_ref_nid = NUMA_NO_NODE;
@@ -505,7 +512,8 @@ static inline void __init_node_memory_type(int node, struct 
memory_dev_type *mem
 static struct memory_tier *set_node_memory_tier(int node)
 {
struct memory_tier *memtier;
-   struct memory_dev_type *memtype;
+   struct memory_dev_type *memtype = default_dram_type;
+   int adist = MEMTIER_ADISTANCE_DRAM;
pg_data_t *pgdat = NODE_DATA(node);
 
 
@@ -514,7 +522,16 @@ static struct memory_tier *set_node_memory_tier(int node)
if (!node_state(node, N_MEMORY))
return ERR_PTR(-EINVAL);
 
-   __init_node_memory_type(node, default_dram_type);
+   mt_calc_adistance(node, &adist);
+   if (!node_memory_types[node].memtype) {
+   memtype = mt_find_alloc_memory_type(adist, 
&default_memory_types);
+   if (IS_ERR(memtype)) {
+   memtype = default_dram_type;
+   pr_info("Failed to allocate a memory type. Fall 
back.\n");
+   }
+   }
+
+   __init_node_memory_type(node, memtype);
 
memtype = node_memory_types[node].memtype;
node_set(node, memtype->nodes);
@@ -652,6 +669,35 @@ void mt_put_memory_types(struct list_head *memory_types)
 }
 EXPORT_SYMBOL_GPL(mt_put_memory_types);
 
+/*
+ * This is invoked via `late_initcall()` to initialize memory tiers for
+ * CPU-less memory nodes after driver initialization, which is
+ * expected to provide `adistance` algorithms.
+ */
+static int __init memory_tier_late_init(void)
+{
+   int nid;
+
+   guard(mutex)(&memory_tier_lock);
+   for_each_node_state(nid, N_MEMORY) {
+   /*

[PATCH v11 1/2] memory tier: dax/kmem: introduce an abstract layer for finding, allocating, and putting memory types

2024-04-04 Thread Ho-Ren (Jack) Chuang
Since different memory devices require finding, allocating, and putting
memory types, these common steps are abstracted in this patch,
enhancing the scalability and conciseness of the code.

Signed-off-by: Ho-Ren (Jack) Chuang 
Reviewed-by: "Huang, Ying" 
---
 drivers/dax/kmem.c   | 30 --
 include/linux/memory-tiers.h | 13 +
 mm/memory-tiers.c| 29 +
 3 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 42ee360cf4e3..4fe9d040e375 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -55,36 +55,14 @@ static LIST_HEAD(kmem_memory_types);
 
 static struct memory_dev_type *kmem_find_alloc_memory_type(int adist)
 {
-   bool found = false;
-   struct memory_dev_type *mtype;
-
-   mutex_lock(&kmem_memory_type_lock);
-   list_for_each_entry(mtype, &kmem_memory_types, list) {
-   if (mtype->adistance == adist) {
-   found = true;
-   break;
-   }
-   }
-   if (!found) {
-   mtype = alloc_memory_type(adist);
-   if (!IS_ERR(mtype))
-   list_add(&mtype->list, &kmem_memory_types);
-   }
-   mutex_unlock(&kmem_memory_type_lock);
-
-   return mtype;
+   guard(mutex)(&kmem_memory_type_lock);
+   return mt_find_alloc_memory_type(adist, &kmem_memory_types);
 }
 
 static void kmem_put_memory_types(void)
 {
-   struct memory_dev_type *mtype, *mtn;
-
-   mutex_lock(&kmem_memory_type_lock);
-   list_for_each_entry_safe(mtype, mtn, &kmem_memory_types, list) {
-   list_del(&mtype->list);
-   put_memory_type(mtype);
-   }
-   mutex_unlock(&kmem_memory_type_lock);
+   guard(mutex)(&kmem_memory_type_lock);
+   mt_put_memory_types(&kmem_memory_types);
 }
 
 static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 69e781900082..0d70788558f4 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -48,6 +48,9 @@ int mt_calc_adistance(int node, int *adist);
 int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
 const char *source);
 int mt_perf_to_adistance(struct access_coordinate *perf, int *adist);
+struct memory_dev_type *mt_find_alloc_memory_type(int adist,
+ struct list_head 
*memory_types);
+void mt_put_memory_types(struct list_head *memory_types);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -136,5 +139,15 @@ static inline int mt_perf_to_adistance(struct 
access_coordinate *perf, int *adis
 {
return -EIO;
 }
+
+static inline struct memory_dev_type *mt_find_alloc_memory_type(int adist,
+   struct 
list_head *memory_types)
+{
+   return NULL;
+}
+
+static inline void mt_put_memory_types(struct list_head *memory_types)
+{
+}
 #endif /* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 0537664620e5..516b144fd45a 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -623,6 +623,35 @@ void clear_node_memory_type(int node, struct 
memory_dev_type *memtype)
 }
 EXPORT_SYMBOL_GPL(clear_node_memory_type);
 
+struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head 
*memory_types)
+{
+   struct memory_dev_type *mtype;
+
+   list_for_each_entry(mtype, memory_types, list)
+   if (mtype->adistance == adist)
+   return mtype;
+
+   mtype = alloc_memory_type(adist);
+   if (IS_ERR(mtype))
+   return mtype;
+
+   list_add(&mtype->list, memory_types);
+
+   return mtype;
+}
+EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);
+
+void mt_put_memory_types(struct list_head *memory_types)
+{
+   struct memory_dev_type *mtype, *mtn;
+
+   list_for_each_entry_safe(mtype, mtn, memory_types, list) {
+   list_del(&mtype->list);
+   put_memory_type(mtype);
+   }
+}
+EXPORT_SYMBOL_GPL(mt_put_memory_types);
+
 static void dump_hmem_attrs(struct access_coordinate *coord, const char 
*prefix)
 {
pr_info(
-- 
Ho-Ren (Jack) Chuang




[PATCH v11 0/2] Improved Memory Tier Creation for CPUless NUMA Nodes

2024-04-04 Thread Ho-Ren (Jack) Chuang
When a memory device, such as CXL1.1 type3 memory, is emulated as
normal memory (E820_TYPE_RAM), the memory device is indistinguishable from
normal DRAM in terms of memory tiering with the current implementation.
The current memory tiering assigns all detected normal memory nodes to
the same DRAM tier. This results in normal memory devices with different
attributions being unable to be assigned to the correct memory tier,
leading to the inability to migrate pages between different
types of memory.
https://lore.kernel.org/linux-mm/ph0pr08mb7955e9f08ccb64f23963b5c3a8...@ph0pr08mb7955.namprd08.prod.outlook.com/T/

This patchset automatically resolves the issues. It delays the
initialization of memory tiers for CPUless NUMA nodes until they obtain
HMAT information and after all devices are initialized at boot time,
eliminating the need for user intervention. If no HMAT is specified,
it falls back to using `default_dram_type`.

Example usecase:
We have CXL memory on the host, and we create VMs with a new system memory
device backed by host CXL memory. We inject CXL memory performance
attributes through QEMU, and the guest now sees memory nodes with
performance attributes in HMAT. With this change, we enable the
guest kernel to construct the correct memory tiering for the memory nodes.

- v11:
 Thanks to comments from Jonathan,
 * Replace `mutex_lock()` with `guard(mutex)()`
 * Reorder some modifications within the patchset
 * Rewrite the code for improved readability and fixing alignment issues
 * Pass all strict rules in checkpatch.pl
- v10:
 Thanks to Andrew's and SeongJae's comments,
 * Address kunit compilation errors
 * Resolve the bug of not returning the correct error code in
   `mt_perf_to_adistance`
 * 
https://lore.kernel.org/lkml/20240402001739.2521623-1-horenchu...@bytedance.com/T/#u
-v9:
 * Address corner cases in `memory_tier_late_init`. Thank Ying's comments.
 * 
https://lore.kernel.org/lkml/20240329053353.309557-1-horenchu...@bytedance.com/T/#u
-v8:
 * Fix email format
 * 
https://lore.kernel.org/lkml/20240329004815.195476-1-horenchu...@bytedance.com/T/#u
-v7:
 * Add Reviewed-by: "Huang, Ying" 
-v6:
 Thanks to Ying's comments,
 * Move `default_dram_perf_lock` to the function's beginning for clarity
 * Fix double unlocking at v5
 * 
https://lore.kernel.org/lkml/20240327072729.3381685-1-horenchu...@bytedance.com/T/#u
-v5:
 Thanks to Ying's comments,
 * Add comments about what is protected by `default_dram_perf_lock`
 * Fix an uninitialized pointer mtype
 * Slightly shorten the time holding `default_dram_perf_lock`
 * Fix a deadlock bug in `mt_perf_to_adistance`
 * 
https://lore.kernel.org/lkml/20240327041646.3258110-1-horenchu...@bytedance.com/T/#u
-v4:
 Thanks to Ying's comments,
 * Remove redundant code
 * Reorganize patches accordingly
 * 
https://lore.kernel.org/lkml/20240322070356.315922-1-horenchu...@bytedance.com/T/#u
-v3:
 Thanks to Ying's comments,
 * Make the newly added code independent of HMAT
 * Upgrade set_node_memory_tier to support more cases
 * Put all non-driver-initialized memory types into default_memory_types
   instead of using hmat_memory_types
 * find_alloc_memory_type -> mt_find_alloc_memory_type
 * 
https://lore.kernel.org/lkml/20240320061041.3246828-1-horenchu...@bytedance.com/T/#u
-v2:
 Thanks to Ying's comments,
 * Rewrite cover letter & patch description
 * Rename functions, don't use _hmat
 * Abstract common functions into find_alloc_memory_type()
 * Use the expected way to use set_node_memory_tier instead of modifying it
 * 
https://lore.kernel.org/lkml/20240312061729.1997111-1-horenchu...@bytedance.com/T/#u
-v1:
 * 
https://lore.kernel.org/lkml/20240301082248.3456086-1-horenchu...@bytedance.com/T/#u

Ho-Ren (Jack) Chuang (2):
  memory tier: dax/kmem: introduce an abstract layer for finding,
allocating, and putting memory types
  memory tier: create CPUless memory tiers after obtaining HMAT info

 drivers/dax/kmem.c   |  30 ++---
 include/linux/memory-tiers.h |  13 
 mm/memory-tiers.c| 123 ---
 3 files changed, 116 insertions(+), 50 deletions(-)

-- 
Ho-Ren (Jack) Chuang




Re: [PATCH v3 23/25] drivers: media: i2c: imx258: Add support for powerdown gpio

2024-04-04 Thread Luis Garcia
On 4/4/24 08:12, Dave Stevenson wrote:
> Hi Luigi
> 
> On Wed, 3 Apr 2024 at 20:34, Luigi311  wrote:
>>
>> On 4/3/24 10:57, Ondřej Jirman wrote:
>>> Hi Sakari and Luis,
>>>
>>> On Wed, Apr 03, 2024 at 04:25:41PM GMT, Sakari Ailus wrote:
 Hi Luis, Ondrej,

 On Wed, Apr 03, 2024 at 09:03:52AM -0600, g...@luigi311.com wrote:
> From: Luis Garcia 
>
> On some boards powerdown signal needs to be deasserted for this
> sensor to be enabled.
>
> Signed-off-by: Ondrej Jirman 
> Signed-off-by: Luis Garcia 
> ---
>  drivers/media/i2c/imx258.c | 13 +
>  1 file changed, 13 insertions(+)
>
> diff --git a/drivers/media/i2c/imx258.c b/drivers/media/i2c/imx258.c
> index 30352c33f63c..163f04f6f954 100644
> --- a/drivers/media/i2c/imx258.c
> +++ b/drivers/media/i2c/imx258.c
> @@ -679,6 +679,8 @@ struct imx258 {
> unsigned int lane_mode_idx;
> unsigned int csi2_flags;
>
> +   struct gpio_desc *powerdown_gpio;
> +
> /*
>  * Mutex for serialized access:
>  * Protect sensor module set pad format and start/stop streaming 
> safely.
> @@ -1213,6 +1215,8 @@ static int imx258_power_on(struct device *dev)
> struct imx258 *imx258 = to_imx258(sd);
> int ret;
>
> +   gpiod_set_value_cansleep(imx258->powerdown_gpio, 0);

 What does the spec say? Should this really happen before switching on the
 supplies below?
>>>
>>> There's no powerdown input in the IMX258 manual. The manual only mentions
>>> that XCLR (reset) should be held low during power on.
>>>
>>> https://megous.com/dl/tmp/15b0992a720ab82d.png
>>>
>>> https://megous.com/dl/tmp/f2cc991046d97641.png
>>>
>>>This sensor doesn’t have a built-in “Power ON Reset” function. The XCLR 
>>> pin
>>>is set to “LOW” and the power supplies are brought up. Then the XCLR pin
>>>should be set to “High” after INCK supplied.
>>>
>>> So this input is some feature on camera module itself outside of the
>>> IMX258 chip, which I think is used to gate power to the module. Eg. on 
>>> Pinephone
>>> Pro, there are two modules with shared power rails, so enabling supply to
>>> one module enables it to the other one, too. So this input becomes the only 
>>> way
>>> to really enable/disable power to the chip when both are used at once at 
>>> some
>>> point, because regulator_bulk_enable/disable becomes ineffective at that 
>>> point.
>>>
>>> Luis, maybe you saw some other datasheet that mentions this input? IMO,
>>> it just gates the power rails via some mosfets on the module itself, since
>>> there's not power down input to the chip itself.
>>>
>>> kind regards,
>>>   o.
>>>
>>
>> Ondrej, I did not see anything else in the datasheet since I'm pretty sure
>> I'm looking at the same datasheet as it was supplied to me by Pine64. I'm
>> not sure what datasheet Dave has access to since he got his for a
>> completely different module than what we are testing with though.
> 
> I only have a leaked datasheet (isn't the internet wonderful!)  [1]
> XCLR is documented in that, as Ondrej has said.
> 
> If this powerdown GPIO is meant to be driving XCLR, then it is in the
> wrong order against the supplies.
> 
> This does make me confused over the difference between this powerdown
> GPIO and the reset GPIO that you implement in 24/25.
> 
> Following the PinePhone Pro DT [3] and schematics [4]
> reset-gpios = <&gpio1 RK_PA0 GPIO_ACTIVE_LOW>;
> powerdown-gpios = <&gpio2 RK_PD4 GPIO_ACTIVE_HIGH>;
> 
> Schematic page 11 upper right block
> GPIO1_A0/ISP0_SHUTTER_EN/ISP1_SHUTTER_EN/TCPD_VBUS_SINK_EN_d becomes
> Camera_RST_L. Page 18 feeds that through to the RESET on the camera
> connector.
> Page 11 left middle block GPIO2_D4/SDIO0_BKPWR_d becomes DVP_PDN1_H.
> Page 18 feeds that through to the PWDN on the camera connector.
> 
> Seeing as we apparently have a lens driver kicking around as well,
> potentially one is reset to the VCM, and one to the sensor? DW9714
> does have an XSD shutdown pin.
> Only the module integrator is going to really know the answer,
> although potentially a little poking with gpioset and i2cdetect may
> tell you more.
> 
>   Dave
> 
> [1] https://web.archive.org/web/20201027131326/www.hi.app/IMX258-datasheet.pdf
> [2] 
> https://files.pine64.org/doc/PinePhonePro/PinephonePro-Schematic-V1.0-20211127.pdf
> [3] 
> https://xff.cz/git/linux/tree/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts?h=orange-pi-5.18#n868
> [4] 
> https://files.pine64.org/doc/PinePhonePro/PinephonePro-Schematic-V1.0-20211127.pdf
> 
> 

Out of curiosity I dropped this and tested it on my PPP and it still loads
up the camera correctly so I am fine with dropping this and patch 22 that
adds in the dt binding

> +
> ret = regulator_bulk_enable(IMX258_NUM_SUPPLIES,
> imx258->supplies);
> if (ret) {
> @@ -1224,6 +1228,7 @@ static int imx258_power_on(struct device *dev)
>

Re: 回复:回复:general protection fault in refill_obj_stock

2024-04-04 Thread Roman Gushchin
On Tue, Apr 02, 2024 at 02:14:58PM +0800, Ubisectech Sirius wrote:
> > On Tue, Apr 02, 2024 at 09:50:54AM +0800, Ubisectech Sirius wrote:
> >>> On Mon, Apr 01, 2024 at 03:04:46PM +0800, Ubisectech Sirius wrote:
> >>> Hello.
> >>> We are Ubisectech Sirius Team, the vulnerability lab of China ValiantSec. 
> >>> Recently, our team has discovered an issue in Linux kernel 6.7. Attached 
> >>> to the email was a PoC file of the issue.
> >>
> >>> Thank you for the report!
> >>
> >>> I tried to compile and run your test program for about half an hour
> >>> on a virtual machine running 6.7 with enabled KASAN, but wasn't able
> >>> to reproduce the problem.
> >> 
> >>> Can you, please, share a bit more information? How long does it take
> >>> to reproduce? Do you mind sharing your kernel config? Is there anything 
> >>> special
> >>> about your setup? What are exact steps to reproduce the problem?
> >>> Is this problem reproducible on 6.6?
> >> 
> >> Hi. 
> >> The .config of linux kernel 6.7 has send to you as attachment.
> > Thanks!
> > How long it takes to reproduce a problem? Do you just start your reproducer 
> > and wait?
> I just start the reproducer and wait without any other operation. The speed 
> of reproducing this problem is very fast (less than 5 seconds). 
> >> And The problem is reproducible on 6.6.
> > Hm, it rules out my recent changes.
> > Did you try any older kernels? 6.5? 6.0? Did you try to bisect the problem?
> > if it's fast to reproduce, it might be the best option.
> I have tried the 6.0, 6.3, 6.4, 6.5 kernels. The Linux kernel 6.5 will get the same 
> error output. But other versions will get different output like below:
> [ 55.306672][ T7950] KASAN: null-ptr-deref in range 
> [0x0018-0x001f]
> [ 55.307259][ T7950] CPU: 1 PID: 7950 Comm: poc Not tainted 6.3.0 #1
> [ 55.307714][ T7950] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
> BIOS 1.15.0-1 04/01/2014
> [ 55.308363][ T7950] RIP: 0010:tomoyo_check_acl (security/tomoyo/domain.c:173)
> [ 55.316475][ T7950] Call Trace:
> [ 55.316713][ T7950] 
> [ 55.317353][ T7950] tomoyo_path_permission (security/tomoyo/file.c:170 
> security/tomoyo/file.c:587 security/tomoyo/file.c:573)
> [ 55.317744][ T7950] tomoyo_check_open_permission (security/tomoyo/file.c:779)
> [ 55.320152][ T7950] tomoyo_file_open (security/tomoyo/tomoyo.c:332 
> security/tomoyo/tomoyo.c:327)
> [ 55.320495][ T7950] security_file_open (security/security.c:1719 
> (discriminator 13))
> [ 55.320850][ T7950] do_dentry_open (fs/open.c:908)
> [ 55.321526][ T7950] path_openat (fs/namei.c:3561 fs/namei.c:3715)
> [ 55.322614][ T7950] do_filp_open (fs/namei.c:3743)
> [ 55.325086][ T7950] do_sys_openat2 (fs/open.c:1349)
> [ 55.326249][ T7950] __x64_sys_openat (fs/open.c:1375)
> [ 55.327428][ T7950] do_syscall_64 (arch/x86/entry/common.c:50 
> arch/x86/entry/common.c:80)
> [ 55.327756][ T7950] entry_SYSCALL_64_after_hwframe 
> (arch/x86/entry/entry_64.S:120)
> [ 55.328185][ T7950] RIP: 0033:0x7f1c4a484f29
> [ 55.328504][ T7950] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 
> 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 
> <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 37 8f 0d 00 f7 d8 64 89 01 48
> [ 55.329864][ T7950] RSP: 002b:7ffd7bfe8398 EFLAGS: 0246 ORIG_RAX: 
> 0101
> [ 55.330464][ T7950] RAX: ffda RBX:  RCX: 
> 7f1c4a484f29
> [ 55.331024][ T7950] RDX: 00141842 RSI: 2380 RDI: 
> ff9c
> [ 55.331585][ T7950] RBP: 7ffd7bfe83a0 R08:  R09: 
> 7ffd7bfe83f0
> [ 55.332148][ T7950] R10:  R11: 0246 R12: 
> 55c5e36482d0
> [ 55.332707][ T7950] R13:  R14:  R15: 
> 
> [ 55.333268][ T7950] 
> [ 55.333488][ T7950] Modules linked in:
> [ 55.340525][ T7950] ---[ end trace  ]---
> [ 55.340936][ T7950] RIP: 0010:tomoyo_check_acl (security/tomoyo/domain.c:173)
> Does it look like a different problem?
> > Also, are you running vanilla kernels or you do have some custom changes on 
> > top?
> I haven't made any custom changes. 
> >Thanks!

Ok, I installed a new toolchain, built a kernel with your config and reproduced 
the (a?) problem.
It definitely smells a generic memory corruption, as I get new stacktraces 
every time I run it.
I got something similar to your tomoyo stacktrace, then I got something about
ima_add_template_entry() and then something else. Never saw your original 
obj_cgroup_get()
stack.

It seems to be connected to your very full kernel config, as I can't reproduce 
anything
with my original more minimal config. It also doesn't seem to be connected to 
the
kernel memory accounting directly.

It would be helpful to understand which kernel config options are required to 
reproduce
the issue as well as what exactly the reproducer does. I'll try to spend some 
cycles
on this, but can't promise much.

Thanks!



Re: [PATCH v3 19/25] media: i2c: imx258: Change register settings for variants of the sensor

2024-04-04 Thread Luigi311
On 4/3/24 10:18, Sakari Ailus wrote:
> Hi Luis, Dave,
> 
> On Wed, Apr 03, 2024 at 09:03:48AM -0600, g...@luigi311.com wrote:
>> From: Dave Stevenson 
>>
>> Sony have advised that there are variants of the IMX258 sensor which
>> require slightly different register configuration to the mainline
>> imx258 driver defaults.
>>
>> There is no available run-time detection for the variant, so add
>> configuration via the DT compatible string.
>>
>> The Vision Components imx258 module supports PDAF, so add the
>> register differences for that variant
>>
>> Signed-off-by: Dave Stevenson 
>> Signed-off-by: Luis Garcia 
>> ---
>>  drivers/media/i2c/imx258.c | 48 ++
>>  1 file changed, 44 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/media/i2c/imx258.c b/drivers/media/i2c/imx258.c
>> index 775d957c9b87..fa48da212037 100644
>> --- a/drivers/media/i2c/imx258.c
>> +++ b/drivers/media/i2c/imx258.c
>> @@ -6,6 +6,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -321,8 +322,6 @@ static const struct imx258_reg mipi_642mbps_24mhz_4l[] = 
>> {
>>  
>>  static const struct imx258_reg mode_common_regs[] = {
>>  { 0x3051, 0x00 },
>> -{ 0x3052, 0x00 },
>> -{ 0x4E21, 0x14 },
>>  { 0x6B11, 0xCF },
>>  { 0x7FF0, 0x08 },
>>  { 0x7FF1, 0x0F },
>> @@ -345,7 +344,6 @@ static const struct imx258_reg mode_common_regs[] = {
>>  { 0x7FA8, 0x03 },
>>  { 0x7FA9, 0xFE },
>>  { 0x7B24, 0x81 },
>> -{ 0x7B25, 0x00 },
>>  { 0x6564, 0x07 },
>>  { 0x6B0D, 0x41 },
>>  { 0x653D, 0x04 },
>> @@ -460,6 +458,33 @@ static const struct imx258_reg mode_1048_780_regs[] = {
>>  { 0x034F, 0x0C },
>>  };
>>  
>> +struct imx258_variant_cfg {
>> +const struct imx258_reg *regs;
>> +unsigned int num_regs;
>> +};
>> +
>> +static const struct imx258_reg imx258_cfg_regs[] = {
>> +{ 0x3052, 0x00 },
>> +{ 0x4E21, 0x14 },
>> +{ 0x7B25, 0x00 },
>> +};
>> +
>> +static const struct imx258_variant_cfg imx258_cfg = {
>> +.regs = imx258_cfg_regs,
>> +.num_regs = ARRAY_SIZE(imx258_cfg_regs),
>> +};
>> +
>> +static const struct imx258_reg imx258_pdaf_cfg_regs[] = {
>> +{ 0x3052, 0x01 },
>> +{ 0x4E21, 0x10 },
>> +{ 0x7B25, 0x01 },
>> +};
>> +
>> +static const struct imx258_variant_cfg imx258_pdaf_cfg = {
>> +.regs = imx258_pdaf_cfg_regs,
>> +.num_regs = ARRAY_SIZE(imx258_pdaf_cfg_regs),
>> +};
>> +
>>  static const char * const imx258_test_pattern_menu[] = {
>>  "Disabled",
>>  "Solid Colour",
>> @@ -637,6 +662,8 @@ struct imx258 {
>>  struct v4l2_subdev sd;
>>  struct media_pad pad;
>>  
>> +const struct imx258_variant_cfg *variant_cfg;
>> +
>>  struct v4l2_ctrl_handler ctrl_handler;
>>  /* V4L2 Controls */
>>  struct v4l2_ctrl *link_freq;
>> @@ -1104,6 +1131,14 @@ static int imx258_start_streaming(struct imx258 
>> *imx258)
>>  return ret;
>>  }
>>  
>> +ret = imx258_write_regs(imx258, imx258->variant_cfg->regs,
>> +imx258->variant_cfg->num_regs);
>> +if (ret) {
>> +dev_err(&client->dev, "%s failed to set variant config\n",
>> +__func__);
>> +return ret;
>> +}
>> +
>>  ret = imx258_write_reg(imx258, IMX258_CLK_BLANK_STOP,
>> IMX258_REG_VALUE_08BIT,
>> imx258->csi2_flags & 
>> V4L2_MBUS_CSI2_NONCONTINUOUS_CLOCK ?
>> @@ -1492,6 +1527,10 @@ static int imx258_probe(struct i2c_client *client)
>>  
>>  imx258->csi2_flags = ep.bus.mipi_csi2.flags;
>>  
>> +imx258->variant_cfg = of_device_get_match_data(&client->dev);
> 
> You'll also need to keep this working for ACPI based systems. I.e. in
> practice remove "of_" prefix here and add the non-PDAF variant data to the
> relevant ACPI ID list.
> 

Removing of_ is easy enough and looking at all the other commits that make
this change in other drivers I dont see anything else being done besides
adding in the .data section that is down below for both imx258 and pdaf
versions. Is that what you are referencing or is there some other place
to add variant data to ACPI ID list?

>> +if (!imx258->variant_cfg)
>> +imx258->variant_cfg = &imx258_cfg;
>> +
>>  /* Initialize subdev */
>>  v4l2_i2c_subdev_init(&imx258->sd, client, &imx258_subdev_ops);
>>  
>> @@ -1579,7 +1618,8 @@ MODULE_DEVICE_TABLE(acpi, imx258_acpi_ids);
>>  #endif
>>  
>>  static const struct of_device_id imx258_dt_ids[] = {
>> -{ .compatible = "sony,imx258" },
>> +{ .compatible = "sony,imx258", .data = &imx258_cfg },
>> +{ .compatible = "sony,imx258-pdaf", .data = &imx258_pdaf_cfg },
>>  { /* sentinel */ }
>>  };
>>  MODULE_DEVICE_TABLE(of, imx258_dt_ids);
> 




Re: [PATCH v3 12/25] media: i2c: imx258: Allow configuration of clock lane behaviour

2024-04-04 Thread Luigi311
On 4/3/24 12:48, Pavel Machek wrote:
> Hi!
> 
>> The sensor supports the clock lane either remaining in HS mode
>> during frame blanking, or dropping to LP11.
>>
>> Add configuration of the mode via V4L2_MBUS_CSI2_NONCONTINUOUS_CLOCK.
> 
>> +ret = imx258_write_reg(imx258, IMX258_CLK_BLANK_STOP,
>> +   IMX258_REG_VALUE_08BIT,
>> +   imx258->csi2_flags & 
>> V4L2_MBUS_CSI2_NONCONTINUOUS_CLOCK ?
>> +   1 : 0);
> 
> !! can be used to turn value into 1/0. I find it easier to read than ?
> 1 : 0  combination, but possibly that's fine, too.
> 
> Best regards,
>   Pavel
> 

I assume you mean by using 

!!(imx258->csi2_flags & V4L2_MBUS_CSI2_NONCONTINUOUS_CLOCK)

I can go ahead and use that instead



Re: [PATCH] dax/bus.c: replace WARN_ON_ONCE() with lockdep asserts

2024-04-04 Thread Verma, Vishal L
On Thu, 2024-04-04 at 14:23 -0700, Andrew Morton wrote:
> On Tue, 02 Apr 2024 00:24:28 -0600 Vishal Verma  
> wrote:
> 
> > In [1], Dan points out that all of the WARN_ON_ONCE() usage in the
> > referenced patch should be replaced with lockdep_assert_held(_write)().
> > 
> > Replace those, and additionally, replace a couple of other
> > WARN_ON_ONCE() introduced in the same patch for actual failure
> > cases (i.e. when acquiring a semaphore fails in a remove / unregister
> > path) with dev_WARN_ONCE() as is the precedent here.
> > 
> > Recall that previously, unregistration paths were implicitly protected by
> > overloading the device lock, which the patch in [1] sought to remove.
> > This meant adding a semaphore acquisition in these unregistration paths.
> > Since that can fail, and it doesn't make sense to return errors from
> > these paths, retain the two instances of (now) dev_WARN_ONCE().
> > 
> > ...
> > 
> > @@ -471,6 +471,7 @@ static void __unregister_dev_dax(void *dev)
> >  
> >     dev_dbg(dev, "%s\n", __func__);
> >  
> > +   lockdep_assert_held_write(&dax_region_rwsem);
> >     kill_dev_dax(dev_dax);
> >     device_del(dev);
> >     free_dev_dax_ranges(dev_dax);
> 
> This is new and unchangelogged?
> 
> I'm taking Dan's reply to your patch as Not-A-Nack ;)
> 
True, but with Dan's new feedback, that results in a bit more rework,
this will likely turn into 2-3 patches. Working on it now, will be out
shortly!


Re: (subset) [PATCH v2 0/3] Split sony-castor into shinano-common and add Sony Xperia Z3

2024-04-04 Thread Bjorn Andersson


On Thu, 14 Mar 2024 19:56:21 +0100, Luca Weiss wrote:
> Prepare for adding sony-leo dts by splitting common parts into a
> separate dtsi file.
> 
> Then add the dts for Sony Xperia Z3.
> 
> Depends on:
> https://lore.kernel.org/linux-arm-msm/20240306-castor-changes-v1-0-2286eaf85...@z3ntu.xyz/T/
> 
> [...]

Applied, thanks!

[1/3] ARM: dts: qcom: msm8974-sony-castor: Split into shinano-common
  commit: 53426f53eda5e4a17197a8bc7dd1045601db407e
[3/3] ARM: dts: qcom: Add Sony Xperia Z3 smartphone
  commit: 8d91a5a4a6f5aff714a14ac4a86931aa789655d8

Best regards,
-- 
Bjorn Andersson 



Re: [PATCH] dax/bus.c: replace WARN_ON_ONCE() with lockdep asserts

2024-04-04 Thread Andrew Morton
On Tue, 02 Apr 2024 00:24:28 -0600 Vishal Verma  
wrote:

> In [1], Dan points out that all of the WARN_ON_ONCE() usage in the
> referenced patch should be replaced with lockdep_assert_held(_write)().
> 
> Replace those, and additionally, replace a couple of other
> WARN_ON_ONCE() introduced in the same patch for actual failure
> cases (i.e. when acquiring a semaphore fails in a remove / unregister
> path) with dev_WARN_ONCE() as is the precedent here.
> 
> Recall that previously, unregistration paths were implicitly protected by
> overloading the device lock, which the patch in [1] sought to remove.
> This meant adding a semaphore acquisition in these unregistration paths.
> Since that can fail, and it doesn't make sense to return errors from
> these paths, retain the two instances of (now) dev_WARN_ONCE().
> 
> ...
>
> @@ -471,6 +471,7 @@ static void __unregister_dev_dax(void *dev)
>  
>   dev_dbg(dev, "%s\n", __func__);
>  
> + lockdep_assert_held_write(&dax_region_rwsem);
>   kill_dev_dax(dev_dax);
>   device_del(dev);
>   free_dev_dax_ranges(dev_dax);

This is new and unchangelogged?

I'm taking Dan's reply to your patch as Not-A-Nack ;)



Re: (subset) [PATCH 1/1] clk: qcom: smd-rpm: Restore msm8976 num_clk

2024-04-04 Thread Bjorn Andersson


On Mon, 01 Apr 2024 19:16:39 +0200, Adam Skladowski wrote:
> During rework somehow msm8976 num_clk got removed, restore it.
> 
> 

Applied, thanks!

[1/1] clk: qcom: smd-rpm: Restore msm8976 num_clk
  commit: 0d4ce2458cd7d1d66a5ee2f3c036592fb663d5bc

Best regards,
-- 
Bjorn Andersson 



Re: [External] Re: [PATCH v10 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-04-04 Thread Ho-Ren (Jack) Chuang
Hi Jonathan,

Thank you! I will fix them and send a V11 soon.

On Thu, Apr 4, 2024 at 6:37 AM Jonathan Cameron
 wrote:
>
> 
>
> > > > @@ -858,7 +910,8 @@ static int __init memory_tier_init(void)
> > > >* For now we can have 4 faster memory tiers with smaller 
> > > > adistance
> > > >* than default DRAM tier.
> > > >*/
> > > > - default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
> > > > + default_dram_type = 
> > > > mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
> > > > + 
> > > > &default_memory_types);
> > >
> > > Unusual indenting.  Align with just after (
> > >
> >
> > Aligning with "(" will exceed 100 columns. Would that be acceptable?
> I think we are talking cross purposes.
>
> default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
>   &default_memory_types);
>
> Is what I was suggesting.
>

Oh, now I see. Thanks!

> >
> > > >   if (IS_ERR(default_dram_type))
> > > >   panic("%s() failed to allocate default DRAM tier\n", 
> > > > __func__);
> > > >
> > > > @@ -868,6 +921,14 @@ static int __init memory_tier_init(void)
> > > >* types assigned.
> > > >*/
> > > >   for_each_node_state(node, N_MEMORY) {
> > > > + if (!node_state(node, N_CPU))
> > > > + /*
> > > > +  * Defer memory tier initialization on CPUless 
> > > > numa nodes.
> > > > +  * These will be initialized after firmware and 
> > > > devices are
> > >
> > > I think this wraps at just over 80 chars.  Seems silly to wrap so tightly 
> > > and not
> > > quite fit under 80. (this is about 83 chars.
> > >
> >
> > I can fix this.
> > I have a question. From my patch, this is <80 chars. However,
> > in an email, this is >80 chars. Does that mean we need to
> > count the number of chars in an email, not in a patch? Or if I
> > missed something? like vim configuration or?
>
> 3 tabs + 1 space + the text from * (58)
> = 24 + 1 + 58 = 83
>
> Advantage of using claws email for kernel stuff is it has a nice per character
> ruler at the top of the window.
>
> I wonder if you have a different tab indent size?  The kernel uses 8
> characters.  It might explain the few other odd indents if perhaps
> you have it at 4 in your editor?
>
> https://www.kernel.org/doc/html/v4.10/process/coding-style.html
>

Got it. I was using tab=4. I will change to 8. Thanks!

> Jonathan
>
> >
> > > > +  * initialized.
> > > > +  */
> > > > + continue;
> > > > +
> > > >   memtier = set_node_memory_tier(node);
> > > >   if (IS_ERR(memtier))
> > > >   /*
> > >
> >
> >
>


-- 
Best regards,
Ho-Ren (Jack) Chuang
莊賀任



Copying TLS/user register data per perf-sample?

2024-04-04 Thread Beau Belgrave
Hello,

I'm looking into the possibility of capturing user data that is pointed
to by a user register (IE: fs/gs for TLS on x86/64) for each sample via
perf_events.

I was hoping to find a way to do this similar to PERF_SAMPLE_STACK_USER.
I think it could even use roughly the same ABI in the perf ring buffer.
Or it may be possible by some kprobe linked to the perf sample function.

This would allow a profiler to collect TLS (or other values) on x64. In
the Open Telemetry profiling SIG [1], we are trying to find a fast way
to grab a tracing association quickly on a per-thread basis. The team
at Elastic has a bespoke way to do this [2], however, I'd like to see a
more general way to achieve this. The folks I've been talking with seem
open to the idea of just having a TLS value for this we could capture
upon each sample. We could then just state, Open Telemetry SDKs should
have a TLS value for span correlation. However, we need a way to sample
the TLS value(s) when a sampling event is generated.

Is this already possible via some other means? It'd be great to be able
to do this directly at the perf_event sample via the ABI or a probe.

Thanks,
-Beau

1. https://opentelemetry.io/blog/2024/profiling/
2. 
https://www.elastic.co/blog/continuous-profiling-distributed-tracing-correlation



Re: [PATCH v3 1/7] mm: Add a bitmap into mmu_notifier_{clear,test}_young

2024-04-04 Thread David Hildenbrand

On 02.04.24 01:29, James Houghton wrote:

The bitmap is provided for secondary MMUs to use if they support it. For
test_young(), after it returns, the bitmap represents the pages that
were young in the interval [start, end). For clear_young, it represents
the pages that we wish the secondary MMU to clear the accessed/young bit
for.

If a bitmap is not provided, the mmu_notifier_{test,clear}_young() API
should be unchanged except that if young PTEs are found and the
architecture supports passing in a bitmap, instead of returning 1,
MMU_NOTIFIER_YOUNG_FAST is returned.

This allows MGLRU's look-around logic to work faster, resulting in a 4%
improvement in real workloads[1]. Also introduce MMU_NOTIFIER_YOUNG_FAST
to indicate to main mm that doing look-around is likely to be
beneficial.

If the secondary MMU doesn't support the bitmap, it must return
an int that contains MMU_NOTIFIER_YOUNG_BITMAP_UNRELIABLE.

[1]: https://lore.kernel.org/all/20230609005935.42390-1-yuz...@google.com/

Suggested-by: Yu Zhao 
Signed-off-by: James Houghton 
---
  include/linux/mmu_notifier.h | 93 +---
  include/trace/events/kvm.h   | 13 +++--
  mm/mmu_notifier.c| 20 +---
  virt/kvm/kvm_main.c  | 19 ++--
  4 files changed, 123 insertions(+), 22 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index f349e08a9dfe..daaa9db625d3 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -61,6 +61,10 @@ enum mmu_notifier_event {
  
  #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
  
+#define MMU_NOTIFIER_YOUNG			(1 << 0)

+#define MMU_NOTIFIER_YOUNG_BITMAP_UNRELIABLE   (1 << 1)


Especially this one really deserves some documentation :)


+#define MMU_NOTIFIER_YOUNG_FAST(1 << 2)


And that one as well.

Likely best to briefly document all of them, and how they are
supposed to be used (return value for X).


+
  struct mmu_notifier_ops {
/*
 * Called either by mmu_notifier_unregister or when the mm is
@@ -106,21 +110,36 @@ struct mmu_notifier_ops {
 * clear_young is a lightweight version of clear_flush_young. Like the
 * latter, it is supposed to test-and-clear the young/accessed bitflag
 * in the secondary pte, but it may omit flushing the secondary tlb.
+*
+* If @bitmap is given but is not supported, return
+* MMU_NOTIFIER_YOUNG_BITMAP_UNRELIABLE.
+*
+* If the walk is done "quickly" and there were young PTEs,
+* MMU_NOTIFIER_YOUNG_FAST is returned.
 */
int (*clear_young)(struct mmu_notifier *subscription,
   struct mm_struct *mm,
   unsigned long start,
-  unsigned long end);
+  unsigned long end,
+  unsigned long *bitmap);
  
  	/*

 * test_young is called to check the young/accessed bitflag in
 * the secondary pte. This is used to know if the page is
 * frequently used without actually clearing the flag or tearing
 * down the secondary mapping on the page.
+*
+* If @bitmap is given but is not supported, return
+* MMU_NOTIFIER_YOUNG_BITMAP_UNRELIABLE.
+*
+* If the walk is done "quickly" and there were young PTEs,
+* MMU_NOTIFIER_YOUNG_FAST is returned.
 */
int (*test_young)(struct mmu_notifier *subscription,
  struct mm_struct *mm,
- unsigned long address);
+ unsigned long start,
+ unsigned long end,
+ unsigned long *bitmap);


What does "quickly" mean (why not use "fast")? What are the semantics, I 
don't find any existing usage of that in this file.


Further, what is MMU_NOTIFIER_YOUNG you introduce used for?

--
Cheers,

David / dhildenb




iMX8MP Cortex-M7 Relation to Audio Power Domain

2024-04-04 Thread João Paulo Silva Gonçalves
Hello all,

I was investigating why the kernel freezes on the iMX8MP when attempting to boot
the Cortex-M7 processor using the Linux remoteproc interface. However, with
v6.5, it started to work, and I was able to pinpoint to commit
b86c3afabb4f ('arm64: dts: imx8mp: Add SAI, SDMA, AudioMIX') [1] through 
bisection.
The patch appeared unrelated to remoteproc, and after some time, I realized 
there
is a connection between the functioning of remoteproc and the audio power 
domain.
For instance, adding the audio power domain to the node in the device
tree (below) made it work. The same behavior occurs in the downstream kernel.
There is a workaround for the problem by setting clkim8mp.mcore_booted=1 in the 
kernel arguments, but this is not seen as a final solution (it seems to 
disable all clock gating).

imx8mp-cm7 {
compatible = "fsl,imx8mp-cm7";
clocks = <&clk IMX8MP_CLK_M7_CORE>;
clock-names = "core", "audio";
mbox-names = "tx", "rx", "rxdb";
mboxes = <&mu 0 1
&mu 1 1
&mu 3 1>;
memory-region = <&vdevbuffer>, <&vdev0vring0>, <&vdev0vring1>, 
<&rsc_table>, <&m7_reserved>;
rsc-da = <0x5500>;
syscon = <&src>;
fsl,startup-delay-ms = <500>;
power-domains = <&pgc_audio>;
};


Do any of you know anything about the relationship between the audio domain and
the Cortex-M7 on iMX8MP? The TRM is not very clear on this, and the only thing
I could find is that there are some mailboxes for Cortex-M7/Audio processor
communication managed by the audio power domain.

Thanks for the help!

[1] 
https://github.com/torvalds/linux/commit/b86c3afabb4f4ea146c206508527eb2a15485bcc


Regards,
João Paulo S. Goncalves



Re: [PATCH] livepatch: Add KLP_IDLE state

2024-04-04 Thread Joe Lawrence
On 4/4/24 11:17, Petr Mladek wrote:
> On Tue 2024-04-02 09:52:31, Joe Lawrence wrote:
>> On Tue, Apr 02, 2024 at 11:09:54AM +0800, zhangwar...@gmail.com wrote:
>>> From: Wardenjohn 
>>>
>>> In livepatch, using KLP_UNDEFINED seems to be confusing.
>>> When the kernel is ready, livepatch is ready too, which state is
>>> idle but not undefined. What's more, if one livepatch process
>>> is finished, the klp state should be idle rather than undefined.
>>>
>>> Therefore, using KLP_IDLE to replace KLP_UNDEFINED is much better
>>> in reading and understanding.
>>> ---
>>>  include/linux/livepatch.h |  1 +
>>>  kernel/livepatch/patch.c  |  2 +-
>>>  kernel/livepatch/transition.c | 24 
>>>  3 files changed, 14 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
>>> index 9b9b38e89563..c1c53cd5b227 100644
>>> --- a/include/linux/livepatch.h
>>> +++ b/include/linux/livepatch.h
>>> @@ -19,6 +19,7 @@
>>>  
>>>  /* task patch states */
>>>  #define KLP_UNDEFINED  -1
>>> +#define KLP_IDLE   -1
>>
>> Hi Wardenjohn,
>>
>> Quick question, does this patch intend to:
>>
>> - Completely replace KLP_UNDEFINED with KLP_IDLE
>> - Introduce KLP_IDLE as an added, fourth potential state
>> - Introduce KLP_IDLE as synonym of sorts for KLP_UNDEFINED under certain
>>   conditions
>>
>> I ask because this patch leaves KLP_UNDEFINED defined and used in other
>> parts of the tree (ie, init/init_task.c), yet KLP_IDLE is added and
>> continues to use the same -1 enumeration.
> 
> Having two names for the same state adds more harm than good.
> 
> Honestly, neither "task->patch_state == KLP_UNDEFINED" nor "KLP_IDLE"
> make much sense.
> 
> The problem is in the variable name. It is not a state of a patch.
> It is the state of the transition. The right solution would be
> something like:
> 
>   klp_target_state -> klp_transition_target_state
>   task->patch_state -> task->klp_transition_state
>   KLP_UNKNOWN -> KLP_IDLE
> 

Yes, this is exactly how I think of these when reading the code.  The
model starts to make a lot more sense once you look at it thru this lens :)

> But it would also require renaming:
> 
>   /proc//patch_state -> klp_transition_state
> 
> which might break userspace tools => likely not acceptable.
> 
> 
> My opinion:
> 
> It would be nice to clean this up but it does not look worth the
> effort.
> 

Agreed.  Instead of changing code and the sysfs interface, we could
still add comments like:

  /* task patch transition target states */
  #define KLP_UNDEFINED   -1  /* idle, no transition in progress */
  #define KLP_UNPATCHED0  /* transitioning to unpatched state */
  #define KLP_PATCHED  1  /* transitioning to patched state */

  /* klp transition target state */
  static int klp_target_state = KLP_UNDEFINED;

  struct task_struct = {
  .patch_state= KLP_UNDEFINED,   /* klp transition state */

Maybe just one comment is enough?  Alternatively, we could elaborate in
Documentation/livepatch/livepatch.rst if it's really confusing.

Wardenjohn, since you're probably reading this code with fresh(er) eyes,
would any of the above be helpful?

-- 
Joe




Re: [PATCH fs/proc/bootconfig] remove redundant comments from /proc/bootconfig

2024-04-04 Thread Paul E. McKenney
On Thu, Apr 04, 2024 at 08:55:22AM +0900, Masami Hiramatsu wrote:
> On Wed, 3 Apr 2024 12:16:28 -0700
> "Paul E. McKenney"  wrote:
> 
> > commit 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to
> > /proc/bootconfig") adds bootloader argument comments into /proc/bootconfig.
> > 
> > /proc/bootconfig shows boot_command_line[] multiple times following
> > every xbc key value pair, that's duplicated and not necessary.
> > Remove redundant ones.
> > 
> > Output before and after the fix is like:
> > key1 = value1
> > *bootloader argument comments*
> > key2 = value2
> > *bootloader argument comments*
> > key3 = value3
> > *bootloader argument comments*
> > ...
> > 
> > key1 = value1
> > key2 = value2
> > key3 = value3
> > *bootloader argument comments*
> > ...
> > 
> > Fixes: 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to 
> > /proc/bootconfig")
> > Signed-off-by: Zhenhua Huang 
> > Signed-off-by: Paul E. McKenney 
> > Cc: Masami Hiramatsu 
> > Cc: 
> > Cc: 
> 
> OOps, good catch! Let me pick it.
> 
> Acked-by: Masami Hiramatsu (Google) 

Thank you, and I have applied your ack and pulled this into its own
bootconfig.2024.04.04a.

My guess is that you will push this via your own tree, and so I will
drop my copy as soon as yours hits -next.

Thanx, Paul

> Thank you!
> 
> > 
> > diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
> > index 902b326e1e560..e5635a6b127b0 100644
> > --- a/fs/proc/bootconfig.c
> > +++ b/fs/proc/bootconfig.c
> > @@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, 
> > size_t size)
> > break;
> > dst += ret;
> > }
> > -   if (ret >= 0 && boot_command_line[0]) {
> > -   ret = snprintf(dst, rest(dst, end), "# Parameters from 
> > bootloader:\n# %s\n",
> > -  boot_command_line);
> > -   if (ret > 0)
> > -   dst += ret;
> > -   }
> > +   }
> > +   if (ret >= 0 && boot_command_line[0]) {
> > +   ret = snprintf(dst, rest(dst, end), "# Parameters from 
> > bootloader:\n# %s\n",
> > +  boot_command_line);
> > +   if (ret > 0)
> > +   dst += ret;
> > }
> >  out:
> > kfree(key);
> 
> 
> -- 
> Masami Hiramatsu (Google) 



Re: [PATCH v10 08/14] x86/sgx: Add basic EPC reclamation flow for cgroup

2024-04-04 Thread Haitao Huang

Hi Kai,
Thanks for your suggestions. I'll adopt most of it as it.
Minor details below.

On Wed, 03 Apr 2024 08:08:28 -0500, Huang, Kai  wrote:


On Wed, 2024-03-27 at 17:22 -0700, Haitao Huang wrote:

From: Kristen Carlson Accardi 

When a cgroup usage reaches its limit, and it is to be charged, i.e.,
sgx_cgroup_try_charge() called for new allocations, the cgroup needs to
reclaim pages from its LRU or LRUs of its descendants to make room for
any new allocations. This patch adds the basic building block for the
per-cgroup reclamation flow and use it for synchronous reclamation in
sgx_cgroup_try_charge().


It's better to firstly mention _why_ we need this first:

Currently in the EPC page allocation, the kernel simply fails the  
allocation
when the current EPC cgroup fails to charge due to its usage reaching  
limit.
This is not ideal.  When that happens, a better way is to reclaim EPC  
page(s)
from the current EPC cgroup (and/or its descendants) to reduce its usage  
so the

new allocation can succeed.

Add the basic building blocks to support the per-cgroup reclamation flow  
...




ok



First, modify sgx_reclaim_pages() to let callers to pass in the LRU from
which pages are reclaimed, so it can be reused by both the global and
cgroup reclaimers. Also return the number of pages attempted, so a
cgroup reclaimer can use it to track reclamation progress from its
descendants.


IMHO you are jumping too fast to the implementation details.  Better to  
have

some more background:

"
Currently the kernel only has one place to reclaim EPC pages: the global  
EPC LRU
list.  To support the "per-cgroup" EPC reclaim, maintain an LRU list for  
each
EPC cgroup, and introduce a "cgroup" variant function to reclaim EPC  
page(s)

from a given EPC cgroup (and its descendants).
"



ok



For the global reclaimer, replace all call sites of sgx_reclaim_pages()
with calls to a newly created wrapper, sgx_reclaim_pages_global(), which
just calls sgx_reclaim_pages() with the global LRU passed in.

For cgroup reclamation, implement a basic reclamation flow, encapsulated
in the top-level function, sgx_cgroup_reclaim_pages(). It performs a
pre-order walk on a given cgroup subtree, and calls sgx_reclaim_pages()
at each node passing in the LRU of that node. It keeps track of total
attempted pages and stops the walk if desired number of pages are
attempted.


Then it's time to jump to implementation details:

"
Currently the kernel does the global EPC reclaim in sgx_reclaim_page().   
It

always tries to reclaim EPC pages in batch of SGX_NR_TO_SCAN (16) pages.
Specifically, it always "scans", or "isolates" SGX_NR_TO_SCAN pages from  
the

global LRU, and then tries to reclaim these pages at once for better
performance.

Use similar way to implement the "cgroup" variant EPC reclaim, but keep  
the
implementation simple: 1) change sgx_reclaim_pages() to take an LRU as  
input,
and return the pages that are "scanned" (but not actually reclaimed); 2)  
loop
the given EPC cgroup and its descendants and do the new  
sgx_reclaim_pages()

until SGX_NR_TO_SCAN pages are "scanned".

This implementation always tries to reclaim SGX_NR_TO_SCAN pages from  
the LRU of
the given EPC cgroup, and only moves to its descendants when there's no  
enough
reclaimable EPC pages to "scan" in its LRU.  It should be enough for  
most cases.

"



ok


Then I think it's better to explain why "alternatives" are not chosen:

"
Note, this simple implementation doesn't _exactly_ mimic the current  
global EPC
reclaim (which always tries to do the actual reclaim in batch of  
SGX_NR_TO_SCAN
pages): when LRUs have less than SGX_NR_TO_SCAN reclaimable pages, the  
actual
reclaim of EPC pages will be split into smaller batches _across_  
multiple LRUs

with each being smaller than SGX_NR_TO_SCAN pages.

A more precise way to mimic the current global EPC reclaim would be to  
have a
new function to only "scan" (or "isolate") SGX_NR_TO_SCAN pages _across_  
the
given EPC cgroup _AND_ its descendants, and then do the actual reclaim  
in one

batch.  But this is unnecessarily complicated at this stage.

Alternatively, the current sgx_reclaim_pages() could be changed to  
return the
actual "reclaimed" pages, but not "scanned" pages.  However this  
solution also

has cons: 
"

:

I recall you mentioned "unable to control latency of each reclaim" etc,  
but IIUC

one could be:

This approach may result in higher chance of "reclaiming EPC pages from
descendants but not the root/given EPC cgorup", e.g., when all EPC pages  
in the
root EPC cgroup are all young while these in its descendants are not.   
This may

not be desired.

Makes sense?



Agree with the flow.
The con is that this function may block too long, which is unacceptable for
some callers, such as the synchronous flow that only needs some minimal amount
(e.g., one page to pass try_charge()) to make forward progress. The convention
is to call this function in loops to ensure the caller's condition is met,
i.e., the way the or

Re: [PATCHv2 1/3] uprobe: Add uretprobe syscall to speed up return probe

2024-04-04 Thread Oleg Nesterov
On 04/05, Masami Hiramatsu wrote:
>
> Can we make this syscall and uprobe behavior clearer? As you said, if
> the application use sigreturn or longjump, it may skip returns and
> shadow stack entries are left in the kernel. In such cases, can uretprobe
> detect it properly, or just crash the process (or process runs wrongly)?

Please see the comment in handle_trampoline(), it tries to detect this case.
This patch should not make any difference.

Oleg.




Re: [PATCHv2 1/3] uprobe: Add uretprobe syscall to speed up return probe

2024-04-04 Thread Google
On Thu, 4 Apr 2024 13:58:43 +0200
Jiri Olsa  wrote:

> On Wed, Apr 03, 2024 at 07:00:07PM -0700, Andrii Nakryiko wrote:
> 
> SNIP
> 
> > Check rt_sigreturn syscall (manpage at [0], for example).
> > 
> >sigreturn() exists only to allow the implementation of signal
> >handlers.  It should never be called directly.  (Indeed, a simple
> >sigreturn() wrapper in the GNU C library simply returns -1, with
> >errno set to ENOSYS.)  Details of the arguments (if any) passed
> >to sigreturn() vary depending on the architecture.  (On some
> >architectures, such as x86-64, sigreturn() takes no arguments,
> >since all of the information that it requires is available in the
> >stack frame that was previously created by the kernel on the
> >user-space stack.)
> > 
> > This is a very similar use case. Also, check its source code in
> > arch/x86/kernel/signal_64.c. It sends SIGSEGV to the calling process
> > on any sign of something not being right. It's exactly the same with
> > sys_uretprobe.
> > 
> >   [0] https://man7.org/linux/man-pages/man2/sigreturn.2.html
> > 
> > > And the number of syscalls are limited resource.
> > 
> > We have almost 500 of them, it didn't seem like adding 1-2 for good
> > reasons would be a problem. Can you please point to where the limits
> > on syscalls as a resource are described? I'm curious to learn.
> > 
> > >
> > > I'm actually not sure how much we need to care of it, but adding a new
> > > syscall is worth to be discussed carefully because all of them are
> > > user-space compatibility.
> > 
> > Absolutely, it's a good discussion to have.
> > 
> > >
> > > > > > > Also, we should run syzkaller on this syscall. And if uretprobe is
> > > > > >
> > > > > > right, I'll check on syzkaller
> > > > > >
> > > > > > > set in the user function, what happen if the user function 
> > > > > > > directly
> > > > > > > calls this syscall? (maybe it consumes shadow stack?)
> > > > > >
> > > > > > the process should receive SIGILL if there's no pending uretprobe 
> > > > > > for
> > > > > > the current task, or it will trigger uretprobe if there's one 
> > > > > > pending
> > > > >
> > > > > No, that is too aggressive and not safe. Since the syscall is exposed 
> > > > > to
> > > > > user program, it should return appropriate error code instead of 
> > > > > SIGILL.
> > > > >
> > > >
> > > > This is the way it is today with uretprobes even through interrupt.
> > >
> > > I doubt that the interrupt (exception) and syscall should be handled
> > > differently. Especially, this exception is injected by uprobes but
> > > syscall will be caused by itself. But syscall can be called from user
> > > program (of course this works as sys_kill(self, SIGILL)).
> > 
> > Yep, I'd keep the behavior the same between uretprobes implemented
> > through int3 and sys_uretprobe.
> 
> +1 
> 
> > 
> > >
> > > > E.g., it could happen that user process is using fibers and is
> > > > replacing stack pointer without kernel realizing this, which will
> > > > trigger some defensive checks in uretprobe handling code and kernel
> > > > will send SIGILL because it can't support such cases. This is
> > > > happening today already, and it works fine in practice (except for
> > > > applications that manually change stack pointer, too bad, you can't
> > > > trace them with uretprobes, unfortunately).
> > >
> > > OK, we at least need to document it.
> > 
> > +1, yep
> > 
> > >
> > > >
> > > > So I think it's absolutely adequate to have this behavior if the user
> > > > process is *intentionally* abusing this API.
> > >
> > > Of course user expected that it is abusing. So at least we need to
> > > add a document that this syscall number is reserved to uprobes and
> > > user program must not use it.
> > >
> > 
> > Totally agree about documenting this.
> 
> ok there's a man page on sigreturn.. do you think we should add a man page
> for uretprobe, or can you think of some other place to document it?

I think it is better to have a man-page. Anyway, to discuss and explain
this syscall, the man-page is a good format to describe it.

> 
> > 
> > > >
> > > > > >
> > > > > > but we could limit the syscall to be executed just from the 
> > > > > > trampoline,
> > > > > > that should prevent all the user space use cases, I'll do that in 
> > > > > > next
> > > > > > version and add more tests for that
> > > > >
> > > > > Why not limit? :) The uprobe_handle_trampoline() expects it is called
> > > > > only from the trampoline, so it is natural to check the caller 
> > > > > address.
> > > > > (and uprobe should know where is the trampoline)
> > > > >
> > > > > Since the syscall is always exposed to the user program, it should
> > > > > - Do nothing and return an error unless it is properly called.
> > > > > - check the prerequisites for operation strictly.
> > > > > I concern that new system calls introduce vulnerabilities.
> > > > >
> > > >
> > > > As Oleg and Jiri mentioned, this syscal

Re: [PATCHv2 1/3] uprobe: Add uretprobe syscall to speed up return probe

2024-04-04 Thread Google
On Wed, 3 Apr 2024 19:00:07 -0700
Andrii Nakryiko  wrote:

> On Wed, Apr 3, 2024 at 5:58 PM Masami Hiramatsu  wrote:
> >
> > On Wed, 3 Apr 2024 09:58:12 -0700
> > Andrii Nakryiko  wrote:
> >
> > > On Wed, Apr 3, 2024 at 7:09 AM Masami Hiramatsu  
> > > wrote:
> > > >
> > > > On Wed, 3 Apr 2024 11:47:41 +0200
> > > > Jiri Olsa  wrote:
> > > >
> > > > > On Wed, Apr 03, 2024 at 10:07:08AM +0900, Masami Hiramatsu wrote:
> > > > > > Hi Jiri,
> > > > > >
> > > > > > On Tue,  2 Apr 2024 11:33:00 +0200
> > > > > > Jiri Olsa  wrote:
> > > > > >
> > > > > > > Adding uretprobe syscall instead of trap to speed up return probe.
> > > > > >
> > > > > > This is interesting approach. But I doubt we need to add additional
> > > > > > syscall just for this purpose. Can't we use another syscall or 
> > > > > > ioctl?
> > > > >
> > > > > so the plan is to optimize entry uprobe in a similar way and given
> > > > > the syscall is not a scarce resource I wanted to add another syscall
> > > > > for that one as well
> > > > >
> > > > > tbh I'm not sure sure which syscall or ioctl to reuse for this, it's
> > > > > possible to do that, the trampoline will just have to save one or
> > > > > more additional registers, but adding new syscall seems cleaner to me
> > > >
> > > > Hmm, I think a similar syscall is ptrace? prctl may also be a candidate.
> > >
> > > I think both ptrace and prctl are for completely different use cases
> > > and it would be an abuse of existing API to reuse them for uretprobe
> > > tracing. Also, keep in mind, that any extra argument that has to be
> > > passed into this syscall means that we need to complicate and slow
> > > generated assembly code that is injected into user process (to
> > > save/restore registers) and also kernel-side (again, to deal with all
> > > the extra registers that would be stored/restored on stack).
> > >
> > > Given syscalls are not some kind of scarce resources, what's the
> > > downside to have a dedicated and simple syscall?
> >
> > Syscalls are explicitly exposed to user space, thus, even if it is used
> > ONLY for a very specific situation, it is an official kernel interface,
> > and need to care about the compatibility. (If it causes SIGILL unless
> > a specific use case, I don't know there is a "compatibility".)
> 
> Check rt_sigreturn syscall (manpage at [0], for example).
> 
>sigreturn() exists only to allow the implementation of signal
>handlers.  It should never be called directly.  (Indeed, a simple
>sigreturn() wrapper in the GNU C library simply returns -1, with
>errno set to ENOSYS.)  Details of the arguments (if any) passed
>to sigreturn() vary depending on the architecture.  (On some
>architectures, such as x86-64, sigreturn() takes no arguments,
>since all of the information that it requires is available in the
>stack frame that was previously created by the kernel on the
>user-space stack.)
> 
> This is a very similar use case. Also, check its source code in
> arch/x86/kernel/signal_64.c. It sends SIGSEGV to the calling process
> on any sign of something not being right. It's exactly the same with
> sys_uretprobe.
> 
>   [0] https://man7.org/linux/man-pages/man2/sigreturn.2.html

Thanks for a good example.
Hm, in the case of rt_sigreturn, it has no other way to do it so it
needs to use syscall. OTOH, sys_uretprobe is only for performance
optimization, and the performance may depend on the architecture.

> > And the number of syscalls are limited resource.
> 
> We have almost 500 of them, it didn't seem like adding 1-2 for good
> reasons would be a problem. Can you please point to where the limits
> on syscalls as a resource are described? I'm curious to learn.

Syscall table is compiled as a fixed array, so if we increase
the number, we need more tables. Of course this just increase 1 entry
and at least for x86 we already allocated bigger table, so it is OK.
But I'm just afraid that if we add more syscalls without any clear
rules, we may fill the tables with more specific syscalls.

Ah, we also should follow this document.

https://docs.kernel.org/process/adding-syscalls.html

Let me Cc linux-...@vger.kernel.org.

> >
> > I'm actually not sure how much we need to care of it, but adding a new
> > syscall is worth to be discussed carefully because all of them are
> > user-space compatibility.
> 
> Absolutely, it's a good discussion to have.

Thanks, if this is discussed enough and agreed on by other maintainers,
I can safely pick this into my tree.

> 
> >
> > > > > > Also, we should run syzkaller on this syscall. And if uretprobe is
> > > > >
> > > > > right, I'll check on syzkaller
> > > > >
> > > > > > set in the user function, what happen if the user function directly
> > > > > > calls this syscall? (maybe it consumes shadow stack?)
> > > > >
> > > > > the process should receive SIGILL if there's no pending uretprobe for
> > > > > the current task, or it will trigger uretprobe if there's one 

Re: [PATCH v10 09/14] x86/sgx: Implement async reclamation for cgroup

2024-04-04 Thread Haitao Huang

On Thu, 04 Apr 2024 06:16:54 -0500, Huang, Kai  wrote:


On Wed, 2024-03-27 at 17:22 -0700, Haitao Huang wrote:

  void sgx_cgroup_init(void)
 {
+	sgx_cg_wq = alloc_workqueue("sgx_cg_wq", WQ_UNBOUND | WQ_FREEZABLE,  
WQ_UNBOUND_MAX_ACTIVE);

+
+   /* All Cgroups functionalities are disabled. */
+   if (WARN_ON(!sgx_cg_wq))
+   return;
+


I don't think you should WARN(), because it's not a kernel bug or  
similar.  Just

print a message saying EPC cgroup is disabled and move on.

if (!sgx_cg_wq) {
pr_err("SGX EPC cgroup disabled: alloc_workqueue() failed.\n");
return;
}



Sure
Thanks
Haitao



Re: [PATCH] livepatch: Add KLP_IDLE state

2024-04-04 Thread Petr Mladek
On Tue 2024-04-02 09:52:31, Joe Lawrence wrote:
> On Tue, Apr 02, 2024 at 11:09:54AM +0800, zhangwar...@gmail.com wrote:
> > From: Wardenjohn 
> > 
> > In livepatch, using KLP_UNDEFINED seems to be confusing.
> > When kernel is ready, livepatch is ready too, which state is
> > idle but not undefined. What's more, if one livepatch process
> > is finished, the klp state should be idle rather than undefined.
> > 
> > Therefore, using KLP_IDLE to replace KLP_UNDEFINED is much better
> > in reading and understanding.
> > ---
> >  include/linux/livepatch.h |  1 +
> >  kernel/livepatch/patch.c  |  2 +-
> >  kernel/livepatch/transition.c | 24 
> >  3 files changed, 14 insertions(+), 13 deletions(-)
> > 
> > diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
> > index 9b9b38e89563..c1c53cd5b227 100644
> > --- a/include/linux/livepatch.h
> > +++ b/include/linux/livepatch.h
> > @@ -19,6 +19,7 @@
> >  
> >  /* task patch states */
> >  #define KLP_UNDEFINED  -1
> > +#define KLP_IDLE   -1
> 
> Hi Wardenjohn,
> 
> Quick question, does this patch intend to:
> 
> - Completely replace KLP_UNDEFINED with KLP_IDLE
> - Introduce KLP_IDLE as an added, fourth potential state
> - Introduce KLP_IDLE as synonym of sorts for KLP_UNDEFINED under certain
>   conditions
> 
> I ask because this patch leaves KLP_UNDEFINED defined and used in other
> parts of the tree (ie, init/init_task.c), yet KLP_IDLE is added and
> continues to use the same -1 enumeration.

Having two names for the same state adds more harm than good.

Honestly, neither "task->patch_state == KLP_UNDEFINED" nor "KLP_IDLE"
make much sense.

The problem is in the variable name. It is not a state of a patch.
It is the state of the transition. The right solution would be
something like:

  klp_target_state -> klp_transition_target_state
  task->patch_state -> task->klp_transition_state
  KLP_UNKNOWN -> KLP_IDLE

But it would also require renaming:

  /proc//patch_state -> klp_transition_state

which might break userspace tools => likely not acceptable.


My opinion:

It would be nice to clean this up but it does not look worth the
effort.

Best Regards,
Petr



[PATCH] [v5] kallsyms: rework symbol lookup return codes

2024-04-04 Thread Arnd Bergmann
From: Arnd Bergmann 

Building with W=1 in some configurations produces a false positive
warning for kallsyms:

kernel/kallsyms.c: In function '__sprint_symbol.isra':
kernel/kallsyms.c:503:17: error: 'strcpy' source argument is the same as 
destination [-Werror=restrict]
  503 | strcpy(buffer, name);
  | ^~~~

This originally showed up while building with -O3, but later started
happening in other configurations as well, depending on inlining
decisions. The underlying issue is that the local 'name' variable is
always initialized to the be the same as 'buffer' in the called functions
that fill the buffer, which gcc notices while inlining, though it could
see that the address check always skips the copy.

The calling conventions here are rather unusual, as all of the internal
lookup functions (bpf_address_lookup, ftrace_mod_address_lookup,
ftrace_func_address_lookup, module_address_lookup and
kallsyms_lookup_buildid) already use the provided buffer and either return
the address of that buffer to indicate success, or NULL for failure,
but the callers are written to also expect an arbitrary other buffer
to be returned.

Rework the calling conventions to return the length of the filled buffer
instead of its address, which is simpler and easier to follow as well
as avoiding the warning. Leave only the kallsyms_lookup() calling conventions
unchanged, since that is called from 16 different functions and
adapting this would be a much bigger change.

Link: https://lore.kernel.org/all/20200107214042.855757-1-a...@arndb.de/
Link: https://lore.kernel.org/lkml/20240326130647.7bfb1...@gandalf.local.home/
Reviewed-by: Luis Chamberlain 
Acked-by: Steven Rostedt (Google) 
Signed-off-by: Arnd Bergmann 
---
v5: fix ftrace_mod_address_lookup return value,
rebased on top of 2e114248e086 ("bpf: Replace deprecated strncpy with 
strscpy")
v4: fix string length
v3: use strscpy() instead of strlcpy()
v2: complete rewrite after the first patch was rejected (in 2020). This
is now one of only two warnings that are in the way of enabling
-Wextra/-Wrestrict by default.
Signed-off-by: Arnd Bergmann 
---
 include/linux/filter.h   | 14 +++---
 include/linux/ftrace.h   |  6 +++---
 include/linux/module.h   | 14 +++---
 kernel/bpf/core.c|  7 +++
 kernel/kallsyms.c| 23 ---
 kernel/module/kallsyms.c | 26 +-
 kernel/trace/ftrace.c| 13 +
 7 files changed, 50 insertions(+), 53 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 161d5f7b64ed..e3a8f51fdf84 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1202,18 +1202,18 @@ static inline bool bpf_jit_kallsyms_enabled(void)
return false;
 }
 
-const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
+int __bpf_address_lookup(unsigned long addr, unsigned long *size,
 unsigned long *off, char *sym);
 bool is_bpf_text_address(unsigned long addr);
 int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
char *sym);
 struct bpf_prog *bpf_prog_ksym_find(unsigned long addr);
 
-static inline const char *
+static inline int
 bpf_address_lookup(unsigned long addr, unsigned long *size,
   unsigned long *off, char **modname, char *sym)
 {
-   const char *ret = __bpf_address_lookup(addr, size, off, sym);
+   int ret = __bpf_address_lookup(addr, size, off, sym);
 
if (ret && modname)
*modname = NULL;
@@ -1257,11 +1257,11 @@ static inline bool bpf_jit_kallsyms_enabled(void)
return false;
 }
 
-static inline const char *
+static inline int
 __bpf_address_lookup(unsigned long addr, unsigned long *size,
 unsigned long *off, char *sym)
 {
-   return NULL;
+   return 0;
 }
 
 static inline bool is_bpf_text_address(unsigned long addr)
@@ -1280,11 +1280,11 @@ static inline struct bpf_prog 
*bpf_prog_ksym_find(unsigned long addr)
return NULL;
 }
 
-static inline const char *
+static inline int
 bpf_address_lookup(unsigned long addr, unsigned long *size,
   unsigned long *off, char **modname, char *sym)
 {
-   return NULL;
+   return 0;
 }
 
 static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 54d53f345d14..56834a3fa9be 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -87,15 +87,15 @@ struct ftrace_direct_func;
 
 #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \
defined(CONFIG_DYNAMIC_FTRACE)
-const char *
+int
 ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
   unsigned long *off, char **modname, char *sym);
 #else
-static inline const char *
+static inline int
 ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
   unsigned

Re: [PATCH v10 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-04-04 Thread Jonathan Cameron


> > > @@ -858,7 +910,8 @@ static int __init memory_tier_init(void)
> > >* For now we can have 4 faster memory tiers with smaller adistance
> > >* than default DRAM tier.
> > >*/
> > > - default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
> > > + default_dram_type = 
> > > mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
> > > + 
> > > &default_memory_types);  
> >
> > Unusual indenting.  Align with just after (
> >  
> 
> Aligning with "(" will exceed 100 columns. Would that be acceptable?
I think we are talking at cross purposes.

default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
  &default_memory_types);  

Is what I was suggesting.

> 
> > >   if (IS_ERR(default_dram_type))
> > >   panic("%s() failed to allocate default DRAM tier\n", 
> > > __func__);
> > >
> > > @@ -868,6 +921,14 @@ static int __init memory_tier_init(void)
> > >* types assigned.
> > >*/
> > >   for_each_node_state(node, N_MEMORY) {
> > > + if (!node_state(node, N_CPU))
> > > + /*
> > > +  * Defer memory tier initialization on CPUless numa 
> > > nodes.
> > > +  * These will be initialized after firmware and 
> > > devices are  
> >
> > I think this wraps at just over 80 chars.  Seems silly to wrap so tightly 
> > and not
> > quite fit under 80. (this is about 83 chars.
> >  
> 
> I can fix this.
> I have a question. From my patch, this is <80 chars. However,
> in an email, this is >80 chars. Does that mean we need to
> count the number of chars in an email, not in a patch? Or have I
> missed something, like a vim configuration?

3 tabs + 1 space + the text from * (58)
= 24 + 1 + 58 = 83

Advantage of using claws email for kernel stuff is it has a nice per character
ruler at the top of the window.

I wonder if you have a different tab indent size?  The kernel uses 8
characters.  It might explain the few other odd indents if perhaps
you have it at 4 in your editor?

https://www.kernel.org/doc/html/v4.10/process/coding-style.html

Jonathan

> 
> > > +  * initialized.
> > > +  */
> > > + continue;
> > > +
> > >   memtier = set_node_memory_tier(node);
> > >   if (IS_ERR(memtier))
> > >   /*  
> >  
> 
> 




Re: [PATCHv2 1/3] uprobe: Add uretprobe syscall to speed up return probe

2024-04-04 Thread Jiri Olsa
On Wed, Apr 03, 2024 at 07:00:07PM -0700, Andrii Nakryiko wrote:

SNIP

> Check rt_sigreturn syscall (manpage at [0], for example).
> 
>sigreturn() exists only to allow the implementation of signal
>handlers.  It should never be called directly.  (Indeed, a simple
>sigreturn() wrapper in the GNU C library simply returns -1, with
>errno set to ENOSYS.)  Details of the arguments (if any) passed
>to sigreturn() vary depending on the architecture.  (On some
>architectures, such as x86-64, sigreturn() takes no arguments,
>since all of the information that it requires is available in the
>stack frame that was previously created by the kernel on the
>user-space stack.)
> 
> This is a very similar use case. Also, check its source code in
> arch/x86/kernel/signal_64.c. It sends SIGSEGV to the calling process
> on any sign of something not being right. It's exactly the same with
> sys_uretprobe.
> 
>   [0] https://man7.org/linux/man-pages/man2/sigreturn.2.html
> 
> > And the number of syscalls are limited resource.
> 
> We have almost 500 of them, it didn't seem like adding 1-2 for good
> reasons would be a problem. Can you please point to where the limits
> on syscalls as a resource are described? I'm curious to learn.
> 
> >
> > I'm actually not sure how much we need to care of it, but adding a new
> > syscall is worth to be discussed carefully because all of them are
> > user-space compatibility.
> 
> Absolutely, it's a good discussion to have.
> 
> >
> > > > > > Also, we should run syzkaller on this syscall. And if uretprobe is
> > > > >
> > > > > right, I'll check on syzkaller
> > > > >
> > > > > > set in the user function, what happen if the user function directly
> > > > > > calls this syscall? (maybe it consumes shadow stack?)
> > > > >
> > > > > the process should receive SIGILL if there's no pending uretprobe for
> > > > > the current task, or it will trigger uretprobe if there's one pending
> > > >
> > > > No, that is too aggressive and not safe. Since the syscall is exposed to
> > > > user program, it should return appropriate error code instead of SIGILL.
> > > >
> > >
> > > This is the way it is today with uretprobes even through interrupt.
> >
> > I doubt that the interrupt (exception) and syscall should be handled
> > differently. Especially, this exception is injected by uprobes but
> > syscall will be caused by itself. But syscall can be called from user
> > program (of course this works as sys_kill(self, SIGILL)).
> 
> Yep, I'd keep the behavior the same between uretprobes implemented
> through int3 and sys_uretprobe.

+1 

> 
> >
> > > E.g., it could happen that user process is using fibers and is
> > > replacing stack pointer without kernel realizing this, which will
> > > trigger some defensive checks in uretprobe handling code and kernel
> > > will send SIGILL because it can't support such cases. This is
> > > happening today already, and it works fine in practice (except for
> > > applications that manually change stack pointer, too bad, you can't
> > > trace them with uretprobes, unfortunately).
> >
> > OK, we at least need to document it.
> 
> +1, yep
> 
> >
> > >
> > > So I think it's absolutely adequate to have this behavior if the user
> > > process is *intentionally* abusing this API.
> >
> > Of course user expected that it is abusing. So at least we need to
> > add a document that this syscall number is reserved to uprobes and
> > user program must not use it.
> >
> 
> Totally agree about documenting this.

ok there's a man page on sigreturn.. do you think we should add a man page
for uretprobe, or can you think of some other place to document it?

> 
> > >
> > > > >
> > > > > but we could limit the syscall to be executed just from the 
> > > > > trampoline,
> > > > > that should prevent all the user space use cases, I'll do that in next
> > > > > version and add more tests for that
> > > >
> > > > Why not limit? :) The uprobe_handle_trampoline() expects it is called
> > > > only from the trampoline, so it is natural to check the caller address.
> > > > (and uprobe should know where is the trampoline)
> > > >
> > > > Since the syscall is always exposed to the user program, it should
> > > > - Do nothing and return an error unless it is properly called.
> > > > - check the prerequisites for operation strictly.
> > > > I concern that new system calls introduce vulnerabilities.
> > > >
> > >
> > > As Oleg and Jiri mentioned, this syscall can't harm kernel or other
> > > processes, only the process that is abusing the API. So any extra
> > > checks that would slow down this approach is an unnecessary overhead
> > > and complication that will never be useful in practice.
> >
> > I think at least it should check the caller address to ensure the
> > address is in the trampoline.
> > But anyway, uprobes itself can break the target process, so no one
> > might care if this system call breaks the process now.
> 
> If we already h

Re: [PATCH v10 09/14] x86/sgx: Implement async reclamation for cgroup

2024-04-04 Thread Huang, Kai
On Wed, 2024-03-27 at 17:22 -0700, Haitao Huang wrote:
>  
>  void sgx_cgroup_init(void)
>  {
> + sgx_cg_wq = alloc_workqueue("sgx_cg_wq", WQ_UNBOUND | WQ_FREEZABLE, 
> WQ_UNBOUND_MAX_ACTIVE);
> +
> + /* All Cgroups functionalities are disabled. */
> + if (WARN_ON(!sgx_cg_wq))
> + return;
> +

I don't think you should WARN(), because it's not a kernel bug or similar.  Just
print a message saying EPC cgroup is disabled and move on.

if (!sgx_cg_wq) {
pr_err("SGX EPC cgroup disabled: alloc_workqueue() failed.\n");
return;
}


Re: [PATCH] uprobes: reduce contention on uprobes_tree access

2024-04-04 Thread Jonthan Haslam
> > Things to note about the results:
> >
> > - The results are slightly variable so don't get too caught up on
> >   individual thread count - it's the trend that is important.
> > - In terms of throughput with this specific benchmark a *very* macro view
> >   is that the RW spinlock provides 40-60% more throughput than the
> >   spinlock.  The per-CPU RW semaphore provides in the order of 50-100%
> >   more throughput then the spinlock.
> > - This doesn't fully reflect the large reduction in latency that we have
> >   seen in application based measurements. However, it does demonstrate
> >   that even the trivial change of going to a RW spinlock provides
> >   significant benefits.
> 
> This is probably because trig-uprobe-nop creates a single uprobe that
> is triggered on many CPUs. While in production we have also *many*
> uprobes running on many CPUs. In this benchmark, besides contention on
> uprobes_treelock, we are also hammering on other per-uprobe locks
> (register_rwsem, also if you don't have [0] patch locally, there will
> be another filter lock taken each time, filter->rwlock). There is also
> atomic refcounting going on, which when you have the same uprobe
> across all CPUs at the same time will cause a bunch of cache line
> bouncing.
> 
> So yes, it's understandable that in practice in production you see an
> even larger effect of optimizing uprobe_treelock than in this
> micro-benchmark.
> 
>   [0] 
> https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git/commit/?h=probes/for-next&id=366f7afd3de31d3ce2f4cbff97c6c23b6aa6bcdf

Thanks for the reply and the thoughts on this Andrii. Yes, I do have the
filter->rwlock fix applied but, as you say, there are no doubt other
effects at play here as to be expected in such a synthetic workload. I'm
pleased with the outcomes though as they show a good result even if they
are at the lower end of what I expect.

The results also show that pursuing an RCU solution is definitely worth it
but that write penalty is brutal in the case of a full synchronize_rcu()!
Should be fun.

> > for num_threads in {1..20}
> > do
> > sudo ./bench -p $num_threads trig-uprobe-nop | grep Summary
> 
> just want to mention -a (affinity) option that you can pass a bench
> tool, it will pin each thread on its own CPU. It generally makes tests
> more uniform, eliminating CPU migrations variability.

Thanks for pointing that flag out!

Jon.

> 
> > done
> >
> >
> > spinlock
> >
> > Summary: hits1.453 ± 0.005M/s (  1.453M/prod)
> > Summary: hits2.087 ± 0.005M/s (  1.043M/prod)
> > Summary: hits2.701 ± 0.012M/s (  0.900M/prod)
> 
> I also wanted to point out that the first measurement (1.453M/s in
> this row) is total throughput across all threads, while value in
> parenthesis (0.900M/prod) is averaged throughput per each thread. So
> this M/prod value is the most interesting in this benchmark where we
> assess the effect of reducing contention.
> 
> > Summary: hits1.917 ± 0.011M/s (  0.479M/prod)
> > Summary: hits2.105 ± 0.003M/s (  0.421M/prod)
> > Summary: hits1.615 ± 0.006M/s (  0.269M/prod)
> 
> [...]