[PATCH v5 7/9] powerpc/pseries/vas: Reopen windows with DLPAR core add

2022-02-27 Thread Haren Myneni


VAS windows can be closed in the hypervisor due to lost credits
when the core is removed and the kernel gets fault for NX
requests on these inactive windows. If the NX requests are
issued on these inactive windows, OS gets page faults and the
paste failure will be returned to the user space. If the lost
credits are available later with core add, reopen these windows
and set them active. Later when the OS sees page faults on these
active windows, it creates mapping on the new paste address.
Then the user space can continue to use these windows and send
HW compression requests to NX successfully.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/pseries/vas.c | 91 +++-
 1 file changed, 90 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/vas.c 
b/arch/powerpc/platforms/pseries/vas.c
index a297720bcdae..96178dd58adf 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -565,6 +565,88 @@ static int __init get_vas_capabilities(u8 feat, enum 
vas_cop_feat_type type,
return 0;
 }
 
+/*
+ * VAS windows can be closed due to lost credits when the core is
+ * removed. So reopen them if credits are available due to DLPAR
+ * core add and set the window active status. When NX sees the page
+ * fault on the unmapped paste address, the kernel handles the fault
+ * by setting the remapping to new paste address if the window is
+ * active.
+ */
+static int reconfig_open_windows(struct vas_caps *vcaps, int creds)
+{
+   long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID};
+   struct vas_cop_feat_caps *caps = &vcaps->caps;
+   struct pseries_vas_window *win = NULL, *tmp;
+   int rc, mv_ents = 0;
+
+   /*
+* Nothing to do if there are no closed windows.
+*/
+   if (!vcaps->nr_close_wins)
+   return 0;
+
+   /*
+* For the core removal, the hypervisor reduces the credits
+* assigned to the LPAR and the kernel closes VAS windows
+* in the hypervisor depends on reduced credits. The kernel
+* uses LIFO (the last windows that are opened will be closed
+* first) and expects to open in the same order when credits
+* are available.
+* For example, 40 windows are closed when the LPAR lost 2 cores
+* (dedicated). If 1 core is added, this LPAR can have 20 more
+* credits. It means the kernel can reopen 20 windows. So move
+* 20 entries in the VAS windows lost and reopen next 20 windows.
+*/
+   if (vcaps->nr_close_wins > creds)
+   mv_ents = vcaps->nr_close_wins - creds;
+
+   list_for_each_entry_safe(win, tmp, &vcaps->list, win_list) {
+   if (!mv_ents)
+   break;
+
+   mv_ents--;
+   }
+
+   list_for_each_entry_safe_from(win, tmp, &vcaps->list, win_list) {
+   /*
+* Nothing to do on this window if it is not closed
+* with VAS_WIN_NO_CRED_CLOSE
+*/
+   if (!(win->vas_win.status & VAS_WIN_NO_CRED_CLOSE))
+   continue;
+
+   rc = allocate_setup_window(win, (u64 *)&domain[0],
+  caps->win_type);
+   if (rc)
+   return rc;
+
+   rc = h_modify_vas_window(win);
+   if (rc)
+   goto out;
+
+   mutex_lock(&win->vas_win.task_ref.mmap_mutex);
+   /*
+* Set window status to active
+*/
+   win->vas_win.status &= ~VAS_WIN_NO_CRED_CLOSE;
+   mutex_unlock(&win->vas_win.task_ref.mmap_mutex);
+   win->win_type = caps->win_type;
+   if (!--vcaps->nr_close_wins)
+   break;
+   }
+
+   return 0;
+out:
+   /*
+* Window modify HCALL failed. So close the window to the
+* hypervisor and return.
+*/
+   free_irq_setup(win);
+   h_deallocate_vas_window(win->vas_win.winid);
+   return rc;
+}
+
 /*
  * The hypervisor reduces the available credits if the LPAR lost core. It
  * means the excessive windows should not be active and the user space
@@ -673,7 +755,14 @@ static int vas_reconfig_capabilties(u8 type)
 * closed / reopened. Hold the vas_pseries_mutex so that the
 * the user space can not open new windows.
 */
-   if (old_nr_creds >  new_nr_creds) {
+   if (old_nr_creds <  new_nr_creds) {
+   /*
+* If the existing target credits is less than the new
+* target, reopen windows if they are closed due to
+* the previous DLPAR (core removal).
+*/
+   rc = reconfig_open_windows(vcaps, new_nr_creds - old_nr_creds);
+   } else {
/*
 * # active windows is more than new LPAR available
 * cre

[PATCH v5 4/9] powerpc/vas: Return paste instruction failure if no active window

2022-02-27 Thread Haren Myneni


The VAS window may not be active if the system looses credits and
the NX generates page fault when it receives request on unmap
paste address.

The kernel handles the fault by remap new paste address if the
window is active again, Otherwise return the paste instruction
failure if the executed instruction that caused the fault was
a paste.

Signed-off-by: Nicholas Piggin 
Signed-off-by: Haren Myneni 
---
 arch/powerpc/include/asm/ppc-opcode.h   |  2 +
 arch/powerpc/platforms/book3s/vas-api.c | 55 -
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index 9675303b724e..82f1f0041c6f 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -262,6 +262,8 @@
 #define PPC_INST_MFSPR_PVR 0x7c1f42a6
 #define PPC_INST_MFSPR_PVR_MASK0xfc1e
 #define PPC_INST_MTMSRD0x7c000164
+#define PPC_INST_PASTE 0x7c20070d
+#define PPC_INST_PASTE_MASK0xfc2007ff
 #define PPC_INST_POPCNTB   0x7cf4
 #define PPC_INST_POPCNTB_MASK  0xfc0007fe
 #define PPC_INST_RFEBB 0x4c000124
diff --git a/arch/powerpc/platforms/book3s/vas-api.c 
b/arch/powerpc/platforms/book3s/vas-api.c
index f359e7b2bf90..f3e421511ea6 100644
--- a/arch/powerpc/platforms/book3s/vas-api.c
+++ b/arch/powerpc/platforms/book3s/vas-api.c
@@ -351,6 +351,41 @@ static int coproc_release(struct inode *inode, struct file 
*fp)
return 0;
 }
 
+/*
+ * If the executed instruction that caused the fault was a paste, then
+ * clear regs CR0[EQ], advance NIP, and return 0. Else return error code.
+ */
+static int do_fail_paste(void)
+{
+   struct pt_regs *regs = current->thread.regs;
+   u32 instword;
+
+   if (WARN_ON_ONCE(!regs))
+   return -EINVAL;
+
+   if (WARN_ON_ONCE(!user_mode(regs)))
+   return -EINVAL;
+
+   /*
+* If we couldn't translate the instruction, the driver should
+* return success without handling the fault, it will be retried
+* or the instruction fetch will fault.
+*/
+   if (get_user(instword, (u32 __user *)(regs->nip)))
+   return -EAGAIN;
+
+   /*
+* Not a paste instruction, driver may fail the fault.
+*/
+   if ((instword & PPC_INST_PASTE_MASK) != PPC_INST_PASTE)
+   return -ENOENT;
+
+   regs->ccr &= ~0xe000;   /* Clear CR0[0-2] to fail paste */
+   regs_add_return_ip(regs, 4);/* Emulate the paste */
+
+   return 0;
+}
+
 /*
  * This fault handler is invoked when the core generates page fault on
  * the paste address. Happens if the kernel closes window in hypervisor
@@ -408,9 +443,27 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf)
}
mutex_unlock(&txwin->task_ref.mmap_mutex);
 
-   return VM_FAULT_SIGBUS;
+   /*
+* Received this fault due to closing the actual window.
+* It can happen during migration or lost credits.
+* Since no mapping, return the paste instruction failure
+* to the user space.
+*/
+   ret = do_fail_paste();
+   /*
+* The user space can retry several times until success (needed
+* for migration) or should fallback to SW compression or
+* manage with the existing open windows if available.
+* Looking at sysfs interface, it can determine whether these
+* failures are coming during migration or core removal:
+* nr_used_credits > nr_total_credits when lost credits
+*/
+   if (!ret || (ret == -EAGAIN))
+   return VM_FAULT_NOPAGE;
 
+   return VM_FAULT_SIGBUS;
 }
+
 static const struct vm_operations_struct vas_vm_ops = {
.fault = vas_mmap_fault,
 };
-- 
2.27.0




[PATCH v4 3/3] powerpc/pseries/vas: Add VAS migration handler

2022-02-27 Thread Haren Myneni


Since the VAS windows belong to the VAS hardware resource, the
hypervisor expects the partition to close them on source partition
and reopen them after the partition migrated on the destination
machine.

This handler is called before pseries_suspend() to close these
windows and again invoked after migration. All active windows
for both default and QoS types will be closed and mark them
inactive and reopened after migration with this handler.
During the migration, the user space receives paste instruction
failure if it issues copy/paste on these inactive windows.

The current migration implementation does not freeze the user
space and applications can continue to open VAS windows while
migration is in progress. So when the migration_in_progress flag
is set, VAS open window API returns -EBUSY.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/pseries/mobility.c |  5 ++
 arch/powerpc/platforms/pseries/vas.c  | 98 ++-
 arch/powerpc/platforms/pseries/vas.h  |  6 ++
 3 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/mobility.c 
b/arch/powerpc/platforms/pseries/mobility.c
index 85033f392c78..70004243e25e 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include "pseries.h"
+#include "vas.h"   /* vas_migration_handler() */
 #include "../../kernel/cacheinfo.h"
 
 static struct kobject *mobility_kobj;
@@ -669,12 +670,16 @@ static int pseries_migrate_partition(u64 handle)
if (ret)
return ret;
 
+   vas_migration_handler(VAS_SUSPEND);
+
ret = pseries_suspend(handle);
if (ret == 0)
post_mobility_fixup();
else
pseries_cancel_migration(handle, ret);
 
+   vas_migration_handler(VAS_RESUME);
+
return ret;
 }
 
diff --git a/arch/powerpc/platforms/pseries/vas.c 
b/arch/powerpc/platforms/pseries/vas.c
index fbcf311da0ec..1f59d78c77a1 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -30,6 +30,7 @@ static struct hv_vas_cop_feat_caps hv_cop_caps;
 
 static struct vas_caps vascaps[VAS_MAX_FEAT_TYPE];
 static DEFINE_MUTEX(vas_pseries_mutex);
+static bool migration_in_progress;
 
 static long hcall_return_busy_check(long rc)
 {
@@ -356,7 +357,10 @@ static struct vas_window *vas_allocate_window(int vas_id, 
u64 flags,
 * same fault IRQ is not freed by the OS before.
 */
mutex_lock(&vas_pseries_mutex);
-   rc = allocate_setup_window(txwin, (u64 *)&domain[0],
+   if (migration_in_progress)
+   rc = -EBUSY;
+   else
+   rc = allocate_setup_window(txwin, (u64 *)&domain[0],
   cop_feat_caps->win_type);
mutex_unlock(&vas_pseries_mutex);
if (rc)
@@ -869,6 +873,98 @@ static struct notifier_block pseries_vas_nb = {
.notifier_call = pseries_vas_notifier,
 };
 
+/*
+ * For LPM, all windows have to be closed on the source partition
+ * before migration and reopen them on the destination partition
+ * after migration. So closing windows during suspend and
+ * reopen them during resume.
+ */
+int vas_migration_handler(int action)
+{
+   struct vas_cop_feat_caps *caps;
+   int old_nr_creds, new_nr_creds = 0;
+   struct vas_caps *vcaps;
+   int i, rc = 0;
+
+   /*
+* NX-GZIP is not enabled. Nothing to do for migration.
+*/
+   if (!copypaste_feat)
+   return rc;
+
+   mutex_lock(&vas_pseries_mutex);
+
+   if (action == VAS_SUSPEND)
+   migration_in_progress = true;
+   else
+   migration_in_progress = false;
+
+   for (i = 0; i < VAS_MAX_FEAT_TYPE; i++) {
+   vcaps = &vascaps[i];
+   caps = &vcaps->caps;
+   old_nr_creds = atomic_read(&caps->nr_total_credits);
+
+   rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES,
+ vcaps->feat,
+ (u64)virt_to_phys(&hv_cop_caps));
+   if (!rc) {
+   new_nr_creds = 
be16_to_cpu(hv_cop_caps.target_lpar_creds);
+   /*
+* Should not happen. But incase print messages, close
+* all windows in the list during suspend and reopen
+* windows based on new lpar_creds on the destination
+* system.
+*/
+   if (old_nr_creds != new_nr_creds) {
+   pr_err("Target credits mismatch with the 
hypervisor\n");
+   pr_err("state(%d): lpar creds: %d HV lpar 
creds: %d\n",
+   action, old_nr_creds, new_nr_creds);
+   pr_err("Used creds: %d, Active creds: %d\n",
+

[PATCH v4 2/3] powerpc/pseries/vas: Modify reconfig open/close functions for migration

2022-02-27 Thread Haren Myneni


VAS is a hardware engine stays on the chip. So when the partition
migrates, all VAS windows on the source system have to be closed
and reopen them on the destination after migration.

The kernel has to consider both DLPAR CPU and migration events to
take action on VAS windows. So using VAS_WIN_NO_CRED_CLOSE and
VAS_WIN_MIGRATE_CLOSE status bits and windows will be reopened
after migration only after both status bits are cleared.

This patch make changes to the current reconfig_open/close_windows
functions to support migration:
- Set VAS_WIN_MIGRATE_CLOSE to the window status when closes and
  reopen windows with the same status during resume.
- Continue to close all windows even if deallocate HCALL failed
  (should not happen) since no way to stop migration with the
  current LPM implementation.
- If the DLPAR CPU event happens while migration is in progress,
  set VAS_WIN_NO_CRED_CLOSE to the window status. Close window
  happens with the first event (migration or DLPAR) and Reopen
  window happens only with the last event (migration or DLPAR).

Signed-off-by: Haren Myneni 
---
 arch/powerpc/include/asm/vas.h   |  2 +
 arch/powerpc/platforms/pseries/vas.c | 88 ++--
 2 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index 6baf7b9ffed4..83afcb6c194b 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -36,6 +36,8 @@
/* vas mmap() */
 /* Window is closed in the hypervisor due to lost credit */
 #define VAS_WIN_NO_CRED_CLOSE  0x0001
+/* Window is closed due to migration */
+#define VAS_WIN_MIGRATE_CLOSE  0x0002
 
 /*
  * Get/Set bit fields
diff --git a/arch/powerpc/platforms/pseries/vas.c 
b/arch/powerpc/platforms/pseries/vas.c
index 3bb219f54806..fbcf311da0ec 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -457,11 +457,12 @@ static int vas_deallocate_window(struct vas_window *vwin)
mutex_lock(&vas_pseries_mutex);
/*
 * VAS window is already closed in the hypervisor when
-* lost the credit. So just remove the entry from
-* the list, remove task references and free vas_window
+* lost the credit or with migration. So just remove the entry
+* from the list, remove task references and free vas_window
 * struct.
 */
-   if (win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) {
+   if (!(win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) &&
+   !(win->vas_win.status & VAS_WIN_MIGRATE_CLOSE)) {
rc = deallocate_free_window(win);
if (rc) {
mutex_unlock(&vas_pseries_mutex);
@@ -578,12 +579,14 @@ static int __init get_vas_capabilities(u8 feat, enum 
vas_cop_feat_type type,
  * by setting the remapping to new paste address if the window is
  * active.
  */
-static int reconfig_open_windows(struct vas_caps *vcaps, int creds)
+static int reconfig_open_windows(struct vas_caps *vcaps, int creds,
+bool migrate)
 {
long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID};
struct vas_cop_feat_caps *caps = &vcaps->caps;
struct pseries_vas_window *win = NULL, *tmp;
int rc, mv_ents = 0;
+   int flag;
 
/*
 * Nothing to do if there are no closed windows.
@@ -602,8 +605,10 @@ static int reconfig_open_windows(struct vas_caps *vcaps, 
int creds)
 * (dedicated). If 1 core is added, this LPAR can have 20 more
 * credits. It means the kernel can reopen 20 windows. So move
 * 20 entries in the VAS windows lost and reopen next 20 windows.
+* For partition migration, reopen all windows that are closed
+* during resume.
 */
-   if (vcaps->nr_close_wins > creds)
+   if ((vcaps->nr_close_wins > creds) && !migrate)
mv_ents = vcaps->nr_close_wins - creds;
 
list_for_each_entry_safe(win, tmp, &vcaps->list, win_list) {
@@ -613,12 +618,35 @@ static int reconfig_open_windows(struct vas_caps *vcaps, 
int creds)
mv_ents--;
}
 
+   /*
+* Open windows if they are closed only with migration or
+* DLPAR (lost credit) before.
+*/
+   if (migrate)
+   flag = VAS_WIN_MIGRATE_CLOSE;
+   else
+   flag = VAS_WIN_NO_CRED_CLOSE;
+
list_for_each_entry_safe_from(win, tmp, &vcaps->list, win_list) {
+   /*
+* This window is closed with DLPAR and migration events.
+* So reopen the window with the last event.
+* The user space is not suspended with the current
+* migration notifier. So the user space can issue DLPAR
+* CPU hotplug while migration in progress. In this case
+* this window will be opened with the last event.
+*/
+  

[PATCH v4 1/3] powerpc/pseries/vas: Define global hv_cop_caps struct

2022-02-27 Thread Haren Myneni


The coprocessor capabilities struct is used to get default and
QoS capabilities from the hypervisor during init, DLPAR event and
migration. So instead of allocating this struct for each event,
define global struct and reuse it which allows the migration code
to avoid adding an error path.

Also disable copy/paste feature flag if any capabilities HCALL
is failed.

Signed-off-by: Haren Myneni 
Acked-by: Nathan Lynch 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/platforms/pseries/vas.c | 47 
 1 file changed, 20 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/vas.c 
b/arch/powerpc/platforms/pseries/vas.c
index 591c7597db5a..3bb219f54806 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -26,6 +26,7 @@
 
 static struct vas_all_caps caps_all;
 static bool copypaste_feat;
+static struct hv_vas_cop_feat_caps hv_cop_caps;
 
 static struct vas_caps vascaps[VAS_MAX_FEAT_TYPE];
 static DEFINE_MUTEX(vas_pseries_mutex);
@@ -724,7 +725,6 @@ static int reconfig_close_windows(struct vas_caps *vcap, 
int excess_creds)
  */
 int vas_reconfig_capabilties(u8 type)
 {
-   struct hv_vas_cop_feat_caps *hv_caps;
struct vas_cop_feat_caps *caps;
int old_nr_creds, new_nr_creds;
struct vas_caps *vcaps;
@@ -738,17 +738,13 @@ int vas_reconfig_capabilties(u8 type)
vcaps = &vascaps[type];
caps = &vcaps->caps;
 
-   hv_caps = kmalloc(sizeof(*hv_caps), GFP_KERNEL);
-   if (!hv_caps)
-   return -ENOMEM;
-
mutex_lock(&vas_pseries_mutex);
rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, vcaps->feat,
- (u64)virt_to_phys(hv_caps));
+ (u64)virt_to_phys(&hv_cop_caps));
if (rc)
goto out;
 
-   new_nr_creds = be16_to_cpu(hv_caps->target_lpar_creds);
+   new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds);
 
old_nr_creds = atomic_read(&caps->nr_total_credits);
 
@@ -780,7 +776,6 @@ int vas_reconfig_capabilties(u8 type)
 
 out:
mutex_unlock(&vas_pseries_mutex);
-   kfree(hv_caps);
return rc;
 }
 /*
@@ -822,9 +817,8 @@ static struct notifier_block pseries_vas_nb = {
 
 static int __init pseries_vas_init(void)
 {
-   struct hv_vas_cop_feat_caps *hv_cop_caps;
struct hv_vas_all_caps *hv_caps;
-   int rc;
+   int rc = 0;
 
/*
 * Linux supports user space COPY/PASTE only with Radix
@@ -850,38 +844,37 @@ static int __init pseries_vas_init(void)
 
sysfs_pseries_vas_init(&caps_all);
 
-   hv_cop_caps = kmalloc(sizeof(*hv_cop_caps), GFP_KERNEL);
-   if (!hv_cop_caps) {
-   rc = -ENOMEM;
-   goto out;
-   }
/*
 * QOS capabilities available
 */
if (caps_all.feat_type & VAS_GZIP_QOS_FEAT_BIT) {
rc = get_vas_capabilities(VAS_GZIP_QOS_FEAT,
- VAS_GZIP_QOS_FEAT_TYPE, hv_cop_caps);
+ VAS_GZIP_QOS_FEAT_TYPE, &hv_cop_caps);
 
if (rc)
-   goto out_cop;
+   goto out;
}
/*
 * Default capabilities available
 */
-   if (caps_all.feat_type & VAS_GZIP_DEF_FEAT_BIT) {
+   if (caps_all.feat_type & VAS_GZIP_DEF_FEAT_BIT)
rc = get_vas_capabilities(VAS_GZIP_DEF_FEAT,
- VAS_GZIP_DEF_FEAT_TYPE, hv_cop_caps);
-   if (rc)
-   goto out_cop;
-   }
+ VAS_GZIP_DEF_FEAT_TYPE, &hv_cop_caps);
 
-   if (copypaste_feat && firmware_has_feature(FW_FEATURE_LPAR))
-   of_reconfig_notifier_register(&pseries_vas_nb);
+   if (!rc && copypaste_feat) {
+   if (firmware_has_feature(FW_FEATURE_LPAR))
+   of_reconfig_notifier_register(&pseries_vas_nb);
 
-   pr_info("GZIP feature is available\n");
+   pr_info("GZIP feature is available\n");
+   } else {
+   /*
+* Should not happen, but only when get default
+* capabilities HCALL failed. So disable copy paste
+* feature.
+*/
+   copypaste_feat = false;
+   }
 
-out_cop:
-   kfree(hv_cop_caps);
 out:
kfree(hv_caps);
return rc;
-- 
2.27.0




[PATCH v5 7/9] powerpc/pseries/vas: Reopen windows with DLPAR core add

2022-02-27 Thread Haren Myneni


VAS windows can be closed in the hypervisor due to lost credits
when the core is removed and the kernel gets fault for NX
requests on these inactive windows. If the NX requests are
issued on these inactive windows, OS gets page faults and the
paste failure will be returned to the user space. If the lost
credits are available later with core add, reopen these windows
and set them active. Later when the OS sees page faults on these
active windows, it creates mapping on the new paste address.
Then the user space can continue to use these windows and send
HW compression requests to NX successfully.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/pseries/vas.c | 91 +++-
 1 file changed, 90 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/vas.c 
b/arch/powerpc/platforms/pseries/vas.c
index a297720bcdae..96178dd58adf 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -565,6 +565,88 @@ static int __init get_vas_capabilities(u8 feat, enum 
vas_cop_feat_type type,
return 0;
 }
 
+/*
+ * VAS windows can be closed due to lost credits when the core is
+ * removed. So reopen them if credits are available due to DLPAR
+ * core add and set the window active status. When NX sees the page
+ * fault on the unmapped paste address, the kernel handles the fault
+ * by setting the remapping to new paste address if the window is
+ * active.
+ */
+static int reconfig_open_windows(struct vas_caps *vcaps, int creds)
+{
+   long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID};
+   struct vas_cop_feat_caps *caps = &vcaps->caps;
+   struct pseries_vas_window *win = NULL, *tmp;
+   int rc, mv_ents = 0;
+
+   /*
+* Nothing to do if there are no closed windows.
+*/
+   if (!vcaps->nr_close_wins)
+   return 0;
+
+   /*
+* For the core removal, the hypervisor reduces the credits
+* assigned to the LPAR and the kernel closes VAS windows
+* in the hypervisor depends on reduced credits. The kernel
+* uses LIFO (the last windows that are opened will be closed
+* first) and expects to open in the same order when credits
+* are available.
+* For example, 40 windows are closed when the LPAR lost 2 cores
+* (dedicated). If 1 core is added, this LPAR can have 20 more
+* credits. It means the kernel can reopen 20 windows. So move
+* 20 entries in the VAS windows lost and reopen next 20 windows.
+*/
+   if (vcaps->nr_close_wins > creds)
+   mv_ents = vcaps->nr_close_wins - creds;
+
+   list_for_each_entry_safe(win, tmp, &vcaps->list, win_list) {
+   if (!mv_ents)
+   break;
+
+   mv_ents--;
+   }
+
+   list_for_each_entry_safe_from(win, tmp, &vcaps->list, win_list) {
+   /*
+* Nothing to do on this window if it is not closed
+* with VAS_WIN_NO_CRED_CLOSE
+*/
+   if (!(win->vas_win.status & VAS_WIN_NO_CRED_CLOSE))
+   continue;
+
+   rc = allocate_setup_window(win, (u64 *)&domain[0],
+  caps->win_type);
+   if (rc)
+   return rc;
+
+   rc = h_modify_vas_window(win);
+   if (rc)
+   goto out;
+
+   mutex_lock(&win->vas_win.task_ref.mmap_mutex);
+   /*
+* Set window status to active
+*/
+   win->vas_win.status &= ~VAS_WIN_NO_CRED_CLOSE;
+   mutex_unlock(&win->vas_win.task_ref.mmap_mutex);
+   win->win_type = caps->win_type;
+   if (!--vcaps->nr_close_wins)
+   break;
+   }
+
+   return 0;
+out:
+   /*
+* Window modify HCALL failed. So close the window to the
+* hypervisor and return.
+*/
+   free_irq_setup(win);
+   h_deallocate_vas_window(win->vas_win.winid);
+   return rc;
+}
+
 /*
  * The hypervisor reduces the available credits if the LPAR lost core. It
  * means the excessive windows should not be active and the user space
@@ -673,7 +755,14 @@ static int vas_reconfig_capabilties(u8 type)
 * closed / reopened. Hold the vas_pseries_mutex so that the
 * the user space can not open new windows.
 */
-   if (old_nr_creds >  new_nr_creds) {
+   if (old_nr_creds <  new_nr_creds) {
+   /*
+* If the existing target credits is less than the new
+* target, reopen windows if they are closed due to
+* the previous DLPAR (core removal).
+*/
+   rc = reconfig_open_windows(vcaps, new_nr_creds - old_nr_creds);
+   } else {
/*
 * # active windows is more than new LPAR available
 * cre

[PATCH v4 0/3] powerpc/pseries/vas: VAS/NXGZIP support with LPM

2022-02-27 Thread Haren Myneni


Virtual Accelerator Switchboard (VAS) is an engine stays on the
chip. So all windows opened on a specific engine belongs to VAS
the chip. The hypervisor expects the partition to close all
active windows on the sources system and reopen them after
migration on the destination machine.

This patch series adds VAS support with the partition migration.
When the migration initiates, the VAS migration handler will be
invoked before pseries_suspend() to close all active windows and
mark them in-active with VAS_WIN_MIGRATE_CLOSE status. Whereas
this migration handler is called after migration to reopen all
windows which has VAS_WIN_MIGRATE_CLOSE status and make them
active again. The user space gets paste instruction failure
when it sends requests on these in-active windows.

These patches depend on VAS/DLPAR support patch series

Changes in v2:
- Added new patch "Define global hv_cop_caps struct" to eliminate
  memory allocation failure during migration (suggestion by
  Nathan Lynch)

Changes in v3:
- Rebase on 5.17-rc4
- Naming changes for VAS capability struct elemets based on the V4 DLPAR
  support patch series.

Changes in v4:
- Rebase on 5.17-rc5
- Include migration_in_progress enable patch in "VAS migration handler"
  and other changes as suggested by Nicholas Piggin 

Haren Myneni (3):
  powerpc/pseries/vas: Define global hv_cop_caps struct
  powerpc/pseries/vas: Modify reconfig open/close functions for
migration
  powerpc/pseries/vas: Add VAS migration handler

 arch/powerpc/include/asm/vas.h|   2 +
 arch/powerpc/platforms/pseries/mobility.c |   5 +
 arch/powerpc/platforms/pseries/vas.c  | 233 +-
 arch/powerpc/platforms/pseries/vas.h  |   6 +
 4 files changed, 201 insertions(+), 45 deletions(-)

-- 
2.27.0




[PATCH v5 9/9] powerpc/pseries/vas: Add 'update_total_credits' entry for QoS capabilities

2022-02-27 Thread Haren Myneni


pseries supports two types of credits - Default (uses normal priority
FIFO) and Qality of service (QoS uses high priority FIFO). The user
decides the number of QoS credits and sets this value with HMC
interface. The total credits for QoS capabilities can be changed
dynamically with HMC interface which invokes drmgr to communicate
to the kernel.

This patch creats 'update_total_credits' entry for QoS capabilities
so that drmgr command can write the new target QoS credits in sysfs.
Instead of using this value, the kernel gets the new QoS capabilities
from the hypervisor whenever update_total_credits is updated to make
sure sync with the QoS target credits in the hypervisor.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/pseries/vas-sysfs.c | 54 +++---
 arch/powerpc/platforms/pseries/vas.c   |  2 +-
 arch/powerpc/platforms/pseries/vas.h   |  1 +
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/vas-sysfs.c 
b/arch/powerpc/platforms/pseries/vas-sysfs.c
index e24d3edb3021..4a7fcde5afc0 100644
--- a/arch/powerpc/platforms/pseries/vas-sysfs.c
+++ b/arch/powerpc/platforms/pseries/vas-sysfs.c
@@ -25,6 +25,27 @@ struct vas_caps_entry {
 
 #define to_caps_entry(entry) container_of(entry, struct vas_caps_entry, kobj)
 
+/*
+ * This function is used to get the notification from the drmgr when
+ * QoS credits are changed. Though receiving the target total QoS
+ * credits here, get the official QoS capabilities from the hypervisor.
+ */
+static ssize_t update_total_credits_trigger(struct vas_cop_feat_caps *caps,
+   const char *buf, size_t count)
+{
+   int err;
+   u16 creds;
+
+   err = kstrtou16(buf, 0, &creds);
+   if (!err)
+   err = vas_reconfig_capabilties(caps->win_type);
+
+   if (err)
+   return -EINVAL;
+
+   return count;
+}
+
 #define sysfs_caps_entry_read(_name)   \
 static ssize_t _name##_show(struct vas_cop_feat_caps *caps, char *buf) 
\
 {  \
@@ -63,17 +84,29 @@ struct vas_sysfs_entry {
  * changed dynamically by the user.
  * /sys/devices/vas/vas0/gzip/qos_capabilities/nr_used_credits
  * Number of credits used by the user space.
+ * /sys/devices/vas/vas0/gzip/qos_capabilities/update_total_credits
+ * Update total QoS credits dynamically
  */
 
 VAS_ATTR_RO(nr_total_credits);
 VAS_ATTR_RO(nr_used_credits);
 
-static struct attribute *vas_capab_attrs[] = {
+static struct vas_sysfs_entry update_total_credits_attribute =
+   __ATTR(update_total_credits, 0200, NULL, update_total_credits_trigger);
+
+static struct attribute *vas_def_capab_attrs[] = {
&nr_total_credits_attribute.attr,
&nr_used_credits_attribute.attr,
NULL,
 };
 
+static struct attribute *vas_qos_capab_attrs[] = {
+   &nr_total_credits_attribute.attr,
+   &nr_used_credits_attribute.attr,
+   &update_total_credits_attribute.attr,
+   NULL,
+};
+
 static ssize_t vas_type_show(struct kobject *kobj, struct attribute *attr,
 char *buf)
 {
@@ -118,19 +151,29 @@ static const struct sysfs_ops vas_sysfs_ops = {
.store  =   vas_type_store,
 };
 
-static struct kobj_type vas_attr_type = {
+static struct kobj_type vas_def_attr_type = {
.release=   vas_type_release,
.sysfs_ops  =   &vas_sysfs_ops,
-   .default_attrs  =   vas_capab_attrs,
+   .default_attrs  =   vas_def_capab_attrs,
 };
 
-static char *vas_caps_kobj_name(struct vas_cop_feat_caps *caps,
+static struct kobj_type vas_qos_attr_type = {
+   .release=   vas_type_release,
+   .sysfs_ops  =   &vas_sysfs_ops,
+   .default_attrs  =   vas_qos_capab_attrs,
+};
+
+static char *vas_caps_kobj_name(struct vas_caps_entry *centry,
struct kobject **kobj)
 {
+   struct vas_cop_feat_caps *caps = centry->caps;
+
if (caps->descriptor == VAS_GZIP_QOS_CAPABILITIES) {
+   kobject_init(¢ry->kobj, &vas_qos_attr_type);
*kobj = gzip_caps_kobj;
return "qos_capabilities";
} else if (caps->descriptor == VAS_GZIP_DEFAULT_CAPABILITIES) {
+   kobject_init(¢ry->kobj, &vas_def_attr_type);
*kobj = gzip_caps_kobj;
return "default_capabilities";
} else
@@ -152,9 +195,8 @@ int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps)
if (!centry)
return -ENOMEM;
 
-   kobject_init(¢ry->kobj, &vas_attr_type);
centry->caps = caps;
-   name  = vas_caps_kobj_name(caps, &kobj);
+   name  = vas_caps_kobj_name(centry, &kobj);
 
if (kobj) {
ret = kobject_add(¢ry->kobj, kobj, "%s", name);
diff --git a/arch/powerpc/platform

[PATCH v5 8/9] powerpc/pseries/vas: sysfs interface to export capabilities

2022-02-27 Thread Haren Myneni


The hypervisor provides the available VAS GZIP capabilities such
as default or QoS window type and the target available credits in
each type. This patch creates sysfs entries and exports the target,
used and the available credits for each feature.

This interface can be used by the user space to determine the credits
usage or to set the target credits in the case of QoS type (for DLPAR).

/sys/devices/vas/vas0/gzip/default_capabilities (default GZIP capabilities)
nr_total_credits /* Total credits available. Can be
 /* changed with DLPAR operation */
nr_used_credits  /* Used credits */

/sys/devices/vas/vas0/gzip/qos_capabilities (QoS GZIP capabilities)
nr_total_credits
nr_used_credits

Signed-off-by: Haren Myneni 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/platforms/pseries/Makefile|   2 +-
 arch/powerpc/platforms/pseries/vas-sysfs.c | 226 +
 arch/powerpc/platforms/pseries/vas.c   |   6 +
 arch/powerpc/platforms/pseries/vas.h   |   6 +
 4 files changed, 239 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/platforms/pseries/vas-sysfs.c

diff --git a/arch/powerpc/platforms/pseries/Makefile 
b/arch/powerpc/platforms/pseries/Makefile
index ee60b59024b4..29b522d2c755 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -29,6 +29,6 @@ obj-$(CONFIG_PPC_SVM) += svm.o
 obj-$(CONFIG_FA_DUMP)  += rtas-fadump.o
 
 obj-$(CONFIG_SUSPEND)  += suspend.o
-obj-$(CONFIG_PPC_VAS)  += vas.o
+obj-$(CONFIG_PPC_VAS)  += vas.o vas-sysfs.o
 
 obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o
diff --git a/arch/powerpc/platforms/pseries/vas-sysfs.c 
b/arch/powerpc/platforms/pseries/vas-sysfs.c
new file mode 100644
index ..e24d3edb3021
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/vas-sysfs.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2022-23 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "vas.h"
+
+#ifdef CONFIG_SYSFS
+static struct kobject *pseries_vas_kobj;
+static struct kobject *gzip_caps_kobj;
+
+struct vas_caps_entry {
+   struct kobject kobj;
+   struct vas_cop_feat_caps *caps;
+};
+
+#define to_caps_entry(entry) container_of(entry, struct vas_caps_entry, kobj)
+
+#define sysfs_caps_entry_read(_name)   \
+static ssize_t _name##_show(struct vas_cop_feat_caps *caps, char *buf) 
\
+{  \
+   return sprintf(buf, "%d\n", atomic_read(&caps->_name)); \
+}
+
+struct vas_sysfs_entry {
+   struct attribute attr;
+   ssize_t (*show)(struct vas_cop_feat_caps *, char *);
+   ssize_t (*store)(struct vas_cop_feat_caps *, const char *, size_t);
+};
+
+#define VAS_ATTR_RO(_name) \
+   sysfs_caps_entry_read(_name);   \
+   static struct vas_sysfs_entry _name##_attribute = __ATTR(_name, \
+   0444, _name##_show, NULL);
+
+/*
+ * Create sysfs interface:
+ * /sys/devices/vas/vas0/gzip/default_capabilities
+ * This directory contains the following VAS GZIP capabilities
+ * for the defaule credit type.
+ * /sys/devices/vas/vas0/gzip/default_capabilities/nr_total_credits
+ * Total number of default credits assigned to the LPAR which
+ * can be changed with DLPAR operation.
+ * /sys/devices/vas/vas0/gzip/default_capabilities/nr_used_credits
+ * Number of credits used by the user space. One credit will
+ * be assigned for each window open.
+ *
+ * /sys/devices/vas/vas0/gzip/qos_capabilities
+ * This directory contains the following VAS GZIP capabilities
+ * for the Quality of Service (QoS) credit type.
+ * /sys/devices/vas/vas0/gzip/qos_capabilities/nr_total_credits
+ * Total number of QoS credits assigned to the LPAR. The user
+ * has to define this value using HMC interface. It can be
+ * changed dynamically by the user.
+ * /sys/devices/vas/vas0/gzip/qos_capabilities/nr_used_credits
+ * Number of credits used by the user space.
+ */
+
+VAS_ATTR_RO(nr_total_credits);
+VAS_ATTR_RO(nr_used_credits);
+
+static struct attribute *vas_capab_attrs[] = {
+   &nr_total_credits_attribute.attr,
+   &nr_used_credits_attribute.attr,
+   NULL,
+};
+
+static ssize_t vas_type_show(struct kobject *kobj, struct attribute *attr,
+char *buf)
+{
+   struct vas_caps_entry *centry;
+   struct vas_cop_feat_caps *caps;
+   struct vas_sysfs_entry *entry;
+
+   centry = to_caps_entry(kobj);
+   caps = centry->caps;
+   entry = container_of(attr, struct vas_sysfs_entry, attr);
+
+   if (!entry->show)
+   return -EIO;
+
+   return entry->show(caps, buf);
+}
+
+static ssize_t vas_type_store(struct kobject *kobj, struct attribute *attr,
+  

[PATCH v5 6/9] powerpc/pseries/vas: Close windows with DLPAR core removal

2022-02-27 Thread Haren Myneni


The hypervisor assigns vas credits (windows) for each LPAR based
on the number of cores configured in that system. The OS is
expected to release credits when cores are removed, and may
allocate more when cores are added. So there is a possibility of
using excessive credits (windows) in the LPAR and the hypervisor
expects the system to close the excessive windows so that NX load
can be equally distributed across all LPARs in the system.

When the OS closes the excessive windows in the hypervisor,
it sets the window status inactive and invalidates window
virtual address mapping. The user space receives paste instruction
failure if any NX requests are issued on the inactive window.
Then the user space can use with the available open windows or
retry NX requests until this window active again.

This patch also adds the notifier for core removal/add to close
windows in the hypervisor if the system lost credits (core
removal) and reopen windows in the hypervisor when the previously
lost credits are available.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/include/asm/vas.h   |   2 +
 arch/powerpc/platforms/pseries/vas.c | 207 +--
 arch/powerpc/platforms/pseries/vas.h |   3 +
 3 files changed, 204 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index 27251af18c65..6baf7b9ffed4 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -34,6 +34,8 @@
  */
 #define VAS_WIN_ACTIVE 0x0 /* Used in platform independent */
/* vas mmap() */
+/* Window is closed in the hypervisor due to lost credit */
+#define VAS_WIN_NO_CRED_CLOSE  0x0001
 
 /*
  * Get/Set bit fields
diff --git a/arch/powerpc/platforms/pseries/vas.c 
b/arch/powerpc/platforms/pseries/vas.c
index 1035446f985b..a297720bcdae 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -370,13 +370,28 @@ static struct vas_window *vas_allocate_window(int vas_id, 
u64 flags,
if (rc)
goto out_free;
 
-   vas_user_win_add_mm_context(&txwin->vas_win.task_ref);
txwin->win_type = cop_feat_caps->win_type;
mutex_lock(&vas_pseries_mutex);
-   list_add(&txwin->win_list, &caps->list);
+   /*
+* Possible to lose the acquired credit with DLPAR core
+* removal after the window is opened. So if there are any
+* closed windows (means with lost credits), do not give new
+* window to user space. New windows will be opened only
+* after the existing windows are reopened when credits are
+* available.
+*/
+   if (!caps->nr_close_wins) {
+   list_add(&txwin->win_list, &caps->list);
+   caps->nr_open_windows++;
+   mutex_unlock(&vas_pseries_mutex);
+   vas_user_win_add_mm_context(&txwin->vas_win.task_ref);
+   return &txwin->vas_win;
+   }
mutex_unlock(&vas_pseries_mutex);
 
-   return &txwin->vas_win;
+   put_vas_user_win_ref(&txwin->vas_win.task_ref);
+   rc = -EBUSY;
+   pr_err("No credit is available to allocate window\n");
 
 out_free:
/*
@@ -439,14 +454,24 @@ static int vas_deallocate_window(struct vas_window *vwin)
 
caps = &vascaps[win->win_type].caps;
mutex_lock(&vas_pseries_mutex);
-   rc = deallocate_free_window(win);
-   if (rc) {
-   mutex_unlock(&vas_pseries_mutex);
-   return rc;
-   }
+   /*
+* VAS window is already closed in the hypervisor when
+* lost the credit. So just remove the entry from
+* the list, remove task references and free vas_window
+* struct.
+*/
+   if (win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) {
+   rc = deallocate_free_window(win);
+   if (rc) {
+   mutex_unlock(&vas_pseries_mutex);
+   return rc;
+   }
+   } else
+   vascaps[win->win_type].nr_close_wins--;
 
list_del(&win->win_list);
atomic_dec(&caps->nr_used_credits);
+   vascaps[win->win_type].nr_open_windows--;
mutex_unlock(&vas_pseries_mutex);
 
put_vas_user_win_ref(&vwin->task_ref);
@@ -501,6 +526,7 @@ static int __init get_vas_capabilities(u8 feat, enum 
vas_cop_feat_type type,
memset(vcaps, 0, sizeof(*vcaps));
INIT_LIST_HEAD(&vcaps->list);
 
+   vcaps->feat = feat;
caps = &vcaps->caps;
 
rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, feat,
@@ -539,6 +565,168 @@ static int __init get_vas_capabilities(u8 feat, enum 
vas_cop_feat_type type,
return 0;
 }
 
+/*
+ * The hypervisor reduces the available credits if the LPAR lost core. It
+ * means the excessive windows should not be active and the user space
+ * should not be using these windows to send compression requests to NX.
+ * So the kernel closes the 

[PATCH v5 5/9] powerpc/vas: Map paste address only if window is active

2022-02-27 Thread Haren Myneni


The paste address mapping is done with mmap() after the window is
opened with ioctl. The partition has to close VAS windows in the
hypervisor if it lost credits due to DLPAR core removal. But the
kernel marks these windows inactive until the previously lost
credits are available later. If the window is inactive due to
DLPAR after this mmap(), the paste instruction returns failure
until the the OS reopens this window again.

Before the user space issuing mmap(), there is a possibility of
happening DLPAR core removal event which causes the corresponding
window inactive. So if the window is not active, return mmap()
failure with -EACCES and expects the user space reissue mmap()
when the window is active or open a new window when the credit
is available.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/book3s/vas-api.c | 23 ++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/book3s/vas-api.c 
b/arch/powerpc/platforms/book3s/vas-api.c
index f3e421511ea6..5372dbc2e37f 100644
--- a/arch/powerpc/platforms/book3s/vas-api.c
+++ b/arch/powerpc/platforms/book3s/vas-api.c
@@ -496,10 +496,29 @@ static int coproc_mmap(struct file *fp, struct 
vm_area_struct *vma)
return -EACCES;
}
 
+   /*
+* The initial mmap is done after the window is opened
+* with ioctl. But before mmap(), this window can be closed in
+* the hypervisor due to lost credit (core removal on pseries).
+* So if the window is not active, return mmap() failure with
+* -EACCES and expects the user space reissue mmap() when it
+* is active again or open new window when the credit is available.
+* mmap_mutex protects the paste address mmap() with DLPAR
+* close/open event and allows mmap() only when the window is
+* active.
+*/
+   mutex_lock(&txwin->task_ref.mmap_mutex);
+   if (txwin->status != VAS_WIN_ACTIVE) {
+   pr_err("%s(): Window is not active\n", __func__);
+   rc = -EACCES;
+   goto out;
+   }
+
paste_addr = cp_inst->coproc->vops->paste_addr(txwin);
if (!paste_addr) {
pr_err("%s(): Window paste address failed\n", __func__);
-   return -EINVAL;
+   rc = -EINVAL;
+   goto out;
}
 
pfn = paste_addr >> PAGE_SHIFT;
@@ -519,6 +538,8 @@ static int coproc_mmap(struct file *fp, struct 
vm_area_struct *vma)
txwin->task_ref.vma = vma;
vma->vm_ops = &vas_vm_ops;
 
+out:
+   mutex_unlock(&txwin->task_ref.mmap_mutex);
return rc;
 }
 
-- 
2.27.0




[PATCH v5 4/9] powerpc/vas: Return paste instruction failure if no active window

2022-02-27 Thread Haren Myneni


The VAS window may not be active if the system looses credits and
the NX generates page fault when it receives request on unmap
paste address.

The kernel handles the fault by remap new paste address if the
window is active again, Otherwise return the paste instruction
failure if the executed instruction that caused the fault was
a paste.

Signed-off-by: Nicholas Piggin 
Signed-off-by: Haren Myneni 
---
 arch/powerpc/include/asm/ppc-opcode.h   |  2 +
 arch/powerpc/platforms/book3s/vas-api.c | 55 -
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index 9675303b724e..82f1f0041c6f 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -262,6 +262,8 @@
 #define PPC_INST_MFSPR_PVR 0x7c1f42a6
 #define PPC_INST_MFSPR_PVR_MASK0xfc1e
 #define PPC_INST_MTMSRD0x7c000164
+#define PPC_INST_PASTE 0x7c20070d
+#define PPC_INST_PASTE_MASK0xfc2007ff
 #define PPC_INST_POPCNTB   0x7cf4
 #define PPC_INST_POPCNTB_MASK  0xfc0007fe
 #define PPC_INST_RFEBB 0x4c000124
diff --git a/arch/powerpc/platforms/book3s/vas-api.c 
b/arch/powerpc/platforms/book3s/vas-api.c
index f359e7b2bf90..f3e421511ea6 100644
--- a/arch/powerpc/platforms/book3s/vas-api.c
+++ b/arch/powerpc/platforms/book3s/vas-api.c
@@ -351,6 +351,41 @@ static int coproc_release(struct inode *inode, struct file 
*fp)
return 0;
 }
 
+/*
+ * If the executed instruction that caused the fault was a paste, then
+ * clear regs CR0[EQ], advance NIP, and return 0. Else return error code.
+ */
+static int do_fail_paste(void)
+{
+   struct pt_regs *regs = current->thread.regs;
+   u32 instword;
+
+   if (WARN_ON_ONCE(!regs))
+   return -EINVAL;
+
+   if (WARN_ON_ONCE(!user_mode(regs)))
+   return -EINVAL;
+
+   /*
+* If we couldn't translate the instruction, the driver should
+* return success without handling the fault, it will be retried
+* or the instruction fetch will fault.
+*/
+   if (get_user(instword, (u32 __user *)(regs->nip)))
+   return -EAGAIN;
+
+   /*
+* Not a paste instruction, driver may fail the fault.
+*/
+   if ((instword & PPC_INST_PASTE_MASK) != PPC_INST_PASTE)
+   return -ENOENT;
+
+   regs->ccr &= ~0xe000;   /* Clear CR0[0-2] to fail paste */
+   regs_add_return_ip(regs, 4);/* Emulate the paste */
+
+   return 0;
+}
+
 /*
  * This fault handler is invoked when the core generates page fault on
  * the paste address. Happens if the kernel closes window in hypervisor
@@ -408,9 +443,27 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf)
}
mutex_unlock(&txwin->task_ref.mmap_mutex);
 
-   return VM_FAULT_SIGBUS;
+   /*
+* Received this fault due to closing the actual window.
+* It can happen during migration or lost credits.
+* Since no mapping, return the paste instruction failure
+* to the user space.
+*/
+   ret = do_fail_paste();
+   /*
+* The user space can retry several times until success (needed
+* for migration) or should fallback to SW compression or
+* manage with the existing open windows if available.
+* Looking at sysfs interface, it can determine whether these
+* failures are coming during migration or core removal:
+* nr_used_credits > nr_total_credits when lost credits
+*/
+   if (!ret || (ret == -EAGAIN))
+   return VM_FAULT_NOPAGE;
 
+   return VM_FAULT_SIGBUS;
 }
+
 static const struct vm_operations_struct vas_vm_ops = {
.fault = vas_mmap_fault,
 };
-- 
2.27.0




[PATCH v5 3/9] powerpc/vas: Add paste address mmap fault handler

2022-02-27 Thread Haren Myneni


The user space opens VAS windows and issues NX requests by pasting
CRB on the corresponding paste address mmap. When the system lost
credits due to core removal, the kernel has to close the window in
the hypervisor and make the window inactive by unmapping this paste
address. Also the OS has to handle NX request page faults if the user
space issue NX requests.

This handler maps the new paste address with the same VMA when the
window is active again (due to core add with DLPAR). Otherwise
returns paste failure.

Signed-off-by: Haren Myneni 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/vas.h  | 10 
 arch/powerpc/platforms/book3s/vas-api.c | 68 +
 2 files changed, 78 insertions(+)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index 57573d9c1e09..27251af18c65 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -29,6 +29,12 @@
 #define VAS_THRESH_FIFO_GT_QTR_FULL2
 #define VAS_THRESH_FIFO_GT_EIGHTH_FULL 3
 
+/*
+ * VAS window Linux status bits
+ */
+#define VAS_WIN_ACTIVE 0x0 /* Used in platform independent */
+   /* vas mmap() */
+
 /*
  * Get/Set bit fields
  */
@@ -59,6 +65,9 @@ struct vas_user_win_ref {
struct pid *pid;/* PID of owner */
struct pid *tgid;   /* Thread group ID of owner */
struct mm_struct *mm;   /* Linux process mm_struct */
+   struct mutex mmap_mutex;/* protects paste address mmap() */
+   /* with DLPAR close/open windows */
+   struct vm_area_struct *vma; /* Save VMA and used in DLPAR ops */
 };
 
 /*
@@ -67,6 +76,7 @@ struct vas_user_win_ref {
 struct vas_window {
u32 winid;
u32 wcreds_max; /* Window credits */
+   u32 status; /* Window status used in OS */
enum vas_cop_type cop;
struct vas_user_win_ref task_ref;
char *dbgname;
diff --git a/arch/powerpc/platforms/book3s/vas-api.c 
b/arch/powerpc/platforms/book3s/vas-api.c
index 4d82c92ddd52..f359e7b2bf90 100644
--- a/arch/powerpc/platforms/book3s/vas-api.c
+++ b/arch/powerpc/platforms/book3s/vas-api.c
@@ -316,6 +316,7 @@ static int coproc_ioc_tx_win_open(struct file *fp, unsigned 
long arg)
return PTR_ERR(txwin);
}
 
+   mutex_init(&txwin->task_ref.mmap_mutex);
cp_inst->txwin = txwin;
 
return 0;
@@ -350,6 +351,70 @@ static int coproc_release(struct inode *inode, struct file 
*fp)
return 0;
 }
 
+/*
+ * This fault handler is invoked when the core generates page fault on
+ * the paste address. Happens if the kernel closes window in hypervisor
+ * (on pseries) due to lost credit or the paste address is not mapped.
+ */
+static vm_fault_t vas_mmap_fault(struct vm_fault *vmf)
+{
+   struct vm_area_struct *vma = vmf->vma;
+   struct file *fp = vma->vm_file;
+   struct coproc_instance *cp_inst = fp->private_data;
+   struct vas_window *txwin;
+   u64 paste_addr;
+   int ret;
+
+   /*
+* window is not opened. Shouldn't expect this error.
+*/
+   if (!cp_inst || !cp_inst->txwin) {
+   pr_err("%s(): Unexpected fault on paste address with TX window 
closed\n",
+   __func__);
+   return VM_FAULT_SIGBUS;
+   }
+
+   txwin = cp_inst->txwin;
+   /*
+* When the LPAR lost credits due to core removal or during
+* migration, invalidate the existing mapping for the current
+* paste addresses and set windows in-active (zap_page_range in
+* reconfig_close_windows()).
+* New mapping will be done later after migration or new credits
+* available. So continue to receive faults if the user space
+* issue NX request.
+*/
+   if (txwin->task_ref.vma != vmf->vma) {
+   pr_err("%s(): No previous mapping with paste address\n",
+   __func__);
+   return VM_FAULT_SIGBUS;
+   }
+
+   mutex_lock(&txwin->task_ref.mmap_mutex);
+   /*
+* The window may be inactive due to lost credit (Ex: core
+* removal with DLPAR). If the window is active again when
+* the credit is available, map the new paste address at the
+* the window virtual address.
+*/
+   if (txwin->status == VAS_WIN_ACTIVE) {
+   paste_addr = cp_inst->coproc->vops->paste_addr(txwin);
+   if (paste_addr) {
+   ret = vmf_insert_pfn(vma, vma->vm_start,
+   (paste_addr >> PAGE_SHIFT));
+   mutex_unlock(&txwin->task_ref.mmap_mutex);
+   return ret;
+   }
+   }
+   mutex_unlock(&txwin->task_ref.mmap_mutex);
+
+   return VM_FAULT_SIGBUS;
+
+}
+static const struct vm_operations_struct vas_vm_ops = {
+   .fault = vas_mmap_fault,
+};
+
 static int 

[PATCH v5 2/9] powerpc/pseries/vas: Save PID in pseries_vas_window struct

2022-02-27 Thread Haren Myneni


The kernel sets the VAS window with PID when it is opened in
the hypervisor. During DLPAR operation, windows can be closed and
reopened in the hypervisor when the credit is available. So saves
this PID in pseries_vas_window struct when the window is opened
initially and reuse it later during DLPAR operation.

Signed-off-by: Haren Myneni 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/platforms/pseries/vas.c | 9 +
 arch/powerpc/platforms/pseries/vas.h | 1 +
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/vas.c 
b/arch/powerpc/platforms/pseries/vas.c
index 18aae037ffe9..1035446f985b 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -107,7 +107,6 @@ static int h_deallocate_vas_window(u64 winid)
 static int h_modify_vas_window(struct pseries_vas_window *win)
 {
long rc;
-   u32 lpid = mfspr(SPRN_PID);
 
/*
 * AMR value is not supported in Linux VAS implementation.
@@ -115,7 +114,7 @@ static int h_modify_vas_window(struct pseries_vas_window 
*win)
 */
do {
rc = plpar_hcall_norets(H_MODIFY_VAS_WINDOW,
-   win->vas_win.winid, lpid, 0,
+   win->vas_win.winid, win->pid, 0,
VAS_MOD_WIN_FLAGS, 0);
 
rc = hcall_return_busy_check(rc);
@@ -124,8 +123,8 @@ static int h_modify_vas_window(struct pseries_vas_window 
*win)
if (rc == H_SUCCESS)
return 0;
 
-   pr_err("H_MODIFY_VAS_WINDOW error: %ld, winid %u lpid %u\n",
-   rc, win->vas_win.winid, lpid);
+   pr_err("H_MODIFY_VAS_WINDOW error: %ld, winid %u pid %u\n",
+   rc, win->vas_win.winid, win->pid);
return -EIO;
 }
 
@@ -338,6 +337,8 @@ static struct vas_window *vas_allocate_window(int vas_id, 
u64 flags,
}
}
 
+   txwin->pid = mfspr(SPRN_PID);
+
/*
 * Allocate / Deallocate window hcalls and setup / free IRQs
 * have to be protected with mutex.
diff --git a/arch/powerpc/platforms/pseries/vas.h 
b/arch/powerpc/platforms/pseries/vas.h
index d6ea8ab8b07a..2872532ed72a 100644
--- a/arch/powerpc/platforms/pseries/vas.h
+++ b/arch/powerpc/platforms/pseries/vas.h
@@ -114,6 +114,7 @@ struct pseries_vas_window {
u64 domain[6];  /* Associativity domain Ids */
/* this window is allocated */
u64 util;
+   u32 pid;/* PID associated with this window */
 
/* List of windows opened which is used for LPM */
struct list_head win_list;
-- 
2.27.0




[PATCH v5 1/9] powerpc/pseries/vas: Use common names in VAS capability structure

2022-02-27 Thread Haren Myneni


nr_total/nr_used_credits provides credits usage to user space
via sysfs and the same interface can be used on PowerNV in
future. Changed with proper naming so that applicable on both
pseries and PowerNV.

Signed-off-by: Haren Myneni 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/platforms/pseries/vas.c | 10 +-
 arch/powerpc/platforms/pseries/vas.h |  5 ++---
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/vas.c 
b/arch/powerpc/platforms/pseries/vas.c
index d243ddc58827..18aae037ffe9 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -310,8 +310,8 @@ static struct vas_window *vas_allocate_window(int vas_id, 
u64 flags,
 
cop_feat_caps = &caps->caps;
 
-   if (atomic_inc_return(&cop_feat_caps->used_lpar_creds) >
-   atomic_read(&cop_feat_caps->target_lpar_creds)) {
+   if (atomic_inc_return(&cop_feat_caps->nr_used_credits) >
+   atomic_read(&cop_feat_caps->nr_total_credits)) {
pr_err("Credits are not available to allocate window\n");
rc = -EINVAL;
goto out;
@@ -385,7 +385,7 @@ static struct vas_window *vas_allocate_window(int vas_id, 
u64 flags,
free_irq_setup(txwin);
h_deallocate_vas_window(txwin->vas_win.winid);
 out:
-   atomic_dec(&cop_feat_caps->used_lpar_creds);
+   atomic_dec(&cop_feat_caps->nr_used_credits);
kfree(txwin);
return ERR_PTR(rc);
 }
@@ -445,7 +445,7 @@ static int vas_deallocate_window(struct vas_window *vwin)
}
 
list_del(&win->win_list);
-   atomic_dec(&caps->used_lpar_creds);
+   atomic_dec(&caps->nr_used_credits);
mutex_unlock(&vas_pseries_mutex);
 
put_vas_user_win_ref(&vwin->task_ref);
@@ -521,7 +521,7 @@ static int __init get_vas_capabilities(u8 feat, enum 
vas_cop_feat_type type,
}
caps->max_lpar_creds = be16_to_cpu(hv_caps->max_lpar_creds);
caps->max_win_creds = be16_to_cpu(hv_caps->max_win_creds);
-   atomic_set(&caps->target_lpar_creds,
+   atomic_set(&caps->nr_total_credits,
   be16_to_cpu(hv_caps->target_lpar_creds));
if (feat == VAS_GZIP_DEF_FEAT) {
caps->def_lpar_creds = be16_to_cpu(hv_caps->def_lpar_creds);
diff --git a/arch/powerpc/platforms/pseries/vas.h 
b/arch/powerpc/platforms/pseries/vas.h
index 4ecb3fcabd10..d6ea8ab8b07a 100644
--- a/arch/powerpc/platforms/pseries/vas.h
+++ b/arch/powerpc/platforms/pseries/vas.h
@@ -72,9 +72,8 @@ struct vas_cop_feat_caps {
};
/* Total LPAR available credits. Can be different from max LPAR */
/* credits due to DLPAR operation */
-   atomic_ttarget_lpar_creds;
-   atomic_tused_lpar_creds; /* Used credits so far */
-   u16 avail_lpar_creds; /* Remaining available credits */
+   atomic_tnr_total_credits;   /* Total credits assigned to 
LPAR */
+   atomic_tnr_used_credits;/* Used credits so far */
 };
 
 /*
-- 
2.27.0




[PATCH v5 0/9] powerpc/pseries/vas: NXGZIP support with DLPAR

2022-02-27 Thread Haren Myneni


PowerPC provides HW compression with NX coprocessor. This feature
is available on both PowerNV and PowerVM and included in Linux.
Since each powerpc chip has one NX coprocessor, the VAS introduces
the concept of windows / credits to manage access to this hardware
resource. On powerVM, these limited resources should be available
across all LPARs. So the hypervisor assigns the specific credits
to each LPAR based on processor entitlement so that one LPAR does
not overload NX. The hypervisor can reject the window open request
to a partition if exceeds its credit limit (1 credit per window).

So the total number of target credits in a partition can be changed
if the core configuration is modified. The hypervisor expects the
partition to modify its window usage depends on new target
credits. For example, if the partition uses more credits than the
new target credits, it should close the excessive windows so that
the NX resource will be available to other partitions.

This patch series enables OS to support this dynamic credit
management with DLPAR core removal/add.

Core removal operation:
- Get new VAS capabilities from the hypervisor when the DLPAR
  notifier is received. This capabilities provides the new target
  credits based on new processor entitlement. In the case of QoS
  credit changes, the notification will be issued by updating
  the target_creds via sysfs.
- If the partition is already used more than the new target credits,
  the kernel selects windows, unmap the current paste address and
  close them in the hypervisor, It uses FIFO to identify these
  windows - last windows that are opened are the first ones to be
  closed.
- When the user space issue requests on these windows, NX generates
  page fault on the unmap paste address. The kernel handles the
  fault by returning the paste instruction failure if the window is
  not active (means unmap paste). Then up to the library / user
  space to fall back to SW compression or manage with the current
  windows.

Core add operation:
- The kernel can see increased target credits from the new VAS
  capabilities.
- Scans the window list for the closed windows in the hypervisor
  due to lost credit before and selects windows based on same FIFO.
- Make these corresponding windows active and create remap with
  the same VMA on the new paste address in the fault handler.
- Then the user space should expect paste successful later.

Patch 1: Define common names for sysfs target/used/avail_creds so
 that same sysfs entries can be used even on PowerNV later.
Patch 2: Save PID in the vas window struct  during initial window
 open and use it when reopen later.
Patch 3: Add new mmap fault handler which handles the page fault
 from NX on paste address.
Patch 4: Return the paste instruction failure if the window is not
 active.
Patch 5: If the window is closed in the hypervisor before the user
 space issue the initial mmap(), return -EACCES failure.
Patch 6: Close windows in the hypervisor when the partition exceeds
 its usage than the new target credits.
Patch 7: When credits are available, reopen windows that are closed
 before with core removal.
Patch 8 & 9: The user space determines the credit usage with sysfs
 target/avail/used_creds interfaces. drmgr uses target_creds
 to notify OS for QoS credit changes.

Thanks to Nicholas Piggin and Aneesh Kumar for the valuable suggestions
on the NXGZIP design to support DLPAR operations.

Changes in v2:
- Rebase on 5.16-rc5
- Use list safe functions to iterate windows list
- Changes to show the actual value in sysfs used_credits even though
  some windows are inactive with core removal. Reflects -ve value in
  sysfs avail_creds to let userspace know that it opened more windows
  than the current maximum LPAR credits.

Changes in v3:
- Rebase on 5.16
- Reconfigure VAS windows only for CPU hotplug events.

Changes in v4:
- Rebase on 5.17-rc4
- Changes based on comments from Nicholas Piggin
- Included VAS DLPAR notifer code in 'Close windows with DLPAR'
  patch instead of as a separate patch
- Patches reordering and other changes

Changes in v5:
- Rebase on 5.17-rc5
- Add update_total_credits sysfs entry to update QoS target credits
  and other commit descriptions as suggested by Nicholas Piggin

Haren Myneni (9):
  powerpc/pseries/vas: Use common names in VAS capability structure
  powerpc/pseries/vas: Save PID in pseries_vas_window struct
  powerpc/vas: Add paste address mmap fault handler
  powerpc/vas: Return paste instruction failure if no active window
  powerpc/vas: Map paste address only if window is active
  powerpc/pseries/vas: Close windows with DLPAR core removal
  powerpc/pseries/vas: Reopen windows with DLPAR core add
  powerpc/pseries/vas: sysfs interface to export capabilities
  powerpc/pseries/vas: Add 'update_total_credits' entry for QoS
capabilities

 arch/powerpc/include/asm/ppc-opcode.h  |   2 +
 arch/powerpc/incl

RE: [PATCH V7 03/20] compat: consolidate the compat_flock{,64} definition

2022-02-27 Thread David Laight
From: guo...@kernel.org
> Sent: 27 February 2022 16:28
> 
> From: Christoph Hellwig 
> 
> Provide a single common definition for the compat_flock and
> compat_flock64 structures using the same tricks as for the native
> variants.  Another extra define is added for the packing required on
> x86.
...
> diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
...
>  /*
> - * IA32 uses 4 byte alignment for 64 bit quantities,
> - * so we need to pack this structure.
> + * IA32 uses 4 byte alignment for 64 bit quantities, so we need to pack the
> + * compat flock64 structure.
>   */
...
> +#define __ARCH_NEED_COMPAT_FLOCK64_PACKED
> 
>  struct compat_statfs {
>   int f_type;
> diff --git a/include/linux/compat.h b/include/linux/compat.h
> index 1c758b0e0359..a0481fe6c5d5 100644
> --- a/include/linux/compat.h
> +++ b/include/linux/compat.h
> @@ -258,6 +258,37 @@ struct compat_rlimit {
>   compat_ulong_t  rlim_max;
>  };
> 
> +#ifdef __ARCH_NEED_COMPAT_FLOCK64_PACKED
> +#define __ARCH_COMPAT_FLOCK64_PACK   __attribute__((packed))
> +#else
> +#define __ARCH_COMPAT_FLOCK64_PACK
> +#endif
...
> +struct compat_flock64 {
> + short   l_type;
> + short   l_whence;
> + compat_loff_t   l_start;
> + compat_loff_t   l_len;
> + compat_pid_tl_pid;
> +#ifdef __ARCH_COMPAT_FLOCK64_PAD
> + __ARCH_COMPAT_FLOCK64_PAD
> +#endif
> +} __ARCH_COMPAT_FLOCK64_PACK;
> +

Provided compat_loff_t are correctly defined with __aligned__(4)
marking the structure packed isn't needed.
I believe compat_u64 and compat_s64 both have aligned(4).
It is also wrong, consider:

struct foo {
char x;
struct compat_flock64 fl64;
};

There should be 3 bytes of padding after 'x'.
But you've removed it.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)



Question about update_persistent_clock() routine

2022-02-27 Thread Jimmie Mayfield

Hi,

I'm investigating a timekeeping problem on an old PPC-based embedded 
device and am curious about the update_persistent_clock() routine.   
This device runs a 3.10-era kernel but a similar (deprecated) function 
exists in the 5.16 kernels so I'll refer to the latter:


In arch/powerpc/kernel/time.c :: update_persistent_clock64() we have the 
following:


    rtc_time64_to_tm(now.tv_sec + 1 + timezone_offset, &tm);

I'm particularly curious about why a second is added to the value when 
building 'tm'.  This behavior appears to go back at least to the 2.6-era 
kernels.  Is it simply trying to account for the time spent in the 
kernel prior to updating the RTC?



Jimmie



RE: [PATCH 08/11] swiotlb: make the swiotlb_init interface more useful

2022-02-27 Thread Michael Kelley (LINUX)
From: Christoph Hellwig  Sent: Sunday, February 27, 2022 6:31 AM
> 
> Pass a bool to pass if swiotlb needs to be enabled based on the
> addressing needs and replace the verbose argument with a set of
> flags, including one to force enable bounce buffering.
> 
> Note that this patch removes the possibility to force xen-swiotlb
> use using swiotlb=force on the command line on x86 (arm and arm64
> never supported that), but this interface will be restored shortly.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  arch/arm/mm/init.c |  6 +
>  arch/arm64/mm/init.c   |  6 +
>  arch/ia64/mm/init.c|  4 +--
>  arch/mips/cavium-octeon/dma-octeon.c   |  2 +-
>  arch/mips/loongson64/dma.c |  2 +-
>  arch/mips/sibyte/common/dma.c  |  2 +-
>  arch/powerpc/include/asm/swiotlb.h |  1 +
>  arch/powerpc/mm/mem.c  |  3 ++-
>  arch/powerpc/platforms/pseries/setup.c |  3 ---
>  arch/riscv/mm/init.c   |  8 +-
>  arch/s390/mm/init.c|  3 +--
>  arch/x86/kernel/cpu/mshyperv.c |  8 --
>  arch/x86/kernel/pci-dma.c  | 15 ++-
>  arch/x86/mm/mem_encrypt_amd.c  |  3 ---
>  drivers/xen/swiotlb-xen.c  |  4 +--
>  include/linux/swiotlb.h| 15 ++-
>  include/trace/events/swiotlb.h | 29 -
>  kernel/dma/swiotlb.c   | 35 ++
>  18 files changed, 56 insertions(+), 93 deletions(-)

[snip]

> 
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index 5a99f993e6392..568274917f1cd 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -336,14 +336,6 @@ static void __init ms_hyperv_init_platform(void)
>   swiotlb_unencrypted_base = 
> ms_hyperv.shared_gpa_boundary;
>  #endif
>   }
> -
> -#ifdef CONFIG_SWIOTLB
> - /*
> -  * Enable swiotlb force mode in Isolation VM to
> -  * use swiotlb bounce buffer for dma transaction.
> -  */
> - swiotlb_force = SWIOTLB_FORCE;
> -#endif

With this code removed, it's not clear to me what forces the use of the
swiotlb in a Hyper-V isolated VM.  The code in pci_swiotlb_detect_4g() doesn't
catch this case because cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)
returns "false" in a Hyper-V guest.  In the Hyper-V guest, it's only
cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) that returns "true".  I'm
looking more closely at the meaning of the CC_ATTR_* values, and it may
be that Hyper-V should also return "true" for CC_ATTR_MEM_ENCRYPT,
but I don't think CC_ATTR_HOST_MEM_ENCRYPT should return "true".

Michael





Re: [PATCH] powerpc: Avoid nmi_enter/nmi_exit in real mode interrupt.

2022-02-27 Thread Christophe Leroy


Le 25/02/2022 à 11:24, Mahesh Salgaonkar a écrit :
> nmi_enter()/nmi_exit() touches per cpu variables which can lead to kernel
> crash when invoked during real mode interrupt handling (e.g. early HMI/MCE
> interrupt handler) if percpu allocation comes from vmalloc area.
> 
> Early HMI/MCE handlers are called through DEFINE_INTERRUPT_HANDLER_NMI()
> wrapper which invokes nmi_enter/nmi_exit calls. We don't see any issue when
> percpu allocation is from the embedded first chunk. However with
> CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there are chances where percpu
> allocation can come from the vmalloc area.
> 
> With kernel command line "percpu_alloc=page" we can force percpu allocation
> to come from vmalloc area and can see kernel crash in machine_check_early:
> 
> [1.215714] NIP [c0e49eb4] rcu_nmi_enter+0x24/0x110
> [1.215717] LR [c00461a0] machine_check_early+0xf0/0x2c0
> [1.215719] --- interrupt: 200
> [1.215720] [c00fffd73180] [] 0x0 (unreliable)
> [1.215722] [c00fffd731b0] [] 0x0
> [1.215724] [c00fffd73210] [c0008364] 
> machine_check_early_common+0x134/0x1f8
> 
> Fix this by avoiding use of nmi_enter()/nmi_exit() in real mode if percpu
> first chunk is not embedded.
> 
> Signed-off-by: Mahesh Salgaonkar 
> ---
>   arch/powerpc/include/asm/interrupt.h |   15 +++
>   arch/powerpc/include/asm/percpu.h|2 ++
>   arch/powerpc/kernel/setup_64.c   |3 +++
>   3 files changed, 20 insertions(+)


In file included from /linux/arch/powerpc/kernel/irq.c:57:0:
/linux/arch/powerpc/include/asm/interrupt.h: In function 
'interrupt_nmi_enter_prepare':
/linux/arch/powerpc/include/asm/interrupt.h:337:109: error: 
'__percpu_embed_first_chunk' undeclared (first use in this function)
/linux/arch/powerpc/include/asm/interrupt.h:337:109: note: each 
undeclared identifier is reported only once for each function it appears in
In file included from /linux/arch/powerpc/kernel/irq.c:57:0:
/linux/arch/powerpc/include/asm/interrupt.h: In function 
'interrupt_nmi_exit_prepare':
/linux/arch/powerpc/include/asm/interrupt.h:353:109: error: 
'__percpu_embed_first_chunk' undeclared (first use in this function)
In file included from /linux/arch/powerpc/kernel/irq.c:57:0:
/linux/arch/powerpc/include/asm/interrupt.h:366:1: error: label at end 
of compound statement
  skip_nmi_exit:
  ^
/linux/scripts/Makefile.build:288: recipe for target 
'arch/powerpc/kernel/irq.o' failed
make[3]: *** [arch/powerpc/kernel/irq.o] Error 1

Re: [PATCH 08/11] swiotlb: make the swiotlb_init interface more useful

2022-02-27 Thread Christophe Leroy


Le 27/02/2022 à 15:30, Christoph Hellwig a écrit :
> Pass a bool to pass if swiotlb needs to be enabled based on the
> addressing needs and replace the verbose argument with a set of
> flags, including one to force enable bounce buffering.
> 
> Note that this patch removes the possibility to force xen-swiotlb
> use using swiotlb=force on the command line on x86 (arm and arm64
> never supported that), but this interface will be restored shortly.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>   arch/arm/mm/init.c |  6 +
>   arch/arm64/mm/init.c   |  6 +
>   arch/ia64/mm/init.c|  4 +--
>   arch/mips/cavium-octeon/dma-octeon.c   |  2 +-
>   arch/mips/loongson64/dma.c |  2 +-
>   arch/mips/sibyte/common/dma.c  |  2 +-
>   arch/powerpc/include/asm/swiotlb.h |  1 +
>   arch/powerpc/mm/mem.c  |  3 ++-

arch/powerpc/mm/mem.o:(.toc+0x0): undefined reference to `ppc_swiotlb_flags'
make[1]: *** [vmlinux] Error 1
/linux/Makefile:1155: recipe for target 'vmlinux' failed


>   arch/powerpc/platforms/pseries/setup.c |  3 ---
>   arch/riscv/mm/init.c   |  8 +-
>   arch/s390/mm/init.c|  3 +--
>   arch/x86/kernel/cpu/mshyperv.c |  8 --
>   arch/x86/kernel/pci-dma.c  | 15 ++-
>   arch/x86/mm/mem_encrypt_amd.c  |  3 ---
>   drivers/xen/swiotlb-xen.c  |  4 +--
>   include/linux/swiotlb.h| 15 ++-
>   include/trace/events/swiotlb.h | 29 -
>   kernel/dma/swiotlb.c   | 35 ++
>   18 files changed, 56 insertions(+), 93 deletions(-)
> 

[PATCH V7 20/20] riscv: compat: Add COMPAT Kbuild skeletal support

2022-02-27 Thread guoren
From: Guo Ren 

Adds initial skeletal COMPAT Kbuild (Running 32bit U-mode on
64bit S-mode) support.
 - Setup kconfig & dummy functions for compiling.
 - Implement compat_start_thread by the way.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Reviewed-by: Arnd Bergmann 
Cc: Palmer Dabbelt 
---
 arch/riscv/Kconfig | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 5adcbd9b5e88..6f11df8c189f 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -73,6 +73,7 @@ config RISCV
select HAVE_ARCH_KGDB if !XIP_KERNEL
select HAVE_ARCH_KGDB_QXFER_PKT
select HAVE_ARCH_MMAP_RND_BITS if MMU
+   select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU
@@ -123,12 +124,18 @@ config ARCH_MMAP_RND_BITS_MIN
default 18 if 64BIT
default 8
 
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+   default 8
+
 # max bits determined by the following formula:
 #  VA_BITS - PAGE_SHIFT - 3
 config ARCH_MMAP_RND_BITS_MAX
default 24 if 64BIT # SV39 based
default 17
 
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+   default 17
+
 # set if we run in machine mode, cleared if we run in supervisor mode
 config RISCV_M_MODE
bool
@@ -406,6 +413,18 @@ config CRASH_DUMP
 
  For more details see Documentation/admin-guide/kdump/kdump.rst
 
+config COMPAT
+   bool "Kernel support for 32-bit U-mode"
+   default 64BIT
+   depends on 64BIT && MMU
+   help
+ This option enables support for a 32-bit U-mode running under a 64-bit
+ kernel at S-mode. riscv32-specific components such as system calls,
+ the user helper functions (vdso), signal rt_frame functions and the
+ ptrace interface are handled appropriately by the kernel.
+
+ If you want to execute 32-bit userspace applications, say Y.
+
 endmenu
 
 menu "Boot options"
-- 
2.25.1



[PATCH V7 19/20] riscv: compat: ptrace: Add compat_arch_ptrace implement

2022-02-27 Thread guoren
From: Guo Ren 

Now, you can use native gdb on riscv64 for rv32 app debugging.

$ uname -a
Linux buildroot 5.16.0-rc4-00036-gbef6b82fdf23-dirty #53 SMP Mon Dec 20 
23:06:53 CST 2021 riscv64 GNU/Linux
$ cat /proc/cpuinfo
processor   : 0
hart: 0
isa : rv64imafdcsuh
mmu : sv48

$ file /bin/busybox
/bin/busybox: setuid ELF 32-bit LSB shared object, UCB RISC-V, version 1 
(SYSV), dynamically linked, interpreter /lib/ld-linux-riscv32-ilp32d.so.1, for 
GNU/Linux 5.15.0, stripped
$ file /usr/bin/gdb
/usr/bin/gdb: ELF 32-bit LSB shared object, UCB RISC-V, version 1 (GNU/Linux), 
dynamically linked, interpreter /lib/ld-linux-riscv32-ilp32d.so.1, for 
GNU/Linux 5.15.0, stripped
$ /usr/bin/gdb /bin/busybox
GNU gdb (GDB) 10.2
Copyright (C) 2021 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later 
...
Reading symbols from /bin/busybox...
(No debugging symbols found in /bin/busybox)
(gdb) b main
Breakpoint 1 at 0x8ddc
(gdb) r
Starting program: /bin/busybox
Failed to read a valid object file image from memory.

Breakpoint 1, 0x555a8ddc in main ()
(gdb) i r
ra 0x77df0b74   0x77df0b74
sp 0x7fdd3d10   0x7fdd3d10
gp 0x5567e800   0x5567e800 
tp 0x77f64280   0x77f64280
t0 0x0  0
t1 0x555a6fac   1431990188
t2 0x77dd8db4   2011008436
fp 0x7fdd3e34   0x7fdd3e34
s1 0x7fdd3e34   2145205812
a0 0x   -1
a1 0x2000   8192
a2 0x7fdd3e3c   2145205820
a3 0x0  0
a4 0x7fdd3d30   2145205552
a5 0x555a8dc0   1431997888
a6 0x77f2c170   2012397936
a7 0x6a7c7a2f   1786542639
s2 0x0  0
s3 0x0  0
s4 0x555a8dc0   1431997888
s5 0x77f8a3a8   2012783528
s6 0x7fdd3e3c   2145205820
s7 0x5567cecc   1432866508
--Type  for more, q to quit, c to continue without paging--
s8 0x1  1
s9 0x0  0
s100x55634448   1432568904
s110x0  0
t3 0x77df0bb8   2011106232
t4 0x42fc   17148
t5 0x0  0
t6 0x40 64
pc 0x555a8ddc   0x555a8ddc 
(gdb) si
0x555a78f0 in mallopt@plt ()
(gdb) c
Continuing.
BusyBox v1.34.1 (2021-12-19 22:39:48 CST) multi-call binary.
BusyBox is copyrighted by many authors between 1998-2015.
Licensed under GPLv2. See source distribution for detailed
copyright notices.

Usage: busybox [function [arguments]...]
   or: busybox --list[-full]
...
[Inferior 1 (process 107) exited normally]
(gdb) q

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Reviewed-by: Palmer Dabbelt 
Reviewed-by: Arnd Bergmann 
---
 arch/riscv/kernel/ptrace.c | 87 +++---
 1 file changed, 82 insertions(+), 5 deletions(-)

diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index a89243730153..bb387593a121 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -111,11 +112,6 @@ static const struct user_regset_view 
riscv_user_native_view = {
.n = ARRAY_SIZE(riscv_user_regset),
 };
 
-const struct user_regset_view *task_user_regset_view(struct task_struct *task)
-{
-   return &riscv_user_native_view;
-}
-
 struct pt_regs_offset {
const char *name;
int offset;
@@ -273,3 +269,84 @@ __visible void do_syscall_trace_exit(struct pt_regs *regs)
trace_sys_exit(regs, regs_return_value(regs));
 #endif
 }
+
+#ifdef CONFIG_COMPAT
+static int compat_riscv_gpr_get(struct task_struct *target,
+   const struct user_regset *regset,
+   struct membuf to)
+{
+   struct compat_user_regs_struct cregs;
+
+   regs_to_cregs(&cregs, task_pt_regs(target));
+
+   return membuf_write(&to, &cregs,
+   sizeof(struct compat_user_regs_struct));
+}
+
+static int compat_riscv_gpr_set(struct task_struct *target,
+   const struct user_regset *regset,
+   unsigned int pos, unsigned int count,
+   const void *kbuf, const void __user *ubuf)
+{
+   int ret;
+   struct compat_user_regs_struct cregs;
+
+   ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &cregs, 0, -1);
+
+   cregs_to_regs(&cregs, task_pt_regs(target));
+
+   return ret;
+}
+
+static const struct user_regset compat_riscv_user_regset[] = {
+   [REGSET_X] = {
+   .core_note_type = NT_PRSTATUS,
+   .n = ELF_NGREG,
+   .size = sizeof(compat_elf_greg_t),
+   .align = sizeof(compat_elf_greg_t),
+   .regset_get = co

[PATCH V7 18/20] riscv: compat: signal: Add rt_frame implementation

2022-02-27 Thread guoren
From: Guo Ren 

Implement compat_setup_rt_frame for sigcontext save & restore. The
main process is the same with signal, but the rv32 pt_regs' size
is different from rv64's, so we needs convert them.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Reviewed-by: Palmer Dabbelt 
Cc: Arnd Bergmann 
---
 arch/riscv/kernel/Makefile|   1 +
 arch/riscv/kernel/compat_signal.c | 243 ++
 arch/riscv/kernel/signal.c|  13 +-
 3 files changed, 256 insertions(+), 1 deletion(-)
 create mode 100644 arch/riscv/kernel/compat_signal.c

diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 88e79f481c21..a46f9807c59e 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -67,4 +67,5 @@ obj-$(CONFIG_JUMP_LABEL)  += jump_label.o
 
 obj-$(CONFIG_EFI)  += efi.o
 obj-$(CONFIG_COMPAT)   += compat_syscall_table.o
+obj-$(CONFIG_COMPAT)   += compat_signal.o
 obj-$(CONFIG_COMPAT)   += compat_vdso/
diff --git a/arch/riscv/kernel/compat_signal.c 
b/arch/riscv/kernel/compat_signal.c
new file mode 100644
index ..7041742ded08
--- /dev/null
+++ b/arch/riscv/kernel/compat_signal.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#define COMPAT_DEBUG_SIG 0
+
+struct compat_sigcontext {
+   struct compat_user_regs_struct sc_regs;
+   union __riscv_fp_state sc_fpregs;
+};
+
+struct compat_ucontext {
+   compat_ulong_t  uc_flags;
+   struct compat_ucontext  *uc_link;
+   compat_stack_t  uc_stack;
+   sigset_tuc_sigmask;
+   /* There's some padding here to allow sigset_t to be expanded in the
+* future.  Though this is unlikely, other architectures put uc_sigmask
+* at the end of this structure and explicitly state it can be
+* expanded, so we didn't want to box ourselves in here. */
+   __u8  __unused[1024 / 8 - sizeof(sigset_t)];
+   /* We can't put uc_sigmask at the end of this structure because we need
+* to be able to expand sigcontext in the future.  For example, the
+* vector ISA extension will almost certainly add ISA state.  We want
+* to ensure all user-visible ISA state can be saved and restored via a
+* ucontext, so we're putting this at the end in order to allow for
+* infinite extensibility.  Since we know this will be extended and we
+* assume sigset_t won't be extended an extreme amount, we're
+* prioritizing this. */
+   struct compat_sigcontext uc_mcontext;
+};
+
+struct compat_rt_sigframe {
+   struct compat_siginfo info;
+   struct compat_ucontext uc;
+};
+
+#ifdef CONFIG_FPU
+static long compat_restore_fp_state(struct pt_regs *regs,
+   union __riscv_fp_state __user *sc_fpregs)
+{
+   long err;
+   struct __riscv_d_ext_state __user *state = &sc_fpregs->d;
+   size_t i;
+
+   err = __copy_from_user(¤t->thread.fstate, state, sizeof(*state));
+   if (unlikely(err))
+   return err;
+
+   fstate_restore(current, regs);
+
+   /* We support no other extension state at this time. */
+   for (i = 0; i < ARRAY_SIZE(sc_fpregs->q.reserved); i++) {
+   u32 value;
+
+   err = __get_user(value, &sc_fpregs->q.reserved[i]);
+   if (unlikely(err))
+   break;
+   if (value != 0)
+   return -EINVAL;
+   }
+
+   return err;
+}
+
+static long compat_save_fp_state(struct pt_regs *regs,
+ union __riscv_fp_state __user *sc_fpregs)
+{
+   long err;
+   struct __riscv_d_ext_state __user *state = &sc_fpregs->d;
+   size_t i;
+
+   fstate_save(current, regs);
+   err = __copy_to_user(state, ¤t->thread.fstate, sizeof(*state));
+   if (unlikely(err))
+   return err;
+
+   /* We support no other extension state at this time. */
+   for (i = 0; i < ARRAY_SIZE(sc_fpregs->q.reserved); i++) {
+   err = __put_user(0, &sc_fpregs->q.reserved[i]);
+   if (unlikely(err))
+   break;
+   }
+
+   return err;
+}
+#else
+#define compat_save_fp_state(task, regs) (0)
+#define compat_restore_fp_state(task, regs) (0)
+#endif
+
+static long compat_restore_sigcontext(struct pt_regs *regs,
+   struct compat_sigcontext __user *sc)
+{
+   long err;
+   struct compat_user_regs_struct cregs;
+
+   /* sc_regs is structured the same as the start of pt_regs */
+   err = __copy_from_user(&cregs, &sc->sc_regs, sizeof(sc->sc_regs));
+
+   cregs_to_regs(&cregs, regs);
+
+   /* Restore the floating-point state. */
+   if (has_fpu())
+   err |= compat_restore_fp_state(regs, &sc->sc_fpregs);
+   return err;
+}
+
+COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
+{
+ 

[PATCH V7 17/20] riscv: compat: vdso: Add setup additional pages implementation

2022-02-27 Thread guoren
From: Guo Ren 

Reconstruct __setup_additional_pages() by appending vdso info
pointer argument to meet compat_vdso_info requirement. And change
vm_special_mapping *dm, *cm initialization into static.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Reviewed-by: Palmer Dabbelt 
Cc: Arnd Bergmann 
---
 arch/riscv/include/asm/elf.h |   5 ++
 arch/riscv/include/asm/mmu.h |   1 +
 arch/riscv/kernel/vdso.c | 105 +++
 3 files changed, 76 insertions(+), 35 deletions(-)

diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h
index 3a4293dc7229..d87d3bcc758d 100644
--- a/arch/riscv/include/asm/elf.h
+++ b/arch/riscv/include/asm/elf.h
@@ -134,5 +134,10 @@ do {if ((ex).e_ident[EI_CLASS] == ELFCLASS32)  
\
 typedef compat_ulong_t compat_elf_greg_t;
 typedef compat_elf_greg_t  compat_elf_gregset_t[ELF_NGREG];
 
+extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp);
+#define compat_arch_setup_additional_pages \
+   compat_arch_setup_additional_pages
+
 #endif /* CONFIG_COMPAT */
 #endif /* _ASM_RISCV_ELF_H */
diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h
index 0099dc116168..cedcf8ea3c76 100644
--- a/arch/riscv/include/asm/mmu.h
+++ b/arch/riscv/include/asm/mmu.h
@@ -16,6 +16,7 @@ typedef struct {
atomic_long_t id;
 #endif
void *vdso;
+   void *vdso_info;
 #ifdef CONFIG_SMP
/* A local icache flush is needed before user execution can resume. */
cpumask_t icache_stale_mask;
diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c
index a9436a65161a..50fe4c877603 100644
--- a/arch/riscv/kernel/vdso.c
+++ b/arch/riscv/kernel/vdso.c
@@ -23,6 +23,9 @@ struct vdso_data {
 #endif
 
 extern char vdso_start[], vdso_end[];
+#ifdef CONFIG_COMPAT
+extern char compat_vdso_start[], compat_vdso_end[];
+#endif
 
 enum vvar_pages {
VVAR_DATA_PAGE_OFFSET,
@@ -30,6 +33,11 @@ enum vvar_pages {
VVAR_NR_PAGES,
 };
 
+enum rv_vdso_map {
+   RV_VDSO_MAP_VVAR,
+   RV_VDSO_MAP_VDSO,
+};
+
 #define VVAR_SIZE  (VVAR_NR_PAGES << PAGE_SHIFT)
 
 /*
@@ -52,12 +60,6 @@ struct __vdso_info {
struct vm_special_mapping *cm;
 };
 
-static struct __vdso_info vdso_info __ro_after_init = {
-   .name = "vdso",
-   .vdso_code_start = vdso_start,
-   .vdso_code_end = vdso_end,
-};
-
 static int vdso_mremap(const struct vm_special_mapping *sm,
   struct vm_area_struct *new_vma)
 {
@@ -66,37 +68,33 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
return 0;
 }
 
-static int __init __vdso_init(void)
+static void __init __vdso_init(struct __vdso_info *vdso_info)
 {
unsigned int i;
struct page **vdso_pagelist;
unsigned long pfn;
 
-   if (memcmp(vdso_info.vdso_code_start, "\177ELF", 4)) {
-   pr_err("vDSO is not a valid ELF object!\n");
-   return -EINVAL;
-   }
+   if (memcmp(vdso_info->vdso_code_start, "\177ELF", 4))
+   panic("vDSO is not a valid ELF object!\n");
 
-   vdso_info.vdso_pages = (
-   vdso_info.vdso_code_end -
-   vdso_info.vdso_code_start) >>
+   vdso_info->vdso_pages = (
+   vdso_info->vdso_code_end -
+   vdso_info->vdso_code_start) >>
PAGE_SHIFT;
 
-   vdso_pagelist = kcalloc(vdso_info.vdso_pages,
+   vdso_pagelist = kcalloc(vdso_info->vdso_pages,
sizeof(struct page *),
GFP_KERNEL);
if (vdso_pagelist == NULL)
-   return -ENOMEM;
+   panic("vDSO kcalloc failed!\n");
 
/* Grab the vDSO code pages. */
-   pfn = sym_to_pfn(vdso_info.vdso_code_start);
+   pfn = sym_to_pfn(vdso_info->vdso_code_start);
 
-   for (i = 0; i < vdso_info.vdso_pages; i++)
+   for (i = 0; i < vdso_info->vdso_pages; i++)
vdso_pagelist[i] = pfn_to_page(pfn + i);
 
-   vdso_info.cm->pages = vdso_pagelist;
-
-   return 0;
+   vdso_info->cm->pages = vdso_pagelist;
 }
 
 #ifdef CONFIG_TIME_NS
@@ -116,13 +114,14 @@ int vdso_join_timens(struct task_struct *task, struct 
time_namespace *ns)
 {
struct mm_struct *mm = task->mm;
struct vm_area_struct *vma;
+   struct __vdso_info *vdso_info = mm->context.vdso_info;
 
mmap_read_lock(mm);
 
for (vma = mm->mmap; vma; vma = vma->vm_next) {
unsigned long size = vma->vm_end - vma->vm_start;
 
-   if (vma_is_special_mapping(vma, vdso_info.dm))
+   if (vma_is_special_mapping(vma, vdso_info->dm))
zap_page_range(vma, vma->vm_start, size);
}
 
@@ -187,11 +186,6 @@ static vm_fault_t vvar_fault(const struct 
vm_special_mapping *sm,
return vmf_insert_pfn(vma, vmf->address, pfn);
 }
 
-enum rv_vds

[PATCH V7 16/20] riscv: compat: vdso: Add COMPAT_VDSO base code implementation

2022-02-27 Thread guoren
From: Guo Ren 

There is no vgettimeofday supported in rv32 that makes simple to
generate rv32 vdso code which only needs riscv64 compiler. Other
architectures need change compiler or -m (machine parameter) to
support vdso32 compiling. If rv32 support vgettimeofday (which
cause C compile) in future, we would add CROSS_COMPILE to support
that makes more requirement on compiler enviornment.

linux-rv64/arch/riscv/kernel/compat_vdso/compat_vdso.so.dbg:
file format elf64-littleriscv

Disassembly of section .text:

0800 <__vdso_rt_sigreturn>:
 800:   08b00893li  a7,139
 804:   0073ecall
 808:   unimp
...

080c <__vdso_getcpu>:
 80c:   0a800893li  a7,168
 810:   0073ecall
 814:   8082ret
...

0818 <__vdso_flush_icache>:
 818:   10300893li  a7,259
 81c:   0073ecall
 820:   8082ret

linux-rv32/arch/riscv/kernel/vdso/vdso.so.dbg:
file format elf32-littleriscv

Disassembly of section .text:

0800 <__vdso_rt_sigreturn>:
 800:   08b00893li  a7,139
 804:   0073ecall
 808:   unimp
...

080c <__vdso_getcpu>:
 80c:   0a800893li  a7,168
 810:   0073ecall
 814:   8082ret
...

0818 <__vdso_flush_icache>:
 818:   10300893li  a7,259
 81c:   0073ecall
 820:   8082ret

Finally, reuse all *.S from vdso in compat_vdso that makes
implementation clear and readable.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Cc: Arnd Bergmann 
Cc: Palmer Dabbelt 
---
 arch/riscv/Makefile   |  5 ++
 arch/riscv/include/asm/vdso.h |  9 +++
 arch/riscv/kernel/Makefile|  1 +
 arch/riscv/kernel/compat_vdso/.gitignore  |  2 +
 arch/riscv/kernel/compat_vdso/Makefile| 78 +++
 arch/riscv/kernel/compat_vdso/compat_vdso.S   |  8 ++
 .../kernel/compat_vdso/compat_vdso.lds.S  |  3 +
 arch/riscv/kernel/compat_vdso/flush_icache.S  |  3 +
 .../compat_vdso/gen_compat_vdso_offsets.sh|  5 ++
 arch/riscv/kernel/compat_vdso/getcpu.S|  3 +
 arch/riscv/kernel/compat_vdso/note.S  |  3 +
 arch/riscv/kernel/compat_vdso/rt_sigreturn.S  |  3 +
 arch/riscv/kernel/vdso/vdso.S |  6 +-
 13 files changed, 128 insertions(+), 1 deletion(-)
 create mode 100644 arch/riscv/kernel/compat_vdso/.gitignore
 create mode 100644 arch/riscv/kernel/compat_vdso/Makefile
 create mode 100644 arch/riscv/kernel/compat_vdso/compat_vdso.S
 create mode 100644 arch/riscv/kernel/compat_vdso/compat_vdso.lds.S
 create mode 100644 arch/riscv/kernel/compat_vdso/flush_icache.S
 create mode 100755 arch/riscv/kernel/compat_vdso/gen_compat_vdso_offsets.sh
 create mode 100644 arch/riscv/kernel/compat_vdso/getcpu.S
 create mode 100644 arch/riscv/kernel/compat_vdso/note.S
 create mode 100644 arch/riscv/kernel/compat_vdso/rt_sigreturn.S

diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index c6ca1b9cbf71..6a494029b8bd 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -112,12 +112,17 @@ libs-$(CONFIG_EFI_STUB) += 
$(objtree)/drivers/firmware/efi/libstub/lib.a
 PHONY += vdso_install
 vdso_install:
$(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@
+   $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \
+   $(build)=arch/riscv/kernel/compat_vdso $@)
 
 ifeq ($(KBUILD_EXTMOD),)
 ifeq ($(CONFIG_MMU),y)
 prepare: vdso_prepare
 vdso_prepare: prepare0
$(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso 
include/generated/vdso-offsets.h
+   $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \
+   $(build)=arch/riscv/kernel/compat_vdso 
include/generated/compat_vdso-offsets.h)
+
 endif
 endif
 
diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h
index bc6f75f3a199..af981426fe0f 100644
--- a/arch/riscv/include/asm/vdso.h
+++ b/arch/riscv/include/asm/vdso.h
@@ -21,6 +21,15 @@
 
 #define VDSO_SYMBOL(base, name)
\
(void __user *)((unsigned long)(base) + __vdso_##name##_offset)
+
+#ifdef CONFIG_COMPAT
+#include 
+
+#define COMPAT_VDSO_SYMBOL(base, name) 
\
+   (void __user *)((unsigned long)(base) + compat__vdso_##name##_offset)
+
+#endif /* CONFIG_COMPAT */
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* CONFIG_MMU */
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 954dc7043ad2..88e79f481c21 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -67,3 +67,4 @@ obj-$(CONFIG_JUMP_LABEL)  += jump_label.o
 
 obj-$(CONFIG_EFI)  += efi.o
 obj-$(CONFIG_COMPAT)   += compat_syscall_table.o
+obj-$(CONFIG_COMPAT)   += compat_vdso/
diff --git a/arch/r

[PATCH V7 15/20] riscv: compat: Add hw capability check for elf

2022-02-27 Thread guoren
From: Guo Ren 

Detect hardware COMPAT (32bit U-mode) capability in rv64. If not
support COMPAT mode in hw, compat_elf_check_arch would return
false by compat_binfmt_elf.c

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Cc: Arnd Bergmann 
Cc: Christoph Hellwig 
---
 arch/riscv/include/asm/elf.h |  3 ++-
 arch/riscv/kernel/process.c  | 26 ++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h
index aee40040917b..3a4293dc7229 100644
--- a/arch/riscv/include/asm/elf.h
+++ b/arch/riscv/include/asm/elf.h
@@ -40,7 +40,8 @@
  * elf64_hdr e_machine's offset are different. The checker is
  * a little bit simple compare to other architectures.
  */
-#define compat_elf_check_arch(x) ((x)->e_machine == EM_RISCV)
+extern bool compat_elf_check_arch(Elf32_Ehdr *hdr);
+#define compat_elf_check_arch  compat_elf_check_arch
 
 #define CORE_DUMP_USE_REGSET
 #define ELF_EXEC_PAGESIZE  (PAGE_SIZE)
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index 54787ca9806a..7bbe4dd95e85 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -83,6 +83,32 @@ void show_regs(struct pt_regs *regs)
dump_backtrace(regs, NULL, KERN_DEFAULT);
 }
 
+#ifdef CONFIG_COMPAT
+static bool compat_mode_supported __read_mostly;
+
+bool compat_elf_check_arch(Elf32_Ehdr *hdr)
+{
+   return compat_mode_supported && hdr->e_machine == EM_RISCV;
+}
+
+static int __init compat_mode_detect(void)
+{
+   unsigned long tmp = csr_read(CSR_STATUS);
+
+   csr_write(CSR_STATUS, (tmp & ~SR_UXL) | SR_UXL_32);
+   compat_mode_supported =
+   (csr_read(CSR_STATUS) & SR_UXL) == SR_UXL_32;
+
+   csr_write(CSR_STATUS, tmp);
+
+   pr_info("riscv: ELF compat mode %s",
+   compat_mode_supported ? "supported" : "failed");
+
+   return 0;
+}
+early_initcall(compat_mode_detect);
+#endif
+
 void start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
 {
-- 
2.25.1



[PATCH V7 14/20] riscv: compat: Add elf.h implementation

2022-02-27 Thread guoren
From: Guo Ren 

Implement necessary type and macro for compat elf. See the code
comment for detail.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Reviewed-by: Arnd Bergmann 
---
 arch/riscv/include/asm/elf.h | 46 +++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h
index f53c40026c7a..aee40040917b 100644
--- a/arch/riscv/include/asm/elf.h
+++ b/arch/riscv/include/asm/elf.h
@@ -8,6 +8,8 @@
 #ifndef _ASM_RISCV_ELF_H
 #define _ASM_RISCV_ELF_H
 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -18,11 +20,13 @@
  */
 #define ELF_ARCH   EM_RISCV
 
+#ifndef ELF_CLASS
 #ifdef CONFIG_64BIT
 #define ELF_CLASS  ELFCLASS64
 #else
 #define ELF_CLASS  ELFCLASS32
 #endif
+#endif
 
 #define ELF_DATA   ELFDATA2LSB
 
@@ -31,6 +35,13 @@
  */
 #define elf_check_arch(x) ((x)->e_machine == EM_RISCV)
 
+/*
+ * Use the same code with elf_check_arch, because elf32_hdr &
+ * elf64_hdr e_machine's offset are different. The checker is
+ * a little bit simple compare to other architectures.
+ */
+#define compat_elf_check_arch(x) ((x)->e_machine == EM_RISCV)
+
 #define CORE_DUMP_USE_REGSET
 #define ELF_EXEC_PAGESIZE  (PAGE_SIZE)
 
@@ -43,8 +54,14 @@
 #define ELF_ET_DYN_BASE((TASK_SIZE / 3) * 2)
 
 #ifdef CONFIG_64BIT
+#ifdef CONFIG_COMPAT
+#define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \
+0x7ff >> (PAGE_SHIFT - 12) : \
+0x3 >> (PAGE_SHIFT - 12))
+#else
 #define STACK_RND_MASK (0x3 >> (PAGE_SHIFT - 12))
 #endif
+#endif
 /*
  * This yields a mask that user programs can use to figure out what
  * instruction set this CPU supports.  This could be done in user space,
@@ -60,11 +77,19 @@ extern unsigned long elf_hwcap;
  */
 #define ELF_PLATFORM   (NULL)
 
+#define COMPAT_ELF_PLATFORM(NULL)
+
 #ifdef CONFIG_MMU
 #define ARCH_DLINFO\
 do {   \
+   /*  \
+* Note that we add ulong after elf_addr_t because  \
+* casting current->mm->context.vdso triggers a cast\
+* warning of cast from pointer to integer for  \
+* COMPAT ELFCLASS32.   \
+*/ \
NEW_AUX_ENT(AT_SYSINFO_EHDR,\
-   (elf_addr_t)current->mm->context.vdso); \
+   (elf_addr_t)(ulong)current->mm->context.vdso);  \
NEW_AUX_ENT(AT_L1I_CACHESIZE,   \
get_cache_size(1, CACHE_TYPE_INST));\
NEW_AUX_ENT(AT_L1I_CACHEGEOMETRY,   \
@@ -90,4 +115,23 @@ do {
\
*(struct user_regs_struct *)regs;   \
 } while (0);
 
+#ifdef CONFIG_COMPAT
+
+#define SET_PERSONALITY(ex)\
+do {if ((ex).e_ident[EI_CLASS] == ELFCLASS32)  \
+   set_thread_flag(TIF_32BIT); \
+   else\
+   clear_thread_flag(TIF_32BIT);   \
+   if (personality(current->personality) != PER_LINUX32)   \
+   set_personality(PER_LINUX | \
+   (current->personality & (~PER_MASK)));  \
+} while (0)
+
+#define COMPAT_ELF_ET_DYN_BASE ((TASK_SIZE_32 / 3) * 2)
+
+/* rv32 registers */
+typedef compat_ulong_t compat_elf_greg_t;
+typedef compat_elf_greg_t  compat_elf_gregset_t[ELF_NGREG];
+
+#endif /* CONFIG_COMPAT */
 #endif /* _ASM_RISCV_ELF_H */
-- 
2.25.1



[PATCH V7 13/20] riscv: compat: process: Add UXL_32 support in start_thread

2022-02-27 Thread guoren
From: Guo Ren 

If the current task is in COMPAT mode, set SR_UXL_32 in status for
returning userspace. We need CONFIG _COMPAT to prevent compiling
errors with rv32 defconfig.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Cc: Arnd Bergmann 
Cc: Palmer Dabbelt 
---
 arch/riscv/kernel/process.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index 03ac3aa611f5..54787ca9806a 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -97,6 +97,11 @@ void start_thread(struct pt_regs *regs, unsigned long pc,
}
regs->epc = pc;
regs->sp = sp;
+
+   if (is_compat_task())
+   regs->status = (regs->status & ~SR_UXL) | SR_UXL_32;
+   else
+   regs->status = (regs->status & ~SR_UXL) | SR_UXL_64;
 }
 
 void flush_thread(void)
-- 
2.25.1



[PATCH V7 12/20] riscv: compat: syscall: Add entry.S implementation

2022-02-27 Thread guoren
From: Guo Ren 

Implement the entry of compat_sys_call_table[] in asm. Ref to
riscv-privileged spec 4.1.1 Supervisor Status Register (sstatus):

 BIT[32:33] = UXL[1:0]:
 - 1:32
 - 2:64
 - 3:128

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Reviewed-by: Palmer Dabbelt 
Cc: Arnd Bergmann 
---
 arch/riscv/include/asm/csr.h |  7 +++
 arch/riscv/kernel/entry.S| 18 --
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
index ae711692eec9..7f05c65654c8 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -36,6 +36,13 @@
 #define SR_SD  _AC(0x8000, UL) /* FS/XS dirty */
 #endif
 
+#ifdef CONFIG_64BIT
+#define SR_UXL _AC(0x3, UL) /* XLEN mask for U-mode */
+#define SR_UXL_32  _AC(0x1, UL) /* XLEN = 32 for U-mode */
+#define SR_UXL_64  _AC(0x2, UL) /* XLEN = 64 for U-mode */
+#define SR_UXL_SHIFT   32
+#endif
+
 /* SATP flags */
 #ifndef CONFIG_64BIT
 #define SATP_PPN   _AC(0x003F, UL)
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index ed29e9c8f660..1951743f09b3 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -207,13 +207,27 @@ check_syscall_nr:
 * Syscall number held in a7.
 * If syscall number is above allowed value, redirect to ni_syscall.
 */
-   bgeu a7, t0, 1f
+   bgeu a7, t0, 3f
+#ifdef CONFIG_COMPAT
+   REG_L s0, PT_STATUS(sp)
+   srli s0, s0, SR_UXL_SHIFT
+   andi s0, s0, (SR_UXL >> SR_UXL_SHIFT)
+   li t0, (SR_UXL_32 >> SR_UXL_SHIFT)
+   sub t0, s0, t0
+   bnez t0, 1f
+
+   /* Call compat_syscall */
+   la s0, compat_sys_call_table
+   j 2f
+1:
+#endif
/* Call syscall */
la s0, sys_call_table
+2:
slli t0, a7, RISCV_LGPTR
add s0, s0, t0
REG_L s0, 0(s0)
-1:
+3:
jalr s0
 
 ret_from_syscall:
-- 
2.25.1



[PATCH V7 11/20] riscv: compat: syscall: Add compat_sys_call_table implementation

2022-02-27 Thread guoren
From: Guo Ren 

Implement compat sys_call_table and some system call functions:
truncate64, ftruncate64, fallocate, pread64, pwrite64,
sync_file_range, readahead, fadvise64_64 which need argument
translation.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Reviewed-by: Arnd Bergmann 
Cc: Palmer Dabbelt 
---
 arch/riscv/include/asm/syscall.h |  1 +
 arch/riscv/include/asm/unistd.h  | 11 +++
 arch/riscv/include/uapi/asm/unistd.h |  2 +-
 arch/riscv/kernel/Makefile   |  1 +
 arch/riscv/kernel/compat_syscall_table.c | 19 
 arch/riscv/kernel/sys_riscv.c|  6 ++--
 fs/open.c| 24 +++
 fs/read_write.c  | 16 ++
 fs/sync.c|  9 ++
 include/asm-generic/compat.h |  7 +
 include/linux/compat.h   | 37 
 mm/fadvise.c | 11 +++
 mm/readahead.c   |  7 +
 13 files changed, 148 insertions(+), 3 deletions(-)
 create mode 100644 arch/riscv/kernel/compat_syscall_table.c

diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h
index 7ac6a0e275f2..384a63b86420 100644
--- a/arch/riscv/include/asm/syscall.h
+++ b/arch/riscv/include/asm/syscall.h
@@ -16,6 +16,7 @@
 
 /* The array of function pointers for syscalls. */
 extern void * const sys_call_table[];
+extern void * const compat_sys_call_table[];
 
 /*
  * Only the low 32 bits of orig_r0 are meaningful, so we return int.
diff --git a/arch/riscv/include/asm/unistd.h b/arch/riscv/include/asm/unistd.h
index 6c316093a1e5..5ddac412b578 100644
--- a/arch/riscv/include/asm/unistd.h
+++ b/arch/riscv/include/asm/unistd.h
@@ -11,6 +11,17 @@
 #define __ARCH_WANT_SYS_CLONE
 #define __ARCH_WANT_MEMFD_SECRET
 
+#ifdef CONFIG_COMPAT
+#define __ARCH_WANT_COMPAT_TRUNCATE64
+#define __ARCH_WANT_COMPAT_FTRUNCATE64
+#define __ARCH_WANT_COMPAT_FALLOCATE
+#define __ARCH_WANT_COMPAT_PREAD64
+#define __ARCH_WANT_COMPAT_PWRITE64
+#define __ARCH_WANT_COMPAT_SYNC_FILE_RANGE
+#define __ARCH_WANT_COMPAT_READAHEAD
+#define __ARCH_WANT_COMPAT_FADVISE64_64
+#endif
+
 #include 
 
 #define NR_syscalls (__NR_syscalls)
diff --git a/arch/riscv/include/uapi/asm/unistd.h 
b/arch/riscv/include/uapi/asm/unistd.h
index 8062996c2dfd..c9e50eed14aa 100644
--- a/arch/riscv/include/uapi/asm/unistd.h
+++ b/arch/riscv/include/uapi/asm/unistd.h
@@ -15,7 +15,7 @@
  * along with this program.  If not, see .
  */
 
-#ifdef __LP64__
+#if defined(__LP64__) && !defined(__SYSCALL_COMPAT)
 #define __ARCH_WANT_NEW_STAT
 #define __ARCH_WANT_SET_GET_RLIMIT
 #endif /* __LP64__ */
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 612556faa527..954dc7043ad2 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -66,3 +66,4 @@ obj-$(CONFIG_CRASH_DUMP)  += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL)   += jump_label.o
 
 obj-$(CONFIG_EFI)  += efi.o
+obj-$(CONFIG_COMPAT)   += compat_syscall_table.o
diff --git a/arch/riscv/kernel/compat_syscall_table.c 
b/arch/riscv/kernel/compat_syscall_table.c
new file mode 100644
index ..651f2b009c28
--- /dev/null
+++ b/arch/riscv/kernel/compat_syscall_table.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define __SYSCALL_COMPAT
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#undef __SYSCALL
+#define __SYSCALL(nr, call)  [nr] = (call),
+
+asmlinkage long compat_sys_rt_sigreturn(void);
+
+void * const compat_sys_call_table[__NR_syscalls] = {
+   [0 ... __NR_syscalls - 1] = sys_ni_syscall,
+#include 
+};
diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c
index 12f8a7fce78b..9c0194f176fc 100644
--- a/arch/riscv/kernel/sys_riscv.c
+++ b/arch/riscv/kernel/sys_riscv.c
@@ -33,7 +33,9 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
 {
return riscv_sys_mmap(addr, len, prot, flags, fd, offset, 0);
 }
-#else
+#endif
+
+#if defined(CONFIG_32BIT) || defined(CONFIG_COMPAT)
 SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, off_t, offset)
@@ -44,7 +46,7 @@ SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, 
len,
 */
return riscv_sys_mmap(addr, len, prot, flags, fd, offset, 12);
 }
-#endif /* !CONFIG_64BIT */
+#endif
 
 /*
  * Allows the instruction cache to be flushed from userspace.  Despite RISC-V
diff --git a/fs/open.c b/fs/open.c
index 9ff2f621b760..b25613f7c0a7 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -224,6 +224,21 @@ SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, 
length)
 }
 #endif /* BITS_PER_LONG == 32 */
 
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64)
+COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname,
+  compat_arg_u64_dual(length)

[PATCH V7 10/20] riscv: compat: Re-implement TASK_SIZE for COMPAT_32BIT

2022-02-27 Thread guoren
From: Guo Ren 

Make TASK_SIZE from const to dynamic detect TIF_32BIT flag
function. Refer to arm64 to implement DEFAULT_MAP_WINDOW_64 for
efi-stub.

Limit 32-bit compatible process in 0-2GB virtual address range
(which is enough for real scenarios), because it could avoid
address sign extend problem when 32-bit enter 64-bit and ease
software design.

The standard 32-bit TASK_SIZE is 0x9dc0:FIXADDR_START, and
compared to a compatible 32-bit, it increases 476MB for the
application's virtual address.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Reviewed-by: Arnd Bergmann 
---
 arch/riscv/include/asm/pgtable.h | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 7e949f25c933..f0d125ea3ceb 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -704,8 +704,17 @@ static inline pmd_t pmdp_establish(struct vm_area_struct 
*vma,
  * 63–48 all equal to bit 47, or else a page-fault exception will occur."
  */
 #ifdef CONFIG_64BIT
-#define TASK_SIZE  (PGDIR_SIZE * PTRS_PER_PGD / 2)
-#define TASK_SIZE_MIN  (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
+#define TASK_SIZE_64   (PGDIR_SIZE * PTRS_PER_PGD / 2)
+#define TASK_SIZE_MIN  (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
+
+#ifdef CONFIG_COMPAT
+#define TASK_SIZE_32   (_AC(0x8000, UL) - PAGE_SIZE)
+#define TASK_SIZE  (test_thread_flag(TIF_32BIT) ? \
+TASK_SIZE_32 : TASK_SIZE_64)
+#else
+#define TASK_SIZE  TASK_SIZE_64
+#endif
+
 #else
 #define TASK_SIZE  FIXADDR_START
 #define TASK_SIZE_MIN  TASK_SIZE
-- 
2.25.1



[PATCH V7 09/20] riscv: compat: Add basic compat data type implementation

2022-02-27 Thread guoren
From: Guo Ren 

Implement riscv asm/compat.h for struct compat_xxx,
is_compat_task, compat_user_regset, regset convert.

The rv64 compat.h has inherited most of the structs
from the generic one.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Cc: Arnd Bergmann 
Cc: Palmer Dabbelt 
---
 arch/riscv/include/asm/compat.h  | 129 +++
 arch/riscv/include/asm/thread_info.h |   1 +
 2 files changed, 130 insertions(+)
 create mode 100644 arch/riscv/include/asm/compat.h

diff --git a/arch/riscv/include/asm/compat.h b/arch/riscv/include/asm/compat.h
new file mode 100644
index ..2ac955b51148
--- /dev/null
+++ b/arch/riscv/include/asm/compat.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_COMPAT_H
+#define __ASM_COMPAT_H
+
+#define COMPAT_UTS_MACHINE "riscv\0\0"
+
+/*
+ * Architecture specific compatibility types
+ */
+#include 
+#include 
+#include 
+#include 
+
+static inline int is_compat_task(void)
+{
+   return test_thread_flag(TIF_32BIT);
+}
+
+struct compat_user_regs_struct {
+   compat_ulong_t pc;
+   compat_ulong_t ra;
+   compat_ulong_t sp;
+   compat_ulong_t gp;
+   compat_ulong_t tp;
+   compat_ulong_t t0;
+   compat_ulong_t t1;
+   compat_ulong_t t2;
+   compat_ulong_t s0;
+   compat_ulong_t s1;
+   compat_ulong_t a0;
+   compat_ulong_t a1;
+   compat_ulong_t a2;
+   compat_ulong_t a3;
+   compat_ulong_t a4;
+   compat_ulong_t a5;
+   compat_ulong_t a6;
+   compat_ulong_t a7;
+   compat_ulong_t s2;
+   compat_ulong_t s3;
+   compat_ulong_t s4;
+   compat_ulong_t s5;
+   compat_ulong_t s6;
+   compat_ulong_t s7;
+   compat_ulong_t s8;
+   compat_ulong_t s9;
+   compat_ulong_t s10;
+   compat_ulong_t s11;
+   compat_ulong_t t3;
+   compat_ulong_t t4;
+   compat_ulong_t t5;
+   compat_ulong_t t6;
+};
+
+static inline void regs_to_cregs(struct compat_user_regs_struct *cregs,
+struct pt_regs *regs)
+{
+   cregs->pc   = (compat_ulong_t) regs->epc;
+   cregs->ra   = (compat_ulong_t) regs->ra;
+   cregs->sp   = (compat_ulong_t) regs->sp;
+   cregs->gp   = (compat_ulong_t) regs->gp;
+   cregs->tp   = (compat_ulong_t) regs->tp;
+   cregs->t0   = (compat_ulong_t) regs->t0;
+   cregs->t1   = (compat_ulong_t) regs->t1;
+   cregs->t2   = (compat_ulong_t) regs->t2;
+   cregs->s0   = (compat_ulong_t) regs->s0;
+   cregs->s1   = (compat_ulong_t) regs->s1;
+   cregs->a0   = (compat_ulong_t) regs->a0;
+   cregs->a1   = (compat_ulong_t) regs->a1;
+   cregs->a2   = (compat_ulong_t) regs->a2;
+   cregs->a3   = (compat_ulong_t) regs->a3;
+   cregs->a4   = (compat_ulong_t) regs->a4;
+   cregs->a5   = (compat_ulong_t) regs->a5;
+   cregs->a6   = (compat_ulong_t) regs->a6;
+   cregs->a7   = (compat_ulong_t) regs->a7;
+   cregs->s2   = (compat_ulong_t) regs->s2;
+   cregs->s3   = (compat_ulong_t) regs->s3;
+   cregs->s4   = (compat_ulong_t) regs->s4;
+   cregs->s5   = (compat_ulong_t) regs->s5;
+   cregs->s6   = (compat_ulong_t) regs->s6;
+   cregs->s7   = (compat_ulong_t) regs->s7;
+   cregs->s8   = (compat_ulong_t) regs->s8;
+   cregs->s9   = (compat_ulong_t) regs->s9;
+   cregs->s10  = (compat_ulong_t) regs->s10;
+   cregs->s11  = (compat_ulong_t) regs->s11;
+   cregs->t3   = (compat_ulong_t) regs->t3;
+   cregs->t4   = (compat_ulong_t) regs->t4;
+   cregs->t5   = (compat_ulong_t) regs->t5;
+   cregs->t6   = (compat_ulong_t) regs->t6;
+};
+
+static inline void cregs_to_regs(struct compat_user_regs_struct *cregs,
+struct pt_regs *regs)
+{
+   regs->epc   = (unsigned long) cregs->pc;
+   regs->ra= (unsigned long) cregs->ra;
+   regs->sp= (unsigned long) cregs->sp;
+   regs->gp= (unsigned long) cregs->gp;
+   regs->tp= (unsigned long) cregs->tp;
+   regs->t0= (unsigned long) cregs->t0;
+   regs->t1= (unsigned long) cregs->t1;
+   regs->t2= (unsigned long) cregs->t2;
+   regs->s0= (unsigned long) cregs->s0;
+   regs->s1= (unsigned long) cregs->s1;
+   regs->a0= (unsigned long) cregs->a0;
+   regs->a1= (unsigned long) cregs->a1;
+   regs->a2= (unsigned long) cregs->a2;
+   regs->a3= (unsigned long) cregs->a3;
+   regs->a4= (unsigned long) cregs->a4;
+   regs->a5= (unsigned long) cregs->a5;
+   regs->a6= (unsigned long) cregs->a6;
+   regs->a7= (unsigned long) cregs->a7;
+   regs->s2= (unsigned long) cregs->s2;
+   regs->s3= (unsigned long) cregs->s3;
+   regs->s4= (unsigned

[PATCH V7 08/20] riscv: Fixup difference with defconfig

2022-02-27 Thread guoren
From: Guo Ren 

Let's follow the origin patch's spirit:

The only difference between rv32_defconfig and defconfig is that
rv32_defconfig has  CONFIG_ARCH_RV32I=y.

This is helpful to compare rv64-compat-rv32 v.s. rv32-linux.

Fixes: 1b937e8faa87ccfb ("RISC-V: Add separate defconfig for 32bit systems")
Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Reviewed-by: Arnd Bergmann 
Cc: Palmer Dabbelt 
---
 arch/riscv/Makefile   |   4 +
 arch/riscv/configs/rv32_defconfig | 135 --
 2 files changed, 4 insertions(+), 135 deletions(-)
 delete mode 100644 arch/riscv/configs/rv32_defconfig

diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index 7d81102cffd4..c6ca1b9cbf71 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -154,3 +154,7 @@ PHONY += rv64_randconfig
 rv64_randconfig:
$(Q)$(MAKE) 
KCONFIG_ALLCONFIG=$(srctree)/arch/riscv/configs/64-bit.config \
-f $(srctree)/Makefile randconfig
+
+PHONY += rv32_defconfig
+rv32_defconfig:
+   $(Q)$(MAKE) -f $(srctree)/Makefile defconfig 32-bit.config
diff --git a/arch/riscv/configs/rv32_defconfig 
b/arch/riscv/configs/rv32_defconfig
deleted file mode 100644
index 8b56a7f1eb06..
--- a/arch/riscv/configs/rv32_defconfig
+++ /dev/null
@@ -1,135 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ_IDLE=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_BPF_SYSCALL=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_CGROUPS=y
-CONFIG_CGROUP_SCHED=y
-CONFIG_CFS_BANDWIDTH=y
-CONFIG_CGROUP_BPF=y
-CONFIG_NAMESPACES=y
-CONFIG_USER_NS=y
-CONFIG_CHECKPOINT_RESTORE=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_EXPERT=y
-# CONFIG_SYSFS_SYSCALL is not set
-CONFIG_SOC_SIFIVE=y
-CONFIG_SOC_VIRT=y
-CONFIG_ARCH_RV32I=y
-CONFIG_SMP=y
-CONFIG_HOTPLUG_CPU=y
-CONFIG_VIRTUALIZATION=y
-CONFIG_KVM=m
-CONFIG_JUMP_LABEL=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_IP_PNP_BOOTP=y
-CONFIG_IP_PNP_RARP=y
-CONFIG_NETLINK_DIAG=y
-CONFIG_NET_9P=y
-CONFIG_NET_9P_VIRTIO=y
-CONFIG_PCI=y
-CONFIG_PCIEPORTBUS=y
-CONFIG_PCI_HOST_GENERIC=y
-CONFIG_PCIE_XILINX=y
-CONFIG_DEVTMPFS=y
-CONFIG_DEVTMPFS_MOUNT=y
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_VIRTIO_BLK=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_BLK_DEV_SR=y
-CONFIG_SCSI_VIRTIO=y
-CONFIG_ATA=y
-CONFIG_SATA_AHCI=y
-CONFIG_SATA_AHCI_PLATFORM=y
-CONFIG_NETDEVICES=y
-CONFIG_VIRTIO_NET=y
-CONFIG_MACB=y
-CONFIG_E1000E=y
-CONFIG_R8169=y
-CONFIG_MICROSEMI_PHY=y
-CONFIG_INPUT_MOUSEDEV=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_OF_PLATFORM=y
-CONFIG_SERIAL_EARLYCON_RISCV_SBI=y
-CONFIG_HVC_RISCV_SBI=y
-CONFIG_VIRTIO_CONSOLE=y
-CONFIG_HW_RANDOM=y
-CONFIG_HW_RANDOM_VIRTIO=y
-CONFIG_SPI=y
-CONFIG_SPI_SIFIVE=y
-# CONFIG_PTP_1588_CLOCK is not set
-CONFIG_DRM=y
-CONFIG_DRM_RADEON=y
-CONFIG_DRM_VIRTIO_GPU=y
-CONFIG_FB=y
-CONFIG_FRAMEBUFFER_CONSOLE=y
-CONFIG_USB=y
-CONFIG_USB_XHCI_HCD=y
-CONFIG_USB_XHCI_PLATFORM=y
-CONFIG_USB_EHCI_HCD=y
-CONFIG_USB_EHCI_HCD_PLATFORM=y
-CONFIG_USB_OHCI_HCD=y
-CONFIG_USB_OHCI_HCD_PLATFORM=y
-CONFIG_USB_STORAGE=y
-CONFIG_USB_UAS=y
-CONFIG_MMC=y
-CONFIG_MMC_SPI=y
-CONFIG_RTC_CLASS=y
-CONFIG_VIRTIO_PCI=y
-CONFIG_VIRTIO_BALLOON=y
-CONFIG_VIRTIO_INPUT=y
-CONFIG_VIRTIO_MMIO=y
-CONFIG_RPMSG_CHAR=y
-CONFIG_RPMSG_VIRTIO=y
-CONFIG_EXT4_FS=y
-CONFIG_EXT4_FS_POSIX_ACL=y
-CONFIG_AUTOFS4_FS=y
-CONFIG_MSDOS_FS=y
-CONFIG_VFAT_FS=y
-CONFIG_TMPFS=y
-CONFIG_TMPFS_POSIX_ACL=y
-CONFIG_NFS_FS=y
-CONFIG_NFS_V4=y
-CONFIG_NFS_V4_1=y
-CONFIG_NFS_V4_2=y
-CONFIG_ROOT_NFS=y
-CONFIG_9P_FS=y
-CONFIG_CRYPTO_USER_API_HASH=y
-CONFIG_CRYPTO_DEV_VIRTIO=y
-CONFIG_PRINTK_TIME=y
-CONFIG_DEBUG_FS=y
-CONFIG_DEBUG_PAGEALLOC=y
-CONFIG_SCHED_STACK_END_CHECK=y
-CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_VM_PGFLAGS=y
-CONFIG_DEBUG_MEMORY_INIT=y
-CONFIG_DEBUG_PER_CPU_MAPS=y
-CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_WQ_WATCHDOG=y
-CONFIG_DEBUG_TIMEKEEPING=y
-CONFIG_DEBUG_RT_MUTEXES=y
-CONFIG_DEBUG_SPINLOCK=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_DEBUG_RWSEMS=y
-CONFIG_DEBUG_ATOMIC_SLEEP=y
-CONFIG_STACKTRACE=y
-CONFIG_DEBUG_LIST=y
-CONFIG_DEBUG_PLIST=y
-CONFIG_DEBUG_SG=y
-# CONFIG_RCU_TRACE is not set
-CONFIG_RCU_EQS_DEBUG=y
-# CONFIG_FTRACE is not set
-# CONFIG_RUNTIME_TESTING_MENU is not set
-CONFIG_MEMTEST=y
-- 
2.25.1



[PATCH V7 07/20] syscalls: compat: Fix the missing part for __SYSCALL_COMPAT

2022-02-27 Thread guoren
From: Guo Ren 

Make "uapi asm unistd.h" could be used for architectures' COMPAT
mode. The __SYSCALL_COMPAT is first used in riscv.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Reviewed-by: Arnd Bergmann 
Reviewed-by: Christoph Hellwig 
---
 include/uapi/asm-generic/unistd.h   | 4 ++--
 tools/include/uapi/asm-generic/unistd.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/uapi/asm-generic/unistd.h 
b/include/uapi/asm-generic/unistd.h
index 1c48b0ae3ba3..45fa180cc56a 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -383,7 +383,7 @@ __SYSCALL(__NR_syslog, sys_syslog)
 
 /* kernel/ptrace.c */
 #define __NR_ptrace 117
-__SYSCALL(__NR_ptrace, sys_ptrace)
+__SC_COMP(__NR_ptrace, sys_ptrace, compat_sys_ptrace)
 
 /* kernel/sched/core.c */
 #define __NR_sched_setparam 118
@@ -779,7 +779,7 @@ __SYSCALL(__NR_rseq, sys_rseq)
 #define __NR_kexec_file_load 294
 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load)
 /* 295 through 402 are unassigned to sync up with generic numbers, don't use */
-#if __BITS_PER_LONG == 32
+#if defined(__SYSCALL_COMPAT) || __BITS_PER_LONG == 32
 #define __NR_clock_gettime64 403
 __SYSCALL(__NR_clock_gettime64, sys_clock_gettime)
 #define __NR_clock_settime64 404
diff --git a/tools/include/uapi/asm-generic/unistd.h 
b/tools/include/uapi/asm-generic/unistd.h
index 1c48b0ae3ba3..45fa180cc56a 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -383,7 +383,7 @@ __SYSCALL(__NR_syslog, sys_syslog)
 
 /* kernel/ptrace.c */
 #define __NR_ptrace 117
-__SYSCALL(__NR_ptrace, sys_ptrace)
+__SC_COMP(__NR_ptrace, sys_ptrace, compat_sys_ptrace)
 
 /* kernel/sched/core.c */
 #define __NR_sched_setparam 118
@@ -779,7 +779,7 @@ __SYSCALL(__NR_rseq, sys_rseq)
 #define __NR_kexec_file_load 294
 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load)
 /* 295 through 402 are unassigned to sync up with generic numbers, don't use */
-#if __BITS_PER_LONG == 32
+#if defined(__SYSCALL_COMPAT) || __BITS_PER_LONG == 32
 #define __NR_clock_gettime64 403
 __SYSCALL(__NR_clock_gettime64, sys_clock_gettime)
 #define __NR_clock_settime64 404
-- 
2.25.1



[PATCH V7 06/20] asm-generic: compat: Cleanup duplicate definitions

2022-02-27 Thread guoren
From: Guo Ren 

There are 7 64bit architectures that support Linux COMPAT mode to
run 32bit applications. A lot of definitions are duplicate:
 - COMPAT_USER_HZ
 - COMPAT_RLIM_INFINITY
 - COMPAT_OFF_T_MAX
 - __compat_uid_t, __compat_uid_t
 - compat_dev_t
 - compat_ipc_pid_t
 - struct compat_flock
 - struct compat_flock64
 - struct compat_statfs
 - struct compat_ipc64_perm, compat_semid64_ds,
  compat_msqid64_ds, compat_shmid64_ds

Cleanup duplicate definitions and merge them into asm-generic.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Reviewed-by: Arnd Bergmann 
Reviewed-by: Christoph Hellwig 
Cc: Palmer Dabbelt 
---
 arch/arm64/include/asm/compat.h   |  71 ++--
 arch/mips/include/asm/compat.h|  18 ++---
 arch/parisc/include/asm/compat.h  |  29 ++--
 arch/powerpc/include/asm/compat.h |  30 ++---
 arch/s390/include/asm/compat.h|  79 --
 arch/sparc/include/asm/compat.h   |  39 ---
 arch/x86/include/asm/compat.h |  80 --
 include/asm-generic/compat.h  | 106 ++
 8 files changed, 168 insertions(+), 284 deletions(-)

diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index e0faec1984a1..46317319738a 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -8,6 +8,13 @@
 #define compat_mode_t compat_mode_t
 typedef u16compat_mode_t;
 
+#define __compat_uid_t __compat_uid_t
+typedef u16__compat_uid_t;
+typedef u16__compat_gid_t;
+
+#define compat_ipc_pid_t compat_ipc_pid_t
+typedef u16 compat_ipc_pid_t;
+
 #include 
 
 #ifdef CONFIG_COMPAT
@@ -19,21 +26,15 @@ typedef u16 compat_mode_t;
 #include 
 #include 
 
-#define COMPAT_USER_HZ 100
 #ifdef __AARCH64EB__
 #define COMPAT_UTS_MACHINE "armv8b\0\0"
 #else
 #define COMPAT_UTS_MACHINE "armv8l\0\0"
 #endif
 
-typedef u16__compat_uid_t;
-typedef u16__compat_gid_t;
 typedef u16__compat_uid16_t;
 typedef u16__compat_gid16_t;
-typedef u32compat_dev_t;
 typedef s32compat_nlink_t;
-typedef u16compat_ipc_pid_t;
-typedef __kernel_fsid_tcompat_fsid_t;
 
 struct compat_stat {
 #ifdef __AARCH64EB__
@@ -87,64 +88,6 @@ struct compat_statfs {
 #define compat_user_stack_pointer() (user_stack_pointer(task_pt_regs(current)))
 #define COMPAT_MINSIGSTKSZ 2048
 
-struct compat_ipc64_perm {
-   compat_key_t key;
-   __compat_uid32_t uid;
-   __compat_gid32_t gid;
-   __compat_uid32_t cuid;
-   __compat_gid32_t cgid;
-   unsigned short mode;
-   unsigned short __pad1;
-   unsigned short seq;
-   unsigned short __pad2;
-   compat_ulong_t unused1;
-   compat_ulong_t unused2;
-};
-
-struct compat_semid64_ds {
-   struct compat_ipc64_perm sem_perm;
-   compat_ulong_t sem_otime;
-   compat_ulong_t sem_otime_high;
-   compat_ulong_t sem_ctime;
-   compat_ulong_t sem_ctime_high;
-   compat_ulong_t sem_nsems;
-   compat_ulong_t __unused3;
-   compat_ulong_t __unused4;
-};
-
-struct compat_msqid64_ds {
-   struct compat_ipc64_perm msg_perm;
-   compat_ulong_t msg_stime;
-   compat_ulong_t msg_stime_high;
-   compat_ulong_t msg_rtime;
-   compat_ulong_t msg_rtime_high;
-   compat_ulong_t msg_ctime;
-   compat_ulong_t msg_ctime_high;
-   compat_ulong_t msg_cbytes;
-   compat_ulong_t msg_qnum;
-   compat_ulong_t msg_qbytes;
-   compat_pid_t   msg_lspid;
-   compat_pid_t   msg_lrpid;
-   compat_ulong_t __unused4;
-   compat_ulong_t __unused5;
-};
-
-struct compat_shmid64_ds {
-   struct compat_ipc64_perm shm_perm;
-   compat_size_t  shm_segsz;
-   compat_ulong_t shm_atime;
-   compat_ulong_t shm_atime_high;
-   compat_ulong_t shm_dtime;
-   compat_ulong_t shm_dtime_high;
-   compat_ulong_t shm_ctime;
-   compat_ulong_t shm_ctime_high;
-   compat_pid_t   shm_cpid;
-   compat_pid_t   shm_lpid;
-   compat_ulong_t shm_nattch;
-   compat_ulong_t __unused4;
-   compat_ulong_t __unused5;
-};
-
 static inline int is_compat_task(void)
 {
return test_thread_flag(TIF_32BIT);
diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h
index 6d6e5a451f4d..ec01dc000a41 100644
--- a/arch/mips/include/asm/compat.h
+++ b/arch/mips/include/asm/compat.h
@@ -9,28 +9,28 @@
 #include 
 #include 
 
+#define __compat_uid_t __compat_uid_t
 typedef s32__compat_uid_t;
 typedef s32__compat_gid_t;
+
 typedef __compat_uid_t __compat_uid32_t;
 typedef __compat_gid_t __compat_gid32_t;
 #define __compat_uid32_t __compat_uid32_t
-#define __compat_gid32_t __compat_gid32_t
+
+#define compat_statfs  compat_statfs
+#define compat_ipc64_perm  compat_ipc64_perm
 
 #define _COMPAT_NSIG   128 /* Don't ask !$@#% ...  */
 #define _COMPAT

[PATCH V7 05/20] fs: stat: compat: Add __ARCH_WANT_COMPAT_STAT

2022-02-27 Thread guoren
From: Guo Ren 

RISC-V doesn't neeed compat_stat, so using __ARCH_WANT_COMPAT_STAT
to exclude unnecessary SYSCALL functions.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Reviewed-by: Arnd Bergmann 
Reviewed-by: Christoph Hellwig 
Cc: Palmer Dabbelt 
---
 arch/arm64/include/asm/unistd.h   | 1 +
 arch/mips/include/asm/unistd.h| 2 ++
 arch/parisc/include/asm/unistd.h  | 1 +
 arch/powerpc/include/asm/unistd.h | 1 +
 arch/s390/include/asm/unistd.h| 1 +
 arch/sparc/include/asm/unistd.h   | 1 +
 arch/x86/include/asm/unistd.h | 1 +
 fs/stat.c | 2 +-
 8 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 4e65da3445c7..037feba03a51 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -3,6 +3,7 @@
  * Copyright (C) 2012 ARM Ltd.
  */
 #ifdef CONFIG_COMPAT
+#define __ARCH_WANT_COMPAT_STAT
 #define __ARCH_WANT_COMPAT_STAT64
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_PAUSE
diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h
index c2196b1b6604..25a5253db7f4 100644
--- a/arch/mips/include/asm/unistd.h
+++ b/arch/mips/include/asm/unistd.h
@@ -50,6 +50,8 @@
 # ifdef CONFIG_32BIT
 #  define __ARCH_WANT_STAT64
 #  define __ARCH_WANT_SYS_TIME32
+# else
+#  define __ARCH_WANT_COMPAT_STAT
 # endif
 # ifdef CONFIG_MIPS32_O32
 #  define __ARCH_WANT_SYS_TIME32
diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h
index cd438e4150f6..14e0668184cb 100644
--- a/arch/parisc/include/asm/unistd.h
+++ b/arch/parisc/include/asm/unistd.h
@@ -168,6 +168,7 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, 
type5 arg5)   \
 #define __ARCH_WANT_SYS_CLONE
 #define __ARCH_WANT_SYS_CLONE3
 #define __ARCH_WANT_COMPAT_SYS_SENDFILE
+#define __ARCH_WANT_COMPAT_STAT
 
 #ifdef CONFIG_64BIT
 #define __ARCH_WANT_SYS_TIME
diff --git a/arch/powerpc/include/asm/unistd.h 
b/arch/powerpc/include/asm/unistd.h
index 5eb462af6766..b1129b4ef57d 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -44,6 +44,7 @@
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
 #define __ARCH_WANT_SYS_NEWFSTATAT
+#define __ARCH_WANT_COMPAT_STAT
 #define __ARCH_WANT_COMPAT_SYS_SENDFILE
 #endif
 #define __ARCH_WANT_SYS_FORK
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
index 9e9f75ef046a..4260bc5ce7f8 100644
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -28,6 +28,7 @@
 #define __ARCH_WANT_SYS_SIGPENDING
 #define __ARCH_WANT_SYS_SIGPROCMASK
 # ifdef CONFIG_COMPAT
+#   define __ARCH_WANT_COMPAT_STAT
 #   define __ARCH_WANT_SYS_TIME32
 #   define __ARCH_WANT_SYS_UTIME32
 # endif
diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h
index 1e66278ba4a5..d6bc76706a7a 100644
--- a/arch/sparc/include/asm/unistd.h
+++ b/arch/sparc/include/asm/unistd.h
@@ -46,6 +46,7 @@
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
 #define __ARCH_WANT_COMPAT_SYS_SENDFILE
+#define __ARCH_WANT_COMPAT_STAT
 #endif
 
 #ifdef __32bit_syscall_numbers__
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 80e9d5206a71..761173ccc33c 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -22,6 +22,7 @@
 #  include 
 #  define __ARCH_WANT_SYS_TIME
 #  define __ARCH_WANT_SYS_UTIME
+#  define __ARCH_WANT_COMPAT_STAT
 #  define __ARCH_WANT_COMPAT_SYS_PREADV64
 #  define __ARCH_WANT_COMPAT_SYS_PWRITEV64
 #  define __ARCH_WANT_COMPAT_SYS_PREADV64V2
diff --git a/fs/stat.c b/fs/stat.c
index 28d2020ba1f4..ffdeb9065d53 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -639,7 +639,7 @@ SYSCALL_DEFINE5(statx,
return do_statx(dfd, filename, flags, mask, buffer);
 }
 
-#ifdef CONFIG_COMPAT
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT)
 static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
 {
struct compat_stat tmp;
-- 
2.25.1



[PATCH V7 04/20] kconfig: Add SYSVIPC_COMPAT for all architectures

2022-02-27 Thread guoren
From: Guo Ren 

The existing per-arch definitions are pretty much historic cruft.
Move SYSVIPC_COMPAT into init/Kconfig.

Signed-off-by: Guo Ren 
Signed-off-by: Guo Ren 
Acked-by: Arnd Bergmann 
Reviewed-by: Christoph Hellwig 
Cc: Palmer Dabbelt 
---
 arch/arm64/Kconfig   | 4 
 arch/mips/Kconfig| 5 -
 arch/parisc/Kconfig  | 4 
 arch/powerpc/Kconfig | 5 -
 arch/s390/Kconfig| 3 ---
 arch/sparc/Kconfig   | 5 -
 arch/x86/Kconfig | 4 
 init/Kconfig | 4 
 8 files changed, 4 insertions(+), 30 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 09b885cc4db5..51fdb6e9c522 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2108,10 +2108,6 @@ config DMI
 
 endmenu
 
-config SYSVIPC_COMPAT
-   def_bool y
-   depends on COMPAT && SYSVIPC
-
 menu "Power management options"
 
 source "kernel/power/Kconfig"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 058446f01487..91a17ad380c9 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -3170,16 +3170,12 @@ config MIPS32_COMPAT
 config COMPAT
bool
 
-config SYSVIPC_COMPAT
-   bool
-
 config MIPS32_O32
bool "Kernel support for o32 binaries"
depends on 64BIT
select ARCH_WANT_OLD_COMPAT_IPC
select COMPAT
select MIPS32_COMPAT
-   select SYSVIPC_COMPAT if SYSVIPC
help
  Select this option if you want to run o32 binaries.  These are pure
  32-bit binaries as used by the 32-bit Linux/MIPS port.  Most of
@@ -3193,7 +3189,6 @@ config MIPS32_N32
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
select COMPAT
select MIPS32_COMPAT
-   select SYSVIPC_COMPAT if SYSVIPC
help
  Select this option if you want to run n32 binaries.  These are
  64-bit binaries using 32-bit quantities for addressing and certain
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 43c1c880def6..bc56759d44a2 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -345,10 +345,6 @@ config COMPAT
def_bool y
depends on 64BIT
 
-config SYSVIPC_COMPAT
-   def_bool y
-   depends on COMPAT && SYSVIPC
-
 config AUDIT_ARCH
def_bool y
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b779603978e1..5a32b7f21af2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -291,11 +291,6 @@ config COMPAT
select ARCH_WANT_OLD_COMPAT_IPC
select COMPAT_OLD_SIGACTION
 
-config SYSVIPC_COMPAT
-   bool
-   depends on COMPAT && SYSVIPC
-   default y
-
 config SCHED_OMIT_FRAME_POINTER
bool
default y
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index be9f39fd06df..80f69cafbb87 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -459,9 +459,6 @@ config COMPAT
  (and some other stuff like libraries and such) is needed for
  executing 31 bit applications.  It is safe to say "Y".
 
-config SYSVIPC_COMPAT
-   def_bool y if COMPAT && SYSVIPC
-
 config SMP
def_bool y
 
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 1cab1b284f1a..15d5725bd623 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -488,9 +488,4 @@ config COMPAT
select ARCH_WANT_OLD_COMPAT_IPC
select COMPAT_OLD_SIGACTION
 
-config SYSVIPC_COMPAT
-   bool
-   depends on COMPAT && SYSVIPC
-   default y
-
 source "drivers/sbus/char/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9f5bd41bf660..7d0487189f6e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2860,10 +2860,6 @@ config COMPAT
 if COMPAT
 config COMPAT_FOR_U64_ALIGNMENT
def_bool y
-
-config SYSVIPC_COMPAT
-   def_bool y
-   depends on SYSVIPC
 endif
 
 endmenu
diff --git a/init/Kconfig b/init/Kconfig
index e9119bf54b1f..589ccec56571 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -386,6 +386,10 @@ config SYSVIPC_SYSCTL
depends on SYSCTL
default y
 
+config SYSVIPC_COMPAT
+   def_bool y
+   depends on COMPAT && SYSVIPC
+
 config POSIX_MQUEUE
bool "POSIX Message Queues"
depends on NET
-- 
2.25.1



[PATCH V7 03/20] compat: consolidate the compat_flock{, 64} definition

2022-02-27 Thread guoren
From: Christoph Hellwig 

Provide a single common definition for the compat_flock and
compat_flock64 structures using the same tricks as for the native
variants.  Another extra define is added for the packing required on
x86.

Signed-off-by: Christoph Hellwig 
Signed-off-by: Guo Ren 
Reviewed-by: Arnd Bergmann 
---
 arch/arm64/include/asm/compat.h   | 16 
 arch/mips/include/asm/compat.h| 19 ++-
 arch/parisc/include/asm/compat.h  | 16 
 arch/powerpc/include/asm/compat.h | 16 
 arch/s390/include/asm/compat.h| 16 
 arch/sparc/include/asm/compat.h   | 18 +-
 arch/x86/include/asm/compat.h | 20 +++-
 include/linux/compat.h| 31 +++
 8 files changed, 37 insertions(+), 115 deletions(-)

diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index 276328765408..e0faec1984a1 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -65,22 +65,6 @@ struct compat_stat {
compat_ulong_t  __unused4[2];
 };
 
-struct compat_flock {
-   short   l_type;
-   short   l_whence;
-   compat_off_tl_start;
-   compat_off_tl_len;
-   compat_pid_tl_pid;
-};
-
-struct compat_flock64 {
-   short   l_type;
-   short   l_whence;
-   compat_loff_t   l_start;
-   compat_loff_t   l_len;
-   compat_pid_tl_pid;
-};
-
 struct compat_statfs {
int f_type;
int f_bsize;
diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h
index 6a350c1f70d7..6d6e5a451f4d 100644
--- a/arch/mips/include/asm/compat.h
+++ b/arch/mips/include/asm/compat.h
@@ -55,23 +55,8 @@ struct compat_stat {
s32 st_pad4[14];
 };
 
-struct compat_flock {
-   short   l_type;
-   short   l_whence;
-   compat_off_tl_start;
-   compat_off_tl_len;
-   s32 l_sysid;
-   compat_pid_tl_pid;
-   s32 pad[4];
-};
-
-struct compat_flock64 {
-   short   l_type;
-   short   l_whence;
-   compat_loff_t   l_start;
-   compat_loff_t   l_len;
-   compat_pid_tl_pid;
-};
+#define __ARCH_COMPAT_FLOCK_EXTRA_SYSIDs32 l_sysid;
+#define __ARCH_COMPAT_FLOCK_PADs32 pad[4];
 
 struct compat_statfs {
int f_type;
diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h
index c04f5a637c39..a1e4534d8050 100644
--- a/arch/parisc/include/asm/compat.h
+++ b/arch/parisc/include/asm/compat.h
@@ -53,22 +53,6 @@ struct compat_stat {
u32 st_spare4[3];
 };
 
-struct compat_flock {
-   short   l_type;
-   short   l_whence;
-   compat_off_tl_start;
-   compat_off_tl_len;
-   compat_pid_tl_pid;
-};
-
-struct compat_flock64 {
-   short   l_type;
-   short   l_whence;
-   compat_loff_t   l_start;
-   compat_loff_t   l_len;
-   compat_pid_tl_pid;
-};
-
 struct compat_statfs {
s32 f_type;
s32 f_bsize;
diff --git a/arch/powerpc/include/asm/compat.h 
b/arch/powerpc/include/asm/compat.h
index 83d8f70779cb..5ef3c7c83c34 100644
--- a/arch/powerpc/include/asm/compat.h
+++ b/arch/powerpc/include/asm/compat.h
@@ -44,22 +44,6 @@ struct compat_stat {
u32 __unused4[2];
 };
 
-struct compat_flock {
-   short   l_type;
-   short   l_whence;
-   compat_off_tl_start;
-   compat_off_tl_len;
-   compat_pid_tl_pid;
-};
-
-struct compat_flock64 {
-   short   l_type;
-   short   l_whence;
-   compat_loff_t   l_start;
-   compat_loff_t   l_len;
-   compat_pid_tl_pid;
-};
-
 struct compat_statfs {
int f_type;
int f_bsize;
diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h
index 0f14b3188b1b..07f04d37068b 100644
--- a/arch/s390/include/asm/compat.h
+++ b/arch/s390/include/asm/compat.h
@@ -102,22 +102,6 @@ struct compat_stat {
u32 __unused5;
 };
 
-struct compat_flock {
-   short   l_type;
-   short   l_whence;
-   compat_off_tl_start;
-   compat_off_tl_len;
-   compat_pid_tl_pid;
-};
-
-struct compat_flock64 {
-   short   l_type;
-   short   l_whence;
-   compat_loff_t   l_start;
-   compat_loff_t   l_len;
-   compat_pid_tl_pid;
-};
-
 struct compat_statfs {
u32 f_type;
u32 f_bsize;
diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h
index 108078751bb5..d78fb44942e0 100644
--- a/arch/sparc/in

[PATCH V7 02/20] uapi: always define F_GETLK64/F_SETLK64/F_SETLKW64 in fcntl.h

2022-02-27 Thread guoren
From: Christoph Hellwig 

The F_GETLK64/F_SETLK64/F_SETLKW64 fcntl opcodes are only implemented
for the 32-bit syscall APIs, but are also needed for compat handling
on 64-bit kernels.

Consolidate them in unistd.h instead of definining the internal compat
definitions in compat.h, which is rather error prone (e.g. parisc
gets the values wrong currently).

Note that before this change they were never visible to userspace due
to the fact that CONFIG_64BIT is only set for kernel builds.

Signed-off-by: Christoph Hellwig 
Signed-off-by: Guo Ren 
Reviewed-by: Arnd Bergmann 
---
 arch/arm64/include/asm/compat.h| 4 
 arch/mips/include/asm/compat.h | 4 
 arch/mips/include/uapi/asm/fcntl.h | 4 ++--
 arch/powerpc/include/asm/compat.h  | 4 
 arch/s390/include/asm/compat.h | 4 
 arch/sparc/include/asm/compat.h| 4 
 arch/x86/include/asm/compat.h  | 4 
 include/uapi/asm-generic/fcntl.h   | 4 ++--
 tools/include/uapi/asm-generic/fcntl.h | 2 --
 9 files changed, 4 insertions(+), 30 deletions(-)

diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index eaa6ca062d89..276328765408 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -73,10 +73,6 @@ struct compat_flock {
compat_pid_tl_pid;
 };
 
-#define F_GETLK64  12  /*  using 'struct flock64' */
-#define F_SETLK64  13
-#define F_SETLKW64 14
-
 struct compat_flock64 {
short   l_type;
short   l_whence;
diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h
index bbb3bc5a42fd..6a350c1f70d7 100644
--- a/arch/mips/include/asm/compat.h
+++ b/arch/mips/include/asm/compat.h
@@ -65,10 +65,6 @@ struct compat_flock {
s32 pad[4];
 };
 
-#define F_GETLK64  33
-#define F_SETLK64  34
-#define F_SETLKW64 35
-
 struct compat_flock64 {
short   l_type;
short   l_whence;
diff --git a/arch/mips/include/uapi/asm/fcntl.h 
b/arch/mips/include/uapi/asm/fcntl.h
index 9e44ac810db9..0369a38e3d4f 100644
--- a/arch/mips/include/uapi/asm/fcntl.h
+++ b/arch/mips/include/uapi/asm/fcntl.h
@@ -44,11 +44,11 @@
 #define F_SETOWN   24  /*  for sockets. */
 #define F_GETOWN   23  /*  for sockets. */
 
-#ifndef __mips64
+#if __BITS_PER_LONG == 32 || defined(__KERNEL__)
 #define F_GETLK64  33  /*  using 'struct flock64' */
 #define F_SETLK64  34
 #define F_SETLKW64 35
-#endif
+#endif /* __BITS_PER_LONG == 32 || defined(__KERNEL__) */
 
 #if _MIPS_SIM != _MIPS_SIM_ABI64
 #define __ARCH_FLOCK_EXTRA_SYSID   long l_sysid;
diff --git a/arch/powerpc/include/asm/compat.h 
b/arch/powerpc/include/asm/compat.h
index 7afc96fb6524..83d8f70779cb 100644
--- a/arch/powerpc/include/asm/compat.h
+++ b/arch/powerpc/include/asm/compat.h
@@ -52,10 +52,6 @@ struct compat_flock {
compat_pid_tl_pid;
 };
 
-#define F_GETLK64  12  /*  using 'struct flock64' */
-#define F_SETLK64  13
-#define F_SETLKW64 14
-
 struct compat_flock64 {
short   l_type;
short   l_whence;
diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h
index cdc7ae72529d..0f14b3188b1b 100644
--- a/arch/s390/include/asm/compat.h
+++ b/arch/s390/include/asm/compat.h
@@ -110,10 +110,6 @@ struct compat_flock {
compat_pid_tl_pid;
 };
 
-#define F_GETLK64   12
-#define F_SETLK64   13
-#define F_SETLKW64  14
-
 struct compat_flock64 {
short   l_type;
short   l_whence;
diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h
index bd949fcf9d63..108078751bb5 100644
--- a/arch/sparc/include/asm/compat.h
+++ b/arch/sparc/include/asm/compat.h
@@ -84,10 +84,6 @@ struct compat_flock {
short   __unused;
 };
 
-#define F_GETLK64  12
-#define F_SETLK64  13
-#define F_SETLKW64 14
-
 struct compat_flock64 {
short   l_type;
short   l_whence;
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 7516e4199b3c..8d19a212f4f2 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -58,10 +58,6 @@ struct compat_flock {
compat_pid_tl_pid;
 };
 
-#define F_GETLK64  12  /*  using 'struct flock64' */
-#define F_SETLK64  13
-#define F_SETLKW64 14
-
 /*
  * IA32 uses 4 byte alignment for 64 bit quantities,
  * so we need to pack this structure.
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 77aa9f2ff98d..f13d37b60775 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -116,13 +116,13 @@
 #define F_GETSIG   11  /* for sockets. */
 #endif
 
-#ifndef CONFIG_64BIT
+#if __BITS_PER_LONG == 32 || defined(__KERNEL__)
 #ifndef F_GETLK64
 #define F_GETLK64  12  /*  using 'struct flock64' */
 #define F_SETLK64  

[PATCH V7 01/20] uapi: simplify __ARCH_FLOCK{,64}_PAD a little

2022-02-27 Thread guoren
From: Christoph Hellwig 

Don't bother to define the symbols empty, just don't use them.
That makes the intent a little more clear.

Remove the unused HAVE_ARCH_STRUCT_FLOCK64 define and merge the
32-bit mips struct flock into the generic one.

Add a new __ARCH_FLOCK_EXTRA_SYSID macro following the style of
__ARCH_FLOCK_PAD to avoid having a separate definition just for
one architecture.

Signed-off-by: Christoph Hellwig 
Signed-off-by: Guo Ren 
Reviewed-by: Arnd Bergmann 
---
 arch/mips/include/uapi/asm/fcntl.h | 26 +++---
 include/uapi/asm-generic/fcntl.h   | 19 +++
 tools/include/uapi/asm-generic/fcntl.h | 19 +++
 3 files changed, 17 insertions(+), 47 deletions(-)

diff --git a/arch/mips/include/uapi/asm/fcntl.h 
b/arch/mips/include/uapi/asm/fcntl.h
index 42e13dead543..9e44ac810db9 100644
--- a/arch/mips/include/uapi/asm/fcntl.h
+++ b/arch/mips/include/uapi/asm/fcntl.h
@@ -50,30 +50,10 @@
 #define F_SETLKW64 35
 #endif
 
-/*
- * The flavours of struct flock.  "struct flock" is the ABI compliant
- * variant.  Finally struct flock64 is the LFS variant of struct flock.
 As
- * a historic accident and inconsistence with the ABI definition it doesn't
- * contain all the same fields as struct flock.
- */
-
 #if _MIPS_SIM != _MIPS_SIM_ABI64
-
-#include 
-
-struct flock {
-   short   l_type;
-   short   l_whence;
-   __kernel_off_t  l_start;
-   __kernel_off_t  l_len;
-   longl_sysid;
-   __kernel_pid_t l_pid;
-   longpad[4];
-};
-
-#define HAVE_ARCH_STRUCT_FLOCK
-
-#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
+#define __ARCH_FLOCK_EXTRA_SYSID   long l_sysid;
+#define __ARCH_FLOCK_PAD   long pad[4];
+#endif
 
 #include 
 
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index ecd0f5bdfc1d..77aa9f2ff98d 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -192,25 +192,19 @@ struct f_owner_ex {
 
 #define F_LINUX_SPECIFIC_BASE  1024
 
-#ifndef HAVE_ARCH_STRUCT_FLOCK
-#ifndef __ARCH_FLOCK_PAD
-#define __ARCH_FLOCK_PAD
-#endif
-
 struct flock {
short   l_type;
short   l_whence;
__kernel_off_t  l_start;
__kernel_off_t  l_len;
__kernel_pid_t  l_pid;
-   __ARCH_FLOCK_PAD
-};
+#ifdef __ARCH_FLOCK_EXTRA_SYSID
+   __ARCH_FLOCK_EXTRA_SYSID
 #endif
-
-#ifndef HAVE_ARCH_STRUCT_FLOCK64
-#ifndef __ARCH_FLOCK64_PAD
-#define __ARCH_FLOCK64_PAD
+#ifdef __ARCH_FLOCK_PAD
+   __ARCH_FLOCK_PAD
 #endif
+};
 
 struct flock64 {
short  l_type;
@@ -218,8 +212,9 @@ struct flock64 {
__kernel_loff_t l_start;
__kernel_loff_t l_len;
__kernel_pid_t  l_pid;
+#ifdef __ARCH_FLOCK64_PAD
__ARCH_FLOCK64_PAD
-};
 #endif
+};
 
 #endif /* _ASM_GENERIC_FCNTL_H */
diff --git a/tools/include/uapi/asm-generic/fcntl.h 
b/tools/include/uapi/asm-generic/fcntl.h
index ac190958c981..99bc9b15ce2b 100644
--- a/tools/include/uapi/asm-generic/fcntl.h
+++ b/tools/include/uapi/asm-generic/fcntl.h
@@ -187,25 +187,19 @@ struct f_owner_ex {
 
 #define F_LINUX_SPECIFIC_BASE  1024
 
-#ifndef HAVE_ARCH_STRUCT_FLOCK
-#ifndef __ARCH_FLOCK_PAD
-#define __ARCH_FLOCK_PAD
-#endif
-
 struct flock {
short   l_type;
short   l_whence;
__kernel_off_t  l_start;
__kernel_off_t  l_len;
__kernel_pid_t  l_pid;
-   __ARCH_FLOCK_PAD
-};
+#ifdef __ARCH_FLOCK_EXTRA_SYSID
+   __ARCH_FLOCK_EXTRA_SYSID
 #endif
-
-#ifndef HAVE_ARCH_STRUCT_FLOCK64
-#ifndef __ARCH_FLOCK64_PAD
-#define __ARCH_FLOCK64_PAD
+#ifdef __ARCH_FLOCK_PAD
+   __ARCH_FLOCK_PAD
 #endif
+};
 
 struct flock64 {
short  l_type;
@@ -213,8 +207,9 @@ struct flock64 {
__kernel_loff_t l_start;
__kernel_loff_t l_len;
__kernel_pid_t  l_pid;
+#ifdef __ARCH_FLOCK64_PAD
__ARCH_FLOCK64_PAD
-};
 #endif
+};
 
 #endif /* _ASM_GENERIC_FCNTL_H */
-- 
2.25.1



[PATCH V7 00/20] riscv: compat: Add COMPAT mode support for rv64

2022-02-27 Thread guoren
From: Guo Ren 

Currently, most 64-bit architectures (x86, parisc, powerpc, arm64,
s390, mips, sparc) have supported COMPAT mode. But they all have
history issues and can't use standard linux unistd.h. RISC-V would
be first standard __SYSCALL_COMPAT user of include/uapi/asm-generic
/unistd.h.

The patchset are based on v5.17-rc5, you can compare rv64-compat
v.s. rv32-native in qemu with following steps:

 - Prepare rv32 rootfs & fw_jump.bin by buildroot.org
   $ git clone git://git.busybox.net/buildroot
   $ cd buildroot
   $ make qemu_riscv32_virt_defconfig O=qemu_riscv32_virt_defconfig
   $ make -C qemu_riscv32_virt_defconfig
   $ make qemu_riscv64_virt_defconfig O=qemu_riscv64_virt_defconfig
   $ make -C qemu_riscv64_virt_defconfig
   (Got fw_jump.bin & rootfs.ext2 in qemu_riscvXX_virt_defconfig/images)

 - Prepare Linux rv32 & rv64 Image
   $ git clone g...@github.com:c-sky/csky-linux.git -b riscv_compat_v6 linux
   $ cd linux
   $ echo "CONFIG_STRICT_KERNEL_RWX=n" >> arch/riscv/configs/defconfig
   $ echo "CONFIG_STRICT_MODULE_RWX=n" >> arch/riscv/configs/defconfig
   $ make ARCH=riscv CROSS_COMPILE=riscv32-buildroot-linux-gnu- 
O=../build-rv32/ rv32_defconfig
   $ make ARCH=riscv CROSS_COMPILE=riscv32-buildroot-linux-gnu- 
O=../build-rv32/ Image
   $ make ARCH=riscv CROSS_COMPILE=riscv64-buildroot-linux-gnu- 
O=../build-rv64/ defconfig
   $ make ARCH=riscv CROSS_COMPILE=riscv64-buildroot-linux-gnu- 
O=../build-rv64/ Image

 - Prepare Qemu: (rv32 compat was made by LIU Zhiwei )
   $ git clone g...@github.com:alistair23/qemu.git -b 
riscv-to-apply.for-upstream linux
   $ cd qemu
   $ ./configure --target-list="riscv64-softmmu riscv32-softmmu"
   $ make

Now let's compare rv64-compat with rv32-native memory footprint with almost the 
same
defconfig, rootfs, opensbi in one qemu.

 - Run rv64 with rv32 rootfs in compat mode:
   $ ./build/qemu-system-riscv64 -cpu rv64 -M virt -m 64m -nographic -bios 
qemu_riscv64_virt_defconfig/images/fw_jump.bin -kernel build-rv64/Image -drive 
file qemu_riscv32_virt_defconfig/images/rootfs.ext2,format=raw,id=hd0 -device 
virtio-blk-device,drive=hd0 -append "rootwait root=/dev/vda ro console=ttyS0 
earlycon=sbi" -netdev user,id=net0 -device virtio-net-device,netdev=net0

QEMU emulator version 6.2.50 (v6.2.0-29-g196d7182c8)
OpenSBI v0.9
[0.00] Linux version 5.16.0-rc6-00017-g750f87086bdd-dirty 
(guoren@guoren-Z87-HD3) (riscv64-unknown-linux-gnu-gcc (GCC) 10.2.0, GNU ld 
(GNU Binutils) 2.37) #96 SMP Tue Dec 28 21:01:55 CST 2021
[0.00] OF: fdt: Ignoring memory range 0x8000 - 0x8020
[0.00] Machine model: riscv-virtio,qemu
[0.00] earlycon: sbi0 at I/O port 0x0 (options '')
[0.00] printk: bootconsole [sbi0] enabled
[0.00] efi: UEFI not found.
[0.00] Zone ranges:
[0.00]   DMA32[mem 0x8020-0x83ff]
[0.00]   Normal   empty
[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x8020-0x83ff]
[0.00] Initmem setup node 0 [mem 0x8020-0x83ff]
[0.00] SBI specification v0.2 detected
[0.00] SBI implementation ID=0x1 Version=0x9
[0.00] SBI TIME extension detected
[0.00] SBI IPI extension detected
[0.00] SBI RFENCE extension detected
[0.00] SBI v0.2 HSM extension detected
[0.00] riscv: ISA extensions acdfhimsu
[0.00] riscv: ELF capabilities acdfim
[0.00] percpu: Embedded 17 pages/cpu s30696 r8192 d30744 u69632
[0.00] Built 1 zonelists, mobility grouping on.  Total pages: 15655
[0.00] Kernel command line: rootwait root=/dev/vda ro console=ttyS0 
earlycon=sbi
[0.00] Dentry cache hash table entries: 8192 (order: 4, 65536 bytes, 
linear)
[0.00] Inode-cache hash table entries: 4096 (order: 3, 32768 bytes, 
linear)
[0.00] mem auto-init: stack:off, heap alloc:off, heap free:off
[0.00] Virtual kernel memory layout:
[0.00]   fixmap : 0xffcefee0 - 0xffceff00   (2048 
kB)
[0.00]   pci io : 0xffceff00 - 0xffcf   (  16 
MB)
[0.00]  vmemmap : 0xffcf - 0xffcf   (4095 
MB)
[0.00]  vmalloc : 0xffd0 - 0xffdf   (65535 
MB)
[0.00]   lowmem : 0xffe0 - 0xffe003e0   (  62 
MB)
[0.00]   kernel : 0x8000 - 0x   (2047 
MB)
[0.00] Memory: 52788K/63488K available (6184K kernel code, 888K rwdata, 
1917K rodata, 294K init, 297K bss, 10700K reserved, 0K cma-reserved)
[0.00] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
[0.00] rcu: Hierarchical RCU implementation.
[0.00] rcu: RCU restricting CPUs from NR_CPUS=8 to nr_cpu_ids=1.
[0.00] rcu: RCU debug extended QS entry/exit.
[0.00]  Tracing variant of Tasks RCU ena

[PATCH 01/11] dma-direct: use is_swiotlb_active in dma_direct_map_page

2022-02-27 Thread Christoph Hellwig
Use the more specific is_swiotlb_active check instead of checking the
global swiotlb_force variable.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 kernel/dma/direct.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 4632b0f4f72eb..4dc16e08c7e1a 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -91,7 +91,7 @@ static inline dma_addr_t dma_direct_map_page(struct device 
*dev,
return swiotlb_map(dev, phys, size, dir, attrs);
 
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
-   if (swiotlb_force != SWIOTLB_NO_FORCE)
+   if (is_swiotlb_active(dev))
return swiotlb_map(dev, phys, size, dir, attrs);
 
dev_WARN_ONCE(dev, 1,
-- 
2.30.2



[PATCH 11/11] x86: remove cruft from

2022-02-27 Thread Christoph Hellwig
 gets pulled in by all drivers using the DMA API.
Remove x86 internal variables and unnecessary includes from it.

Signed-off-by: Christoph Hellwig 
---
 arch/x86/include/asm/dma-mapping.h | 11 ---
 arch/x86/include/asm/iommu.h   |  2 ++
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/dma-mapping.h 
b/arch/x86/include/asm/dma-mapping.h
index 256fd8115223d..1c66708e30623 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -2,17 +2,6 @@
 #ifndef _ASM_X86_DMA_MAPPING_H
 #define _ASM_X86_DMA_MAPPING_H
 
-/*
- * IOMMU interface. See Documentation/core-api/dma-api-howto.rst and
- * Documentation/core-api/dma-api.rst for documentation.
- */
-
-#include 
-#include 
-
-extern int iommu_merge;
-extern int panic_on_overflow;
-
 extern const struct dma_map_ops *dma_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index dba89ed40d38d..0bef44d30a278 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -8,6 +8,8 @@
 
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
+extern int iommu_merge;
+extern int panic_on_overflow;
 
 #ifdef CONFIG_SWIOTLB
 extern bool x86_swiotlb_enable;
-- 
2.30.2



[PATCH 10/11] swiotlb: merge swiotlb-xen initialization into swiotlb

2022-02-27 Thread Christoph Hellwig
Allow to pass a remap argument to the swiotlb initialization functions
to handle the Xen/x86 remap case.  ARM/ARM64 never did any remapping
from xen_swiotlb_fixup, so we don't even need that quirk.

Signed-off-by: Christoph Hellwig 
---
 arch/arm/xen/mm.c   |  23 +++---
 arch/x86/include/asm/xen/page.h |   5 --
 arch/x86/kernel/pci-dma.c   |  19 +++--
 arch/x86/pci/sta2x11-fixup.c|   2 +-
 drivers/xen/swiotlb-xen.c   | 128 +---
 include/linux/swiotlb.h |   7 +-
 include/xen/arm/page.h  |   1 -
 include/xen/swiotlb-xen.h   |   8 +-
 kernel/dma/swiotlb.c| 120 +++---
 9 files changed, 96 insertions(+), 217 deletions(-)

diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
index a7e54a087b802..58b40f87617d3 100644
--- a/arch/arm/xen/mm.c
+++ b/arch/arm/xen/mm.c
@@ -23,22 +23,20 @@
 #include 
 #include 
 
-unsigned long xen_get_swiotlb_free_pages(unsigned int order)
+static gfp_t xen_swiotlb_gfp(void)
 {
phys_addr_t base;
-   gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM;
u64 i;
 
for_each_mem_range(i, &base, NULL) {
if (base < (phys_addr_t)0x) {
if (IS_ENABLED(CONFIG_ZONE_DMA32))
-   flags |= __GFP_DMA32;
-   else
-   flags |= __GFP_DMA;
-   break;
+   return __GFP_DMA32;
+   return __GFP_DMA;
}
}
-   return __get_free_pages(flags, order);
+
+   return GFP_KERNEL;
 }
 
 static bool hypercall_cflush = false;
@@ -143,10 +141,15 @@ static int __init xen_mm_init(void)
if (!xen_swiotlb_detect())
return 0;
 
-   rc = xen_swiotlb_init();
/* we can work with the default swiotlb */
-   if (rc < 0 && rc != -EEXIST)
-   return rc;
+   if (!io_tlb_default_mem.nslabs) {
+   if (!xen_initial_domain())
+   return -EINVAL;
+   rc = swiotlb_init_late(swiotlb_size_or_default(),
+  xen_swiotlb_gfp(), NULL);
+   if (rc < 0)
+   return rc;
+   }
 
cflush.op = 0;
cflush.a.dev_bus_addr = 0;
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index e989bc2269f54..1fc67df500145 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -357,9 +357,4 @@ static inline bool xen_arch_need_swiotlb(struct device *dev,
return false;
 }
 
-static inline unsigned long xen_get_swiotlb_free_pages(unsigned int order)
-{
-   return __get_free_pages(__GFP_NOWARN, order);
-}
-
 #endif /* _ASM_X86_XEN_PAGE_H */
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index bb08184a50e3a..b0901f027cdcd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -68,15 +68,12 @@ static inline void __init pci_swiotlb_detect_4gb(void)
 #endif /* CONFIG_SWIOTLB */
 
 #ifdef CONFIG_SWIOTLB_XEN
-static bool xen_swiotlb;
-
 static void __init pci_xen_swiotlb_init(void)
 {
if (!xen_initial_domain() && !x86_swiotlb_enable)
return;
x86_swiotlb_enable = false;
-   xen_swiotlb = true;
-   xen_swiotlb_init_early();
+   swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup);
dma_ops = &xen_swiotlb_dma_ops;
if (IS_ENABLED(CONFIG_PCI))
pci_request_acs();
@@ -84,14 +81,16 @@ static void __init pci_xen_swiotlb_init(void)
 
 int pci_xen_swiotlb_init_late(void)
 {
-   int rc;
-
-   if (xen_swiotlb)
+   if (dma_ops == &xen_swiotlb_dma_ops)
return 0;
 
-   rc = xen_swiotlb_init();
-   if (rc)
-   return rc;
+   /* we can work with the default swiotlb */
+   if (!io_tlb_default_mem.nslabs) {
+   int rc = swiotlb_init_late(swiotlb_size_or_default(),
+  GFP_KERNEL, xen_swiotlb_fixup);
+   if (rc < 0)
+   return rc;
+   }
 
/* XXX: this switches the dma ops under live devices! */
dma_ops = &xen_swiotlb_dma_ops;
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index c7e6faf59a861..7368afc039987 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev)
int size = STA2X11_SWIOTLB_SIZE;
/* First instance: register your own swiotlb area */
dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size);
-   if (swiotlb_init_late(size, GFP_DMA))
+   if (swiotlb_init_late(size, GFP_DMA, NULL))
dev_emerg(&pdev->dev, "init swiotlb failed\n");
}
list_add(&instance->list, &sta2x11_instance_list);
diff --git

[PATCH 08/11] swiotlb: make the swiotlb_init interface more useful

2022-02-27 Thread Christoph Hellwig
Pass a bool to pass if swiotlb needs to be enabled based on the
addressing needs and replace the verbose argument with a set of
flags, including one to force enable bounce buffering.

Note that this patch removes the possibility to force xen-swiotlb
use using swiotlb=force on the command line on x86 (arm and arm64
never supported that), but this interface will be restored shortly.

Signed-off-by: Christoph Hellwig 
---
 arch/arm/mm/init.c |  6 +
 arch/arm64/mm/init.c   |  6 +
 arch/ia64/mm/init.c|  4 +--
 arch/mips/cavium-octeon/dma-octeon.c   |  2 +-
 arch/mips/loongson64/dma.c |  2 +-
 arch/mips/sibyte/common/dma.c  |  2 +-
 arch/powerpc/include/asm/swiotlb.h |  1 +
 arch/powerpc/mm/mem.c  |  3 ++-
 arch/powerpc/platforms/pseries/setup.c |  3 ---
 arch/riscv/mm/init.c   |  8 +-
 arch/s390/mm/init.c|  3 +--
 arch/x86/kernel/cpu/mshyperv.c |  8 --
 arch/x86/kernel/pci-dma.c  | 15 ++-
 arch/x86/mm/mem_encrypt_amd.c  |  3 ---
 drivers/xen/swiotlb-xen.c  |  4 +--
 include/linux/swiotlb.h| 15 ++-
 include/trace/events/swiotlb.h | 29 -
 kernel/dma/swiotlb.c   | 35 ++
 18 files changed, 56 insertions(+), 93 deletions(-)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 6d0cb0f7bc54b..73f30d278b565 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -312,11 +312,7 @@ static void __init free_highpages(void)
 void __init mem_init(void)
 {
 #ifdef CONFIG_ARM_LPAE
-   if (swiotlb_force == SWIOTLB_FORCE ||
-   max_pfn > arm_dma_pfn_limit)
-   swiotlb_init(1);
-   else
-   swiotlb_force = SWIOTLB_NO_FORCE;
+   swiotlb_init(max_pfn > arm_dma_pfn_limit, SWIOTLB_VERBOSE);
 #endif
 
set_max_mapnr(pfn_to_page(max_pfn) - mem_map);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index db63cc885771a..52102adda3d28 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -373,11 +373,7 @@ void __init bootmem_init(void)
  */
 void __init mem_init(void)
 {
-   if (swiotlb_force == SWIOTLB_FORCE ||
-   max_pfn > PFN_DOWN(arm64_dma_phys_limit))
-   swiotlb_init(1);
-   else if (!xen_swiotlb_detect())
-   swiotlb_force = SWIOTLB_NO_FORCE;
+   swiotlb_init(max_pfn > PFN_DOWN(arm64_dma_phys_limit), SWIOTLB_VERBOSE);
 
/* this will put all unused low memory onto the freelists */
memblock_free_all();
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 5d165607bf354..3c3e15b22608f 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -437,9 +437,7 @@ mem_init (void)
if (iommu_detected)
break;
 #endif
-#ifdef CONFIG_SWIOTLB
-   swiotlb_init(1);
-#endif
+   swiotlb_init(true, SWIOTLB_VERBOSE);
} while (0);
 
 #ifdef CONFIG_FLATMEM
diff --git a/arch/mips/cavium-octeon/dma-octeon.c 
b/arch/mips/cavium-octeon/dma-octeon.c
index fb7547e217263..9fbba6a8fa4c5 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -235,5 +235,5 @@ void __init plat_swiotlb_setup(void)
 #endif
 
swiotlb_adjust_size(swiotlbsize);
-   swiotlb_init(1);
+   swiotlb_init(true, SWIOTLB_VERBOSE);
 }
diff --git a/arch/mips/loongson64/dma.c b/arch/mips/loongson64/dma.c
index 364f2f27c8723..8220a1bc0db64 100644
--- a/arch/mips/loongson64/dma.c
+++ b/arch/mips/loongson64/dma.c
@@ -24,5 +24,5 @@ phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 
 void __init plat_swiotlb_setup(void)
 {
-   swiotlb_init(1);
+   swiotlb_init(true, SWIOTLB_VERBOSE);
 }
diff --git a/arch/mips/sibyte/common/dma.c b/arch/mips/sibyte/common/dma.c
index eb47a94f3583e..c5c2c782aff68 100644
--- a/arch/mips/sibyte/common/dma.c
+++ b/arch/mips/sibyte/common/dma.c
@@ -10,5 +10,5 @@
 
 void __init plat_swiotlb_setup(void)
 {
-   swiotlb_init(1);
+   swiotlb_init(true, SWIOTLB_VERBOSE);
 }
diff --git a/arch/powerpc/include/asm/swiotlb.h 
b/arch/powerpc/include/asm/swiotlb.h
index 3c1a1cd161286..4203b5e0a88ed 100644
--- a/arch/powerpc/include/asm/swiotlb.h
+++ b/arch/powerpc/include/asm/swiotlb.h
@@ -9,6 +9,7 @@
 #include 
 
 extern unsigned int ppc_swiotlb_enable;
+extern unsigned int ppc_swiotlb_flags;
 
 #ifdef CONFIG_SWIOTLB
 void swiotlb_detect_4g(void);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 8e301cd8925b2..d99b8b5b40ca6 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -251,7 +252,7 @@ void __init mem_init(void)
if (is_secure_guest())
svm_swiotlb_init();
else
-   swiotlb_init(0);
+   swiotlb_init(ppc_swiotlb_enable, ppc_swiot

[PATCH 09/11] swiotlb: add a SWIOTLB_ANY flag to lift the low memory restriction

2022-02-27 Thread Christoph Hellwig
Power SVM wants to allocate a swiotlb buffer that is not restricted to
low memory for the trusted hypervisor scheme.  Consolidate the support
for this into the swiotlb_init interface by adding a new flag.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/svm.h   |  4 
 arch/powerpc/mm/mem.c|  5 +
 arch/powerpc/platforms/pseries/svm.c | 26 +-
 include/linux/swiotlb.h  |  1 +
 kernel/dma/swiotlb.c |  9 +++--
 5 files changed, 10 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h
index 7546402d796af..85580b30aba48 100644
--- a/arch/powerpc/include/asm/svm.h
+++ b/arch/powerpc/include/asm/svm.h
@@ -15,8 +15,6 @@ static inline bool is_secure_guest(void)
return mfmsr() & MSR_S;
 }
 
-void __init svm_swiotlb_init(void);
-
 void dtl_cache_ctor(void *addr);
 #define get_dtl_cache_ctor()   (is_secure_guest() ? dtl_cache_ctor : NULL)
 
@@ -27,8 +25,6 @@ static inline bool is_secure_guest(void)
return false;
 }
 
-static inline void svm_swiotlb_init(void) {}
-
 #define get_dtl_cache_ctor() NULL
 
 #endif /* CONFIG_PPC_SVM */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index d99b8b5b40ca6..a4d65418c30a9 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -249,10 +249,7 @@ void __init mem_init(void)
 * back to to-down.
 */
memblock_set_bottom_up(true);
-   if (is_secure_guest())
-   svm_swiotlb_init();
-   else
-   swiotlb_init(ppc_swiotlb_enable, ppc_swiotlb_flags);
+   swiotlb_init(ppc_swiotlb_enable, ppc_swiotlb_flags);
 #endif
 
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
diff --git a/arch/powerpc/platforms/pseries/svm.c 
b/arch/powerpc/platforms/pseries/svm.c
index c5228f4969eb2..3b4045d508ec8 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -28,7 +28,7 @@ static int __init init_svm(void)
 * need to use the SWIOTLB buffer for DMA even if dma_capable() says
 * otherwise.
 */
-   swiotlb_force = SWIOTLB_FORCE;
+   ppc_swiotlb_flags |= SWIOTLB_ANY | SWIOTLB_FORCE;
 
/* Share the SWIOTLB buffer with the host. */
swiotlb_update_mem_attributes();
@@ -37,30 +37,6 @@ static int __init init_svm(void)
 }
 machine_early_initcall(pseries, init_svm);
 
-/*
- * Initialize SWIOTLB. Essentially the same as swiotlb_init(), except that it
- * can allocate the buffer anywhere in memory. Since the hypervisor doesn't 
have
- * any addressing limitation, we don't need to allocate it in low addresses.
- */
-void __init svm_swiotlb_init(void)
-{
-   unsigned char *vstart;
-   unsigned long bytes, io_tlb_nslabs;
-
-   io_tlb_nslabs = (swiotlb_size_or_default() >> IO_TLB_SHIFT);
-   io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
-
-   bytes = io_tlb_nslabs << IO_TLB_SHIFT;
-
-   vstart = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE);
-   if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false))
-   return;
-
-
-   memblock_free(vstart, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
-   panic("SVM: Cannot allocate SWIOTLB buffer");
-}
-
 int set_memory_encrypted(unsigned long addr, int numpages)
 {
if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index dcecf953f7997..ee655f2e4d28b 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -15,6 +15,7 @@ struct scatterlist;
 
 #define SWIOTLB_VERBOSE(1 << 0) /* verbose initialization */
 #define SWIOTLB_FORCE  (1 << 1) /* force bounce buffering */
+#define SWIOTLB_ANY(1 << 2) /* allow any memory for the buffer */
 
 /*
  * Maximum allowable number of contiguous slabs to map,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index ad604e5a0983d..ec200e40fc397 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -275,8 +275,13 @@ void __init swiotlb_init(bool addressing_limit, unsigned 
int flags)
if (swiotlb_force_disable)
return;
 
-   /* Get IO TLB memory from the low pages */
-   tlb = memblock_alloc_low(bytes, PAGE_SIZE);
+   /*
+* By default allocate the bonuce buffer memory from low memory.
+*/
+   if (flags & SWIOTLB_ANY)
+   tlb = memblock_alloc(bytes, PAGE_SIZE);
+   else
+   tlb = memblock_alloc_low(bytes, PAGE_SIZE);
if (!tlb)
goto fail;
if (swiotlb_init_with_tbl(tlb, default_nslabs, flags))
-- 
2.30.2



[PATCH 07/11] x86: remove the IOMMU table infrastructure

2022-02-27 Thread Christoph Hellwig
The IOMMU table tries to separate the different IOMMUs into different
backends, but actually requires various cross calls.

Rewrite the code to do the generic swiotlb/swiotlb-xen setup directly
in pci-dma.c and then just call into the IOMMU drivers.

Signed-off-by: Christoph Hellwig 
---
 arch/ia64/include/asm/iommu_table.h|   7 --
 arch/x86/include/asm/dma-mapping.h |   1 -
 arch/x86/include/asm/gart.h|   5 +-
 arch/x86/include/asm/iommu.h   |   6 ++
 arch/x86/include/asm/iommu_table.h | 102 ---
 arch/x86/include/asm/swiotlb.h |  30 ---
 arch/x86/include/asm/xen/swiotlb-xen.h |   2 -
 arch/x86/kernel/Makefile   |   2 -
 arch/x86/kernel/amd_gart_64.c  |   5 +-
 arch/x86/kernel/aperture_64.c  |  14 ++--
 arch/x86/kernel/pci-dma.c  | 111 -
 arch/x86/kernel/pci-iommu_table.c  |  77 -
 arch/x86/kernel/pci-swiotlb.c  |  77 -
 arch/x86/kernel/tboot.c|   1 -
 arch/x86/kernel/vmlinux.lds.S  |  12 ---
 arch/x86/xen/Makefile  |   2 -
 arch/x86/xen/pci-swiotlb-xen.c |  96 -
 drivers/iommu/amd/init.c   |   6 --
 drivers/iommu/amd/iommu.c  |   5 +-
 drivers/iommu/intel/dmar.c |   6 +-
 include/linux/dmar.h   |   6 +-
 21 files changed, 114 insertions(+), 459 deletions(-)
 delete mode 100644 arch/ia64/include/asm/iommu_table.h
 delete mode 100644 arch/x86/include/asm/iommu_table.h
 delete mode 100644 arch/x86/include/asm/swiotlb.h
 delete mode 100644 arch/x86/kernel/pci-iommu_table.c
 delete mode 100644 arch/x86/kernel/pci-swiotlb.c
 delete mode 100644 arch/x86/xen/pci-swiotlb-xen.c

diff --git a/arch/ia64/include/asm/iommu_table.h 
b/arch/ia64/include/asm/iommu_table.h
deleted file mode 100644
index cc96116ac276a..0
--- a/arch/ia64/include/asm/iommu_table.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_IOMMU_TABLE_H
-#define _ASM_IA64_IOMMU_TABLE_H
-
-#define IOMMU_INIT_POST(_detect)
-
-#endif /* _ASM_IA64_IOMMU_TABLE_H */
diff --git a/arch/x86/include/asm/dma-mapping.h 
b/arch/x86/include/asm/dma-mapping.h
index bb1654fe0ce74..256fd8115223d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -9,7 +9,6 @@
 
 #include 
 #include 
-#include 
 
 extern int iommu_merge;
 extern int panic_on_overflow;
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 3185565743459..5af8088a10df6 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -38,7 +38,7 @@ extern int gart_iommu_aperture_disabled;
 extern void early_gart_iommu_check(void);
 extern int gart_iommu_init(void);
 extern void __init gart_parse_options(char *);
-extern int gart_iommu_hole_init(void);
+void gart_iommu_hole_init(void);
 
 #else
 #define gart_iommu_aperture0
@@ -51,9 +51,8 @@ static inline void early_gart_iommu_check(void)
 static inline void gart_parse_options(char *options)
 {
 }
-static inline int gart_iommu_hole_init(void)
+static inline void gart_iommu_hole_init(void)
 {
-   return -ENODEV;
 }
 #endif
 
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index bf1ed2ddc74bd..dba89ed40d38d 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -9,6 +9,12 @@
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
 
+#ifdef CONFIG_SWIOTLB
+extern bool x86_swiotlb_enable;
+#else
+#define x86_swiotlb_enable false
+#endif
+
 /* 10 seconds */
 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
 
diff --git a/arch/x86/include/asm/iommu_table.h 
b/arch/x86/include/asm/iommu_table.h
deleted file mode 100644
index 1fb3fd1a83c25..0
--- a/arch/x86/include/asm/iommu_table.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_IOMMU_TABLE_H
-#define _ASM_X86_IOMMU_TABLE_H
-
-#include 
-
-/*
- * History lesson:
- * The execution chain of IOMMUs in 2.6.36 looks as so:
- *
- *[xen-swiotlb]
- * |
- * +[swiotlb *]--+
- */ | \
- *   /  |  \
- *[GART] [Calgary]  [Intel VT-d]
- * /
- */
- * [AMD-Vi]
- *
- * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip
- * over the rest of IOMMUs and unconditionally initialize the SWIOTLB.
- * Also it would surreptitiously initialize set the swiotlb=1 if there were
- * more than 4GB and if the user did not pass in 'iommu=off'. The swiotlb
- * flag would be turned off by all IOMMUs except the Calgary one.
- *
- * The IOMMU_INIT* macros allow a similar tree (or more complex if desired)
- * to be built by defining who we depend on.
- *
- * And all that needs to be done is to use one of the macros in the IOMMU
- * and the pci-dma.c will take care of the rest.
- */
-
-struct iom

[PATCH 06/11] MIPS/octeon: use swiotlb_init instead of open coding it

2022-02-27 Thread Christoph Hellwig
Use the generic swiotlb initialization helper instead of open coding it.

Signed-off-by: Christoph Hellwig 
---
 arch/mips/cavium-octeon/dma-octeon.c | 15 ++-
 arch/mips/pci/pci-octeon.c   |  2 +-
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/arch/mips/cavium-octeon/dma-octeon.c 
b/arch/mips/cavium-octeon/dma-octeon.c
index df70308db0e69..fb7547e217263 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -186,15 +186,12 @@ phys_addr_t dma_to_phys(struct device *dev, dma_addr_t 
daddr)
return daddr;
 }
 
-char *octeon_swiotlb;
-
 void __init plat_swiotlb_setup(void)
 {
phys_addr_t start, end;
phys_addr_t max_addr;
phys_addr_t addr_size;
size_t swiotlbsize;
-   unsigned long swiotlb_nslabs;
u64 i;
 
max_addr = 0;
@@ -236,15 +233,7 @@ void __init plat_swiotlb_setup(void)
if (OCTEON_IS_OCTEON2() && max_addr >= 0x1ul)
swiotlbsize = 64 * (1<<20);
 #endif
-   swiotlb_nslabs = swiotlbsize >> IO_TLB_SHIFT;
-   swiotlb_nslabs = ALIGN(swiotlb_nslabs, IO_TLB_SEGSIZE);
-   swiotlbsize = swiotlb_nslabs << IO_TLB_SHIFT;
-
-   octeon_swiotlb = memblock_alloc_low(swiotlbsize, PAGE_SIZE);
-   if (!octeon_swiotlb)
-   panic("%s: Failed to allocate %zu bytes align=%lx\n",
- __func__, swiotlbsize, PAGE_SIZE);
 
-   if (swiotlb_init_with_tbl(octeon_swiotlb, swiotlb_nslabs, 1) == -ENOMEM)
-   panic("Cannot allocate SWIOTLB buffer");
+   swiotlb_adjust_size(swiotlbsize);
+   swiotlb_init(1);
 }
diff --git a/arch/mips/pci/pci-octeon.c b/arch/mips/pci/pci-octeon.c
index fc29b85cfa926..e457a18cbdc59 100644
--- a/arch/mips/pci/pci-octeon.c
+++ b/arch/mips/pci/pci-octeon.c
@@ -664,7 +664,7 @@ static int __init octeon_pci_setup(void)
 
/* BAR1 movable regions contiguous to cover the swiotlb */
octeon_bar1_pci_phys =
-   virt_to_phys(octeon_swiotlb) & ~((1ull << 22) - 1);
+   io_tlb_default_mem.start & ~((1ull << 22) - 1);
 
for (index = 0; index < 32; index++) {
union cvmx_pci_bar1_indexx bar1_index;
-- 
2.30.2



[PATCH 05/11] swiotlb: pass a gfp_mask argument to swiotlb_init_late

2022-02-27 Thread Christoph Hellwig
Let the caller chose a zone to allocate from.  This will be used
later on by the xen-swiotlb initialization on arm.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 arch/x86/pci/sta2x11-fixup.c | 2 +-
 include/linux/swiotlb.h  | 2 +-
 kernel/dma/swiotlb.c | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index e0c039a75b2db..c7e6faf59a861 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev)
int size = STA2X11_SWIOTLB_SIZE;
/* First instance: register your own swiotlb area */
dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size);
-   if (swiotlb_init_late(size))
+   if (swiotlb_init_late(size, GFP_DMA))
dev_emerg(&pdev->dev, "init swiotlb failed\n");
}
list_add(&instance->list, &sta2x11_instance_list);
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b48b26bfa0edb..1befd6b2ccf5e 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -40,7 +40,7 @@ extern void swiotlb_init(int verbose);
 int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
 unsigned long swiotlb_size_or_default(void);
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
-int swiotlb_init_late(size_t size);
+int swiotlb_init_late(size_t size, gfp_t gfp_mask);
 extern void __init swiotlb_update_mem_attributes(void);
 
 phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 5f64b02fbb732..a653fcf1fe6c2 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -290,7 +290,7 @@ swiotlb_init(int verbose)
  * initialize the swiotlb later using the slab allocator if needed.
  * This should be just like above, but with some error catching.
  */
-int swiotlb_init_late(size_t size)
+int swiotlb_init_late(size_t size, gfp_t gfp_mask)
 {
unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned long bytes;
@@ -309,7 +309,7 @@ int swiotlb_init_late(size_t size)
bytes = nslabs << IO_TLB_SHIFT;
 
while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
-   vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+   vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
  order);
if (vstart)
break;
-- 
2.30.2



[PATCH 03/11] swiotlb: simplify swiotlb_max_segment

2022-02-27 Thread Christoph Hellwig
Remove the bogus Xen override that was usually larger than the actual
size and just calculate the value on demand.  Note that
swiotlb_max_segment still doesn't make sense as an interface and should
eventually be removed.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 drivers/xen/swiotlb-xen.c |  2 --
 include/linux/swiotlb.h   |  1 -
 kernel/dma/swiotlb.c  | 20 +++-
 3 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 47aebd98f52f5..485cd06ed39e7 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -202,7 +202,6 @@ int xen_swiotlb_init(void)
rc = swiotlb_late_init_with_tbl(start, nslabs);
if (rc)
return rc;
-   swiotlb_set_max_segment(PAGE_SIZE);
return 0;
 error:
if (nslabs > 1024 && repeat--) {
@@ -254,7 +253,6 @@ void __init xen_swiotlb_init_early(void)
 
if (swiotlb_init_with_tbl(start, nslabs, true))
panic("Cannot allocate SWIOTLB buffer");
-   swiotlb_set_max_segment(PAGE_SIZE);
 }
 #endif /* CONFIG_X86 */
 
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index f6c3638255d54..9fb3a568f0c51 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -164,7 +164,6 @@ static inline void swiotlb_adjust_size(unsigned long size)
 #endif /* CONFIG_SWIOTLB */
 
 extern void swiotlb_print_info(void);
-extern void swiotlb_set_max_segment(unsigned int);
 
 #ifdef CONFIG_DMA_RESTRICTED_POOL
 struct page *swiotlb_alloc(struct device *dev, size_t size);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 36fbf1181d285..519e363097190 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -75,12 +75,6 @@ struct io_tlb_mem io_tlb_default_mem;
 
 phys_addr_t swiotlb_unencrypted_base;
 
-/*
- * Max segment that we can provide which (if pages are contingous) will
- * not be bounced (unless SWIOTLB_FORCE is set).
- */
-static unsigned int max_segment;
-
 static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
 
 static int __init
@@ -104,18 +98,12 @@ early_param("swiotlb", setup_io_tlb_npages);
 
 unsigned int swiotlb_max_segment(void)
 {
-   return io_tlb_default_mem.nslabs ? max_segment : 0;
+   if (!io_tlb_default_mem.nslabs)
+   return 0;
+   return rounddown(io_tlb_default_mem.nslabs << IO_TLB_SHIFT, PAGE_SIZE);
 }
 EXPORT_SYMBOL_GPL(swiotlb_max_segment);
 
-void swiotlb_set_max_segment(unsigned int val)
-{
-   if (swiotlb_force == SWIOTLB_FORCE)
-   max_segment = 1;
-   else
-   max_segment = rounddown(val, PAGE_SIZE);
-}
-
 unsigned long swiotlb_size_or_default(void)
 {
return default_nslabs << IO_TLB_SHIFT;
@@ -267,7 +255,6 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long 
nslabs, int verbose)
 
if (verbose)
swiotlb_print_info();
-   swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
return 0;
 }
 
@@ -368,7 +355,6 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true);
 
swiotlb_print_info();
-   swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
return 0;
 }
 
-- 
2.30.2



[PATCH 04/11] swiotlb: rename swiotlb_late_init_with_default_size

2022-02-27 Thread Christoph Hellwig
swiotlb_late_init_with_default_size is an overly verbose name that
doesn't even catch what the function is doing, given that the size is
not just a default but the actual requested size.

Rename it to swiotlb_init_late.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 arch/x86/pci/sta2x11-fixup.c | 2 +-
 include/linux/swiotlb.h  | 2 +-
 kernel/dma/swiotlb.c | 6 ++
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 101081ad64b6d..e0c039a75b2db 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev)
int size = STA2X11_SWIOTLB_SIZE;
/* First instance: register your own swiotlb area */
dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size);
-   if (swiotlb_late_init_with_default_size(size))
+   if (swiotlb_init_late(size))
dev_emerg(&pdev->dev, "init swiotlb failed\n");
}
list_add(&instance->list, &sta2x11_instance_list);
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 9fb3a568f0c51..b48b26bfa0edb 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -40,7 +40,7 @@ extern void swiotlb_init(int verbose);
 int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
 unsigned long swiotlb_size_or_default(void);
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
-extern int swiotlb_late_init_with_default_size(size_t default_size);
+int swiotlb_init_late(size_t size);
 extern void __init swiotlb_update_mem_attributes(void);
 
 phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 519e363097190..5f64b02fbb732 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -290,11 +290,9 @@ swiotlb_init(int verbose)
  * initialize the swiotlb later using the slab allocator if needed.
  * This should be just like above, but with some error catching.
  */
-int
-swiotlb_late_init_with_default_size(size_t default_size)
+int swiotlb_init_late(size_t size)
 {
-   unsigned long nslabs =
-   ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+   unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned long bytes;
unsigned char *vstart = NULL;
unsigned int order;
-- 
2.30.2



cleanup swiotlb initialization v2

2022-02-27 Thread Christoph Hellwig
Hi all,

this series tries to clean up the swiotlb initialization, including
that of swiotlb-xen.  To get there is also removes the x86 iommu table
infrastructure that massively obsfucates the initialization path.

Git tree:

git://git.infradead.org/users/hch/misc.git swiotlb-init-cleanup

Gitweb:


http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/swiotlb-init-cleanup

Changes since v1:
 - skip IOMMU initialization on Xen PV kernels
 - various small whitespace / typo fixes

Diffstat:
 arch/ia64/include/asm/iommu_table.h  |7 -
 arch/x86/include/asm/iommu_table.h   |  102 ---
 arch/x86/include/asm/swiotlb.h   |   30 -
 arch/x86/kernel/pci-iommu_table.c|   77 --
 arch/x86/kernel/pci-swiotlb.c|   77 --
 arch/x86/xen/pci-swiotlb-xen.c   |   96 --
 b/arch/arm/mm/init.c |6 -
 b/arch/arm/xen/mm.c  |   23 ++--
 b/arch/arm64/mm/init.c   |6 -
 b/arch/ia64/mm/init.c|4 
 b/arch/mips/cavium-octeon/dma-octeon.c   |   15 --
 b/arch/mips/loongson64/dma.c |2 
 b/arch/mips/pci/pci-octeon.c |2 
 b/arch/mips/sibyte/common/dma.c  |2 
 b/arch/powerpc/include/asm/svm.h |4 
 b/arch/powerpc/include/asm/swiotlb.h |1 
 b/arch/powerpc/mm/mem.c  |6 -
 b/arch/powerpc/platforms/pseries/setup.c |3 
 b/arch/powerpc/platforms/pseries/svm.c   |   26 
 b/arch/riscv/mm/init.c   |8 -
 b/arch/s390/mm/init.c|3 
 b/arch/x86/include/asm/dma-mapping.h |   12 --
 b/arch/x86/include/asm/gart.h|5 
 b/arch/x86/include/asm/iommu.h   |8 +
 b/arch/x86/include/asm/xen/page.h|5 
 b/arch/x86/include/asm/xen/swiotlb-xen.h |2 
 b/arch/x86/kernel/Makefile   |2 
 b/arch/x86/kernel/amd_gart_64.c  |5 
 b/arch/x86/kernel/aperture_64.c  |   14 --
 b/arch/x86/kernel/cpu/mshyperv.c |8 -
 b/arch/x86/kernel/pci-dma.c  |  109 
 b/arch/x86/kernel/tboot.c|1 
 b/arch/x86/kernel/vmlinux.lds.S  |   12 --
 b/arch/x86/mm/mem_encrypt_amd.c  |3 
 b/arch/x86/pci/sta2x11-fixup.c   |2 
 b/arch/x86/xen/Makefile  |2 
 b/drivers/iommu/amd/init.c   |6 -
 b/drivers/iommu/amd/iommu.c  |5 
 b/drivers/iommu/intel/dmar.c |6 -
 b/drivers/xen/swiotlb-xen.c  |  132 -
 b/include/linux/dmar.h   |6 -
 b/include/linux/swiotlb.h|   22 ++--
 b/include/trace/events/swiotlb.h |   29 +
 b/include/xen/arm/page.h |1 
 b/include/xen/swiotlb-xen.h  |8 +
 b/kernel/dma/direct.h|2 
 b/kernel/dma/swiotlb.c   |  163 +++
 47 files changed, 253 insertions(+), 817 deletions(-)


[PATCH 02/11] swiotlb: make swiotlb_exit a no-op if SWIOTLB_FORCE is set

2022-02-27 Thread Christoph Hellwig
If force bouncing is enabled we can't release the buffers.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 kernel/dma/swiotlb.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f1e7ea160b433..36fbf1181d285 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -378,6 +378,9 @@ void __init swiotlb_exit(void)
unsigned long tbl_vaddr;
size_t tbl_size, slots_size;
 
+   if (swiotlb_force == SWIOTLB_FORCE)
+   return;
+
if (!mem->nslabs)
return;
 
-- 
2.30.2