Re: [PATCH 4/5] s390x: Add boot device fallback infrastructure

2024-06-05 Thread Thomas Huth

On 05/06/2024 10.20, Thomas Huth wrote:

On 29/05/2024 17.43, jro...@linux.ibm.com wrote:

From: Jared Rossi 

Add a routine for loading the next IPLB if a device fails to boot.

This includes some minor changes to the List-Directed IPL routine so that the
failing device may be retried using the legacy boot pointers before moving on to
the next device.

Signed-off-by: Jared Rossi 
---

...

diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c
index a2137449dc..69391557fa 100644
--- a/pc-bios/s390-ccw/bootmap.c
+++ b/pc-bios/s390-ccw/bootmap.c
@@ -144,7 +144,10 @@ static block_number_t 
load_eckd_segments(block_number_t blk, bool ldipl,

  bool more_data;
  memset(_bprs, FREE_SPACE_FILLER, sizeof(_bprs));
-    read_block(blk, bprs, "BPRS read failed");
+    if (!read_block_nonfatal(blk, bprs)) {
+    IPL_assert(ldipl, "BPRS read failed");
+    return -1;
+    }
  do {
  more_data = false;
@@ -188,7 +191,10 @@ static block_number_t 
load_eckd_segments(block_number_t blk, bool ldipl,

   * I.e. the next ptr must point to the unused memory area
   */
  memset(_bprs, FREE_SPACE_FILLER, sizeof(_bprs));
-    read_block(block_nr, bprs, "BPRS continuation read failed");
+    if (!read_block_nonfatal(block_nr, bprs)) {
+    IPL_assert(ldipl, "BPRS continuation read failed");
+    break;
+    }
  more_data = true;
  break;
  }
@@ -197,7 +203,10 @@ static block_number_t 
load_eckd_segments(block_number_t blk, bool ldipl,

   * to memory (address).
   */
  rc = virtio_read_many(block_nr, (void *)(*address), count + 1);
-    IPL_assert(rc == 0, "code chunk read failed");
+    if (rc != 0) {
+    IPL_assert(ldipl, "code chunk read failed");
+    break;
+    }
  *address += (count + 1) * virtio_get_block_size();
  }
@@ -295,13 +304,22 @@ static void run_eckd_boot_script(block_number_t 
bmt_block_nr,

 " maximum number of boot entries allowed");
  memset(sec, FREE_SPACE_FILLER, sizeof(sec));
-    read_block(bmt_block_nr, sec, "Cannot read Boot Map Table");
+    if (!read_block_nonfatal(bmt_block_nr, sec)) {
+    IPL_assert(ldipl, "Cannot read Boot Map Table");
+    return;
+    }
  block_nr = gen_eckd_block_num(&bmt->entry[loadparm].xeckd, ldipl);
-    IPL_assert(block_nr != -1, "Cannot find Boot Map Table Entry");
+    if (block_nr == -1) {
+    IPL_assert(ldipl, "Cannot find Boot Map Table Entry");
+    return;
+    }
  memset(sec, FREE_SPACE_FILLER, sizeof(sec));
-    read_block(block_nr, sec, "Cannot read Boot Map Script");
+    if (!read_block_nonfatal(block_nr, sec)) {
+    IPL_assert(ldipl, "Cannot read Boot Map Script");
+    return;
+    }
  for (i = 0; bms->entry[i].type == BOOT_SCRIPT_LOAD ||
  bms->entry[i].type == BOOT_SCRIPT_SIGNATURE; i++) {
@@ -319,13 +337,10 @@ static void run_eckd_boot_script(block_number_t 
bmt_block_nr,

  } while (block_nr != -1);
  }
-    if (ldipl && bms->entry[i].type != BOOT_SCRIPT_EXEC) {
-    /* Abort LD-IPL and retry as CCW-IPL */
+    if (bms->entry[i].type != BOOT_SCRIPT_EXEC) {
+    IPL_assert(ldipl, "Unknown script entry type");
  return;
  }
-
-    IPL_assert(bms->entry[i].type == BOOT_SCRIPT_EXEC,
-   "Unknown script entry type");
  write_reset_psw(bms->entry[i].address.load_address); /* no return */
  jump_to_IPL_code(0); /* no return */
  }
@@ -492,7 +507,7 @@ static void ipl_eckd(void)
  /* LD-IPL does not use the S1B bock, just make it NULL */
  run_eckd_boot_script(ldipl_bmt, NULL_BLOCK_NR);
  /* Only return in error, retry as CCW-IPL */
-    sclp_print("Retrying IPL ");
+    sclp_print("LD-IPL failed, retrying device\n");
  print_eckd_msg();
  }
  memset(sec, FREE_SPACE_FILLER, sizeof(sec));
@@ -944,5 +959,5 @@ void zipl_load(void)
  panic("\n! Unknown IPL device type !\n");
  }
-    sclp_print("zIPL load failed.\n");
+    panic("zIPL load failed.\n");


Why replace the sclp_print() here? Wouldn't it be nicer to continue 
panicking on the calling site instead?


Ok, after looking at the 5th patch, I think I understand it now: panic() is 
not fatal anymore and might restart with the next boot device... not sure 
whether I like that, but let's discuss that on patch 5 instead...


 Thomas




Re: [PATCH 4/5] s390x: Add boot device fallback infrastructure

2024-06-05 Thread Thomas Huth

On 29/05/2024 17.43, jro...@linux.ibm.com wrote:

From: Jared Rossi 

Add a routine for loading the next IPLB if a device fails to boot.

This includes some minor changes to the List-Directed IPL routine so that the
failing device may be retried using the legacy boot pointers before moving on to
the next device.

Signed-off-by: Jared Rossi 
---

...

diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c
index a2137449dc..69391557fa 100644
--- a/pc-bios/s390-ccw/bootmap.c
+++ b/pc-bios/s390-ccw/bootmap.c
@@ -144,7 +144,10 @@ static block_number_t load_eckd_segments(block_number_t 
blk, bool ldipl,
  bool more_data;
  
  memset(_bprs, FREE_SPACE_FILLER, sizeof(_bprs));

-read_block(blk, bprs, "BPRS read failed");
+if (!read_block_nonfatal(blk, bprs)) {
+IPL_assert(ldipl, "BPRS read failed");
+return -1;
+}
  
  do {

  more_data = false;
@@ -188,7 +191,10 @@ static block_number_t load_eckd_segments(block_number_t 
blk, bool ldipl,
   * I.e. the next ptr must point to the unused memory area
   */
  memset(_bprs, FREE_SPACE_FILLER, sizeof(_bprs));
-read_block(block_nr, bprs, "BPRS continuation read failed");
+if (!read_block_nonfatal(block_nr, bprs)) {
+IPL_assert(ldipl, "BPRS continuation read failed");
+break;
+}
  more_data = true;
  break;
  }
@@ -197,7 +203,10 @@ static block_number_t load_eckd_segments(block_number_t 
blk, bool ldipl,
   * to memory (address).
   */
  rc = virtio_read_many(block_nr, (void *)(*address), count + 1);
-IPL_assert(rc == 0, "code chunk read failed");
+if (rc != 0) {
+IPL_assert(ldipl, "code chunk read failed");
+break;
+}
  
  *address += (count + 1) * virtio_get_block_size();

  }
@@ -295,13 +304,22 @@ static void run_eckd_boot_script(block_number_t 
bmt_block_nr,
 " maximum number of boot entries allowed");
  
  memset(sec, FREE_SPACE_FILLER, sizeof(sec));

-read_block(bmt_block_nr, sec, "Cannot read Boot Map Table");
+if (!read_block_nonfatal(bmt_block_nr, sec)) {
+IPL_assert(ldipl, "Cannot read Boot Map Table");
+return;
+}
  
  block_nr = gen_eckd_block_num(&bmt->entry[loadparm].xeckd, ldipl);

-IPL_assert(block_nr != -1, "Cannot find Boot Map Table Entry");
+if (block_nr == -1) {
+IPL_assert(ldipl, "Cannot find Boot Map Table Entry");
+return;
+}
  
  memset(sec, FREE_SPACE_FILLER, sizeof(sec));

-read_block(block_nr, sec, "Cannot read Boot Map Script");
+if (!read_block_nonfatal(block_nr, sec)) {
+IPL_assert(ldipl, "Cannot read Boot Map Script");
+return;
+}
  
  for (i = 0; bms->entry[i].type == BOOT_SCRIPT_LOAD ||

  bms->entry[i].type == BOOT_SCRIPT_SIGNATURE; i++) {
@@ -319,13 +337,10 @@ static void run_eckd_boot_script(block_number_t 
bmt_block_nr,
  } while (block_nr != -1);
  }
  
-if (ldipl && bms->entry[i].type != BOOT_SCRIPT_EXEC) {

-/* Abort LD-IPL and retry as CCW-IPL */
+if (bms->entry[i].type != BOOT_SCRIPT_EXEC) {
+IPL_assert(ldipl, "Unknown script entry type");
  return;
  }
-
-IPL_assert(bms->entry[i].type == BOOT_SCRIPT_EXEC,
-   "Unknown script entry type");
  write_reset_psw(bms->entry[i].address.load_address); /* no return */
  jump_to_IPL_code(0); /* no return */
  }
@@ -492,7 +507,7 @@ static void ipl_eckd(void)
  /* LD-IPL does not use the S1B bock, just make it NULL */
  run_eckd_boot_script(ldipl_bmt, NULL_BLOCK_NR);
  /* Only return in error, retry as CCW-IPL */
-sclp_print("Retrying IPL ");
+sclp_print("LD-IPL failed, retrying device\n");
  print_eckd_msg();
  }
  memset(sec, FREE_SPACE_FILLER, sizeof(sec));
@@ -944,5 +959,5 @@ void zipl_load(void)
  panic("\n! Unknown IPL device type !\n");
  }
  
-sclp_print("zIPL load failed.\n");

+panic("zIPL load failed.\n");


Why replace the sclp_print() here? Wouldn't it be nicer to continue 
panicking on the calling site instead?



  }
diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c
index 3e51d698d7..248ed5a410 100644
--- a/pc-bios/s390-ccw/main.c
+++ b/pc-bios/s390-ccw/main.c
@@ -53,6 +53,12 @@ unsigned int get_loadparm_index(void)
  return atoui(loadparm_str);
  }
  
+static void copy_qipl(void)

+{
+QemuIplParameters *early_qipl = (QemuIplParameters *)QIPL_ADDRESS;
+memcpy(&qipl, early_qipl, sizeof(QemuIplParameters));
+}


You could move this function as a static inline into iplb.h ...

...

diff --git a/pc-bios/s390-ccw/netmain.c b/pc-bios/s390-ccw/netmain.c
index 5cd619b2d6..65cee15fef 100644
--- 

[PATCH 4/5] s390x: Add boot device fallback infrastructure

2024-05-29 Thread jrossi
From: Jared Rossi 

Add a routine for loading the next IPLB if a device fails to boot.

This includes some minor changes to the List-Directed IPL routine so that the
failing device may be retried using the legacy boot pointers before moving on to
the next device.

Signed-off-by: Jared Rossi 
---
 pc-bios/s390-ccw/bootmap.h |  5 +
 pc-bios/s390-ccw/iplb.h| 24 ++
 pc-bios/s390-ccw/bootmap.c | 41 ++
 pc-bios/s390-ccw/main.c| 15 +-
 pc-bios/s390-ccw/netmain.c |  4 
 5 files changed, 71 insertions(+), 18 deletions(-)

diff --git a/pc-bios/s390-ccw/bootmap.h b/pc-bios/s390-ccw/bootmap.h
index d4690a88c2..d5061ed6c8 100644
--- a/pc-bios/s390-ccw/bootmap.h
+++ b/pc-bios/s390-ccw/bootmap.h
@@ -366,6 +366,11 @@ static inline void read_block(block_number_t blockno,
 IPL_assert(virtio_read(blockno, buffer) == 0, errmsg);
 }
 
+static inline bool read_block_nonfatal(block_number_t blockno, void *buffer)
+{
+return (virtio_read(blockno, buffer) == 0);
+}
+
 static inline bool block_size_ok(uint32_t block_size)
 {
 return block_size == virtio_get_block_size();
diff --git a/pc-bios/s390-ccw/iplb.h b/pc-bios/s390-ccw/iplb.h
index 16643f5879..3c29d23375 100644
--- a/pc-bios/s390-ccw/iplb.h
+++ b/pc-bios/s390-ccw/iplb.h
@@ -49,4 +49,28 @@ static inline bool set_iplb(IplParameterBlock *iplb)
 return manage_iplb(iplb, false);
 }
 
+/*
+ * The IPL started on the device, but failed in some way.  If the IPLB chain
+ * still has more devices left to try, use the next device in order. Set the
+ * next IPLB and save the current qipl parameters state.
+ */
+static inline bool load_next_iplb(void)
+{
+IplParameterBlock *next_iplb;
+
+if (qipl.num_iplbs < 1) {
+return false;
+}
+
+next_iplb = (IplParameterBlock *) qipl.next_iplb;
+memcpy(&iplb, next_iplb, sizeof(IplParameterBlock));
+set_iplb(&iplb);
+
+qipl.num_iplbs--;
+qipl.next_iplb = qipl.next_iplb + sizeof(IplParameterBlock);
+memcpy((QemuIplParameters *)QIPL_ADDRESS, &qipl, sizeof(QemuIplParameters));
+
+return true;
+}
+
 #endif /* IPLB_H */
diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c
index a2137449dc..69391557fa 100644
--- a/pc-bios/s390-ccw/bootmap.c
+++ b/pc-bios/s390-ccw/bootmap.c
@@ -144,7 +144,10 @@ static block_number_t load_eckd_segments(block_number_t 
blk, bool ldipl,
 bool more_data;
 
 memset(_bprs, FREE_SPACE_FILLER, sizeof(_bprs));
-read_block(blk, bprs, "BPRS read failed");
+if (!read_block_nonfatal(blk, bprs)) {
+IPL_assert(ldipl, "BPRS read failed");
+return -1;
+}
 
 do {
 more_data = false;
@@ -188,7 +191,10 @@ static block_number_t load_eckd_segments(block_number_t 
blk, bool ldipl,
  * I.e. the next ptr must point to the unused memory area
  */
 memset(_bprs, FREE_SPACE_FILLER, sizeof(_bprs));
-read_block(block_nr, bprs, "BPRS continuation read failed");
+if (!read_block_nonfatal(block_nr, bprs)) {
+IPL_assert(ldipl, "BPRS continuation read failed");
+break;
+}
 more_data = true;
 break;
 }
@@ -197,7 +203,10 @@ static block_number_t load_eckd_segments(block_number_t 
blk, bool ldipl,
  * to memory (address).
  */
 rc = virtio_read_many(block_nr, (void *)(*address), count + 1);
-IPL_assert(rc == 0, "code chunk read failed");
+if (rc != 0) {
+IPL_assert(ldipl, "code chunk read failed");
+break;
+}
 
 *address += (count + 1) * virtio_get_block_size();
 }
@@ -295,13 +304,22 @@ static void run_eckd_boot_script(block_number_t 
bmt_block_nr,
" maximum number of boot entries allowed");
 
 memset(sec, FREE_SPACE_FILLER, sizeof(sec));
-read_block(bmt_block_nr, sec, "Cannot read Boot Map Table");
+if (!read_block_nonfatal(bmt_block_nr, sec)) {
+IPL_assert(ldipl, "Cannot read Boot Map Table");
+return;
+}
 
 block_nr = gen_eckd_block_num(&bmt->entry[loadparm].xeckd, ldipl);
-IPL_assert(block_nr != -1, "Cannot find Boot Map Table Entry");
+if (block_nr == -1) {
+IPL_assert(ldipl, "Cannot find Boot Map Table Entry");
+return;
+}
 
 memset(sec, FREE_SPACE_FILLER, sizeof(sec));
-read_block(block_nr, sec, "Cannot read Boot Map Script");
+if (!read_block_nonfatal(block_nr, sec)) {
+IPL_assert(ldipl, "Cannot read Boot Map Script");
+return;
+}
 
 for (i = 0; bms->entry[i].type == BOOT_SCRIPT_LOAD ||
 bms->entry[i].type == BOOT_SCRIPT_SIGNATURE; i++) {
@@ -319,13 +337,10 @@ static void run_eckd_boot_script(block_number_t 
bmt_block_nr,
 } while (block_nr != -1);
 }
 
-if (ldipl && bms->entry[i].type != BOOT_SCRIPT_EXEC) {
-/*