osmith has uploaded this change for review. ( 
https://gerrit.osmocom.org/c/osmo-bsc/+/30252 )


Change subject: lchan: try to recover from ST_BORKEN after timeout
......................................................................

lchan: try to recover from ST_BORKEN after timeout

Change-Id: Ic4728b3efe843ea63e2a0b54b1ea8a925347484a
---
M include/osmocom/bsc/lchan.h
M include/osmocom/bsc/lchan_fsm.h
M src/osmo-bsc/lchan_fsm.c
M src/osmo-bsc/net_init.c
M tests/timer.vty
5 files changed, 110 insertions(+), 0 deletions(-)



  git pull ssh://gerrit.osmocom.org:29418/osmo-bsc refs/changes/52/30252/1

diff --git a/include/osmocom/bsc/lchan.h b/include/osmocom/bsc/lchan.h
index 4fcfa20..7f25acf 100644
--- a/include/osmocom/bsc/lchan.h
+++ b/include/osmocom/bsc/lchan.h
@@ -359,6 +359,8 @@
        /* Timestamps and markers to track active state duration. */
        struct timespec active_start;
        struct timespec active_stored;
+       /* Number of attempts made so far to recover from LCHAN_ST_BORKEN; reset to 0 on successful recovery */
+       uint8_t borken_recovery_attempts;
 };

 #define GSM_LCHAN_SI(lchan, i) (void *)((lchan)->si.buf[i][0])
diff --git a/include/osmocom/bsc/lchan_fsm.h b/include/osmocom/bsc/lchan_fsm.h
index cc231dc..f47c417 100644
--- a/include/osmocom/bsc/lchan_fsm.h
+++ b/include/osmocom/bsc/lchan_fsm.h
@@ -33,6 +33,8 @@
        LCHAN_ST_WAIT_RF_RELEASE_ACK,
        LCHAN_ST_WAIT_AFTER_ERROR,
        LCHAN_ST_BORKEN,
+       LCHAN_ST_RECOVER_WAIT_ACTIV_ACK, /*!< Attempt to recover from BORKEN: first try to activate the lchan */
+       LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK, /*!< Attempt to recover from BORKEN: then try to release it */
 };

 enum lchan_fsm_event {
diff --git a/src/osmo-bsc/lchan_fsm.c b/src/osmo-bsc/lchan_fsm.c
index b1cef5d..f7ceec2 100644
--- a/src/osmo-bsc/lchan_fsm.c
+++ b/src/osmo-bsc/lchan_fsm.c
@@ -43,6 +43,8 @@
 #include <osmocom/bsc/bsc_stats.h>
 #include <osmocom/bsc/lchan.h>

+#define BORKEN_RECOVERY_ATTEMPTS_MAX 5
+
 static struct osmo_fsm lchan_fsm;

 struct gsm_lchan *lchan_fi_lchan(struct osmo_fsm_inst *fi)
@@ -291,6 +293,9 @@
        [LCHAN_ST_WAIT_AFTER_ERROR]     = { .T=-3111 },
        [LCHAN_ST_WAIT_RR_CHAN_MODE_MODIFY_ACK] = { .T=-13 },
        [LCHAN_ST_WAIT_RSL_CHAN_MODE_MODIFY_ACK]        = { .T=-14 },
+       [LCHAN_ST_BORKEN]               = { .T=-15 },
+       [LCHAN_ST_RECOVER_WAIT_ACTIV_ACK]       = { .T=-6 },
+       [LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK]  = { .T=3111 },
 };

 /* Transition to a state, using the T timer defined in lchan_fsm_timeouts.
@@ -337,6 +342,8 @@
        [LCHAN_ST_BORKEN]                       = LCHAN_ST_BORKEN,
        [LCHAN_ST_WAIT_RR_CHAN_MODE_MODIFY_ACK] = LCHAN_ST_WAIT_RF_RELEASE_ACK,
        [LCHAN_ST_WAIT_RSL_CHAN_MODE_MODIFY_ACK]        = 
LCHAN_ST_WAIT_RF_RELEASE_ACK,
+       [LCHAN_ST_RECOVER_WAIT_ACTIV_ACK]       = LCHAN_ST_BORKEN,
+       [LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK]  = LCHAN_ST_BORKEN,
 };

 #define lchan_fail(fmt, args...) lchan_fail_to(lchan_fsm_on_error[fi->state], 
fmt, ## args)
@@ -1494,6 +1501,10 @@
        if (prev_state != LCHAN_ST_BORKEN)
                
osmo_stat_item_inc(osmo_stat_item_group_get_item(bts->bts_statg, 
BTS_STAT_LCHAN_BORKEN), 1);

+       if (lchan->borken_recovery_attempts == BORKEN_RECOVERY_ATTEMPTS_MAX)
+               LOG_LCHAN(lchan, LOGL_ERROR, "Reached 
BORKEN_RECOVERY_ATTEMPTS_MAX=%d, giving up\n",
+                         BORKEN_RECOVERY_ATTEMPTS_MAX);
+
        /* The actual action besides all the beancounting above */
        lchan_reset(lchan);
        chan_counts_ts_update(lchan->ts);
@@ -1552,6 +1563,69 @@
        }
 }

+/* onenter callback of LCHAN_ST_RECOVER_WAIT_ACTIV_ACK.
+ * Counts and logs this recovery attempt, then calls rsl_tx_chan_activ() for the lchan.
+ * A non-zero return from that call is reported via lchan_fail(). */
+static void lchan_fsm_recover_wait_activ_ack_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state)
+{
+       struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+       int rc;
+
+       /* Count the attempt before logging, so the log shows "current/max". */
+       lchan->borken_recovery_attempts += 1;
+       LOG_LCHAN(lchan, LOGL_NOTICE, "attempting to recover from BORKEN lchan (%d/%d)\n",
+                 lchan->borken_recovery_attempts, BORKEN_RECOVERY_ATTEMPTS_MAX);
+
+       rc = rsl_tx_chan_activ(lchan, RSL_ACT_INTRA_NORM_ASS, 0);
+       if (rc == 0)
+               return;
+
+       lchan_fail("Tx Chan Activ failed: %s (%d)", strerror(-rc), rc);
+}
+
+/* Action callback of LCHAN_ST_RECOVER_WAIT_ACTIV_ACK.
+ * Both LCHAN_EV_RSL_CHAN_ACTIV_ACK and LCHAN_EV_RSL_CHAN_ACTIV_NACK move on to
+ * LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK; a NACK is additionally logged. Any other
+ * event trips OSMO_ASSERT. */
+static void lchan_fsm_recover_wait_activ_ack(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+       struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+       switch (event) {
+
+       case LCHAN_EV_RSL_CHAN_ACTIV_NACK:
+               /* A NACK is not treated as fatal here: an earlier activation may
+                * have reached the BTS while its ACK never made it back to the
+                * BSC, leaving the lchan active on the BTS side. Release it. */
+               LOG_LCHAN(lchan, LOGL_NOTICE, "received NACK for activation of BORKEN lchan, assuming still active\n");
+               /* fall through */
+       case LCHAN_EV_RSL_CHAN_ACTIV_ACK:
+               lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK);
+               break;
+
+       default:
+               OSMO_ASSERT(false);
+       }
+}
+
+/* onenter callback of LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK.
+ * Calls rsl_tx_rf_chan_release() for the lchan; a non-zero return is reported via
+ * lchan_fail(). */
+static void lchan_fsm_recover_wait_rf_release_ack_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state)
+{
+       int rc;
+       struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+       rc = rsl_tx_rf_chan_release(lchan);
+       if (rc) {
+               /* No trailing newline in the format: matches the lchan_fail() call
+                * used when Tx Chan Activ fails. */
+               lchan_fail("Tx RSL RF Channel Release failed: %s (%d)", strerror(-rc), rc);
+       }
+}
+
+/* Action callback of LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK.
+ * On LCHAN_EV_RSL_RF_CHAN_REL_ACK the recovery is logged as successful, the attempt
+ * counter is cleared and the lchan goes to LCHAN_ST_UNUSED. Any other event trips
+ * OSMO_ASSERT. */
+static void lchan_fsm_recover_wait_rf_release_ack(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+       struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+       if (event != LCHAN_EV_RSL_RF_CHAN_REL_ACK) {
+               OSMO_ASSERT(false);
+               return;
+       }
+
+       LOG_LCHAN(lchan, LOGL_NOTICE, "successfully recovered BORKEN lchan\n");
+       /* Start counting from zero again should this lchan ever become BORKEN anew. */
+       lchan->borken_recovery_attempts = 0;
+       lchan_fsm_state_chg(LCHAN_ST_UNUSED);
+}
+
 #define S(x)   (1 << (x))

 static const struct osmo_fsm_state lchan_fsm_states[] = {
@@ -1743,6 +1817,30 @@
                        | S(LCHAN_ST_WAIT_AFTER_ERROR)
                        ,
        },
+       /* BORKEN recovery, step 1: wait for the BTS to answer the Channel Activation.
+        * Both ACK and NACK are accepted. */
+       [LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] = {
+               .name = "RECOVER_WAIT_ACTIV_ACK",
+               .onenter = lchan_fsm_recover_wait_activ_ack_onenter,
+               .action = lchan_fsm_recover_wait_activ_ack,
+               .in_event_mask = 0
+                       | S(LCHAN_EV_RSL_CHAN_ACTIV_ACK)
+                       | S(LCHAN_EV_RSL_CHAN_ACTIV_NACK)
+                       ,
+               .out_state_mask = 0
+                       | S(LCHAN_ST_BORKEN)
+                       | S(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK)
+                       ,
+       },
+       /* BORKEN recovery, step 2: wait for the RF Channel Release ACK. */
+       [LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] = {
+               .name = "RECOVER_WAIT_RF_RELEASE_ACK",
+               .onenter = lchan_fsm_recover_wait_rf_release_ack_onenter,
+               .action = lchan_fsm_recover_wait_rf_release_ack,
+               .in_event_mask = 0
+                       | S(LCHAN_EV_RSL_RF_CHAN_REL_ACK)
+                       ,
+               .out_state_mask = 0
+                       | S(LCHAN_ST_UNUSED)
+                       /* lchan_fsm_on_error[] sends failures in this state to BORKEN,
+                        * so that transition has to be permitted here. */
+                       | S(LCHAN_ST_BORKEN)
+                       ,
+       },
 };

 static const struct value_string lchan_fsm_event_names[] = {
@@ -1814,6 +1912,11 @@
                lchan_fsm_state_chg(LCHAN_ST_UNUSED);
                return 0;

+       case LCHAN_ST_BORKEN:
+               if (lchan->borken_recovery_attempts < 
BORKEN_RECOVERY_ATTEMPTS_MAX)
+                       lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_ACTIV_ACK);
+               return 0;
+
        default:
                lchan->release.in_error = true;
                lchan->release.rsl_error_cause = RSL_ERR_INTERWORKING;
diff --git a/src/osmo-bsc/net_init.c b/src/osmo-bsc/net_init.c
index 8f2b26c..28ce175 100644
--- a/src/osmo-bsc/net_init.c
+++ b/src/osmo-bsc/net_init.c
@@ -58,6 +58,7 @@
        { .T=-12, .default_val=5, .desc="Timeout for obtaining TA after BSSLAP 
TA Request" },
        { .T=-13, .default_val=5, .desc="Timeout for RR Channel Mode Modify ACK 
(BSC <-> MS)" },
        { .T=-14, .default_val=5, .desc="Timeout for RSL Channel Mode Modify 
ACK (BSC <-> BTS)" },
+       { .T=-15, .default_val=30, .desc="Wait time before trying to use a 
BORKEN timeslot again" },
        { .T = -16, .default_val = 1000, .unit = OSMO_TDEF_MS,
                .desc = "Granularity for all_allocated:* rate counters: amount 
of milliseconds that one counter increment"
                        " represents. See also X17, X18" },
diff --git a/tests/timer.vty b/tests/timer.vty
index 04c9872..16baaf2 100644
--- a/tests/timer.vty
+++ b/tests/timer.vty
@@ -30,6 +30,7 @@
 net: X12 = 5 s Timeout for obtaining TA after BSSLAP TA Request (default: 5 s)
 net: X13 = 5 s Timeout for RR Channel Mode Modify ACK (BSC <-> MS) (default: 5 
s)
 net: X14 = 5 s Timeout for RSL Channel Mode Modify ACK (BSC <-> BTS) (default: 
5 s)
+net: X15 = 30 s        Wait time before trying to use a BORKEN timeslot again 
(default: 30 s)
 net: X16 = 1000 ms     Granularity for all_allocated:* rate counters: amount 
of milliseconds that one counter increment represents. See also X17, X18 
(default: 1000 ms)
 net: X17 = 0 ms        Rounding threshold for all_allocated:* rate counters: 
round up to the next counter increment after this many milliseconds. If set to 
half of X16 (or 0), employ the usual round() behavior: round up after half of a 
granularity period. If set to 1, behave like ceil(): already increment the 
counter immediately when all channels are allocated. If set >= X16, behave like 
floor(): only increment after a full X16 period of all channels being occupied. 
See also X16, X18 (default: 0 ms)
 net: X18 = 60000 ms    Forget-sum period for all_allocated:* rate counters: 
after this amount of idle time, forget internally cumulated time remainders. 
Zero to always keep remainders. See also X16, X17. (default: 60000 ms)
@@ -84,6 +85,7 @@
 net: X12 = 5 s Timeout for obtaining TA after BSSLAP TA Request (default: 5 s)
 net: X13 = 5 s Timeout for RR Channel Mode Modify ACK (BSC <-> MS) (default: 5 
s)
 net: X14 = 5 s Timeout for RSL Channel Mode Modify ACK (BSC <-> BTS) (default: 
5 s)
+net: X15 = 30 s        Wait time before trying to use a BORKEN timeslot again 
(default: 30 s)
 net: X16 = 1000 ms     Granularity for all_allocated:* rate counters: amount 
of milliseconds that one counter increment represents. See also X17, X18 
(default: 1000 ms)
 net: X17 = 0 ms        Rounding threshold for all_allocated:* rate counters: 
round up to the next counter increment after this many milliseconds. If set to 
half of X16 (or 0), employ the usual round() behavior: round up after half of a 
granularity period. If set to 1, behave like ceil(): already increment the 
counter immediately when all channels are allocated. If set >= X16, behave like 
floor(): only increment after a full X16 period of all channels being occupied. 
See also X16, X18 (default: 0 ms)
 net: X18 = 60000 ms    Forget-sum period for all_allocated:* rate counters: 
after this amount of idle time, forget internally cumulated time remainders. 
Zero to always keep remainders. See also X16, X17. (default: 60000 ms)

--
To view, visit https://gerrit.osmocom.org/c/osmo-bsc/+/30252
To unsubscribe, or for help writing mail filters, visit 
https://gerrit.osmocom.org/settings

Gerrit-Project: osmo-bsc
Gerrit-Branch: master
Gerrit-Change-Id: Ic4728b3efe843ea63e2a0b54b1ea8a925347484a
Gerrit-Change-Number: 30252
Gerrit-PatchSet: 1
Gerrit-Owner: osmith <osm...@sysmocom.de>
Gerrit-MessageType: newchange

Reply via email to