osmith has uploaded this change for review. ( https://gerrit.osmocom.org/c/osmo-bsc/+/30252 )
Change subject: lchan: try to recover from ST_BORKEN after timeout
......................................................................

lchan: try to recover from ST_BORKEN after timeout

Change-Id: Ic4728b3efe843ea63e2a0b54b1ea8a925347484a
---
M include/osmocom/bsc/lchan.h
M include/osmocom/bsc/lchan_fsm.h
M src/osmo-bsc/lchan_fsm.c
M src/osmo-bsc/net_init.c
M tests/timer.vty
5 files changed, 110 insertions(+), 0 deletions(-)

  git pull ssh://gerrit.osmocom.org:29418/osmo-bsc refs/changes/52/30252/1

diff --git a/include/osmocom/bsc/lchan.h b/include/osmocom/bsc/lchan.h
index 4fcfa20..7f25acf 100644
--- a/include/osmocom/bsc/lchan.h
+++ b/include/osmocom/bsc/lchan.h
@@ -359,6 +359,8 @@
 	/* Timestamps and markers to track active state duration. */
 	struct timespec active_start;
 	struct timespec active_stored;
+	/* How many times we tried to recover from ST_BORKEN, gets reset to 0 on success */
+	uint8_t borken_recovery_attempts;
 };
 
 #define GSM_LCHAN_SI(lchan, i) (void *)((lchan)->si.buf[i][0])
diff --git a/include/osmocom/bsc/lchan_fsm.h b/include/osmocom/bsc/lchan_fsm.h
index cc231dc..f47c417 100644
--- a/include/osmocom/bsc/lchan_fsm.h
+++ b/include/osmocom/bsc/lchan_fsm.h
@@ -33,6 +33,8 @@
 	LCHAN_ST_WAIT_RF_RELEASE_ACK,
 	LCHAN_ST_WAIT_AFTER_ERROR,
 	LCHAN_ST_BORKEN,
+	LCHAN_ST_RECOVER_WAIT_ACTIV_ACK, /*< Attempt to recover from BORKEN: first try to activate the lchan */
+	LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK, /*< Attempt to recover from BORKEN: then try to release it */
 };
 
 enum lchan_fsm_event {
diff --git a/src/osmo-bsc/lchan_fsm.c b/src/osmo-bsc/lchan_fsm.c
index b1cef5d..f7ceec2 100644
--- a/src/osmo-bsc/lchan_fsm.c
+++ b/src/osmo-bsc/lchan_fsm.c
@@ -43,6 +43,8 @@
 #include <osmocom/bsc/bsc_stats.h>
 #include <osmocom/bsc/lchan.h>
 
+#define BORKEN_RECOVERY_ATTEMPTS_MAX 5
+
 static struct osmo_fsm lchan_fsm;
 
 struct gsm_lchan *lchan_fi_lchan(struct osmo_fsm_inst *fi)
@@ -291,6 +293,9 @@
 	[LCHAN_ST_WAIT_AFTER_ERROR] = { .T=-3111 },
 	[LCHAN_ST_WAIT_RR_CHAN_MODE_MODIFY_ACK] = { .T=-13 },
 	[LCHAN_ST_WAIT_RSL_CHAN_MODE_MODIFY_ACK] = { .T=-14 },
+	[LCHAN_ST_BORKEN] = { .T=-15 },
+	[LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] = { .T=-6 },
+	[LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] = { .T=3111 },
 };
 
 /* Transition to a state, using the T timer defined in lchan_fsm_timeouts.
@@ -337,6 +342,8 @@
 	[LCHAN_ST_BORKEN] = LCHAN_ST_BORKEN,
 	[LCHAN_ST_WAIT_RR_CHAN_MODE_MODIFY_ACK] = LCHAN_ST_WAIT_RF_RELEASE_ACK,
 	[LCHAN_ST_WAIT_RSL_CHAN_MODE_MODIFY_ACK] = LCHAN_ST_WAIT_RF_RELEASE_ACK,
+	[LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] = LCHAN_ST_BORKEN,
+	[LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] = LCHAN_ST_BORKEN,
 };
 
 #define lchan_fail(fmt, args...) lchan_fail_to(lchan_fsm_on_error[fi->state], fmt, ## args)
@@ -1494,6 +1501,10 @@
 	if (prev_state != LCHAN_ST_BORKEN)
 		osmo_stat_item_inc(osmo_stat_item_group_get_item(bts->bts_statg, BTS_STAT_LCHAN_BORKEN), 1);
 
+	if (lchan->borken_recovery_attempts == BORKEN_RECOVERY_ATTEMPTS_MAX)
+		LOG_LCHAN(lchan, LOGL_ERROR, "Reached BORKEN_RECOVERY_ATTEMPTS_MAX=%d, giving up\n",
+			  BORKEN_RECOVERY_ATTEMPTS_MAX);
+
 	/* The actual action besides all the beancounting above */
 	lchan_reset(lchan);
 	chan_counts_ts_update(lchan->ts);
@@ -1552,6 +1563,69 @@
 	}
 }
 
+static void lchan_fsm_recover_wait_activ_ack_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state)
+{
+	int rc;
+	struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+	lchan->borken_recovery_attempts++;
+	LOG_LCHAN(lchan, LOGL_NOTICE, "attempting to recover from BORKEN lchan (%d/%d)\n",
+		  lchan->borken_recovery_attempts, BORKEN_RECOVERY_ATTEMPTS_MAX);
+
+	rc = rsl_tx_chan_activ(lchan, RSL_ACT_INTRA_NORM_ASS, 0);
+	if (rc)
+		lchan_fail("Tx Chan Activ failed: %s (%d)", strerror(-rc), rc);
+}
+
+static void lchan_fsm_recover_wait_activ_ack(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+	struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+	switch (event) {
+
+	case LCHAN_EV_RSL_CHAN_ACTIV_ACK:
+		lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK);
+		break;
+
+	case LCHAN_EV_RSL_CHAN_ACTIV_NACK:
+		/* If an earlier lchan activ got through to the BTS, but the
+		 * ACK did not get back to the BSC, it may still be active on
+		 * the BTS side. Proceed to release it. */
+		LOG_LCHAN(lchan, LOGL_NOTICE, "received NACK for activation of BORKEN lchan, assuming still active\n");
+		lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK);
+		break;
+
+	default:
+		OSMO_ASSERT(false);
+	}
+}
+
+static void lchan_fsm_recover_wait_rf_release_ack_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state)
+{
+	int rc;
+	struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+	rc = rsl_tx_rf_chan_release(lchan);
+	if (rc)
+		lchan_fail("Tx RSL RF Channel Release failed: %s (%d)\n", strerror(-rc), rc);
+}
+
+static void lchan_fsm_recover_wait_rf_release_ack(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+	struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+	switch (event) {
+
+	case LCHAN_EV_RSL_RF_CHAN_REL_ACK:
+		LOG_LCHAN(lchan, LOGL_NOTICE, "successfully recovered BORKEN lchan\n");
+		lchan->borken_recovery_attempts = 0;
+		lchan_fsm_state_chg(LCHAN_ST_UNUSED);
+		break;
+
+	default:
+		OSMO_ASSERT(false);
+	}
+}
+
 #define S(x) (1 << (x))
 
 static const struct osmo_fsm_state lchan_fsm_states[] = {
@@ -1743,6 +1817,30 @@
 			| S(LCHAN_ST_WAIT_AFTER_ERROR)
 			,
 	},
+	[LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] {
+		.name = "RECOVER_WAIT_ACTIV_ACK",
+		.onenter = lchan_fsm_recover_wait_activ_ack_onenter,
+		.action = lchan_fsm_recover_wait_activ_ack,
+		.in_event_mask = 0
+			| S(LCHAN_EV_RSL_CHAN_ACTIV_ACK)
+			| S(LCHAN_EV_RSL_CHAN_ACTIV_NACK)
+			,
+		.out_state_mask = 0
+			| S(LCHAN_ST_BORKEN)
+			| S(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK)
+			,
+	},
+	[LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] {
+		.name = "RECOVER_WAIT_RF_RELEASE_ACK",
+		.onenter = lchan_fsm_recover_wait_rf_release_ack_onenter,
+		.action = lchan_fsm_recover_wait_rf_release_ack,
+		.in_event_mask = 0
+			| S(LCHAN_EV_RSL_RF_CHAN_REL_ACK)
+			,
+		.out_state_mask = 0
+			| S(LCHAN_ST_UNUSED)
+			,
+	},
 };
 
 static const struct value_string lchan_fsm_event_names[] = {
@@ -1814,6 +1912,11 @@
 		lchan_fsm_state_chg(LCHAN_ST_UNUSED);
 		return 0;
 
+	case LCHAN_ST_BORKEN:
+		if (lchan->borken_recovery_attempts < BORKEN_RECOVERY_ATTEMPTS_MAX)
+			lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_ACTIV_ACK);
+		return 0;
+
 	default:
 		lchan->release.in_error = true;
 		lchan->release.rsl_error_cause = RSL_ERR_INTERWORKING;
diff --git a/src/osmo-bsc/net_init.c b/src/osmo-bsc/net_init.c
index 8f2b26c..28ce175 100644
--- a/src/osmo-bsc/net_init.c
+++ b/src/osmo-bsc/net_init.c
@@ -58,6 +58,7 @@
 	{ .T=-12, .default_val=5, .desc="Timeout for obtaining TA after BSSLAP TA Request" },
 	{ .T=-13, .default_val=5, .desc="Timeout for RR Channel Mode Modify ACK (BSC <-> MS)" },
 	{ .T=-14, .default_val=5, .desc="Timeout for RSL Channel Mode Modify ACK (BSC <-> BTS)" },
+	{ .T=-15, .default_val=30, .desc="Wait time before trying to use a BORKEN timeslot again" },
 	{ .T = -16, .default_val = 1000, .unit = OSMO_TDEF_MS,
 		.desc = "Granularity for all_allocated:* rate counters: amount of milliseconds that one counter increment"
 			" represents. See also X17, X18" },
diff --git a/tests/timer.vty b/tests/timer.vty
index 04c9872..16baaf2 100644
--- a/tests/timer.vty
+++ b/tests/timer.vty
@@ -30,6 +30,7 @@
 net: X12 = 5 s Timeout for obtaining TA after BSSLAP TA Request (default: 5 s)
 net: X13 = 5 s Timeout for RR Channel Mode Modify ACK (BSC <-> MS) (default: 5 s)
 net: X14 = 5 s Timeout for RSL Channel Mode Modify ACK (BSC <-> BTS) (default: 5 s)
+net: X15 = 30 s Wait time before trying to use a BORKEN timeslot again (default: 30 s)
 net: X16 = 1000 ms Granularity for all_allocated:* rate counters: amount of milliseconds that one counter increment represents. See also X17, X18 (default: 1000 ms)
 net: X17 = 0 ms Rounding threshold for all_allocated:* rate counters: round up to the next counter increment after this many milliseconds. If set to half of X16 (or 0), employ the usual round() behavior: round up after half of a granularity period. If set to 1, behave like ceil(): already increment the counter immediately when all channels are allocated. If set >= X16, behave like floor(): only increment after a full X16 period of all channels being occupied. See also X16, X18 (default: 0 ms)
 net: X18 = 60000 ms Forget-sum period for all_allocated:* rate counters: after this amount of idle time, forget internally cumulated time remainders. Zero to always keep remainders. See also X16, X17. (default: 60000 ms)
@@ -84,6 +85,7 @@
 net: X12 = 5 s Timeout for obtaining TA after BSSLAP TA Request (default: 5 s)
 net: X13 = 5 s Timeout for RR Channel Mode Modify ACK (BSC <-> MS) (default: 5 s)
 net: X14 = 5 s Timeout for RSL Channel Mode Modify ACK (BSC <-> BTS) (default: 5 s)
+net: X15 = 30 s Wait time before trying to use a BORKEN timeslot again (default: 30 s)
 net: X16 = 1000 ms Granularity for all_allocated:* rate counters: amount of milliseconds that one counter increment represents. See also X17, X18 (default: 1000 ms)
 net: X17 = 0 ms Rounding threshold for all_allocated:* rate counters: round up to the next counter increment after this many milliseconds. If set to half of X16 (or 0), employ the usual round() behavior: round up after half of a granularity period. If set to 1, behave like ceil(): already increment the counter immediately when all channels are allocated. If set >= X16, behave like floor(): only increment after a full X16 period of all channels being occupied. See also X16, X18 (default: 0 ms)
 net: X18 = 60000 ms Forget-sum period for all_allocated:* rate counters: after this amount of idle time, forget internally cumulated time remainders. Zero to always keep remainders. See also X16, X17. (default: 60000 ms)

-- 
To view, visit https://gerrit.osmocom.org/c/osmo-bsc/+/30252
To unsubscribe, or for help writing mail filters, visit https://gerrit.osmocom.org/settings

Gerrit-Project: osmo-bsc
Gerrit-Branch: master
Gerrit-Change-Id: Ic4728b3efe843ea63e2a0b54b1ea8a925347484a
Gerrit-Change-Number: 30252
Gerrit-PatchSet: 1
Gerrit-Owner: osmith <osm...@sysmocom.de>
Gerrit-MessageType: newchange