git: 105b9df26ee0 - main - LinuxKPI: 802.11: close race lkpi_sta_scan_to_auth()/(*iv_update_bss)()

From: Bjoern A. Zeeb <bz_at_FreeBSD.org>
Date: Fri, 07 Jun 2024 21:41:07 UTC
The branch main has been updated by bz:

URL: https://cgit.FreeBSD.org/src/commit/?id=105b9df26ee0286f3a5a7d191075e068dee1c4a2

commit 105b9df26ee0286f3a5a7d191075e068dee1c4a2
Author:     Bjoern A. Zeeb <bz@FreeBSD.org>
AuthorDate: 2024-02-18 20:57:51 +0000
Commit:     Bjoern A. Zeeb <bz@FreeBSD.org>
CommitDate: 2024-06-07 21:40:32 +0000

    LinuxKPI: 802.11: close race lkpi_sta_scan_to_auth()/(*iv_update_bss)()
    
    We have to unlock the net80211 ic lock in order to be able to call
    sleepable downcalls to the driver/firmware; a 2nd thread may go through
    net80211::join1() and (*iv_update_bss)() after we checked and unlocked.
    Re-check status at the end of the function under the ic lock so that we
    do not accidentally set lvif_bss_synched to true again despite it no
    longer being true.
    
    This should fix a race where we lost the (*iv_update_bss)() state
    during startup where one SCAN->AUTH is followed by a (then) AUTH->AUTH
    and lkpi_sta_a_to_a() did the wrong thing.
    
    Once we re-consider net80211 state and allowing a second join
    on a different node or iv_bss update without previously tearing down
    the older node we can likely undo a lot of these extra checks and
    workarounds.
    
    Sponsored by:   The FreeBSD Foundation (updated version)
    Tested by:      emaste (on and off)
    MFC after:      3 days
    Reviewd by:     cc
    Differential Revision: https://reviews.freebsd.org/D43967
---
 sys/compat/linuxkpi/common/src/linux_80211.c | 66 +++++++++++++++++++---------
 1 file changed, 46 insertions(+), 20 deletions(-)

diff --git a/sys/compat/linuxkpi/common/src/linux_80211.c b/sys/compat/linuxkpi/common/src/linux_80211.c
index cd1897a08ccd..bf2050deed0d 100644
--- a/sys/compat/linuxkpi/common/src/linux_80211.c
+++ b/sys/compat/linuxkpi/common/src/linux_80211.c
@@ -1275,25 +1275,6 @@ lkpi_sta_scan_to_auth(struct ieee80211vap *vap, enum ieee80211_state nstate, int
 	lsta = ni->ni_drv_data;
 
 	LKPI_80211_LVIF_LOCK(lvif);
-	/* Re-check given (*iv_update_bss) could have happened. */
-	/* XXX-BZ KASSERT later? or deal as error? */
-	if (lvif->lvif_bss_synched || lvif->lvif_bss != NULL)
-		ic_printf(vap->iv_ic, "%s:%d: lvif %p vap %p iv_bss %p lvif_bss %p "
-		    "lvif_bss->ni %p synched %d, ni %p lsta %p\n", __func__, __LINE__,
-		    lvif, vap, vap->iv_bss, lvif->lvif_bss,
-		    (lvif->lvif_bss != NULL) ? lvif->lvif_bss->ni : NULL,
-		    lvif->lvif_bss_synched, ni, lsta);
-
-	/*
-	 * Reference the ni for this cache of lsta/ni on lvif->lvif_bss
-	 * essentially out lsta version of the iv_bss.
-	 * Do NOT use iv_bss here anymore as that may have diverged from our
-	 * function local ni already and would lead to inconsistencies.
-	 */
-	ieee80211_ref_node(ni);
-	lvif->lvif_bss = lsta;
-	lvif->lvif_bss_synched = true;
-
 	/* Insert the [l]sta into the list of known stations. */
 	TAILQ_INSERT_TAIL(&lvif->lsta_head, lsta, lsta_entry);
 	LKPI_80211_LVIF_UNLOCK(lvif);
@@ -1342,11 +1323,56 @@ lkpi_sta_scan_to_auth(struct ieee80211vap *vap, enum ieee80211_state nstate, int
 	 * (ideally we'd do that on a callback for something else ...)
 	 */
 
+	LKPI_80211_LHW_UNLOCK(lhw);
+	IEEE80211_LOCK(vap->iv_ic);
+
+	LKPI_80211_LVIF_LOCK(lvif);
+	/* Re-check given (*iv_update_bss) could have happened while we were unlocked. */
+	if (lvif->lvif_bss_synched || lvif->lvif_bss != NULL ||
+	    lsta->ni != vap->iv_bss)
+		ic_printf(vap->iv_ic, "%s:%d: lvif %p vap %p iv_bss %p lvif_bss %p "
+		    "lvif_bss->ni %p synched %d, ni %p lsta %p\n", __func__, __LINE__,
+		    lvif, vap, vap->iv_bss, lvif->lvif_bss,
+		    (lvif->lvif_bss != NULL) ? lvif->lvif_bss->ni : NULL,
+		    lvif->lvif_bss_synched, ni, lsta);
+
+	/*
+	 * Reference the "ni" for caching the lsta/ni in lvif->lvif_bss.
+	 * Given we cache lsta we use lsta->ni instead of ni here (even though
+	 * lsta->ni == ni) to be distinct from the rest of the code where we do
+	 * assume that ni == vap->iv_bss which it may or may not be.
+	 * So do NOT use iv_bss here anymore as that may have diverged from our
+	 * function local ni already while ic was unlocked and would lead to
+	 * inconsistencies.  Go and see if we lost a race and do not update
+	 * lvif_bss_synched in that case.
+	 */
+	ieee80211_ref_node(lsta->ni);
+	lvif->lvif_bss = lsta;
+	if (lsta->ni == vap->iv_bss) {
+		lvif->lvif_bss_synched = true;
+	} else {
+		/* Set to un-synched no matter what. */
+		lvif->lvif_bss_synched = false;
+		/*
+		 * We do not error as someone has to take us down.
+		 * If we are followed by a 2nd, new net80211::join1() going to
+		 * AUTH lkpi_sta_a_to_a() will error, lkpi_sta_auth_to_{scan,init}()
+		 * will take the lvif->lvif_bss node down eventually.
+		 * What happens with the vap->iv_bss node will entirely be up
+		 * to net80211 as we never used the node beyond alloc()/free()
+		 * and we do not hold an extra reference for that anymore given
+		 * ni : lsta == 1:1.
+		 */
+	}
+	LKPI_80211_LVIF_UNLOCK(lvif);
+	goto out_relocked;
+
 out:
 	LKPI_80211_LHW_UNLOCK(lhw);
 	IEEE80211_LOCK(vap->iv_ic);
+out_relocked:
 	/*
-	 * Release the reference that keop the ni stable locally
+	 * Release the reference that kept the ni stable locally
 	 * during the work of this function.
 	 */
 	if (ni != NULL)