git: f64dc2ab5be3 - main - tcp: TCP output method can request tcp_drop
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Sun, 26 Dec 2021 16:49:33 UTC
The branch main has been updated by glebius: URL: https://cgit.FreeBSD.org/src/commit/?id=f64dc2ab5be38e5366271ef85ea90d8cb1c7841a commit f64dc2ab5be38e5366271ef85ea90d8cb1c7841a Author: Gleb Smirnoff <glebius@FreeBSD.org> AuthorDate: 2021-12-26 16:48:19 +0000 Commit: Gleb Smirnoff <glebius@FreeBSD.org> CommitDate: 2021-12-26 16:48:19 +0000 tcp: TCP output method can request tcp_drop The advanced TCP stacks (bbr, rack) may decide to drop a TCP connection when they do output on it. The default stack never does this, thus the existing framework expects tcp_output() to always return a locked and valid tcpcb. Provide a KPI extension to satisfy the demands of advanced stacks. If the output method returns a negative error code, it means that the caller must call tcp_drop(). In tcp_var.h provide three inline methods to call tcp_output(): - tcp_output() is a drop-in replacement for the default stack, so that the default stack can continue using it internally without modifications. For advanced stacks it would perform tcp_drop() and unlock and report that with a negative error code. - tcp_output_unlock() handles the negative code and always converts it to positive and always unlocks. - tcp_output_nodrop() just calls the method and leaves the responsibility to drop on the caller. Sweep over the advanced stacks and use the new KPI instead of using the HPTS delayed drop queue for that. 
Reviewed by: rrs, tuexen Differential revision: https://reviews.freebsd.org/D33370 --- sys/dev/cxgbe/tom/t4_cpl_io.c | 6 +++ sys/netinet/tcp_hpts.c | 2 + sys/netinet/tcp_stacks/bbr.c | 52 ++++++++++----------- sys/netinet/tcp_stacks/rack.c | 49 +++++++++++--------- sys/netinet/tcp_subr.c | 19 ++++---- sys/netinet/tcp_timer.c | 18 ++++---- sys/netinet/tcp_usrreq.c | 47 ++++++++++--------- sys/netinet/tcp_var.h | 102 ++++++++++++++++++++++++++++++++++++++++-- sys/netinet/toecore.c | 3 +- 9 files changed, 210 insertions(+), 88 deletions(-) diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index ae22b3d5c252..56bb7910b5c1 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -2300,6 +2300,12 @@ sendanother: if (moretocome) tp->t_flags |= TF_MORETOCOME; error = tcp_output(tp); + if (error < 0) { + INP_UNLOCK_ASSERT(inp); + SOCK_IO_SEND_UNLOCK(so); + error = -error; + goto out; + } if (moretocome) tp->t_flags &= ~TF_MORETOCOME; } diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 745ebe9f3aaa..40ae9a5cc3ad 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -1550,6 +1550,8 @@ again: } inp->inp_hpts_calls = 1; error = tcp_output(tp); + if (error < 0) + goto skip_pacing; inp->inp_hpts_calls = 0; if (ninp && ninp->inp_ppcb) { /* diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index f79933d54f24..ada278d629d0 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -4580,8 +4580,7 @@ bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) } if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); - return (1); + return (-ETIMEDOUT); /* tcp_drop() */ } /* Did we somehow get into persists? 
*/ if (bbr->rc_in_persist) { @@ -4773,8 +4772,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) */ if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); - goto out; + return (-ETIMEDOUT); /* tcp_drop() */ } /* * Hack: if the peer is dead/unreachable, we do not time out if the @@ -4787,8 +4785,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { KMOD_TCPSTAT_INC(tcps_persistdrop); tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); - tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); - goto out; + return (-ETIMEDOUT); /* tcp_drop() */ } if ((sbavail(&bbr->rc_inp->inp_socket->so_snd) == 0) && tp->snd_una == tp->snd_max) { @@ -4804,8 +4801,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { KMOD_TCPSTAT_INC(tcps_persistdrop); tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); - tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); - goto out; + return (-ETIMEDOUT); /* tcp_drop() */ } t_template = tcpip_maketemplate(bbr->rc_inp); if (t_template) { @@ -4877,8 +4873,7 @@ bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) dropit: KMOD_TCPSTAT_INC(tcps_keepdrops); tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); - tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); - return (1); + return (-ETIMEDOUT); /* tcp_drop() */ } /* @@ -4998,10 +4993,8 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) * and retransmit one segment. 
*/ if (ctf_progress_timeout_check(tp, true)) { - retval = 1; bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); - goto out; + return (-ETIMEDOUT); /* tcp_drop() */ } bbr_remxt_tmr(tp); if ((bbr->r_ctl.rc_resend == NULL) || @@ -5017,11 +5010,11 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; KMOD_TCPSTAT_INC(tcps_timeoutdrop); - retval = 1; tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); - tcp_set_inp_to_drop(bbr->rc_inp, - (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); - goto out; + /* XXXGL: previously t_softerror was casted to uint16_t */ + MPASS(tp->t_softerror >= 0); + retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT; + return (retval); /* tcp_drop() */ } if (tp->t_state == TCPS_SYN_SENT) { /* @@ -5194,7 +5187,7 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) tp->snd_recover = tp->snd_max; tp->t_flags |= TF_ACKNOW; tp->t_rtttime = 0; -out: + return (retval); } @@ -11637,7 +11630,8 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, if (bbr->r_wanted_output != 0) { bbr->rc_output_starts_timer = 0; did_out = 1; - (void)tcp_output(tp); + if (tcp_output(tp) < 0) + return (1); } else bbr_start_hpts_timer(bbr, tp, cts, 6, 0, 0); } @@ -11676,7 +11670,8 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, /* We are late */ bbr->r_ctl.rc_last_delay_val = 0; BBR_STAT_INC(bbr_force_output); - (void)tcp_output(tp); + if (tcp_output(tp) < 0) + return (1); } } } @@ -12163,9 +12158,16 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) hpts_calling = inp->inp_hpts_calls; inp->inp_hpts_calls = 0; if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { - if (bbr_process_timers(tp, bbr, cts, hpts_calling)) { + int retval; + + retval = bbr_process_timers(tp, bbr, cts, hpts_calling); + if (retval != 0) { 
counter_u64_add(bbr_out_size[TCP_MSS_ACCT_ATIMER], 1); - return (0); + /* + * If timers want tcp_drop(), then pass error out, + * otherwise suppress it. + */ + return (retval < 0 ? retval : 0); } } bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; @@ -13234,10 +13236,9 @@ send: * is the only thing to do. */ BBR_STAT_INC(bbr_offset_drop); - tcp_set_inp_to_drop(inp, EFAULT); SOCKBUF_UNLOCK(sb); (void)m_free(m); - return (0); + return (-EFAULT); /* tcp_drop() */ } len = rsm->r_end - rsm->r_start; } @@ -13891,7 +13892,7 @@ nomore: bbr->oerror_cnt++; if (bbr_max_net_error_cnt && (bbr->oerror_cnt >= bbr_max_net_error_cnt)) { /* drop the session */ - tcp_set_inp_to_drop(inp, ENETDOWN); + return (-ENETDOWN); } switch (error) { case ENOBUFS: @@ -14238,6 +14239,7 @@ struct tcp_function_block __tcp_bbr = { .tfb_tcp_handoff_ok = bbr_handoff_ok, .tfb_tcp_mtu_chg = bbr_mtu_chg, .tfb_pru_options = bbr_pru_options, + .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, }; /* diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index cd198e11babe..204533b3b778 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -6319,8 +6319,7 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t } if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); - tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - return (1); + return (-ETIMEDOUT); /* tcp_drop() */ } /* * A TLP timer has expired. We have been idle for 2 rtts. 
So we now @@ -6538,9 +6537,8 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) if (ctf_progress_timeout_check(tp, false)) { tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); - tcp_set_inp_to_drop(inp, ETIMEDOUT); counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); - return (1); + return (-ETIMEDOUT); /* tcp_drop() */ } KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); /* @@ -6558,10 +6556,9 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) (ticks - tp->t_rcvtime >= tcp_maxpersistidle || TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) { KMOD_TCPSTAT_INC(tcps_persistdrop); - retval = 1; tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); - tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); + retval = -ETIMEDOUT; /* tcp_drop() */ goto out; } if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && @@ -6574,11 +6571,10 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) */ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { - retval = 1; KMOD_TCPSTAT_INC(tcps_persistdrop); tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); - tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); + retval = -ETIMEDOUT; /* tcp_drop() */ goto out; } t_template = tcpip_maketemplate(rack->rc_inp); @@ -6669,8 +6665,7 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) dropit: KMOD_TCPSTAT_INC(tcps_keepdrops); tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); - tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); - return (1); + return (-ETIMEDOUT); /* tcp_drop() */ } /* @@ -6874,8 +6869,7 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) if (ctf_progress_timeout_check(tp, 
false)) { tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); - tcp_set_inp_to_drop(inp, ETIMEDOUT); - return (1); + return (-ETIMEDOUT); /* tcp_drop() */ } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; rack->r_ctl.retran_during_recovery = 0; @@ -6944,10 +6938,10 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) drop_it: tp->t_rxtshift = TCP_MAXRXTSHIFT; KMOD_TCPSTAT_INC(tcps_timeoutdrop); - retval = 1; - tcp_set_inp_to_drop(rack->rc_inp, - (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); - goto out; + /* XXXGL: previously t_softerror was casted to uint16_t */ + MPASS(tp->t_softerror >= 0); + retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT; + goto out; /* tcp_drop() */ } if (tp->t_state == TCPS_SYN_SENT) { /* @@ -14164,7 +14158,12 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb ctf_calc_rwin(so, tp); if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { send_out_a_rst: - (void)tcp_output(tp); + if (tcp_output(tp) < 0) { +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif + return (1); + } did_out = 1; } rack_free_trim(rack); @@ -14649,8 +14648,9 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, if (nxt_pkt == 0) { if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { do_output_now: + if (tcp_output(tp) < 0) + return (1); did_out = 1; - (void)tcp_output(tp); } rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); rack_free_trim(rack); @@ -16877,12 +16877,20 @@ rack_output(struct tcpcb *tp) } /* Do the timers, which may override the pacer */ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { - if (rack_process_timers(tp, rack, cts, hpts_calling, &doing_tlp)) { + int retval; + + retval = rack_process_timers(tp, rack, cts, hpts_calling, + &doing_tlp); + if (retval != 0) { counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); #ifdef TCP_ACCOUNTING sched_unpin(); #endif - return (0); + /* 
+ * If timers want tcp_drop(), then pass error out, + * otherwise suppress it. + */ + return (retval < 0 ? retval : 0); } } if (rack->rc_in_persist) { @@ -20393,6 +20401,7 @@ static struct tcp_function_block __tcp_rack = { .tfb_tcp_mtu_chg = rack_mtu_change, .tfb_pru_options = rack_pru_options, .tfb_hwtls_change = rack_hw_tls_change, + .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, }; /* diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 15faca076001..dde8616dcec6 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -375,7 +375,7 @@ static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged); static int tcp_default_handoff_ok(struct tcpcb *tp); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); -static void tcp_mtudisc(struct inpcb *, int); +static struct inpcb *tcp_mtudisc(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); @@ -2398,7 +2398,8 @@ tcp_drop(struct tcpcb *tp, int errno) if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); - (void) tcp_output(tp); + /* Don't use tcp_output() here due to possible recursion. 
*/ + (void)tcp_output_nodrop(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); @@ -3026,7 +3027,7 @@ tcp_ctlinput_with_port(int cmd, struct sockaddr *sa, void *vip, uint16_t port) inc.inc_fibnum = inp->inp_inc.inc_fibnum; tcp_hc_updatemtu(&inc, mtu); - tcp_mtudisc(inp, mtu); + inp = tcp_mtudisc(inp, mtu); } } else inp = (*notify)(inp, @@ -3474,11 +3475,10 @@ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { - tcp_mtudisc(inp, -1); - return (inp); + return (tcp_mtudisc(inp, -1)); } -static void +static struct inpcb * tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; @@ -3487,7 +3487,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) - return; + return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); @@ -3517,7 +3517,10 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) */ tp->t_fb->tfb_tcp_mtu_chg(tp); } - tcp_output(tp); + if (tcp_output(tp) < 0) + return (NULL); + else + return (inp); } #ifdef INET diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index d41bd9465a08..a3cb16869dc9 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -292,8 +292,7 @@ tcp_timer_delack(void *xtp) tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); NET_EPOCH_ENTER(et); - (void) tcp_output(tp); - INP_WUNLOCK(inp); + (void) tcp_output_unlock(tp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } @@ -502,6 +501,7 @@ tcp_timer_persist(void *xtp) struct tcpcb *tp = xtp; struct inpcb *inp; struct epoch_tracker et; + int outrv; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; @@ -563,8 +563,7 @@ tcp_timer_persist(void *xtp) tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; NET_EPOCH_ENTER(et); - (void) tcp_output(tp); - NET_EPOCH_EXIT(et); + outrv = tcp_output_nodrop(tp); tp->t_flags &= ~TF_FORCEDATA; #ifdef TCPDEBUG @@ -572,7 +571,8 @@ tcp_timer_persist(void *xtp) tcp_trace(TA_USER, ostate, tp, 
NULL, NULL, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); - INP_WUNLOCK(inp); + (void) tcp_unlock_or_drop(tp, outrv); + NET_EPOCH_EXIT(et); out: CURVNET_RESTORE(); } @@ -582,7 +582,7 @@ tcp_timer_rexmt(void * xtp) { struct tcpcb *tp = xtp; CURVNET_SET(tp->t_vnet); - int rexmt; + int rexmt, outrv; struct inpcb *inp; struct epoch_tracker et; bool isipv6; @@ -843,15 +843,15 @@ tcp_timer_rexmt(void * xtp) cc_cong_signal(tp, NULL, CC_RTO); NET_EPOCH_ENTER(et); - (void) tcp_output(tp); - NET_EPOCH_EXIT(et); + outrv = tcp_output_nodrop(tp); #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); - INP_WUNLOCK(inp); + (void) tcp_unlock_or_drop(tp, outrv); + NET_EPOCH_EXIT(et); out: CURVNET_RESTORE(); } diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 39705c3a9b54..55071a46e470 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -594,6 +594,8 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); error = tcp_output(tp); + KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()", + tp->t_fb->tfb_tcp_block_name, tp)); out_in_epoch: NET_EPOCH_EXIT(et); out: @@ -720,6 +722,8 @@ out_in_epoch: #endif NET_EPOCH_EXIT(et); out: + KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()", + tp->t_fb->tfb_tcp_block_name, tp)); /* * If the implicit bind in the connect call fails, restore * the flags we modified. 
@@ -896,21 +900,20 @@ tcp_usr_shutdown(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); INP_WLOCK(inp); + tp = intotcpcb(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } - tp = intotcpcb(inp); TCPDEBUG1(); socantsendmore(so); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - error = tcp_output(tp); - + error = tcp_output_nodrop(tp); out: TCPDEBUG2(PRU_SHUTDOWN); TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); - INP_WUNLOCK(inp); + error = tcp_unlock_or_drop(tp, error); NET_EPOCH_EXIT(et); return (error); @@ -925,17 +928,18 @@ tcp_usr_rcvd(struct socket *so, int flags) struct epoch_tracker et; struct inpcb *inp; struct tcpcb *tp = NULL; - int error = 0; + int outrv = 0, error = 0; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); INP_WLOCK(inp); + NET_EPOCH_ENTER(et); + tp = intotcpcb(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } - tp = intotcpcb(inp); TCPDEBUG1(); /* * For passively-created TFO connections, don't attempt a window @@ -947,18 +951,17 @@ tcp_usr_rcvd(struct socket *so, int flags) if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) goto out; - NET_EPOCH_ENTER(et); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) tcp_offload_rcvd(tp); else #endif - tcp_output(tp); - NET_EPOCH_EXIT(et); + outrv = tcp_output_nodrop(tp); out: TCPDEBUG2(PRU_RCVD); TCP_PROBE2(debug__user, tp, PRU_RCVD); - INP_WUNLOCK(inp); + (void) tcp_unlock_or_drop(tp, outrv); + NET_EPOCH_EXIT(et); return (error); } @@ -999,6 +1002,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); + tp = intotcpcb(inp); vflagsav = inp->inp_vflag; incflagsav = inp->inp_inc.inc_flags; restoreflags = false; @@ -1018,7 +1022,6 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, m_freem(control); /* empty control, just free it */ 
control = NULL; } - tp = intotcpcb(inp); if ((flags & PRUS_OOB) != 0 && (error = tcp_pru_options_support(tp, PRUS_OOB)) != 0) goto out; @@ -1188,7 +1191,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, !(flags & PRUS_NOTREADY)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; - error = tcp_output(tp); + error = tcp_output_nodrop(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } @@ -1255,7 +1258,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, tp->snd_up = tp->snd_una + sbavail(&so->so_snd); if ((flags & PRUS_NOTREADY) == 0) { tp->t_flags |= TF_FORCEDATA; - error = tcp_output(tp); + error = tcp_output_nodrop(tp); tp->t_flags &= ~TF_FORCEDATA; } } @@ -1285,7 +1288,7 @@ out: ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); - INP_WUNLOCK(inp); + error = tcp_unlock_or_drop(tp, error); NET_EPOCH_EXIT(et); return (error); } @@ -1310,12 +1313,13 @@ tcp_usr_ready(struct socket *so, struct mbuf *m, int count) SOCKBUF_LOCK(&so->so_snd); error = sbready(&so->so_snd, m, count); SOCKBUF_UNLOCK(&so->so_snd); - if (error == 0) { - NET_EPOCH_ENTER(et); - error = tcp_output(tp); - NET_EPOCH_EXIT(et); + if (error) { + INP_WUNLOCK(inp); + return (error); } - INP_WUNLOCK(inp); + NET_EPOCH_ENTER(et); + error = tcp_output_unlock(tp); + NET_EPOCH_EXIT(et); return (error); } @@ -2238,7 +2242,7 @@ unlock_and_done: struct epoch_tracker et; NET_EPOCH_ENTER(et); - error = tcp_output(tp); + error = tcp_output_nodrop(tp); NET_EPOCH_EXIT(et); } } @@ -2767,7 +2771,8 @@ tcp_disconnect(struct tcpcb *tp) sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - tcp_output(tp); + /* Ignore stack's drop request, we already at it. 
*/ + (void)tcp_output_nodrop(tp); } } diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index a3e6e8a05107..7792e2162852 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -316,7 +316,8 @@ struct tcptemp { * function below. */ /* Flags for tcp functions */ -#define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ +#define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ +#define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */ /* * If defining the optional tcp_timers, in the @@ -386,12 +387,107 @@ struct tcp_function { TAILQ_HEAD(tcp_funchead, tcp_function); +struct tcpcb * tcp_drop(struct tcpcb *, int); + +#ifdef _NETINET_IN_PCB_H_ +/* + * tcp_output() + * Handles tcp_drop request from advanced stacks and reports that inpcb is + * gone with negative return code. + * Drop in replacement for the default stack. + */ static inline int tcp_output(struct tcpcb *tp) { + int rv; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + rv = tp->t_fb->tfb_tcp_output(tp); + if (rv < 0) { + KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, + ("TCP stack %s requested tcp_drop(%p)", + tp->t_fb->tfb_tcp_block_name, tp)); + tp = tcp_drop(tp, -rv); + if (tp) + INP_WUNLOCK(tp->t_inpcb); + } - return (tp->t_fb->tfb_tcp_output(tp)); + return (rv); } + +/* + * tcp_output_unlock() + * Always returns unlocked, handles drop request from advanced stacks. + * Always returns positive error code. + */ +static inline int +tcp_output_unlock(struct tcpcb *tp) +{ + int rv; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + rv = tp->t_fb->tfb_tcp_output(tp); + if (rv < 0) { + KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, + ("TCP stack %s requested tcp_drop(%p)", + tp->t_fb->tfb_tcp_block_name, tp)); + rv = -rv; + tp = tcp_drop(tp, rv); + if (tp) + INP_WUNLOCK(tp->t_inpcb); + } else + INP_WUNLOCK(tp->t_inpcb); + + return (rv); +} + +/* + * tcp_output_nodrop() + * Always returns locked. It is caller's responsibility to run tcp_drop()! 
+ * Useful in syscall implementations, when we want to perform some logging + * and/or tracing with tcpcb before calling tcp_drop(). To be used with + * tcp_unlock_or_drop() later. + * + * XXXGL: maybe don't allow stacks to return a drop request at certain + * TCP states? Why would it do in connect(2)? In recv(2)? + */ +static inline int +tcp_output_nodrop(struct tcpcb *tp) +{ + int rv; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + rv = tp->t_fb->tfb_tcp_output(tp); + KASSERT(rv >= 0 || tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, + ("TCP stack %s requested tcp_drop(%p)", + tp->t_fb->tfb_tcp_block_name, tp)); + return (rv); +} + +/* + * tcp_unlock_or_drop() + * Handle return code from tfb_tcp_output() after we have logged/traced, + * to be used with tcp_output_nodrop(). + */ +static inline int +tcp_unlock_or_drop(struct tcpcb *tp, int tcp_output_retval) +{ + + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (tcp_output_retval < 0) { + tcp_output_retval = -tcp_output_retval; + if (tcp_drop(tp, tcp_output_retval) != NULL) + INP_WUNLOCK(tp->t_inpcb); + } else + INP_WUNLOCK(tp->t_inpcb); + + return (tcp_output_retval); +} +#endif /* _NETINET_IN_PCB_H_ */ #endif /* _KERNEL */ /* @@ -979,8 +1075,6 @@ void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); void tcp_ctlinput_viaudp(int, struct sockaddr *, void *, void *); -struct tcpcb * - tcp_drop(struct tcpcb *, int); void tcp_drain(void); void tcp_init(void); void tcp_fini(void *); diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c index 17900a047559..676eca5462bd 100644 --- a/sys/netinet/toecore.c +++ b/sys/netinet/toecore.c @@ -532,7 +532,8 @@ toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err) KASSERT(!(tp->t_flags & TF_TOE), ("%s: tp %p still offloaded.", __func__, tp)); tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); - (void) tcp_output(tp); + if (tcp_output(tp) < 0) + INP_WLOCK(inp); /* re-acquire */ } else { tp = 
tcp_drop(tp, err); if (tp == NULL)