/*
 * This file implements the Chelsio CPL5 message processing.
 *
 * Copyright (C) 2003-2017 Chelsio Communications.  All rights reserved.
 *
 * Written by Dimitris Michailidis (dm@chelsio.com)
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the LICENSE file included in this
 * release for licensing terms and conditions.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "common.h"
#include "defs.h"
#include "tom.h"
#include "l2t.h"
#include "clip_tbl.h"
#include "cpl_io_state.h"
#include "t4_ddp.h"
#include "t4_tcb.h"
#include "t4_regs.h"
#include "cxgb4_ctl_defs.h"
#include "cxgb4_ofld.h"
#include "t4fw_interface.h"
#include "t4_ma_failover.h"
#include "trace.h"
#include "tom_compat.h"
#include "offload.h"
#ifdef WD_TOE
#include "ntuples.h"
#include "wd_qp.h"
#endif
#include "t4_tls.h"

#define DEBUG_WR 0

#define TCB_PAGE_PTR_NULL 0x1ffffU

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)

extern struct sk_ofld_proto t4_tcp_prot;
extern struct sk_ofld_proto t4_tcp_v6_prot;
extern struct request_sock_ops t4_rsk_ops;
extern struct request_sock_ops t4_rsk6_ops;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,18,0)
extern struct tcp_congestion_ops *tcp_reno_p;
#endif

extern int do_pass_open_rpl(struct tom_data *td, struct sk_buff *skb);

#ifdef WD_TOE
/*
 * External variables for WD-TOE.
 *
 * "wdtoe_dev_table" is where all the WD-TOE devices, i.e. stacks, are stored.
 *
 * "conn_tuple" is the table holding information on all actively opening
 * connections (those that have sent a SYN to a remote peer).  This table
 * helps the user-land WD-TOE library maintain the tid<->sockfd mapping.
 * Once a connection is established, its entry in this table is cleared.
 *
 * "passive_conn_tuple" is the table holding information on all passively
 * opening/listening server connections (SYN received from a remote peer).
 * Again, this table helps the user-land WD-TOE library maintain the
 * passive-side tid<->sockfd mapping.  Once a connection is established,
 * its entry in this table is cleared.
 *
 * "listen_table" is the table holding every listening server's local port
 * number.  When a SYN arrives from a remote peer to a local port, we look
 * the port up in this table to figure out which WD-TOE device, i.e. which
 * stack, the SYN corresponds to.
 */
extern struct wdtoe_device_table *wdtoe_dev_table;
extern struct conn_tuple *conn_tuple;
extern struct passive_tuple *passive_conn_tuple;
extern struct wdtoe_listen_device *listen_table;
#endif

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers.  This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t4_ulp_extra_len[] = {0, 4, 4, 8};
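/*
 * Illustrative use of the table above (a sketch, not code from this driver;
 * "tx_seq" and "ulp_submode" are hypothetical names): a submode enabling a
 * single 4-byte digest costs 4 extra bytes of TCP sequence space, and one
 * enabling both digests costs 8:
 *
 *	tx_seq += skb->len + t4_ulp_extra_len[ulp_submode & 3];
 */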
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct sk_buff *tcphdr_skb __read_mostly;

/*
 * Socket filter that drops everything.  This is assigned to offloaded sockets
 * in order to make sure that any packets belonging to an offloaded socket
 * which may find their way into the Host Stack are dropped.  See
 * init_cpl_io() for the initialization.
 */
static struct sk_filter *drop_all;

#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0)
static struct sock_filter drop_insns[] = {
	{(BPF_RET | BPF_K), 0, 0, 0},
};
#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,15,0)
static struct sock_filter drop_insns[] = {
	{BPF_S_RET_K, 0, 0, 0},
};
#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0)
static struct sock_filter_int drop_insnsi[] = {
	{(BPF_JMP|BPF_EXIT), 0, 0, 0, 0},
};
#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
static struct sock_filter_int drop_insnsi[] = {
	BPF_EXIT_INSN(),
};
#else /* >= 3.17 */
static struct bpf_prog *drop_bpf;
#if LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0)
static struct bpf_insn drop_insnsi[] = {
	BPF_EXIT_INSN(),
};
#else
static struct bpf_insn drop_insnsi[] = {
	BPF_ALU32_IMM(BPF_MOV, BPF_REG_A, 0x0),
	BPF_EXIT_INSN(),
};
#endif /* < 4.5.0 */
#endif /* >= 3.17 */

/*
 * TOE information returned through inet_diag for offloaded connections.
 */
struct t4_inet_diag_info {
	u32 toe_id;	/* determines how to interpret the rest of the fields */
	u32 tid;
	u8 wr_credits;
	u8 queue;
	u8 ulp_mode:4;
	u8 sched_class:4;
	u8 ddp_enabled;
	char dev_name[TOENAMSIZ];
};

/*
 * Similar to process_cpl_msg() but takes an extra socket reference around the
 * call to the handler.  Should be used if the handler may drop a socket
 * reference.
 */
static inline void process_cpl_msg_ref(void (*fn)(struct sock *,
						  struct sk_buff *),
				       struct sock *sk, struct sk_buff *skb)
{
	sock_hold(sk);
	process_cpl_msg(fn, sk, skb);
	sock_put(sk);
}

static inline int is_t4a(const struct toedev *dev)
{
	return dev->ttid == TOE_ID_CHELSIO_T4;
}

/*
 * Returns an sk_buff for a reply CPL message of size len.  If the input
 * sk_buff has no other users it is trimmed and reused, otherwise a new buffer
 * is allocated.  The input skb must be of size at least len.  Note that this
 * operation does not destroy the original skb data even if it decides to
 * reuse the buffer.
 */
static struct sk_buff *get_cpl_reply_skb(struct sk_buff *skb, size_t len,
					 gfp_t gfp)
{
	if (likely(!skb_is_nonlinear(skb) && !skb_cloned(skb))) {
		BUG_ON(skb->len < len);
		__skb_trim(skb, len);
		skb_get(skb);
	} else {
		skb = alloc_skb(len, gfp);
		if (skb)
			__skb_put(skb, len);
	}
	return skb;
}

/*
 * Like get_cpl_reply_skb() but the returned buffer starts out empty.
 */
static struct sk_buff *__get_cpl_reply_skb(struct sk_buff *skb, size_t len,
					   gfp_t gfp)
{
	if (likely(!skb_is_nonlinear(skb) && !skb_cloned(skb))) {
		__skb_trim(skb, 0);
		skb_get(skb);
	} else
		skb = alloc_skb(len, gfp);
	return skb;
}
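/*
 * A note on the drop_all filter above (general socket-filter semantics, not
 * specific to this driver): a socket filter's return value is the number of
 * bytes of the packet to keep.  Classic BPF's "RET K" with K = 0, and an
 * eBPF program that exits with R0 = 0, therefore both make the host stack
 * drop every packet delivered to the socket.
 */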
/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
inline void send_or_defer(struct sock *sk, struct tcp_sock *tp,
			  struct sk_buff *skb, int through_l2t)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);

	if (toedev_in_shutdown(cplios->toedev))
		return;

	if (unlikely(sk->sk_state == TCP_SYN_SENT)) {
		/* defer */
		__skb_queue_tail(&cplios->ooo_queue, skb);
	} else if (through_l2t) {
		/* send through L2T */
		cxgb4_l2t_send(cplios->egress_dev, skb, cplios->l2t_entry);
	} else {
		/* send directly */
		cxgb4_ofld_send(cplios->egress_dev, skb);
	}
}

/*
 * Populate a TID_RELEASE WR.  The skb must already be properly sized.
 */
static inline void mk_tid_release(struct sk_buff *skb, unsigned int chan,
				  unsigned int tid)
{
	struct cpl_tid_release *req;
	unsigned int len = roundup(sizeof(struct cpl_tid_release), 16);

	req = (struct cpl_tid_release *)__skb_put(skb, len);
	memset(req, 0, len);
	set_wr_txq(skb, CPL_PRIORITY_SETUP, chan);
	INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid);
}

/*
 * Insert a socket into the TID table and take an extra reference.
 */
inline int sk_insert_tid(struct tom_data *d, struct sock *sk,
			 unsigned int tid)
{
	int id;

	sock_hold(sk);
	cxgb4_insert_tid(d->tids, sk, tid, sk->sk_family);
	id = bh_insert_handle(d, sk, tid);
	return id;
}

static unsigned int select_mss(const struct cpl_io_state *cplios,
			       unsigned int pmtu, u16 peer_mss)
{
	struct sock *sk = cplios->sk;
	unsigned int idx;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tom_data *d = TOM_DATA(cplios->toedev);
	unsigned int iphdrsize;
	unsigned int tcpopthdrsize = 0;

	/*
	 * Compute the size of the IP + TCP headers.
	 */
#if defined(CONFIG_TCPV6_OFFLOAD)
	if (sk->sk_family == AF_INET6)
		iphdrsize = sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
	else
#endif
		iphdrsize = sizeof(struct iphdr) + sizeof(struct tcphdr);

	if (cplios->opt2 & F_TSTAMPS_EN)
		tcpopthdrsize += round_up(TCPOLEN_TIMESTAMP, 4);

	/*
	 * Compute the Maximum Segment Size based on all the constraints
	 * we've been given.
	 */
	tp->advmss = dst_metric_advmss(dst);
	if (USER_MSS(tp) && tp->advmss > USER_MSS(tp))
		tp->advmss = USER_MSS(tp);
	if (tp->advmss > pmtu - iphdrsize)
		tp->advmss = pmtu - iphdrsize;
	if (peer_mss && tp->advmss > peer_mss)
		tp->advmss = peer_mss;

	/*
	 * Now find a TP MTU Index which will give us an MSS not larger than
	 * our constrained size.  If we can get an MTU which will allow the
	 * MSS to be a multiple of 8 bytes we'll get better performance within
	 * the chip between TP and the memory controller.  Note that the
	 * Advertised MSS includes both Data and TCP Option Headers but our
	 * goal is to get the Data Portion of a TCP Segment to be a multiple
	 * of 8 bytes so it'll slot into the chip memory nicely ...
	 */
	tp->advmss = (cxgb4_best_aligned_mtu(d->mtus,
					     iphdrsize + tcpopthdrsize,
					     tp->advmss - tcpopthdrsize,
					     8, &idx) -
		      iphdrsize);
	inet_csk(sk)->icsk_pmtu_cookie = pmtu;
	return idx;
}
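/*
 * Worked example for the alignment goal in select_mss() above (illustrative
 * numbers only): with IPv4 and TCP timestamps the overhead is 40 bytes of
 * IP/TCP headers plus 12 bytes of TCP options, so an MTU of 1500 yields a
 * data payload of 1500 - 40 - 12 = 1448 bytes, a multiple of 8 (181 * 8),
 * which packs nicely into chip memory.
 */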
void t4_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int wnd = tp->rcv_wnd;

	wnd = max_t(unsigned int, wnd, tcp_full_space(sk));
	wnd = max_t(unsigned int, MIN_RCV_WND, wnd);
	if (wnd > MAX_RCV_WND)
		wnd = MAX_RCV_WND;

	/*
	 * Check if we need to grow the receive window in response to an
	 * increase in the socket's receive buffer size.  Some applications
	 * increase the buffer size dynamically and rely on the window to
	 * grow accordingly.
	 */
	if (wnd > tp->rcv_wnd) {
		tp->rcv_wup -= wnd - tp->rcv_wnd;
		tp->rcv_wnd = wnd;
		/* Mark the receive window as updated */
		cplios_reset_flag(CPL_IO_STATE(sk), CPLIOS_UPDATE_RCV_WND);
	}
}

/* Assumes DACK_CONFIG[ByteThreshold]=26856 and DACK_CONFIG[MSSThreshold]=4 */
unsigned int t4_select_delack(struct sock *sk)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct toedev *dev = cplios->toedev;
	unsigned int dack_mode;
	const unsigned int MSSThreshold = 4;
	const unsigned int ByteThreshold = 26856;

	dack_mode = TOM_TUNABLE(dev, delack);
	if (!dack_mode || !inet_csk(sk)->icsk_ack.pingpong)
		return 0;

	if ((dack_mode == 2) &&
	    (MSS_CLAMP(tp) > ByteThreshold/(MSSThreshold*2)))
		dack_mode = 3;

	if ((dack_mode == 3) &&
	    (tp->rcv_wnd < (ByteThreshold + MSS_CLAMP(tp))))
		dack_mode = 1;

	if ((dack_mode == 2) &&
	    (tp->rcv_wnd < ((MSSThreshold*2+1)*MSS_CLAMP(tp))))
		dack_mode = 1;

	if ((dev->ttid >= TOE_ID_CHELSIO_T4) && (cplios->delack_mode == 0) &&
	    (tp->rcv_wnd > 3 * MSS_CLAMP(tp)))
		dack_mode = 1;

	return dack_mode;
}

#if VALIDATE_TID
#define VALIDATE_SOCK(sk) \
	do { \
		if (unlikely(!(sk))) \
			return CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE; \
	} while (0)
#else
#define VALIDATE_SOCK(sk) do {} while (0)
#endif

/*
 * Called when we receive the last message from HW for a connection.  A
 * connection cannot transition to TCP_CLOSE prior to this event.
 * Resources related to the offload state of a connection (e.g., L2T entries)
 * must have been relinquished prior to calling this.
 */
void connection_done(struct sock *sk)
{
#if 0
	printk("connection_done: TID: %u, state: %d, dead %d, refs %d\n",
	       CPL_IO_STATE(sk)->tid, sk->sk_state,
	       sock_flag(sk, SOCK_DEAD), atomic_read(&sk->sk_refcnt));
//	dump_stack();
#endif
#ifdef T4_TRACE
	T4_TRACE1(TIDTB(sk),
		  "connection_done: GTS rpl pending %d, if pending wake",
		  cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING));
#endif
	if (sock_flag(sk, SOCK_DEAD))
		t4_purge_receive_queue(sk);
	sk_wakeup_sleepers(sk, 0);
	tcp_done(sk);
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static inline int select_rcv_wscale(int space, int wscale_ok,
				    int window_clamp)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;
	if (window_clamp && window_clamp < space)
		space = window_clamp;

	if (wscale_ok)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale)
			;
	return wscale;
}

/* Returns bits 2:7 of a socket's TOS field */
#define SK_TOS(sk) ((inet_sk(sk)->tos >> 2) & M_DSCP)
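/*
 * Worked example for select_rcv_wscale() above (illustrative only): with a
 * 1MB receive buffer, scaling permitted, and no window clamp,
 * select_rcv_wscale(1 << 20, 1, 0) returns 5, since (1 << 20) >> 5 = 32768
 * is the first shifted value that fits the 16-bit window field (<= 65535).
 */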
/*
 * The next two functions calculate the option 0 value for a socket.
 */
inline unsigned long long calc_opt0(struct sock *sk, int nagle)
{
	const struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	const struct tcp_sock *tp = tcp_sk(sk);

	/*
	 * If the nagle policy has not been modified by the user, determine
	 * nagle on/off based on application socket settings/kernel settings.
	 */
	if (likely(nagle == -1))
		nagle = ((tp->nonagle & TCP_NAGLE_OFF) == 0);

	return V_NAGLE(nagle) |
	       F_TCAM_BYPASS |
	       V_KEEP_ALIVE(sock_flag(sk, SOCK_KEEPOPEN) != 0) |
	       V_WND_SCALE(RCV_WSCALE(tp)) |
	       V_MSS_IDX(cplios->mtu_idx) |
	       V_DSCP(SK_TOS(sk)) |
	       V_ULP_MODE(cplios->ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, M_RCV_BUFSIZ));
}

unsigned int t4_calc_opt2(const struct cpl_io_state *cplios,
			  const struct offload_settings *s,
			  unsigned int iq_id)
{
	const struct sock *sk = cplios->sk;
	struct toedev *tdev = cplios->toedev;
	struct tom_data *td = TOM_DATA(tdev);
	struct cxgb4_lld_info *lldi = td->lldi;
	unsigned short chan = cxgb4_port_chan(tdev->lldev[cplios->port_id]);
	u32 opt2 = V_RX_CHANNEL(cplios->rx_c_chan) |
		   V_TX_QUEUE(lldi->tx_modq[chan]) |
		   F_RSS_QUEUE_VALID |
		   V_RSS_QUEUE(iq_id);

	/*
	 * Absent specified offload settings, the default is to enable RX
	 * Coalescing.  For T5 we also enable the ability to set the Initial
	 * Segment Sequence Number in the CPL_ACT_OPEN_REQ{,_V6}.ISS fields.
	 */
	if (is_t4(lldi->adapter_type))
		opt2 |= F_RX_COALESCE_VALID;
	else {
		opt2 |= F_T5_OPT_2_VALID;
		opt2 |= F_T5_ISS;
	}
	opt2 |= V_RX_COALESCE(M_RX_COALESCE);

	if (cplios->ulp_mode == ULP_MODE_TCPDDP) {
		opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
		if (cplios->port_speed > 50000)
			opt2 &= ~V_RX_COALESCE(M_RX_COALESCE);
	} else if (cplios->ulp_mode == ULP_MODE_TLS) {
		opt2 |= F_RX_FC_VALID;
		opt2 &= ~V_RX_COALESCE(M_RX_COALESCE);
	}

	if (tcp_win_scaling_enabled())
		opt2 |= F_WND_SCALE_EN;

	/*
	 * Other TCP options obey the sysctls in the absence of policies.
	 */
	if (s && s->tstamp >= 0)
		opt2 |= V_TSTAMPS_EN(s->tstamp);
	else if (tcp_timestamps_enabled())
		opt2 |= F_TSTAMPS_EN;

	if (s && s->sack >= 0)
		opt2 |= V_SACK_EN(s->sack);
	else if (tcp_sack_enabled())
		opt2 |= F_SACK_EN;

	if (unlikely(!s))
		return opt2;

	/*
	 * We have an offload settings specification, so it can specify
	 * behavior which is different from the default, e.g. turning off
	 * RX Coalescing.
	 */
	if (s->rx_coalesce >= 0) {
		opt2 &= ~V_RX_COALESCE(M_RX_COALESCE);
		opt2 |= V_RX_COALESCE(s->rx_coalesce ? M_RX_COALESCE : 0);
	}

	if (s->cong_algo >= 0) {
		if (is_t4(lldi->adapter_type))
			opt2 |= F_CONG_CNTRL_VALID;
		opt2 |= V_CONG_CNTRL(s->cong_algo);
	}

	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
		opt2 |= F_CCTRL_ECN;

	return opt2;
}
/*
 * This function is intended for allocations of small control messages.
 * Such messages go as immediate data and usually the packets are freed
 * immediately.  We maintain a cache of one small sk_buff and use it whenever
 * it is available (has a user count of 1).  Otherwise we get a fresh buffer.
 */
struct sk_buff *alloc_ctrl_skb(struct sk_buff *skb, int len)
{
	if (likely(skb && !skb_shared(skb) && !skb_cloned(skb))) {
		__skb_trim(skb, 0);
		atomic_set(&skb->users, 2);
#ifdef DEBUG
		if (skb_tailroom(skb) < len) {
			printk(KERN_WARNING
			       "Requested Length of sk_buff (%d) is larger "
			       "than pre-allocated sk_buff cache (%d).\n",
			       len, skb_tailroom(skb));
			BUG_ON(1);
		}
#endif
	} else if (likely(!in_atomic()))
		skb = alloc_skb_nofail(len);
	else
		skb = alloc_skb(len, GFP_ATOMIC);
	return skb;
}

static inline void free_wr_skb(struct sock *sk, struct sk_buff *skb)
{
#if defined(CONFIG_T4_ZCOPY_SENDMSG) || defined(CONFIG_T4_ZCOPY_SENDMSG_MODULE)
	if (skb->data[0] == FW_OFLD_TX_DATA_WR)
		t4_zcopy_cleanup_skb(sk, skb);
#endif
	kfree_skb(skb);
}

static void purge_wr_queue(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = dequeue_wr(sk)) != NULL)
		free_wr_skb(sk, skb);
}

/*
 * Returns true if an sk_buff carries urgent data.
 */
static inline int skb_urgent(struct sk_buff *skb)
{
	return (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_URG) != 0;
}

/*
 * Generic ARP failure handler that discards the buffer.
 */
static void arp_failure_discard(void *handle, struct sk_buff *skb)
{
	kfree_skb(skb);
}

/**
 * sgl_len - calculates the size of an SGL of the given capacity
 * @n: the number of SGL entries
 *
 * Calculates the number of flits needed for a scatter/gather list that
 * can hold the given number of entries.
 */
static inline unsigned int sgl_len(unsigned int n)
{
	/*
	 * A Direct Scatter Gather List uses 32-bit lengths and 64-bit PCI DMA
	 * addresses.  The DSGL Work Request starts off with a 32-bit DSGL
	 * ULPTX header, then Length0, then Address0, then, for 1 <= i <= N,
	 * repeated sequences of { Length[i], Length[i+1], Address[i],
	 * Address[i+1] } (this ensures that all addresses are on 64-bit
	 * boundaries).  If N is even, then Length[N+1] should be set to 0 and
	 * Address[N+1] is omitted.
	 *
	 * The following calculation incorporates all of the above.  It's
	 * somewhat hard to follow but, briefly: the "+2" accounts for the
	 * first two flits which include the DSGL header, Length0 and
	 * Address0; the "(3*(n-1))/2" covers the main body of list entries (3
	 * flits for every pair of the remaining N); and finally the
	 * "+((n-1)&1)" adds the one remaining flit needed if (n-1) is odd ...
	 */
	n--;
	return (3 * n) / 2 + (n & 1) + 2;
}

/*
 * is_ofld_imm - check whether a packet can be sent as immediate data
 * @skb: the packet
 *
 * Returns true if a packet can be sent as an offload WR with immediate
 * data.  FW_OFLD_TX_DATA_WR limits the payload to 255 bytes due to an
 * 8-bit field.  However, FW_ULPTX_WR commands have a 256-byte
 * immediate-data payload limit.
 */
static inline int is_ofld_imm(struct cpl_io_state *cplios,
			      const struct sk_buff *skb)
{
	int length = skb->len;

	if (is_ofld_sg_reqd(skb))
		return 0;

	if (likely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR)) {
		/* Check the TLS header length for immediate data */
		if ((cplios->ulp_mode == ULP_MODE_TLS) &&
		    is_tls_offload_skb(cplios->sk, skb))
			length += tls_wr_size(cplios->sk, skb, true);
		else
			length += sizeof(struct fw_ofld_tx_data_wr);

		return length <= MAX_IMM_OFLD_TX_DATA_WR_LEN;
	}
	return 1;
}
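/*
 * Worked example for sgl_len() above (illustrative only): a 3-entry list
 * needs (3 * 2) / 2 + (2 & 1) + 2 = 5 flits, while a 2-entry list needs
 * (3 * 1) / 2 + (1 & 1) + 2 = 4 flits, the "+1" covering the odd trailing
 * { Length, Address } pair.
 */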
/**
 * calc_tx_flits_ofld - calculate # of flits for an offload packet
 * @skb: the packet
 *
 * Returns the number of flits needed for the SG offload packet.
 */
static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb,
					      unsigned int immdlen)
{
	unsigned int flits, cnt;

	flits = immdlen / 8;	/* headers */
	cnt = skb_shinfo(skb)->nr_frags;
	if (skb_tail_pointer(skb) != skb_transport_header(skb))
		cnt++;
	return flits + sgl_len(cnt);
}

u8 tcp_state_to_flowc_state(u8 state)
{
	u8 ret = FW_FLOWC_MNEM_TCPSTATE_ESTABLISHED;

	switch (state) {
	case TCP_ESTABLISHED:
		ret = FW_FLOWC_MNEM_TCPSTATE_ESTABLISHED;
		break;
	case TCP_CLOSE_WAIT:
		ret = FW_FLOWC_MNEM_TCPSTATE_CLOSEWAIT;
		break;
	case TCP_FIN_WAIT1:
		ret = FW_FLOWC_MNEM_TCPSTATE_FINWAIT1;
		break;
	case TCP_CLOSING:
		ret = FW_FLOWC_MNEM_TCPSTATE_CLOSING;
		break;
	case TCP_LAST_ACK:
		ret = FW_FLOWC_MNEM_TCPSTATE_LASTACK;
		break;
	case TCP_FIN_WAIT2:
		ret = FW_FLOWC_MNEM_TCPSTATE_FINWAIT2;
		break;
	}

	return ret;
}

/**
 * flowc_wr_credits - return the number of credits needed for a FW_FLOWC_WR
 * @nparams: number of parameters
 * @flowclenp: return value pointer for the FW_FLOWC_WR length
 *
 * Return the number of 16-byte "credits" needed to send a Firmware FlowC
 * Work Request containing @nparams parameters.  If a non-NULL @flowclenp
 * is provided, the length of the FlowC Work Request will be returned via
 * that pointer.
 */
static int flowc_wr_credits(int nparams, int *flowclenp)
{
	int flowclen16, flowclen;

	flowclen = offsetof(struct fw_flowc_wr, mnemval[nparams]);
	flowclen16 = DIV_ROUND_UP(flowclen, 16);
	flowclen = flowclen16 * 16;

	if (flowclenp)
		*flowclenp = flowclen;
	return flowclen16;
}

/**
 * create_flowc_wr_skb - allocate and initialize a FlowC Work Request skb
 * @sk: the associated socket
 * @flowc: the FW_FLOWC_WR
 * @flowclen: the FLOWC_WR length (rounded up to 16 bytes)
 *
 * Allocates an skb and returns it filled in with the FlowC Work Request.
 * NULL is returned if an skb can't be allocated.
 */
struct sk_buff *create_flowc_wr_skb(struct sock *sk,
				    struct fw_flowc_wr *flowc,
				    int flowclen)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct sk_buff *skb;

	skb = alloc_skb(flowclen, GFP_ATOMIC);
	if (!skb)
		return NULL;

	memcpy(__skb_put(skb, flowclen), flowc, flowclen);
	set_queue(skb, (cplios->txq_idx << 1) | CPL_PRIORITY_DATA, sk);

	return skb;
}
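/*
 * Credit math example for flowc_wr_credits() above (a sketch that assumes
 * an 8-byte fw_flowc_wr header and 8-byte fw_flowc_mnemval entries, which
 * may differ between firmware interface versions): 9 parameters give
 * flowclen = 8 + 9 * 8 = 80 bytes, hence flowclen16 = DIV_ROUND_UP(80, 16)
 * = 5 credits, and the length rounded back up to 16-byte units stays 80.
 */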
/**
 * send_flowc_wr - send a FW_FLOWC_WR; return credits consumed
 * @sk: the associated socket
 * @flowc: the FW_FLOWC_WR
 * @flowclen: the FLOWC_WR length (rounded up to 16 bytes)
 *
 * Send the Firmware FlowC Work Request and return the number of
 * 16-byte credits consumed, a negative error number, or 0 if the
 * socket is in shutdown state.
 */
static int send_flowc_wr(struct sock *sk, struct fw_flowc_wr *flowc,
			 int flowclen)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	bool syn_sent = (sk->sk_state == TCP_SYN_SENT);
	int flowclen16 = flowclen/16;
	struct sk_buff *skb;

	if (toedev_in_shutdown(cplios->toedev))
		return 0;

	/*
	 * If the first data has already been sent on the connection, then
	 * we'll need to queue up an skb to carry the FlowC Work Request.
	 * As such, it won't use any credits [yet] on the connection.
	 */
	if (cplios_flag(sk, CPLIOS_TX_DATA_SENT)) {
		skb = create_flowc_wr_skb(sk, flowc, flowclen);
		if (!skb)
			return -ENOMEM;

		if (syn_sent)
			__skb_queue_tail(&cplios->ooo_queue, skb);
		else
			skb_entail(sk, skb,
				   ULPCB_FLAG_NO_HDR|ULPCB_FLAG_NO_APPEND);
		return 0;
	}

	if (!syn_sent) {
		int ret;

		ret = cxgb4_immdata_send(cplios->egress_dev,
					 cplios->txq_idx,
					 flowc, flowclen);
		if (!ret)
			return flowclen16;
	}
	skb = create_flowc_wr_skb(sk, flowc, flowclen);
	if (!skb)
		return -ENOMEM;
	send_or_defer(sk, tp, skb, 0);
	return flowclen16;
}

/**
 * send_tx_flowc_wr - send the first FW_FLOWC_WR
 * @sk: the associated socket
 * @compl: 0 => no completion; 1 => completion after processing
 * @snd_nxt: initial TCP Send Next Sequence Number
 * @rcv_nxt: initial TCP Receive Next Sequence Number
 *
 * Send the first Firmware FlowC Work Request just before the first
 * data is sent on an Offload Connection.  Returns the number of
 * 16-byte credits used by the FlowC Work Request, a negative error
 * number, or 0 if the socket is in shutdown.
 */
int send_tx_flowc_wr(struct sock *sk, int compl, u32 snd_nxt, u32 rcv_nxt)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct tls_ofld_info *tls_ofld = TLS_IO_STATE(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tom_data *d = TOM_DATA(cplios->toedev);
	int nparams, paramidx, flowclen16, flowclen;
	struct flowc_packed {
		struct fw_flowc_wr fc;
		struct fw_flowc_mnemval mnemval[FW_FLOWC_MNEM_MAX];
	} __packed sflowc;
	struct fw_flowc_wr *flowc;
#ifdef CONFIG_CXGB4_DCB
	u16 vlan, dcbprio;
#endif
	int iqid;
#ifdef WD_TOE
	struct wdtoe_device *wd_dev;
	int ret, dev_idx, tbl_idx;

	/* find the associated wd_dev by the tid */
	ret = wdtoe_find_dev_by_tid(wdtoe_dev_table, &dev_idx, &tbl_idx,
				    cplios->tid);
	if (ret == 0) {
		/*
		 * We found the tid in the WD-TOE device table, so we get
		 * "wd_dev" and figure out the iq_id for this WD-TOE
		 * connection.
		 */
		wd_dev = wdtoe_dev_table[dev_idx].wd_dev;
		iqid = wd_dev->rxq_list[cplios->port_id]->iq.cntxt_id;
	} else {
		/*
		 * The tid is not in the WD-TOE device table.  This probably
		 * means we have WD-TOE in place, but we want this connection
		 * to go through TOE instead.
		 */
		iqid = cplios->rss_qid;
	}
#else
	iqid = cplios->rss_qid;
#endif

	/*
	 * Initialize the FlowC Work Request.  We do the parameters first
	 * because there are a variable number of them.
	 */
	memset(&sflowc, 0, sizeof sflowc);
	flowc = &sflowc.fc;

#define FLOWC_PARAM(__m, __v) \
	do { \
		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
		flowc->mnemval[paramidx].val = htonl(__v); \
		paramidx++; \
	} while (0)

	paramidx = 0;

	FLOWC_PARAM(PFNVFN, d->pfvf);
	FLOWC_PARAM(CH, cplios->tx_c_chan);
	FLOWC_PARAM(PORT, cplios->tx_c_chan);
	FLOWC_PARAM(IQID, iqid);
	FLOWC_PARAM(SNDNXT, snd_nxt);
	FLOWC_PARAM(RCVNXT, rcv_nxt);
	FLOWC_PARAM(SNDBUF, cplios->sndbuf);
	FLOWC_PARAM(MSS, tp->mss_cache);
	FLOWC_PARAM(TCPSTATE, tcp_state_to_flowc_state(sk->sk_state));

#ifdef CONFIG_CXGB4_DCB
	if (!cxgb4_dcb_enabled(cplios->egress_dev))
		dcbprio = 0;
	else {
		vlan = cplios->l2t_entry->vlan;
		if (vlan == CPL_L2T_VLAN_NONE) {
			if (printk_ratelimit())
				printk(KERN_WARNING "Connection without VLAN "
				       "Tag on DCB Link\n");
			dcbprio = 0;
		} else
			dcbprio = (vlan & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
	}
	FLOWC_PARAM(DCBPRIO, dcbprio);
#endif

	/*
	 * If the connection's Scheduling Class has been set, pass that in.
	 */
	if (cplios->sched_cls != SCHED_CLS_NONE)
		FLOWC_PARAM(SCHEDCLASS, cplios->sched_cls);

	if (cplios->txplen_max)
		FLOWC_PARAM(TXDATAPLEN_MAX, cplios->txplen_max);

	if (SND_WSCALE(tp))
		FLOWC_PARAM(RCV_SCALE, SND_WSCALE(tp));

	if (cplios->ulp_mode == ULP_MODE_TLS)
		FLOWC_PARAM(ULP_MODE, cplios->ulp_mode);

	if (is_tls_offload(sk) && tls_ofld->fcplenmax) {
		FLOWC_PARAM(TXDATAPLEN_MAX, tls_ofld->fcplenmax);
	}

	nparams = paramidx;

#undef FLOWC_PARAM

	flowclen16 = flowc_wr_credits(nparams, &flowclen);
	flowc->op_to_nparams = htonl(V_FW_WR_OP(FW_FLOWC_WR) |
				     V_FW_WR_COMPL(compl) |
				     V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
				    V_FW_WR_FLOWID(cplios->tid));

	return send_flowc_wr(sk, flowc, flowclen);
}

/**
 * send_tx_schedclass_wr - send a FlowC TX Schedule Class Work Request
 * @sk: the associated socket
 * @compl: 0 => no completion; 1 => completion after processing
 *
 * Send a FlowC FW_FLOWC_MNEM_SCHEDCLASS Work Request for an Offloaded
 * Connection.  This is used to change the TX Scheduling Class bound to
 * the connection.  Returns the number of 16-byte credits consumed by the
 * FlowC Work Request, a negative error number, or 0 if the socket is in
 * shutdown.
 */
int send_tx_schedclass_wr(struct sock *sk, int compl)
{
	const struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct flowc_packed {
		struct fw_flowc_wr fc;
		struct fw_flowc_mnemval mnemval[1];
	} __packed sflowc;
	struct fw_flowc_wr *flowc;
	int nparams, flowclen16, flowclen;
	unsigned int fw_schedclass;

	/*
	 * Translate the TX Scheduling Class into its Firmware value.
	 */
	if (cplios->sched_cls == SCHED_CLS_NONE)
		fw_schedclass = 0xff;
	else
		fw_schedclass = cplios->sched_cls;

	/*
	 * Initialize the FlowC Work Request.
	 */
	memset(&sflowc, 0, sizeof sflowc);
	flowc = &sflowc.fc;
	nparams = 1;
	flowclen16 = flowc_wr_credits(nparams, &flowclen);
	flowc->op_to_nparams = htonl(V_FW_WR_OP(FW_FLOWC_WR) |
				     V_FW_WR_COMPL(compl) |
				     V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
				    V_FW_WR_FLOWID(cplios->tid));
	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
	flowc->mnemval[0].val = htonl(fw_schedclass);

	/*
	 * And send it to the Firmware.
	 */
	return send_flowc_wr(sk, flowc, flowclen);
}

static inline void make_tx_data_wr(struct sock *sk, struct sk_buff *skb,
				   unsigned int immdlen, int len,
				   u32 credits, u32 compl)
{
	const struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	const struct tom_data *d = TOM_DATA(cplios->toedev);
	const enum chip_type adapter_type = d->lldi->adapter_type;
	struct fw_ofld_tx_data_wr *req;
	unsigned int opcode = FW_OFLD_TX_DATA_WR;
	unsigned int wr_ulp_mode_force;

	if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_ISCSI_WR) {
		/* the fw_ofld_tx_data_wr struct is used for this as well */
		opcode = FW_ISCSI_TX_DATA_WR;
	}

	req = (struct fw_ofld_tx_data_wr *)__skb_push(skb, sizeof(*req));
	req->op_to_immdlen = htonl(V_WR_OP(opcode) |
				   V_FW_WR_COMPL(compl) |
				   V_FW_WR_IMMDLEN(immdlen));
	req->flowid_len16 = htonl(V_FW_WR_FLOWID(cplios->tid) |
				  V_FW_WR_LEN16(credits));

	/* for iSCSI, the mode & submode setting is per-packet */
	if (cplios->ulp_mode == ULP_MODE_ISCSI)
		wr_ulp_mode_force = V_TX_ULP_MODE(skb_ulp_mode(skb) >> 4) |
				    V_TX_ULP_SUBMODE(skb_ulp_mode(skb) & 0xf);
	else {
		wr_ulp_mode_force = V_TX_ULP_MODE(cplios->ulp_mode);
		if (is_ofld_sg_reqd(skb))
			wr_ulp_mode_force |=
				F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
				((tcp_sk(sk)->nonagle & TCP_NAGLE_OFF) ? 0 :
				 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE);
	}

	if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_ISCSI_FORCE)
		wr_ulp_mode_force |=
			(CHELSIO_CHIP_VERSION(adapter_type) <= CHELSIO_T5 ?
			 F_TX_FORCE : F_T6_TX_FORCE);
	req->lsodisable_to_flags =
		htonl(wr_ulp_mode_force |
		      V_TX_URG(skb_urgent(skb)) |
		      V_TX_SHOVE((!cplios_flag_nochk(cplios,
						     CPLIOS_TX_MORE_DATA)) &&
				 skb_queue_empty(&cplios->tx_queue)));
	req->plen = htonl(len);
}

/*
 * Prepends TX_DATA_WR to buffers requesting a header using ULPCB_FLAG_NEED_HDR
 * waiting on a socket's send queue and sends them on to the TOE.
 * Must be called with the socket lock held.  Returns the amount of send buffer
 * space that was freed as a result of sending queued data to the TOE.
 * Buffers with headers should set ULPCB_FLAG_COMPL to request a completion.
 */
int t4_push_frames(struct sock *sk, int req_completion)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	int total_size = 0;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	struct tom_data *d;
	int wr_size = sizeof(struct fw_ofld_tx_data_wr);

	if (unlikely(sk_in_state(sk, TCPF_SYN_SENT | TCPF_CLOSE)))
		return 0;

	/*
	 * We shouldn't really be called at all after an abort but check just
	 * in case.
	 */
	if (unlikely(cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN)))
		return 0;

	d = TOM_DATA(cplios->toedev);

	while (cplios->wr_credits && (skb = skb_peek(&cplios->tx_queue)) &&
	       !cplios_flag_nochk(cplios, CPLIOS_TX_WAIT_IDLE) &&
	       (!(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_HOLD) ||
		skb_queue_len(&cplios->tx_queue) > 1)) {
		unsigned int immdlen;
		int len;	/* length with ulp bytes inserted by h/w */
		int tls_len;	/* TLS data length before IV/key */
		unsigned int credits_needed, credit_len;
		unsigned int completion = 0;
		int flowclen16 = 0;
		int tls_imm = 0;

		immdlen = len = credit_len = tls_len = skb->len;
		if (!is_ofld_imm(cplios, skb)) {
			immdlen = skb_transport_offset(skb);
			if (is_tls_offload_skb(sk, skb))
				wr_size = tls_wr_size(sk, skb, false);
			credit_len = 8*calc_tx_flits_ofld(skb, immdlen);
		} else {
			if (is_tls_offload_skb(sk, skb)) {
				wr_size = tls_wr_size(sk, skb, false);
				tls_imm = 1;
			}
		}
		if (likely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR))
			credit_len += wr_size;
		credits_needed = DIV_ROUND_UP(credit_len, 16);

		/*
		 * Assumes that the initial credit count is large enough to
		 * cover the fw_flowc_wr plus the largest possible first
		 * payload.
		 */
		if (!cplios_flag_nochk(cplios, CPLIOS_TX_DATA_SENT)) {
			flowclen16 = send_tx_flowc_wr(sk, 1, tp->snd_nxt,
						      tp->rcv_nxt);
			if (flowclen16 <= 0)
				break;
			cplios->wr_credits -= flowclen16;
			cplios->wr_unacked += flowclen16;
			cplios->wr_nondata += flowclen16;
			cplios_set_flag(cplios, CPLIOS_TX_DATA_SENT);
		}

		if (cplios->wr_credits < credits_needed) {
			/* Revert the IV DSGL accounted for in the credits */
			if (is_tls_offload_skb(sk, skb) &&
			    !skb_ulp_tls_skb_iv(skb))
				skb_shinfo(skb)->nr_frags--;
			break;
		}

		__skb_unlink(skb, &cplios->tx_queue);
		set_queue(skb, (cplios->txq_idx << 1) | CPL_PRIORITY_DATA,
			  sk);
		if (is_tls_offload(sk))
			TLS_IO_STATE(sk)->tx_qid = (skb->queue_mapping >> 1);
		/* remember credits for TX data and non-data WRs until WR_ACK */
		skb->csum = credits_needed + cplios->wr_nondata;
		cplios->wr_credits -= credits_needed;
		cplios->wr_unacked += credits_needed;
		cplios->wr_nondata = 0;
		enqueue_wr(sk, skb);

		if (likely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR)) {
			len += ulp_extra_len(skb) + skb_ulp_len_adjust(skb);
			if ((req_completion &&
			     cplios->wr_unacked == credits_needed) ||
			    (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_COMPL) ||
			    cplios->wr_unacked >= cplios->wr_max_credits / 2) {
				completion = 1;
				cplios->wr_unacked = 0;
			}

			if (is_tls_offload_skb(sk, skb)) {
				make_tlstx_data_wr(sk, skb, tls_imm, tls_len,
						   credits_needed);
			} else {
				make_tx_data_wr(sk, skb, immdlen, len,
						credits_needed, completion);
			}
			tp->snd_nxt += len;
			tp->lsndtime = tcp_time_stamp;
			if (completion)
				ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_NEED_HDR;
		} else {
			struct cpl_close_con_req *req = cplhdr(skb);
			unsigned int cmd =
				G_CPL_OPCODE(ntohl(OPCODE_TID(req)));

			if (cmd == CPL_CLOSE_CON_REQ)
				cplios_set_flag(cplios,
						CPLIOS_CLOSE_CON_REQUESTED);

			if ((ULP_SKB_CB(skb)->flags & ULPCB_FLAG_COMPL) &&
			    (cplios->wr_unacked >=
			     cplios->wr_max_credits / 2)) {
				req->wr.wr_hi |= htonl(F_FW_WR_COMPL);
				cplios->wr_unacked = 0;
			}
		}

		total_size += skb->truesize;
		if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_BARRIER)
			cplios_set_flag(cplios, CPLIOS_TX_WAIT_IDLE);
		t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
		cxgb4_l2t_send(cplios->egress_dev, skb, cplios->l2t_entry);
	}

	sk->sk_wmem_queued -= total_size;
	return total_size;
}
EXPORT_SYMBOL(t4_push_frames);

#ifndef TCP_CONGESTION_CONTROL
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,18,0)
/* Placeholder ops for tcp_reno_p on kernels that don't export tcp_reno. */
static struct tcp_congestion_ops tcp_reno_ops = {
	.name	= "",
	.owner	= THIS_MODULE,
};
#else
struct tcp_congestion_ops tcp_init_congestion_ops = {
	.name	= "",
	.owner	= THIS_MODULE,
};
#endif
#endif

void free_atid(struct cpl_io_state *cplios, struct tom_data *td,
	       unsigned int atid)
{
	struct tid_info *tids = td->tids;

	conn_remove_handle(td, atid);
	cxgb4_free_atid(tids, atid);
	sock_put(cplios->sk);
	kref_put(&cplios->kref, t4_cplios_release);
}

/*
 * Release the resources held by an offload connection (TID, L2T entry, etc.)
 */
void t4_release_offload_resources(struct sock *sk)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct toedev *tdev = cplios->toedev;
	struct tid_info *tids;
	unsigned int tid = cplios->tid;
	struct tom_data *td;

	if (!tdev)
		return;

	td = TOM_DATA(tdev);
	tids = td->tids;

	cplios->rss_qid = cplios->txq_idx = 0;

	t4_release_ddp_resources(sk);

	kfree_skb(cplios->ctrl_skb_cache);
	cplios->ctrl_skb_cache = NULL;
	kfree_skb(cplios->txdata_skb_cache);
	cplios->txdata_skb_cache = NULL;

	if (cplios->wr_credits != cplios->wr_max_credits) {
		purge_wr_queue(sk);
		reset_wr_list(cplios);
	}

	if (cplios->l2t_entry) {
		cxgb4_l2t_release(cplios->l2t_entry);
		cplios->l2t_entry = NULL;
	}

	if (sk->sk_family != AF_INET)
		cxgb4_clip_release(cplios->egress_dev,
			(const u32 *)((&inet6_sk_saddr(sk))->s6_addr), 1);

	if (sk->sk_state == TCP_SYN_SENT) {	// we have an ATID
		free_atid(cplios, td, tid);
		__skb_queue_purge(&cplios->ooo_queue);
	} else {				// we have a TID
		cxgb4_remove_tid(tids, cplios->port_id, tid, sk->sk_family);
		bh_remove_handle(td, cplios->idr);
		sock_put(sk);
	}

#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,18,0)
	t4_set_ca_ops(sk, tcp_reno_p);
#else
	t4_set_ca_ops(sk, &tcp_init_congestion_ops);
#endif
}

/*
 * Returns whether a CPL message is not expected in the socket backlog of a
 * closed connection.  Most messages are illegal at that point except
 * ABORT_RPL_RSS and SET_TCB_RPL sent by DDP.
 */
static int bad_backlog_msg(unsigned int opcode)
{
	return opcode != CPL_ABORT_RPL_RSS && opcode != CPL_SET_TCB_RPL;
}
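/*
 * Credit accounting example for t4_push_frames() above (a sketch; the
 * 32-byte size assumed here for struct fw_ofld_tx_data_wr is illustrative
 * only): a 128-byte payload sent as immediate data consumes
 * DIV_ROUND_UP(128 + 32, 16) = 10 16-byte credits, which are remembered in
 * skb->csum until the corresponding WR_ACK returns them.
 */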
/*
 * Called for each sk_buff in a socket's receive backlog during
 * backlog processing.
 */
static int t4_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
#if VALIDATE_TID
	u8 opcode;
#endif

	/*
	 * NIC packets can sneak into the backlog once a socket is hashed and
	 * before the BPF drop filter is installed.  They'll have either IP or
	 * IPv6 protocol while TOE packets leave it at 0.  Look for them and
	 * drop them.
	 */
	if (skb->protocol) {
		kfree_skb(skb);
		return 0;
	}

#if VALIDATE_TID
	opcode = ((const struct rss_header *)cplhdr(skb))->opcode;
	if (unlikely(sk->sk_state == TCP_CLOSE && bad_backlog_msg(opcode))) {
		printk(KERN_ERR "unexpected CPL message with opcode %x for "
		       "closed TID %u\n", opcode, CPL_IO_STATE(sk)->tid);
		kfree_skb(skb);
		return 0;
	}
#endif
	BLOG_SKB_CB(skb)->backlog_rcv(sk, skb);
	return 0;
}

#ifdef CONFIG_TCP_OFFLOAD_MODULE
static void dummy_tcp_keepalive_timer(unsigned long data)
{
}
#endif

/*
 * Switch a socket to the offload protocol operations.  Note that the offload
 * operations do not contain the offload backlog handler, we install that
 * directly to the socket.
 */
static void install_offload_ops(struct cpl_io_state *cplios,
				struct sk_ofld_proto *oproto)
{
	struct sock *sk = cplios->sk;
	struct proto *ps;

#if defined(CONFIG_TCPV6_OFFLOAD)
	if (sk->sk_family == AF_INET) {
		sk_copy_ofldproto(oproto, &t4_tcp_prot);
		ps = &t4_tcp_prot.proto;
	} else {
		sk_copy_ofldproto(oproto, &t4_tcp_v6_prot);
		ps = &t4_tcp_v6_prot.proto;
	}
#else
	sk_copy_ofldproto(oproto, &t4_tcp_prot);
	ps = &t4_tcp_prot.proto;
#endif
	sk_ofld_proto_set_tomhandlers(oproto, ps);
	sk_ofld_proto_set_ptr(oproto, cplios);
	sk->sk_prot = &oproto->proto;
	sk->sk_backlog_rcv = t4_backlog_rcv;
	if (sk->sk_write_space == sk_stream_write_space_compat)
		sk->sk_write_space = t4_write_space;
	if (sk->sk_filter)
		sk_filter_uncharge_compat(sk, sk->sk_filter);
	sk->sk_filter = drop_all;
	sk_filter_charge_compat(sk, sk->sk_filter);
#ifdef CONFIG_TCP_OFFLOAD_MODULE
	sk->sk_timer.function = dummy_tcp_keepalive_timer;
#endif
	sock_set_flag(sk, SOCK_OFFLOADED);
}

#if DEBUG_WR
static void dump_wrs(struct sock *sk)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	u64 *d;
	struct sk_buff *p;

	printk("TID %u info:\n", cplios->tid);
	skb_queue_walk(&cplios->tx_queue, p) {
		d = cplhdr(p);
		printk("   len %u, frags %u, flags %x, data %llx\n",
		       p->len, skb_shinfo(p)->nr_frags, ULP_SKB_CB(p)->flags,
		       (unsigned long long)be64_to_cpu(*d));
	}
	printk("outstanding:\n");
	wr_queue_walk(sk, p) {
		d = cplhdr(p);
		printk("   len %u, frags %u, flags %x, data %llx,%llx,%llx\n",
		       p->len, skb_shinfo(p)->nr_frags, ULP_SKB_CB(p)->flags,
		       (unsigned long long)be64_to_cpu(*d),
		       (unsigned long long)be64_to_cpu(d[1]),
		       (unsigned long long)be64_to_cpu(d[2]));
	}
}

static int count_pending_wrs(const struct sock *sk)
{
	int n = 0;
	const struct sk_buff *p;

	wr_queue_walk(sk, p)
		n += p->csum;
	return n;
}

static void check_wr_invariants(const struct sock *sk)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	int pending = count_pending_wrs(sk);

	if (unlikely(cplios->wr_avail + pending != cplios->wr_max))
		printk(KERN_ERR "TID %u: credit imbalance: avail %u, "
		       "pending %u, total should be %u\n", cplios->tid,
		       cplios->wr_avail, pending, cplios->wr_max);
}
#endif

struct t4_tcp_congestion_ops {
	struct tcp_congestion_ops ops;
	int key;
};

#define T4_CONG_OPS(__s, __k) \
	{ { .name = __s, .owner = THIS_MODULE }, .key = CONG_ALG_##__k, }
#define CONG_ALG_NONE (-1)

static struct t4_tcp_congestion_ops t4_cong_ops[] = {
	T4_CONG_OPS("reno", RENO),
	T4_CONG_OPS("tahoe", TAHOE),
	T4_CONG_OPS("newreno", NEWRENO),
	T4_CONG_OPS("highspeed", HIGHSPEED),
	T4_CONG_OPS("none", NONE),
};
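/*
 * Illustrative helper (not part of the driver) showing how a congestion
 * algorithm name could be looked up in t4_cong_ops[] to obtain its hardware
 * key; the helper name is hypothetical:
 */
#if 0
static int t4_cong_key_by_name(const char *name)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(t4_cong_ops); i++)
		if (!strcmp(t4_cong_ops[i].ops.name, name))
			return t4_cong_ops[i].key;
	return CONG_ALG_NONE;
}
#endif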
#ifdef WD_TOE
/**
 * wdtoe_remove_conn_tuple - sets a conn_tuple in c as not in use (free)
 * @c: array of connection tuples
 * @atid: atid of the tuple we want to make available
 *
 * returns: index of the successfully freed tuple, 0 otherwise
 */
static int wdtoe_remove_conn_tuple(struct conn_tuple *c, unsigned atid)
{
	int i;

	for (i = 0; i < NWDTOECONN; i++) {
		if (c[i].in_use && c[i].atid == atid) {
			c[i].in_use = 0;
			return i;
		}
	}
	return 0;
}
#endif

#ifdef WD_TOE
/*
 * Same logic as wdtoe_remove_conn_tuple(), but works on the passive
 * connection table and marks one entry as free.
 */
static int wdtoe_remove_passive_conn_tuple(struct passive_tuple *c,
					   unsigned stid, unsigned int tid)
{
	int i;

	for (i = 0; i < NWDTOECONN; i++) {
		if (c[i].in_use && c[i].stid == stid && c[i].tid == tid) {
			c[i].in_use = 0;
			return i;
		}
	}
	return 0;
}
#endif

static void mk_act_open_req(struct sock *sk, struct sk_buff *skb,
			    unsigned int qid_atid, const struct l2t_entry *e,
			    const struct offload_settings *s)
{
	const struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct tom_data *td = TOM_DATA(cplios->toedev);
#ifdef WD_TOE
	int ret;
#endif
	struct cpl_act_open_req *req = NULL;
	struct cpl_t5_act_open_req *t5req = NULL;
	struct cpl_t6_act_open_req *t6req = NULL;

	switch (CHELSIO_CHIP_VERSION(td->lldi->adapter_type)) {
	case CHELSIO_T4:
		req = (struct cpl_act_open_req *)__skb_put(skb, sizeof(*req));
		INIT_TP_WR(req, 0);
		break;
	case CHELSIO_T5:
		t5req = (struct cpl_t5_act_open_req *)__skb_put(skb,
							sizeof(*t5req));
		INIT_TP_WR(t5req, 0);
		req = (struct cpl_act_open_req *)t5req;
		break;
	case CHELSIO_T6:
	default:
		t6req = (struct cpl_t6_act_open_req *)__skb_put(skb,
							sizeof(*t6req));
		INIT_TP_WR(t6req, 0);
		req = (struct cpl_act_open_req *)t6req;
		t5req = (struct cpl_t5_act_open_req *)t6req;
		break;
	}

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid));
	set_wr_txq(skb, CPL_PRIORITY_SETUP, cplios->port_id);
	req->local_port = inet_sk(sk)->inet_sport;
	req->peer_port = inet_sk(sk)->inet_dport;
	req->local_ip = inet_sk(sk)->inet_saddr;
	req->peer_ip = inet_sk(sk)->inet_daddr;
	req->opt0 = cpu_to_be64(calc_opt0(sk, s->nagle) |
				V_L2T_IDX(e->idx) |
				V_SMAC_SEL(cplios->smac_idx) |
				V_ULP_MODE(cplios->ulp_mode) |
				V_TX_CHAN(cplios->tx_c_chan));

	if (is_t4(td->lldi->adapter_type)) {
		req->params =
			cpu_to_be32(cxgb4_select_ntuple(cplios->egress_dev,
							e));
#ifdef WD_TOE
		if (is_wdtoe(sk)) {
			ret = wdtoe_act_open_req(sk, G_TID_TID(qid_atid),
						 req->local_port, s,
						 &req->opt2);
			if (ret == -1)
				goto t4_toe;
			return;
		}
t4_toe:
#endif
		req->opt2 = htonl(cplios->opt2);
	} else if (is_t5(td->lldi->adapter_type)) {
		t5req->rsvd = cpu_to_be32(secure_tcp_sequence_number_offload(
						inet_sk(sk)->inet_saddr,
						inet_sk(sk)->inet_daddr,
						inet_sk(sk)->inet_sport,
						inet_sk(sk)->inet_dport) |
					  (sizeof(uint64_t) - 1));
		t5req->params =
			cpu_to_be64(V_FILTER_TUPLE(cxgb4_select_ntuple(
						cplios->egress_dev, e)));
#ifdef WD_TOE
		if (is_wdtoe(sk)) {
			ret = wdtoe_act_open_req(sk, G_TID_TID(qid_atid),
						 t5req->local_port, s,
						 &t5req->opt2);
			if (ret == -1)
				goto t5_toe;
			return;
		}
t5_toe:
#endif
		t5req->opt2 = htonl(cplios->opt2);
	} else {
		t6req->rsvd = cpu_to_be32(secure_tcp_sequence_number_offload(
						inet_sk(sk)->inet_saddr,
						inet_sk(sk)->inet_daddr,
						inet_sk(sk)->inet_sport,
						inet_sk(sk)->inet_dport) |
					  (sizeof(uint64_t) - 1));
		t6req->params =
			cpu_to_be64(V_FILTER_TUPLE(cxgb4_select_ntuple(
						cplios->egress_dev, e)));
#ifdef WD_TOE
		if (is_wdtoe(sk)) {
			ret = wdtoe_act_open_req(sk, G_TID_TID(qid_atid),
						 t6req->local_port, s,
						 &t6req->opt2);
			if (ret == -1)
				goto t6_toe;
			return;
		}
t6_toe:
#endif
		t6req->opt2 = htonl(cplios->opt2);
		/* TODO */
		//t6req->opt3 = htonl(cplios->opt3);
	}
}
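/*
 * Layout note for the qid_atid argument used above and below (derived from
 * the callers in this file, e.g. act_open_retry_timer()): it packs the RSS
 * queue id above the ATID,
 *
 *	qid_atid = (rss_qid << 14) | atid;
 *
 * so the reply to the active open is steered to the right ingress queue.
 */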
static int mk_fw_act_open_req(struct sock *sk, unsigned int atid,
			      const struct l2t_entry *e)
{
	struct sk_buff *skb;
	struct fw_ofld_connection_wr *req;
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct tom_data *d = TOM_DATA(cplios->toedev);
	struct offload_settings s;
	struct offload_req oreq;
	u32 dack;

	offload_req_from_sk(&oreq, sk, OPEN_TYPE_ACTIVE);
	rcu_read_lock();
	s = *lookup_ofld_policy(cplios->toedev, &oreq,
				d->conf.cop_managed_offloading);
	rcu_read_unlock();

	if (ma_fail_mk_fw_act_open_req(sk, atid, e))
		return 0;

	dack = t4_select_delack(sk);

	skb = alloc_ctrl_skb(cplios->ctrl_skb_cache, sizeof(*req));
	if (!skb)
		return -ENOMEM;

	req = (struct fw_ofld_connection_wr *)__skb_put(skb, sizeof(*req));
	memset(req, 0, sizeof(*req));
	req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR));
	req->len16_pkd = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16)));
	req->le.filter = cpu_to_be32(cxgb4_select_ntuple(cplios->egress_dev,
							 e));
	req->le.lport = inet_sk(sk)->inet_sport;
	req->le.pport = inet_sk(sk)->inet_dport;
	req->le.u.ipv4.lip = inet_sk(sk)->inet_saddr;
	req->le.u.ipv4.pip = inet_sk(sk)->inet_daddr;
	req->tcb.t_state_to_astid =
		htonl(V_FW_OFLD_CONNECTION_WR_T_STATE(TCP_SYN_SENT) |
		      V_FW_OFLD_CONNECTION_WR_ASTID(atid));
	req->tcb.cplrxdataack_cplpassacceptrpl =
		htons(F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK);
	req->tcb.tx_max = jiffies;
	req->tcb.rcv_adv = htons(1);
	req->tcb.opt0 = cpu_to_be64(calc_opt0(sk, s.nagle) |
				    V_L2T_IDX(e->idx) |
				    V_SMAC_SEL(cplios->smac_idx) |
				    V_TX_CHAN(cplios->tx_c_chan));
	req->tcb.opt2 = htonl(cplios->opt2);
	set_wr_txq(skb, CPL_PRIORITY_CONTROL, cplios->port_id);
	t4_set_arp_err_handler(skb, NULL, NULL);
	cxgb4_l2t_send(cplios->egress_dev, skb, cplios->l2t_entry);
	return 0;
}

static int mk_fw_pass_open_req(struct tom_data *td, struct sk_buff *skb,
			       struct request_sock *oreq, u32 filter,
			       u16 window, struct l2t_entry *e,
			       struct cpl_io_state *cplios)
{
	struct sk_buff *req_skb;
	struct fw_ofld_connection_wr *req;
	struct cpl_pass_accept_req *cpl = cplhdr(skb);

	req_skb = alloc_skb(sizeof(struct fw_ofld_connection_wr), GFP_ATOMIC);
	if (!req_skb)
		return -ENOMEM;

	req = (struct fw_ofld_connection_wr *)__skb_put(req_skb,
							sizeof(*req));
	memset(req, 0, sizeof(*req));
	req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR) | F_FW_WR_COMPL);
	req->len16_pkd = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16)));
	req->le.version_cpl = htonl(F_FW_OFLD_CONNECTION_WR_CPL);
	req->le.filter = filter;
	req->le.lport = t4_get_req_lport(oreq);
	req->le.pport = inet_rsk(oreq)->ir_rmt_port;
	req->le.u.ipv4.lip = inet_rsk(oreq)->ir_loc_addr;
	req->le.u.ipv4.pip = inet_rsk(oreq)->ir_rmt_addr;
	req->tcb.rcv_nxt = htonl(tcp_rsk(oreq)->rcv_isn + 1);
	req->tcb.rcv_adv = htons(window);
	req->tcb.t_state_to_astid =
		htonl(V_FW_OFLD_CONNECTION_WR_T_STATE(TCP_SYN_RECV) |
		      V_FW_OFLD_CONNECTION_WR_RCV_SCALE(cpl->tcpopt.wsf) |
		      V_FW_OFLD_CONNECTION_WR_ASTID(
				G_PASS_OPEN_TID(ntohl(cpl->tos_stid))));
	cplios->port_id =
		((struct port_info *)netdev_priv(cplios->egress_dev))->port_id;
	cplios->rss_qid = td->lldi->rxq_ids[cplios->port_id *
					    td->lldi->nrxq / td->lldi->nchan];
	cplios->l2t_entry = e;

	/*
	 * We store the qid in opt2, which will be used by the firmware
	 * to send us the response to the work request.
	 */
	req->tcb.opt2 = htonl(V_RSS_QUEUE(cplios->rss_qid));

	/*
	 * We initialize the MSS index in the TCB to 0xF so that when the
	 * driver sends cpl_pass_accept_rpl the TCB picks up the correct
	 * value.  If this were 0, TP would ignore any value > 0 for the
	 * MSS index.
	 */
	req->tcb.opt0 = cpu_to_be64(V_MSS_IDX(0xF));
	req->cookie = cpu_to_be64((u64)(uintptr_t)skb);
	set_wr_txq(req_skb, CPL_PRIORITY_CONTROL, cplios->port_id);
	cxgb4_ofld_send(cplios->egress_dev, req_skb);
	return 0;
}

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
static void mk_act_open_req6(struct sock *sk, struct sk_buff *skb,
			     unsigned int qid_atid, const struct l2t_entry *e,
			     const struct offload_settings *s,
			     const struct in6_addr *sip,
			     const struct in6_addr *dip)
{
	const struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct tom_data *td = TOM_DATA(cplios->toedev);
	struct cpl_act_open_req6 *req = NULL;
	struct cpl_t5_act_open_req6 *t5req = NULL;
	struct cpl_t6_act_open_req6 *t6req = NULL;

	switch (CHELSIO_CHIP_VERSION(td->lldi->adapter_type)) {
	case CHELSIO_T4:
		req = (struct cpl_act_open_req6 *)__skb_put(skb,
							    sizeof(*req));
		INIT_TP_WR(req, 0);
		break;
	case CHELSIO_T5:
		t5req = (struct cpl_t5_act_open_req6 *)__skb_put(skb,
							sizeof(*t5req));
		INIT_TP_WR(t5req, 0);
		req = (struct cpl_act_open_req6 *)t5req;
		break;
	case CHELSIO_T6:
	default:
		t6req = (struct cpl_t6_act_open_req6 *)__skb_put(skb,
							sizeof(*t6req));
		INIT_TP_WR(t6req, 0);
		req = (struct cpl_act_open_req6 *)t6req;
		t5req = (struct cpl_t5_act_open_req6 *)t6req;
		break;
	}

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, qid_atid));
	set_wr_txq(skb, CPL_PRIORITY_SETUP, cplios->port_id);
	req->local_port = inet_sk(sk)->inet_sport;
	req->peer_port = inet_sk(sk)->inet_dport;
	req->local_ip_hi = *(__be64 *)(sip->s6_addr);
	req->local_ip_lo = *(__be64 *)(sip->s6_addr + 8);
	req->peer_ip_hi = *(__be64 *)(dip->s6_addr);
	req->peer_ip_lo = *(__be64 *)(dip->s6_addr + 8);
	req->opt0 = cpu_to_be64(calc_opt0(sk, s->nagle) |
				V_L2T_IDX(e->idx) |
				V_SMAC_SEL(cplios->smac_idx) |
				V_TX_CHAN(cplios->tx_c_chan));

	if (is_t4(td->lldi->adapter_type)) {
		req->params =
			cpu_to_be32(cxgb4_select_ntuple(cplios->egress_dev,
							e));
		req->opt2 = htonl(cplios->opt2);
	} else {
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		t5req->rsvd = cpu_to_be32(secure_tcpv6_sequence_number(
					inet6_sk_saddr(sk).s6_addr32,
					inet6_sk_daddr(sk).s6_addr32,
					inet_sk(sk)->inet_sport,
					inet_sk(sk)->inet_dport) |
				(sizeof(uint64_t) - 1));
#endif
		t5req->params =
			cpu_to_be64(V_FILTER_TUPLE(cxgb4_select_ntuple(
						cplios->egress_dev, e)));
		t5req->opt2 = htonl(cplios->opt2);
		/* TODO */
		//if (is_t6(td->lldi->adapter_type))
		//	t6req->opt3 = htonl(cplios->opt3);
	}
}
#endif

/*
 * Convert an ACT_OPEN_RPL status to a Linux errno.
 */
static int act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return ECONNREFUSED;
	case CPL_ERR_ARP_MISS:
		return EHOSTUNREACH;
	case CPL_ERR_CONN_TIMEDOUT:
		return ETIMEDOUT;
	case CPL_ERR_TCAM_FULL:
		return ENOMEM;
	case CPL_ERR_CONN_EXIST:
		return EADDRINUSE;
	default:
		return EIO;
	}
}

void act_open_req_arp_failure(void *handle, struct sk_buff *skb);

void t4_fail_act_open(struct sock *sk, int errno)
{
	sk->sk_err = errno;
	sk->sk_error_report(sk);
	t4_release_offload_resources(sk);
	connection_done(sk);
	T4_TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
}

static void act_open_retry_timer(unsigned long data)
{
	struct sk_buff *skb;
	struct sock *sk = (struct sock *)data;
	struct inet_connection_sock *icsk = inet_csk(sk);

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk))		/* try in a bit */
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 20);
	else {
		if (!sock_flag(sk, SOCK_OFFLOADED))
			goto out_err;
		/*
		 * No space is saved by using a HW-specific cpl_act_open_req
		 * here, so there is no need to check sk_family either.
		 */
		skb = alloc_skb(roundup(sizeof(struct cpl_t6_act_open_req6),
					16), GFP_ATOMIC);
		if (!skb)
			t4_fail_act_open(sk, ENOMEM);
		else {
			struct cpl_io_state *cplios = CPL_IO_STATE(sk);
			struct toedev *tdev = cplios->toedev;
			struct tom_data *d = TOM_DATA(tdev);
			unsigned int qid_atid = cplios->rss_qid << 14;
			struct offload_req oreq;
			struct offload_settings settings;

			offload_req_from_sk(&oreq, sk, OPEN_TYPE_ACTIVE);
			rcu_read_lock();
			settings = *lookup_ofld_policy(tdev, &oreq,
						d->conf.cop_managed_offloading);
			rcu_read_unlock();
			if (!settings.offload) {
				kfree_skb(skb);
				goto out_err;
			}

			qid_atid |= (unsigned int)cplios->tid;
			skb->sk = sk;
			t4_set_arp_err_handler(skb, NULL,
					       act_open_req_arp_failure);
			if (sk->sk_family == AF_INET)
				mk_act_open_req(sk, skb, qid_atid,
						cplios->l2t_entry, &settings);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
			else
				mk_act_open_req6(sk, skb, qid_atid,
						 cplios->l2t_entry, &settings,
						 &inet6_sk_rcv_saddr(sk),
						 &inet6_sk_daddr(sk));
#endif
			cxgb4_l2t_send(cplios->egress_dev, skb,
				       cplios->l2t_entry);
		}
	}
out_err:
	bh_unlock_sock(sk);
	sock_put(sk);
}

static void deferred_tnl_connect(struct toedev *tdev, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int err;

	kfree_skb(skb);
	lock_sock(sk);
	if (sk->sk_state == TCP_SYN_SENT) {
		if (CPL_IO_STATE(sk)) {
			t4_release_offload_resources(sk);
			t4_install_standard_ops(sk);
		}
		if (!tp->write_seq) {
			if (sk->sk_family == AF_INET)
				tp->write_seq =
					secure_tcp_sequence_number_offload(
						inet->inet_saddr,
						inet->inet_daddr,
						inet->inet_sport,
						inet->inet_dport);
#if defined(CONFIG_TCPV6_OFFLOAD)
			else
				tp->write_seq = secure_tcpv6_sequence_number(
						inet6_sk_saddr(sk).s6_addr32,
						inet6_sk_daddr(sk).s6_addr32,
						inet->inet_sport,
						inet->inet_dport);
#endif
		}
		inet->inet_id = tp->write_seq ^ jiffies;
		err = tcp_connect(sk);
		if (err)
			goto failure;
	}
	release_sock(sk);
	return;

failure:
	tcp_set_state(sk, TCP_CLOSE);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	sk->sk_err = err;
	sk->sk_error_report(sk);
	T4_TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
	release_sock(sk);
}

static void fixup_and_send_ofo(struct cpl_io_state *cplios, unsigned int tid);

/*
 * Returns whether an ABORT_REQ_RSS/ACT_OPEN_RPL message is a negative advice.
 */
static inline int is_neg_adv(unsigned int status)
{
	return status == CPL_ERR_RTX_NEG_ADVICE ||
	       status == CPL_ERR_KEEPALV_NEG_ADVICE ||
	       status == CPL_ERR_PERSIST_NEG_ADVICE;
}

/*
 * Handle active open replies.  Reply status is non-zero
 * except when ACT_OPEN_REQ has NON_OFFLOAD set.
 * Note that a miss in the CLIP region is reported as CPL_ERR_TCAM_PARITY.
 */
static void active_open_rpl(struct sock *sk, struct sk_buff *skb)
{
	struct cpl_act_open_rpl *rpl = cplhdr(skb);
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int status = G_AOPEN_STATUS(ntohl(rpl->atid_status));
	int err;

	if (is_neg_adv(status)) {
		struct cpl_io_state *cplios = CPL_IO_STATE(sk);
		struct tom_data *td = TOM_DATA(cplios->toedev);
		unsigned int tid = GET_TID(rpl);

		if (cplios_flag_nochk(cplios, CPLIOS_ABORT_RPL_PENDING)) {
			if (!lookup_tid(td->tids, tid))
				cplios->idr = sk_insert_tid(td, sk, tid);
		}
		cplios->neg_adv_tid = tid;
		fixup_and_send_ofo(cplios, tid);
		kfree_skb(skb);
		return;
	}

	if (status) {
		if (status == CPL_ERR_CONN_EXIST &&
		    icsk->icsk_retransmit_timer.function !=
						act_open_retry_timer) {
			icsk->icsk_retransmit_timer.function =
						act_open_retry_timer;
			sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
				       jiffies + HZ / 2);
		} else if (status == CPL_ERR_TCAM_PARITY) {
			struct cpl_io_state *cplios = CPL_IO_STATE(sk);

			skb->sk = sk;
			t4_defer_reply(skb, cplios->toedev,
				       deferred_tnl_connect);
			return;
		} else if (status == CPL_ERR_TCAM_FULL) {
			struct cpl_io_state *cplios = CPL_IO_STATE(sk);
			struct tom_data *d = TOM_DATA(cplios->toedev);

			if (sk->sk_family == AF_INET &&
			    d->lldi->enable_fw_ofld_conn) {
				err = mk_fw_act_open_req(sk,
					G_TID_TID(G_AOPEN_ATID(
						ntohl(rpl->atid_status))),
					cplios->l2t_entry);
				if (err < 0) {
					skb->sk = sk;
					t4_defer_reply(skb, cplios->toedev,
						       deferred_tnl_connect);
					return;
				}
			} else {
				skb->sk = sk;
				t4_defer_reply(skb, cplios->toedev,
					       deferred_tnl_connect);
				return;
			}
		} else {
			err = act_open_rpl_status_to_errno(status);
			if (err == EADDRINUSE) {
				struct cpl_io_state *cplios =
					CPL_IO_STATE(sk);
				struct toedev *tdev = cplios->toedev;
				unsigned short sport =
					ntohs(inet_sk(sk)->inet_sport);
				unsigned short dport =
					ntohs(inet_sk(sk)->inet_dport);

				printk(KERN_ERR
				       "ACTIVE_OPEN_RPL: 4-tuple in use: ");
				if (sk->sk_family == AF_INET)
					printk("%pi4, %u, %pi4, %u\n",
					       &inet_sk(sk)->inet_saddr,
					       sport,
					       &inet_sk(sk)->inet_daddr,
					       dport);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
				else
					printk("%pi6, %u, %pi6, %u\n",
					       &inet6_sk_rcv_saddr(sk), sport,
					       &inet6_sk_daddr(sk), dport);
#endif
				skb->sk = sk;
				t4_defer_reply(skb, tdev,
					       deferred_tnl_connect);
				return;
			}

			t4_fail_act_open(sk, err);
		}
	} else
		ma_fail_active_open_rpl(sk, skb);

	kfree_skb(skb);
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int do_act_open_rpl(struct tom_data *td, struct sk_buff *skb)
{
	struct cpl_act_open_rpl *rpl = cplhdr(skb);
	unsigned int atid = G_TID_TID(G_AOPEN_ATID(ntohl(rpl->atid_status)));
	unsigned int status = G_AOPEN_STATUS(ntohl(rpl->atid_status));
	struct cpl_io_state *cplios;
	struct sock *sk;

	cplios = (struct cpl_io_state *)lookup_atid(td->tids, atid);
	VALIDATE_SOCK(cplios);
	sk = cplios->sk;

	if (status && !is_neg_adv(status) && act_open_has_tid(status))
		cxgb4_remove_tid(td->tids, cplios->port_id, GET_TID(rpl),
				 sk->sk_family);

	process_cpl_msg_ref(active_open_rpl, sk, skb);
	return 0;
}

/*
 * Handle an ARP failure for an active open.  XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but
 * don't free the atid.  Hmm.
 */
void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	sock_hold(sk);
	bh_lock_sock(sk);
	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) {
		if (!sock_owned_by_user(sk)) {
			t4_fail_act_open(sk, EHOSTUNREACH);
			__kfree_skb(skb);
		} else {
			/*
			 * Smart solution: Synthesize an ACTIVE_OPEN_RPL in
			 * the existing sk_buff and queue it to the backlog.
			 * We are certain the sk_buff is not shared.  We also
			 * don't bother trimming the buffer.
			 */
			struct cpl_act_open_rpl *rpl = cplhdr(skb);

			rpl->ot.opcode = CPL_ACT_OPEN_RPL;
			rpl->atid_status = CPL_ERR_ARP_MISS;
			BLOG_SKB_CB(skb)->backlog_rcv = active_open_rpl;
			__sk_add_backlog(sk, skb);

			/*
			 * XXX Make sure a PASS_ACCEPT_RPL behind us doesn't
			 * destroy the socket.  Unfortunately we can't go into
			 * SYN_SENT because we don't have an atid.
			 * Needs more thought.
			 */
		}
	}
	bh_unlock_sock(sk);
	sock_put(sk);
}

/*
 * Determine the receive window size for a socket.
 */
static unsigned int select_rcv_wnd(struct cpl_io_state *cplios)
{
	struct sock *sk = cplios->sk;
	unsigned int wnd = tcp_full_space(sk);
	unsigned int max_rcv_wnd;

	if (cplios->ulp_mode == ULP_MODE_TCPDDP)
		wnd = sk->sk_rcvbuf;

	/*
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	max_rcv_wnd = MAX_RCV_WND;

	cplios_set_flag(cplios, CPLIOS_UPDATE_RCV_WND);
	return min(wnd, max_rcv_wnd);
}

#if defined(TCP_CONGESTION_CONTROL)
static void pivot_ca_ops(struct sock *sk, int cong)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_ops->release)
		icsk->icsk_ca_ops->release(sk);
	module_put(icsk->icsk_ca_ops->owner);
	icsk->icsk_ca_ops = &t4_cong_ops[cong < 0 ? 2 : cong].ops;
}
#endif

u32 tcp_default_init_rwnd(u32 mss)
{
	/*
	 * The initial receive window should be twice TCP_INIT_CWND to
	 * enable proper sending of new unsent data during fast recovery
	 * (RFC 3517, Section 4, NextSeg() rule (2)).  Further place a
	 * limit when the MSS is larger than 1460.
	 */
	u32 init_rwnd = TCP_INIT_CWND * 2;

	if (mss > 1460)
		init_rwnd = max((1460 * init_rwnd) / mss, 2U);
	return init_rwnd;
}

static void tcp_fixup_rcvbuf(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	u32 mss = dst_metric_advmss(dst);
	int rcvmem;

	rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
		 tcp_default_init_rwnd(mss);

	/*
	 * Dynamic Right Sizing (DRS) has 2 to 3 RTTs of latency.
	 * Allow enough cushion so that the sender is not limited by our
	 * window.
	 */
	if (sysctl_tcp_moderate_rcvbuf_p && *sysctl_tcp_moderate_rcvbuf_p)
		rcvmem <<= 2;

	if (sk->sk_rcvbuf < rcvmem)
		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
}

#define CTRL_SKB_LEN 304
#define TXDATA_SKB_LEN 128
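/*
 * Worked example for tcp_default_init_rwnd() above (illustrative only):
 * with TCP_INIT_CWND = 10 the base window is 20 segments; for a 9000-byte
 * MSS it is scaled down to max((1460 * 20) / 9000, 2U) = 3 segments, so
 * the initial window in bytes stays close to the 1460-byte-MSS default.
 */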
*/ static void init_offload_sk(struct cpl_io_state *cplios, struct toedev *dev, unsigned int tid, struct l2t_entry *e, struct dst_entry *dst, struct net_device *egress_dev, const struct offload_settings *s, u16 peer_mss) { struct sock *sk = cplios->sk; struct tcp_sock *tp = tcp_sk(sk); struct tom_data *td = TOM_DATA(dev); struct cxgb4_lld_info *lldi = td->lldi; struct adapter *adap = netdev2adap(cplios->egress_dev); int rxq_perchan, rxq_idx; cplios->toedev = dev; cplios->tid = tid; cplios->l2t_entry = e; cplios->wr_max_credits = cplios->wr_credits = min_t(unsigned int, td->max_wr_credits, TOM_TUNABLE(dev, max_wr_credits)); cplios->wr_unacked = 0; cplios->wr_nondata = 0; cplios->delack_mode = 0; cplios->ulp_mode = ((TOM_TUNABLE(dev, ddp) && !sock_flag(sk, SOCK_NO_DDP) && s->ddp) ? ULP_MODE_TCPDDP : ULP_MODE_NONE); if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) tcp_fixup_rcvbuf(sk); tp->rcv_wnd = select_rcv_wnd(cplios); /* depends in ulp_mode */ if (adap && (adap->params.ulp_crypto & ULP_CRYPTO_INLINE_TLS) && (is_tls_sock(sk, dev) || s->tls)) tls_offload_init(cplios); if (cplios->ulp_mode != ULP_MODE_TLS) cplios->lro = TOM_TUNABLE(dev, lro); cplios->lro_skb = NULL; cplios->sched_cls = (s->sched_class >= 0 && s->sched_class < lldi->nsched_cls ? s->sched_class : SCHED_CLS_NONE); /* * Save the socket send buffer size parameter for sending it to firmware for * allocating TX pages. */ cplios->sndbuf = sk->sk_sndbuf; if (netdev_is_offload(egress_dev)) { cplios->port_id = ((struct port_info *)netdev_priv(egress_dev))->port_id; cplios->port_speed = ((struct port_info *)netdev_priv(egress_dev))->link_cfg.speed; } /* * Note that select_mss() depends on cplios->opt2 being setup. Thus * the follwoing three lines need to be executed in exactly the order * below: 1. rss_qid, 2. opt2, 3. mtu_idx. */ rxq_perchan = ((cplios->tls_ofld.tls_offload ? td->lldi->ntlsrxq : td->lldi->nrxq) / td->lldi->nchan); rxq_idx = cplios->port_id*rxq_perchan; if (s->rssq >= 0 || s->rssq == QUEUE_CPU) { unsigned int id; if (s->rssq >= 0) id = s->rssq; else id = smp_processor_id(); rxq_idx += id % rxq_perchan; } else if (rxq_perchan > 1 && s->rssq == QUEUE_RANDOM) { rxq_idx += td->round_robin_cnt++ % rxq_perchan; } if (cplios->tls_ofld.tls_offload) cplios->rss_qid = td->lldi->tlsrxq_ids[rxq_idx]; else cplios->rss_qid = td->lldi->rxq_ids[rxq_idx]; cplios->txq_idx = (rxq_idx < td->lldi->ntxq) ? rxq_idx : cplios->port_id*td->lldi->ntxq/td->lldi->nchan; cplios->opt2 = t4_calc_opt2(cplios, s, cplios->rss_qid); cplios->mtu_idx = select_mss(cplios, dst_mtu(dst), peer_mss); cplios->ctrl_skb_cache = __alloc_skb(CTRL_SKB_LEN, gfp_any(), 0, td->lldi->nodeid); cplios->neg_adv_tid = INVALID_TID; cplios->passive_reap_next = NULL; /* mss configured in multiples of 1K */ cplios->mss = min_not_zero(s->mss * 1024, TOM_TUNABLE(cplios->toedev, mss)); skb_queue_head_init(&cplios->tx_queue); skb_queue_head_init(&cplios->ooo_queue); reset_wr_list(cplios); if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); inet_csk(sk)->icsk_ack.pingpong = 1; /* TCP_QUICKACK disabled */ /* * Set sk_sndbuf so that t4_write_space and sk_stream_write_space * calculate available socket space the same way. This allows us to * keep the original ->sk_write_space callback in cases of kernel * sockets that provide their own version and expect * sk_stream_write_space's method to be working. * * The only case we don't handle are sockets that have their own * ->sk_write_space callback and set SOCK_SNDBUF_LOCK. 
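 * For those we leave sk_sndbuf alone, which is why the assignment below is
 * guarded by SOCK_SNDBUF_LOCK.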
*/ if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) sk->sk_sndbuf = TOM_TUNABLE(dev, max_host_sndbuf); #if defined(TCP_CONGESTION_CONTROL) pivot_ca_ops(sk, s->cong_algo); #endif } static inline void check_sk_callbacks(struct cpl_io_state *cplios) { struct sock *sk = cplios->sk; if (unlikely(sk->sk_user_data && !cplios_flag_nochk(cplios, CPLIOS_CALLBACKS_CHKD))) { if (install_special_data_ready(sk) > 0) sock_set_flag(sk, SOCK_NO_DDP); cplios_set_flag(cplios, CPLIOS_CALLBACKS_CHKD); } } /* * Send an active open request. */ int t4_connect(struct toedev *tdev, struct sock *sk, struct net_device *egress_dev) { int atid, ret; struct sk_buff *skb; struct l2t_entry *e; struct tom_data *d = TOM_DATA(tdev); struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct cpl_io_state *cplios = NULL; struct sk_ofld_proto *oproto; struct offload_req orq; struct offload_settings settings; unsigned int qid_atid; struct neighbour *neigh = NULL; struct net_device *master = NULL; bool use_ecn; int id; offload_req_from_sk(&orq, sk, OPEN_TYPE_ACTIVE); settings = *lookup_ofld_policy(tdev, &orq, d->conf.cop_managed_offloading); if (!settings.offload) { rcu_read_unlock(); goto out_err; } if (netif_is_bond_slave(egress_dev)) master = netdev_master_upper_dev_get_rcu(egress_dev); rcu_read_unlock(); if (master) { ret = toe_enslave(master, egress_dev); if (ret) goto out_err; } cplios = kzalloc(sizeof(*cplios), GFP_USER); if (!cplios) goto out_err; oproto = kzalloc(sizeof(*oproto), GFP_USER); if (!oproto) goto free_cplios; cplios->txdata_skb_cache = alloc_skb(TXDATA_SKB_LEN, GFP_KERNEL); if (!cplios->txdata_skb_cache) goto free_oproto; kref_init(&cplios->kref); atid = cxgb4_alloc_atid(d->tids, cplios); if (atid < 0) goto free_txdata_skb; id = conn_insert_handle(d, sk, atid); if (id < 0) goto free_tid; sock_hold(sk); cplios->sk = sk; cplios->egress_dev = egress_dev; if (sk->sk_family == AF_INET) neigh = t4_dst_neigh_lookup(dst, &inet_sk(sk)->inet_daddr); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else neigh = t4_dst_neigh_lookup(dst, &inet6_sk_daddr(sk)); #endif if (!neigh) { printk(KERN_INFO "%s: dst->_neighbour is NULL\n", __func__); goto free_tid; } e = cxgb4_l2t_get(d->lldi->l2t, neigh, egress_dev , sk->sk_priority); t4_dst_neigh_release(neigh); if (!e) { printk(KERN_ERR "cxgb4_l2t_get() returned zero\n"); goto free_tid; } tp->ecn_flags = 0; use_ecn = (tcp_ecn_enabled(sock_net(sk)) == 1) || tcp_ca_needs_ecn(sk); if (!use_ecn) { if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) use_ecn = true; } if (use_ecn) tp->ecn_flags = TCP_ECN_OK; /* no space is saved using hw specific cpl_act_open_req here * no need to check sk_family either. 
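 * Allocating the largest variant (cpl_t6_act_open_req6, rounded up to a
 * 16-byte multiple) lets the same buffer hold whichever request
 * mk_act_open_req()/mk_act_open_req6() builds for this adapter and address
 * family.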
*/ skb = alloc_skb( roundup(sizeof(struct cpl_t6_act_open_req6), 16), GFP_KERNEL); if (!skb) goto free_tid; if (sk->sk_family != AF_INET) { if (cxgb4_clip_get(egress_dev, (const u32 *)((&inet6_sk_saddr(sk))->s6_addr), 1)) goto free_skb; } skb->sk = sk; t4_set_arp_err_handler(skb, NULL, act_open_req_arp_failure); kref_get(&cplios->kref); install_offload_ops(cplios, oproto); check_sk_callbacks(cplios); cplios->tx_c_chan = cxgb4_port_chan(egress_dev); cplios->rx_c_chan = cxgb4_port_e2cchan(egress_dev); init_offload_sk(cplios, tdev, atid, e, dst, egress_dev, &settings, 0); RCV_WSCALE(tp) = select_rcv_wscale(tcp_full_space(sk), sysctl_tcp_window_scaling, tp->window_clamp); sk->sk_err = 0; sock_reset_flag(sk, SOCK_DONE); T4_TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); ma_fail_t4_connect(sk); cplios->toedev = tdev; cplios->smac_idx = cxgb4_tp_smt_idx(d->lldi->adapter_type, cxgb4_port_viid(egress_dev)); qid_atid = cplios->rss_qid << 14; qid_atid |= (unsigned int)atid; if (sk->sk_family == AF_INET) mk_act_open_req(sk, skb, qid_atid, e, &settings); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else mk_act_open_req6(sk, skb, qid_atid, e, &settings, &inet6_sk_rcv_saddr(sk), &inet6_sk_daddr(sk)); #endif cxgb4_l2t_send(cplios->egress_dev, skb, e); return 0; free_skb: kfree_skb(skb); free_tid: cxgb4_free_atid(d->tids, atid); if (id >= 0) { conn_remove_handle(d, atid); sock_put(sk); } free_txdata_skb: kfree_skb(cplios->txdata_skb_cache); free_oproto: kfree(oproto); free_cplios: kfree(cplios); out_err: return -1; } extern t4tom_cpl_handler_func tom_cpl_handlers[NUM_CPL_CMDS]; extern void (*tom_cpl_iscsi_callback)(struct tom_data *, struct sock *, struct sk_buff *, unsigned int); extern void (*fp_iscsi_lro_proc_rx)(struct sock *sk, struct sk_buff *skb); static int inline t4_cpl_iscsi_callback(struct tom_data *td, struct sock *sk, struct sk_buff *skb, unsigned int opcode) { if (tom_cpl_iscsi_callback && sk) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); if (cplios->ulp_mode == ULP_MODE_ISCSI) { tom_cpl_iscsi_callback(td, sk, skb, opcode); return 0; } } return 1; } /* * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant * and send it along. */ static void abort_arp_failure(void *handle, struct sk_buff *skb) { struct cpl_abort_req *req = cplhdr(skb); struct toedev *tdev = (struct toedev *)handle; req->cmd = CPL_ABORT_NO_RST; cxgb4_ofld_send(tdev->lldev[0], skb); } /* Helper function to send the CPL_ABORT_REQ */ static void t4_send_abort(struct sock *sk, int mode, struct sk_buff *skb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct tcp_sock *tp = tcp_sk(sk); bool use_negadv_tid = false; struct cpl_abort_req *req; unsigned int tid; if ((sk->sk_state == TCP_SYN_SENT) && (cplios->neg_adv_tid != INVALID_TID)) { tid = cplios->neg_adv_tid; cplios->idr = sk_insert_tid(TOM_DATA(cplios->toedev), sk, tid); use_negadv_tid = true; } else tid = cplios->tid; if (!skb) skb = alloc_ctrl_skb(cplios->txdata_skb_cache, sizeof(*req)); req = (struct cpl_abort_req *)skb_put(skb, sizeof(*req)); INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); set_queue(skb, (cplios->txq_idx << 1) | CPL_PRIORITY_DATA, sk); req->rsvd0 = htonl(tp->snd_nxt); req->rsvd1 = !cplios_flag_nochk(cplios, CPLIOS_TX_DATA_SENT); req->cmd = mode; if (unlikely(use_negadv_tid)) { /* * need to queue it since flowc is already queued-up. * So, can't send directly. 
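 * The ABORT_REQ is therefore appended to the out-of-order queue and
 * fixup_and_send_ofo() pushes the queued WRs out using the negative-advice
 * TID.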
*/ __skb_queue_tail(&cplios->ooo_queue, skb); fixup_and_send_ofo(cplios, tid); } else { t4_set_arp_err_handler(skb, cplios->toedev, abort_arp_failure); send_or_defer(sk, tp, skb, mode == CPL_ABORT_SEND_RST); } } /* * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do * not send multiple ABORT_REQs for the same connection and also that we do * not try to send a message after the connection has closed. Returns 1 if * an ABORT_REQ wasn't generated after all, 0 otherwise. */ int t4_send_reset(struct sock *sk, int mode, struct sk_buff *skb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct tom_data *d = NULL; if (unlikely(cplios_flag_nochk(cplios, CPLIOS_ABORT_SHUTDOWN) || !cplios->toedev)) { if (sk->sk_state == TCP_SYN_RECV) cplios_set_flag(cplios, CPLIOS_RST_ABORTED); goto out; } if (ma_fail_t4_send_reset(sk)) goto out; d = TOM_DATA(cplios->toedev); if (!cplios_flag_nochk(cplios, CPLIOS_TX_DATA_SENT)) { struct tcp_sock *tp = tcp_sk(sk); if (send_tx_flowc_wr(sk, 0, tp->snd_nxt, tp->rcv_nxt) < 0) BUG_ON(1); cplios_set_flag(cplios, CPLIOS_TX_DATA_SENT); } cplios_set_flag(cplios, CPLIOS_ABORT_RPL_PENDING); /* Purge the send queue so we don't send anything after an abort. */ t4_purge_write_queue(sk); /* PR20010: Sending ABORT in SYN_RCV state. * As a workaround to using the same queue for * CPL_PASS_ACCEPT_RPL/CPL_ABORT_REQ, we read the DDP buffer offset * so that ingress queue is set in the rss_info TCB field by the time * CPL_SET_TCB_RPL comes back. * We will send the CPL_ABORT_REQ in process_set_tcb_rpl. * Without this CPL_ABORT_RPL_RSS might end up with receive queue as 0 * which can happen when CPL_ABORT_REQ reaches hardware before * CPL_PASS_ACCEPT_RPL as they are sent on different queues. */ if (sk->sk_state == TCP_SYN_RECV) { t4_set_tcb_field_rpl_skb(sk, W_TCB_RX_DDP_BUF0_OFFSET, 0, 0, DDP_COOKIE_OFFSET, 1); cplios_set_flag(cplios, CPLIOS_ABORT_SHUTDOWN); } else { cplios_set_flag(cplios, CPLIOS_ABORT_SHUTDOWN); t4_send_abort(sk, mode, skb); } return 0; out: if (skb) kfree_skb(skb); return 1; } EXPORT_SYMBOL(t4_send_reset); /* * Reset a connection that is on a listener's SYN queue or accept queue, * i.e., one that has not had a struct socket associated with it. * Must be called from process context. * * Modeled after code in inet_csk_listen_stop(). */ static void reset_listen_child(struct sock *child) { struct cpl_io_state *cplios = CPL_IO_STATE(child); struct sk_buff *skb; skb = alloc_ctrl_skb(cplios->txdata_skb_cache, sizeof(struct cpl_abort_req)); t4_send_reset(child, CPL_ABORT_SEND_RST, skb); sock_orphan(child); INC_ORPHAN_COUNT(child); if (child->sk_state == TCP_CLOSE) inet_csk_destroy_sock(child); } /* * The reap list is the list of passive open sockets that were orphaned when * their listening parent went away and wasn't able to nuke them for whatever * reason. These sockets are terminated through a work request from process * context. */ static struct sock *reap_list; static DEFINE_SPINLOCK(reap_list_lock); /* * Process the reap list. 
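 * Runs in process context from the reap_task work item.  The list lock is
 * dropped around each reset_listen_child() call, so individual sockets are
 * aborted and destroyed without holding reap_list_lock.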
*/ DECLARE_TASK_FUNC(process_reap_list, task_param) { spin_lock_bh(&reap_list_lock); while (reap_list) { struct sock *sk = reap_list; reap_list = CPL_IO_STATE(sk)->passive_reap_next; CPL_IO_STATE(sk)->passive_reap_next = NULL; spin_unlock(&reap_list_lock); sock_hold(sk); // need to survive past inet_csk_destroy_sock() bh_lock_sock(sk); reset_listen_child(sk); bh_unlock_sock(sk); sock_put(sk); spin_lock(&reap_list_lock); } spin_unlock_bh(&reap_list_lock); } static T4_DECLARE_WORK(reap_task, process_reap_list, NULL); /* * Add a socket to the reap list and schedule a work request to process it. * We thread sockets through their sk_user_data pointers. May be called * from softirq context and any associated open request must have already * been freed. */ static void add_to_reap_list(struct sock *sk) { BUG_ON(CPL_IO_STATE(sk)->passive_reap_next); local_bh_disable(); bh_lock_sock(sk); release_tcp_port(sk); // release the port immediately, it may be reused spin_lock(&reap_list_lock); CPL_IO_STATE(sk)->passive_reap_next = reap_list; reap_list = sk; if (!CPL_IO_STATE(sk)->passive_reap_next) schedule_work(&reap_task); spin_unlock(&reap_list_lock); bh_unlock_sock(sk); local_bh_enable(); } void __set_tcb_field_direct(struct cpl_io_state *cplios, struct cpl_set_tcb_field *req, u16 word, u64 mask, u64 val, u8 cookie, int no_reply) { struct ulptx_idata *sc; INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, cplios->tid); req->reply_ctrl = htons(V_NO_REPLY(no_reply) | V_REPLY_CHAN(cplios->rx_c_chan) | V_QUEUENO(cplios->rss_qid)); req->word_cookie = htons(V_WORD(word) | V_COOKIE(cookie)); req->mask = cpu_to_be64(mask); req->val = cpu_to_be64(val); sc = (struct ulptx_idata *)(req + 1); sc->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_NOOP)); sc->len = htonl(0); } void __set_tcb_field(struct sock *sk, struct sk_buff *skb, u16 word, u64 mask, u64 val, u8 cookie, int no_reply) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct cpl_set_tcb_field *req; struct ulptx_idata *sc; unsigned int wrlen = roundup(sizeof(*req) + sizeof(*sc), 16); req = (struct cpl_set_tcb_field *)__skb_put(skb, wrlen); __set_tcb_field_direct(cplios, req, word, mask, val, cookie, no_reply); set_wr_txq(skb, CPL_PRIORITY_CONTROL, cplios->port_id); } void t4_set_tcb_field_skb(struct sock *sk, u16 word, u64 mask, u64 val) { struct sk_buff *skb; struct cpl_set_tcb_field *req; struct ulptx_idata *sc; unsigned int wrlen = roundup(sizeof(*req) + sizeof(*sc), 16); skb = alloc_ctrl_skb(CPL_IO_STATE(sk)->ctrl_skb_cache, wrlen); BUG_ON(!skb); __set_tcb_field(sk, skb, word, mask, val, 0, 1); send_or_defer(sk, tcp_sk(sk), skb, 0); } static inline int __t4_set_tcb_field(struct sock *sk, u16 word, u64 mask, u64 val, u8 cookie, int noreply) { struct packed_wr { struct cpl_set_tcb_field cpl; struct ulptx_idata sc; } __packed sreq, *req = &sreq; unsigned int wrlen = roundup(sizeof(struct packed_wr), 16); int useskb = (sk->sk_state == TCP_SYN_SENT); struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct toedev *tdev; if (sk->sk_state == TCP_CLOSE || cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN)) return 0; tdev = cplios->toedev; if (toedev_in_shutdown(tdev)) return 0; if (!useskb) { __set_tcb_field_direct(cplios, &sreq.cpl, word, mask, val, cookie, noreply); useskb = cxgb4_ctrl_send(tdev->lldev[cplios->port_id], req, wrlen); } return useskb; } void t4_set_tcb_field_rpl_skb(struct sock *sk, u16 word, u64 mask, u64 val, u8 cookie, int through_l2t) { struct sk_buff *skb; struct cpl_set_tcb_field *req; struct ulptx_idata *sc; unsigned int wrlen = roundup(sizeof(*req) + 
sizeof(*sc), 16); skb = alloc_ctrl_skb(CPL_IO_STATE(sk)->ctrl_skb_cache, wrlen); BUG_ON(!skb); __set_tcb_field(sk, skb, word, mask, val, cookie, 0); send_or_defer(sk, tcp_sk(sk), skb, through_l2t); } void t4_set_tcb_field(struct sock *sk, u16 word, u64 mask, u64 val) { if (__t4_set_tcb_field(sk, word, mask, val, 0, 1)) t4_set_tcb_field_skb(sk, word, mask, val); } void t4_set_tcb_field_rpl(struct sock *sk, u16 word, u64 mask, u64 val, u8 cookie) { if (__t4_set_tcb_field(sk, word, mask, val, cookie, 0)) t4_set_tcb_field_rpl_skb(sk, word, mask, val, cookie, 0); } /* * Set one of the t_flags bits in the TCB. */ void t4_set_tcb_tflag(struct sock *sk, unsigned int bit_pos, int val) { t4_set_tcb_field(sk, W_TCB_T_FLAGS, 1ULL << bit_pos, val << bit_pos); } /* * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. */ void t4_set_nagle(struct sock *sk) { t4_set_tcb_tflag(sk, S_TF_NAGLE, !(tcp_sk(sk)->nonagle & TCP_NAGLE_OFF)); } /* * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. */ void t4_set_keepalive(struct sock *sk, int on_off) { t4_set_tcb_tflag(sk, S_TF_KEEPALIVE, on_off); } void t4_set_rcv_coalesce_enable(struct sock *sk, int on_off) { t4_set_tcb_tflag(sk, S_TF_RCV_COALESCE_ENABLE, on_off); } void t4_set_dack(struct sock *sk, int on_off) { t4_set_tcb_tflag(sk, S_TF_DACK, on_off); } void t4_set_dack_mss(struct sock *sk, int on_off) { t4_set_tcb_tflag(sk, S_TF_DACK_MSS, on_off); } void t4_set_migrating(struct sock *sk, int on_off) { t4_set_tcb_tflag(sk, S_TF_MIGRATING, on_off); } void t4_set_non_offload(struct sock *sk, int on_off) { t4_set_tcb_tflag(sk, S_TF_NON_OFFLOAD, on_off); } /* * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. */ void t4_set_tos(struct sock *sk) { t4_set_tcb_field(sk, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), V_TCB_TOS(SK_TOS(sk))); } /* * In DDP mode, TP fails to schedule a timer to push RX data to the host when * DDP is disabled (data is delivered to freelist). [Note that, the peer should * set the PSH bit in the last segment, which would trigger delivery.] * We work around the issue by setting a DDP buffer in a partial placed state, * which guarantees that TP will schedule a timer. 
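 * The mask/value pair defined below does exactly that: it marks DDP buffer 0
 * as valid but not active and gives it a non-zero offset and length, i.e. a
 * partially placed buffer.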
*/ #define TP_DDP_TIMER_WORKAROUND_MASK\ (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\ ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\ V_TCB_RX_DDP_BUF0_LEN(3)) << 32)) #define TP_DDP_TIMER_WORKAROUND_VAL\ (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\ ((V_TCB_RX_DDP_BUF0_OFFSET((u64)1) | V_TCB_RX_DDP_BUF0_LEN((u64)2)) <<\ 32)) void t4_enable_ddp(struct sock *sk, int on_off) { t4_set_tcb_field(sk, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1ULL), V_TF_DDP_OFF((unsigned long long)!on_off)); } void t4_disable_ddp(struct sock *sk) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); t4_set_tcb_field(sk, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1ULL), V_TF_DDP_OFF(1ULL)); if (cplios->opt2 & V_RX_COALESCE(M_RX_COALESCE)) t4_set_tcb_field(sk, W_TCB_T_FLAGS, V_TF_RCV_COALESCE_ENABLE(1ULL), V_TF_RCV_COALESCE_ENABLE(1ULL)); } void t4_set_ddp_tag(struct sock *sk, int buf_idx, unsigned int tag_color) { t4_set_tcb_field(sk, W_TCB_RX_DDP_BUF0_TAG + buf_idx, V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), tag_color); } void t4_set_ddp_buf(struct sock *sk, int buf_idx, unsigned int offset, unsigned int len) { if (buf_idx == 0) t4_set_tcb_field(sk, W_TCB_RX_DDP_BUF0_OFFSET, V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), V_TCB_RX_DDP_BUF0_OFFSET((u64)offset) | V_TCB_RX_DDP_BUF0_LEN((u64)len)); else t4_set_tcb_field(sk, W_TCB_RX_DDP_BUF1_OFFSET, V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32), V_TCB_RX_DDP_BUF1_OFFSET((u64)offset) | V_TCB_RX_DDP_BUF1_LEN(((u64)len) << 32)); } void t4_set_ddp_indicate(struct sock *sk, int on) { if (on) t4_set_tcb_field_rpl(sk, W_TCB_RX_DDP_FLAGS, V_TF_DDP_INDICATE_OUT(1ULL) | V_TF_DDP_BUF0_VALID(1ULL) | V_TF_DDP_BUF1_VALID(1ULL) | V_TF_DDP_BUF0_INDICATE(1ULL) | V_TF_DDP_BUF1_INDICATE(1ULL), V_TF_DDP_BUF0_INDICATE(1ULL), DDP_COOKIE_INDOUT); else t4_set_tcb_field_rpl(sk, W_TCB_RX_DDP_FLAGS, V_TF_DDP_INDICATE_OUT(1ULL), V_TF_DDP_INDICATE_OUT(1ULL), DDP_COOKIE_INDOUT); } int t4_get_tcb(struct sock *sk, unsigned short cookie) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct cpl_get_tcb *req; struct sk_buff *skb = alloc_skb(sizeof(*req), gfp_any()); if (!skb) return -ENOMEM; req = (struct cpl_get_tcb *)__skb_put(skb, sizeof(*req)); INIT_TP_WR_MIT_CPL(req, CPL_GET_TCB, cplios->tid); req->reply_ctrl = htons(V_REPLY_CHAN(cplios->rx_c_chan) | V_QUEUENO(cplios->rss_qid)); req->cookie = htons(cookie); set_wr_txq(skb, CPL_PRIORITY_CONTROL, cplios->port_id); if (sk->sk_state == TCP_SYN_SENT) __skb_queue_tail(&cplios->ooo_queue, skb); else cxgb4_ofld_send(cplios->egress_dev, skb); return 0; } int t4_set_cong_control(struct sock *sk, const char *name) { int cong_algo; u64 mask, val; for (cong_algo = 0; cong_algo < ARRAY_SIZE(t4_cong_ops); cong_algo++) if (!strcmp(name, t4_cong_ops[cong_algo].ops.name)) break; if (cong_algo >= ARRAY_SIZE(t4_cong_ops)) return -EINVAL; mask = V_TF_TURBO(1) | V_TF_CCTRL_SEL0(3); if (t4_cong_ops[cong_algo].key == CONG_ALG_NONE) val = V_TF_TURBO(1); else val = V_TF_CCTRL_SEL0(t4_cong_ops[cong_algo].key); t4_set_tcb_field(sk, W_TCB_T_FLAGS, mask, val); return 0; } /* * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are * permitted to return without sending the message in case we cannot allocate * an sk_buff. Returns the number of credits sent. 
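 * With nofail set the message is carved from the connection's pre-allocated
 * control-skb cache rather than a fresh GFP_ATOMIC allocation.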
*/ u32 t4_send_rx_credits(struct sock *sk, u32 credits, u32 dack, int nofail) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct sk_buff *skb; struct cpl_rx_data_ack *req; if (ma_fail_t4_send_rx_credits(sk)) return 0; skb = nofail ? alloc_ctrl_skb(cplios->ctrl_skb_cache, sizeof(*req)) : alloc_skb(sizeof(*req), GFP_ATOMIC); if (!skb) return 0; req = (struct cpl_rx_data_ack *)__skb_put(skb, sizeof(*req)); INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, cplios->tid); req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); set_wr_txq(skb, CPL_PRIORITY_ACK, cplios->port_id); cxgb4_ofld_send(cplios->egress_dev, skb); return credits; } /* * Handle receipt of an urgent pointer. */ static void handle_urg_ptr(struct sock *sk, u32 urg_seq) { struct tcp_sock *tp = tcp_sk(sk); urg_seq--; /* initially points past the urgent data, per BSD */ if (tp->urg_data && !after(urg_seq, tp->urg_seq)) return; /* duplicate pointer */ sk_send_sigurg(sk); if (tp->urg_seq == tp->copied_seq && tp->urg_data && !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); tp->copied_seq++; if (skb && tp->copied_seq - ULP_SKB_CB(skb)->seq >= skb->len) tom_eat_skb(sk, skb); } tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = urg_seq; } /* * Process an urgent data notification. */ static void rx_urg_notify(struct sock *sk, struct sk_buff *skb) { struct cpl_rx_urg_notify *hdr = cplhdr(skb); if (!sk_no_receive(sk)) handle_urg_ptr(sk, ntohl(hdr->seq)); kfree_skb(skb); } /* * Handler for RX_URG_NOTIFY CPL messages. */ static int do_rx_urg_notify(struct tom_data *td, struct sk_buff *skb) { struct sock *sk; struct cpl_rx_urg_notify *req = cplhdr(skb); unsigned int hwtid = GET_TID(req); sk = lookup_tid(td->tids, hwtid); VALIDATE_SOCK(sk); process_cpl_msg(rx_urg_notify, sk, skb); return 0; } /* * A helper function that aborts a connection and increments the given MIB * counter. The supplied skb is used to generate the ABORT_REQ message if * possible. Must be called with softirqs disabled. */ static inline void abort_conn(struct sock *sk, struct sk_buff *skb, int mib) { struct sk_buff *abort_skb; abort_skb = __get_cpl_reply_skb(skb, sizeof(struct cpl_abort_req), GFP_ATOMIC); if (abort_skb) { T4_NET_INC_STATS_BH(sock_net(sk), mib); t4_send_reset(sk, CPL_ABORT_SEND_RST, abort_skb); } } /* * Returns true if we need to explicitly request RST when we receive new data * on an RX-closed connection. */ static inline int need_rst_on_excess_rx(const struct sock *sk) { return 1; } /* * Handles Rx data that arrives in a state where the socket isn't accepting * new data. */ static void handle_excess_rx(struct sock *sk, struct sk_buff *skb) { if (need_rst_on_excess_rx(sk) && !cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN)) abort_conn(sk, skb, LINUX_MIB_TCPABORTONDATA); kfree_skb(skb); /* can't use __kfree_skb here */ } /* * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. 
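 * The ULP_TX_PKT header and a ULP_TX_SC_IMM sub-command overwrite the CPL's
 * work request header, the immediate data is the SET_TCB_FIELD body itself,
 * and a trailing ULP_TX_SC_NOOP pads the command to a 16-byte boundary.
 * t4_set_maxseg() below packs two such commands behind a single ULPTX work
 * request.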
*/ static inline void mk_set_tcb_field_ulp(struct cpl_io_state *cplios, struct cpl_set_tcb_field *req, unsigned int word, u64 mask, u64 val, u8 cookie, int no_reply) { struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; struct ulptx_idata *sc = (struct ulptx_idata *)(txpkt + 1); txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); txpkt->len = htonl(DIV_ROUND_UP(sizeof(*req), 16)); sc->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_IMM)); sc->len = htonl(sizeof(*req) - sizeof(struct work_request_hdr)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, cplios->tid)); req->reply_ctrl = htons(V_NO_REPLY(no_reply) | V_REPLY_CHAN(cplios->rx_c_chan) | V_QUEUENO( cplios->rss_qid)); req->word_cookie = htons(V_WORD(word) | V_COOKIE(cookie)); req->mask = cpu_to_be64(mask); req->val = cpu_to_be64(val); sc = (struct ulptx_idata *)(req + 1); sc->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_NOOP)); sc->len = htonl(0); } static void t4_set_maxseg(struct sock *sk, unsigned int mtu_idx) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct sk_buff *skb; struct work_request_hdr *wr; struct ulptx_idata *aligner; struct cpl_set_tcb_field *req; struct cpl_set_tcb_field *tstampreq; unsigned int wrlen; if (sk->sk_state == TCP_CLOSE || cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN)) return; wrlen = roundup(sizeof(*wr) + 2*(sizeof(*req) + sizeof(*aligner)), 16); skb = alloc_ctrl_skb(cplios->ctrl_skb_cache, wrlen); if (!skb) return; set_wr_txq(skb, CPL_PRIORITY_CONTROL, cplios->port_id); req = (struct cpl_set_tcb_field *)__skb_put(skb, wrlen); INIT_ULPTX_WR(req, wrlen, 0, 0); wr = (struct work_request_hdr *)req; wr++; req = (struct cpl_set_tcb_field *)wr; mk_set_tcb_field_ulp(cplios, req, W_TCB_T_MAXSEG, V_TCB_T_MAXSEG(M_TCB_T_MAXSEG), mtu_idx, 0, 1); aligner = (struct ulptx_idata *)(req + 1); tstampreq = (struct cpl_set_tcb_field *)(aligner + 1); /* * Clear bits 29:11 of the TCB Time Stamp field to trigger an * immediate retransmission with the new Maximum Segment Size. */ mk_set_tcb_field_ulp(cplios, tstampreq, W_TCB_TIMESTAMP, V_TCB_TIMESTAMP(0x7FFFFULL << 11), 0, 0, 1); cxgb4_ofld_send(cplios->egress_dev, skb); } /* * Process a set_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) * by getting the DDP offset from the TCB. */ static void tcb_rpl_as_ddp_complete(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_set_tcb_rpl *hdr; unsigned int ddp_offset; if (unlikely(!(tp = tcp_sk(sk)) || !CPL_IO_STATE(sk))) { kfree_skb(skb); return; } hdr = cplhdr(skb); /* It is a possible that a previous CPL already invalidated UBUF DDP * and moved the cur_buf idx and hence no further processing of this * skb is required. However, the app might be sleeping on * !q->get_tcb_count and we need to wake it up. 
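 * Otherwise the DDP buffer offset read back from the TCB (hdr->oldval) tells
 * us how many bytes were placed since the last notification; that delta
 * becomes the length of the skb queued to the receive queue below.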
*/ q = DDP_STATE(sk); if (q->cancel_ubuf && !t4_ddp_ubuf_pending(sk)) { kfree_skb(skb); q->get_tcb_count--; if (!sock_flag(sk, SOCK_DEAD)) sk_data_ready_compat(sk, 0); return; } bsp = &q->buf_state[q->cur_buf]; if (q->cur_buf == 0) ddp_offset = (be64_to_cpu(hdr->oldval) >> S_TCB_RX_DDP_BUF0_OFFSET) & M_TCB_RX_DDP_BUF0_OFFSET; else ddp_offset = (be64_to_cpu(hdr->oldval) >> (32+S_TCB_RX_DDP_BUF1_OFFSET)) & M_TCB_RX_DDP_BUF1_OFFSET; skb_ulp_ddp_offset(skb) = bsp->cur_offset; bsp->cur_offset = ddp_offset; skb->len = ddp_offset - skb_ulp_ddp_offset(skb); if (unlikely(sk_no_receive(sk) && skb->len)) { handle_excess_rx(sk, skb); q->get_tcb_count--; return; } if (bsp->flags & DDP_BF_NOCOPY) { skb_ulp_ddp_flags(skb) = DDP_BF_PSH | DDP_BF_NODATA | DDP_BF_NOCOPY | 1; bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA); q->cur_buf ^= 1; } else { /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP, * but it got here way late and nobody cares anymore. */ kfree_skb(skb); q->get_tcb_count--; return; } skb_gl_set(skb, bsp->gl); ULP_SKB_CB(skb)->seq = tp->rcv_nxt; tp->rcv_nxt += skb->len; skb_reset_transport_header(skb); tcp_hdr(skb)->fin = 0; /* changes original TCB */ inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; __skb_queue_tail(&sk->sk_receive_queue, skb); smp_wmb(); q->get_tcb_count--; if (!sock_flag(sk, SOCK_DEAD)) sk_data_ready_compat(sk, 0); } static void process_set_tcb_rpl(struct sock *sk, struct sk_buff *skb) { struct cpl_set_tcb_rpl *rpl = cplhdr(skb); struct ddp_state *q; if (rpl->status) printk(KERN_INFO "CPL_SET_TCB_RPL: status = 0x%u\n", rpl->status); q = DDP_STATE(sk); if (G_COOKIE(rpl->cookie) == DDP_COOKIE_ENABLE) { if (likely(!sk_no_receive(sk) && !q->ddp_setup)) { q->indicate = tcp_sk(sk)->rcv_nxt; t4_set_ddp_indicate(sk, 1); q->indout_count++; q->ddp_setup = 1; } } else if (G_COOKIE(rpl->cookie) == DDP_COOKIE_INDOUT) { if (likely(!sk_no_receive(sk) && q->ddp_setup)) q->indout_count--; } else if ((G_COOKIE(rpl->cookie) == DDP_COOKIE_OFFSET) && !q->ddp_setup) { /* Sending ABORT in SYN_RCV state. * We are reusing this DDP_COOKIE_OFFSET to handle the special * case of sending ABORT in TCP_SYN_RECV state. */ /* Reusing the skb as size of cpl_set_tcb_field structure * is greater than cpl_abort_req */ __skb_trim(skb, 0); skb_get(skb); t4_send_abort(sk, CPL_ABORT_SEND_RST, skb); } else if (G_COOKIE(rpl->cookie) == DDP_COOKIE_OFFSET) { tcb_rpl_as_ddp_complete(sk, skb); return; } else { #ifdef CONFIG_T4_MA_FAILOVER if ((G_COOKIE(rpl->cookie) == MA_FAILOVER_COOKIE_RCV_WND) | (G_COOKIE(rpl->cookie) == MA_FAILOVER_COOKIE_RX_HDR_OFFSET) | (G_COOKIE(rpl->cookie) == MA_FAILOVER_COOKIE_NEW_RCV_WND) | (G_COOKIE(rpl->cookie) == MA_FAILOVER_COOKIE_L2TIX)) ma_fail_process_set_tcb_rpl(sk, skb); #endif } kfree_skb(skb); } static int do_set_tcb_rpl(struct tom_data *td, struct sk_buff *skb) { struct sock *sk; struct cpl_set_tcb_rpl *req = cplhdr(skb); unsigned int hwtid = GET_TID(req); sk = lookup_tid(td->tids, hwtid); /* OK if socket doesn't exist */ if (!sk) return CPL_RET_BUF_DONE; if (!t4_cpl_iscsi_callback(td, sk, skb, CPL_SET_TCB_RPL)) return 0; process_cpl_msg(process_set_tcb_rpl, sk, skb); return 0; } /* * We get called from the CPL_RX_DATA handler new_rx_data() when it gets * called and discovers that we thought the connection was in DDP mode. Here * we'll examine the CPL_RX_DATA and use it to synthesize a DDP skb to cover * the sequence space between where we last expected to get DDP data and the * sequence number of the new CPL_RX_DATA. 
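 * If the CPL's sequence number is behind tp->rcv_nxt these assumptions do
 * not hold and we bail out; if it is ahead, the gap is exactly the data DDP
 * placed in host memory and we synthesize an skb covering it.  An equal
 * sequence number means an indicate, or that we have fallen out of DDP mode,
 * which is handled in the else branch.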
*/ static void handle_ddp_data(struct sock *sk, struct sk_buff *origskb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct tcp_sock *tp = tcp_sk(sk); struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_rx_data *hdr = cplhdr(origskb); unsigned int rcv_nxt = ntohl(hdr->seq); struct sk_buff *skb; /* * If the sequence number received is less than expected then the * assumptions that follow do not apply. */ if (before(rcv_nxt, tp->rcv_nxt)) return; q = DDP_STATE(sk); if (!q->ddp_setup) return; bsp = &q->buf_state[q->cur_buf]; if (after(rcv_nxt, tp->rcv_nxt)) { /* * Create an skb to cover the range of data which was DDP'ed * and append that to the socket's receive queue. */ skb = skb_clone(origskb, GFP_ATOMIC); if (!skb) return; /* * Here we assume that data placed into host memory by DDP * corresponds to the difference between the sequence number * received in the RX_DATA header and the expected sequence * number. And since we tested the sequence above, the * computed skb->len is positive and we won't panic later on * ... */ skb->len = rcv_nxt - tp->rcv_nxt; skb_gl_set(skb, bsp->gl); skb_ulp_ddp_offset(skb) = bsp->cur_offset; skb_ulp_ddp_flags(skb) = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; if (bsp->flags & DDP_BF_NOCOPY) bsp->flags &= ~DDP_BF_NOCOPY; if (unlikely(hdr->dack_mode != cplios->delack_mode)) { cplios->delack_mode = hdr->dack_mode; cplios->delack_seq = tp->rcv_nxt; } ULP_SKB_CB(skb)->seq = tp->rcv_nxt; tp->rcv_nxt = rcv_nxt; bsp->cur_offset += skb->len; q->cur_buf ^= 1; inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; __skb_queue_tail(&sk->sk_receive_queue, skb); /* * Note that we've fallen out of DDP mode. */ q->ddp_off = 1; q->indicate = 0; } else { /* * This could be an "indicate" from T4 telling us about more * data available to DDP. Or it could be a CPL_RX_DATA with * ddp_off set meaning that we've fallen out of DDP mode ... */ unsigned int target, ind_size; q->ind_rcv_nxt = rcv_nxt; ind_size = origskb->len - sizeof(*hdr); if (hdr->ddp_off) { q->ddp_off = 1; q->indicate = 0; } else if (q->ddp_off) q->ddp_off = 0; target = sock_rcvlowat(sk, 0, (int)(~0U>>1)); if (!q->ddp_off && ((tp->rcv_nxt + ind_size) - tp->copied_seq < target)) { t4_set_ddp_indicate(sk, 1); q->indicate = tp->rcv_nxt + ind_size; q->indout_count++; } } } /* * Process new data received for a connection. 
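 * For DDP-enabled connections handle_ddp_data() first accounts for any bytes
 * that were placed directly, then the CPL header is stripped, urgent data
 * and delayed-ACK mode changes are processed, and the payload is queued on
 * sk_receive_queue before the socket is woken.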
*/ static void new_rx_data(struct sock *sk, struct sk_buff *skb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct cpl_rx_data *hdr = cplhdr(skb); struct tcp_sock *tp = tcp_sk(sk); if (unlikely(sk_no_receive(sk))) { handle_excess_rx(sk, skb); return; } if (unlikely(hdr->status)) { u8 status = ACCESS_ONCE(hdr->status); /* iscsi connections can send cpl_rx_data * with status CPL_ERR_IWARP_FLM */ if (cplios->ulp_mode == ULP_MODE_ISCSI) { handle_excess_rx(sk, skb); pr_err_ratelimited( "%s: TID %u: iSCSI unexpected CPL_RX_DATA status = %u\n", cplios->toedev->name, cplios->tid, status); return; } } tom_sk_set_napi_id(sk, tom_skb_get_napi_id(skb)); if (cplios->ulp_mode == ULP_MODE_TCPDDP) handle_ddp_data(sk, skb); ULP_SKB_CB(skb)->seq = ntohl(hdr->seq); ULP_SKB_CB(skb)->psh = hdr->psh; skb_ulp_mode(skb) = ULP_MODE_NONE; /* for iSCSI */ skb_ulp_ddp_flags(skb) = 0; /* for DDP */ #if VALIDATE_SEQ if (unlikely(ULP_SKB_CB(skb)->seq != tp->rcv_nxt)) { pr_err_ratelimited( "%s: TID %u: Bad sequence number %u, expected %u\n", cplios->toedev->name, cplios->tid, ULP_SKB_CB(skb)->seq, tp->rcv_nxt); __kfree_skb(skb); return; } #endif skb_reset_transport_header(skb); __skb_pull(skb, sizeof(*hdr)); if (!skb->data_len) __skb_trim(skb, ntohs(hdr->len)); if (unlikely(hdr->urg)) handle_urg_ptr(sk, tp->rcv_nxt + ntohs(hdr->urg)); if (unlikely(tp->urg_data == TCP_URG_NOTYET && tp->urg_seq - tp->rcv_nxt < skb->len)) tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq - tp->rcv_nxt]; if (unlikely(hdr->dack_mode != cplios->delack_mode)) { cplios->delack_mode = hdr->dack_mode; cplios->delack_seq = tp->rcv_nxt; } tcp_hdr(skb)->fin = 0; /* modifies original hdr->urg */ tp->rcv_nxt += skb->len; inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; __skb_queue_tail(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) { check_sk_callbacks(cplios); sk_data_ready_compat(sk, 0); } } /* * Handler for RX_DATA CPL messages. 
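 * The socket is looked up by hardware TID and the message is handed to
 * new_rx_data() via process_cpl_msg(); clearing the gather-list pointer
 * marks the skb as plain RX_DATA for the receive path.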
*/ static int do_rx_data(struct tom_data *td, struct sk_buff *skb) { struct sock *sk; struct cpl_rx_data *req = cplhdr(skb); unsigned int hwtid = GET_TID(req); sk = lookup_tid(td->tids, hwtid); VALIDATE_SOCK(sk); skb_gl_set(skb, NULL); /* indicates packet is RX_DATA */ process_cpl_msg(new_rx_data, sk, skb); return 0; } #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR) static void new_rx_data_ddp(struct sock *sk, struct sk_buff *skb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct tcp_sock *tp; struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_rx_data_ddp *hdr; unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; unsigned int delack_mode; if (unlikely(sk_no_receive(sk))) { handle_excess_rx(sk, skb); return; } tp = tcp_sk(sk); q = DDP_STATE(sk); hdr = cplhdr(skb); ddp_report = ntohl(hdr->ddp_report); buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; bsp = &q->buf_state[buf_idx]; ddp_len = ntohs(hdr->len); rcv_nxt = ntohl(hdr->seq) + ddp_len; end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; bsp->cur_offset = end_offset; if (unlikely(ntohl(hdr->ddpvld) & DDP_ERR)) { struct ddp_gather_list *gl = bsp->gl; if (ddp_report & F_DDP_INV) bsp->flags &= ~DDP_BF_NOCOPY; netdev_err(cplios->egress_dev, "RX_DATA_DDP: CPL TID=%u error=0x%x ppod addr=0x%x\n", GET_TID(hdr), ntohl(hdr->ddpvld), gl->tag<<6); kfree_skb(skb); return; } delack_mode = G_DDP_DACK_MODE(ddp_report); if (unlikely(G_DDP_DACK_MODE(ddp_report) != cplios->delack_mode)) { cplios->delack_mode = delack_mode; cplios->delack_seq = tp->rcv_nxt; } ULP_SKB_CB(skb)->seq = tp->rcv_nxt; tp->rcv_nxt = rcv_nxt; /* * Store the length in skb->len. We are changing the meaning of * skb->len here, we need to be very careful that nothing from now on * interprets ->len of this packet the usual way. */ skb->len = tp->rcv_nxt - ULP_SKB_CB(skb)->seq; /* * Figure out where the new data was placed in the buffer and store it * in when. Assumes the buffer offset starts at 0, consumer needs to * account for page pod's pg_offset. */ skb_ulp_ddp_offset(skb) = end_offset - skb->len; /* * We store in mac.raw the address of the gather list where the * placement happened. */ skb_gl_set(skb, bsp->gl); /* * Bit 0 of DDP flags stores whether the DDP buffer is completed. * Note that other parts of the code depend on this being in bit 0. */ skb_ulp_ddp_flags(skb) = !!(ddp_report & F_DDP_INV); if (bsp->flags & DDP_BF_NOCOPY) { skb_ulp_ddp_flags(skb) |= (bsp->flags & DDP_BF_NOCOPY); if (ddp_report & F_DDP_INV) bsp->flags &= ~DDP_BF_NOCOPY; } if (ddp_report & F_DDP_PSH) skb_ulp_ddp_flags(skb) |= DDP_BF_PSH; if (!!(ddp_report & F_DDP_INV)) skb_ulp_ddp_flags(skb) |= DDP_BF_NODATA; skb_reset_transport_header(skb); tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */ inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; __skb_queue_tail(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk_data_ready_compat(sk, 0); } /* * Handler for RX_DATA_DDP CPL messages. 
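 * iSCSI ULP connections are diverted to the registered iSCSI callback;
 * everything else is processed by new_rx_data_ddp().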
*/ static int do_rx_data_ddp(struct tom_data *td, struct sk_buff *skb) { struct sock *sk; struct cpl_rx_data_ddp *hdr = cplhdr(skb); unsigned int hwtid = GET_TID(hdr); sk = lookup_tid(td->tids, hwtid); VALIDATE_SOCK(sk); if (!t4_cpl_iscsi_callback(td, sk, skb, CPL_RX_DATA_DDP)) return 0; process_cpl_msg(new_rx_data_ddp, sk, skb); return 0; } static void process_ddp_complete(struct sock *sk, struct sk_buff *skb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct tcp_sock *tp = tcp_sk(sk); struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_rx_ddp_complete *hdr; unsigned int ddp_report, buf_idx; unsigned int delack_mode; if (unlikely(sk_no_receive(sk))) { handle_excess_rx(sk, skb); return; } tp = tcp_sk(sk); q = DDP_STATE(sk); hdr = cplhdr(skb); ddp_report = ntohl(hdr->ddp_report); buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; bsp = &q->buf_state[buf_idx]; skb_ulp_ddp_offset(skb) = bsp->cur_offset; skb->len = G_DDP_OFFSET(ddp_report) - skb_ulp_ddp_offset(skb); bsp->cur_offset += skb->len; q->cur_buf ^= 1; skb_gl_set(skb, bsp->gl); skb_ulp_ddp_flags(skb) = (bsp->flags & DDP_BF_NOCOPY) | 1; if (bsp->flags & DDP_BF_NOCOPY) bsp->flags &= ~DDP_BF_NOCOPY; skb_ulp_ddp_flags(skb) |= DDP_BF_NODATA; delack_mode = G_DDP_DACK_MODE(ddp_report); if (unlikely(G_DDP_DACK_MODE(ddp_report) != cplios->delack_mode)) { cplios->delack_mode = delack_mode; cplios->delack_seq = tp->rcv_nxt; } ULP_SKB_CB(skb)->seq = tp->rcv_nxt; tp->rcv_nxt = ntohl(hdr->rcv_nxt); skb_reset_transport_header(skb); tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; __skb_queue_tail(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk_data_ready_compat(sk, 0); } /* * Handler for RX_DDP_COMPLETE CPL messages. */ static int do_rx_ddp_complete(struct tom_data *td, struct sk_buff *skb) { struct sock *sk; struct cpl_rx_ddp_complete *req = cplhdr(skb); unsigned int hwtid = GET_TID(req); sk = lookup_tid(td->tids, hwtid); VALIDATE_SOCK(sk); process_cpl_msg(process_ddp_complete, sk, skb); return 0; } static void tls_skb_data_len(struct sk_buff *skb, unsigned int new_len) { struct tlsrx_hdr_pkt *tls_hdr_pkt = cplhdr(skb); skb->hdr_len = ntohs(tls_hdr_pkt->length); tls_hdr_pkt->length = ntohs(new_len); } static void new_rx_tls_cmp(struct sock *sk, struct sk_buff *skb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct tls_ofld_info *tls_ofld = TLS_IO_STATE(sk); struct tcp_sock *tp = tcp_sk(sk); struct cpl_rx_tls_cmp *cmp_cpl = cplhdr(skb); struct sk_buff *skb_data = NULL; tom_sk_set_napi_id(sk, tom_skb_get_napi_id(skb)); ULP_SKB_CB(skb)->seq = ntohl(cmp_cpl->seq); ULP_SKB_CB(skb)->flags = 0; skb_reset_transport_header(skb); __skb_pull(skb, sizeof(*cmp_cpl)); if (!skb->data_len) { __skb_trim(skb, G_CPL_RX_TLS_CMP_LENGTH(ntohl( cmp_cpl->pdulength_length))); } tp->rcv_nxt += G_CPL_RX_TLS_CMP_PDULENGTH(ntohl(cmp_cpl->pdulength_length)); skb_data = __skb_dequeue(&tls_ofld->sk_recv_queue); if (!skb_data) { ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_TLS_NO_DATA; __skb_queue_tail(&sk->sk_receive_queue, skb); } else { tls_skb_data_len(skb, tls_ofld->pld_len); tls_ofld->pld_len = 0; __skb_queue_tail(&sk->sk_receive_queue, skb); __skb_queue_tail(&sk->sk_receive_queue, skb_data); } if (!sock_flag(sk, SOCK_DEAD)) { check_sk_callbacks(cplios); sk_data_ready_compat(sk, 0); } } static int do_cpl_rx_tls_cmp(struct tom_data *td, struct sk_buff *skb) { struct sock *sk; struct cpl_rx_data *req = cplhdr(skb); unsigned int hwtid = GET_TID(req); sk = lookup_tid(td->tids, hwtid); 
VALIDATE_SOCK(sk); skb_gl_set(skb, NULL); process_cpl_msg(new_rx_tls_cmp, sk, skb); return 0; } static void new_tls_data(struct sock *sk, struct sk_buff *skb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct tls_ofld_info *tls_ofld = TLS_IO_STATE(sk); struct cpl_tls_data *hdr = cplhdr(skb); struct tcp_sock *tp = tcp_sk(sk); if (unlikely(sk_no_receive(sk))) { handle_excess_rx(sk, skb); return; } /* id of the last napi context to receive data for sk */ tom_sk_set_napi_id(sk, tom_skb_get_napi_id(skb)); ULP_SKB_CB(skb)->seq = ntohl(hdr->seq); ULP_SKB_CB(skb)->flags = 0; skb_ulp_mode(skb) = ULP_MODE_TLS; #if VALIDATE_SEQ if (unlikely(ULP_SKB_CB(skb)->seq != tp->rcv_nxt)) { pr_err("%s: TID %u: Bad sequence number %u, expected%u\n", cplios->toedev->name, cplios->tid, ULP_SKB_CB(skb)->seq, tp->rcv_nxt); __kfree_skb(skb); return; } #endif skb_reset_transport_header(skb); __skb_pull(skb, sizeof(*hdr)); if (!skb->data_len) __skb_trim(skb, G_CPL_TLS_DATA_LENGTH(ntohl(hdr->length_pkd))); if (unlikely(tp->urg_data == TCP_URG_NOTYET && tp->urg_seq - tp->rcv_nxt < skb->len)) tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq - tp->rcv_nxt]; tcp_hdr(skb)->fin = 0; inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; tls_ofld->pld_len = G_CPL_TLS_DATA_LENGTH(ntohl(hdr->length_pkd)); __skb_queue_tail(&tls_ofld->sk_recv_queue, skb); } /* * Handler for CPL_TLS_DATA messages */ static int do_cpl_tls_data(struct tom_data *td, struct sk_buff *skb) { struct sock *sk; struct cpl_rx_data *req = cplhdr(skb); unsigned int hwtid = GET_TID(req); sk = lookup_tid(td->tids, hwtid); VALIDATE_SOCK(sk); skb_gl_set(skb, NULL); process_cpl_msg(new_tls_data, sk, skb); return 0; } /* * Process a CPL_GET_TCB_RPL. */ static int do_get_tcb_rpl(struct tom_data *td, struct sk_buff *skb) { struct sock *sk; struct cpl_get_tcb_rpl *rpl = cplhdr(skb); unsigned int hwtid = GET_TID(rpl); sk = lookup_tid(td->tids, hwtid); if (!sk) return CPL_RET_BUF_DONE; if (unlikely(rpl->cookie == CPLIOS_TLS_HANDSHK)) process_cpl_msg(process_get_tcb_tls_rpl, sk, skb); #ifdef CONFIG_T4_MA_FAILOVER else process_cpl_msg(process_get_tcb_mafo_rpl, sk, skb); #endif return 0; } /* * Move a socket to TIME_WAIT state. We need to make some adjustments to the * socket state before calling tcp_time_wait to comply with its expectations. */ static void enter_timewait(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); /* * Bump rcv_nxt for the peer FIN. We don't do this at the time we * process peer_close because we don't want to carry the peer FIN in * the socket's receive queue and if we increment rcv_nxt without * having the FIN in the receive queue we'll confuse facilities such * as SIOCINQ. */ tp->rcv_nxt++; /* * Fake a timestamp for the most recent time that ts_recent was set. * We never actually set ts_recent in our code but without this, an * attampt to use SO_REUSEADDR won't work on the client side and the * client will have to wait for the zombie socket to time out in the * kernel's Time Wait list. */ TS_RECENT_STAMP(tp) = get_seconds(); tp->srtt_us = 0; /* defeat tcp_update_metrics */ tcp_time_wait(sk, TCP_TIME_WAIT, 0); /* calls tcp_done */ } /* * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This * function deals with the data that may be reported along with the FIN. * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to * perform normal FIN-related processing. In the latter case 1 indicates that * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the * skb can be freed. 
*/ static int handle_peer_close_data(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_peer_close *req = cplhdr(skb); unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ if (tp->rcv_nxt == rcv_nxt) /* no data */ return 0; if (unlikely(sk_no_receive(sk))) { handle_excess_rx(sk, skb); /* * Although we discard the data we want to process the FIN so * that PEER_CLOSE + data behaves the same as RX_DATA_DDP + * PEER_CLOSE without data. In particular this PEER_CLOSE * may be what will close the connection. We return 1 because * handle_excess_rx() already freed the packet. */ return 1; } q = DDP_STATE(sk); if (!q->ddp_setup) return 0; bsp = &q->buf_state[q->cur_buf]; skb->len = rcv_nxt - tp->rcv_nxt; skb_gl_set(skb, bsp->gl); skb_ulp_ddp_offset(skb) = bsp->cur_offset; skb_ulp_ddp_flags(skb) = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; ULP_SKB_CB(skb)->seq = tp->rcv_nxt; tp->rcv_nxt = rcv_nxt; bsp->cur_offset += skb->len; q->cur_buf ^= 1; skb_reset_transport_header(skb); tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; __skb_queue_tail(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk_data_ready_compat(sk, 0); return 1; } /* * Handle a peer FIN. */ static void do_peer_fin(struct sock *sk, struct sk_buff *skb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); int keep = 0, dead = sock_flag(sk, SOCK_DEAD); if (cplios_flag_nochk(cplios, CPLIOS_ABORT_RPL_PENDING)) goto out; if (cplios->ulp_mode == ULP_MODE_TCPDDP) { keep = handle_peer_close_data(sk, skb); if (keep < 0) return; } sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(sk, SOCK_DONE); switch (sk->sk_state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: tcp_set_state(sk, TCP_CLOSE_WAIT); break; case TCP_FIN_WAIT1: tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: if (ma_fail_do_peer_fin(sk, TCP_FIN_WAIT2)) break; /* * If we've sent an abort_req we must have sent it too late, * HW will send us a reply telling us so, and this peer_close * is really the last message for this connection and needs to * be treated as an abort_rpl, i.e., transition the connection * to TCP_CLOSE (note that the host stack does this at the * time of generating the RST but we must wait for HW). * Otherwise we enter TIME_WAIT. */ t4_release_offload_resources(sk); if (cplios_flag_nochk(cplios, CPLIOS_ABORT_RPL_PENDING)) connection_done(sk); else enter_timewait(sk); break; default: printk(KERN_ERR "%s: TID %u received PEER_CLOSE in bad state %d\n", cplios->toedev->name, cplios->tid, sk->sk_state); } if (!dead) { sk->sk_state_change(sk); /* Do not send POLL_HUP for half duplex close. */ if ((sk->sk_shutdown & SEND_SHUTDOWN) || sk->sk_state == TCP_CLOSE) sk_wake_async(sk, 1, POLL_HUP); else sk_wake_async(sk, 1, POLL_IN); } out: if (!keep) kfree_skb(skb); } /* * Handler for PEER_CLOSE CPL messages. */ static int do_peer_close(struct tom_data *td, struct sk_buff *skb) { struct sock *sk; struct cpl_peer_close *req = cplhdr(skb); unsigned int hwtid = GET_TID(req); sk = lookup_tid(td->tids, hwtid); VALIDATE_SOCK(sk); process_cpl_msg_ref(do_peer_fin, sk, skb); return 0; } /* * Process a peer ACK to our FIN. 
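 * snd_una is advanced from the reply (excluding our FIN) and the state
 * machine moves on: CLOSING enters TIME_WAIT (or finishes if an abort reply
 * is pending), LAST_ACK finishes the connection, and FIN_WAIT1 proceeds to
 * FIN_WAIT2.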
*/ static void process_close_con_rpl(struct sock *sk, struct sk_buff *skb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct tcp_sock *tp = tcp_sk(sk); struct cpl_close_con_rpl *rpl = cplhdr(skb); tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ if (cplios_flag_nochk(cplios, CPLIOS_ABORT_RPL_PENDING)) goto out; switch (sk->sk_state) { case TCP_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */ if (ma_fail_process_close_con_rpl(sk, TCP_CLOSING)) break; t4_release_offload_resources(sk); if (cplios_flag_nochk(cplios, CPLIOS_ABORT_RPL_PENDING)) connection_done(sk); else enter_timewait(sk); break; case TCP_LAST_ACK: if (ma_fail_process_close_con_rpl(sk, TCP_LAST_ACK)) break; /* * In this state we don't care about pending abort_rpl. * If we've sent abort_req it was post-close and was sent too * late, this close_con_rpl is the actual last message. */ t4_release_offload_resources(sk); connection_done(sk); break; case TCP_FIN_WAIT1: tcp_set_state(sk, TCP_FIN_WAIT2); sk->sk_shutdown |= SEND_SHUTDOWN; dst_confirm(sk->sk_dst_cache); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); // Wake up lingering close() else if (tcp_sk(sk)->linger2 < 0 && !cplios_flag_nochk(cplios, CPLIOS_ABORT_SHUTDOWN)) abort_conn(sk, skb, LINUX_MIB_TCPABORTONLINGER); break; default: printk(KERN_ERR "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", cplios->toedev->name, cplios->tid, sk->sk_state); } out: kfree_skb(skb); /* can't use __kfree_skb here */ } /* * Handler for CLOSE_CON_RPL CPL messages. */ static int do_close_con_rpl(struct tom_data *td, struct sk_buff *skb) { struct sock *sk; struct cpl_close_con_rpl *rpl = cplhdr(skb); unsigned int hwtid = GET_TID(rpl); sk = lookup_tid(td->tids, hwtid); VALIDATE_SOCK(sk); process_cpl_msg_ref(process_close_con_rpl, sk, skb); return 0; } /* * Process abort replies. We only process these messages if we anticipate * them as the coordination between SW and HW in this area is somewhat lacking * and sometimes we get ABORT_RPLs after we are done with the connection that * originated the ABORT_REQ. A migrating connection will set MIGRATION_TOEDEV * and CPLIOS_TX_FAILOVER before issuing commands to the old T4. */ static void process_abort_rpl(struct sock *sk, struct sk_buff *skb) { struct cpl_abort_rpl_rss *rpl = cplhdr(skb); struct cpl_io_state *cplios = CPL_IO_STATE(sk); if (ma_fail_process_abort_rpl(sk)) goto out; if (rpl->rss_hdr.channel != cplios->tx_c_chan) { cplios_reset_flag(cplios, CPLIOS_TX_WAIT_IDLE); cplios_reset_flag(cplios, CPLIOS_TX_FAILOVER); } if (cplios_flag_nochk(cplios, CPLIOS_ABORT_RPL_PENDING)) { cplios_reset_flag(cplios, CPLIOS_ABORT_RPL_PENDING); if (!cplios_flag_nochk(cplios, CPLIOS_ABORT_REQ_RCVD)) { if (sk->sk_state == TCP_SYN_SENT) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct tom_data *td = TOM_DATA(cplios->toedev); struct tid_info *tids = td->tids; cxgb4_remove_tid(tids, cplios->port_id, GET_TID(rpl), sk->sk_family); sock_put(sk); } t4_release_offload_resources(sk); connection_done(sk); } } out: kfree_skb(skb); } /* * Handle an ABORT_RPL_RSS CPL message. */ static int do_abort_rpl(struct tom_data *td, struct sk_buff *skb) { struct cpl_abort_rpl_rss *rpl = cplhdr(skb); struct sock *sk; unsigned int hwtid = GET_TID(rpl); sk = lookup_tid(td->tids, hwtid); /* * Ignore replies to post-close aborts indicating that the abort was * requested too late. These connections are terminated when we get * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss * arrives the TID is either no longer used or it has been recycled. 
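 * Such replies carry status CPL_ERR_ABORT_FAILED and are discarded outright
 * below.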
*/ if (rpl->status == CPL_ERR_ABORT_FAILED) { discard: kfree_skb(skb); return 0; } /* * Sometimes we've already closed the socket, e.g., a post-close * abort races with ABORT_REQ_RSS, the latter frees the socket * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, * but FW turns the ABORT_REQ into a regular one and so we get * ABORT_RPL_RSS with status 0 and no socket. */ if (!sk) goto discard; process_cpl_msg_ref(process_abort_rpl, sk, skb); return 0; } /* * Convert the status code of an ABORT_REQ into a Linux error code. Also * indicate whether RST should be sent in response. */ static int abort_status_to_errno(struct sock *sk, int abort_reason, int *need_rst) { switch (abort_reason) { case CPL_ERR_BAD_SYN: case CPL_ERR_CONN_RESET: // XXX need to handle SYN_RECV due to crossed SYNs return sk->sk_state == TCP_CLOSE_WAIT ? EPIPE : ECONNRESET; case CPL_ERR_XMIT_TIMEDOUT: case CPL_ERR_PERSIST_TIMEDOUT: case CPL_ERR_FINWAIT2_TIMEDOUT: case CPL_ERR_KEEPALIVE_TIMEDOUT: T4_NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); return ETIMEDOUT; default: return EIO; } } static inline void set_abort_rpl_wr(struct sk_buff *skb, unsigned int tid, int cmd) { struct cpl_abort_rpl *rpl = cplhdr(skb); INIT_TP_WR_MIT_CPL(rpl, CPL_ABORT_RPL, tid); rpl->cmd = cmd; } static void send_deferred_abort_rpl(struct toedev *tdev, struct sk_buff *skb) { struct sk_buff *reply_skb; struct cpl_abort_req_rss *req = cplhdr(skb); reply_skb = alloc_skb_nofail(sizeof(struct cpl_abort_rpl)); __skb_put(reply_skb, sizeof(struct cpl_abort_rpl)); set_abort_rpl_wr(reply_skb, GET_TID(req), (req->status & CPL_ABORT_NO_RST)); set_wr_txq(reply_skb, CPL_PRIORITY_DATA, req->status >> 1); cxgb4_ofld_send(tdev->lldev[0], reply_skb); kfree_skb(skb); } static void send_deferred_tnl(struct toedev *tdev, struct sk_buff *skb) { local_bh_disable(); netif_receive_skb(skb); local_bh_enable(); } static void send_abort_rpl(struct sock *sk, struct sk_buff *skb, struct toedev *tdev, int rst_status, int queue) { struct sk_buff *reply_skb; struct cpl_abort_req_rss *req = cplhdr(skb); reply_skb = get_cpl_reply_skb(skb, sizeof(struct cpl_abort_rpl), gfp_any()); if (!reply_skb) { /* Defer the reply. Stick rst_status into req->cmd. Supports 7-bit tx ofld index */ req->status = (queue << 1) | rst_status; t4_defer_reply(skb, tdev, send_deferred_abort_rpl); return; } set_abort_rpl_wr(reply_skb, GET_TID(req), rst_status); kfree_skb(skb); /* can't use __kfree_skb here */ /* * XXX need to sync with ARP as for SYN_RECV connections we can send * these messages while ARP is pending. For other connection states * it's not a problem. */ set_wr_txq(reply_skb, CPL_PRIORITY_DATA, queue); if (sock_flag(sk, SOCK_OFFLOADED)) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct l2t_entry *e = cplios->l2t_entry; if (e && (sk->sk_state != TCP_SYN_RECV)) { cxgb4_l2t_send(cplios->egress_dev, reply_skb, e); return; } } cxgb4_ofld_send(tdev->lldev[0], reply_skb); } static void cleanup_syn_rcv_conn(struct sock *child, struct sock *parent) { struct request_sock *req = CPL_IO_STATE(child)->passive_reap_next; reqsk_queue_removed(&inet_csk(parent)->icsk_accept_queue, req); synq_remove(child); t4_reqsk_free(req); CPL_IO_STATE(child)->passive_reap_next = NULL; } /* * Performs the actual work to abort a SYN_RECV connection. 
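 * If the listener is still open the embryonic connection is unhooked from
 * its SYN queue and torn down here; if the listener has already gone away
 * the child is torn down only when it was marked CPLIOS_RST_ABORTED.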
*/ static void do_abort_syn_rcv(struct sock *child, struct sock *parent) { /* * If the server is still open we clean up the child connection, * otherwise the server already did the clean up as it was purging * its SYN queue and the skb was just sitting in its backlog. */ if (likely(parent->sk_state == TCP_LISTEN)) { cleanup_syn_rcv_conn(child, parent); /* Without the below call to sock_orphan, * we leak the socket resource with syn_flood test * as inet_csk_destroy_sock will not be called * in tcp_done since SOCK_DEAD flag is not set. * Kernel handles this differently where new socket is * created only after 3 way handshake is done. */ sock_orphan(child); INC_ORPHAN_COUNT(child); t4_release_offload_resources(child); connection_done(child); } else { if (cplios_flag(child, CPLIOS_RST_ABORTED)) { t4_release_offload_resources(child); connection_done(child); } } } /* * This is run from a listener's backlog to abort a child connection in * SYN_RCV state (i.e., one on the listener's SYN queue). */ static void bl_abort_syn_rcv(struct sock *lsk, struct sk_buff *skb) { struct sock *child = skb->sk; int queue = CPL_IO_STATE(child)->txq_idx; skb->sk = NULL; do_abort_syn_rcv(child, lsk); send_abort_rpl(child, skb, BLOG_SKB_CB(skb)->dev, CPL_ABORT_NO_RST, queue); } /* * Handle abort requests for a SYN_RECV connection. These need extra work * because the socket is on its parent's SYN queue. */ static int abort_syn_rcv(struct sock *sk, struct sk_buff *skb) { struct sock *parent; struct toedev *tdev = CPL_IO_STATE(sk)->toedev; struct tom_data *td = TOM_DATA(tdev); const struct request_sock *oreq = CPL_IO_STATE(sk)->passive_reap_next; void *data; struct listen_ctx *listen_ctx; if (!oreq) { printk(KERN_ERR "abort_syn_rcv: sk not on SYN Queue!\n"); return -1; /* somehow we are not on the SYN queue */ } data = lookup_stid(td->tids, oreq->ts_recent); if (!data) { printk(KERN_INFO "abort_syn_rcv: lookup for stid=%u failed\n", oreq->ts_recent); return -1; } listen_ctx = (struct listen_ctx *)data; parent = listen_ctx->lsk; bh_lock_sock(parent); if (!sock_owned_by_user(parent)) { int queue = CPL_IO_STATE(sk)->txq_idx; do_abort_syn_rcv(sk, parent); send_abort_rpl(sk, skb, tdev, CPL_ABORT_NO_RST, queue); } else { skb->sk = sk; BLOG_SKB_CB(skb)->backlog_rcv = bl_abort_syn_rcv; __sk_add_backlog(parent, skb); } bh_unlock_sock(parent); return 0; } /* * Process abort requests. If we are waiting for an ABORT_RPL we ignore this * request except that we need to reply to it. */ static void process_abort_req(struct sock *sk, struct sk_buff *skb) { int rst_status = CPL_ABORT_NO_RST; const struct cpl_abort_req_rss *req = cplhdr(skb); struct cpl_io_state *cplios = CPL_IO_STATE(sk); int queue = cplios->txq_idx; /* * If the Abort is really a "Negative Advice" message from TP * indicating that it's having problems with the connection (multiple * retransmissions, etc.), then let's see if something has changed * like the Path MTU (typically indicated via an ICMP_UNREACH * ICMP_FRAG_NEEDED message from an intermediate router). 
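 * If the path MTU did shrink we lower the connection's maximum segment size
 * via t4_set_maxseg() and carry on; a negative-advice abort never tears the
 * connection down here.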
*/ if (is_neg_adv(req->status)) { struct dst_entry *dst = __sk_dst_get(sk); unsigned int mtu_idx = select_mss(cplios, dst_mtu(dst), 0); if (mtu_idx < cplios->mtu_idx) { t4_set_maxseg(sk, mtu_idx); cplios->mtu_idx = mtu_idx; } if (sk->sk_state == TCP_SYN_RECV) t4_set_tcb_tflag(sk, S_TF_MIGRATING, 0); kfree_skb(skb); return; } cplios_reset_flag(cplios, CPLIOS_ABORT_REQ_RCVD); if (req->rss_hdr.channel != cplios->tx_c_chan) { cplios_reset_flag(cplios, CPLIOS_TX_WAIT_IDLE); cplios_reset_flag(cplios, CPLIOS_TX_FAILOVER); } /* * Send a flowc if not already sent */ if (!cplios_flag_nochk(cplios, CPLIOS_ABORT_SHUTDOWN) && !cplios_flag_nochk(cplios, CPLIOS_TX_DATA_SENT)) { struct tcp_sock *tp = tcp_sk(sk); if (send_tx_flowc_wr(sk, 0, tp->snd_nxt, tp->rcv_nxt) < 0) BUG_ON(1); cplios_set_flag(cplios, CPLIOS_TX_DATA_SENT); } cplios_set_flag(cplios, CPLIOS_ABORT_SHUTDOWN); if (ma_fail_process_abort_req(sk)) goto out; /* * Three cases to consider: * a) We haven't sent an abort_req; close the connection. * b) We have sent a post-close abort_req that will get to TP too late * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will * be ignored and the connection should be closed now. * c) We have sent a regular abort_req that will get to TP too late. * That will generate an abort_rpl with status 0, wait for it. */ if (!cplios_flag_nochk(cplios, CPLIOS_ABORT_RPL_PENDING)) { sk->sk_err = abort_status_to_errno(sk, req->status, &rst_status); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_error_report(sk); /* * SYN_RECV needs special processing. If abort_syn_rcv() * returns 0 is has taken care of the abort. */ if (sk->sk_state == TCP_SYN_RECV && !abort_syn_rcv(sk, skb)) return; t4_release_offload_resources(sk); connection_done(sk); } out: send_abort_rpl(sk, skb, BLOG_SKB_CB(skb)->dev, rst_status, queue); } /* * Handle an ABORT_REQ_RSS CPL message. */ static int do_abort_req(struct tom_data *td, struct sk_buff *skb) { const struct cpl_abort_req_rss *req = cplhdr(skb); unsigned int hwtid = GET_TID(req); struct sock *sk; sk = lookup_tid(td->tids, hwtid); VALIDATE_SOCK(sk); /* * Save the offload device in the skb, we may process this message * after the socket has closed. */ BLOG_SKB_CB(skb)->dev = CPL_IO_STATE(sk)->toedev; process_cpl_msg_ref(process_abort_req, sk, skb); return 0; } static void pass_open_abort(struct sock *child, struct sock *parent, struct sk_buff *skb) { do_abort_syn_rcv(child, parent); kfree_skb(skb); } /* * Runs from a listener's backlog to abort a child connection that had an * ARP failure. */ static void bl_pass_open_abort(struct sock *lsk, struct sk_buff *skb) { pass_open_abort(skb->sk, lsk, skb); } static void handle_pass_open_arp_failure(struct sock *sk, struct sk_buff *skb) { struct sock *parent; const struct request_sock *oreq; void *data; const struct tom_data *d = TOM_DATA(CPL_IO_STATE(sk)->toedev); /* * If the connection is being aborted due to the parent listening * socket going away there's nothing to do, the ABORT_REQ will close * the connection. */ if (cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING)) { kfree_skb(skb); return; } oreq = CPL_IO_STATE(sk)->passive_reap_next; data = lookup_stid(d->tids, oreq->ts_recent); parent = ((struct listen_ctx *)data)->lsk; bh_lock_sock(parent); if (!sock_owned_by_user(parent)) pass_open_abort(sk, parent, skb); else { BLOG_SKB_CB(skb)->backlog_rcv = bl_pass_open_abort; __sk_add_backlog(parent, skb); } bh_unlock_sock(parent); } /* * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. 
This is treated similarly * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV * connection. */ static void pass_accept_rpl_arp_failure(void *handle, struct sk_buff *skb) { T4_TCP_INC_STATS_BH(sock_net(skb->sk), TCP_MIB_ATTEMPTFAILS); BLOG_SKB_CB(skb)->dev = CPL_IO_STATE(skb->sk)->toedev; process_cpl_msg_ref(handle_pass_open_arp_failure, skb->sk, skb); } #if defined(ROUTE_REQ) static struct dst_entry *route_req(struct sock *sk, struct open_request *req) { struct rtable *rt; struct flowi fl = { .oif = sk->sk_bound_dev_if, .nl_u = { .ip4_u = { .daddr = req->af.v4_req.rmt_addr, .saddr = req->af.v4_req.loc_addr, .tos = RT_CONN_FLAGS(sk)}}, .proto = IPPROTO_TCP, .uli_u = { .ports = { .sport = inet_sk(sk)->inet_sport, .dport = req->rmt_port}} }; if (ip_route_output_flow(&rt, &fl, sk, 0)) { IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); return NULL; } return &rt->u.dst; } #endif /* * Create a new socket as a child of the listening socket 'lsk' and initialize * with the information in the supplied PASS_ACCEPT_REQ message. * * 'retry' indicates to the caller whether a failure is device-related and the * connection should be passed to the host stack, or connection-related and * the connection request should be rejected. */ static struct sock *mk_pass_sock(struct sock *lsk, struct toedev *tdev, struct net_device *edev, int tid, const struct cpl_pass_accept_req *req, void *network_hdr, struct request_sock *oreq, int *retry, const struct offload_settings *s) { struct sock *newsk; struct cpl_io_state *newcplios; struct sk_ofld_proto *oproto; struct l2t_entry *e; struct dst_entry *dst=NULL; struct tcp_sock *newtp; struct net_device *egress = NULL; struct tom_data *d = TOM_DATA(tdev); const struct iphdr *iph = (const struct iphdr *)network_hdr; struct tcphdr *tcph; struct inet_sock *newinet; struct neighbour *neigh = NULL; struct toe_hash_params hash_params; #ifdef CONFIG_TCPV6_OFFLOAD struct tcp6_sock *newtcp6sk; struct ipv6_pinfo *newnp, *np = inet6_sk(lsk); inet6_request_sock_t *treq; #endif *retry = 0; if (!oreq) goto out_err; #ifdef CONFIG_SECURITY_NETWORK if (security_inet_conn_request(lsk, tcphdr_skb, oreq)) goto free_or; #endif newsk = tcp_create_openreq_child(lsk, oreq, tcphdr_skb); if (!newsk) goto free_or; if (lsk->sk_family == AF_INET) { dst = inet_csk_route_child_sock(lsk, newsk, oreq); if (!dst) goto free_sk; tcph = (struct tcphdr *)(iph + 1); neigh = t4_dst_neigh_lookup(dst, &iph->saddr); if (neigh) { init_toe_hash_params(&hash_params, neigh->dev, neigh, iph->saddr, iph->daddr, tcph->source, tcph->dest, NULL, NULL, false, IPPROTO_TCP); egress = offload_get_phys_egress(&hash_params, TOE_OPEN); if (!egress || !netdev_is_offload(egress) || (TOEDEV(egress) != tdev)) { t4_dst_neigh_release(neigh); goto free_dst; } } else { printk(KERN_INFO "mk_pass_sock: dst->_neighbour is NULL\n"); goto free_dst; } } #if defined(CONFIG_TCPV6_OFFLOAD) else { struct flowi6 fl6; const struct ipv6hdr *ip6h = (const struct ipv6hdr *)network_hdr; tcph = (struct tcphdr *)(ip6h + 1); memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.saddr = ip6h->daddr; fl6.daddr = ip6h->saddr; fl6.fl6_dport = inet_rsk(oreq)->ir_rmt_port; fl6.fl6_sport = t4_get_req_lport(oreq); if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL) fl6.flowi6_oif = edev->ifindex; inet6_rsk(oreq)->ir_iif = fl6.flowi6_oif; security_req_classify_flow(oreq, flowi6_to_flowi(&fl6)); dst = ip6_dst_lookup_flow_compat(lsk, &fl6, NULL, false); if (IS_ERR(dst)) goto free_sk; neigh = t4_dst_neigh_lookup(dst, &ip6h->saddr); if 
(neigh) { init_toe_hash_params(&hash_params, neigh->dev, neigh, 0, 0, tcph->source, tcph->dest, (__be32*)&ip6h->saddr, (__be32*)&ip6h->daddr, true, IPPROTO_TCP); egress = offload_get_phys_egress(&hash_params, TOE_OPEN); if (!egress || !netdev_is_offload(egress) || (TOEDEV(egress) != tdev)) { t4_dst_neigh_release(neigh); goto free_dst; } } else { printk(KERN_INFO "mk_pass_sock: dst->_neighbour is NULL\n"); goto free_dst; } } #endif e = cxgb4_l2t_get(d->lldi->l2t, neigh, egress , lsk->sk_priority); t4_dst_neigh_release(neigh); if (!e) { /* out of HW resources */ goto free_dst; } newcplios = kzalloc(sizeof *newcplios, GFP_ATOMIC); if (!newcplios) goto free_l2t; oproto = kzalloc(sizeof(*oproto), GFP_ATOMIC); if (!oproto) { kfree(newcplios); goto free_l2t; } newcplios->txdata_skb_cache = alloc_skb(TXDATA_SKB_LEN, GFP_ATOMIC); if (!newcplios->txdata_skb_cache) { kfree(newcplios); goto free_l2t; } kref_init(&newcplios->kref); newcplios->sk = newsk; if (sock_flag(newsk, SOCK_KEEPOPEN)) inet_csk_delete_keepalive_timer(newsk); oreq->ts_recent = G_PASS_OPEN_TID(ntohl(req->tos_stid)); newcplios->tx_c_chan = cxgb4_port_chan(egress); newcplios->rx_c_chan = cxgb4_port_e2cchan(egress); sk_setup_caps(newsk, dst); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); newcplios->egress_dev = egress; if (unlikely(newsk->sk_user_data && check_special_data_ready(newsk) > 0)) sock_set_flag(newsk, SOCK_NO_DDP); init_offload_sk(newcplios, tdev, tid, e, dst, egress, s, ntohs(req->tcpopt.mss)); newcplios->passive_reap_next = oreq; newcplios->delack_seq = newtp->rcv_nxt; ma_fail_mk_pass_sock(newcplios); RCV_WSCALE(newtp) = select_rcv_wscale(tcp_full_space(newsk), WSCALE_OK(newtp), newtp->window_clamp); if (iph->version == 0x4) { newinet->inet_daddr = iph->saddr; newinet->inet_rcv_saddr = iph->daddr; newinet->inet_saddr = iph->daddr; } #ifdef CONFIG_TCPV6_OFFLOAD else if (iph->version == 0x6) { newtcp6sk = (struct tcp6_sock *)newsk; inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; np = inet6_sk(lsk); newnp = inet6_sk(newsk); treq = inet6_rsk(oreq); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); inet6_sk_daddr(newsk) = treq->ir_v6_rmt_addr; inet6_sk_saddr(newsk) = treq->ir_v6_loc_addr; inet6_sk_rcv_saddr(newsk) = treq->ir_v6_loc_addr; t4_set_inet_sock_opt(newinet, NULL); newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newsk->sk_bound_dev_if = treq->ir_iif; newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; newinet->inet_rcv_saddr = LOOPBACK4_IPV6; } #endif lsk->sk_prot->hash(newsk); t4_inet_inherit_port(&tcp_hashinfo, lsk, newsk); install_offload_ops(newcplios, oproto); bh_unlock_sock(newsk); // counters tcp_create_openreq_child() if (lsk->sk_family != AF_INET) if (cxgb4_clip_get(newcplios->egress_dev, (const u32 *)((&inet6_sk_saddr(newsk))->s6_addr), 1)) goto free_l2t; return newsk; free_l2t: cxgb4_l2t_release(e); free_dst: *retry = 1; dst_release(dst); free_sk: inet_csk_prepare_forced_close(newsk); tcp_done(newsk); free_or: t4_reqsk_free(oreq); out_err: return NULL; } static void offload_req_from_pass_accept_req(struct offload_req *oreq, const struct cpl_pass_accept_req *req, const struct tcphdr *tcph, const struct sk_buff *skb, const struct sock *listen_sk) { unsigned int ipvers; if (listen_sk->sk_family == PF_INET) { const struct iphdr *iph = (struct iphdr *)skb_network_header(skb); ipvers = 4; oreq->sip[0] = iph->saddr; oreq->dip[0] = iph->daddr; oreq->sip[1] = oreq->sip[2] = oreq->sip[3] = 0; oreq->dip[1] = oreq->dip[2] = oreq->dip[3] = 0; } else { ipvers = 6; oreq->sip[0] = ipv6_hdr(skb)->saddr.s6_addr32[0]; 
oreq->sip[1] = ipv6_hdr(skb)->saddr.s6_addr32[1]; oreq->sip[2] = ipv6_hdr(skb)->saddr.s6_addr32[2]; oreq->sip[3] = ipv6_hdr(skb)->saddr.s6_addr32[3]; oreq->dip[0] = ipv6_hdr(skb)->daddr.s6_addr32[0]; oreq->dip[1] = ipv6_hdr(skb)->daddr.s6_addr32[1]; oreq->dip[2] = ipv6_hdr(skb)->daddr.s6_addr32[2]; oreq->dip[3] = ipv6_hdr(skb)->daddr.s6_addr32[3]; } oreq->dport = tcph->dest; oreq->sport = tcph->source; oreq->ipvers_opentype = (OPEN_TYPE_PASSIVE << 4) | ipvers; oreq->tos = G_PASS_OPEN_TOS(ntohl(req->tos_stid)); oreq->vlan = req->vlan ? req->vlan & htons(VLAN_VID_MASK) : htons(CPL_L2T_VLAN_NONE); #ifdef SO_MARK oreq->mark = listen_sk->sk_mark; #else oreq->mark = 0; #endif } static u32 resolve_options(u32 my_opt2, const struct cpl_pass_accept_req *req) { if (!req->tcpopt.tstamp) my_opt2 &= ~F_TSTAMPS_EN; if (!req->tcpopt.sack) my_opt2 &= ~F_SACK_EN; if (req->tcpopt.wsf > 14) my_opt2 &= ~F_WND_SCALE_EN; return my_opt2; } #ifdef WD_TOE /* * Find which "wdtoe_device" a passive connection request (SYN) belongs to, * according to the listening port. */ static int wdtoe_find_listen_dev_new(struct wdtoe_listen_device *t, int *idx, int listen_port) { int i; for (i = 0; i < NWDTOECONN; i++) { if (t[i].in_use == 1 && t[i].listen_port == listen_port) { *idx = t[i].idx_dev; return 0; } } return -1; } #endif #ifdef WD_TOE /* * Check if an entry is already in the table */ static int wdtoe_passive_tuple_exists(struct passive_tuple *c, unsigned int stid, __u32 pip, __u16 pport) { int i; for (i = 0; i < NWDTOECONN; i++) { if (c[i].in_use && c[i].stid == stid && c[i].pip == pip && c[i].pport == pport) return 1; } return 0; } #endif #ifdef WD_TOE static struct passive_tuple *wdtoe_get_free_passive_tuple_slot( struct passive_tuple *c, unsigned short *idx) { int i; for (i = 0; i < NWDTOECONN; i++) { if (!c[i].in_use) { *idx = i; return &c[i]; } } return NULL; } #endif #ifdef WD_TOE static int wdtoe_insert_passive_tuple(struct passive_tuple *c, unsigned int stid, __u32 pip, __u16 pport) { int ret; unsigned short idx; struct passive_tuple *free_slot; ret = wdtoe_passive_tuple_exists(c, stid, pip, pport); if (ret) return -1; free_slot = wdtoe_get_free_passive_tuple_slot(c, &idx); if (!free_slot) return -1; free_slot->stid = stid; free_slot->pip = pip; free_slot->pport = pport; free_slot->in_use = 1; return idx; } #endif #ifdef WD_TOE /* insert tid into the existing entry in the passive tuple */ static int wdtoe_insert_passive_tuple_tid(struct passive_tuple *c, unsigned int tid, __u32 pip, __u16 pport) { int i; for (i = 0; i < NWDTOECONN; i++) { if(c[i].pip == pip && c[i].pport == pport && c[i].in_use == 1) { /* we found the entry, now update the tid and exit */ c[i].tid = tid; return 0; } } return -1; } #endif /* * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket * lock held. Note that the sock here is a listening socket that is not owned * by the TOE. 
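 * Roughly: allocate the CPL_PASS_ACCEPT_RPL reply, parse the Ethernet/IP/TCP
 * headers embedded in the CPL, consult the offload policy, validate the
 * route and any VLAN, build a request_sock, create the child socket via
 * mk_pass_sock(), and finally send the accept reply through the L2T entry.
 * Failures either hand the SYN to the host stack ("defer") or drop it and
 * release the TID ("reject").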
*/ static void process_pass_accept_req(struct sock *sk, struct sk_buff *skb) { int rt_flags; int pass2host, ret; struct sock *newsk; struct cpl_io_state *cplios; struct l2t_entry *e; struct offload_req orq; struct offload_settings settings; struct sk_buff *reply_skb; struct cpl_pass_accept_rpl *rpl; struct cpl_pass_accept_req *req = cplhdr(skb); #ifdef WD_TOE unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_stid)); #endif unsigned int tid = GET_TID(req); struct toedev *tdev = BLOG_SKB_CB(skb)->dev; struct tom_data *d = TOM_DATA(tdev); struct ethhdr *eh; struct vlan_ethhdr *vlan_eh = NULL; struct iphdr *iph; struct ipv6hdr *ip6h; struct tcphdr *tcph; struct request_sock *oreq = NULL; struct net_device *egress_dev; void *network_hdr; u16 vlan_tag, vlan_id, eth_hdr_len; struct cpl_t5_pass_accept_rpl *rpl5 = NULL; struct net_device *master = NULL; struct net_device *vlan_dev = NULL; __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ bool th_ecn, ect, ecn_ok; newsk = lookup_tid(d->tids, tid); if (newsk) { printk(KERN_ERR "%s: tid (%d) already in use\n", __func__, tid); goto out; } if (is_t4(d->lldi->adapter_type)) reply_skb = alloc_skb(sizeof(*rpl), GFP_ATOMIC); else reply_skb = alloc_skb( roundup(sizeof(*rpl5), 16), GFP_ATOMIC); if (unlikely(!reply_skb)) { cxgb4_remove_tid(d->tids, 0, tid, sk->sk_family); kfree_skb(skb); goto out; } if (sk->sk_state != TCP_LISTEN) goto reject; skb->dev = egress_dev = d->egr_dev[G_SYN_INTF(ntohs(req->l2info))]; if (CHELSIO_CHIP_VERSION(d->lldi->adapter_type) <= CHELSIO_T5) eth_hdr_len = G_ETH_HDR_LEN(ntohl(req->hdr_len)); else /* T6 and later */ eth_hdr_len = G_T6_ETH_HDR_LEN(ntohl(req->hdr_len)); if (eth_hdr_len == ETH_HLEN) { eh = (struct ethhdr *)(req + 1); iph = (struct iphdr *)(eh + 1); ip6h = (struct ipv6hdr *)(eh + 1); network_hdr = (void *)(eh + 1); } else { vlan_eh = (struct vlan_ethhdr *)(req + 1); iph = (struct iphdr *)(vlan_eh + 1); ip6h = (struct ipv6hdr *)(vlan_eh + 1); network_hdr = (void *)(vlan_eh + 1); } if (iph->version == 0x4) { tcph = (struct tcphdr *)(iph + 1); skb_set_network_header(skb, (void *)iph - (void *)req); } else { tcph = (struct tcphdr *)(ip6h + 1); skb_set_network_header(skb, (void *)ip6h - (void *)req); } /* * See if we have a Connection Offload Policy -- user-specified or * default -- which allows this connection to be offloaded. If not, * we'll defer to the Host Stack. */ offload_req_from_pass_accept_req(&orq, req, tcph, skb, sk); rcu_read_lock(); if (!rcu_access_pointer(tdev->can_offload) || !tdev->can_offload(tdev, sk)) { rcu_read_unlock(); goto reject; } settings = *lookup_ofld_policy(tdev, &orq, d->conf.cop_managed_offloading); if (!settings.offload) { rcu_read_unlock(); goto defer; } if (netif_is_bond_slave(egress_dev)) master = netdev_master_upper_dev_get_rcu(egress_dev); vlan_tag = ntohs(req->vlan); vlan_id = vlan_tag & VLAN_VID_MASK; if (vlan_id != CPL_L2T_VLAN_NONE && vlan_id) { if (master) vlan_dev = __vlan_find_dev_deep_ctag(master, vlan_id); else vlan_dev = __vlan_find_dev_deep_ctag(egress_dev, vlan_id); if (!vlan_dev) { /* Hmm.. we have a vlan id on packet, and we don't have * corresponding vlan device on host! Reject. */ rcu_read_unlock(); goto reject; } egress_dev = vlan_dev; } rcu_read_unlock(); if (inet_csk_reqsk_queue_is_full(sk)) goto reject; if (sk_acceptq_is_full(sk) && d->conf.soft_backlog_limit) goto reject; if (master && !vlan_dev) skb->dev = master; else skb->dev = egress_dev; /* * If this isn't a SYN destined to us, let the Host Stack figure it * out. 
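 * (For IPv4 the input route must come back RTCF_LOCAL; broadcast,
 * multicast or forwarded destinations are rejected below.)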
*/ if ((iph->version == 0x4) && ip_route_input(skb, iph->daddr, iph->saddr, G_PASS_OPEN_TOS(ntohl(req->tos_stid)), skb->dev)) goto defer; #if defined(CONFIG_TCPV6_OFFLOAD) if (iph->version == 0x6) { ip6_route_input(skb); if (skb_dst(skb)->error) goto defer; dst_release(skb_dst(skb)); skb_dst_set(skb, NULL); } #endif skb->dev = egress_dev; if ((iph->version == 0x4) && skb_rtable(skb)) { rt_flags = skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); dst_release(skb_dst(skb)); // done with the input route, release it skb_dst_set(skb, NULL); if (rt_flags != RTCF_LOCAL) goto reject; } if (master) { ret = toe_enslave(master, egress_dev); if (ret) goto defer; } if (iph->version == 0x4) oreq = inet_reqsk_alloc(&t4_rsk_ops, sk); #if defined(CONFIG_TCPV6_OFFLOAD) else oreq = inet6_reqsk_alloc(&t4_rsk6_ops, sk); #endif if (!oreq) goto reject; /* * The newly allocated oreq returned from above is mostly * uninitialized. Most of this initialization echoes the work done in * tcp_openreq_init(). Note that it's important to get these zero'ed * out since they're used in tcp_create_openreq_child() to initialize * various TCP Socket fields which can lead to confusion later on in * this code. For instance, if oreq->window_clamp contains a non-zero * (junk) value, that'll get assigned to the TCP Socket Window Clamp * field and later we'll think that was the desired (random value) * Window Clamp ... */ oreq->rsk_rcv_wnd = 0; oreq->rsk_window_clamp = 0; oreq->cookie_ts = 0; oreq->mss = 0; oreq->ts_recent = 0; tcp_rsk(oreq)->tfo_listener = false; tcp_rsk(oreq)->rcv_isn = ntohl(tcph->seq); t4_set_req_port(oreq, tcph->source, tcph->dest); inet_rsk(oreq)->ecn_ok = 0; if (iph->version == 0x4) { t4_set_req_addr(oreq, iph->daddr, iph->saddr); ip_dsfield = ipv4_get_dsfield(iph); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) } else { inet6_rsk(oreq)->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; inet6_rsk(oreq)->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; ip_dsfield = ipv6_get_dsfield(ipv6_hdr(skb)); #endif } t4_set_req_opt(oreq, NULL); if (req->tcpopt.wsf <= 14 && tcp_win_scaling_enabled()) { inet_rsk(oreq)->wscale_ok = 1; inet_rsk(oreq)->snd_wscale = req->tcpopt.wsf; } /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(oreq)->ir_iif = sk->sk_bound_dev_if; th_ecn = tcph->ece && tcph->cwr; if (th_ecn) { ect = !INET_ECN_is_not_ect(ip_dsfield); ecn_ok = tcp_ecn_enabled(sock_net(sk)); if ((!ect && ecn_ok) || tcp_ca_needs_ecn(sk)) inet_rsk(oreq)->ecn_ok = 1; } newsk = mk_pass_sock(sk, tdev, egress_dev, tid, req, network_hdr, oreq, &pass2host, &settings); if (!newsk) { if (pass2host) goto defer; else goto reject; } inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); synq_add(sk, newsk); /* Don't get a reference, newsk starts out with ref count 2 */ cxgb4_insert_tid(d->tids, newsk, tid, newsk->sk_family); cplios = CPL_IO_STATE(newsk); cplios->idr = bh_insert_handle(d, newsk, tid); reply_skb->sk = newsk; t4_set_arp_err_handler(reply_skb, NULL, pass_accept_rpl_arp_failure); e = cplios->l2t_entry; cplios->smac_idx = cxgb4_tp_smt_idx(d->lldi->adapter_type, cxgb4_port_viid(cplios->egress_dev)); if (is_t4(d->lldi->adapter_type)) { rpl = (struct cpl_pass_accept_rpl *)__skb_put(reply_skb, sizeof(*rpl)); INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); } else { rpl5 = (struct cpl_t5_pass_accept_rpl *)__skb_put(reply_skb, roundup(sizeof(*rpl5), 16)); rpl = (struct cpl_pass_accept_rpl *)rpl5; INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); } #ifdef WD_TOE /* If SO_PRIORITY is set, we 
think it's a WD-TOE connection */ if (is_wdtoe(sk)) { int lport; int idx_dev = 0; int ret; struct wdtoe_device *wd_dev; __u32 pip; __u16 pport; int idx; pip = be32_to_cpu(inet_rsk(oreq)->ir_rmt_addr); pport = be16_to_cpu(inet_rsk(oreq)->ir_rmt_port); idx = wdtoe_insert_passive_tuple(passive_conn_tuple, stid, pip, pport); if (idx == -1) printk(KERN_ERR "[wdtoe] %s: unable to insert tuple in " "'passive_conn_tuple' array\n", __func__); /* now we need to insert "tid" into the ntuple table */ ret = wdtoe_insert_passive_tuple_tid( passive_conn_tuple, tid, pip, pport); if (ret < 0) printk(KERN_ERR "[wdtoe] %s: could not insert tid for " "pip [%#x], pport [%u]\n", __func__, pip, pport); /* get the wdtoe device according to the local port */ lport = ntohs(inet_sk(sk)->inet_sport); ret = wdtoe_find_listen_dev_new(listen_table, &idx_dev, lport); if (ret != 0) { printk(KERN_ERR "[wdtoe] %s: could not get the listening " "wd_dev for port [%d]\n", __func__, lport); /* XXX error out or use toe's opt2? */ goto toe; } wd_dev = wdtoe_dev_table[idx_dev].wd_dev; cplios->opt2 = resolve_options( wdtoe_calc_opt2(newsk, &settings, wd_dev), req); } else { toe: #endif cplios->opt2 = resolve_options(cplios->opt2, req); #ifdef WD_TOE } /* end of the branch if a connection is WD-TOE */ #endif /* * Because we could have changed our TCP Timestamp option for this * connection in resolve_options(), we need to see if we want a new TP * MTU Index. Note that this is used in calc_opt0() ... */ cplios->mtu_idx = select_mss(cplios, dst_mtu(__sk_dst_get(newsk)), ntohs(req->tcpopt.mss)); rpl->opt0 = cpu_to_be64(calc_opt0(newsk, settings.nagle) | V_ACCEPT_MODE(0) | V_L2T_IDX(e->idx) | V_SMAC_SEL(cplios->smac_idx) | V_TX_CHAN(cplios->tx_c_chan)); rpl->opt2 = htonl(cplios->opt2); if (CHELSIO_CHIP_VERSION(d->lldi->adapter_type) > CHELSIO_T4) { memset(&rpl5->iss, 0, roundup(sizeof(*rpl5)-sizeof(*rpl), 16)); if (iph->version == 0x4) rpl5->iss = cpu_to_be32((secure_tcp_sequence_number_offload( inet_sk(sk)->inet_daddr, inet_sk(sk)->inet_saddr, inet_sk(sk)->inet_dport, inet_sk(sk)->inet_sport) & ~7U) - 1); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else rpl5->iss = cpu_to_be32((secure_tcpv6_sequence_number( ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32, inet_sk(sk)->inet_dport, inet_sk(sk)->inet_sport) & ~7U) - 1); #endif /* TODO */ //if (is_t6(d->lldi->adapter_type)) // rpl5->opt3 = ?; } set_wr_txq(reply_skb, CPL_PRIORITY_SETUP, cplios->port_id); cxgb4_l2t_send(cplios->egress_dev, reply_skb, e); kfree_skb(skb); return; defer: mk_tid_release(reply_skb, 0, tid); cxgb4_ofld_send(tdev->lldev[0], reply_skb); skb->dev = d->egr_dev[G_SYN_INTF(ntohs(req->l2info))]; __skb_pull(skb, sizeof(*req)); skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->protocol = eth_type_trans(skb, skb->dev); t4_defer_reply(skb, tdev, send_deferred_tnl); return; reject: mk_tid_release(reply_skb, 0, tid); cxgb4_ofld_send(tdev->lldev[0], reply_skb); kfree_skb(skb); out: T4_TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); } /* * Handle a CPL_PASS_ACCEPT_REQ message. 
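 * Look up the server TID (stid) from the message to find its listen_ctx,
 * sanity-check the hardware TID, and hand the skb to
 * process_pass_accept_req() under the listening socket's lock via
 * process_cpl_msg().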
*/ static int do_pass_accept_req(struct tom_data *td, struct sk_buff *skb) { struct cpl_pass_accept_req *req = cplhdr(skb); unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_stid)); unsigned int tid = GET_TID(req); void *data; struct listen_ctx *ctx; struct sock *lsk; data = lookup_stid(td->tids, stid); if (!data) { printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %u\n", td->tdev.name, stid); return CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE; } ctx = (struct listen_ctx *)data; lsk = ctx->lsk; #if VALIDATE_TID if (unlikely(tid >= td->tids->ntids)) { printk(KERN_ERR "%s: passive open TID %u too large\n", td->tdev.name, tid); return CPL_RET_BUF_DONE; } #endif BLOG_SKB_CB(skb)->dev = &td->tdev; process_cpl_msg(process_pass_accept_req, lsk, skb); return 0; } static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos, enum chip_type type) { u32 l2info; u16 vlantag, len, hdr_len, eth_hdr_len; u8 intf; struct cpl_rx_pkt *cpl = cplhdr(skb); struct cpl_pass_accept_req *req; struct tcp_options_received tmp_opt; u8 *hash_location; /* Store values from cpl_rx_pkt in temporary location. */ vlantag = cpl->vlan; len = cpl->len; l2info = cpl->l2info; hdr_len = cpl->hdr_len; intf = cpl->iff; __skb_pull(skb , sizeof(struct cpl_pass_accept_req)); /* We need to parse the TCP options from SYN packet. * to generate cpl_pass_accept_req. */ memset(&tmp_opt, 0, sizeof tmp_opt); tcp_clear_options(&tmp_opt); t4_tcp_parse_options(skb, &tmp_opt, &hash_location, 0); req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req)); memset(req, 0, sizeof(*req)); req->l2info = cpu_to_be16(V_SYN_INTF(intf) | V_SYN_MAC_IDX(G_RX_MACIDX(htonl(l2info))) | F_SYN_XACT_MATCH); if (CHELSIO_CHIP_VERSION(type) <= CHELSIO_T5) { eth_hdr_len = is_t4(type) ? G_RX_ETHHDR_LEN(htonl(l2info)) : G_RX_T5_ETHHDR_LEN(htonl(l2info)); req->hdr_len = cpu_to_be32(V_SYN_RX_CHAN(G_RX_CHAN(htonl(l2info))) | V_TCP_HDR_LEN(G_RX_TCPHDR_LEN(htons(hdr_len))) | V_IP_HDR_LEN(G_RX_IPHDR_LEN(htons(hdr_len))) | V_ETH_HDR_LEN(eth_hdr_len)); } else { /* T6 and later */ eth_hdr_len = G_RX_T6_ETHHDR_LEN(htonl(l2info)); req->hdr_len = cpu_to_be32(V_SYN_RX_CHAN(G_RX_CHAN(htonl(l2info))) | V_T6_TCP_HDR_LEN(G_RX_TCPHDR_LEN(htons(hdr_len))) | V_T6_IP_HDR_LEN(G_RX_IPHDR_LEN(htons(hdr_len))) | V_T6_ETH_HDR_LEN(eth_hdr_len)); } req->vlan = vlantag; req->len = len; req->tos_stid = cpu_to_be32(V_PASS_OPEN_TID(stid) | V_PASS_OPEN_TOS(tos)); req->tcpopt.mss = htons(tmp_opt.mss_clamp); if (tmp_opt.wscale_ok) req->tcpopt.wsf = tmp_opt.snd_wscale; req->tcpopt.tstamp = tmp_opt.saw_tstamp; if (tmp_opt.sack_ok) req->tcpopt.sack = 1; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ, 0)); return; } /* * Handler for CPL_RX_PKT message. Need to handle cpl_rx_pkt * messages when a filter is being used instead of server to * redirect a syn packet. When packets hit filter they are redirected * to the offload queue and driver tries to establish the connection * using firmware work request. 
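 * The SYN arrives here as a raw frame, so we synthesize a
 * CPL_PASS_ACCEPT_REQ around it (build_cpl_pass_accept_req()) and request
 * a TID from firmware; the FW6_TYPE_OFLD_CONNECTION_WR_RPL that comes back
 * (see do_fw6_msg()) feeds the synthesized CPL into the normal passive
 * open path.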
*/ static int do_rx_pkt(struct tom_data *td, struct sk_buff *skb) { int stid; unsigned int filter; struct ethhdr *eh; struct vlan_ethhdr *vlan_eh = NULL; struct iphdr *iph; struct tcphdr *tcph; struct request_sock *oreq = NULL; struct cpl_rx_pkt *cpl = cplhdr(skb); struct cpl_pass_accept_req *req = cplhdr(skb); struct toedev *tdev = &td->tdev; struct toe_hash_params hash_params; struct listen_ctx *ctx; struct sock *lsk; struct l2t_entry *e; struct dst_entry *dst=NULL; struct net_device *egress; struct cpl_io_state cplios; void *data; u8 tos = 0; u16 window, eth_hdr_len; struct neighbour *neigh = NULL; int iff = cpl->iff; skb->dev = td->egr_dev[iff]; /* Tunnel all non-SYN packets */ if (!(cpl->l2info & cpu_to_be32(F_RXF_SYN))) goto reject; /* Tunnel all packets which did not hit the filter. * Unlikely to happen. */ if (!(cpl->rss_hdr.filter_hit && cpl->rss_hdr.filter_tid)) goto reject; /* * Ignore new connection requests (as of now) if we are in the middle * of shutdown */ if (toedev_in_shutdown(tdev)) { dev_kfree_skb(skb); return 0; } /* Calculate the server tid from filter hit index from cpl_rx_pkt. */ stid = cpu_to_be32(cpl->rss_hdr.hash_val); data = lookup_stid(td->tids, stid); if (!data) { printk(KERN_ERR "%s: do_rx_pkt had unknown STID %u\n", td->tdev.name, stid); goto reject; } ctx = (struct listen_ctx *)data; lsk = ctx->lsk; if (CHELSIO_CHIP_VERSION(td->lldi->adapter_type) <= CHELSIO_T5) eth_hdr_len = is_t4(td->lldi->adapter_type) ? G_RX_ETHHDR_LEN(htonl(cpl->l2info)) : G_RX_T5_ETHHDR_LEN(htonl(cpl->l2info)); else /* T6 and later */ eth_hdr_len = G_RX_T6_ETHHDR_LEN(htonl(cpl->l2info)); if (eth_hdr_len == ETH_HLEN) { eh = (struct ethhdr *)(req + 1); iph = (struct iphdr *)(eh + 1); if (cpl->vlan_ex) __vlan_hwaccel_put_ctag(skb, ntohs(cpl->vlan)); else cpl->vlan = htons(CPL_L2T_VLAN_NONE); } else { vlan_eh = (struct vlan_ethhdr *)(req + 1); iph = (struct iphdr *)(vlan_eh + 1); skb->vlan_tci = ntohs(vlan_eh->h_vlan_TCI); cpl->vlan = vlan_eh->h_vlan_TCI; } BUG_ON(iph->version != 0x4); tos = iph->tos; tcph = (struct tcphdr *)(iph + 1); skb_set_network_header(skb, (void *)iph - (void *)req); skb_set_transport_header(skb, (void *)tcph - (void *)req); oreq = inet_reqsk_alloc(&t4_rsk_ops, lsk); if (!oreq) goto reject; window = htons(tcph->window); tcp_rsk(oreq)->rcv_isn = ntohl(tcph->seq); t4_set_req_port(oreq, tcph->source, tcph->dest); t4_set_req_addr(oreq, iph->daddr, iph->saddr); t4_set_req_opt(oreq, NULL); dst = route_req(lsk, oreq); if (!dst) goto free_or; neigh = t4_dst_neigh_lookup(dst, &inet_sk(lsk)->inet_daddr); if (!neigh) { printk(KERN_INFO "%s: dst->_neighbour is NULL\n", __func__); goto free_dst; } memset(&cplios, 0, sizeof(struct cpl_io_state)); if (neigh) { init_toe_hash_params(&hash_params, neigh->dev, neigh, iph->saddr, iph->daddr, tcph->source, tcph->dest, NULL, NULL, false, IPPROTO_TCP); egress = offload_get_phys_egress(&hash_params, TOE_OPEN); if (!egress || !netdev_is_offload(egress) || (TOEDEV(egress) != tdev)) { t4_dst_neigh_release(neigh); goto free_dst; } cplios.toedev = tdev; cplios.egress_dev = egress; ma_fail_do_rx_pkt_init(&cplios); e = cxgb4_l2t_get(td->lldi->l2t, neigh, egress , lsk->sk_priority); t4_dst_neigh_release(neigh); if (!e) goto free_dst; } else { printk(KERN_INFO "do_rx_pkt: dst->_neighbour is NULL\n"); goto free_dst; } /* Calculate filter portion for LE region. */ filter = cpu_to_be32(cxgb4_select_ntuple(cplios.egress_dev, e)); /* Synthesize cpl_pass_accept_req. We have everything except the TID. 
* Once firmware sends a reply with TID we update the TID field in cpl * and pass it through regular cpl_pass_accept_req process in driver. */ build_cpl_pass_accept_req(skb, stid, tos, td->lldi->adapter_type); if (mk_fw_pass_open_req(td, skb, oreq, filter, window, e, &cplios) < 0) goto free_l2t; cxgb4_l2t_release(e); dst_release(dst); t4_reqsk_free(oreq); return 0; free_l2t: cxgb4_l2t_release(e); free_dst: dst_release(dst); free_or: t4_reqsk_free(oreq); reject: if (ma_fail_do_rx_pkt(td, skb)) return 0; __skb_pull(skb , sizeof(struct cpl_pass_accept_req)); skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->protocol = eth_type_trans(skb, skb->dev); if (unlikely(cpl->vlan_ex)) __vlan_hwaccel_put_ctag(skb, ntohs(cpl->vlan)); netif_receive_skb(skb); return 0; } /* * Add a passively open socket to its parent's accept queue. Note that the * child may be in any state by now, including TCP_CLOSE. We can guarantee * though that it has not been orphaned yet. */ static void add_pass_open_to_parent(struct sock *child, struct sock *lsk, struct toedev *dev) { struct request_sock *oreq; /* * If the server is closed it has already killed its embryonic * children. There is nothing further to do about child. */ if (lsk->sk_state != TCP_LISTEN) return; oreq = CPL_IO_STATE(child)->passive_reap_next; CPL_IO_STATE(child)->passive_reap_next = NULL; reqsk_queue_removed(&inet_csk(lsk)->icsk_accept_queue, oreq); synq_remove(child); if (sk_acceptq_is_full(lsk) && !TOM_TUNABLE(dev, soft_backlog_limit)) { T4_NET_INC_STATS_BH(sock_net(lsk), LINUX_MIB_LISTENOVERFLOWS); T4_NET_INC_STATS_BH(sock_net(lsk), LINUX_MIB_LISTENDROPS); t4_reqsk_free(oreq); add_to_reap_list(child); } else { /* The refcnt will be zero here, set it to one to allow the * stack to free the request_sock when it is removed from the * accept queue else we will leak the memory. */ atomic_set(&oreq->rsk_refcnt, 1); inet_csk_reqsk_queue_add(lsk, oreq, child); sk_data_ready_compat(lsk, 0); } } /* * This is run from a listener's backlog to add a child socket to its accept * queue. Note that at this point the child is not locked and we intentionally * do not bother locking it as the only fields we may be using are * sk_user_data, and the open request and there aren't any concurrent users * for them. */ static void bl_add_pass_open_to_parent(struct sock *lsk, struct sk_buff *skb) { struct sock *child = skb->sk; skb->sk = NULL; add_pass_open_to_parent(child, lsk, BLOG_SKB_CB(skb)->dev); kfree_skb(skb); } /* * Called when a connection is established to translate the TCP options * reported by HW to Linux's native format. */ static void assign_rxopt(struct sock *sk, unsigned int opt) { const struct tom_data *d; struct tcp_sock *tp = tcp_sk(sk); struct cpl_io_state *cplios = CPL_IO_STATE(sk); d = TOM_DATA(cplios->toedev); MSS_CLAMP(tp) = d->mtus[G_TCPOPT_MSS(opt)] - 40; tp->mss_cache = MSS_CLAMP(tp); tp->tcp_header_len = sizeof(struct tcphdr); TSTAMP_OK(tp) = G_TCPOPT_TSTAMP(opt); SACK_OK(tp) = G_TCPOPT_SACK(opt); WSCALE_OK(tp) = G_TCPOPT_WSCALE_OK(opt); SND_WSCALE(tp) = G_TCPOPT_SND_WSCALE(opt); if (!WSCALE_OK(tp)) RCV_WSCALE(tp) = 0; if (TSTAMP_OK(tp)) { tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; tp->mss_cache -= TCPOLEN_TSTAMP_ALIGNED; } else if (cplios->opt2 & F_TSTAMPS_EN) { cplios->opt2 &= ~F_TSTAMPS_EN; cplios->mtu_idx = G_TCPOPT_MSS(opt); } } /* * Completes some final bits of initialization for just established connections * and changes their state to TCP_ESTABLISHED. * * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. 
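 * Besides seeding snd_una/snd_nxt/write_seq from snd_isn and decoding the
 * HW-negotiated TCP options, this also backs rcv_wup off by any part of
 * the receive window that could not be expressed in opt0's RCV_BUFSIZ
 * field, so the first RX_DATA_ACK returns those credits to the chip.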
*/ static void make_established(struct sock *sk, u32 snd_isn, unsigned int opt) { struct tcp_sock *tp = tcp_sk(sk); tp->pushed_seq = tp->write_seq = tp->snd_nxt = tp->snd_una = snd_isn; inet_sk(sk)->inet_id = tp->write_seq ^ jiffies; assign_rxopt(sk, opt); /* * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't * pass through opt0. */ if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) tp->rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); dst_confirm(sk->sk_dst_cache); /* * tcp_poll() does not lock socket, make sure initial values are * committed before changing to ESTABLISHED. */ smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); } /* * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work * if we are in TCP_SYN_RECV due to crossed SYNs */ static int do_pass_establish(struct tom_data *td, struct sk_buff *skb) { struct cpl_pass_establish *req = cplhdr(skb); struct sock *lsk, *sk; struct toedev *tdev; unsigned int hwtid = GET_TID(req); sk = lookup_tid(td->tids, hwtid); VALIDATE_SOCK(sk); bh_lock_sock(sk); if (unlikely(sock_owned_by_user(sk))) { // This can only happen in simultaneous opens. XXX TBD kfree_skb(skb); } else { // Complete socket initialization now that we have the SND_ISN void *data; unsigned int stid; struct cpl_io_state *cplios = CPL_IO_STATE(sk); tdev = cplios->toedev; cplios->wr_max_credits = cplios->wr_credits = min_t(unsigned int, td->max_wr_credits, TOM_TUNABLE(tdev, max_wr_credits)); cplios->wr_unacked = 0; make_established(sk, ntohl(req->snd_isn), ntohs(req->tcp_opt)); stid = G_PASS_OPEN_TID(ntohl(req->tos_stid)); #ifdef WD_TOE /* * Once the connection established, we can mark the entry * in the passive table not used any more */ wdtoe_remove_passive_conn_tuple(passive_conn_tuple, stid, hwtid); #endif sk->sk_state_change(sk); if (unlikely(sk->sk_socket)) { // simultaneous opens only sk_wake_async(sk, 0, POLL_OUT); } /* * The state for the new connection is now up to date. * Next check if we should add the connection to the parent's * accept queue. When the parent closes it resets connections * on its SYN queue, so check if we are being reset. If so we * don't need to do anything more, the coming ABORT_RPL will * destroy this socket. Otherwise move the connection to the * accept queue. * * Note that we reset the synq before closing the server so if * we are not being reset the stid is still open. */ if (unlikely(synq_empty(sk))) { /* removed from synq */ kfree_skb(skb); goto unlock; } data = lookup_stid(td->tids, stid); lsk = ((struct listen_ctx *)data)->lsk; bh_lock_sock(lsk); if (likely(!sock_owned_by_user(lsk))) { kfree_skb(skb); add_pass_open_to_parent(sk, lsk, tdev); } else { skb->sk = sk; BLOG_SKB_CB(skb)->dev = tdev; BLOG_SKB_CB(skb)->backlog_rcv = bl_add_pass_open_to_parent; __sk_add_backlog(lsk, skb); } bh_unlock_sock(lsk); } unlock: bh_unlock_sock(sk); return 0; } #define __FIXUP_WR_MIT_CPL(w, cpl, tid) do { \ (w)->wr.wr_mid = \ htonl(V_FW_WR_LEN16(G_FW_WR_LEN16(ntohl((w)->wr.wr_mid))) | \ V_FW_WR_FLOWID(tid)); \ OPCODE_TID(w) = htonl(MK_OPCODE_TID(cpl, tid)); \ } while (0) #define __FIXUP_FLOWC_WR(flowc, tid) do { \ (flowc)->flowid_len16 = \ htonl(V_FW_WR_LEN16(G_FW_WR_LEN16(ntohl((flowc)->flowid_len16))) | \ V_FW_WR_FLOWID(tid)); \ } while(0) /* * Fill in the right TID for CPL messages waiting in the out-of-order queue * and send them to the TOE. 
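 * These messages were queued before the ATID -> TID transition, so only
 * the flow id needs patching: FLOWC WRs are fixed up with
 * __FIXUP_FLOWC_WR(), everything else with __FIXUP_WR_MIT_CPL(), both of
 * which preserve the encoded LEN16.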
*/ static void fixup_and_send_ofo(struct cpl_io_state *cplios, unsigned int tid) { struct sk_buff *skb; while ((skb = __skb_dequeue(&cplios->ooo_queue)) != NULL) { struct fw_flowc_wr *flowc = cplhdr(skb); struct cpl_close_con_req *p = cplhdr(skb); if (G_FW_WR_OP(ntohl(flowc->op_to_nparams)) == FW_FLOWC_WR) __FIXUP_FLOWC_WR(flowc, tid); else __FIXUP_WR_MIT_CPL(p, p->ot.opcode, tid); cxgb4_ofld_send(cplios->egress_dev, skb); } } /* * Adjust buffers already in write queue after a SYN_SENT->ESTABLISHED * transition. For TX_DATA we need to adjust the start sequence numbers, and * for other packets we need to adjust the TID. TX_DATA packets don't have * headers yet and so not TIDs. */ static void fixup_pending_writeq_buffers(struct sock *sk) { struct sk_buff *skb; struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int tid = cplios->tid; skb_queue_walk(&cplios->tx_queue, skb) { if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR) { ULP_SKB_CB(skb)->seq = tp->write_seq; tp->write_seq += skb->len + ulp_extra_len(skb) + skb_ulp_len_adjust(skb); } else { struct cpl_close_con_req *p = cplhdr(skb); __FIXUP_WR_MIT_CPL(p, p->ot.opcode, tid); } } } /* * Updates socket state from an active establish CPL message. Runs with the * socket lock held. */ static void sock_act_establish(struct sock *sk, struct sk_buff *skb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct cpl_act_establish *req = cplhdr(skb); u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */ struct tcp_sock *tp = tcp_sk(sk); if (unlikely(sk->sk_state != TCP_SYN_SENT)) printk(KERN_ERR "TID %u expected SYN_SENT, found %d\n", cplios->tid, sk->sk_state); tp->rcv_tstamp = tcp_time_stamp; cplios->delack_seq = tp->copied_seq = tp->rcv_wup = tp->rcv_nxt = rcv_isn; make_established(sk, ntohl(req->snd_isn), ntohs(req->tcp_opt)); #ifdef CONFIG_SECURITY_NETWORK security_inet_conn_estab(sk, tcphdr_skb); #endif /* * Now that we finally have a TID send any CPL messages that we had to * defer for lack of a TID. */ if (skb_queue_len(&cplios->ooo_queue)) fixup_and_send_ofo(cplios, cplios->tid); if (likely(!sock_flag(sk, SOCK_DEAD))) { sk->sk_state_change(sk); sk_wake_async(sk, 0, POLL_OUT); } kfree_skb(skb); /* * Currently the send queue must be empty at this point because the * socket layer does not send anything before a connection is * established. To be future proof though we handle the possibility * that there are pending buffers to send (either TX_DATA or * CLOSE_CON_REQ). First we need to adjust the sequence number of the * buffers according to the just learned write_seq, and then we send * them on their way. */ fixup_pending_writeq_buffers(sk); if (t4_push_frames(sk, 1)) sk->sk_write_space(sk); /* For TLS connect start handshake parse timer */ if(is_tls_offload(sk)) start_hndsk_timer(sk); } /* * Process a CPL_ACT_ESTABLISH message. */ static int do_act_establish(struct tom_data *td, struct sk_buff *skb) { struct cpl_act_establish *req = cplhdr(skb); unsigned int tid = GET_TID(req); unsigned int atid = G_TID_TID(ntohl(req->tos_atid)); struct sock *sk; struct cpl_io_state *cplios; struct toedev *tdev; sk = lookup_tid(td->tids, tid); if (sk) { /* * If socket associated with this tid is already waiting for * CPL_ABORT_RPL i.e. connection is going away anyways then, * this CPL_ACT_ESTABLISH has likely arrived late in the game. * We can ignore this CPL_ACT_ESTABLISH assuming that CPL_ABORT_RPL * will arrive and subsequent necessary clean-up would occur. 
* * The only known scenario for this as of now is: * socket was associated with this tid in neg. advice from CPL_ACT_OPEN_RPL * so as to send CPL_ABORT_REQ on correct tid and subsequently * process the CPL_ABORT_RPL. */ if (sk->sk_state == TCP_SYN_SENT && cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING)) return 0; printk(KERN_ERR "%s: tid (%d) already in use, sk_state = %d\n", __func__, tid, sk->sk_state); return CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE; } cplios = (struct cpl_io_state *)lookup_atid(td->tids, atid); VALIDATE_SOCK(cplios); sk = cplios->sk; tdev = cplios->toedev; /* * It's OK if the TID is currently in use, the owning socket may have * backlogged its last CPL message(s). Just take it away. */ cplios->tid = tid; cxgb4_insert_tid(td->tids, sk, tid, sk->sk_family); cplios->idr = bh_insert_handle(td, sk, tid); cxgb4_free_atid(td->tids, atid); bh_conn_remove_handle(td, atid); kref_put(&cplios->kref, t4_cplios_release); #ifdef WD_TOE /* once the active connection is established, we delete the entry */ wdtoe_remove_conn_tuple(conn_tuple, atid); #endif process_cpl_msg(sock_act_establish, sk, skb); return 0; } #define S_CPL_FW4_ACK_FLOWID 0 #define M_CPL_FW4_ACK_FLOWID 0xffffff #define V_CPL_FW4_ACK_FLOWID(x) ((x) << S_CPL_FW4_ACK_FLOWID) #define G_CPL_FW4_ACK_FLOWID(x) \ (((x) >> S_CPL_FW4_ACK_FLOWID) & M_CPL_FW4_ACK_FLOWID) /* * Process an acknowledgment of WR completion. Advance snd_una and send the * next batch of work requests from the write queue. */ static void wr_ack(struct sock *sk, struct sk_buff *skb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct tcp_sock *tp = tcp_sk(sk); struct cpl_fw4_ack *hdr = (struct cpl_fw4_ack *)cplhdr(skb); u8 credits = hdr->credits; u32 snd_una = ntohl(hdr->snd_una); cplios->wr_credits += credits; /* * If the last write request in the queue with a request completion * flag has been consumed, reset our bookkeeping. */ if (cplios->wr_unacked > cplios->wr_max_credits - cplios->wr_credits) cplios->wr_unacked = cplios->wr_max_credits - cplios->wr_credits; while (credits) { struct sk_buff *p = peek_wr(sk); if (unlikely(!p)) { #ifdef WD_TOE int ret; int dev_idx = 0; int tbl_idx = 0; struct wdtoe_stack_info_entry *stack_info; ret = wdtoe_find_dev_by_tid(wdtoe_dev_table, &dev_idx, &tbl_idx, cplios->tid); if(ret == 0) { /* printk(KERN_INFO "[wdtoe] WR_ACK [%u] for TID" "[%u] for WD-TOE connection" "dev_idx [%d]\n", credits, cplios->tid, dev_idx); */ stack_info = wdtoe_dev_table[dev_idx].wd_dev ->k_stack_info; /* XXX need to replace with atomic operation */ atomic_add(credits, &stack_info->conn_info[tbl_idx].cur_credits); /* printk(KERN_INFO "[wdtoe] returning credits " "[%u] for TID [%u], dev_idx " "[%d] tbl_idx [%d], " "cur_credits [%u]\n", credits, cplios->tid, dev_idx, tbl_idx, atomic_read(&stack_info-> conn_info[tbl_idx]. 
cur_credits)); */ goto wdtoe_ack_out; } #endif if (cplios->wr_nondata) cplios->wr_nondata -= credits; #if DEBUG_WR > 1 /* Currently, wr_nondata is only used for sending * flowc WR for failover so should only be needed * once before TX data */ if (cplios->wr_nondata) netdev_err(cplios->egress_dev, "WR credits=%u nondata=%u\n", credits, cplios->wr_nondata); #endif #ifdef WD_TOE wdtoe_ack_out: #endif break; } if (unlikely(credits < p->csum)) { #if DEBUG_WR > 1 struct tx_data_wr *w = cplhdr(p); printk(KERN_ERR "TID %u got %u WR credits, need %u, len %u, " "main body %u, seq # %u, ACK una %u," " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", cplios->tid, credits, p->csum, p->len, p->len - p->data_len, ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), cplios->wr_credits, count_pending_wrs(sk) - credits); #endif p->csum -= credits; break; } else { dequeue_wr(sk); credits -= p->csum; free_wr_skb(sk, p); } } #if DEBUG_WR check_wr_invariants(sk); #endif if (hdr->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { if (unlikely(before(snd_una, tp->snd_una))) { #if VALIDATE_SEQ struct tom_data *d = TOM_DATA(cplios->toedev); printk(KERN_ERR "%s: unexpected sequence # %x in WR_ACK " "for TID %u, snd_una %x\n", (&d->tdev)->name, snd_una, cplios->tid, tp->snd_una); #endif kfree_skb(skb); return; } if (tp->snd_una != snd_una) { tp->snd_una = snd_una; dst_confirm(sk->sk_dst_cache); tp->rcv_tstamp = tcp_time_stamp; if ((tp->snd_una == tp->snd_nxt) && !cplios_flag_nochk(cplios, CPLIOS_TX_FAILOVER)) cplios_reset_flag(cplios, CPLIOS_TX_WAIT_IDLE); } ma_fail_wr_ack(sk); } if (hdr->flags & CPL_FW4_ACK_FLAGS_FLOWC) { if (cplios_flag_nochk(cplios, CPLIOS_TX_FAILOVER)) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct l2t_entry *e = cplios->l2t_entry; if ( cplios->tx_c_chan != e->lport) send_failover_flowc_wr(sk); } } if (hdr->flags & CPL_FW4_ACK_FLAGS_CH) { unsigned int flowclen16 = DIV_ROUND_UP(failover_flowc_wr_len, 16); cplios->wr_credits -= flowclen16; cplios_reset_flag(cplios, CPLIOS_TX_WAIT_IDLE); cplios_reset_flag(cplios, CPLIOS_TX_FAILOVER); } /* * If there's more data queued up, see if we can get it into the write * queue ... If we're able to push any data into the write queue, * free up socket send buffer space. */ if (skb_queue_len(&cplios->tx_queue) && t4_push_frames(sk, 0)) sk->sk_write_space(sk); kfree_skb(skb); } #ifdef UDP_OFFLOAD static void uo_wr_ack(struct sock *sk, struct sk_buff *skb) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct cpl_fw4_ack *hdr = (struct cpl_fw4_ack *)cplhdr(skb); u8 credits = hdr->credits; if (!cplios) { kfree_skb(skb); return; } cplios->wr_credits += credits; /* * If the last write request in the queue with a request completion * flag has been consumed, reset our bookeepping. 
*/ if (cplios->wr_unacked > cplios->wr_max_credits - cplios->wr_credits) cplios->wr_unacked = cplios->wr_max_credits - cplios->wr_credits; while (credits) { struct sk_buff *p = peek_wr(sk); if (unlikely(!p)) { printk(KERN_ERR "%u WR_ACK credits for TID %u with " "nothing pending, state %u\n", credits, cplios->tid, sk->sk_state); break; } if (unlikely(credits < p->csum)) { p->csum -= credits; break; } else { dequeue_wr(sk); credits -= p->csum; free_wr_skb(sk, p); } } if (cplios_flag_nochk(cplios, CPLIOS_CLOSE_CON_REQUESTED) && cplios->wr_credits == cplios->wr_max_credits) { cxgb4_free_uotid(TOM_DATA(cplios->toedev)->tids, cplios->tid); t4_udp_release_resources(sk); sock_put(sk); kfree_skb(skb); return; } if (sk->sk_family == AF_INET) { if (skb_queue_len(&cplios->tx_queue) && !t4_udp_push_frames(sk)) sk->sk_write_space(sk); } #ifdef CONFIG_UDPV6_OFFLOAD else if (sk->sk_family == AF_INET6) { if (skb_queue_len(&cplios->tx_queue) && !chelsio_udp_v6_push_pending_frames(sk)) sk->sk_write_space(sk); } #endif /* CONFIG_UDPV6_OFFLOAD */ kfree_skb(skb); } #endif /* * Handler for CPL_FW4_ACK (WR completion / TX credit) messages. */ static int do_fw4_ack(struct tom_data *td, struct sk_buff *skb) { struct cpl_fw4_ack *rpl = (struct cpl_fw4_ack *)cplhdr(skb); struct sock *sk; unsigned int hwtid = G_CPL_FW4_ACK_FLOWID(ntohl(OPCODE_TID(rpl))); #ifdef UDP_OFFLOAD if (hwtid >= td->tids->uotid_base) { sk = lookup_uotid(td->tids, hwtid); VALIDATE_SOCK(sk); skb_shinfo(skb)->gso_type = SKB_GSO_UDP; process_cpl_msg(uo_wr_ack, sk, skb); /* * When the sk->sk_sndbuf limit is reached, sock_alloc_send_skb() * sleeps waiting for send-buffer space. Wake the sleeper up here * since we may free some skbs as part of this WR ACK handling. */ if (sk_has_sleepers(sk)) { clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wake_up_interruptible(sk_sleep(sk)); } return 0; } #endif sk = lookup_tid(td->tids, hwtid); VALIDATE_SOCK(sk); process_cpl_msg (wr_ack, sk, skb); return 0; } #if 0 /* * Handler for TRACE_PKT CPL messages. Just sink these packets. */ static int do_trace_pkt(struct tom_data *td, struct sk_buff *skb) { __kfree_skb(skb); return 0; } #endif /* * Disconnect offloaded established but not yet accepted connections sitting * on a server's accept_queue. We just send an ABORT_REQ at this point and * finish off the disconnect later as we may need to wait for the ABORT_RPL. */ void t4_disconnect_acceptq(struct sock *listen_sk) { struct request_sock **pprev; pprev = ACCEPT_QUEUE(listen_sk); while (*pprev) { struct request_sock *req = *pprev; if ((req->rsk_ops == RSK_OPS(&t4_rsk_ops)) || (req->rsk_ops == RSK_OPS(&t4_rsk6_ops))) { // one of ours struct sock *child = req->sk; *pprev = req->dl_next; sk_acceptq_removed(listen_sk); t4_reqsk_free(req); sock_hold(child); // need to survive past inet_csk_destroy_sock() local_bh_disable(); bh_lock_sock(child); release_tcp_port(child); reset_listen_child(child); bh_unlock_sock(child); local_bh_enable(); sock_put(child); } else pprev = &req->dl_next; } } /* * Reset offloaded connections sitting on a server's syn queue. As above * we send ABORT_REQ and finish off when we get ABORT_RPL. */ void t4_reset_synq(struct sock *listen_sk) { struct sock **nextsk = &synq_next(listen_sk); /* * Note: the while predicate below is a little tricky because the * fields used to implement the doubly linked list have been hijacked * out of the (struct tcp_sock) portion of the socket. If the fields * were solely ours to use, then the test of "*nextsk != listen_sk" * would be enough. 
But when we empty the SYN queue, the state of * those hijacked fields are reset to the values expected by Linux * and "*nextsk" will no longer have any legitimate meaning for us. * Thus the double predicate of testing for both the SYN queue being * empty (which is implemented in a Linux version-dependent fashion) * and making sure the next socket to process isn't our listen * socket ... */ while (!synq_empty(listen_sk) && *nextsk != listen_sk) { struct sock *child = *nextsk; if ((sk_ofld_proto_get_tomhandlers(child) == &t4_tcp_prot.proto) || (sk_ofld_proto_get_tomhandlers(child) == &t4_tcp_v6_prot.proto)) { /* one of ours */ cleanup_syn_rcv_conn(child, listen_sk); sock_hold(child); // need to survive past inet_csk_destroy_sock() local_bh_disable(); bh_lock_sock(child); release_tcp_port(child); reset_listen_child(child); bh_unlock_sock(child); local_bh_enable(); sock_put(child); } else { /* some other offloaded socket ... */ nextsk = &synq_next(*nextsk); } } } int t4_set_dsgl_ppods(const struct cpl_io_state *cplios, const struct ddp_gather_list *gl, const struct dsgl_req *dsgl, const unsigned int ppod_addr, const unsigned int nppods, const unsigned int maxoff, const unsigned int startidx) { struct sock *sk = cplios->sk; unsigned int i, j, pidx; struct pagepod *p; struct sk_buff *skb; struct ulp_mem_io *req; struct ulptx_sgl *sgl; struct tcp_sock *tp = tcp_sk(sk); unsigned int tid = cplios->tid; unsigned int len; unsigned int ppodout = 0; unsigned int podchunk = nppods*PPOD_SIZE; len = roundup(sizeof(*req) + sizeof(*sgl), 16); skb = alloc_ctrl_skb(cplios->ctrl_skb_cache, len); if (!skb) return -ENOMEM; preempt_disable(); set_wr_txq(skb, CPL_PRIORITY_CONTROL, cplios->port_id); req = (struct ulp_mem_io *)__skb_put(skb, len); req->wr.wr_hi = cpu_to_be32(V_FW_WR_OP(FW_ULPTX_WR)|V_FW_WR_COMPL(1)); req->wr.wr_mid = cpu_to_be32(V_FW_WR_LEN16(DIV_ROUND_UP(len, 16))); req->wr.wr_lo = (uintptr_t)dsgl; req->cmd = htonl(V_ULPTX_CMD(ULP_TX_MEM_WRITE) | V_T5_ULP_MEMIO_ORDER(1) | V_T5_ULP_MEMIO_FID(cplios->rss_qid) | F_ULP_TXPKT_RO); req->dlen = cpu_to_be32(V_ULP_MEMIO_DATA_LEN(podchunk>>ULP_TX_MEMWRITE_ALIGN)); req->len16 = cpu_to_be32(DIV_ROUND_UP(len-sizeof(req->wr), 16)); req->lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr>>ULP_TX_MEMWRITE_ALIGN)); sgl = (struct ulptx_sgl *)(req + 1); sgl->cmd_nsge = cpu_to_be32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(1)); sgl->len0 = cpu_to_be32(podchunk); sgl->addr0 = cpu_to_be64(dsgl->dsgl_iova); p = (struct pagepod *)dsgl->dsgl_vaddr; i = startidx; do { pidx = 4 * i; p->vld_tid_pgsz_tag_color = cpu_to_be64(F_PPOD_VALID | V_PPOD_TID(tid) | V_PPOD_TAG(gl->tag) | V_PPOD_COLOR(gl->color)); p->len_offset = cpu_to_be64(V_PPOD_LEN(maxoff) | V_PPOD_OFST(gl->offset)); p->rsvd = 0; for (j = 0; j < 5; ++j, ++pidx) p->addr[j] = pidx < gl->nelem ? 
cpu_to_be64(gl->phys_addr[pidx]) : 0; p++; ppodout += PPOD_SIZE; i++; } while (ppodout < podchunk); dma_wmb(); preempt_enable(); send_or_defer(sk, tp, skb, 0); return 0; } /* Maximum Immediate command memory write length is 256 bytes */ #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE) int t4_setup_ppods(const struct cpl_io_state *cplios, const struct ddp_gather_list *gl, const unsigned int nppods, const unsigned int tag, const unsigned int maxoff) { struct sock *sk = cplios->sk; unsigned int i, j, pidx; struct pagepod *p; struct sk_buff *skb; struct ulp_mem_io *req; struct ulptx_idata *sc; struct tcp_sock *tp = tcp_sk(sk); unsigned int tid = cplios->tid; const struct tom_data *td = TOM_DATA(cplios->toedev); unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; unsigned int len, podchunk; __be32 cmd = htonl(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); if (is_t4(td->lldi->adapter_type)) cmd |= htonl(V_ULP_MEMIO_ORDER(1)); else cmd |= htonl(V_T5_ULP_MEMIO_IMM(1)); for (i = 0; i < nppods; ppod_addr += podchunk) { unsigned int ppodout = 0; podchunk = ((nppods-i) >= NUM_ULP_TX_SC_IMM_PPODS) ? NUM_ULP_TX_SC_IMM_PPODS : (nppods-i); podchunk *= PPOD_SIZE; len = roundup(sizeof(*req) + 2*sizeof(*sc) + podchunk, 16); skb = alloc_ctrl_skb(cplios->ctrl_skb_cache, len); if (!skb) return -ENOMEM; set_wr_txq(skb, CPL_PRIORITY_CONTROL, cplios->port_id); req = (struct ulp_mem_io *)__skb_put(skb, len); INIT_ULPTX_WR(req, len, 0, 0); req->cmd = cmd; req->dlen = htonl(V_ULP_MEMIO_DATA_LEN(podchunk / 32)); req->len16 = htonl(DIV_ROUND_UP(len-sizeof(req->wr), 16)); req->lock_addr = htonl(V_ULP_MEMIO_ADDR( ppod_addr>>ULP_TX_MEMWRITE_ALIGN)); sc = (struct ulptx_idata *)(req+1); sc->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_IMM)); sc->len = htonl(podchunk); p = (struct pagepod *)(sc + 1); do { pidx = 4 * i; p->vld_tid_pgsz_tag_color = cpu_to_be64(F_PPOD_VALID | V_PPOD_TID(tid) | V_PPOD_TAG(tag) | V_PPOD_COLOR(gl->color)); p->len_offset = cpu_to_be64(V_PPOD_LEN(maxoff) | V_PPOD_OFST(gl->offset)); p->rsvd = 0; for (j = 0; j < 5; ++j, ++pidx) p->addr[j] = pidx < gl->nelem ? cpu_to_be64(gl->phys_addr[pidx]) : 0; p++; ppodout += PPOD_SIZE; i++; } while (ppodout < podchunk); sc = (struct ulptx_idata *)p; sc->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_NOOP)); sc->len = htonl(0); send_or_defer(sk, tp, skb, 0); } return 0; } /* * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. 
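 * The ACK is wrapped as ULP_TX_PKT + ULP_TX_SC_IMM so it can ride inside a
 * larger ULP_TX work request (see t4_setup_ddpbufs() and
 * t4_setup_indicate_modrx()), and is followed by a ULP_TX_SC_NOOP that
 * pads out the sub-command list.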
static void mk_rx_data_ack_ulp(struct sock *sk, struct cpl_rx_data_ack *ack, unsigned int tid, unsigned int credits) { struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack; struct ulptx_idata *sc = (struct ulptx_idata *)(txpkt + 1); u32 dack; dack = t4_select_delack(sk); txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); txpkt->len = htonl(DIV_ROUND_UP(sizeof(*ack), 16)); sc->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_IMM)); sc->len = htonl(sizeof(*ack) - sizeof(struct work_request_hdr)); OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid)); ack->credit_dack = htonl(F_RX_MODULATE_RX | F_RX_DACK_CHANGE | V_RX_DACK_MODE(dack) | V_RX_CREDITS(credits)); sc = (struct ulptx_idata *)(ack + 1); sc->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_NOOP)); sc->len = htonl(0); } int t4_cancel_ddpbuf(struct sock *sk, unsigned int bufidx) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); unsigned int wrlen; struct sk_buff *skb; struct ulptx_idata *aligner; struct cpl_set_tcb_field *req; struct ddp_state *p = DDP_STATE(sk); u64 mask = V_TF_DDP_ACTIVE_BUF(1ULL) | V_TF_DDP_INDICATE_OUT(1ULL) | V_TF_DDP_BUF0_VALID(1ULL) | V_TF_DDP_BUF1_VALID(1ULL) | V_TF_DDP_BUF0_INDICATE(1ULL) | V_TF_DDP_BUF1_INDICATE(1ULL); /* DDP buffer 0 is only used for indicate size */ BUG_ON(!bufidx); wrlen = roundup(sizeof(*req) + sizeof(*aligner), 16); skb = alloc_ctrl_skb(cplios->ctrl_skb_cache, wrlen); if (!skb) return -ENOMEM; __set_tcb_field(sk, skb, W_TCB_RX_DDP_FLAGS, mask, V_TF_DDP_ACTIVE_BUF(1ULL) | V_TF_DDP_INDICATE_OUT(1ULL), DDP_COOKIE_OFFSET, 0); p->get_tcb_count++; cxgb4_ofld_send(cplios->egress_dev, skb); return 0; } /* * Sends a compound WR containing all the CPL messages needed to program the * two HW DDP buffers, namely optionally setting up the length and offset of * each buffer, programming the DDP flags, and sending RX_DATA_ACK. */ int t4_setup_ddpbufs(struct sock *sk, unsigned int len0, unsigned int offset0, unsigned int len1, unsigned int offset1, u64 ddp_flags, u64 flag_mask) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); unsigned int wrlen; struct work_request_hdr *wr; struct ulptx_idata *aligner; struct cpl_set_tcb_field *req; struct cpl_rx_data_ack *ack; struct sk_buff *skb; if (ma_fail_t4_send_rx_credits(sk)) return -EINVAL; wrlen = roundup(2*(sizeof(*req) + sizeof(*aligner)), 16); skb = alloc_ctrl_skb(cplios->ctrl_skb_cache, wrlen); if (!skb) return -ENOMEM; if (len0) t4_set_ddp_buf(sk, 0, offset0, len0); if (len1) t4_set_ddp_buf(sk, 1, offset1, len1); set_wr_txq(skb, CPL_PRIORITY_CONTROL, cplios->port_id); req = (struct cpl_set_tcb_field *)__skb_put(skb, wrlen); INIT_ULPTX_WR(req, wrlen, 0, 0); wr = (struct work_request_hdr *)req; wr++; req = (struct cpl_set_tcb_field *)wr; mk_set_tcb_field_ulp(cplios, req, W_TCB_RX_DDP_FLAGS, flag_mask, ddp_flags, 0, 1); aligner = (struct ulptx_idata *)(req + 1); ack = (struct cpl_rx_data_ack *)(aligner + 1); mk_rx_data_ack_ulp(sk, ack, cplios->tid, 0); cxgb4_ofld_send(cplios->egress_dev, skb); return 0; } /* * Sends a compound WR containing all the CPL messages needed to program the * DDP indicate and send RX_DATA_ACK. 
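 * Concretely, one SET_TCB_FIELD clears the DDP buffer-valid/indicate bits
 * in TCB_RX_DDP_FLAGS and leaves only DDP_BUF0_INDICATE set, and the
 * appended RX_DATA_ACK returns copied_seq - rcv_wup credits using the
 * currently selected delayed-ACK mode.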
/*
 * Sends a compound WR containing all the CPL messages needed to program the
 * DDP indicate and send RX_DATA_ACK.
 */
void t4_setup_indicate_modrx(struct sock *sk)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int wrlen;
	struct work_request_hdr *wr;
	struct ulp_txpkt *txpkt;
	struct ulptx_idata *sc;
	struct cpl_set_tcb_field *req;
	struct cpl_rx_data_ack *ack;
	struct sk_buff *skb;

	if (ma_fail_t4_send_rx_credits(sk))
		return;

	wrlen = sizeof(*wr) + sizeof(*txpkt) + sizeof(*sc) +
		(sizeof(*req) - sizeof(*wr)) +
		(sizeof(*txpkt) + sizeof(*sc) + sizeof(*ack));
	skb = alloc_ctrl_skb(cplios->ctrl_skb_cache, wrlen);
	BUG_ON(!skb);

	set_wr_txq(skb, CPL_PRIORITY_CONTROL, cplios->port_id);
	req = (struct cpl_set_tcb_field *)__skb_put(skb, wrlen);
	INIT_ULPTX_WR(req, wrlen, 0, 0);
	wr = (struct work_request_hdr *)req;
	wr++;
	req = (struct cpl_set_tcb_field *)wr;
	mk_set_tcb_field_ulp(cplios, req, W_TCB_RX_DDP_FLAGS,
			     V_TF_DDP_INDICATE_OUT(1ULL) |
			     V_TF_DDP_BUF0_VALID(1ULL) |
			     V_TF_DDP_BUF1_VALID(1ULL) |
			     V_TF_DDP_BUF0_INDICATE(1ULL) |
			     V_TF_DDP_BUF1_INDICATE(1ULL),
			     V_TF_DDP_BUF0_INDICATE(1ULL),
			     DDP_COOKIE_INDOUT, 0);
	sc = (struct ulptx_idata *)(req + 1);
	ack = (struct cpl_rx_data_ack *)(sc + 1);
	mk_rx_data_ack_ulp(sk, ack, cplios->tid,
			   tp->copied_seq - tp->rcv_wup);
	tp->rcv_wup = tp->copied_seq;
	cxgb4_ofld_send(cplios->egress_dev, skb);
}
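/*
 * Sketch (hypothetical helper, mirrors the credit computation above): the
 * RX credits handed to HW are the bytes the application has consumed since
 * the last window update, i.e. copied_seq - rcv_wup; once they are sent,
 * rcv_wup is advanced so the same bytes are never credited twice.
 */
static inline u32 t4_pending_rx_credits(const struct tcp_sock *tp)
{
	return tp->copied_seq - tp->rcv_wup;	/* bytes not yet credited */
}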
/*
 * Handler for CPL_FW6_MSG.
 */
static int do_fw6_msg(struct tom_data *td, struct sk_buff *skb)
{
	struct cpl_io_state *cplios;
	struct sock *sk;
	struct cpl_fw6_msg *p = cplhdr(skb);

	if (p->type == FW6_TYPE_OFLD_CONNECTION_WR_RPL) {
		struct cpl_fw6_msg_ofld_connection_wr_rpl *req =
			(struct cpl_fw6_msg_ofld_connection_wr_rpl *)p->data;

		if (req->t_state == TCP_SYN_SENT &&
		    (req->retval == FW_ENOMEM ||
		     req->retval == FW_EADDRINUSE)) {
			cplios = lookup_atid(td->tids, htonl(req->tid));
			VALIDATE_SOCK(cplios);
			skb->sk = cplios->sk;
			t4_defer_reply(skb, cplios->toedev,
				       deferred_tnl_connect);
			return 0;
		} else if (req->t_state == TCP_SYN_SENT &&
			   req->retval == FW_SUCCESS) {
			unsigned long atid = (unsigned long)req->cookie;

			cplios = lookup_atid(td->tids, atid);
			VALIDATE_SOCK(cplios);
			ma_fail_do_fw6_msg(cplios->sk, skb);
			return 0;
		} else if (req->t_state == TCP_SYN_RECV) {
			struct sk_buff *rpl_skb;
			struct cpl_pass_accept_req *cpl;

			rpl_skb = (struct sk_buff *)(uintptr_t)
				  be64_to_cpu(req->cookie);
			if (req->retval == FW_EADDRINUSE) {
				__kfree_skb(rpl_skb);
			} else if (req->retval == FW_ENOMEM) {
				__skb_pull(rpl_skb, sizeof(*cpl));
				skb_reset_mac_header(rpl_skb);
				rpl_skb->ip_summed = CHECKSUM_UNNECESSARY;
				rpl_skb->protocol =
					eth_type_trans(rpl_skb, rpl_skb->dev);
				netif_receive_skb(rpl_skb);
			} else {
				unsigned int stid;
				struct listen_ctx *ctx;
				struct sock *lsk;

				cpl = (struct cpl_pass_accept_req *)
				      cplhdr(rpl_skb);
				OPCODE_TID(cpl) =
					htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ,
							    htonl(req->tid)));
				stid = G_PASS_OPEN_TID(ntohl(cpl->tos_stid));
				ctx = (struct listen_ctx *)
				      lookup_stid(td->tids, stid);
				if (ctx) {
					lsk = ctx->lsk;
					BLOG_SKB_CB(rpl_skb)->dev = &td->tdev;
					process_cpl_msg(process_pass_accept_req,
							lsk, rpl_skb);
				} else {
					cxgb4_remove_tid(td->tids, 0,
							 htonl(req->tid),
							 AF_INET);
					__kfree_skb(rpl_skb);
				}
			}
		}
	} else if (p->type == FW_TYPE_PI_ERR) {
		/* iscsi needs it */
		struct fw_pi_error *pi_err = (struct fw_pi_error *)p->data;
		unsigned int tid = G_FW_WR_FLOWID(ntohl(pi_err->flowid_len16));

		sk = lookup_tid(td->tids, tid);
		if (!t4_cpl_iscsi_callback(td, sk, skb, CPL_FW6_MSG))
			return 0;
	} else if (p->type == FW_TYPE_WR_RPL) {
		struct cpl_fw6_msg *rpl = cplhdr(skb);

		if (rpl->data[1]) {
			struct dsgl_req *req = (struct dsgl_req *)rpl->data[1];

			dma_pool_free(td->dma_pool, req->dsgl_vaddr,
				      req->dsgl_iova);
			kfree(req);
		}
	}
	kfree_skb(skb);
	return 0;
}

static int lro_init_desc(struct napi_struct *napi, const struct pkt_gl *gl,
			 struct sock *sk, unsigned int tid, const __be64 *rsp)
{
	struct sk_buff *skb;
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);

	skb = cxgb4_pktgl_to_skb(napi, gl, RX_PULL_LEN, RX_PULL_LEN);
	if (unlikely(!skb))
		return -1;

	/* Copy RSS header */
	__skb_push(skb, sizeof(struct rss_header));
	skb_copy_to_linear_data(skb, rsp, sizeof(struct rss_header));

	cplios->lro_skb = skb;
	sock_hold(sk);
	skb->sk = sk;
	return 0;
}

static void lro_add_packet(struct sk_buff *skb, const struct pkt_gl *gl)
{
	struct skb_shared_info *ssi;
	int nr_frags = skb_shinfo(skb)->nr_frags;
	int cpl_hdr_size = sizeof(struct cpl_tx_data);

	/* Append the data to the skb frags */
	ssi = skb_shinfo(skb);
	skb_frag_set_page(skb, nr_frags, gl->frags[0].page);
	ssi->frags[nr_frags].page_offset = gl->frags[0].offset + cpl_hdr_size;
	ssi->frags[nr_frags].size = gl->frags[0].size - cpl_hdr_size;

	if (gl->nfrags > 1)
		memcpy(&ssi->frags[nr_frags + 1], &gl->frags[1],
		       (gl->nfrags - 1) * sizeof(skb_frag_t));

	ssi->nr_frags += gl->nfrags;
	skb->len += gl->tot_len - cpl_hdr_size;
	skb->data_len += gl->tot_len - cpl_hdr_size;
	skb->truesize += gl->tot_len - cpl_hdr_size;

	/* Get a reference for the last page; we don't own it */
	get_page(gl->frags[gl->nfrags - 1].page);
}

void t4_lro_flush(struct t4_lro_mgr *lro_mgr, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct cpl_io_state *cplios;

	if (skb->next || skb->prev)
		__skb_unlink(skb, &lro_mgr->lroq);

	cplios = CPL_IO_STATE(sk);
	sock_put(sk);
	skb->sk = NULL;

	if (cplios->ulp_mode == ULP_MODE_ISCSI && fp_iscsi_lro_proc_rx) {
		process_cpl_msg(fp_iscsi_lro_proc_rx, sk, skb);
	} else {
		skb_gl_set(skb, NULL);	/* indicates packet is RX_DATA */
		process_cpl_msg(new_rx_data, sk, skb);
	}

	lro_mgr->lro_pkts++;
	lro_mgr->lro_session_cnt--;
	cplios->lro_skb = NULL;
}

int t4_lro_receive_gl(struct cpl_io_state *cplios, struct napi_struct *napi,
		      const struct pkt_gl *gl, struct t4_lro_mgr *lro_mgr,
		      const __be64 *rsp)
{
	const struct cpl_tx_data *rpl = gl->va;
	unsigned int tid = G_TID(ntohl(OPCODE_TID(rpl)));
	struct sock *sk = cplios->sk;
	struct sk_buff *skb;
	int cpl_hdr_size = sizeof(struct cpl_tx_data);

	/* Check if we have already started LRO for this session */
	if ((cplios->tid == tid) && cplios->lro_skb)
		goto add_packet;

start_lro:
	/* Have we reached the maximum number of sessions to aggregate? */
	if (lro_mgr->lro_session_cnt >= MAX_LRO_SESSIONS)
		goto out;

	/* Start LROing the packets of this connection */
	if (lro_init_desc(napi, gl, sk, tid, rsp))
		goto out;

	lro_mgr->lro_merged++;
	lro_mgr->lro_session_cnt++;
	skb = cplios->lro_skb;
	__skb_queue_tail(&lro_mgr->lroq, skb);
	return 0;

add_packet:
	skb = cplios->lro_skb;

	/* Check if this packet can be aggregated: the total length must not
	 * exceed 64K and the total frag count must not exceed MAX_SKB_FRAGS.
	 */
	if (((skb->len + gl->tot_len - cpl_hdr_size) > 65535) ||
	    ((skb_shinfo(skb)->nr_frags + gl->nfrags) >= MAX_SKB_FRAGS)) {
		/* Flush the packet aggregated so far */
		t4_lro_flush(lro_mgr, skb);
		goto start_lro;
	}

	lro_add_packet(skb, gl);
	lro_mgr->lro_merged++;
	return 0;

out:
	return -1;
}

void t4_lro_flush_all(struct t4_lro_mgr *lro_mgr)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue(&lro_mgr->lroq)) != NULL)
		t4_lro_flush(lro_mgr, skb);

	__skb_queue_head_init(&lro_mgr->lroq);
}
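/*
 * Sketch (hypothetical helper, mirrors the checks in t4_lro_receive_gl()
 * above): a gather list can be merged into an in-progress LRO skb only if
 * the merged payload stays within 64K and the combined fragment count fits
 * in the skb's frag array.
 */
static inline bool t4_lro_fits(const struct sk_buff *skb,
			       const struct pkt_gl *gl)
{
	unsigned int payload = gl->tot_len - sizeof(struct cpl_tx_data);

	return (skb->len + payload) <= 65535 &&
	       (skb_shinfo(skb)->nr_frags + gl->nfrags) < MAX_SKB_FRAGS;
}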
int t4_init_sk_filter(void)
{
	/*
	 * Initialize the Drop All filter.
	 * The BUG_ON()s below are left in for now since a simple drop filter
	 * for offload is no more than one or two filter instructions long.
	 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,7,0)
	int err;
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)	/* 3.17 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0)
	BUG_ON(ARRAY_SIZE(drop_insnsi) != 1);
#else
	BUG_ON(ARRAY_SIZE(drop_insnsi) != 2);
#endif
	drop_bpf = bpf_prog_alloc(bpf_prog_size(ARRAY_SIZE(drop_insnsi)),
				  GFP_KERNEL);
	if (!drop_bpf)
		goto err;

	drop_bpf->len = ARRAY_SIZE(drop_insnsi);
	memcpy(&drop_bpf->insnsi, drop_insnsi, sizeof(drop_insnsi));
	bpf_prog_select_runtime(drop_bpf, &err);

	drop_all = (struct sk_filter *)kmalloc(sizeof(*drop_all), GFP_KERNEL);
	if (!drop_all) {
		bpf_prog_free(drop_bpf);
		goto err;
	}
	drop_all->prog = drop_bpf;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,15,0)	/* 3.15, 3.16 */
	BUG_ON(ARRAY_SIZE(drop_insnsi) != 1);
	drop_all = (struct sk_filter *)kmalloc(sizeof(*drop_all) +
					       sizeof(drop_insnsi), GFP_KERNEL);
	if (!drop_all)
		goto err;
	drop_all->len = ARRAY_SIZE(drop_insnsi);
	memcpy(&drop_all->insnsi, drop_insnsi, sizeof(drop_insnsi));
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0)		/* 3.15 */
	drop_all->bpf_func = sk_run_filter_int_skb;
#else							/* 3.16 */
	sk_filter_select_runtime(drop_all);
#endif /* < 3.16 */
#else /* < 3.15 */
	BUG_ON(ARRAY_SIZE(drop_insns) != 1);
	drop_all = (struct sk_filter *)kmalloc(sizeof(*drop_all) +
					       sizeof(drop_insns), GFP_KERNEL);
	if (!drop_all)
		goto err;
	drop_all->len = ARRAY_SIZE(drop_insns);
	memcpy(&drop_all->insns, drop_insns, sizeof(drop_insns));
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0)
	drop_all->bpf_func = sk_run_filter;
#endif /* >= 3.0 */
#endif /* < 3.15 */

	atomic_set(&drop_all->refcnt, 1);
	return 0;

err:
	return -1;
}

void t4_free_sk_filter(void)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
	if (drop_bpf)
		bpf_prog_free(drop_bpf);
#endif
	if (drop_all)
		kfree(drop_all);
}

int __init t4_init_cpl_io(void)
{
	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
	if (!tcphdr_skb) {
		printk(KERN_ERR
		       "Chelsio TCP offload: can't allocate sk_buff\n");
		return -1;
	}
	skb_put(tcphdr_skb, sizeof(struct tcphdr));
	skb_reset_transport_header(tcphdr_skb);
	memset(tcphdr_skb->data, 0, tcphdr_skb->len);

	/* CIPSO_V4_OPTEXIST is false for tcphdr_skb without anything extra */

	if (t4_init_sk_filter()) {
		printk(KERN_ERR
		       "Chelsio TCP offload: can't allocate sk_filter\n");
		kfree_skb(tcphdr_skb);
		return -1;
	}

	t4tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
	t4tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t4tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t4tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t4tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t4tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
	t4tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t4tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t4tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
	t4tom_register_cpl_handler(CPL_SET_TCB_RPL, do_set_tcb_rpl);
	t4tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t4tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
	t4tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t4tom_register_cpl_handler(CPL_FW6_MSG, do_fw6_msg);
	t4tom_register_cpl_handler(CPL_RX_PKT, do_rx_pkt);
	t4tom_register_cpl_handler(CPL_FW4_ACK, do_fw4_ack);
	t4tom_register_cpl_handler(CPL_TLS_DATA, do_cpl_tls_data);
	t4tom_register_cpl_handler(CPL_RX_TLS_CMP, do_cpl_rx_tls_cmp);
	t4tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);

	if (ma_fail_t4_init_cpl_io())
		return -1;

	return 0;
}
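/*
 * For reference, a hedged sketch of the same drop-everything program in
 * classic BPF form, as a userspace process would attach it with
 * SO_ATTACH_FILTER: a single BPF_RET accepting 0 bytes drops every packet.
 * These objects are illustrative only and are not used by the driver.
 */
static struct sock_filter drop_all_classic[] __maybe_unused = {
	BPF_STMT(BPF_RET | BPF_K, 0),	/* accept 0 bytes => drop */
};
static struct sock_fprog drop_all_fprog __maybe_unused = {
	.len = ARRAY_SIZE(drop_all_classic),
	.filter = drop_all_classic,
};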