/* * This file implements the interface between the socket layer and * the HW TCP/CPL, including the protocol operations for Chelsio's HW TCP. * * Large portions of this file are taken from net/ipv4/tcp.c. * See that file for copyrights of the original code. * Any additional code is * * Copyright (C) 2003-2017 Chelsio Communications. All rights reserved. * * Written by Dimitris Michailidis (dm@chelsio.com) * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 * Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #include #include #include #if defined(CONFIG_T4_ZCOPY_SENDMSG) || defined(CONFIG_T4_ZCOPY_SENDMSG_MODULE) #include #include #endif #include #include #include #if defined(CONFIG_NET_RX_BUSY_POLL) #include #endif #if defined(CONFIG_TCPV6_OFFLOAD) #include #endif #include #include #include "l2t.h" #include "defs.h" #include "t4_ddp.h" #include "tom.h" #include "t4_tcb.h" #include "t4fw_interface.h" #include "t4_ma_failover.h" #include "trace.h" #include "t4_tls.h" #define DEFAULT_RCV_COALESCE_SIZE 16224U #ifdef CONFIG_TCPV6_OFFLOAD extern struct proto *tcpv6_prot_p; #endif /* Release callback invoked by kref_put() in chelsio_destroy_sock() */ void t4_cplios_release(struct kref *ref) { struct cpl_io_state *cplios = container_of(ref, struct cpl_io_state, kref); kfree(cplios); } /* * This must be called with the socket locked, otherwise dev may be NULL. */ static inline int chelsio_wspace(const struct sock *sk) { struct toedev *dev = CPL_IO_STATE(sk)->toedev; return dev ? TOM_TUNABLE(dev, max_host_sndbuf) - sk->sk_wmem_queued : 0; } /* * TCP socket write_space callback. Follows sk_stream_write_space(). */ void t4_write_space(struct sock *sk) { struct socket *sock = sk->sk_socket; struct socket_wq *wq; if (chelsio_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (sk_has_sleepers(sk)) wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async_compat(sock, wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } static inline int tcp_memory_free(struct sock *sk) { return chelsio_wspace(sk) > 0; } /* * Wait for memory to become available, either space in a socket's send buffer * or system memory. */ static int wait_for_mem(struct sock *sk, long *timeo_p) { int sndbuf, err = 0; long vm_wait = 0; long current_timeo = *timeo_p; bool noblock = (*timeo_p ? false : true); DEFINE_WAIT(wait); /* * We open code tcp_memory_free() because we need it outside the * socket lock and chelsio_wspace() isn't safe there. 
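 *
 * (A minimal sketch of the equivalence, assuming the names above: the
 * open-coded test below,
 *
 *	sndbuf = TOM_TUNABLE(CPL_IO_STATE(sk)->toedev, max_host_sndbuf);
 *	free   = sndbuf > sk->sk_wmem_queued;
 *
 * computes the same result as tcp_memory_free(sk) would, minus the
 * NULL-toedev guard in chelsio_wspace() that requires the socket lock.)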
 */
	sndbuf = TOM_TUNABLE(CPL_IO_STATE(sk)->toedev, max_host_sndbuf);
	if (sndbuf > sk->sk_wmem_queued)
		current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;

	while (1) {
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
			goto do_error;
		if (!*timeo_p) {
			if (noblock)
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			goto do_nonblock;
		}
		if (signal_pending(current))
			goto do_interrupted;
		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		if (sndbuf > sk->sk_wmem_queued && !vm_wait)
			break;

		sk_set_bit(SOCK_NOSPACE, sk);
		sk->sk_write_pending++;
		sk_wait_event(sk, &current_timeo, sk->sk_err ||
			      (sk->sk_shutdown & SEND_SHUTDOWN) ||
			      (sndbuf > sk->sk_wmem_queued && !vm_wait));
		sk->sk_write_pending--;

		if (vm_wait) {
			vm_wait -= current_timeo;
			current_timeo = *timeo_p;
			if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
			    (current_timeo -= vm_wait) < 0)
				current_timeo = 0;
			vm_wait = 0;
		}
		*timeo_p = current_timeo;
	}
out:
	finish_wait(sk_sleep(sk), &wait);
	return err;

do_error:
	err = -EPIPE;
	goto out;
do_nonblock:
	err = -EAGAIN;
	goto out;
do_interrupted:
	err = sock_intr_errno(*timeo_p);
	goto out;
}

void skb_entail(struct sock *sk, struct sk_buff *skb, int flags)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	ULP_SKB_CB(skb)->seq = tp->write_seq;
	ULP_SKB_CB(skb)->flags = flags;
	__skb_queue_tail(&cplios->tx_queue, skb);
	sk->sk_wmem_queued += skb->truesize;	// tcp_charge_skb(sk, skb);

	// Do not share pages across sk_buffs
	if (TCP_PAGE(sk) && TCP_OFF(sk)) {
		put_page(TCP_PAGE(sk));
		TCP_PAGE(sk) = NULL;
		TCP_OFF(sk) = 0;
	}
}

/*
 * Returns true if a connection should send more data to the TOE ASAP.
 */
static inline int should_push(struct sock *sk)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct toedev *dev = cplios->toedev;

	/*
	 * If we've released our offload resources there's nothing to do ...
	 */
	if (dev == NULL)
		return 0;

	/*
	 * If there aren't any work requests in flight, or there isn't enough
	 * data in flight, or Nagle is off then send the current TX_DATA,
	 * otherwise hold it and wait to accumulate more data.
	 */
	return cplios->wr_credits == cplios->wr_max_credits ||
		tp->snd_nxt - tp->snd_una <= TOM_TUNABLE(dev, tx_hold_thres) ||
		(tp->nonagle & TCP_NAGLE_OFF);
}

/*
 * Returns true if a TCP socket is corked.
 */
static inline int corked(const struct tcp_sock *tp, int flags)
{
	return (flags & MSG_MORE) | (tp->nonagle & TCP_NAGLE_CORK);
}

/*
 * Returns true if a send should try to push new data.
 */
static inline int send_should_push(struct sock *sk, int flags)
{
	return should_push(sk) && !corked(tcp_sk(sk), flags);
}

static inline void tx_skb_finalize(struct sk_buff *skb)
{
	struct ulp_skb_cb *cb = ULP_SKB_CB(skb);

#if defined(CONFIG_T4_ZCOPY_SENDMSG) || defined(CONFIG_T4_ZCOPY_SENDMSG_MODULE)
	/*
	 * XXX We don't want to finalize an skb if it's flagged for ZCOPY
	 * XXX since we'll end up losing the flag. This needs to be looked
	 * XXX at more closely since we're blindly clearing a bunch of flags
	 * XXX here. Most of these flags (including those for ZCOPY)
	 * XXX probably ought to be retained rather than tossed and we
	 * XXX should certainly have an assert for flags that shouldn't
	 * XXX find their way into this routine ...
*/ if (cb->flags & (ULPCB_FLAG_ZCOPY| ULPCB_FLAG_ZCOPY_COW| ULPCB_FLAG_ZCOPY_COW_SKIP)) return; #endif if (!(cb->flags & ULPCB_FLAG_NO_HDR)) cb->flags = ULPCB_FLAG_NEED_HDR; cb->flags |= ULPCB_FLAG_NO_APPEND; } static inline void mark_urg(struct tcp_sock *tp, int flags, struct sk_buff *skb) { if (unlikely(flags & MSG_OOB)) { tp->snd_up = tp->write_seq; ULP_SKB_CB(skb)->flags = ULPCB_FLAG_URG | ULPCB_FLAG_BARRIER | ULPCB_FLAG_NO_APPEND | ULPCB_FLAG_NEED_HDR; } } /* * Decide if the last frame on the send queue needs any special annotations * (e.g., marked URG) and whether it should be transmitted immediately or * held for additional data. This is the only routine that performs the full * suite of tests for a Tx packet and therefore must be called for the last * packet added by the various send*() APIs. */ static void tcp_push(struct sock *sk, int flags) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); int qlen = skb_queue_len(&cplios->tx_queue); if (likely(qlen)) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = skb_peek_tail(&cplios->tx_queue); mark_urg(tp, flags, skb); if (!(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) && corked(tp, flags)) { ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_HOLD; return; } ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_HOLD; if (qlen == 1 && ((ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) || should_push(sk))) t4_push_frames(sk, 1); } } static void tcp_uncork(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->nonagle & TCP_NAGLE_CORK) { tp->nonagle &= ~TCP_NAGLE_CORK; tcp_push(sk, 0); } } /* * Try to transmit the send queue if it has just one packet. This is intended * to be called as full packets are added to the send queue by the various * send*() APIs when we expect additional packets to be generated by the * current API call. It should not be called for the last packet generated, * use the full tcp_push call above for that. */ static inline void push_frames_if_head(struct sock *sk) { if (skb_queue_len(&CPL_IO_STATE(sk)->tx_queue) == 1) t4_push_frames(sk, 1); } static struct sk_buff *alloc_tx_skb(struct sock *sk, int size) { struct sk_buff *skb; skb = alloc_skb(size + TX_HEADER_LEN, sk->sk_allocation); if (likely(skb)) { skb_reserve(skb, TX_HEADER_LEN); skb_entail(sk, skb, ULPCB_FLAG_NEED_HDR); skb_reset_transport_header(skb); } return skb; } /* * Calculate the size for a new send sk_buff. It's maximum size so we can * pack lots of data into it, unless we plan to send it immediately, in which * case we size it more tightly. * * Note: we don't bother compensating for MSS < PAGE_SIZE because it doesn't * arise in normal cases and when it does we are just wasting memory. */ static inline int select_size(struct sock *sk, int io_len, int flags, int len) { const int pgbreak = SKB_MAX_HEAD(len); /* * If the data wouldn't fit in the main body anyway, put only the * header in the main body so it can use immediate data and place all * the payload in page fragments. */ if (io_len > pgbreak) return 0; /* * If we will be accumulating payload get a large main body. 
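 *
 * A worked example of the three cases (a sketch, assuming 4KB pages, so
 * pgbreak = SKB_MAX_HEAD(len) is a bit under PAGE_SIZE): a 64KB send
 * returns 0 and the payload goes entirely into page fragments; a small
 * send that will be held to accumulate more data returns pgbreak so that
 * later appends can use the skb body; a small send that will be pushed
 * immediately returns io_len for a tight fit.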
*/ if (!send_should_push(sk, flags)) return pgbreak; return io_len; } static int chelsio_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { long timeo; int mss, err, copied = 0; struct cpl_io_state *cplios; struct tcp_sock *tp = tcp_sk(sk); lock_sock(sk); if (sk->sk_prot->sendpage != chelsio_sendpage) { release_sock(sk); if (sk->sk_prot->sendpage) return sk->sk_prot->sendpage(sk, page, offset, size, flags); else return sk->sk_socket->ops->sendpage(sk->sk_socket, page, offset, size, flags); } cplios = CPL_IO_STATE(sk); timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); /* Wait for connection establishment to finish. */ if (!sk_in_state(sk, TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && (err = sk_stream_wait_connect(sk, &timeo)) != 0) goto out_err; if (ma_fail_chelsio_sendpage(sk, timeo)) return -EAGAIN; sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto out_err; mss = cplios->mss; cplios_set_flag(cplios, CPLIOS_TX_MORE_DATA); while (size > 0) { int copy, i; struct sk_buff *skb = skb_peek_tail(&cplios->tx_queue); if (!skb || (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) || (copy = mss - skb->len) <= 0) { new_buf: if (!tcp_memory_free(sk)) goto wait_for_sndbuf; if (is_tls_offload(sk) && tls_tx_key(sk)) { skb = alloc_tls_tx_skb(sk, select_size(sk, size, flags, TLS_TX_HEADER_LEN), true); if (skb) ULP_SKB_CB(skb)->ulp.tls.type = CONTENT_TYPE_APP_DATA; } else { skb = alloc_tx_skb(sk, 0); } if (!skb) goto wait_for_memory; copy = mss; } if (copy > size) copy = size; i = skb_shinfo(skb)->nr_frags; if (skb_can_coalesce(skb, i, page, offset)) { skb_shinfo(skb)->frags[i - 1].size += copy; } else if (i < MAX_SKB_FRAGS) { get_page(page); skb_fill_page_desc(skb, i, page, offset, copy); } else { tx_skb_finalize(skb); push_frames_if_head(sk); goto new_buf; } skb->len += copy; if (skb->len == mss) tx_skb_finalize(skb); skb->data_len += copy; skb->truesize += copy; sk->sk_wmem_queued += copy; tp->write_seq += copy; copied += copy; offset += copy; size -= copy; if (corked(tp, flags) && (sk_stream_wspace(sk) < sk_stream_min_wspace(sk))) ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_NO_APPEND; if (!size) break; if (unlikely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND)) push_frames_if_head(sk); continue; wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: if ((err = wait_for_mem(sk, &timeo)) != 0) goto do_error; } out: cplios_reset_flag(cplios, CPLIOS_TX_MORE_DATA); if (copied) tcp_push(sk, flags); done: release_sock(sk); return copied; do_error: if (copied) goto out; out_err: if (sock_flag(sk, SOCK_OFFLOADED)) cplios_reset_flag(cplios, CPLIOS_TX_MORE_DATA); copied = sk_stream_error(sk, flags, err); goto done; } /* * Add a list of skbs to a socket send queue. This interface is intended for * use by in-kernel ULPs. The callers must comply with the max size limit of the * device and have a headroom of at least TX_HEADER_LEN bytes. 
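 *
 * A minimal usage sketch (hypothetical in-kernel caller; the payload
 * buffer and GFP flags are illustrative only):
 *
 *	skb = alloc_skb(len + TX_HEADER_LEN, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, TX_HEADER_LEN);		// required headroom
 *	memcpy(__skb_put(skb, len), payload, len);	// ULP payload
 *	sent = t4_sendskb(sk, skb, 0);			// consumes the skb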
 */
int t4_sendskb(struct sock *sk, struct sk_buff *skb, int flags)
{
	struct sk_buff *next;
	struct cpl_io_state *cplios;
	struct tcp_sock *tp = tcp_sk(sk);
	int err, copied = 0;
	long timeo;
	unsigned int cb_flags;

	lock_sock(sk);
	cplios = CPL_IO_STATE(sk);
	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	if (!sk_in_state(sk, TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
	    (err = sk_stream_wait_connect(sk, &timeo)) != 0)
		goto out_err;

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	/*
	 * We check for send buffer space once for the whole skb list. It
	 * isn't critical if we end up overrunning the send buffer limit as we
	 * do not allocate any new memory. The benefit is we don't need to
	 * perform intermediate packet pushes.
	 */
	while (!tcp_memory_free(sk)) {
		sk_set_bit(SOCK_NOSPACE, sk);
		if ((err = wait_for_mem(sk, &timeo)) != 0)
			goto out_err;
	}

	while (skb) {
		cb_flags = ULP_SKB_CB(skb)->flags | ULPCB_FLAG_NO_APPEND;

		/* No WR header for memory write skb */
		if (likely(!(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_MEMWRITE))) {
			if (unlikely(skb_headroom(skb) < TX_HEADER_LEN)) {
				err = -EINVAL;
				goto out_err;
			}
			if (unlikely(!skb->len)) {
				err = -EMSGSIZE;
				goto out_err;
			}
			tp->write_seq += skb->len + ulp_extra_len(skb) +
					 skb_ulp_len_adjust(skb);
			cb_flags |= ULPCB_FLAG_NEED_HDR;
		}

		next = skb->next;
		skb->next = NULL;
		skb_entail(sk, skb, cb_flags);
		copied += skb->len;
		skb = next;
	}
done:
	if (likely(skb_queue_len(&cplios->tx_queue)))
		t4_push_frames(sk, 1);
	release_sock(sk);
	return copied;

out_err:
	if (copied == 0)
		copied = sk_stream_error(sk, flags, err);
	goto done;
}
EXPORT_SYMBOL(t4_sendskb);

#if defined(CONFIG_T4_ZCOPY_SENDMSG) || defined(CONFIG_T4_ZCOPY_SENDMSG_MODULE)
/*
 * ZCOPY_SENDMSG maps (if necessary) and pins a user space buffer instead of
 * copying the payload from user- to kernel space. In the normal mode of
 * operation, we block until the DMA has completed and it is safe to return
 * (considering that the user might modify the buffer). Since host bus
 * performance (PCI-E x8 and PCI-X 2.0) now exceeds the wire speed, this
 * actually works pretty well. In addition, I added some tunables to do a
 * hybrid scheme where the end of the user space buffer is copied (at the same
 * time the beginning of the buffer is DMAed). The mechanism provides enough
 * pipelining to achieve 10Gbps line rate on a single connection with moderate
 * CPU utilization.
 *
 * Now, the exception (which as usual makes up most of the code and
 * complexity): while unlikely, there are scenarios where we want to return
 * before the DMA completes (i.e. the DMA might not complete if a connection
 * doesn't drain (somebody unplugged the cable *&%!) or we want to return for
 * another reason, e.g. because we got a signal). In that case, we must make
 * sure that the user doesn't modify the buffer before the DMA has
 * completed... yes, you guessed correctly, by remapping the buffer as COW and
 * yes, that has some cost associated with it, starting with a mandatory TLB
 * flush and a potential page fault and buffer copy (what we wanted to avoid).
 * However, it is NOT THE NORMAL case and rare!
 *
 * Written by Felix Marti (felix@chelsio.com)
 */
#include
#include
#include

#define ZCOPY_PRT(m)

/*
 * zcopy_to_skb() maps the user space buffer (from) and fills in the skb
 * page descriptors to point to the buffer.
 */
static int t4_zcopy_to_skb(struct sock *sk, struct sk_buff *skb,
			   struct iov_iter *from, size_t copy)
{
	int frag = 0;
	int total = 0;
	unsigned int off;
	unsigned int numpages;
	unsigned int locked;
	unsigned int lock_limit;
	unsigned long startaddr = (unsigned long)from->iov->iov_base +
				  from->iov_offset;
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);

	off = startaddr & (PAGE_SIZE - 1);
	numpages = (copy + off + (PAGE_SIZE - 1)) / PAGE_SIZE;

	down_read(&current->mm->mmap_sem);
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	locked = numpages + current->mm->pinned_vm;
	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
		up_read(&current->mm->mmap_sem);
		return -ENOMEM;
	}
	up_read(&current->mm->mmap_sem);

	while (total != copy) {
		struct page *pages[MAX_SKB_FRAGS];
		size_t off;
		ssize_t ret;
		int copied = 0;
		int n = 0;

		if (frag == MAX_SKB_FRAGS) {
			/* come back with new buffer */
			skb_vaddr_set(skb, startaddr);
			return total;
		}

		ret = iov_iter_get_pages(from, pages, copy - total,
					 MAX_SKB_FRAGS - frag, &off);
		if (ret < 0)
			return -EFAULT;

		while (ret) {
			int size = min_t(int, ret, PAGE_SIZE - off);

			skb_fill_page_desc(skb, frag++, pages[n++], off, size);
			off = 0;
			ret -= size;
			copied += size;
		}

		iov_iter_advance(from, copied);
		skb->len += copied;
		skb->data_len += copied;
		skb->truesize += copied;
		sk->sk_wmem_queued += copied;
		cplios->zcopy_dma_unacked += copied;
		total += copied;
	}
	skb_vaddr_set(skb, startaddr);
	return total;
}

/*
 * If we're on an older kernel, we don't have the pte_offset_map_lock() macro
 * available to prevent race conditions accessing PTEs in an atomic fashion.
 * But on newer kernels, we use that mechanism exclusively and don't take the
 * memory map spin lock ... This code is modeled on the mprotect() code
 * which does exactly what we want but isn't exported from the kernel.
 */
#if defined(pte_offset_map_lock)
# define mprotect_page_table_lock(mm) \
	do { } while (0)
# define mprotect_page_table_unlock(mm) \
	do { } while (0)
#else
# define mprotect_page_table_lock(mm) \
	do { spin_lock(&(mm)->page_table_lock); } while (0)
# define mprotect_page_table_unlock(mm) \
	do { spin_unlock(&(mm)->page_table_lock); } while (0)
# define pgd_none_or_clear_bad(pgd) \
	(pgd_none(*(pgd)) || unlikely(pgd_bad(*(pgd))))
# define pud_none_or_clear_bad(pud) \
	(pud_none(*(pud)) || unlikely(pud_bad(*(pud))))
# define pmd_none_or_clear_bad(pmd) \
	(pmd_none(*(pmd)) || unlikely(pmd_bad(*(pmd))))
# define pte_offset_map_lock(mm, pmd, address, ptl) \
	pte_offset_map(pmd, address)
# define pte_unmap_unlock(pte, ptl) \
	pte_unmap(pte)
#endif /* !defined(pte_offset_map_lock) */

/*
 * We have an skb which has outstanding zero-copy DMA references to user pages
 * but we need to return to the user. This sometimes happens when an
 * application sets up a timer or the user types a ^C. Since the DMA hasn't
 * been acknowledged yet, we need to mark all of the pages referenced by the
 * skb as copy-on-write in order to fulfill standard UNIX write() semantics.
 * (I.e. writes to application memory buffers after a write() call returns
 * cannot affect the actual write results.)
 * In the case the VMA is read-only, we don't want to have the mapcount
 * decreased later when in the ULPCB_FLAG_ZCOPY_COW state, so don't transition
 * to that state. Also, zcopy_skb_dma_pending() is called when in the
 * ULPCB_FLAG_ZCOPY state at different stages of the skb's path through the TX
 * lists, so make sure read-only VMAs are only looked at once.
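 *
 * A compact sketch of the flag transitions described above (illustrative
 * only; the real transitions happen inline in zcopy_skb_dma_pending()
 * below):
 *
 *	ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_ZCOPY;
 *	ULP_SKB_CB(skb)->flags |= read_only_vma ?
 *		ULPCB_FLAG_ZCOPY_COW_SKIP :	// read-only VMA, no COW
 *		ULPCB_FLAG_ZCOPY_COW;		// pages now marked COW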
 */
static int zcopy_skb_dma_pending(struct sock *sk, struct sk_buff *skb)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct vm_area_struct *vma;
	unsigned int wr_hdr_len = ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR ?
				  0 : sizeof(struct fw_ofld_tx_data_wr);
	unsigned int len = skb->len - wr_hdr_len;
	unsigned long address = skb_vaddr(skb);
	unsigned long end = PAGE_ALIGN(address + len);
	int i;

	address &= PAGE_MASK;

	down_write(&current->mm->mmap_sem);
	vma = find_vma(current->mm, skb_vaddr(skb));
	if (!(vma->vm_flags & VM_WRITE)) {
		vma->vm_flags &= ~VM_MAYWRITE;
		up_write(&current->mm->mmap_sem);
		ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_ZCOPY;
		ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_ZCOPY_COW_SKIP;
		cplios->zcopy_dma_unacked -= len;
		return 0;
	}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
	/*
	 * If Transparent Huge Pages are configured into the kernel then our
	 * write buffer may overlap one or more THPs. At the Virtual Memory
	 * Area level these look like "normal" (non-HUGE) VMAs. But buried
	 * within the PGD/PUD/PMD structures, we may have THPs and a normal
	 * Copy-On-Write process of marking PTEs won't work. It is possible
	 * to handle these natively but difficult and any code to do this
	 * would be subject to future architectural changes in THPs. So,
	 * instead we do another Get User Pages call with a request to "split"
	 * any THPs which are found into "normal" pages and then we drop the
	 * additional reference counts.
	 */
	if (!is_vm_hugetlb_page(vma)) {
		struct page *pages[MAX_SKB_FRAGS];
		int numpages;
		int locked = 1;

		numpages = t4_get_user_pages_locked_with_flags_nowait(
				skb_vaddr(skb), skb_shinfo(skb)->nr_frags,
				FOLL_SPLIT, pages, &locked);
		if (numpages > 0)
			for (i = 0; i < numpages; i++)
				put_page(pages[i]);
	}
#endif

	mprotect_page_table_lock(current->mm);
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, address += PAGE_SIZE) {
		pgd_t *pgd;
		pud_t *pud;
		pmd_t *pmd;
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		/* make sure the page doesn't go away */
		atomic_inc(&skb_frag_page(frag)->_mapcount);

		/*
		 * Dive down the PGD/PUD/PMD/PTE hierarchy for the page and
		 * mark it COW. When we have a ZERO_PAGE() mapping, some
		 * portions of the hierarchy may be missing. Since the
		 * ZERO_PAGE() is already COW and can never change, there's
		 * nothing we need to do.
 */
		if ((pgd = pgd_offset(current->mm, address),
		     !(pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))) &&
		    (pud = pud_offset(pgd, address),
		     !(pud_none(*pud) || unlikely(pud_bad(*pud)))) &&
		    (pmd = pmd_offset(pud, address),
		     !(pmd_none(*pmd) || unlikely(pmd_bad(*pmd))))) {
			spinlock_t *ptl __attribute__((unused));
			pte_t *pte = t4_pte_offset_map_lock(current->mm, pmd,
							    address, &ptl);

			if (pte != NULL) {
#if defined(CONFIG_ARM64)
				BUG_ON(pte_exec(*pte));
#endif
				if (pte_present(*pte))
					t4_ptep_set_wrprotect(current->mm,
							      address, pte);
			}
			pte_unmap_unlock(pte, ptl);
		}
	}
	mprotect_page_table_unlock(current->mm);

	t4_flush_tlb_range(vma, skb_vaddr(skb), end);
	up_write(&current->mm->mmap_sem);

	ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_ZCOPY;
	ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_ZCOPY_COW;
	cplios->zcopy_dma_unacked -= len;

#ifdef T4_TRACE
	T4_TRACE5(TIDTB(sk),
		  "zcopy_skb_dma_pending: address 0x%lx len %u mm %p "
		  "mm_count %d need_hdr %d",
		  address, len, current->mm,
		  atomic_read(&current->mm->mm_count),
		  ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR);
#endif
	return 0;
}

static void zcopy_skb_dma_complete(struct sock *sk, struct sk_buff *skb)
{
	int i;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		atomic_dec(&skb_frag_page(frag)->_mapcount);
	}
	ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_ZCOPY_COW;
}

static int zcopy_dma_pending(struct sock *sk)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct sk_buff *skb;
	int ret = 0;

	wr_queue_walk(sk, skb) {
		if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_ZCOPY) {
			ret = zcopy_skb_dma_pending(sk, skb);
			if (ret)
				return ret;
		}
	}

	skb_queue_walk(&cplios->tx_queue, skb) {
		if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_ZCOPY) {
			ret = zcopy_skb_dma_pending(sk, skb);
			if (ret)
				return ret;
		}
	}
	return 0;
}

#if defined(CONFIG_T4_ZCOPY_SENDMSG) || defined(CONFIG_T4_ZCOPY_SENDMSG_MODULE)
/*
 * Clean up any residual state for Zero-Copy TX Data skb's.
 */
void t4_zcopy_cleanup_skb(struct sock *sk, struct sk_buff *skb)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	unsigned int hdr_len = 0;

	/*
	 * If ULPCB_FLAG_NEED_HDR isn't set, then we've gone through
	 * t4_push_frames() and incremented tx_dma_pending (because Zero-Copy
	 * skb's always start out with { ULPCB_FLAG_NEED_HDR,
	 * ULPCB_FLAG_COMPL, ULPCB_FLAG_ZCOPY } and ULPCB_FLAG_NEED_HDR is
	 * only removed in t4_push_frames() when ULPCB_FLAG_COMPL is set.
	 * This is fragile logic so it must only be changed with care.
	 */
	if (!(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR)) {
		hdr_len = sizeof(struct fw_ofld_tx_data_wr);
	}

	/*
	 * If ULPCB_FLAG_ZCOPY is set, then either the skb is still on the
	 * Socket Write Queue or the Hardware TX Data Work Request Write Queue
	 * and we've charged its potential or outstanding DMA to
	 * zcopy_dma_unacked.
	 *
	 * If ULPCB_FLAG_ZCOPY_COW is set, then we've marked all the user
	 * pages backing the skb as Copy On Write and allowed an early return
	 * to the user application. In this case, the skb was on the Hardware
	 * TX Data Work Request Write Queue and we must have received a Write
	 * Acknowledgement from the hardware for this skb's TX Data Work
	 * Request.
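 *
 * For example (a sketch of the accounting, assuming the flow above): an
 * skb queued with { NEED_HDR, COMPL, ZCOPY } that has been sent by
 * t4_push_frames() arrives here without NEED_HDR, so hdr_len is
 * sizeof(struct fw_ofld_tx_data_wr) and only the payload bytes
 * (skb->len - hdr_len) are subtracted from zcopy_dma_unacked.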
*/ if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_ZCOPY) { ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_ZCOPY; cplios->zcopy_dma_unacked -= (skb->len - hdr_len); if (!cplios->zcopy_dma_unacked) { if (likely(!sock_flag(sk, SOCK_DEAD))) __wake_up(sk_sleep(sk), TASK_INTERRUPTIBLE, 0, NULL); } } else if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_ZCOPY_COW) zcopy_skb_dma_complete(sk, skb); skb_vaddr_set(skb, 0); } #endif static void zcopy_wait(struct sock *sk, long timeout, int ret_pending) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); DEFINE_WAIT(wait); timeout = max_t(long, HZ / 2, timeout); while (cplios->zcopy_dma_unacked && !(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))) { if (ret_pending) { if (signal_pending(current) || !timeout) { #ifdef T4_TRACE T4_TRACE4(TIDTB(sk), "zcopy_wait: sk_err %d " "signal_pending 0x%x timeout %ld " "zcopy_dma_unacked %d", sk->sk_err, signal_pending(current), timeout, cplios->zcopy_dma_unacked); #endif if (!zcopy_dma_pending(sk)) { BUG_ON(cplios->zcopy_dma_unacked); break; } } } else if (!timeout) timeout = HZ / 2; #ifdef T4_TRACE T4_TRACE1(TIDTB(sk), "zcopy_wait: GTS zcopy_dma_unacked %d", cplios->zcopy_dma_unacked); #endif prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); } if (sk->sk_err == ECONNRESET) { if (cplios->zcopy_dma_unacked) { if (!zcopy_dma_pending(sk)) BUG_ON(cplios->zcopy_dma_unacked); } } finish_wait(sk_sleep(sk), &wait); } #endif static inline int t4_skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from, struct sk_buff *skb, struct page *page, int off, int copy) { int err; err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off, copy, skb->len); if (err) return err; skb->len += copy; skb->data_len += copy; skb->truesize += copy; sk->sk_wmem_queued += copy; return 0; } static int chelsio_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct cpl_io_state *cplios; long timeo; struct sk_buff *skb = NULL; struct tcp_sock *tp = tcp_sk(sk); struct toedev *tdev = NULL; int mss, flags, err; int copied = 0; struct tom_data *d; #if defined(CONFIG_T4_ZCOPY_SENDMSG) || defined(CONFIG_T4_ZCOPY_SENDMSG_MODULE) int zcopy_size = 0; int zcopied = 0; int partial_thres; #endif int tls_copied = 0; int tls_size = 0; struct tls_ofld_info *tls_ofld = NULL; lock_sock(sk); flags = msg->msg_flags; timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); if (!sk_in_state(sk, TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { err = sk_stream_wait_connect(sk, &timeo); if (err) goto out_err; } /* * Offloaded active opens can fail and revert back to the stack. * Yet the offloaded ops have already been made visible to applications * meaning they could well be executing after the transition back to * the stack. That has unfortunate consequences. If we find ourselves * in an offloaded op that could have started in SYN_SENT we need to * check if we should still be in there. 
 */
	if (sk->sk_prot->sendmsg != chelsio_sendmsg) {
		release_sock(sk);
		if (sk->sk_prot->sendmsg)
			return sk->sk_prot->sendmsg(sk, msg, size);
		else
			return sk->sk_socket->ops->sendmsg(sk->sk_socket,
							   msg, size);
	}

	if (ma_fail_chelsio_sendmsg(sk, timeo))
		return -EAGAIN;

	cplios = CPL_IO_STATE(sk);
	tdev = cplios->toedev;

	/* This should be in poll */
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	mss = cplios->mss;
	d = TOM_DATA(tdev);

#if defined(CONFIG_T4_ZCOPY_SENDMSG) || defined(CONFIG_T4_ZCOPY_SENDMSG_MODULE)
	partial_thres = (cplios->port_speed > SPEED_10000) ?
		TOM_TUNABLE(tdev, zcopy_sendmsg_partial_xlthres) :
		TOM_TUNABLE(tdev, zcopy_sendmsg_partial_thres);

	if (size >= partial_thres &&
	    !(msg->msg_iter.nr_segs > 1) &&
	    !cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN) && !corked(tp, flags) &&
	    can_do_mlock() && !segment_eq(get_fs(), KERNEL_DS) &&
	    !is_tls_offload(sk))
		zcopy_size = size - TOM_TUNABLE(tdev,
						zcopy_sendmsg_partial_copy);

	/* In the case of NON-BLOCKING IO we don't want to exceed the
	 * send buffer at all, which could cause delays in the zcopy path.
	 */
	if ((zcopy_size > 0) && (flags & MSG_DONTWAIT)) {
		int rem = sk->sk_sndbuf - sk->sk_wmem_queued;

		if (rem <= 0) {
			err = -EAGAIN;
			goto do_error;
		} else if (size > rem)
			size = rem;
	}

	/* If any of the pages are problematic, if the address range crosses
	 * a VMA boundary, or if the base address of any iovec falls in a VM
	 * area with the VM_EXEC or VM_SHARED flags set (vm_flags may vary
	 * between the iovecs in the message), we just reject the zero-copy
	 * effort.
	 */
	if (zcopy_size) {
		int iovlen;
		const struct iovec *iov;
		struct vm_area_struct *vma;

		down_read(&current->mm->mmap_sem);
		for (iovlen = msg->msg_iter.nr_segs, iov = msg->msg_iter.iov;
		     iovlen--; iov++) {
			unsigned long from = (unsigned long)iov->iov_base;

			vma = find_vma(current->mm, from);
			if (!vma || (vma->vm_start > from) ||
			    (vma->vm_end < from + size) || !zcopy_vma(vma)) {
				zcopy_size = 0;
				break;
			}
		}
		up_read(&current->mm->mmap_sem);
	}
#endif
	cplios_set_flag(cplios, CPLIOS_TX_MORE_DATA);

	while (msg_data_left(msg)) {
		int copy = 0;

		skb = skb_peek_tail(&cplios->tx_queue);
		if (skb) {
			copy = mss - skb->len;
			skb->ip_summed = CHECKSUM_UNNECESSARY;
		}

		/*
		 * Inline TLS: strip the 5B record header when TX is
		 * encrypted by the HW.
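		 *
		 * For reference, the 5 bytes being stripped are the standard
		 * TLS record header (a sketch; the driver's own struct
		 * tls_hdr is declared in its TLS headers):
		 *
		 *	struct tls_record_hdr {	// TLS_HEADER_LENGTH == 5
		 *		__u8	type;	// e.g. 23 (application data)
		 *		__be16	version;// e.g. 0x0303 for TLS 1.2
		 *		__be16	length;	// payload length
		 *	} __attribute__((packed));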
*/ if (is_tls_offload(sk) && tls_tx_key(sk)) tls_ofld = TLS_IO_STATE(sk); if (tls_ofld && !tls_ofld->sd.left) { struct tls_hdr thdr; tls_ofld = TLS_IO_STATE(sk); tls_size = tls_header_read(&thdr, &msg->msg_iter); size -= TLS_HEADER_LENGTH; tls_ofld->sd.left = tls_size; tls_ofld->sd.type = thdr.type; tls_copied += TLS_HEADER_LENGTH; if (skb) ULP_SKB_CB(skb)->ulp.tls.type = tls_ofld->sd.type; } if (!skb #if defined(CONFIG_T4_ZCOPY_SENDMSG) || defined(CONFIG_T4_ZCOPY_SENDMSG_MODULE) || zcopy_size > 0 #endif || (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) || copy <= 0) { new_buf: if (skb) { tx_skb_finalize(skb); push_frames_if_head(sk); } if (!tcp_memory_free(sk)) goto wait_for_sndbuf; if (is_tls_offload(sk) && tls_tx_key(sk)) { skb = alloc_tls_tx_skb( sk, select_size(sk, tls_size, flags, TLS_TX_HEADER_LEN), false); } else { skb = alloc_tx_skb( sk, select_size(sk, size, flags, TX_HEADER_LEN)); } if (unlikely(!skb)) goto wait_for_memory; skb->ip_summed = CHECKSUM_UNNECESSARY; copy = mss; } if (copy > size) copy = size; #if defined(CONFIG_T4_ZCOPY_SENDMSG) || defined(CONFIG_T4_ZCOPY_SENDMSG_MODULE) if (zcopy_size > 0) { copy = min_t(int, copy, zcopy_size); if (is_tls_offload(sk) && tls_tx_key(sk)) copy = min_t(int, copy, tls_ofld->sd.left); copy = t4_zcopy_to_skb(sk, skb, &msg->msg_iter, copy); if (copy < 0) { if (err == -EFAULT) goto do_fault; zcopy_size = 0; continue; } copied += copy; zcopied += copy; size -= copy; zcopy_size -= copy; /* Decrement tls data copied */ if (is_tls_offload(sk) && tls_tx_key(sk)) tls_ofld->sd.left -= copy; tx_skb_finalize(skb); ULP_SKB_CB(skb)->flags = ULP_SKB_CB(skb)->flags | ULPCB_FLAG_COMPL | ULPCB_FLAG_ZCOPY; if (!size) { cplios_reset_flag(cplios, CPLIOS_TX_MORE_DATA); t4_push_frames(sk, 1); goto done; } else { t4_push_frames(sk, 1); continue; } } #endif if (skb_tailroom(skb) > 0) { copy = min(copy, skb_tailroom(skb)); if (is_tls_offload(sk) && tls_tx_key(sk)) copy = min_t(int, copy, tls_ofld->sd.left); err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); if (err) goto do_fault; } else { bool merge; int off = TCP_OFF(sk); int i = skb_shinfo(skb)->nr_frags; struct page *page = TCP_PAGE(sk); int pg_size = PAGE_SIZE; if (page) pg_size <<= compound_order(page); if (off < pg_size && skb_can_coalesce(skb, i, page, off)) { merge = 1; goto copy; } merge = 0; /* IV DSGL as last entry in skb frags. Prefer * to send single PDU for fast path, can also be * ensured by setting frag size MSS */ if (i == (is_tls_offload(sk) ? (MAX_SKB_FRAGS - 1) : MAX_SKB_FRAGS)) goto new_buf; if (page && off == pg_size) { put_page(page); TCP_PAGE(sk) = page = NULL; pg_size = PAGE_SIZE; } if (!page) { gfp_t gfp = sk->sk_allocation; int order = d->send_page_order; if (order) { page = alloc_pages( gfp| __GFP_COMP| __GFP_NOWARN| __GFP_NORETRY, order); if (page) pg_size <<= compound_order(page); } if (!page) { page = alloc_page(gfp); pg_size = PAGE_SIZE; } if (!page) goto wait_for_memory; off = 0; } copy: if (copy > pg_size - off) copy = pg_size - off; if (is_tls_offload(sk) && tls_tx_key(sk)) copy = min_t(int, copy, tls_ofld->sd.left); err = t4_skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, page, off, copy); if (unlikely(err)) { /* * If the page was new, give it to the * socket so it does not get leaked. */ if (!TCP_PAGE(sk)) { TCP_PAGE(sk) = page; TCP_OFF(sk) = 0; } goto do_fault; } /* Update the skb. 
*/ if (merge) { skb_shinfo(skb)->frags[i - 1].size += copy; } else { skb_fill_page_desc(skb, i, page, off, copy); if (off + copy < pg_size) { /* space left keep page */ get_page(page); TCP_PAGE(sk) = page; } else { TCP_PAGE(sk) = NULL; } } TCP_OFF(sk) = off + copy; } if (unlikely(skb->len == mss)) tx_skb_finalize(skb); tp->write_seq += copy; copied += copy; size -= copy; /* decrement tls data copied */ if (is_tls_offload(sk) && tls_tx_key(sk)) tls_ofld->sd.left -= copy; if (corked(tp, flags) && (sk_stream_wspace(sk) < sk_stream_min_wspace(sk))) ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_NO_APPEND; if (size == 0) goto out; if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) push_frames_if_head(sk); continue; wait_for_sndbuf: sk_set_bit(SOCK_NOSPACE, sk); wait_for_memory: err = wait_for_mem(sk, &timeo); if (err) goto do_error; } out: cplios_reset_flag(cplios, CPLIOS_TX_MORE_DATA); if (copied) tcp_push(sk, flags); done: #if defined(CONFIG_T4_ZCOPY_SENDMSG) || defined(CONFIG_T4_ZCOPY_SENDMSG_MODULE) if (zcopied > 0 && tdev) zcopy_wait(sk, timeo, TOM_TUNABLE(tdev, zcopy_sendmsg_ret_pending_dma)); #endif release_sock(sk); return copied + tls_copied; do_fault: if (!skb->len) { __skb_unlink(skb, &cplios->tx_queue); sk->sk_wmem_queued -= skb->truesize; __kfree_skb(skb); } do_error: if (copied) goto out; out_err: if (sock_flag(sk, SOCK_OFFLOADED)) cplios_reset_flag(CPL_IO_STATE(sk), CPLIOS_TX_MORE_DATA); copied = sk_stream_error(sk, flags, err); goto done; } static inline int is_delack_mode_valid(struct toedev *dev, struct sock *sk) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); return cplios->ulp_mode == ULP_MODE_NONE || (cplios->ulp_mode == ULP_MODE_TCPDDP && dev->ttid >= TOE_ID_CHELSIO_T4); } /* * Set of states for which we should return RX credits. */ #define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) /* * Called after some received data has been read. It returns RX credits * to the HW for the amount of data processed. */ void t4_cleanup_rbuf(struct sock *sk, int copied) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct tcp_sock *tp; struct toedev *dev; int dack_mode, must_send; u32 thres, credits, dack = 0; if (!sk_in_state(sk, CREDIT_RETURN_STATE)) return; t4_select_window(sk); tp = tcp_sk(sk); credits = tp->copied_seq - tp->rcv_wup; if (unlikely(!credits)) return; dev = cplios->toedev; thres = TOM_TUNABLE(dev, rx_credit_thres); if (unlikely(thres == 0)) return; if (is_delack_mode_valid(dev, sk)) { dack_mode = t4_select_delack(sk); if (unlikely(dack_mode != cplios->delack_mode)) { u32 r = tp->rcv_nxt - cplios->delack_seq; if (r >= tp->rcv_wnd || r >= 16 * MSS_CLAMP(tp)) dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(dack_mode); } } else dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); /* * For coalescing to work effectively ensure the receive window has * at least 16KB left. Can ignore check for TLS as no coalescing. 
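 *
 * Worked example (a sketch, assuming a non-zero rx_credit_thres): with
 * rcv_wnd = 256KB and 244KB of credits accumulated, 244KB + 16KB >= 256KB,
 * so must_send forces an immediate credit return even if the tunable
 * threshold has not been reached.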
*/ must_send = credits + 16384 >= tp->rcv_wnd; if (must_send || credits >= thres) tp->rcv_wup += t4_send_rx_credits(sk, credits, dack, must_send); } EXPORT_SYMBOL(t4_cleanup_rbuf); static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; skb_queue_walk(&sk->sk_receive_queue, skb) { u32 offset = seq - ULP_SKB_CB(skb)->seq; if (offset < skb->len) { *off = offset; return skb; } } return NULL; } static inline int ddp_rcv_coalesce_enabled(struct cpl_io_state *cplios, int last_recv_len) { struct toedev *tdev = cplios->toedev; int should_ddp; switch (cplios->port_speed) { case 100: case 1000: should_ddp = last_recv_len > min((int)DEFAULT_RCV_COALESCE_SIZE-1, TOM_TUNABLE(tdev, ddp_thres)); break; case 10000: case 25000: should_ddp = last_recv_len > TOM_TUNABLE(tdev, ddp_thres); break; default: should_ddp = last_recv_len > TOM_TUNABLE(tdev, ddp_xlthres); break; } return should_ddp; } /* * Returns whether a connection should enable DDP. This happens when all of * the following conditions are met: * - the connection's ULP mode is DDP * - DDP is not already enabled * - the last receive was above the DDP threshold * - receive buffers are in user space * - receive side isn't shutdown (handled by caller) * - the connection's receive window is big enough so that sizable buffers * can be posted without closing the window in the middle of DDP (checked * when the connection is offloaded) */ static int sk_should_ddp(const struct sock *sk, const struct tcp_sock *tp, int last_recv_len) { struct cpl_io_state *cplios = CPL_IO_STATE(sk); struct toedev *tdev = cplios->toedev; struct tom_data *d = TOM_DATA(tdev); unsigned int wsf; int should_ddp = cplios->ulp_mode == ULP_MODE_TCPDDP && !DDP_STATE(sk)->ddp_setup && ((can_do_mlock() && !segment_eq(get_fs(), KERNEL_DS)) || TOM_TUNABLE(tdev, kseg_ddp)); if (cplios->opt2 & V_RX_COALESCE(M_RX_COALESCE)) should_ddp = should_ddp && ddp_rcv_coalesce_enabled(cplios, last_recv_len); if (!should_ddp) return 0; /* d->lldi->sge_ingpadboundary refers to minimum indicate size */ wsf = cplios->port_speed ? DIV_ROUND_UP(cplios->port_speed, SPEED_10000) : 1; if (tcp_sk(sk)->rcv_wnd < (d->lldi->sge_ingpadboundary + wsf*DDP_RSVD_WIN)) return 0; return !bitmap_full(d->ppod_bmap, d->nppods); } static inline int is_ddp(const struct sk_buff *skb) { return skb_gl(skb) != NULL; } static inline int is_ddp_psh(const struct sk_buff *skb) { return is_ddp(skb) && (skb_ulp_ddp_flags(skb) & DDP_BF_PSH); } /* * Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carry the * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a * DDP buffer. */ static inline int copy_data(const struct sk_buff *skb, int offset, struct msghdr *msg, int len) { if (likely(!is_ddp(skb))) /* RX_DATA */ return skb_copy_datagram_msg(skb, offset, msg, len); else if (likely(skb_ulp_ddp_flags(skb) & DDP_BF_NOCOPY)) { /* user DDP */ iov_iter_advance(&msg->msg_iter, len); return 0; } return -EINVAL; } /* * Peek at data in a socket's receive buffer. */ static int peekmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags) { long timeo; struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); int copied = 0; u32 peek_seq, offset; size_t avail; /* amount of available data in current skb */ lock_sock(sk); timeo = sock_rcvtimeo(sk, nonblock); peek_seq = tp->copied_seq; do { if (unlikely(tp->urg_data && tp->urg_seq == peek_seq)) { if (copied) break; if (signal_pending(current)) { copied = timeo ? 
					sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		skb_queue_walk(&sk->sk_receive_queue, skb) {
			offset = peek_seq - ULP_SKB_CB(skb)->seq;
			if (offset < skb->len)
				goto found_ok_skb;
		}

		/* empty receive queue */
		if (copied)
			break;
		if (sock_flag(sk, SOCK_DONE))
			break;
		if (sk->sk_err) {
			copied = sock_error(sk);
			break;
		}
		if (sk_no_receive(sk))
			break;
		if (sk->sk_state == TCP_CLOSE) {
			copied = -ENOTCONN;
			break;
		}
		if (!timeo) {
			copied = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			copied = sock_intr_errno(timeo);
			break;
		}

		if (sk->sk_backlog.tail) {
			/* Do not sleep, just process backlog. */
			release_sock(sk);
			lock_sock(sk);
		} else
			sk_wait_data(sk, &timeo);

		if (unlikely(peek_seq != tp->copied_seq)) {
			if (net_ratelimit())
				printk(KERN_DEBUG "TCP(%s:%d): Application "
				       "bug, race in MSG_PEEK.\n",
				       current->comm, current->pid);
			peek_seq = tp->copied_seq;
		}
		continue;

found_ok_skb:
		avail = skb->len - offset;
		if (len < avail)
			avail = len;

		/*
		 * Do we have urgent data here? We need to skip over the
		 * urgent byte.
		 */
		if (unlikely(tp->urg_data)) {
			u32 urg_offset = tp->urg_seq - peek_seq;

			if (urg_offset < avail) {
				/*
				 * The amount of data we are preparing to copy
				 * contains urgent data.
				 */
				if (!urg_offset) { /* First byte is urgent */
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						peek_seq++;
						offset++;
						avail--;
						if (!avail)
							continue;
					}
				} else {
					/* stop short of the urgent data */
					avail = urg_offset;
				}
			}
		}

		/*
		 * If MSG_TRUNC is specified the data is discarded.
		 */
		if (likely(!(flags & MSG_TRUNC)))
			if (copy_data(skb, offset, msg, avail)) {
				if (!copied)
					copied = -EFAULT;
				break;
			}

		peek_seq += avail;
		copied += avail;
		len -= avail;
	} while (len > 0);

	release_sock(sk);
	return copied;
}

static int sk_wait_data_uninterruptible(struct sock *sk)
{
	int rc;
	long timeo = MAX_SCHEDULE_TIMEOUT;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, &timeo,
			   !skb_queue_empty(&sk->sk_receive_queue));
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	finish_wait(sk_sleep(sk), &wait);
	return rc;
}

/*
 * Called after a user buffer is posted to await DDP completion. The waiting
 * mode depends on the receive flags, which in turn determine the HW DDP
 * flags.
 *
 * - Without MSG_WAITALL we set up the DDP buffer with non-zero initial offset
 *   and enable the HW timeout. In this case we sleep uninterruptibly since we
 *   know the buffer will complete or time out in reasonable time.
 * - With MSG_WAITALL the HW timeout is initially disabled. If a signal
 *   arrives and the DDP is still on-going we turn on the timer and disable
 *   no-invalidate, then sleep uninterruptibly until the buffer completes.
 */
static inline int await_ddp_completion(struct sock *sk, int rcv_flags,
				       long *timeo)
{
	if (unlikely(rcv_flags & MSG_WAITALL)) {
		sk_wait_data(sk, timeo);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    sk_no_receive(sk))
			return 0;

		/* Got signal or timed out */
		t4_set_tcb_field(sk, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_PSH_NO_INVALIDATE1(1ULL) |
				 V_TF_DDP_PUSH_DISABLE_1(1ULL), 0);
	}

	return sk_wait_data_uninterruptible(sk);
}

#if defined(CONFIG_CHELSIO_IO_SPIN)
/*
 * Return nanosecond "cycle counter". This is used to time short intervals
 * via simple unsigned integer subtraction. E.g. (t1 - t0) < interval.
 */
static inline unsigned long long get_ns_cycles(void)
{
	struct timespec ts;

	ktime_get_ts(&ts);
	return (unsigned long long)ktime_to_ns(timespec_to_ktime(ts));
}
#endif

/*
 * Receive data from a socket into an application buffer.
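 *
 * (As a worked example of the "target" computation below: with
 * MSG_WAITALL, sock_rcvlowat(sk, 1, len) returns len, so the main loop
 * keeps going until the full request is satisfied; without it the target
 * is min(SO_RCVLOWAT, len), which defaults to a single byte.)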
*/ static int chelsio_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct cpl_io_state *cplios; struct tcp_sock *tp = tcp_sk(sk); int copied = 0, buffers_freed = 0; unsigned long avail; /* amount of available data in current skb */ int target; /* Read at least this many bytes */ int request; long timeo; int user_ddp_ok, user_ddp_pending = 0; struct ddp_state *p; struct tom_data *td; #if defined(CONFIG_CHELSIO_IO_SPIN) /* * Initialize I/O spin state variables. (We need to initialize * spin_ns; the others are to avoid annoying compiler warnings.) */ unsigned int spin_us = 0; unsigned long long spin_ns = 0; unsigned long long spin_start = 0; #endif /* Urgent data is handled by the SW stack's receive */ if (unlikely(flags & MSG_OOB)) return tcp_prot.recvmsg(sk, msg, len, nonblock, flags, addr_len); if (unlikely(flags & MSG_PEEK)) return peekmsg(sk, msg, len, nonblock, flags); if (tom_sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && (sk->sk_state == TCP_ESTABLISHED)) tom_sk_busy_loop(sk, nonblock); lock_sock(sk); if (sk->sk_prot->recvmsg != chelsio_recvmsg) { release_sock(sk); return sk->sk_prot->recvmsg(sk, msg, len, nonblock, flags, addr_len); } cplios = CPL_IO_STATE(sk); td = TOM_DATA(cplios->toedev); if (is_tls_offload(sk) && tls_rx_key(sk)) return chelsio_tlsv4_recvmsg(sk, msg, len, nonblock, flags, addr_len); timeo = sock_rcvtimeo(sk, nonblock); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); request = len; user_ddp_ok = (target <= iov_iter_single_seg_count(&msg->msg_iter)) && !((flags & MSG_WAITALL) && (msg->msg_iter.nr_segs > 1)); p = DDP_STATE(sk); /* * Check to see if we need to grow receive window. */ if (unlikely (cplios_flag(sk , CPLIOS_UPDATE_RCV_WND))) t4_cleanup_rbuf(sk, copied); if (p->ddp_setup) p->cancel_ubuf = 0; #if defined(CONFIG_CHELSIO_IO_SPIN) /* * If the administrator has selected to have us spin for recvmsg() * I/O, setup our I/O spin state variables. Rather than immediately * going to sleep waiting for ingress data when none is available, we * keep spinning for the specified time interval (specified in * microseconds) before giving up and sleeping waiting for new ingress * data. For latency-sensitive applications this can be a big win * (even though it does waste CPU). * * Note that we can actually be called with the socket in closing * state and with our offload resources released (including our TOE * Device). So we need to be paranoid here. */ if (cplios->toedev != NULL) { spin_us = TOM_TUNABLE(cplios->toedev, recvmsg_spin_us); if (spin_us) { spin_ns = (unsigned long long)spin_us * 1000; spin_start = get_ns_cycles(); } } #endif do { struct sk_buff *skb; u32 offset; p = DDP_STATE(sk); again: if (unlikely(tp->urg_data && tp->urg_seq == tp->copied_seq)) { if (copied) break; if (signal_pending(current)) { copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } } skb = skb_peek(&sk->sk_receive_queue); if (skb) goto found_ok_skb; /* * The receive queue is empty and here we are asking for more * data. Before we do anything else, check to see if we have * data queued up to send and if there's available write * space. If so, push it along and free up the write space. * This is a major win for request-response style * communication patterns and doesn't hurt bulk data * applications. 
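 *
 * (E.g. an RPC-style server that queues its response and then immediately
 * calls recvmsg() for the next request: the push below gets the queued
 * response onto the wire without waiting for more TX data to accumulate.)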
*/ if (cplios->wr_credits && skb_queue_len(&cplios->tx_queue) && t4_push_frames(sk, cplios->wr_credits == cplios->wr_max_credits)) sk->sk_write_space(sk); if (copied >= target && !sk->sk_backlog.tail && !user_ddp_pending) break; if (copied) { #ifdef T4_TRACE T4_TRACE5(TIDTB(sk), "chelsio_recvmsg: copied - break %d %d %d %d %d", sk->sk_err, sk->sk_state == TCP_CLOSE, sk_no_receive(sk), !timeo, signal_pending(current)); #endif if (sk->sk_err || sk->sk_state == TCP_CLOSE || sk_no_receive(sk) || signal_pending(current)) break; if (!timeo) { if (!sk->sk_backlog.tail && user_ddp_pending && t4_ddp_ubuf_pending(sk)) { long mintimeo = TOM_TUNABLE(cplios->toedev, recvmsg_ddp_wait_us); if (mintimeo > 0) { mintimeo = usecs_to_jiffies(mintimeo); sk_wait_data(sk, &mintimeo); } } break; } } else { #ifdef T4_TRACE T4_TRACE5(TIDTB(sk), "chelsio_recvmsg: !copied - break %d %d %d %d %d", sock_flag(sk, SOCK_DONE), sk->sk_err, sk_no_receive(sk), sk->sk_state == TCP_CLOSE, !timeo); #endif if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { copied = sock_error(sk); break; } if (sk_no_receive(sk)) break; if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; /* SOCK_DONE is off here */ break; } if (!timeo) { copied = -EAGAIN; break; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); break; } } if (sk->sk_backlog.tail && !user_ddp_pending) { /* Do not sleep, just process backlog. */ release_sock(sk); lock_sock(sk); t4_cleanup_rbuf(sk, copied); continue; } if (user_ddp_pending) { /* One shot at DDP if we already have enough data */ if (copied >= target) { user_ddp_ok = 0; } #ifdef T4_TRACE T4_TRACE0(TIDTB(sk), "chelsio_recvmsg: AWAIT"); #endif sk_wait_data(sk, &timeo); if (t4_ddp_cancel_push_disable(sk, !!(flags & MSG_WAITALL))) { t4_set_tcb_field(sk, W_TCB_RX_DDP_FLAGS, V_TF_DDP_PSHF_ENABLE_1(1ULL) | V_TF_DDP_PSH_NO_INVALIDATE1(1ULL) | V_TF_DDP_PUSH_DISABLE_1(1ULL), V_TF_DDP_PSHF_ENABLE_1(1ULL)); } #ifdef T4_TRACE T4_TRACE0(TIDTB(sk), "chelsio_recvmsg: AWAITed"); #endif } else if (copied >= target) break; else { t4_cleanup_rbuf(sk, copied); if ((flags & MSG_WAITALL) && p->ddp_setup && t4_ddp_indicate_ok(p)) { if ((iov_iter_single_seg_count(&msg->msg_iter) <= p->ind_size) || !user_ddp_ok) { p->indicate = tcp_sk(sk)->rcv_nxt; t4_setup_indicate_modrx(sk); p->indout_count++; } else if (user_ddp_ok) { p->ubuf_ddp_pending = user_ddp_pending = !t4_post_ubuf(sk, msg, nonblock, flags); if (!p->ubuf_ddp_pending) { p->post_failed++; if (p->post_failed >= TOM_TUNABLE(cplios->toedev, ddp_maxfail)) t4_shutdown_ddp(sk); } } } #if defined(CONFIG_CHELSIO_IO_SPIN) /* * If we're configured for spinning a bit before * giving up and going to sleep to wait for ingress * data, just retry to see if any data has arrived ... */ if (spin_ns && get_ns_cycles() - spin_start < spin_ns) { release_sock(sk); lock_sock(sk); continue; } #endif sk_wait_data(sk, &timeo); #if defined(CONFIG_CHELSIO_IO_SPIN) /* * If we're configured to spin a bit and the caller * has indicated that it wants to get all of the * requested data length, then set up our I/O spin * state to spin again. Otherwise, turn off I/O * spinning because the only reason we're back is * because there's more data or we timed out. (Mostly * this just saves the call to get_ns_cycles().) 
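			 *
			 * (For example, with recvmsg_spin_us = 100 we
			 * computed spin_ns = 100000 on entry, so the test
			 * below keeps re-polling the receive queue for up to
			 * 100us of wall-clock time before falling through to
			 * sk_wait_data().)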
*/ if (spin_ns) { if (flags & MSG_WAITALL) spin_start = get_ns_cycles(); else spin_ns = 0; } #endif #ifdef T4_TRACE T4_TRACE0(TIDTB(sk), "chelsio_recvmsg: DATA AWAITed"); #endif if (t4_ddp_cancel_push_disable(sk, !!(flags & MSG_WAITALL))) { t4_set_tcb_field(sk, W_TCB_RX_DDP_FLAGS, V_TF_DDP_PSHF_ENABLE_1(1ULL) | V_TF_DDP_PSH_NO_INVALIDATE1(1ULL) | V_TF_DDP_PUSH_DISABLE_1(1ULL), V_TF_DDP_PSHF_ENABLE_1(1ULL)); } } continue; found_ok_skb: if (!skb->len) { /* ubuf dma is complete */ #ifdef T4_TRACE T4_TRACE1(TIDTB(sk), "chelsio_recvmsg: zero len skb flags 0x%x", skb_ulp_ddp_flags(skb)); #endif BUG_ON(!(skb_ulp_ddp_flags(skb) & DDP_BF_NOCOPY)); user_ddp_pending = 0; tom_eat_ddp_skb(sk, skb); if (!copied && !timeo) { copied = -EAGAIN; break; } if (copied < target) continue; break; } offset = tp->copied_seq - ULP_SKB_CB(skb)->seq; if (offset >= skb->len) { #ifdef T4_TRACE T4_TRACE3(TIDTB(sk), "chelsio_recvmsg: BUG: OFFSET > LEN" " seq 0x%x skb->len %dflags 0x%x", ULP_SKB_CB(skb)->seq, skb->len, ULP_SKB_CB(skb)->flags); #endif printk("chelsio_recvmsg: BUG: OFFSET > LEN seq %#x " "skb->len %d copied %#x flags %#x", ULP_SKB_CB(skb)->seq, skb->len, tp->copied_seq, ULP_SKB_CB(skb)->flags); BUG_ON(1); } avail = skb->len - offset; if (len < avail) { if (is_ddp(skb) && (skb_ulp_ddp_flags(skb) & DDP_BF_NOCOPY)) { #ifdef T4_TRACE T4_TRACE5(TIDTB(sk), "chelsio_recvmsg: BUG: len < avail" " len %u skb->len %d offset %d" " flags 0x%x avail %u", len, skb->len, offset, skb_ulp_ddp_flags(skb), avail); #endif }; avail = len; } #ifdef T4_TRACE T4_TRACE5(TIDTB(sk), "chelsio_recvmsg: seq 0x%x skb->len %d offset %d" " avail %d flags 0x%x", ULP_SKB_CB(skb)->seq, skb->len, offset, avail, ULP_SKB_CB(skb)->flags); #endif /* * Check if the data we are preparing to copy contains urgent * data. Either stop short of urgent data or skip it if it's * first and we are not delivering urgent data inline. */ if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - tp->copied_seq; if (urg_offset < avail) { if (urg_offset) { /* stop short of the urgent data */ avail = urg_offset; } else if (!sock_flag(sk, SOCK_URGINLINE)) { /* First byte is urgent, skip */ tp->copied_seq++; offset++; avail--; if (!avail) goto skip_copy; } } } if (p->ddp_setup && !is_ddp(skb) && !p->ddp_off && (ULP_SKB_CB(skb)->seq == p->indicate)) { p->indicate = 0; if (skb_copy_datagram_msg(skb, offset, msg, avail)) { if (!copied) { copied = -EFAULT; break; } } if (ULP_SKB_CB(skb)->psh && !(flags & MSG_WAITALL)) user_ddp_ok = 0; if (likely(!sk_no_receive(sk))) { unsigned int iov_seg_len = iov_iter_single_seg_count(&msg->msg_iter); if ((flags & MSG_WAITALL) && t4_ddp_indicate_ok(p) && iov_seg_len && ((iov_seg_len <= p->ind_size) || !user_ddp_ok)) { p->indicate = tcp_sk(sk)->rcv_nxt; t4_setup_indicate_modrx(sk); p->indout_count++; } else if (iov_seg_len && user_ddp_ok && !user_ddp_pending) { p->ubuf_ddp_pending = user_ddp_pending = !t4_post_ubuf(sk, msg, nonblock, flags); if (!p->ubuf_ddp_pending) { p->post_failed++; if (p->post_failed >= TOM_TUNABLE(cplios->toedev, ddp_maxfail)) t4_shutdown_ddp(sk); } if (!(flags & MSG_WAITALL) && user_ddp_pending) user_ddp_ok = 0; } } } /* * If MSG_TRUNC is specified the data is discarded. 
*/ else if (likely(!(flags & MSG_TRUNC))) if (!is_ddp(skb)) { if (skb_copy_datagram_msg(skb, offset, msg, avail)) { if (!copied) { copied = -EFAULT; break; } } } tp->copied_seq += avail; copied += avail; len -= avail; if (is_ddp(skb)) { iov_iter_advance(&msg->msg_iter, avail); tp->rcv_wup += avail; } skip_copy: if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) tp->urg_data = 0; /* * If the buffer is fully consumed free it. If it's a DDP * buffer also handle any events it indicates. */ if (avail + offset >= skb->len) { unsigned int fl = skb_ulp_ddp_flags(skb); int exitnow, got_psh = 0, nomoredata = 0; if (p->ddp_setup && is_ddp(skb)) { if (fl & 1) { if (is_ddp_psh(skb) && user_ddp_pending) got_psh = 1; if (fl & DDP_BF_NOCOPY) p->ubuf_ddp_pending = user_ddp_pending = 0; if ((fl & DDP_BF_NODATA) && nonblock) nomoredata = 1; } tom_eat_ddp_skb(sk, skb); skb = NULL; } if (likely(skb)) tom_eat_skb(sk, skb); buffers_freed++; exitnow = got_psh || nomoredata; if (copied >= target && !skb_peek(&sk->sk_receive_queue) && exitnow) break; } } while (len > 0); /* * If we can still receive decide what to do in preparation for the * next receive. Note that RCV_SHUTDOWN is set if the connection * transitioned to CLOSE but not if it was in that state to begin with. */ if (likely(!sk_no_receive(sk))) { /* * If we have DDP pending, turn off DDP and pull in any * completed DDP skbs on the receive queue. */ if (user_ddp_pending) { struct sk_buff *skb; user_ddp_ok = 0; t4_cancel_ubuf(sk, &timeo); p = DDP_STATE(sk); p->ubuf_ddp_pending = user_ddp_pending = 0; smp_rmb(); /* * Scan receive queue to absorb any completed DDPs. * The receive queue may contain some number of DDP * skbs (is_ddp() tests true), possibly followed by * some number of "normal" skbs. If our scan finds a * "normal" skb we can terminate the scan for DDP skbs * since the hardware won't ever turn DDP back on by * itself. */ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { unsigned int offset, avail; /* * If this is a normal skb then we're done * scanning for DDP skbs. If there's more * room to receive the non-DDP skb we may as * well grab the data. */ if (!is_ddp(skb)) { if (len > 0) { if (copied < 0) copied = 0; goto again; } else break; } /* * Update our statistics, buffer pointers, * etc. and consume the skb. * * XXX Why do we need to check for skb->len * XXX being non-zero? */ if (skb->len) { offset = tp->copied_seq - ULP_SKB_CB(skb)->seq; avail = skb->len - offset; tp->copied_seq += avail; tp->rcv_wup += avail; copied += avail; len -= avail; iov_iter_advance(&msg->msg_iter, avail); buffers_freed++; } tom_eat_ddp_skb(sk, skb); } } } /* Recheck SHUTDOWN conditions as t4_cancel_ubuf can release sock lock */ if (!(sk->sk_err || sk->sk_state == TCP_CLOSE || cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN) || sk_no_receive(sk))) { if (p->ddp_setup) { if ((p->avg_request_len < (TOM_TUNABLE(cplios->toedev, ddp_thres) >> 1)) && (request < (TOM_TUNABLE(cplios->toedev, ddp_thres) >> 1)) && !p->indout_count) t4_shutdown_ddp(sk); else t4_ddp_post_indicate(sk); p->avg_request_len = (p->avg_request_len + request) >> 1; } else if (!p->post_failed && sk_should_ddp(sk, tp, copied)) { if (!t4_enter_ddp(sk, sock_rcvlowat(sk, 0, DDP_RSVD_WIN), 0, nonblock)) p = DDP_STATE(sk); } } if (buffers_freed) t4_cleanup_rbuf(sk, copied); release_sock(sk); return copied; } /* * A visitor-pattern based receive method that runs the supplied receive actor * directly over the data in the receive queue. * * Caller must acquire the socket lock. 
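 *
 * A minimal usage sketch (hypothetical actor that just counts bytes;
 * "nbytes" stands in for caller state):
 *
 *	static int count_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *			       unsigned int offset, size_t len)
 *	{
 *		*(size_t *)desc->arg.data += len;
 *		return len;		// consume all bytes offered
 *	}
 *
 *	lock_sock(sk);
 *	rd_desc.arg.data = &nbytes;
 *	rd_desc.count = 1;
 *	copied = t4_read_sock(sk, &rd_desc, count_actor);
 *	release_sock(sk);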
*/ int t4_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor) { u32 offset = 0; int used, copied = 0; struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); while ((skb = tcp_recv_skb(sk, tp->copied_seq, &offset)) != NULL) { size_t len = skb->len - offset; if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - tp->copied_seq; if (urg_offset < len) len = urg_offset; if (!len) break; } used = recv_actor(desc, skb, offset, len); if (unlikely(used < 0)) { if (!copied) return used; break; } else if (likely(used <= len)) { tp->copied_seq += used; copied += used; offset += used; } if (offset != skb->len) break; tom_eat_skb(sk, skb); if (!desc->count) break; } if (copied > 0) t4_cleanup_rbuf(sk, copied); return copied; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) /* * Offload splice_read() implementation. We need our own because the original * calls tcp_read_sock. */ #include struct tcp_splice_state { struct pipe_inode_info *pipe; size_t len; unsigned int flags; }; static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) { struct tcp_splice_state *tss = rd_desc->arg.data; return skb_splice_bits_pub(skb, offset, tss->pipe, tss->len, tss->flags); } static ssize_t chelsio_splice_read(struct sock *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct tcp_splice_state tss = { .pipe = pipe, .len = len, .flags = flags, }; int ret; long timeo; ssize_t spliced; read_descriptor_t rd_desc; /* We can't seek on a socket input */ if (unlikely(*ppos)) return -ESPIPE; ret = spliced = 0; rd_desc.arg.data = &tss; lock_sock(sk); timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK); while (tss.len) { ret = t4_read_sock(sk, &rd_desc, tcp_splice_data_recv); if (ret < 0) break; if (!ret) { if (spliced) break; if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { ret = sock_error(sk); break; } if (sk_no_receive(sk)) break; if (sk->sk_state == TCP_CLOSE) { /* * This occurs when user tries to read * from never connected socket. */ ret = -ENOTCONN; break; } if (!timeo) { ret = -EAGAIN; break; } sk_wait_data(sk, &timeo); if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; } continue; } tss.len -= ret; spliced += ret; if (tss.len == 0) break; release_sock(sk); lock_sock(sk); if (sk->sk_err || sk->sk_state == TCP_CLOSE || sk_no_receive(sk) || !timeo || signal_pending(current)) break; } release_sock(sk); return spliced ? spliced : ret; } #endif /* * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail * under any circumstances. We take the easy way out and always queue the * message to the write_queue. We can optimize the case where the queue is * already empty though the optimization is probably not worth it. 
/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We could optimize the case where the queue
 * is already empty, though the optimization is probably not worth it.
 */
static void close_conn(struct sock *sk)
{
	struct sk_buff *skb;
	struct cpl_close_con_req *req;
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	unsigned int tid = cplios->tid;
	unsigned int len = roundup(sizeof(struct cpl_close_con_req), 16);

	skb = alloc_skb_nofail(len);
	req = (struct cpl_close_con_req *)__skb_put(skb, len);
	memset(req, 0, len);
	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
			      V_FW_WR_IMMDLEN(sizeof(*req) -
					      sizeof(req->wr)));
	req->wr.wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16)) |
			       V_FW_WR_FLOWID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));

	tcp_uncork(sk);
	skb_entail(sk, skb, ULPCB_FLAG_NO_HDR | ULPCB_FLAG_NO_APPEND);
	if (sk->sk_state != TCP_SYN_SENT)
		t4_push_frames(sk, 1);
}

/*
 * State transitions and actions for close.  Note that if we are in SYN_SENT
 * we remain in that state as we cannot control a connection while it's in
 * SYN_SENT; such connections are allowed to establish and are then aborted.
 */
static unsigned char new_state[16] = {
	/* current state:     new state:       action: */
	/* (Invalid)       */ TCP_CLOSE,
	/* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
	/* TCP_SYN_SENT    */ TCP_SYN_SENT,
	/* TCP_SYN_RECV    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
	/* TCP_FIN_WAIT1   */ TCP_FIN_WAIT1,
	/* TCP_FIN_WAIT2   */ TCP_FIN_WAIT2,
	/* TCP_TIME_WAIT   */ TCP_CLOSE,
	/* TCP_CLOSE       */ TCP_CLOSE,
	/* TCP_CLOSE_WAIT  */ TCP_LAST_ACK | TCP_ACTION_FIN,
	/* TCP_LAST_ACK    */ TCP_LAST_ACK,
	/* TCP_LISTEN      */ TCP_CLOSE,
	/* TCP_CLOSING     */ TCP_CLOSING,
};

/*
 * Perform a state transition during close and return the actions indicated
 * for the transition.  Do not make this function inline, the main reason
 * it exists at all is to avoid multiple inlining of tcp_set_state.
 */
static int make_close_transition(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];

	tcp_set_state(sk, next & TCP_STATE_MASK);
	return next & TCP_ACTION_FIN;
}

#define SHUTDOWN_ELIGIBLE_STATE \
	(TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_CLOSE_WAIT)

/*
 * Shutdown the sending side of a connection.  Much like close except
 * that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
 *
 * Note: this does not do anything for SYN_SENT state as tcp_shutdown
 * does; however, this function is not really called for SYN_SENT because
 * inet_shutdown handles that state specially.  So no harm.
 */
static void chelsio_shutdown(struct sock *sk, int how)
{
	if (sk->sk_prot->shutdown != chelsio_shutdown)
		return sk->sk_prot->shutdown(sk, how);

	if ((how & SEND_SHUTDOWN) &&
	    sk_in_state(sk, SHUTDOWN_ELIGIBLE_STATE) &&
	    !ma_fail_chelsio_shutdown(sk) &&
	    make_close_transition(sk))
		close_conn(sk);
}
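#if 0
/*
 * Illustration only (not compiled): how the close-transition table above
 * decodes.  The low bits carry the next TCP state, and TCP_ACTION_FIN
 * marks transitions that must emit a FIN, i.e. a CPL_CLOSE_CON_REQ via
 * close_conn().
 */
static void close_transition_example(struct sock *sk)
{
	int next = (int)new_state[TCP_ESTABLISHED];

	/* next == (TCP_FIN_WAIT1 | TCP_ACTION_FIN) */
	tcp_set_state(sk, next & TCP_STATE_MASK);	/* -> TCP_FIN_WAIT1 */
	if (next & TCP_ACTION_FIN)			/* taken */
		close_conn(sk);
}
#endif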
static void chelsio_close(struct sock *sk, long timeout)
{
	struct cpl_io_state *cplios;
	int data_lost, old_state;

	lock_sock(sk);
	if (sk->sk_prot->close != chelsio_close) {
		release_sock(sk);
		return sk->sk_prot->close(sk, timeout);
	}

	cplios = CPL_IO_STATE(sk);
	sk->sk_shutdown |= SHUTDOWN_MASK;

	if (ma_fail_chelsio_close(sk))
		return;

	/*
	 * We need to flush the receive buffs.  We do this only on the
	 * descriptor close, not protocol-sourced closes, because the
	 * reader process may not have drained the data yet!  Make a note
	 * of whether any received data will be lost so we can decide
	 * whether to FIN or RST.
	 */
	data_lost = skb_queue_len(&sk->sk_receive_queue);
	if (is_tls_offload(sk)) {
		struct tls_ofld_info *tls_ofld = TLS_IO_STATE(sk);

		data_lost |= skb_queue_len(&tls_ofld->sk_recv_queue);
		t4_tls_purge_receive_queue(sk);
	}
	t4_purge_receive_queue(sk);

	/*
	 * If the connection is in DDP mode, disable DDP and have any
	 * outstanding data and FIN (!!!) delivered to the host since HW
	 * might fail an ABORT_REQ if a FIN is held.
	 */
	if (cplios->ulp_mode == ULP_MODE_TCPDDP)
		t4_enable_ddp(sk, 0);

	if (sk->sk_state == TCP_CLOSE)	/* Nothing if we are already closed */
		;
	else if (data_lost || sk->sk_state == TCP_SYN_SENT) {
		// Unread data was tossed, zap the connection.
		T4_NET_INC_STATS_USER(sock_net(sk),
				      LINUX_MIB_TCPABORTONCLOSE);
		t4_send_reset(sk, CPL_ABORT_SEND_RST, NULL);
		release_tcp_port(sk);
		goto unlock;
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
		T4_NET_INC_STATS_USER(sock_net(sk),
				      LINUX_MIB_TCPABORTONDATA);
	} else if (make_close_transition(sk)) {
		/* Regular FIN-based close */
		close_conn(sk);
	}

	if (timeout)
		sk_stream_wait_close(sk, timeout);

unlock:
	old_state = sk->sk_state;
	sock_hold(sk);	/* must last past the potential inet_csk_destroy_sock */
	sock_orphan(sk);
	INC_ORPHAN_COUNT(sk);
	release_sock(sk);	/* Final release_sock in connection's lifetime. */

	/*
	 * There are no more user references at this point.  Grab the socket
	 * spinlock and finish the close.
	 */
	local_bh_disable();
	bh_lock_sock(sk);

	/*
	 * Because the socket was orphaned before the bh_lock_sock
	 * either the backlog or a BH may have already destroyed it.
	 * Bail out if so.
	 */
	if (old_state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		goto out;

	if (sk->sk_state == TCP_FIN_WAIT2 && tcp_sk(sk)->linger2 < 0 &&
	    !cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN)) {
		struct sk_buff *skb;

		skb = alloc_skb(sizeof(struct cpl_abort_req), GFP_ATOMIC);
		if (skb) {
			t4_send_reset(sk, CPL_ABORT_SEND_RST, skb);
			T4_NET_INC_STATS_BH(sock_net(sk),
					    LINUX_MIB_TCPABORTONLINGER);
		}
	}

#if 0
	if (sk->sk_state != TCP_CLOSE) {
		sk_stream_mem_reclaim(sk);
		if (atomic_read(sk->sk_prot->orphan_count) >
		    sysctl_tcp_max_orphans ||
		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
		     atomic_read(&tcp_memory_allocated) >
		     sysctl_tcp_mem[2])) {
			if (net_ratelimit())
				printk(KERN_INFO
				       "TCP: too many orphaned sockets\n");
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
		}
	}
#endif

	if (sk->sk_state == TCP_CLOSE)
		inet_csk_destroy_sock(sk);

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}

/*
 * Our analog of tcp_free_skb().
 */
static inline void chelsio_tcp_free_skb(struct sock *sk, struct sk_buff *skb)
{
	sk->sk_wmem_queued -= skb->truesize;
#if defined(CONFIG_T4_ZCOPY_SENDMSG) || defined(CONFIG_T4_ZCOPY_SENDMSG_MODULE)
	if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_ZCOPY_COW)
		t4_zcopy_cleanup_skb(sk, skb);
	else
		skb_vaddr_set(skb, 0);
#endif
	__kfree_skb(skb);
}

void t4_purge_write_queue(struct sock *sk)
{
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue(&cplios->tx_queue)))
		chelsio_tcp_free_skb(sk, skb);
	// tcp_mem_reclaim(sk);
}
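#if 0
/*
 * Illustration only (not compiled): a hypothetical user-space trigger for
 * the zero-linger branch of chelsio_close() above.  SO_LINGER with
 * l_linger == 0 makes close() abort the connection (disconnect + RST)
 * instead of performing a FIN-based close.
 */
#include <sys/socket.h>
#include <unistd.h>

void abortive_close_example(int fd)
{
	struct linger lg = { .l_onoff = 1, .l_linger = 0 };

	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
	close(fd);	/* connection is reset rather than FINed */
}
#endif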
/*
 * Switch a socket to the SW TCP's protocol operations.
 */
void t4_install_standard_ops(struct sock *sk)
{
	struct sk_ofld_proto *oproto;

	/*
	 * Once we switch to the standard TCP operations our destructor
	 * (chelsio_destroy_sock) will not be called.  That function normally
	 * cleans up socket DDP state so we need to do that here to avoid
	 * leaking DDP resources.  Note that while the socket may live on for
	 * a long time DDP isn't usable with the standard ops, so DDP state
	 * can be released at this time.
	 */
	struct cpl_io_state *cplios = CPL_IO_STATE(sk);

	t4_cleanup_ddp(sk);
	cplios->ulp_mode = ULP_MODE_NONE;
	oproto = (struct sk_ofld_proto *)sk->sk_prot;
#if defined(CONFIG_TCPV6_OFFLOAD)
	if (sk->sk_family == AF_INET)
		sk->sk_prot = &tcp_prot;
	else
		sk->sk_prot = tcpv6_prot_p;
#else
	sk->sk_prot = &tcp_prot;
#endif
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
	restore_socket_ops(sk);
	if (sk->sk_write_space == t4_write_space)
		sk->sk_write_space = sk_stream_write_space_compat;
	if (likely(sk->sk_filter)) {
		sk_filter_uncharge_compat(sk, sk->sk_filter);
		sk->sk_filter = NULL;
	}
	if (sk->sk_user_data)
		restore_special_data_ready(sk);
	tcp_xmit_timers_init_compat(sk);
	cplios_oproto_put(sk, cplios, oproto);
}

/*
 * Wait until a socket enters one of the given states.
 */
static int wait_for_states(struct sock *sk, unsigned int states)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35)
	wait_queue_head_t _sk_sleep;
#else
	struct socket_wq _sk_wq;
#endif
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);
	int err = 0, count = 0;
	int maxcount = 5;

	/*
	 * We want this to work even when there's no associated struct
	 * socket.  In that case we provide a temporary wait_queue_head_t.
	 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35)
	if (sk->sk_sleep == NULL) {
		init_waitqueue_head(&_sk_sleep);
		sk->sk_sleep = &_sk_sleep;
	}
#else
	if (sk->sk_wq == NULL) {
		init_waitqueue_head(&_sk_wq.wait);
		_sk_wq.fasync_list = NULL;
		init_rcu_head_on_stack(&_sk_wq.rcu);
		sk->sk_wq = &_sk_wq;
	}
#endif

	add_wait_queue(sk_sleep(sk), &wait);
	while (!sk_in_state(sk, states)) {
		signed long current_timeo = msecs_to_jiffies(200);

		if (count == maxcount) {
			err = -EBUSY;
			break;
		}
		count++;
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		release_sock(sk);
		if (!sk_in_state(sk, states))
			current_timeo = schedule_timeout(current_timeo);
		__set_task_state(tsk, TASK_RUNNING);
		lock_sock(sk);
	}
	remove_wait_queue(sk_sleep(sk), &wait);

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35)
	if (sk_sleep(sk) == &_sk_sleep)
		sk->sk_sleep = NULL;
#else
	if (sk->sk_wq == &_sk_wq)
		sk->sk_wq = NULL;
#endif
	return err;
}

static int chelsio_disconnect(struct sock *sk, int flags)
{
	struct cpl_io_state *cplios;
	struct tcp_sock *tp = tcp_sk(sk);
	int err;

	if (sk->sk_prot->disconnect != chelsio_disconnect)
		return sk->sk_prot->disconnect(sk, flags);

	cplios = CPL_IO_STATE(sk);

	if (is_tls_offload(sk))
		t4_tls_purge_receive_queue(sk);
	t4_purge_receive_queue(sk);
	t4_purge_write_queue(sk);

	if (sk->sk_state != TCP_CLOSE) {
		sk->sk_err = ECONNRESET;
		t4_send_reset(sk, CPL_ABORT_SEND_RST, NULL);
		err = wait_for_states(sk, TCPF_CLOSE);
		if (err) {
			pr_err("%s: abort reply not processed or wait "
			       "timed out; err = %d\n", __func__, err);
			return err;
		}
	}
	if (sk->sk_prot->disconnect != chelsio_disconnect)
		return sk->sk_prot->disconnect(sk, flags);

	if (is_tls_offload(sk))
		t4_tls_purge_receive_queue(sk);
	t4_purge_receive_queue(sk);
	__skb_queue_purge(&cplios->ooo_queue);

	/*
	 * We don't know the correct value for max_window but we know an
	 * upper limit.
	 */
	tp->max_window = 0xFFFF << SND_WSCALE(tp);

	/*
	 * Now switch to Linux's TCP operations and let it finish the job.
	 */
	t4_install_standard_ops(sk);
	return tcp_disconnect(sk, flags);
}
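#if 0
/*
 * Illustration only (not compiled): one user-space path into
 * chelsio_disconnect() above is connect() with AF_UNSPEC, which the stack
 * turns into a sk->sk_prot->disconnect() call.
 */
#include <sys/socket.h>

int tcp_disassociate_example(int fd)
{
	struct sockaddr sa = { .sa_family = AF_UNSPEC };

	return connect(fd, &sa, sizeof(sa));
}
#endif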
/*
 * Our version of tcp_v4_destroy_sock().  We need to do this because
 * tcp_writequeue_purge() that is used in the original doesn't quite match
 * our needs.  If we ever hook into the memory management of the SW stack we
 * may be able to use tcp_v4_destroy_sock() directly.
 */
static t4_type_compat chelsio_destroy_sock(struct sock *sk)
{
	struct cpl_io_state *cplios;

	if (sk->sk_prot->destroy != chelsio_destroy_sock)
		return sk->sk_prot->destroy(sk);

	cplios = CPL_IO_STATE(sk);

	t4_cleanup_ddp(sk);
	if (is_tls_offload(sk))
		t4_tls_purge_receive_queue(sk);
	cplios->ulp_mode = ULP_MODE_NONE;
	t4_purge_write_queue(sk);
	if (is_tls_offload(sk))
		t4_tls_release_context(sk);
	cplios->toedev = NULL;
	cplios_oproto_put(sk, cplios, sk_ofld_proto_get(sk));
#if defined(CONFIG_TCPV6_OFFLOAD)
	if (sk->sk_family == AF_INET)
		sk->sk_prot = &tcp_prot;
	else
		sk->sk_prot = tcpv6_prot_p;
#else
	sk->sk_prot = &tcp_prot;
#endif
	sk->sk_prot->destroy(sk);
}

/* IP socket options we do not support on offloaded connections */
#define UNSUP_IP_SOCK_OPT ((1 << IP_OPTIONS))

/*
 * Socket option code for IP.  We do not allow certain options while a
 * connection is offloaded.  Some of the other options we handle specially,
 * and the rest are directed to the SW IP for their usual processing.
 */
static int t4_ip_setsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int optlen, int call_compat)
{
	if (level != SOL_IP)
		return -ENOPROTOOPT;

	/* unsupported options */
	if ((1 << optname) & UNSUP_IP_SOCK_OPT) {
		printk(KERN_WARNING
		       "IP option %d ignored on offloaded TCP connection\n",
		       optname);
		return -ENOPROTOOPT;
	}

	/* specially handled options */
	if (optname == IP_TOS) {
		struct inet_sock *inet = inet_sk(sk);
		int val = 0, err = 0;

		if (optlen >= sizeof(int)) {
			if (get_user(val, (int __user *)optval))
				return -EFAULT;
		} else if (optlen >= sizeof(char)) {
			unsigned char ucval;

			if (get_user(ucval, (unsigned char __user *)optval))
				return -EFAULT;
			val = (int)ucval;
		}

		lock_sock(sk);
		val &= ~3;
		val |= inet->tos & 3;
		if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP &&
		    !capable(CAP_NET_ADMIN))
			err = -EPERM;
		else if (inet->tos != val) {
			inet->tos = val;
			sk->sk_priority = rt_tos2priority(val);
			t4_set_tos(sk);
		}
		release_sock(sk);
		return err;
	}

#ifdef TOM_CONFIG_COMPAT
	if (call_compat && inet_csk(sk)->icsk_af_ops->compat_setsockopt)
		return inet_csk(sk)->icsk_af_ops->compat_setsockopt(sk, level,
							optname, optval,
							optlen);
#endif
	return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, optname,
						     optval, optlen);
}
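#if 0
/*
 * Illustration only (not compiled): the user-space view of the IP_TOS
 * handling above.  The driver keeps the two low (ECN) bits owned by the
 * stack (val &= ~3) and pushes the rest to the adapter via t4_set_tos().
 */
#include <netinet/in.h>
#include <netinet/ip.h>
#include <sys/socket.h>

int set_low_delay_example(int fd)
{
	int tos = IPTOS_LOWDELAY;	/* ECN bits are preserved kernel-side */

	return setsockopt(fd, IPPROTO_IP, IP_TOS, &tos, sizeof(tos));
}
#endif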
/*
 * Socket option code for TCP.  We override any option processing that needs
 * to be handled specially for a TOE and leave the other options to SW TCP.
 */
static int do_t4_tcp_setsockopt(struct sock *sk, int level, int optname,
				char __user *optval, socklen_t optlen)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int val, err = 0;

	if (optname == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];

		if (optlen < 1)
			return -EINVAL;
		val = strncpy_from_user(name, optval,
					min((socklen_t)(TCP_CA_NAME_MAX - 1),
					    optlen));
		if (val < 0)
			return -EFAULT;
		name[val] = 0;
		return t4_set_cong_control(sk, name);
	}

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);
	switch (optname) {
	case TCP_NODELAY: {
		int oldval = tp->nonagle;

		if (val)
			tp->nonagle |= TCP_NAGLE_OFF;
		else
			tp->nonagle &= ~TCP_NAGLE_OFF;
		if (oldval != tp->nonagle)
			t4_set_nagle(sk);
		break;
	}
	case TCP_CORK:
		if (val)
			tp->nonagle |= TCP_NAGLE_CORK;
		else
			tcp_uncork(sk);
		break;
	case TCP_KEEPIDLE:
		if (val < 1 || val > MAX_TCP_KEEPIDLE)
			err = -EINVAL;
		else
			tp->keepalive_time = val * HZ;
		break;
	case TCP_QUICKACK:
		inet_csk(sk)->icsk_ack.pingpong = !val;
		break;
	case TCP_SCHEDCLASS: {
		struct cpl_io_state *cplios;
		struct toedev *tdev;
		struct tom_data *td;
		struct cxgb4_lld_info *lldi;
		unsigned int old_sched_cls;

		/*
		 * Can't do anything with dead sockets.
		 */
		if (sk->sk_state == TCP_CLOSE) {
			err = -ENOTCONN;
			break;
		}

		cplios = CPL_IO_STATE(sk);
		tdev = cplios->toedev;
		td = TOM_DATA(tdev);
		lldi = td->lldi;

		/*
		 * Valid Scheduler Class values are:
		 *   val < 0:      unbind the socket from any scheduling class
		 *   0 <= val < N: bind the socket to the indicated class
		 */
		if (val >= 0 && val >= lldi->nsched_cls) {
			err = -EINVAL;
			break;
		}

		old_sched_cls = cplios->sched_cls;
		if (val < 0)
			cplios->sched_cls = SCHED_CLS_NONE;
		else
			cplios->sched_cls = val;

		/*
		 * If we've already sent the first data on the connection,
		 * then we've already sent the first FlowC Work Request with
		 * whatever scheduling class was bound at the time.  So we'll
		 * need to send an update.
		 */
		if (old_sched_cls != cplios->sched_cls &&
		    cplios_flag(sk, CPLIOS_TX_DATA_SENT)) {
			int flowclen16 = send_tx_schedclass_wr(sk, 0);

			if (flowclen16 < 0)
				err = flowclen16;
		}
		break;
	}
	default:
		release_sock(sk);
		err = tcp_setsockopt(sk, level, optname, optval, optlen);
		goto out;
	}
	release_sock(sk);
out:
	return err;
}

static int t4_tcp_setsockopt(struct sock *sk, int level, int optname,
			     char __user *optval, socklen_t optlen)
{
	if (sk->sk_prot->setsockopt != t4_tcp_setsockopt)
		return sk->sk_prot->setsockopt(sk, level, optname, optval,
					       optlen);

	return level != SOL_TCP
		? t4_ip_setsockopt(sk, level, optname, optval, optlen, 0)
		: do_t4_tcp_setsockopt(sk, level, optname, optval, optlen);
}
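#if 0
/*
 * Illustration only (not compiled): ordinary setsockopt() calls reach the
 * switch above.  TCP_NODELAY lands in the t4_set_nagle() branch and
 * TCP_CONGESTION is passed through to t4_set_cong_control().
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

int tune_offloaded_tcp_example(int fd)
{
	int one = 1;

	if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)))
		return -1;
	/* the algorithm name is handed to t4_set_cong_control() */
	return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "reno",
			  strlen("reno"));
}
#endif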
#ifdef TOM_CONFIG_COMPAT
static int t4_compat_tcp_setsockopt(struct sock *sk, int level, int optname,
				    char __user *optval, socklen_t optlen)
{
	if (sk->sk_prot->compat_setsockopt != t4_compat_tcp_setsockopt)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);

	return level != SOL_TCP
		? t4_ip_setsockopt(sk, level, optname, optval, optlen, 1)
		: do_t4_tcp_setsockopt(sk, level, optname, optval, optlen);
}
#endif

#if defined(CONFIG_TCP_OFFLOAD)
static void set_keepalive(struct sock *sk, int on_off)
{
	int old = sock_flag(sk, SOCK_KEEPOPEN) != 0;

	if (sk->sk_prot->set_keepalive != set_keepalive)
		return sk->sk_prot->set_keepalive(sk, on_off);

	if (sk->sk_state != TCP_CLOSE && (on_off ^ old))
		t4_set_keepalive(sk, on_off);
}
#endif

static int chelsio_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tls_key_context uk_ctx;
	int mode = 0;

	switch (cmd) {
	case IOCTL_TLSOM_SET_TLS_CONTEXT:
		if (copy_from_user(&uk_ctx, (void __user *)arg,
				   sizeof(struct tls_key_context)))
			return -EFAULT;
		return program_key_context(sk, &uk_ctx);
	case IOCTL_TLSOM_GET_TLS_TOM:
		if (is_tls_offload(sk))
			mode = 1;
		return put_user(mode, (int __user *)arg);
	case IOCTL_TLSOM_CLR_TLS_TOM:
		return tls_clr_ofld_mode(sk);
	case IOCTL_TLSOM_CLR_QUIES:
		return tls_clr_quiesce(sk);
	default:
		return tcp_ioctl(sk, cmd, arg);
	}
}

struct request_sock_ops t4_rsk_ops, t4_rsk6_ops;
struct sk_ofld_proto t4_tcp_prot, t4_tcp_v6_prot;

/*
 * Set up the offload protocol operations vector.  We start with TCP's and
 * override some of the operations.  Note that we do not override the
 * backlog handler here.
 */
void __init t4_init_offload_ops(void)
{
	t4_tcp_prot.proto = tcp_prot;
	t4_init_rsk_ops(&t4_tcp_prot.proto, &t4_rsk_ops, &tcp_prot, PF_INET);

	t4_tcp_prot.proto.close = chelsio_close;
	t4_tcp_prot.proto.disconnect = chelsio_disconnect;
	t4_tcp_prot.proto.destroy = chelsio_destroy_sock;
	t4_tcp_prot.proto.shutdown = chelsio_shutdown;
	t4_tcp_prot.proto.setsockopt = t4_tcp_setsockopt;
	t4_tcp_prot.proto.sendmsg = chelsio_sendmsg;
	t4_tcp_prot.proto.recvmsg = chelsio_recvmsg;
	t4_tcp_prot.proto.sendpage = chelsio_sendpage;
	t4_tcp_prot.proto.ioctl = chelsio_ioctl;
#if defined(CONFIG_TCP_OFFLOAD)
	t4_tcp_prot.proto.sendskb = t4_sendskb;
	t4_tcp_prot.proto.read_sock = t4_read_sock;
	t4_tcp_prot.proto.set_keepalive = set_keepalive;
#endif
#ifdef TOM_CONFIG_COMPAT
	t4_tcp_prot.proto.compat_setsockopt = t4_compat_tcp_setsockopt;
#endif
	t4_tcp_prot.read_sock = t4_read_sock;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
	t4_tcp_prot.splice_read = chelsio_splice_read;
#endif

#if defined(CONFIG_TCPV6_OFFLOAD)
	t4_tcp_v6_prot.proto = *tcpv6_prot_p;

	t4_tcp_v6_prot.proto.close = chelsio_close;
	t4_tcp_v6_prot.proto.disconnect = chelsio_disconnect;
	t4_tcp_v6_prot.proto.destroy = chelsio_destroy_sock;
	t4_tcp_v6_prot.proto.shutdown = chelsio_shutdown;
	t4_tcp_v6_prot.proto.setsockopt = t4_tcp_setsockopt;
	t4_tcp_v6_prot.proto.sendmsg = chelsio_sendmsg;
	t4_tcp_v6_prot.proto.recvmsg = chelsio_recvmsg;
	t4_tcp_v6_prot.proto.sendpage = chelsio_sendpage;
#if defined(CONFIG_TCP_OFFLOAD)
	t4_tcp_v6_prot.proto.sendskb = t4_sendskb;
	t4_tcp_v6_prot.proto.read_sock = t4_read_sock;
	t4_tcp_v6_prot.proto.set_keepalive = set_keepalive;
#endif
#ifdef TOM_CONFIG_COMPAT
	t4_tcp_v6_prot.proto.compat_setsockopt = t4_compat_tcp_setsockopt;
#endif
	t4_tcp_v6_prot.read_sock = t4_read_sock;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
	t4_tcp_v6_prot.splice_read = chelsio_splice_read;
#endif
	t4_init_rsk6_ops(&t4_tcp_v6_prot.proto, &t4_rsk6_ops, tcpv6_prot_p,
			 PF_INET6);
#endif
}
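#if 0
/*
 * Illustration only (not compiled): a hypothetical user-space probe of the
 * TLS offload mode through chelsio_ioctl() above.  The request value is
 * driver-private (IOCTL_TLSOM_GET_TLS_TOM from Chelsio's TLS headers), so
 * it is taken as a parameter here rather than spelled out.
 */
#include <sys/ioctl.h>

int tls_offload_active_example(int fd, unsigned long get_tls_tom_req)
{
	int mode = 0;

	if (ioctl(fd, get_tls_tom_req, &mode) < 0)
		return -1;
	return mode;	/* 1 if the connection is in TLS-offload mode */
}
#endif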