/* QLogic (R)NIC Driver/Library * Copyright (c) 2015-2016 QLogic Corporation * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #if HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #include #include #include #include #include #include #include #include #include #include #include #include #include "qelr.h" #include "qelr_abi.h" #include "util/compiler.h" #include "qelr_chain.h" #include #include #include #include /* Fast path debug prints */ #define FP_DP_VERBOSE(...) /* #define FP_DP_VERBOSE(...) DP_VERBOSE(__VA_ARGS__) */ #define QELR_SQE_ELEMENT_SIZE (sizeof(struct rdma_sq_sge)) #define QELR_RQE_ELEMENT_SIZE (sizeof(struct rdma_rq_sge)) #define QELR_CQE_SIZE (sizeof(union rdma_cqe)) static void qelr_inc_sw_cons_u16(struct qelr_qp_hwq_info *info) { info->cons = (info->cons + 1) % info->max_wr; info->wqe_cons++; } static void qelr_inc_sw_prod_u16(struct qelr_qp_hwq_info *info) { info->prod = (info->prod + 1) % info->max_wr; } static inline int qelr_wq_is_full(struct qelr_qp_hwq_info *info) { return (((info->prod + 1) % info->max_wr) == info->cons); } #if QELR_SRQ static void qelr_inc_srq_wr_prod(struct qelr_srq_hwq_info *info) { info->wr_prod_cnt++; } static void qelr_inc_srq_wr_cons(struct qelr_srq_hwq_info *info) { info->wr_cons_cnt++; } #endif int qelr_query_device(struct ibv_context *context, struct ibv_device_attr *attr) { struct ibv_query_device cmd; uint64_t fw_ver; unsigned int major, minor, revision, eng; int status; bzero(attr, sizeof(*attr)); status = ibv_cmd_query_device(context, attr, &fw_ver, &cmd, sizeof(cmd)); major = (fw_ver >> 24) & 0xff; minor = (fw_ver >> 16) & 0xff; revision = (fw_ver >> 8) & 0xff; eng = fw_ver & 0xff; snprintf(attr->fw_ver, sizeof(attr->fw_ver), "%d.%d.%d.%d", major, minor, revision, eng); return status; } int qelr_query_port(struct ibv_context *context, uint8_t port, struct ibv_port_attr *attr) { struct ibv_query_port cmd; int status; status = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd)); return status; } struct ibv_pd *qelr_alloc_pd(struct ibv_context *context) { struct qelr_alloc_pd_req cmd; struct qelr_alloc_pd_resp resp; struct qelr_pd *pd; struct qelr_devctx *cxt = get_qelr_ctx(context); pd = malloc(sizeof(*pd)); if (!pd) return NULL; bzero(pd, sizeof(*pd)); memset(&cmd, 0, sizeof(cmd)); if (ibv_cmd_alloc_pd(context, 
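			     /* ibv_cmd_alloc_pd() issues the uverbs ALLOC_PD command; on
			      * success the provider-specific part of the response carries
			      * the device pd_id, which is cached in the qelr_pd below and
			      * used in the debug prints. */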
			     &pd->ibv_pd, &cmd.cmd, sizeof(cmd),
			     &resp.ibv_resp, sizeof(resp))) {
		free(pd);
		return NULL;
	}

	pd->pd_id = resp.pd_id;

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_INIT, "Allocated pd: %d\n", pd->pd_id);

	return &pd->ibv_pd;
}

int qelr_dealloc_pd(struct ibv_pd *ibpd)
{
	int rc = 0;
	struct qelr_pd *pd = get_qelr_pd(ibpd);
	struct qelr_devctx *cxt = get_qelr_ctx(ibpd->context);

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_INIT, "Deallocated pd: %d\n",
		   pd->pd_id);

	rc = ibv_cmd_dealloc_pd(ibpd);
	if (rc)
		return rc;

	free(pd);

	return rc;
}

struct ibv_mr *qelr_reg_mr(struct ibv_pd *ibpd, void *addr, size_t len,
			   int access)
{
	struct qelr_mr *mr;
	struct ibv_reg_mr cmd;
	struct qelr_reg_mr_resp resp;
	struct qelr_pd *pd = get_qelr_pd(ibpd);
	struct qelr_devctx *cxt = get_qelr_ctx(ibpd->context);
	uint64_t hca_va = (uintptr_t) addr;

	mr = malloc(sizeof(*mr));
	if (!mr)
		return NULL;

	bzero(mr, sizeof(*mr));

	if (ibv_cmd_reg_mr(ibpd, addr, len, hca_va, access, &mr->ibv_mr, &cmd,
			   sizeof(cmd), &resp.ibv_resp, sizeof(resp))) {
		free(mr);
		return NULL;
	}

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_MR,
		   "MR Register %p completed successfully pd_id=%d addr=%p len=%zu access=%d lkey=%x rkey=%x\n",
		   mr, pd->pd_id, addr, len, access, mr->ibv_mr.lkey,
		   mr->ibv_mr.rkey);

	return &mr->ibv_mr;
}

int qelr_dereg_mr(struct ibv_mr *mr)
{
	struct qelr_devctx *cxt = get_qelr_ctx(mr->context);
	int rc;

	rc = ibv_cmd_dereg_mr(mr);
	if (rc)
		return rc;

	free(mr);

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_MR,
		   "MR DERegister %p completed successfully\n", mr);

	return 0;
}

static void consume_cqe(struct qelr_cq *cq)
{
	if (cq->latest_cqe == cq->toggle_cqe)
		cq->chain_toggle ^= RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK;

	cq->latest_cqe = qelr_chain_consume(&cq->chain);
}

static inline int qelr_cq_entries(int entries)
{
	/* We allocate an extra entry that we don't report to the FW.
	 * Why?
	 * The CQE size is 32 bytes but the FW writes in chunks of 64 bytes
	 * (for performance purposes). Allocating an extra entry and telling
	 * the FW we have fewer prevents overwriting the first entry in case
	 * of a wrap, i.e. when the FW writes the last entry and the
	 * application hasn't read the first one.
	 */
	return entries + 1;
}

#if QELR_DB_RECOVERY
/* Accept a qelr_user_db_rec pointer-to-pointer since different structures
 * which have a doorbell address require this functionality (cq/sq/rq).
*/ int qelr_db_rec_alloc(struct qelr_devctx *cxt, struct qelr_user_db_rec **db_rec_addr_ptr) { u32 size = cxt->kernel_page_size; void *addr; int rc; /* map the address of the user doorbell recovery entry to kernel */ addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, QELR_ANON_FD, QELR_ANON_OFFSET); if (addr == MAP_FAILED) { DP_ERR(cxt->dbg_fp, "doorbell recovery mmap failed\n"); return errno; } rc = ibv_dontfork_range(addr, size); if (rc) { munmap(addr, size); DP_ERR(cxt->dbg_fp, "doorbell recovery ibv_dontfork_range failed\n"); return rc; } *db_rec_addr_ptr = addr; return 0; } void qelr_db_rec_free(struct qelr_devctx *cxt, struct qelr_user_db_rec **db_rec_addr_ptr) { ibv_dofork_range(*db_rec_addr_ptr, cxt->kernel_page_size); munmap(*db_rec_addr_ptr, cxt->kernel_page_size); } #endif struct ibv_cq *qelr_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector) { struct qelr_devctx *cxt = get_qelr_ctx(context); struct qelr_create_cq_resp resp; struct qelr_create_cq_req cmd; struct qelr_cq *cq; int chain_size; int rc; DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "create cq: context=%p, cqe=%d, channel=%p, comp_vector=%d\n", context, cqe, channel, comp_vector); if (!cqe || cqe > cxt->max_cqes) { DP_ERR(cxt->dbg_fp, "create cq: failed. attempted to allocate %d cqes but valid range is 1...%d\n", cqe, cxt->max_cqes); return NULL; } /* allocate CQ structure */ cq = calloc(1, sizeof(*cq)); if (!cq) return NULL; /* allocate CQ buffer */ chain_size = qelr_cq_entries(cqe) * QELR_CQE_SIZE; rc = qelr_chain_alloc(&cq->chain, chain_size, cxt->kernel_page_size, QELR_CQE_SIZE); if (rc) goto err_0; #if QELR_DB_RECOVERY rc = qelr_db_rec_alloc(cxt, &cq->db_rec_addr); if (rc) goto err_1; #endif /* provide kernel with chain address and size and doorbell recovery * user entry address for mmap */ cmd.addr = (uintptr_t)cq->chain.first_addr; cmd.len = cq->chain.size; #if QELR_DB_RECOVERY cmd.db_rec_addr = (uintptr_t)cq->db_rec_addr; #endif rc = ibv_cmd_create_cq(context, cqe, channel, comp_vector, &cq->ibv_cq, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (rc) { DP_ERR(cxt->dbg_fp, "create cq: failed with rc = %d\n", rc); goto err_2; } /* init doorbell params */ cq->db.data.icid = htole16(resp.icid); cq->db.data.params = DB_AGG_CMD_SET << RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT; cq->db_addr = cxt->db_addr + resp.db_offset; /* point to the very last element, passing this we will toggle */ cq->toggle_cqe = qelr_chain_get_last_elem(&cq->chain); cq->chain_toggle = RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK; cq->latest_cqe = NULL; /* must be different from chain_toggle */ consume_cqe(cq); DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "create cq: successfully created %p\n", cq); return &cq->ibv_cq; err_2: #if QELR_DB_RECOVERY qelr_db_rec_free(cxt, &cq->db_rec_addr); err_1: #endif qelr_chain_free(&cq->chain); err_0: free(cq); return NULL; } int qelr_destroy_cq(struct ibv_cq *ibv_cq) { struct qelr_devctx *cxt = get_qelr_ctx(ibv_cq->context); struct qelr_cq *cq = get_qelr_cq(ibv_cq); int rc; DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "destroy cq: %p\n", cq); rc = ibv_cmd_destroy_cq(ibv_cq); if (rc) { DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "destroy cq: failed to destroy %p, got %d.\n", cq, rc); return rc; } qelr_chain_free(&cq->chain); #if QELR_DB_RECOVERY qelr_db_rec_free(cxt, &cq->db_rec_addr); #endif free(cq); DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "destroy cq: successfully destroyed %p\n", cq); return 0; } #if QELR_SRQ int qelr_query_srq(struct ibv_srq *ibv_srq, struct 
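		     /* The SRQ query/modify verbs below are thin pass-throughs to the
		      * kernel via ibv_cmd_query_srq()/ibv_cmd_modify_srq(); no user-space
		      * SRQ state needs to be touched for them. */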
ibv_srq_attr *attr) { struct qelr_devctx *cxt = get_qelr_ctx(ibv_srq->context); struct qelr_srq *srq = get_qelr_srq(ibv_srq); struct ibv_query_srq cmd; int rc; rc = ibv_cmd_query_srq(ibv_srq, attr, &cmd, sizeof(cmd)); if (rc) { DP_ERR(cxt->dbg_fp, "query srq: failed to query %p, got %d.\n", srq, rc); return rc; } DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ, "query srq: successfully queried %p\n", srq); return 0; } int qelr_modify_srq(struct ibv_srq *ibv_srq, struct ibv_srq_attr *attr, int attr_mask) { struct qelr_devctx *cxt = get_qelr_ctx(ibv_srq->context); struct qelr_srq *srq = get_qelr_srq(ibv_srq); struct ibv_modify_srq cmd; int rc; rc = ibv_cmd_modify_srq(ibv_srq, attr, attr_mask, &cmd, sizeof(cmd)); if (rc) { DP_ERR(cxt->dbg_fp, "modify srq: failed to modify %p, got %d.\n", srq, rc); return rc; } DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ, "modify srq: successfully modified %p\n", srq); return 0; } int qelr_destroy_srq(struct ibv_srq *ibv_srq) { struct qelr_devctx *cxt = get_qelr_ctx(ibv_srq->context); struct qelr_srq *srq = get_qelr_srq(ibv_srq); uint32_t *virt_prod_pair_addr; uint32_t prod_size; int rc; rc = ibv_cmd_destroy_srq(ibv_srq); if (rc) { DP_ERR(cxt->dbg_fp, "destroy srq: failed to destroy %p, got %d.\n", srq, rc); return rc; } qelr_chain_free(&srq->hw_srq.chain); virt_prod_pair_addr = srq->hw_srq.virt_prod_pair_addr; prod_size = sizeof(struct rdma_srq_producers); ibv_dofork_range(virt_prod_pair_addr, prod_size); munmap(virt_prod_pair_addr, prod_size); free(srq); DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ, "destroy srq: successfully destroyed %p\n", srq); return 0; } static inline void qelr_create_srq_configure_req(struct qelr_srq *srq, struct qelr_create_srq_req *req) { req->srq_addr = (uintptr_t)srq->hw_srq.chain.first_addr; req->srq_len = srq->hw_srq.chain.size; req->prod_pair_addr = (uintptr_t)srq->hw_srq.virt_prod_pair_addr; } static inline int qelr_create_srq_buffers(struct qelr_devctx *cxt, struct qelr_srq *srq, struct ibv_srq_init_attr *attrs) { uint32_t max_wr, max_sges, max_recv_buf; int chain_size, prod_size; void *addr; int rc; max_wr = attrs->attr.max_wr; max_wr = max_t(uint32_t, max_wr, 1); max_wr = min_t(uint32_t, max_wr, cxt->max_srq_wr); max_sges = max_wr * (cxt->sges_per_srq_wr + 1); /* +1 for header */ max_recv_buf = max_sges * QELR_RQE_ELEMENT_SIZE; chain_size = max_recv_buf; rc = qelr_chain_alloc(&srq->hw_srq.chain, chain_size, cxt->kernel_page_size, QELR_RQE_ELEMENT_SIZE); if (rc) { DP_ERR(cxt->dbg_fp, "create srq: failed to map srq, got %d", rc); return rc; } prod_size = sizeof(struct rdma_srq_producers); addr = mmap(NULL, prod_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (addr == MAP_FAILED) { DP_ERR(cxt->dbg_fp, "create srq: failed to map producer, got %d", errno); qelr_chain_free(&srq->hw_srq.chain); return errno; } rc = ibv_dontfork_range(addr, prod_size); if (rc) { munmap(addr, prod_size); qelr_chain_free(&srq->hw_srq.chain); return rc; } srq->hw_srq.virt_prod_pair_addr = addr; srq->hw_srq.max_sges = cxt->sges_per_srq_wr; srq->hw_srq.max_wr = max_wr; return 0; } struct ibv_srq *qelr_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *init_attr) { struct qelr_devctx *cxt = get_qelr_ctx(pd->context); struct qelr_create_srq_req req = { { 0 } }; struct qelr_create_srq_resp resp; struct qelr_srq *srq; int rc, status = 0; srq = calloc(1, sizeof(*srq)); if (!srq) goto err0; rc = qelr_create_srq_buffers(cxt, srq, init_attr); if (rc) goto err1; pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE); qelr_create_srq_configure_req(srq, &req); 
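	/* At this point the SRQ receive chain and the anonymous producer page
	 * (struct rdma_srq_producers) are allocated and protected with
	 * ibv_dontfork_range(); their addresses were placed in req above so
	 * that the kernel driver can map them for the new SRQ. */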
status = ibv_cmd_create_srq(pd, &srq->ibv_srq, init_attr, &req.ibv_cmd, sizeof(req), &resp.ibv_resp, sizeof(resp)); if (status) { uint32_t *virt_prod_pair_addr = srq->hw_srq.virt_prod_pair_addr; uint32_t prod_size = sizeof(struct rdma_srq_producers); ibv_dofork_range(virt_prod_pair_addr, prod_size); munmap(virt_prod_pair_addr, prod_size); qelr_chain_free(&srq->hw_srq.chain); goto err1; } DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ, "create srq: successfully created %p.\n", srq); return &srq->ibv_srq; err1: free(srq); err0: DP_ERR(cxt->dbg_fp, "create srq: failed to create %p.\n", srq); return NULL; } #endif static inline bool qelr_qp_is_iwarp(struct qelr_qp *qp) { return !!(qp->flags & QELR_QP_FLAG_IWARP); } static inline bool qelr_qp_ldpm_enabled(struct qelr_qp *qp) { return !!(qp->flags & QELR_QP_FLAG_LDPM_EN); } static inline bool qelr_qp_edpm_enabled(struct qelr_qp *qp) { return !!(qp->flags & QELR_QP_FLAG_EDPM_EN); } static inline bool qelr_qp_atomic_enabled(struct qelr_qp *qp) { return !!(qp->flags & QELR_QP_FLAG_ATOMIC_EN); } static inline bool qelr_qp_wids_enabled(struct qelr_qp *qp) { return !!(qp->flags & QELR_QP_FLAG_WIDS_EN); } static inline bool qelr_qp_has_rq(struct qelr_qp *qp) { return !!(qp->flags & QELR_QP_FLAG_RQ); } static inline bool qelr_qp_has_sq(struct qelr_qp *qp) { return !!(qp->flags & QELR_QP_FLAG_SQ); } static inline bool qelr_qp_has_srq(struct qelr_qp *qp) { return !!qp->srq; } static inline int qelr_create_qp_buffers_sq(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_qp_init_attr *attrs) { uint32_t max_send_wr, max_send_sges, max_send_buf; int chain_size; int rc; /* SQ */ max_send_wr = attrs->cap.max_send_wr; max_send_wr = max_t(uint32_t, max_send_wr, 1); max_send_wr = min_t(uint32_t, max_send_wr, cxt->max_send_wr); max_send_sges = max_send_wr * cxt->sges_per_send_wr; max_send_buf = max_send_sges * QELR_SQE_ELEMENT_SIZE; chain_size = max_send_buf; rc = qelr_chain_alloc(&qp->sq.chain, chain_size, cxt->kernel_page_size, QELR_SQE_ELEMENT_SIZE); if (rc) DP_ERR(cxt->dbg_fp, "create qp: failed to map SQ chain, got %d", rc); qp->sq.max_wr = max_send_wr; qp->sq.max_sges = cxt->sges_per_send_wr; #if QELR_DB_RECOVERY rc = qelr_db_rec_alloc(cxt, &qp->sq.db_rec_addr); if (rc) { qelr_chain_free(&qp->sq.chain); DP_ERR(cxt->dbg_fp, "create qp: failed to map SQ db rec, got %d", rc); } #endif return rc; } static inline int qelr_create_qp_buffers_rq(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_qp_init_attr *attrs) { uint32_t max_recv_wr, max_recv_sges, max_recv_buf; int chain_size; int rc; /* RQ */ max_recv_wr = attrs->cap.max_recv_wr; max_recv_wr = max_t(uint32_t, max_recv_wr, 1); max_recv_wr = min_t(uint32_t, max_recv_wr, cxt->max_recv_wr); max_recv_sges = max_recv_wr * cxt->sges_per_recv_wr; max_recv_buf = max_recv_sges * QELR_RQE_ELEMENT_SIZE; chain_size = max_recv_buf; rc = qelr_chain_alloc(&qp->rq.chain, chain_size, cxt->kernel_page_size, QELR_RQE_ELEMENT_SIZE); if (rc) DP_ERR(cxt->dbg_fp, "create qp: failed to map RQ chain, got %d", rc); qp->rq.max_wr = max_recv_wr; qp->rq.max_sges = cxt->sges_per_recv_wr; #if QELR_DB_RECOVERY rc = qelr_db_rec_alloc(cxt, &qp->rq.db_rec_addr); if (rc) { DP_ERR(cxt->dbg_fp, "create qp: failed to map RQ db rec, got %d", rc); qelr_chain_free(&qp->rq.chain); } #endif return rc; } static inline int qelr_create_qp_buffers(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_qp_init_attr *attrs) { int rc; if (qelr_qp_has_sq(qp)) { rc = qelr_create_qp_buffers_sq(cxt, qp, attrs); if (rc) return rc; } if (qelr_qp_has_rq(qp)) { rc = 
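		     /* An RQ chain is allocated only when QELR_QP_FLAG_RQ is set;
		      * qelr_basic_qp_config() sets it solely for RC QPs that are not
		      * attached to an SRQ. */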
qelr_create_qp_buffers_rq(cxt, qp, attrs);
		if (rc)
			goto err;
	}

	return 0;

err:
	if (qelr_qp_has_sq(qp)) {
		qelr_chain_free(&qp->sq.chain);
#if QELR_DB_RECOVERY
		qelr_db_rec_free(cxt, &qp->sq.db_rec_addr);
#endif
	}

	return rc;
}

static inline int qelr_configure_qp_sq(struct qelr_devctx *cxt,
				       struct qelr_qp *qp,
				       struct ibv_qp_init_attr *attrs,
				       struct qelr_create_qp_resp *resp)
{
	qp->sq.icid = resp->sq_icid;
	qp->sq.db_data.data.icid = htole16(resp->sq_icid);
	qp->sq.prod = 0;
	qp->sq.db = cxt->db_addr + resp->sq_db_offset;
	qp->sq.edpm_db = cxt->db_addr;
	qp->sq_sig_all = attrs->sq_sig_all;

	if (qelr_qp_is_iwarp(qp))
		qp->max_inline_data = IWARP_REQ_MAX_INLINE_DATA_SIZE;
	else
		qp->max_inline_data = ROCE_REQ_MAX_INLINE_DATA_SIZE;

	/* Shadow SQ depth is one more than requested since the prod/cons
	 * mechanism in an array of N elements supports only N-1 elements.
	 * It is completely safe to do this now, after the chain has been
	 * allocated, since it is unrelated to the number of WQEs. Also, we
	 * don't need to check wrap since the size is limited to 32k.
	 */
	qp->sq.max_wr++;
	qp->wqe_wr_id = calloc(qp->sq.max_wr, sizeof(*qp->wqe_wr_id));
	if (!qp->wqe_wr_id) {
		DP_ERR(cxt->dbg_fp,
		       "create qp: failed shadow SQ memory allocation\n");
		return -ENOMEM;
	}

	return 0;
}

static inline int qelr_configure_qp_rq(struct qelr_devctx *cxt,
				       struct qelr_qp *qp,
				       struct ibv_qp_init_attr *attrs,
				       struct qelr_create_qp_resp *resp)
{
	/* RQ */
	qp->rq.icid = resp->rq_icid;
	qp->rq.db_data.data.icid = htole16(resp->rq_icid);
	qp->rq.db = cxt->db_addr + resp->rq_db_offset;
	qp->rq.iwarp_db2 = cxt->db_addr + resp->rq_db2_offset;
	qp->rq.iwarp_db2_data.data.icid = htole16(qp->rq.icid);
	qp->rq.iwarp_db2_data.data.value = htole16(DQ_TCM_IWARP_POST_RQ_CF_CMD);
	qp->rq.prod = 0;

	/* Shadow RQ depth is one more than requested since the prod/cons
	 * mechanism in an array of N elements supports only N-1 elements.
	 * It is completely safe to do this now, after the chain has been
	 * allocated, since it is unrelated to the number of WQEs. Also, we
	 * don't need to check wrap since the size is limited to 32k.
*/ qp->rq.max_wr++; qp->rqe_wr_id = calloc(qp->rq.max_wr, sizeof(*qp->rqe_wr_id)); if (!qp->rqe_wr_id) { DP_ERR(cxt->dbg_fp, "create qp: failed shadow RQ memory allocation\n"); return -ENOMEM; } return 0; } static inline int qelr_configure_qp(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_qp_init_attr *attrs, struct qelr_create_qp_resp *resp) { int rc; /* general */ pthread_spin_init(&qp->q_lock, PTHREAD_PROCESS_PRIVATE); qp->qp_id = resp->qp_id; qp->state = QELR_QPS_RST; qp->wid_count = cxt->wid_count; if (cxt->dpm_enabled & QELR_RDMA_DPM_TYPE_ENHANCED) qp->flags |= QELR_QP_FLAG_EDPM_EN; if (cxt->dpm_enabled & QELR_RDMA_DPM_TYPE_LEGACY) qp->flags |= QELR_QP_FLAG_LDPM_EN; if (cxt->wids_enabled) qp->flags |= QELR_QP_FLAG_WIDS_EN; if (IS_IWARP(qp->ibv_qp.context->device)) qp->flags |= QELR_QP_FLAG_IWARP; if (resp->atomic_supported) qp->flags |= QELR_QP_FLAG_ATOMIC_EN; if (qelr_qp_has_sq(qp)) { rc = qelr_configure_qp_sq(cxt, qp, attrs, resp); if (rc) return rc; } if (qelr_qp_has_rq(qp)) { rc = qelr_configure_qp_rq(cxt, qp, attrs, resp); if (rc) goto err; } return 0; err: if (qelr_qp_has_sq(qp)) free(qp->wqe_wr_id); return rc; } static inline void qelr_print_qp_init_attr( struct qelr_devctx *cxt, struct ibv_qp_init_attr *attr) { DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "create qp: send_cq=%p, recv_cq=%p, srq=%p, max_inline_data=%d, max_recv_sge=%d, max_recv_wr=%d, max_send_sge=%d, max_send_wr=%d, qp_type=%d, sq_sig_all=%d\n", attr->send_cq, attr->recv_cq, attr->srq, attr->cap.max_inline_data, attr->cap.max_recv_sge, attr->cap.max_recv_wr, attr->cap.max_send_sge, attr->cap.max_send_wr, attr->qp_type, attr->sq_sig_all); } static inline void qelr_create_qp_configure_sq_req(struct qelr_qp *qp, struct qelr_create_qp_req *req) { req->sq_addr = (uintptr_t)qp->sq.chain.first_addr; req->sq_len = qp->sq.chain.size; #if QELR_DB_RECOVERY req->sq_db_rec_addr = (uintptr_t)qp->sq.db_rec_addr; #endif } static inline void qelr_create_qp_configure_rq_req(struct qelr_qp *qp, struct qelr_create_qp_req *req) { req->rq_addr = (uintptr_t)qp->rq.chain.first_addr; req->rq_len = qp->rq.chain.size; #if QELR_DB_RECOVERY req->rq_db_rec_addr = (uintptr_t)qp->rq.db_rec_addr; #endif } static inline void qelr_create_qp_configure_req(struct qelr_qp *qp, struct qelr_create_qp_req *req) { memset(req, 0, sizeof(*req)); req->qp_handle_hi = U64_HI(qp); req->qp_handle_lo = U64_LO(qp); if (qelr_qp_has_sq(qp)) qelr_create_qp_configure_sq_req(qp, req); if (qelr_qp_has_rq(qp)) qelr_create_qp_configure_rq_req(qp, req); } static inline void qelr_basic_qp_config(struct qelr_qp *qp, struct ibv_qp_init_attr *attrs) { #if QELR_SRQ if (attrs->srq) qp->srq = get_qelr_srq(attrs->srq); #endif if (attrs->qp_type == IBV_QPT_RC) qp->flags |= QELR_QP_FLAG_SQ; if (attrs->qp_type == IBV_QPT_RC && !qp->srq) qp->flags |= QELR_QP_FLAG_RQ; } struct ibv_qp *qelr_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attrs) { struct qelr_devctx *cxt = get_qelr_ctx(pd->context); struct qelr_create_qp_resp resp; struct qelr_create_qp_req req; struct qelr_qp *qp; int rc; qelr_print_qp_init_attr(cxt, attrs); qp = calloc(1, sizeof(*qp)); if (!qp) return NULL; qelr_basic_qp_config(qp, attrs); rc = qelr_create_qp_buffers(cxt, qp, attrs); if (rc) goto err0; qelr_create_qp_configure_req(qp, &req); rc = ibv_cmd_create_qp(pd, &qp->ibv_qp, attrs, &req.ibv_qp, sizeof(req), &resp.ibv_resp, sizeof(resp)); if (rc) { DP_ERR(cxt->dbg_fp, "create qp: failed on ibv_cmd_create_qp with %d\n", rc); goto err1; } rc = qelr_configure_qp(cxt, qp, attrs, &resp); if (rc) goto err2; 
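	/* From here on the QP is fully set up: the kernel object exists, the
	 * SQ/RQ chains are mapped and the shadow wr_id arrays are allocated.
	 * Failures above unwind in reverse order: err2 destroys the kernel QP,
	 * err1 frees the chains and err0 frees the qelr_qp itself. */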
DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "create qp: successfully created %p. handle_hi=%x handle_lo=%x\n", qp, req.qp_handle_hi, req.qp_handle_lo); return &qp->ibv_qp; err2: rc = ibv_cmd_destroy_qp(&qp->ibv_qp); if (rc) DP_ERR(cxt->dbg_fp, "create qp: fatal fault. rc=%d\n", rc); err1: if (qelr_qp_has_sq(qp)) qelr_chain_free(&qp->sq.chain); if (qelr_qp_has_rq(qp)) qelr_chain_free(&qp->rq.chain); err0: free(qp); return NULL; } static void qelr_print_ah_attr(struct qelr_devctx *cxt, struct ibv_ah_attr *attr) { DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "grh.dgid=[%#" PRIx64 ":%#" PRIx64 "], grh.flow_label=%d, grh.sgid_index=%d, grh.hop_limit=%d, grh.traffic_class=%d, dlid=%d, sl=%d, src_path_bits=%d, static_rate = %d, port_num=%d\n", be64toh(attr->grh.dgid.global.interface_id), be64toh(attr->grh.dgid.global.subnet_prefix), attr->grh.flow_label, attr->grh.hop_limit, attr->grh.sgid_index, attr->grh.traffic_class, attr->dlid, attr->sl, attr->src_path_bits, attr->static_rate, attr->port_num); } static void qelr_print_qp_attr(struct qelr_devctx *cxt, struct ibv_qp_attr *attr) { DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "\tqp_state=%d\tcur_qp_state=%d\tpath_mtu=%d\tpath_mig_state=%d\tqkey=%d\trq_psn=%d\tsq_psn=%d\tdest_qp_num=%d\tqp_access_flags=%d\tmax_inline_data=%d\tmax_recv_sge=%d\tmax_recv_wr=%d\tmax_send_sge=%d\tmax_send_wr=%d\tpkey_index=%d\talt_pkey_index=%d\ten_sqd_async_notify=%d\tsq_draining=%d\tmax_rd_atomic=%d\tmax_dest_rd_atomic=%d\tmin_rnr_timer=%d\tport_num=%d\ttimeout=%d\tretry_cnt=%d\trnr_retry=%d\talt_port_num=%d\talt_timeout=%d\n", attr->qp_state, attr->cur_qp_state, attr->path_mtu, attr->path_mig_state, attr->qkey, attr->rq_psn, attr->sq_psn, attr->dest_qp_num, attr->qp_access_flags, attr->cap.max_inline_data, attr->cap.max_recv_sge, attr->cap.max_recv_wr, attr->cap.max_send_sge, attr->cap.max_send_wr, attr->pkey_index, attr->alt_pkey_index, attr->en_sqd_async_notify, attr->sq_draining, attr->max_rd_atomic, attr->max_dest_rd_atomic, attr->min_rnr_timer, attr->port_num, attr->timeout, attr->retry_cnt, attr->rnr_retry, attr->alt_port_num, attr->alt_timeout); qelr_print_ah_attr(cxt, &attr->ah_attr); qelr_print_ah_attr(cxt, &attr->alt_ah_attr); } int qelr_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) { struct ibv_query_qp cmd; struct qelr_devctx *cxt = get_qelr_ctx(qp->context); int rc; DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "QP Query %p, attr_mask=0x%x\n", get_qelr_qp(qp), attr_mask); rc = ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd, sizeof(cmd)); qelr_print_qp_attr(cxt, attr); return rc; } static enum qelr_qp_state get_qelr_qp_state(enum ibv_qp_state qps) { switch (qps) { case IBV_QPS_RESET: return QELR_QPS_RST; case IBV_QPS_INIT: return QELR_QPS_INIT; case IBV_QPS_RTR: return QELR_QPS_RTR; case IBV_QPS_RTS: return QELR_QPS_RTS; case IBV_QPS_SQD: return QELR_QPS_SQD; case IBV_QPS_SQE: return QELR_QPS_SQE; case IBV_QPS_ERR: default: return QELR_QPS_ERR; }; } static void qelr_reset_qp_hwq_info(struct qelr_qp_hwq_info *q) { qelr_chain_reset(&q->chain); q->prod = 0; q->cons = 0; q->wqe_cons = 0; q->db_data.data.value = 0; } static int qelr_update_qp_state(struct qelr_qp *qp, enum ibv_qp_state new_ib_state) { int status = 0; enum qelr_qp_state new_state; /* iWARP states are updated implicitely by driver and don't have a * real purpose in user-lib. 
*/ if (qelr_qp_is_iwarp(qp)) return 0; new_state = get_qelr_qp_state(new_ib_state); pthread_spin_lock(&qp->q_lock); if (new_state == qp->state) { pthread_spin_unlock(&qp->q_lock); return 0; } switch (qp->state) { case QELR_QPS_RST: switch (new_state) { case QELR_QPS_INIT: qp->prev_wqe_size = 0; if (qelr_qp_has_sq(qp)) qelr_reset_qp_hwq_info(&qp->sq); if (qelr_qp_has_rq(qp)) qelr_reset_qp_hwq_info(&qp->rq); break; default: status = -EINVAL; break; }; break; case QELR_QPS_INIT: /* INIT->XXX */ switch (new_state) { case QELR_QPS_RTR: /* Update doorbell (in case post_recv was done before * move to RTR) */ if (!qelr_qp_has_rq(qp)) break; if (IS_ROCE(qp->ibv_qp.context->device)) { mmio_wc_start(); writel(qp->rq.db_data.raw, qp->rq.db); mmio_flush_writes(); } break; case QELR_QPS_ERR: break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case QELR_QPS_RTR: /* RTR->XXX */ switch (new_state) { case QELR_QPS_RTS: break; case QELR_QPS_ERR: break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case QELR_QPS_RTS: /* RTS->XXX */ switch (new_state) { case QELR_QPS_SQD: case QELR_QPS_SQE: break; case QELR_QPS_ERR: break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case QELR_QPS_SQD: /* SQD->XXX */ switch (new_state) { case QELR_QPS_RTS: case QELR_QPS_SQE: case QELR_QPS_ERR: break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case QELR_QPS_SQE: switch (new_state) { case QELR_QPS_RTS: case QELR_QPS_ERR: break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case QELR_QPS_ERR: /* ERR->XXX */ switch (new_state) { case QELR_QPS_RST: break; default: status = -EINVAL; break; }; break; default: status = -EINVAL; break; }; if (!status) qp->state = new_state; pthread_spin_unlock(&qp->q_lock); return status; } int qelr_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, int attr_mask) { struct ibv_modify_qp cmd = {}; struct qelr_qp *qp = get_qelr_qp(ibqp); struct qelr_devctx *cxt = get_qelr_ctx(ibqp->context); union ibv_gid sgid, *p_dgid; int rc; DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "QP Modify %p, attr_mask=0x%x\n", qp, attr_mask); qelr_print_qp_attr(cxt, attr); rc = ibv_cmd_modify_qp(ibqp, attr, attr_mask, &cmd, sizeof(cmd)); if (rc) { DP_ERR(cxt->dbg_fp, "QP Modify: Failed command. rc=%d\n", rc); return rc; } if (attr_mask & IBV_QP_STATE) { rc = qelr_update_qp_state(qp, attr->qp_state); DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "QP Modify state %d->%d, rc=%d\n", qp->state, attr->qp_state, rc); if (rc) { DP_ERR(cxt->dbg_fp, "QP Modify: Failed to update state. rc=%d\n", rc); return rc; } } /* EDPM must be disabled if GIDs match */ if (attr_mask & IBV_QP_AV) { rc = ibv_query_gid(ibqp->context, attr->ah_attr.port_num, attr->ah_attr.grh.sgid_index, &sgid); if (!rc) { p_dgid = &attr->ah_attr.grh.dgid; if (!memcmp(&sgid, p_dgid, sizeof(sgid))) qp->flags &= ~QELR_QP_FLAG_EDPM_EN; DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "QP Modify: %p, dpm_enabled=%d\n", qp, qelr_qp_edpm_enabled(qp)); } else { DP_ERR(cxt->dbg_fp, "QP Modify: Failed querying GID. 
rc=%d\n", rc); } } return 0; } int qelr_destroy_qp(struct ibv_qp *ibqp) { struct qelr_devctx *cxt = get_qelr_ctx(ibqp->context); struct qelr_qp *qp = get_qelr_qp(ibqp); int rc = 0; DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "destroy qp: %p\n", qp); rc = ibv_cmd_destroy_qp(ibqp); if (rc) { DP_ERR(cxt->dbg_fp, "destroy qp: failed to destroy %p, got %d.\n", qp, rc); return rc; } if (qelr_qp_has_sq(qp)) { free(qp->wqe_wr_id); qelr_chain_free(&qp->sq.chain); #if QELR_DB_RECOVERY qelr_db_rec_free(cxt, &qp->sq.db_rec_addr); #endif } if (qelr_qp_has_rq(qp)) { free(qp->rqe_wr_id); qelr_chain_free(&qp->rq.chain); #if QELR_DB_RECOVERY qelr_db_rec_free(cxt, &qp->rq.db_rec_addr); #endif } free(qp); DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "destroy cq: successfully destroyed %p\n", qp); return 0; } static int sge_data_len(struct ibv_sge *sg_list, int num_sge) { int i, len = 0; for (i = 0; i < num_sge; i++) len += sg_list[i].length; return len; } static inline void copy_swap_data64(uint64_t *out, uint64_t *in, uint32_t n_bytes) { uint32_t i, n_dwords; n_dwords = ALIGN_DIV(n_bytes, sizeof(uint64_t)); for (i = 0; i < n_dwords; i++) out[i] = (uint64_t)htobe64(in[i]); } #define QELR_DPM_LIMIT (8192) static inline void qelr_init_dpm_info(struct qelr_qp *qp, struct ibv_send_wr *wr, struct qelr_dpm *dpm, int data_size, uint32_t sq_elem_left) { /* Reset always because the buffer may be used even in non DPM mode */ dpm->payload_offset = 0; dpm->payload_size = 0; dpm->msg.raw = 0; /* sched_gecpu is costly. Store the cpu as it is used again later */ dpm->cpu = sched_getcpu(); if (unlikely(dpm->cpu >= qp->wid_count)) goto no_dpm; if (sq_elem_left != qp->sq.chain.n_elems) goto no_dpm; if (wr->send_flags & IBV_SEND_INLINE && qelr_qp_edpm_enabled(qp)) { dpm->rdma_ext = (struct qelr_rdma_ext *)&dpm->payload; dpm->is_edpm = 1; #if QELR_LDPM dpm->is_ldpm = 0; #endif return; } #if QELR_LDPM else if (!(wr->send_flags & IBV_SEND_INLINE) && data_size <= QELR_DPM_LIMIT && qelr_qp_ldpm_enabled(qp) && wr->opcode <= IBV_WR_ATOMIC_FETCH_AND_ADD) { /* Legacy DPM is relevant for send/imm, write/imm, read * and atomic, but limited to 8kb */ dpm->is_edpm = 0; dpm->is_ldpm = 1; return; } #endif no_dpm: dpm->is_edpm = 0; #if QELR_LDPM dpm->is_ldpm = 0; #endif } #define QELR_IB_OPCODE_SEND_ONLY 0x04 #define QELR_IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE 0x05 #define QELR_IB_OPCODE_RDMA_WRITE_ONLY 0x0a #define QELR_IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE 0x0b #define QELR_IS_IMM(opcode) \ ((opcode == QELR_IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE) || \ (opcode == QELR_IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE)) static inline void qelr_edpm_set_msg_data(struct qelr_dpm *dpm, uint8_t opcode, uint16_t length, uint8_t se, uint8_t comp) { uint32_t wqe_size, qwords, params; wqe_size = length + (QELR_IS_IMM(opcode) ? 
sizeof(uint32_t) : 0); qwords = ALIGN_DIV(wqe_size + sizeof(struct db_rdma_dpm_data), sizeof(uint64_t)); params = DPM_RDMA << DB_RDMA_DPM_PARAMS_DPM_TYPE_SHIFT; params |= opcode << DB_RDMA_DPM_PARAMS_OPCODE_SHIFT; params |= wqe_size << DB_RDMA_DPM_PARAMS_WQE_SIZE_SHIFT; params |= comp << DB_RDMA_DPM_PARAMS_COMPLETION_FLG_SHIFT; params |= se << DB_RDMA_DPM_PARAMS_S_FLG_SHIFT; params |= qwords << DB_RDMA_DPM_PARAMS_SIZE_SHIFT; dpm->msg.data.params.params = htole32(params); } static inline void qelr_edpm_set_inv_imm(struct qelr_qp *qp, struct qelr_dpm *dpm, __be32 data) { memcpy(&dpm->payload[dpm->payload_offset], &data, sizeof(data)); dpm->payload_offset += sizeof(data); dpm->payload_size += sizeof(data); } static inline void qelr_edpm_set_rdma_ext(struct qelr_qp *qp, struct qelr_dpm *dpm, uint64_t remote_addr, uint32_t rkey) { dpm->rdma_ext->remote_va = htobe64(remote_addr); dpm->rdma_ext->remote_key = htobe32(rkey); dpm->payload_offset += sizeof(*dpm->rdma_ext); dpm->payload_size += sizeof(*dpm->rdma_ext); } static inline void qelr_edpm_set_payload(struct qelr_qp *qp, struct qelr_dpm *dpm, char *buf, uint32_t length) { memcpy(&dpm->payload[dpm->payload_offset], buf, length); dpm->payload_offset += length; } static void qelr_prepare_sq_inline_data(struct qelr_qp *qp, struct qelr_dpm *dpm, int total_len, uint8_t *wqe_size, struct ibv_send_wr *wr) { uint32_t n_bytes, isg, ib, remain_len; uint8_t *dst, *buffer; /* DPM buffer is used here even in non-DPM mode */ buffer = &dpm->payload[dpm->payload_offset]; *wqe_size += ALIGN_DIV(total_len, sizeof(struct rdma_sq_common_wqe)); /* Copy inline data to DPM buffer but don't swap it yet as it may be * unaligned (this is done even if DPM is not used). */ for (ib = 0, isg = 0; isg < wr->num_sge; isg++) { struct ibv_sge *sge = &wr->sg_list[isg]; memcpy(&buffer[ib], (uint8_t *)(uintptr_t)sge->addr, sge->length); ib += sge->length; } /* Copy and swap inline data to SQ - up to 2 iterations, the second * occurs only on SQ wrap */ remain_len = total_len; do { dst = (uint8_t *)qelr_chain_produce_bytes(&qp->sq.chain, remain_len, &n_bytes, QELR_SQE_ELEMENT_SIZE); n_bytes = min_t(uint32_t, n_bytes, remain_len); remain_len -= n_bytes; copy_swap_data64((uint64_t *)dst, (uint64_t *)buffer, n_bytes); buffer += n_bytes; } while (remain_len); if (dpm->is_edpm) { dpm->payload_size += total_len; if (wr->opcode == IBV_WR_RDMA_WRITE || wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) dpm->rdma_ext->dma_length = htobe32(total_len); } } static void qelr_prepare_sq_sges(struct qelr_qp *qp, struct qelr_dpm *dpm, uint8_t *wqe_size, struct ibv_send_wr *wr) { int i; for (i = 0; i < wr->num_sge; i++) { struct rdma_sq_sge *sge = qelr_chain_produce(&qp->sq.chain); TYPEPTR_ADDR_SET(sge, addr, wr->sg_list[i].addr); sge->l_key = htole32(wr->sg_list[i].lkey); sge->length = htole32(wr->sg_list[i].length); #if QELR_LDPM if (dpm->is_ldpm) { memcpy(&dpm->payload[dpm->payload_size], sge, sizeof(*sge)); dpm->payload_size += sizeof(*sge); } #endif } if (wqe_size) *wqe_size += wr->num_sge; } static uint32_t qelr_prepare_sq_rdma_data(struct qelr_qp *qp, struct qelr_dpm *dpm, int data_size, struct rdma_sq_rdma_wqe_1st *rwqe, struct rdma_sq_rdma_wqe_2nd *rwqe2, struct ibv_send_wr *wr, bool is_imm) { memset(rwqe2, 0, sizeof(*rwqe2)); rwqe2->r_key = htole32(wr->wr.rdma.rkey); TYPEPTR_ADDR_SET(rwqe2, remote_va, wr->wr.rdma.remote_addr); rwqe->length = htole32(data_size); if (is_imm) rwqe->imm_data = htole32(be32toh(wr->imm_data)); if (wr->send_flags & IBV_SEND_INLINE && data_size && (wr->opcode == 
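	    /* Inline payload is only meaningful for RDMA WRITE (with or without
	     * immediate); RDMA READ carries no data in the WQE and always takes
	     * the SGE path below. */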
IBV_WR_RDMA_WRITE_WITH_IMM || wr->opcode == IBV_WR_RDMA_WRITE)) { qelr_prepare_sq_inline_data(qp, dpm, data_size, &rwqe->wqe_size, wr); rwqe->flags |= 1 << RDMA_SQ_RDMA_WQE_1ST_INLINE_FLG_SHIFT; } else { #if QELR_LDPM if (dpm->is_ldpm) dpm->payload_size = sizeof(*rwqe) + sizeof(*rwqe2); #endif qelr_prepare_sq_sges(qp, dpm, &rwqe->wqe_size, wr); #if QELR_LDPM if (dpm->is_ldpm) { memcpy(&dpm->payload[0], rwqe, sizeof(*rwqe)); memcpy(&dpm->payload[sizeof(*rwqe)], rwqe2, sizeof(*rwqe2)); } #endif } return data_size; } static uint32_t qelr_prepare_sq_send_data(struct qelr_qp *qp, struct qelr_dpm *dpm, int data_size, struct rdma_sq_send_wqe_1st *swqe, struct rdma_sq_send_wqe_2st *swqe2, struct ibv_send_wr *wr, bool is_imm) { memset(swqe2, 0, sizeof(*swqe2)); swqe->length = htole32(data_size); if (is_imm) swqe->inv_key_or_imm_data = htole32(be32toh(wr->imm_data)); if (data_size && (wr->send_flags & IBV_SEND_INLINE)) { qelr_prepare_sq_inline_data(qp, dpm, data_size, &swqe->wqe_size, wr); swqe->flags |= 1 << RDMA_SQ_SEND_WQE_INLINE_FLG_SHIFT; } else { #if QELR_LDPM if (dpm->is_ldpm) dpm->payload_size = sizeof(*swqe) + sizeof(*swqe2); #endif qelr_prepare_sq_sges(qp, dpm, &swqe->wqe_size, wr); #if QELR_LDPM if (dpm->is_ldpm) { memcpy(&dpm->payload[0], swqe, sizeof(*swqe)); memcpy(&dpm->payload[sizeof(*swqe)], swqe2, sizeof(*swqe2)); } #endif } return data_size; } static void qelr_prepare_sq_atom_data(struct qelr_qp *qp, struct qelr_dpm *dpm, struct rdma_sq_atomic_wqe_1st *awqe1, struct rdma_sq_atomic_wqe_2nd *awqe2, struct rdma_sq_atomic_wqe_3rd *awqe3, struct ibv_send_wr *wr) { #if QELR_LDPM if (dpm->is_ldpm) { memcpy(&dpm->payload[dpm->payload_size], awqe1, sizeof(*awqe1)); dpm->payload_size += sizeof(*awqe1); memcpy(&dpm->payload[dpm->payload_size], awqe2, sizeof(*awqe2)); dpm->payload_size += sizeof(*awqe2); memcpy(&dpm->payload[dpm->payload_size], awqe3, sizeof(*awqe3)); dpm->payload_size += sizeof(*awqe3); } #endif qelr_prepare_sq_sges(qp, dpm, NULL, wr); } #if QELR_LDPM static inline void qelr_ldpm_prepare_data(struct qelr_qp *qp, struct qelr_dpm *dpm) { int val; /* DPM size is given in 8 bytes so we round up */ val = dpm->payload_size + sizeof(struct db_rdma_dpm_data); val = (val + sizeof(uint64_t) - 1) / sizeof(uint64_t); SET_FIELD(dpm->msg.data.params.params, DB_RDMA_DPM_PARAMS_SIZE, val); SET_FIELD(dpm->msg.data.params.params, DB_RDMA_DPM_PARAMS_DPM_TYPE, DPM_LEGACY); } #endif void dumpwqe(struct rdma_sq_send_wqe *swqe) { int *p = (int*)swqe; int i, size = swqe->wqe_size*ROCE_WQE_ELEM_SIZE/sizeof(int); printf(">>>>>>>>>>>>>>>>>>>>>>> Dumping WQE of %d elements:\n", size); for (i = 0; i < size; i++) printf(" %d: %08x\n", i, p[i]); } static enum ibv_wc_opcode qelr_ibv_to_wc_opcode(enum ibv_wr_opcode opcode) { switch (opcode) { case IBV_WR_RDMA_WRITE: case IBV_WR_RDMA_WRITE_WITH_IMM: return IBV_WC_RDMA_WRITE; case IBV_WR_SEND_WITH_IMM: case IBV_WR_SEND: return IBV_WC_SEND; case IBV_WR_RDMA_READ: return IBV_WC_RDMA_READ; case IBV_WR_ATOMIC_CMP_AND_SWP: return IBV_WC_COMP_SWAP; case IBV_WR_ATOMIC_FETCH_AND_ADD: return IBV_WC_FETCH_ADD; default: return IBV_WC_SEND; } } static inline void doorbell_qp(struct qelr_qp *qp) { /* make sure data is prepared in memory where device will access it */ mmio_wc_start(); /* write value to device */ writel(qp->sq.db_data.raw, qp->sq.db); #if QELR_DB_RECOVERY /* copy value to doorbell recovery mechanism */ qp->sq.db_rec_addr->db_data = qp->sq.db_data.raw; #endif /* flush write combined buffer */ mmio_flush_writes(); } #define QELR_WID_0 (0) #define 
QELR_WID_SIZE (1024) static inline uint32_t qelr_get_wid_offset(struct qelr_qp *qp, struct qelr_dpm *dpm) { if (likely(qelr_qp_wids_enabled(qp))) return dpm->cpu * QELR_WID_SIZE; else return QELR_WID_0 * QELR_WID_SIZE; } #define QELR_QWORDS_FLUSH_THRESHOLD (16) static inline void doorbell_dpm_qp(struct qelr_qp *qp, struct qelr_dpm *dpm) { uint32_t offset = 0; uint64_t data; uint64_t *payload = (uint64_t *)dpm->payload; uint32_t num_dwords; int bytes = 0; void *db_addr; mmio_wc_start(); /* Write message header */ dpm->msg.data.icid = qp->sq.db_data.data.icid; dpm->msg.data.prod_val = qp->sq.db_data.data.value; db_addr = qp->sq.edpm_db + qelr_get_wid_offset(qp, dpm); writeq(dpm->msg.raw, db_addr); /* Write mesage body */ bytes += sizeof(uint64_t); db_addr += sizeof(dpm->msg.data); num_dwords = ALIGN_DIV(dpm->payload_size, sizeof(uint64_t)); while (offset < num_dwords) { /* LDPM WQEs are considered by the DORQ HW block as regular data * hence it doesn't flip them. We flip them here for the FW's * convenience. */ #if QELR_LDPM if (dpm->is_ldpm) data = htobe64(payload[offset]); else /* EDPM */ #endif data = payload[offset]; writeq(data, db_addr); db_addr += sizeof(uint64_t); offset++; /* Since the doorbell BAR is in write combined mode it is weakly * ordered, consecutive writes without flush between them can * overwrite each other */ bytes += sizeof(uint64_t); if (bytes == 64) { mmio_flush_writes(); bytes = 0; } } if (bytes) mmio_flush_writes(); } static inline int qelr_can_post_send(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_send_wr *wr, int data_size, uint32_t sq_elem_left) { /* Invalid WR */ if (unlikely(wr->num_sge > qp->sq.max_sges)) { DP_ERR(cxt->dbg_fp, "error: WR is bad. Post send on QP %p failed\n", qp); qp->err_bitmap |= QELR_QP_ERR_BAD_SR; return -EINVAL; } /* WR overflow */ if (unlikely(qelr_wq_is_full(&qp->sq))) { if (!(qp->err_bitmap & QELR_QP_ERR_SQ_FULL)) DP_ERR(cxt->dbg_fp, "error: WQ is full. Post send on QP %p failed (this error appears only once)\n", qp); qp->err_bitmap |= QELR_QP_ERR_SQ_FULL; return -ENOMEM; } /* WQE overflow */ if (unlikely(sq_elem_left < QELR_MAX_SQ_WQE_SIZE)) { if (!(qp->err_bitmap & QELR_QP_ERR_SQ_PBL_FULL)) DP_ERR(cxt->dbg_fp, "error: WQ PBL is full. 
Post send on QP %p failed (this error appears only once)\n", qp); qp->err_bitmap |= QELR_QP_ERR_SQ_PBL_FULL; return -ENOMEM; } if (unlikely((wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP || wr->opcode == IBV_WR_ATOMIC_FETCH_AND_ADD) && !qelr_qp_atomic_enabled(qp))) { DP_ERR(cxt->dbg_fp, "Atomic not supported on this machine\n"); return -EINVAL; } if (unlikely(wr->send_flags & IBV_SEND_INLINE && data_size > qp->max_inline_data)) { DP_ERR(cxt->dbg_fp, "Too much inline data in WR: %d\n", data_size); return -EINVAL; } return 0; } static inline int __qelr_post_send(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_send_wr *wr, struct qelr_dpm *dpm, int data_size) { struct rdma_sq_common_wqe *wqe, *wqe2; struct rdma_sq_atomic_wqe_2nd *awqe2; struct rdma_sq_atomic_wqe_3rd *awqe3; uint8_t se, comp, fence, flags; uint16_t db_val; int rc = 0; se = !!(wr->send_flags & IBV_SEND_SOLICITED); fence = !!(wr->send_flags & IBV_SEND_FENCE); comp = (!!(wr->send_flags & IBV_SEND_SIGNALED)) || (!!qp->sq_sig_all); flags = se << RDMA_SQ_COMMON_WQE_SE_FLG_SHIFT | fence << RDMA_SQ_COMMON_WQE_RD_FENCE_FLG_SHIFT | comp << RDMA_SQ_COMMON_WQE_COMP_FLG_SHIFT; wqe = qelr_chain_produce(&qp->sq.chain); wqe->flags = flags; wqe->prev_wqe_size = qp->prev_wqe_size; wqe2 = qelr_chain_produce(&qp->sq.chain); switch (wr->opcode) { case IBV_WR_SEND_WITH_IMM: if (unlikely(qelr_qp_is_iwarp(qp))) { rc = -EINVAL; DP_ERR(cxt->dbg_fp, "SEND With Immediate is not supported over iWARP\n"); break; } wqe->req_type = RDMA_SQ_REQ_TYPE_SEND_WITH_IMM; wqe->wqe_size = sizeof(struct rdma_sq_send_wqe) / RDMA_WQE_BYTES; if (dpm->is_edpm) qelr_edpm_set_inv_imm(qp, dpm, wr->imm_data); qelr_prepare_sq_send_data(qp, dpm, data_size, (struct rdma_sq_send_wqe_1st *)wqe, (struct rdma_sq_send_wqe_2st *)wqe2, wr, 1 /* Imm */); if (dpm->is_edpm) qelr_edpm_set_msg_data(dpm, QELR_IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE, data_size, se, comp); #if QELR_LDPM else if (dpm->is_ldpm) qelr_ldpm_prepare_data(qp, dpm); #endif FP_DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "SEND w/ IMM length = %d imm data=%x comp=%d\n", data_size, wr->imm_data, comp); break; case IBV_WR_SEND: wqe->req_type = RDMA_SQ_REQ_TYPE_SEND; wqe->wqe_size = sizeof(struct rdma_sq_send_wqe) / RDMA_WQE_BYTES; qelr_prepare_sq_send_data(qp, dpm, data_size, (struct rdma_sq_send_wqe_1st *)wqe, (struct rdma_sq_send_wqe_2st *)wqe2, wr, 0 /* Imm */); if (dpm->is_edpm) qelr_edpm_set_msg_data(dpm, QELR_IB_OPCODE_SEND_ONLY, data_size, se, comp); #if QELR_LDPM else if (dpm->is_ldpm) qelr_ldpm_prepare_data(qp, dpm); #endif FP_DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "SEND w/o IMM length = %d comp=%d\n", data_size, comp); break; case IBV_WR_RDMA_WRITE_WITH_IMM: if (unlikely(qelr_qp_is_iwarp(qp))) { rc = -EINVAL; DP_ERR(cxt->dbg_fp, "RDMA WRITE With Immediate is not supported over iWARP\n"); break; } wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_WR_WITH_IMM; wqe->wqe_size = sizeof(struct rdma_sq_rdma_wqe) / RDMA_WQE_BYTES; if (dpm->is_edpm) { qelr_edpm_set_rdma_ext(qp, dpm, wr->wr.rdma.remote_addr, wr->wr.rdma.rkey); qelr_edpm_set_inv_imm(qp, dpm, wr->imm_data); } qelr_prepare_sq_rdma_data(qp, dpm, data_size, (struct rdma_sq_rdma_wqe_1st *)wqe, (struct rdma_sq_rdma_wqe_2nd *)wqe2, wr, 1 /* Imm */); if (dpm->is_edpm) qelr_edpm_set_msg_data(dpm, QELR_IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE, data_size + sizeof(*dpm->rdma_ext), se, comp); #if QELR_LDPM else if (dpm->is_ldpm) qelr_ldpm_prepare_data(qp, dpm); #endif FP_DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "RDMA WRITE w/ IMM length = %d imm data=%x comp=%d\n", data_size, wr->imm_data, comp); 
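	/* RDMA WRITE without immediate: same WQE layout as the IMM variant
	 * above, minus the extra immediate dword in the EDPM payload. */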
break; case IBV_WR_RDMA_WRITE: wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_WR; wqe->wqe_size = sizeof(struct rdma_sq_rdma_wqe) / RDMA_WQE_BYTES; if (dpm->is_edpm) qelr_edpm_set_rdma_ext(qp, dpm, wr->wr.rdma.remote_addr, wr->wr.rdma.rkey); qelr_prepare_sq_rdma_data(qp, dpm, data_size, (struct rdma_sq_rdma_wqe_1st *)wqe, (struct rdma_sq_rdma_wqe_2nd *)wqe2, wr, 0); if (dpm->is_edpm) qelr_edpm_set_msg_data(dpm, QELR_IB_OPCODE_RDMA_WRITE_ONLY, data_size + sizeof(*dpm->rdma_ext), se, comp); #if QELR_LDPM else if (dpm->is_ldpm) qelr_ldpm_prepare_data(qp, dpm); #endif FP_DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "RDMA WRITE w/o IMM length = %d comp=%d\n", data_size, comp); break; case IBV_WR_RDMA_READ: wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_RD; wqe->wqe_size = sizeof(struct rdma_sq_rdma_wqe) / RDMA_WQE_BYTES; qelr_prepare_sq_rdma_data(qp, dpm, data_size, (struct rdma_sq_rdma_wqe_1st *)wqe, (struct rdma_sq_rdma_wqe_2nd *)wqe2, wr, 0); #if QELR_LDPM if (dpm->is_ldpm) qelr_ldpm_prepare_data(qp, dpm); #endif FP_DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "RDMA READ length = %d comp=%d\n", data_size, comp); break; case IBV_WR_ATOMIC_CMP_AND_SWP: case IBV_WR_ATOMIC_FETCH_AND_ADD: awqe2 = (struct rdma_sq_atomic_wqe_2nd *)wqe2; awqe3 = (struct rdma_sq_atomic_wqe_3rd *) qelr_chain_produce(&qp->sq.chain); /* The +1 is for the data segment where a copy of the original * contents of the remote memory operation will be deposited */ wqe->wqe_size = (sizeof(struct rdma_sq_atomic_wqe) / RDMA_WQE_BYTES) + 1; TYPEPTR_ADDR_SET(awqe2, remote_va, wr->wr.atomic.remote_addr); awqe2->r_key = htole32(wr->wr.atomic.rkey); if (wr->opcode == IBV_WR_ATOMIC_FETCH_AND_ADD) { wqe->req_type = RDMA_SQ_REQ_TYPE_ATOMIC_ADD; TYPEPTR_ADDR_SET(awqe3, swap_data, wr->wr.atomic.compare_add); } else { wqe->req_type = RDMA_SQ_REQ_TYPE_ATOMIC_CMP_AND_SWAP; TYPEPTR_ADDR_SET(awqe3, swap_data, wr->wr.atomic.swap); TYPEPTR_ADDR_SET(awqe3, cmp_data, wr->wr.atomic.compare_add); } qelr_prepare_sq_atom_data(qp, dpm, (struct rdma_sq_atomic_wqe_1st *)wqe, awqe2, awqe3, wr); #if QELR_LDPM if (dpm->is_ldpm) qelr_ldpm_prepare_data(qp, dpm); #endif /* To avoid compiler warning */ data_size = 0; FP_DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "ATOMIC\n"); break; default: rc = -EINVAL; DP_ERR(cxt->dbg_fp, "invalid opcode in work request\n"); break; } if (likely(!rc)) { qp->prev_wqe_size = wqe->wqe_size; qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe->wqe_size; qp->wqe_wr_id[qp->sq.prod].signaled = comp; qp->wqe_wr_id[qp->sq.prod].wr_id = wr->wr_id; qp->wqe_wr_id[qp->sq.prod].bytes_len = data_size; qp->wqe_wr_id[qp->sq.prod].opcode = qelr_ibv_to_wc_opcode(wr->opcode); qelr_inc_sw_prod_u16(&qp->sq); db_val = le16toh(qp->sq.db_data.data.value) + 1; qp->sq.db_data.data.value = htole16(db_val); } else { /* restore prod to its position before this WR was processed */ qelr_chain_set_prod(&qp->sq.chain, le16toh(qp->sq.db_data.data.value), wqe); } return rc; } int qelr_post_send(struct ibv_qp *ib_qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { struct qelr_devctx *cxt = get_qelr_ctx(ib_qp->context); struct qelr_qp *qp = get_qelr_qp(ib_qp); int doorbell_required = 0; uint32_t sq_elem_left; struct qelr_dpm dpm; *bad_wr = NULL; int rc = 0; pthread_spin_lock(&qp->q_lock); if (unlikely(IS_ROCE(ib_qp->context->device) && (qp->state != QELR_QPS_RTS && qp->state != QELR_QPS_ERR && qp->state != QELR_QPS_SQD))) { pthread_spin_unlock(&qp->q_lock); *bad_wr = wr; return -EINVAL; } while (wr) { int data_size = sge_data_len(wr->sg_list, wr->num_sge); sq_elem_left = 
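			       /* Sampled once per WR: used both for the PBL-full check in
				* qelr_can_post_send() and by qelr_init_dpm_info(), which allows
				* DPM only when the SQ ring is completely empty. */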
qelr_chain_get_elem_left_u32(&qp->sq.chain); rc = qelr_can_post_send(cxt, qp, wr, data_size, sq_elem_left); if (unlikely(rc)) { *bad_wr = wr; break; } qelr_init_dpm_info(qp, wr, &dpm, data_size, sq_elem_left); rc = __qelr_post_send(cxt, qp, wr, &dpm, data_size); if (unlikely(rc)) { *bad_wr = wr; break; } #if QELR_LDPM if (dpm.is_edpm || dpm.is_ldpm) #else if (dpm.is_edpm) #endif doorbell_dpm_qp(qp, &dpm); else doorbell_required |= 1; wr = wr->next; } if (doorbell_required) doorbell_qp(qp); pthread_spin_unlock(&qp->q_lock); return rc; } #if QELR_SRQ static uint32_t qelr_srq_elem_left(struct qelr_srq_hwq_info *hw_srq) { uint32_t used; /* Calculate number of elements used based on producer * count and consumer count and subtract it from max * work request supported so that we get elements left. */ used = hw_srq->wr_prod_cnt - hw_srq->wr_cons_cnt; return hw_srq->max_wr - used; } int qelr_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { struct qelr_devctx *cxt = get_qelr_ctx(ibsrq->context); struct qelr_srq *srq = get_qelr_srq(ibsrq); struct qelr_srq_hwq_info *hw_srq = &srq->hw_srq; struct qelr_chain *chain; int status = 0; uint32_t offset; pthread_spin_lock(&srq->lock); chain = &srq->hw_srq.chain; while (wr) { struct rdma_srq_wqe_header *hdr; int i; if (!qelr_srq_elem_left(hw_srq) || wr->num_sge > srq->hw_srq.max_sges) { DP_ERR(cxt->dbg_fp, "Can't post WR (%d,%d) || (%d > %d)\n", hw_srq->wr_prod_cnt, hw_srq->wr_cons_cnt, wr->num_sge, srq->hw_srq.max_sges); status = -ENOMEM; *bad_wr = wr; break; } hdr = qelr_chain_produce(chain); /* Set number of sge and work request id in header */ SRQ_HDR_SET(hdr, wr->wr_id, wr->num_sge); /* PBL is maintained in case of WR granularity. * So increment WR producer in case we post a WR. */ qelr_inc_srq_wr_prod(hw_srq); hw_srq->wqe_prod++; hw_srq->sge_prod++; DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ, "SRQ WR: SGEs: %d with wr_id[%d] = %lx\n", wr->num_sge, hw_srq->wqe_prod, wr->wr_id); for (i = 0; i < wr->num_sge; i++) { struct rdma_srq_sge *srq_sge; srq_sge = qelr_chain_produce(chain); /* Set SGE length, lkey and address */ SRQ_SGE_SET(srq_sge, wr->sg_list[i].addr, wr->sg_list[i].length, wr->sg_list[i].lkey); DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ, "[%d]: len %d key %x addr %x:%x\n", i, srq_sge->length, srq_sge->l_key, srq_sge->addr.hi, srq_sge->addr.lo); hw_srq->sge_prod++; } /* Flush WQE and SGE information before updating producer */ mmio_wc_start(); /* SRQ producer is 8 bytes. Need to update SGE producer index * in first 4 bytes and need to update WQE producer in * next 4 bytes. */ *(srq->hw_srq.virt_prod_pair_addr) = hw_srq->sge_prod; offset = offsetof(struct rdma_srq_producers, wqe_prod); *((uint8_t *)srq->hw_srq.virt_prod_pair_addr + offset) = hw_srq->wqe_prod; /* Flush producer after updating it. */ mmio_flush_writes(); wr = wr->next; } DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ, "POST: Elements in SRQ: %d\n", qelr_chain_get_elem_left_u32(chain)); pthread_spin_unlock(&srq->lock); return status; } #endif static inline void __qelr_post_recv(struct qelr_qp *qp, struct ibv_recv_wr *wr) { struct rdma_rq_sge *rqe; uint32_t flags; int i; /* first rqe must include the number of SGEs in the list */ flags = wr->num_sge << RDMA_RQ_SGE_NUM_SGES_SHIFT; for (i = 0; i < wr->num_sge; i++) { flags |= wr->sg_list[i].lkey << RDMA_RQ_SGE_L_KEY_SHIFT; rqe = qelr_chain_produce(&qp->rq.chain); RQ_SGE_SET(rqe, wr->sg_list[i].addr, wr->sg_list[i].length, flags); flags = 0; } /* Special case of no sges. FW requires between 1-4 sges... 
* In this case we need to post 1 sge with length zero. this is * because rdma write with immediate consumes an RQ. */ if (unlikely(!wr->num_sge)) { flags = 1 << RDMA_RQ_SGE_NUM_SGES_SHIFT; rqe = qelr_chain_produce(&qp->rq.chain); RQ_SGE_SET(rqe, 0, 0, flags); i = 1; } qp->rqe_wr_id[qp->rq.prod].wr_id = wr->wr_id; qp->rqe_wr_id[qp->rq.prod].wqe_size = i; qelr_inc_sw_prod_u16(&qp->rq); } int qelr_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { struct qelr_devctx *cxt = get_qelr_ctx(ibqp->context); struct qelr_qp *qp = get_qelr_qp(ibqp); uint32_t n_rqe = 0; int rc = 0; #if QELR_SRQ if (unlikely(qelr_qp_has_srq(qp))) { DP_ERR(cxt->dbg_fp, "QP is associated with SRQ, cannot post RQ buffers\n"); *bad_wr = wr; return -EINVAL; } #endif pthread_spin_lock(&qp->q_lock); if (unlikely(!qelr_qp_is_iwarp(qp) && qp->state == QELR_QPS_RST)) { rc = -EINVAL; *bad_wr = wr; goto out; } while (wr) { if (unlikely((qelr_chain_get_elem_left_u32(&qp->rq.chain) < QELR_MAX_RQ_WQE_SIZE))) { DP_ERR(cxt->dbg_fp, "post_recv failed. RQ has only %d elements left\n", qelr_chain_get_elem_left_u32(&qp->rq.chain)); rc = -ENOMEM; *bad_wr = wr; goto out; } if (unlikely(wr->num_sge > qp->rq.max_sges)) { DP_ERR(cxt->dbg_fp, "post recv failed, max_sge is %d but WR contains %d sges\n", qp->rq.max_sges, wr->num_sge); rc = -EINVAL; *bad_wr = wr; goto out; } __qelr_post_recv(qp, wr); n_rqe++; wr = wr->next; } out: if (likely(n_rqe)) { uint16_t db_val; /* make sure data is prepared beyond this point */ mmio_wc_start(); /* prepare db value */ db_val = le16toh(qp->rq.db_data.data.value) + n_rqe; qp->rq.db_data.data.value = htole16(db_val); /* write value to device */ writel(qp->rq.db_data.raw, qp->rq.db); #if QELR_DB_RECOVERY /* copy value to doorbell recovery mechanism */ qp->rq.db_rec_addr->db_data = qp->rq.db_data.raw; #endif mmio_flush_writes(); if (qelr_qp_is_iwarp(qp)) { writel(qp->rq.iwarp_db2_data.raw, qp->rq.iwarp_db2); mmio_flush_writes(); } } pthread_spin_unlock(&qp->q_lock); return rc; } static int is_valid_cqe(struct qelr_cq *cq, union rdma_cqe *cqe) { struct rdma_cqe_requester *resp_cqe = &cqe->req; return (resp_cqe->flags & RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK) == cq->chain_toggle; } static enum rdma_cqe_type cqe_get_type(union rdma_cqe *cqe) { struct rdma_cqe_requester *resp_cqe = &cqe->req; return GET_FIELD(resp_cqe->flags, RDMA_CQE_REQUESTER_TYPE); } static struct qelr_qp *cqe_get_qp(union rdma_cqe *cqe) { struct regpair *qph = &cqe->req.qp_handle; return (struct qelr_qp *)HILO_U64(le32toh(qph->hi), le32toh(qph->lo)); } static int process_req(struct qelr_qp *qp, struct qelr_cq *cq, int num_entries, struct ibv_wc *wc, uint16_t hw_cons, enum ibv_wc_status status, int force) { struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp.context); uint16_t cnt = 0; while (num_entries && qp->sq.wqe_cons != hw_cons) { if (!qp->wqe_wr_id[qp->sq.cons].signaled && !force) { DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "SKIPPING WC num_entries=%d qp->sq.wqe_cons=%d, hw_cons=%d\n", num_entries, qp->sq.wqe_cons, hw_cons); /* skip WC */ goto next_cqe; } /* fill WC */ wc->status = status; wc->wc_flags = 0; wc->qp_num = qp->qp_id; /* common section */ wc->wr_id = qp->wqe_wr_id[qp->sq.cons].wr_id; wc->opcode = qp->wqe_wr_id[qp->sq.cons].opcode; switch (wc->opcode) { case IBV_WC_RDMA_WRITE: wc->byte_len = qp->wqe_wr_id[qp->sq.cons].bytes_len; DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "POLL REQ CQ: IBV_WC_RDMA_WRITE byte_len=%d num_entries=%d qp->sq.wqe_cons=%d, hw_cons=%d\n", qp->wqe_wr_id[qp->sq.cons].bytes_len, num_entries, 
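			   /* Atomic completions always carry an 8-byte payload (the original
			    * value read from the remote address), hence byte_len is set to 8
			    * for IBV_WC_COMP_SWAP / IBV_WC_FETCH_ADD below. */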
qp->sq.wqe_cons, hw_cons); break; case IBV_WC_COMP_SWAP: case IBV_WC_FETCH_ADD: wc->byte_len = 8; break; case IBV_WC_RDMA_READ: case IBV_WC_SEND: case IBV_WC_BIND_MW: wc->byte_len = qp->wqe_wr_id[qp->sq.cons].bytes_len; DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "POLL REQ CQ: IBV_WC_RDMA_READ / IBV_WC_SEND num_entries=%d qp->sq.wqe_cons=%d, hw_cons=%d\n", num_entries, qp->sq.wqe_cons, hw_cons); break; default: break; } num_entries--; wc++; cnt++; next_cqe: while (qp->wqe_wr_id[qp->sq.cons].wqe_size--) qelr_chain_consume(&qp->sq.chain); qelr_inc_sw_cons_u16(&qp->sq); } return cnt; } static int qelr_poll_cq_req(struct qelr_qp *qp, struct qelr_cq *cq, int num_entries, struct ibv_wc *wc, struct rdma_cqe_requester *req) { struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp.context); uint16_t sq_cons = le16toh(req->sq_cons); int cnt = 0; switch (req->status) { case RDMA_CQE_REQ_STS_OK: cnt = process_req(qp, cq, num_entries, wc, sq_cons, IBV_WC_SUCCESS, 0); break; case RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR: DP_ERR(cxt->dbg_fp, "Error: POLL CQ with ROCE_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR. QP icid=0x%x\n", qp->sq.icid); cnt = process_req(qp, cq, num_entries, wc, sq_cons, IBV_WC_WR_FLUSH_ERR, 0); break; default: /* other errors case */ /* process all WQE before the consumer */ qp->state = QELR_QPS_ERR; cnt = process_req(qp, cq, num_entries, wc, sq_cons - 1, IBV_WC_SUCCESS, 0); wc += cnt; /* if we have extra WC fill it with actual error info */ if (cnt < num_entries) { enum ibv_wc_status wc_status; switch (req->status) { case RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR: DP_ERR(cxt->dbg_fp, "Error: POLL CQ with RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_BAD_RESP_ERR; break; case RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR: DP_ERR(cxt->dbg_fp, "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_LOC_LEN_ERR; break; case RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR: DP_ERR(cxt->dbg_fp, "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_LOC_QP_OP_ERR; break; case RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR: DP_ERR(cxt->dbg_fp, "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_LOC_PROT_ERR; break; case RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR: DP_ERR(cxt->dbg_fp, "Error: POLL CQ with RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_MW_BIND_ERR; break; case RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR: DP_ERR(cxt->dbg_fp, "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_REM_INV_REQ_ERR; break; case RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR: DP_ERR(cxt->dbg_fp, "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_REM_ACCESS_ERR; break; case RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR: DP_ERR(cxt->dbg_fp, "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_REM_OP_ERR; break; case RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR: DP_ERR(cxt->dbg_fp, "Error: POLL CQ with RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_RNR_RETRY_EXC_ERR; break; case RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR: DP_ERR(cxt->dbg_fp, "RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_RETRY_EXC_ERR; break; default: DP_ERR(cxt->dbg_fp, "IBV_WC_GENERAL_ERR. 
static void __process_resp_one(struct qelr_qp *qp, struct qelr_cq *cq,
			       struct ibv_wc *wc,
			       struct rdma_cqe_responder *resp, uint64_t wr_id)
{
	struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp.context);
	enum ibv_wc_status wc_status = IBV_WC_SUCCESS;
	uint8_t flags;

	wc->opcode = IBV_WC_RECV;
	wc->wc_flags = 0;

	FP_DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "\n");

	switch (resp->status) {
	case RDMA_CQE_RESP_STS_LOCAL_ACCESS_ERR:
		wc_status = IBV_WC_LOC_ACCESS_ERR;
		break;
	case RDMA_CQE_RESP_STS_LOCAL_LENGTH_ERR:
		wc_status = IBV_WC_LOC_LEN_ERR;
		break;
	case RDMA_CQE_RESP_STS_LOCAL_QP_OPERATION_ERR:
		wc_status = IBV_WC_LOC_QP_OP_ERR;
		break;
	case RDMA_CQE_RESP_STS_LOCAL_PROTECTION_ERR:
		wc_status = IBV_WC_LOC_PROT_ERR;
		break;
	case RDMA_CQE_RESP_STS_MEMORY_MGT_OPERATION_ERR:
		wc_status = IBV_WC_MW_BIND_ERR;
		break;
	case RDMA_CQE_RESP_STS_REMOTE_INVALID_REQUEST_ERR:
		wc_status = IBV_WC_REM_INV_RD_REQ_ERR;
		break;
	case RDMA_CQE_RESP_STS_OK:
		wc_status = IBV_WC_SUCCESS;
		wc->byte_len = le32toh(resp->length);

		flags = resp->flags & QELR_RESP_RDMA_IMM;

		switch (flags) {
		case QELR_RESP_RDMA_IMM:
			/* update opcode */
			wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
			SWITCH_FALLTHROUGH;
		case QELR_RESP_IMM:
			wc->imm_data =
				htobe32(le32toh(resp->imm_data_or_inv_r_Key));
			wc->wc_flags |= IBV_WC_WITH_IMM;
			FP_DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ,
				      "POLL CQ RQ2: RESP_RDMA_IMM imm_data = %x resp_len=%d\n",
				      wc->imm_data, wc->byte_len);
			break;
		case QELR_RESP_RDMA:
			DP_ERR(cxt->dbg_fp, "Invalid flags detected\n");
			break;
		default:
			/* valid configuration, but nothing to do here */
			break;
		}

		wc->wr_id = wr_id;
		break;
	default:
		/* set wc_status (not wc->status) so the general error is not
		 * overwritten by the common fill below
		 */
		wc_status = IBV_WC_GENERAL_ERR;
		DP_ERR(cxt->dbg_fp, "Invalid CQE status detected\n");
	}

	/* fill WC */
	wc->status = wc_status;
	wc->qp_num = qp->qp_id;
}
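/*
 * Illustrative only, not part of the driver: the responder path above stores
 * immediate data in wc->imm_data in big-endian (network) order, so a consumer
 * converts it back to host order before use.  The variable names below are
 * assumptions.
 *
 *	if (wc.wc_flags & IBV_WC_WITH_IMM) {
 *		uint32_t imm = be32toh(wc.imm_data);
 *
 *		printf("received immediate 0x%x\n", imm);
 *	}
 */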
#if QELR_SRQ
static int process_resp_one_srq(struct qelr_qp *qp, struct qelr_cq *cq,
				struct ibv_wc *wc,
				struct rdma_cqe_responder *resp)
{
	struct qelr_srq_hwq_info *hw_srq = &qp->srq->hw_srq;
	uint64_t wr_id;

	wr_id = ((uint64_t)resp->srq_wr_id.lo & 0xffffffff) |
		((uint64_t)resp->srq_wr_id.hi << 32);

	if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) {
		wc->byte_len = 0;
		wc->status = IBV_WC_WR_FLUSH_ERR;
		wc->qp_num = qp->qp_id;
		wc->wr_id = wr_id;
	} else {
		__process_resp_one(qp, cq, wc, resp, wr_id);
	}

	/* The PBL is maintained at WR granularity, so increment the WR
	 * consumer after consuming the WR.
	 */
	qelr_inc_srq_wr_cons(hw_srq);

	return 1;
}
#endif

static int process_resp_one(struct qelr_qp *qp, struct qelr_cq *cq,
			    struct ibv_wc *wc, struct rdma_cqe_responder *resp)
{
	uint64_t wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id;

	__process_resp_one(qp, cq, wc, resp, wr_id);

	while (qp->rqe_wr_id[qp->rq.cons].wqe_size--)
		qelr_chain_consume(&qp->rq.chain);

	qelr_inc_sw_cons_u16(&qp->rq);

	return 1;
}

static int process_resp_flush(struct qelr_qp *qp, struct qelr_cq *cq,
			      int num_entries, struct ibv_wc *wc,
			      uint16_t hw_cons)
{
	uint16_t cnt = 0;

	while (num_entries && qp->rq.wqe_cons != hw_cons) {
		/* fill WC */
		wc->status = IBV_WC_WR_FLUSH_ERR;
		wc->qp_num = qp->qp_id;
		wc->byte_len = 0;
		wc->wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id;
		num_entries--;
		wc++;
		cnt++;

		while (qp->rqe_wr_id[qp->rq.cons].wqe_size--)
			qelr_chain_consume(&qp->rq.chain);

		qelr_inc_sw_cons_u16(&qp->rq);
	}

	return cnt;
}

/* return latest CQE (needs processing) */
static union rdma_cqe *get_cqe(struct qelr_cq *cq)
{
	return cq->latest_cqe;
}

static void try_consume_req_cqe(struct qelr_cq *cq, struct qelr_qp *qp,
				struct rdma_cqe_requester *req, int *update)
{
	uint16_t sq_cons = le16toh(req->sq_cons);

	if (sq_cons == qp->sq.wqe_cons) {
		consume_cqe(cq);
		*update |= 1;
	}
}

/* used with flush only, when resp->rq_cons is valid */
static void try_consume_resp_cqe(struct qelr_cq *cq, struct qelr_qp *qp,
				 uint16_t rq_cons, int *update)
{
	if (rq_cons == qp->rq.wqe_cons) {
		consume_cqe(cq);
		*update |= 1;
	}
}

#if QELR_SRQ
static int qelr_poll_cq_resp_srq(struct qelr_qp *qp, struct qelr_cq *cq,
				 int num_entries, struct ibv_wc *wc,
				 struct rdma_cqe_responder *resp, int *update)
{
	int cnt;

	cnt = process_resp_one_srq(qp, cq, wc, resp);
	consume_cqe(cq);
	*update |= 1;

	return cnt;
}
#endif

static int qelr_poll_cq_resp(struct qelr_qp *qp, struct qelr_cq *cq,
			     int num_entries, struct ibv_wc *wc,
			     struct rdma_cqe_responder *resp, int *update)
{
	uint16_t rq_cons = le16toh(resp->rq_cons_or_srq_id);
	int cnt;

	if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) {
		cnt = process_resp_flush(qp, cq, num_entries, wc, rq_cons);
		try_consume_resp_cqe(cq, qp, rq_cons, update);
	} else {
		cnt = process_resp_one(qp, cq, wc, resp);
		consume_cqe(cq);
		*update |= 1;
	}

	return cnt;
}

static void doorbell_cq(struct qelr_cq *cq, uint32_t cons, uint8_t flags)
{
	/* barrier to ensure the doorbell data is prepared */
	mmio_wc_start();

	/* prepare doorbell value and flags */
	cq->db.data.agg_flags = flags;
	cq->db.data.value = htole32(cons);

	/* write value to device */
	writeq(cq->db.raw, cq->db_addr);
#if QELR_DB_RECOVERY
	/* copy value to doorbell recovery mechanism */
	cq->db_rec_addr->db_data = cq->db.raw;
#endif
	/* flush the write-combining buffer */
	mmio_flush_writes();
}
CQE=%p\n", cqe); break; } switch (cqe_get_type(cqe)) { case RDMA_CQE_TYPE_REQUESTER: cnt = qelr_poll_cq_req(qp, cq, num_entries, wc, &cqe->req); try_consume_req_cqe(cq, qp, &cqe->req, &update); break; case RDMA_CQE_TYPE_RESPONDER_RQ: cnt = qelr_poll_cq_resp(qp, cq, num_entries, wc, &cqe->resp, &update); break; #if QELR_SRQ case RDMA_CQE_TYPE_RESPONDER_SRQ: cnt = qelr_poll_cq_resp_srq(qp, cq, num_entries, wc, &cqe->resp, &update); #endif break; case RDMA_CQE_TYPE_INVALID: default: printf("Error: invalid CQE type = %d\n", cqe_get_type(cqe)); } num_entries -= cnt; wc += cnt; done += cnt; cqe = get_cqe(cq); } db_cons = qelr_chain_get_cons_idx_u32(&cq->chain) - 1; if (update) { /* doorbell notifies about latest VALID entry, * but chain already point to the next INVALID one */ doorbell_cq(cq, db_cons, cq->arm_flags); FP_DP_VERBOSE(stderr, QELR_MSG_CQ, "doorbell_cq cons=%x\n", db_cons); } return done; } void qelr_cq_event(struct ibv_cq *ibcq) { /* Trigger received, can reset arm flags */ struct qelr_cq *cq = get_qelr_cq(ibcq); cq->arm_flags = 0; } int qelr_arm_cq(struct ibv_cq *ibcq, int solicited) { struct qelr_cq *cq = get_qelr_cq(ibcq); uint32_t db_cons; //pthread_spin_lock(&cq->cq_lock); db_cons = qelr_chain_get_cons_idx_u32(&cq->chain) - 1; FP_DP_VERBOSE(get_qelr_ctx(ibcq->context)->dbg_fp, QELR_MSG_CQ, "Arm CQ cons=%x solicited=%d\n", db_cons, solicited); cq->arm_flags = solicited ? DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD : DQ_UCM_ROCE_CQ_ARM_CF_CMD; doorbell_cq(cq, db_cons, cq->arm_flags); //pthread_spin_unlock(&cq->cq_lock); return 0; } void qelr_async_event(struct ibv_async_event *event) { struct qelr_cq *cq = NULL; struct qelr_qp *qp = NULL; switch (event->event_type) { case IBV_EVENT_CQ_ERR: cq = get_qelr_cq(event->element.cq); break; case IBV_EVENT_QP_FATAL: case IBV_EVENT_QP_REQ_ERR: case IBV_EVENT_QP_ACCESS_ERR: case IBV_EVENT_PATH_MIG_ERR:{ qp = get_qelr_qp(event->element.qp); break; } case IBV_EVENT_SQ_DRAINED: case IBV_EVENT_PATH_MIG: case IBV_EVENT_COMM_EST: case IBV_EVENT_QP_LAST_WQE_REACHED: break; case IBV_EVENT_PORT_ACTIVE: case IBV_EVENT_PORT_ERR: break; default: break; } fprintf(stderr, "qelr_async_event not implemented yet cq=%p qp=%p\n", cq, qp); }