1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Work Requests exploiting Infiniband API
6 *
7 * Work requests (WR) of type ib_post_send or ib_post_recv respectively
8 * are submitted to either RC SQ or RC RQ respectively
9 * (reliably connected send/receive queue)
10 * and become work queue entries (WQEs).
11 * While an SQ WR/WQE is pending, we track it until transmission completion.
12 * Through a send or receive completion queue (CQ) respectively,
13 * we get completion queue entries (CQEs) [aka work completions (WCs)].
14 * Since the CQ callback is called from IRQ context, we split work by using
15 * bottom halves implemented by tasklets.
16 *
17 * SMC uses this to exchange LLC (link layer control)
18 * and CDC (connection data control) messages.
19 *
20 * Copyright IBM Corp. 2016
21 *
22 * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
23 */
24
25 #include <linux/atomic.h>
26 #include <linux/hashtable.h>
27 #include <linux/wait.h>
28 #include <rdma/ib_verbs.h>
29 #include <asm/div64.h>
30
31 #include "smc.h"
32 #include "smc_wr.h"
33
/* Size of the on-stack struct ib_wc array that the send and receive tasklets
 * hand to ib_poll_cq() per call.
 */
#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */

/* Receive handlers are kept in a small hashtable keyed by message type
 * (handler->type); registration is serialized by smc_wr_rx_hash_lock.
 */
#define SMC_WR_RX_HASH_BITS 4
static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
39
struct smc_wr_tx_pend { /* control data for a pending send request */
	u64 wr_id; /* work request id sent; matched against wc->wr_id */
	smc_wr_tx_handler handler; /* called on completion, may be NULL */
	enum ib_wc_status wc_status; /* CQE status */
	struct smc_link *link; /* link the request was prepared for */
	u32 idx; /* tx slot index; link->wr_tx_cnt for the v2 slot */
	struct smc_wr_tx_pend_priv priv; /* handed out to the slot user */
	u8 compl_requested; /* set when a sender waits on wr_tx_compl[idx] */
};
49
50 /******************************** send queue *********************************/
51
52 /*------------------------------- completion --------------------------------*/
53
54 /* returns true if at least one tx work request is pending on the given link */
smc_wr_is_tx_pend(struct smc_link * link)55 static inline bool smc_wr_is_tx_pend(struct smc_link *link)
56 {
57 if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) !=
58 link->wr_tx_cnt) {
59 return true;
60 }
61 return false;
62 }
63
/* Sleep (uninterruptibly, without timeout) on link->wr_tx_wait until
 * smc_wr_is_tx_pend() reports that no tx work request is pending on @link.
 */
void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
{
	wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
}
69
/* Look up the tx slot whose pending entry carries @wr_id.
 * Returns the slot index, or link->wr_tx_cnt if no slot matches.
 */
static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
{
	u32 slot = 0;

	while (slot < link->wr_tx_cnt) {
		if (link->wr_tx_pends[slot].wr_id == wr_id)
			return slot;
		slot++;
	}
	/* not found */
	return link->wr_tx_cnt;
}
80
/* Handle one send completion queue entry.
 *
 * Memory registration completions only update link->wr_reg_state and wake
 * the registration waiter. For all other completions the matching pending
 * entry (regular slot or the single v2 slot) is copied to the stack, the
 * slot is cleared and released, and then the stored completion handler is
 * invoked with the copied private data. A non-zero wc->status additionally
 * schedules link termination.
 */
static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
{
	struct smc_wr_tx_pend pnd_snd;
	struct smc_link *link;
	u32 pnd_snd_idx;

	link = wc->qp->qp_context;

	if (wc->opcode == IB_WC_REG_MR) {
		if (wc->status)
			link->wr_reg_state = FAILED;
		else
			link->wr_reg_state = CONFIRMED;
		smc_wr_wakeup_reg_wait(link);
		return;
	}

	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
	if (pnd_snd_idx == link->wr_tx_cnt) {
		/* no regular slot matches - may still be the v2 slot */
		if (link->lgr->smc_version != SMC_V2 ||
		    link->wr_tx_v2_pend->wr_id != wc->wr_id)
			return;
		link->wr_tx_v2_pend->wc_status = wc->status;
		/* keep a copy for the handler call below */
		memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(link->wr_tx_v2_pend, 0,
		       sizeof(*link->wr_tx_v2_pend));
		memset(link->lgr->wr_tx_buf_v2, 0,
		       sizeof(*link->lgr->wr_tx_buf_v2));
	} else {
		link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
		/* a sender in smc_wr_tx_send_wait() waits on this completion */
		if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
			complete(&link->wr_tx_compl[pnd_snd_idx]);
		/* keep a copy for the handler call below */
		memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
		       sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(&link->wr_tx_pends[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_pends[pnd_snd_idx]));
		memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
		/* slot bit already clear: skip handler and wake-up */
		if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
			return;
	}

	if (wc->status) {
		if (link->lgr->smc_version == SMC_V2) {
			memset(link->wr_tx_v2_pend, 0,
			       sizeof(*link->wr_tx_v2_pend));
			memset(link->lgr->wr_tx_buf_v2, 0,
			       sizeof(*link->lgr->wr_tx_buf_v2));
		}
		/* terminate link */
		smcr_link_down_cond_sched(link);
	}
	if (pnd_snd.handler)
		pnd_snd.handler(&pnd_snd.priv, link, wc->status);
	/* a slot became free - let waiters on wr_tx_wait re-check */
	wake_up(&link->wr_tx_wait);
}
139
smc_wr_tx_tasklet_fn(struct tasklet_struct * t)140 static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
141 {
142 struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
143 struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
144 int i = 0, rc;
145 int polled = 0;
146
147 again:
148 polled++;
149 do {
150 memset(&wc, 0, sizeof(wc));
151 rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
152 if (polled == 1) {
153 ib_req_notify_cq(dev->roce_cq_send,
154 IB_CQ_NEXT_COMP |
155 IB_CQ_REPORT_MISSED_EVENTS);
156 }
157 if (!rc)
158 break;
159 for (i = 0; i < rc; i++)
160 smc_wr_tx_process_cqe(&wc[i]);
161 } while (rc > 0);
162 if (polled == 1)
163 goto again;
164 }
165
smc_wr_tx_cq_handler(struct ib_cq * ib_cq,void * cq_context)166 void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
167 {
168 struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
169
170 tasklet_schedule(&dev->send_tasklet);
171 }
172
173 /*---------------------------- request submission ---------------------------*/
174
/* Try once to reserve a free tx slot on @link.
 *
 * On success the slot's bit in wr_tx_mask is set, *@idx holds the slot index
 * and 0 is returned. Otherwise *@idx is link->wr_tx_cnt and the return value
 * is -ENOLINK (link not sendable) or -EBUSY (all slots in use).
 */
static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
{
	*idx = link->wr_tx_cnt;
	if (!smc_link_sendable(link))
		return -ENOLINK;
	for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
		/* test_and_set_bit() guards against a concurrent claimer */
		if (!test_and_set_bit(*idx, link->wr_tx_mask))
			return 0;
	}
	*idx = link->wr_tx_cnt;
	return -EBUSY;
}
187
/**
 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
 *			and sets info for pending transmit tracking
 * @link:		Pointer to smc_link used to later send the message.
 * @handler:		Send completion handler function pointer.
 * @wr_buf:		Out value returns pointer to message buffer.
 * @wr_rdma_buf:	Out value returns pointer to rdma work request.
 *			May be NULL if the caller does not need it.
 * @wr_pend_priv:	Out value returns pointer serving as handler context.
 *
 * In softirq context, or while the link group is terminating, a single
 * non-blocking attempt is made. Otherwise the caller sleeps interruptibly
 * for up to SMC_WR_TX_WAIT_FREE_SLOT_TIME until a slot becomes available.
 * A timeout schedules link termination.
 *
 * Return: 0 on success, or -errno on error. @wr_buf and @wr_pend_priv are
 * set to NULL on error.
 */
int smc_wr_tx_get_free_slot(struct smc_link *link,
			    smc_wr_tx_handler handler,
			    struct smc_wr_buf **wr_buf,
			    struct smc_rdma_wr **wr_rdma_buf,
			    struct smc_wr_tx_pend_priv **wr_pend_priv)
{
	struct smc_link_group *lgr = smc_get_lgr(link);
	struct smc_wr_tx_pend *wr_pend;
	u32 idx = link->wr_tx_cnt;
	struct ib_send_wr *wr_ib;
	u64 wr_id;
	int rc;

	*wr_buf = NULL;
	*wr_pend_priv = NULL;
	if (in_softirq() || lgr->terminating) {
		/* must not sleep here: one attempt only */
		rc = smc_wr_tx_get_free_slot_index(link, &idx);
		if (rc)
			return rc;
	} else {
		/* the last condition term reserves the slot as a side effect */
		rc = wait_event_interruptible_timeout(
			link->wr_tx_wait,
			!smc_link_sendable(link) ||
			lgr->terminating ||
			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
		if (!rc) {
			/* timeout - terminate link */
			smcr_link_down_cond_sched(link);
			return -EPIPE;
		}
		/* woken up without having obtained a slot */
		if (idx == link->wr_tx_cnt)
			return -EPIPE;
	}
	/* record tracking data for the completion handling */
	wr_id = smc_wr_tx_get_next_wr_id(link);
	wr_pend = &link->wr_tx_pends[idx];
	wr_pend->wr_id = wr_id;
	wr_pend->handler = handler;
	wr_pend->link = link;
	wr_pend->idx = idx;
	wr_ib = &link->wr_tx_ibs[idx];
	wr_ib->wr_id = wr_id;
	*wr_buf = &link->wr_tx_bufs[idx];
	if (wr_rdma_buf)
		*wr_rdma_buf = &link->wr_tx_rdmas[idx];
	*wr_pend_priv = &wr_pend->priv;
	return 0;
}
247
/* Reserve the single large SMC-Rv2 tx buffer of @link.
 *
 * The v2 pending entry counts as taken while its idx equals link->wr_tx_cnt;
 * in that case -EBUSY is returned and the out parameters are left untouched.
 * On success @wr_buf points to the link group's v2 tx buffer, @wr_pend_priv
 * to the handler context, and 0 is returned.
 */
int smc_wr_tx_get_v2_slot(struct smc_link *link,
			  smc_wr_tx_handler handler,
			  struct smc_wr_v2_buf **wr_buf,
			  struct smc_wr_tx_pend_priv **wr_pend_priv)
{
	struct smc_wr_tx_pend *v2_pend = link->wr_tx_v2_pend;
	u64 id;

	if (v2_pend->idx == link->wr_tx_cnt)
		return -EBUSY;

	*wr_buf = NULL;
	*wr_pend_priv = NULL;

	id = smc_wr_tx_get_next_wr_id(link);
	v2_pend->wr_id = id;
	v2_pend->handler = handler;
	v2_pend->link = link;
	/* idx == wr_tx_cnt identifies the v2 slot */
	v2_pend->idx = link->wr_tx_cnt;
	link->wr_tx_v2_ib->wr_id = id;

	*wr_buf = link->lgr->wr_tx_buf_v2;
	*wr_pend_priv = &v2_pend->priv;
	return 0;
}
274
smc_wr_tx_put_slot(struct smc_link * link,struct smc_wr_tx_pend_priv * wr_pend_priv)275 int smc_wr_tx_put_slot(struct smc_link *link,
276 struct smc_wr_tx_pend_priv *wr_pend_priv)
277 {
278 struct smc_wr_tx_pend *pend;
279
280 pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
281 if (pend->idx < link->wr_tx_cnt) {
282 u32 idx = pend->idx;
283
284 /* clear the full struct smc_wr_tx_pend including .priv */
285 memset(&link->wr_tx_pends[idx], 0,
286 sizeof(link->wr_tx_pends[idx]));
287 memset(&link->wr_tx_bufs[idx], 0,
288 sizeof(link->wr_tx_bufs[idx]));
289 test_and_clear_bit(idx, link->wr_tx_mask);
290 wake_up(&link->wr_tx_wait);
291 return 1;
292 } else if (link->lgr->smc_version == SMC_V2 &&
293 pend->idx == link->wr_tx_cnt) {
294 /* Large v2 buffer */
295 memset(&link->wr_tx_v2_pend, 0,
296 sizeof(link->wr_tx_v2_pend));
297 memset(&link->lgr->wr_tx_buf_v2, 0,
298 sizeof(link->lgr->wr_tx_buf_v2));
299 return 1;
300 }
301
302 return 0;
303 }
304
305 /* Send prepared WR slot via ib_post_send.
306 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
307 */
smc_wr_tx_send(struct smc_link * link,struct smc_wr_tx_pend_priv * priv)308 int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
309 {
310 struct smc_wr_tx_pend *pend;
311 int rc;
312
313 ib_req_notify_cq(link->smcibdev->roce_cq_send,
314 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
315 pend = container_of(priv, struct smc_wr_tx_pend, priv);
316 rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
317 if (rc) {
318 smc_wr_tx_put_slot(link, priv);
319 smcr_link_down_cond_sched(link);
320 }
321 return rc;
322 }
323
smc_wr_tx_v2_send(struct smc_link * link,struct smc_wr_tx_pend_priv * priv,int len)324 int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
325 int len)
326 {
327 int rc;
328
329 link->wr_tx_v2_ib->sg_list[0].length = len;
330 ib_req_notify_cq(link->smcibdev->roce_cq_send,
331 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
332 rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
333 if (rc) {
334 smc_wr_tx_put_slot(link, priv);
335 smcr_link_down_cond_sched(link);
336 }
337 return rc;
338 }
339
340 /* Send prepared WR slot via ib_post_send and wait for send completion
341 * notification.
342 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
343 */
smc_wr_tx_send_wait(struct smc_link * link,struct smc_wr_tx_pend_priv * priv,unsigned long timeout)344 int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
345 unsigned long timeout)
346 {
347 struct smc_wr_tx_pend *pend;
348 u32 pnd_idx;
349 int rc;
350
351 pend = container_of(priv, struct smc_wr_tx_pend, priv);
352 pend->compl_requested = 1;
353 pnd_idx = pend->idx;
354 init_completion(&link->wr_tx_compl[pnd_idx]);
355
356 rc = smc_wr_tx_send(link, priv);
357 if (rc)
358 return rc;
359 /* wait for completion by smc_wr_tx_process_cqe() */
360 rc = wait_for_completion_interruptible_timeout(
361 &link->wr_tx_compl[pnd_idx], timeout);
362 if (rc <= 0)
363 rc = -ENODATA;
364 if (rc > 0)
365 rc = 0;
366 return rc;
367 }
368
369 /* Register a memory region and wait for result. */
smc_wr_reg_send(struct smc_link * link,struct ib_mr * mr)370 int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
371 {
372 int rc;
373
374 ib_req_notify_cq(link->smcibdev->roce_cq_send,
375 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
376 link->wr_reg_state = POSTED;
377 link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
378 link->wr_reg.mr = mr;
379 link->wr_reg.key = mr->rkey;
380 rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
381 if (rc)
382 return rc;
383
384 atomic_inc(&link->wr_reg_refcnt);
385 rc = wait_event_interruptible_timeout(link->wr_reg_wait,
386 (link->wr_reg_state != POSTED),
387 SMC_WR_REG_MR_WAIT_TIME);
388 if (atomic_dec_and_test(&link->wr_reg_refcnt))
389 wake_up_all(&link->wr_reg_wait);
390 if (!rc) {
391 /* timeout - terminate link */
392 smcr_link_down_cond_sched(link);
393 return -EPIPE;
394 }
395 if (rc == -ERESTARTSYS)
396 return -EINTR;
397 switch (link->wr_reg_state) {
398 case CONFIRMED:
399 rc = 0;
400 break;
401 case FAILED:
402 rc = -EIO;
403 break;
404 case POSTED:
405 rc = -EPIPE;
406 break;
407 }
408 return rc;
409 }
410
411 /****************************** receive queue ********************************/
412
smc_wr_rx_register_handler(struct smc_wr_rx_handler * handler)413 int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
414 {
415 struct smc_wr_rx_handler *h_iter;
416 int rc = 0;
417
418 spin_lock(&smc_wr_rx_hash_lock);
419 hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
420 if (h_iter->type == handler->type) {
421 rc = -EEXIST;
422 goto out_unlock;
423 }
424 }
425 hash_add(smc_wr_rx_hash, &handler->list, handler->type);
426 out_unlock:
427 spin_unlock(&smc_wr_rx_hash_lock);
428 return rc;
429 }
430
431 /* Demultiplex a received work request based on the message type to its handler.
432 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
433 * and not being modified any more afterwards so we don't need to lock it.
434 */
smc_wr_rx_demultiplex(struct ib_wc * wc)435 static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
436 {
437 struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
438 struct smc_wr_rx_handler *handler;
439 struct smc_wr_rx_hdr *wr_rx;
440 u64 temp_wr_id;
441 u32 index;
442
443 if (wc->byte_len < sizeof(*wr_rx))
444 return; /* short message */
445 temp_wr_id = wc->wr_id;
446 index = do_div(temp_wr_id, link->wr_rx_cnt);
447 wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
448 hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
449 if (handler->type == wr_rx->type)
450 handler->handler(wc, wr_rx);
451 }
452 }
453
smc_wr_rx_process_cqes(struct ib_wc wc[],int num)454 static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
455 {
456 struct smc_link *link;
457 int i;
458
459 for (i = 0; i < num; i++) {
460 link = wc[i].qp->qp_context;
461 if (wc[i].status == IB_WC_SUCCESS) {
462 link->wr_rx_tstamp = jiffies;
463 smc_wr_rx_demultiplex(&wc[i]);
464 smc_wr_rx_post(link); /* refill WR RX */
465 } else {
466 /* handle status errors */
467 switch (wc[i].status) {
468 case IB_WC_RETRY_EXC_ERR:
469 case IB_WC_RNR_RETRY_EXC_ERR:
470 case IB_WC_WR_FLUSH_ERR:
471 smcr_link_down_cond_sched(link);
472 break;
473 default:
474 smc_wr_rx_post(link); /* refill WR RX */
475 break;
476 }
477 }
478 }
479 }
480
smc_wr_rx_tasklet_fn(struct tasklet_struct * t)481 static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
482 {
483 struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
484 struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
485 int polled = 0;
486 int rc;
487
488 again:
489 polled++;
490 do {
491 memset(&wc, 0, sizeof(wc));
492 rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
493 if (polled == 1) {
494 ib_req_notify_cq(dev->roce_cq_recv,
495 IB_CQ_SOLICITED_MASK
496 | IB_CQ_REPORT_MISSED_EVENTS);
497 }
498 if (!rc)
499 break;
500 smc_wr_rx_process_cqes(&wc[0], rc);
501 } while (rc > 0);
502 if (polled == 1)
503 goto again;
504 }
505
smc_wr_rx_cq_handler(struct ib_cq * ib_cq,void * cq_context)506 void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
507 {
508 struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
509
510 tasklet_schedule(&dev->recv_tasklet);
511 }
512
smc_wr_rx_post_init(struct smc_link * link)513 int smc_wr_rx_post_init(struct smc_link *link)
514 {
515 u32 i;
516 int rc = 0;
517
518 for (i = 0; i < link->wr_rx_cnt; i++)
519 rc = smc_wr_rx_post(link);
520 return rc;
521 }
522
523 /***************************** init, exit, misc ******************************/
524
smc_wr_remember_qp_attr(struct smc_link * lnk)525 void smc_wr_remember_qp_attr(struct smc_link *lnk)
526 {
527 struct ib_qp_attr *attr = &lnk->qp_attr;
528 struct ib_qp_init_attr init_attr;
529
530 memset(attr, 0, sizeof(*attr));
531 memset(&init_attr, 0, sizeof(init_attr));
532 ib_query_qp(lnk->roce_qp, attr,
533 IB_QP_STATE |
534 IB_QP_CUR_STATE |
535 IB_QP_PKEY_INDEX |
536 IB_QP_PORT |
537 IB_QP_QKEY |
538 IB_QP_AV |
539 IB_QP_PATH_MTU |
540 IB_QP_TIMEOUT |
541 IB_QP_RETRY_CNT |
542 IB_QP_RNR_RETRY |
543 IB_QP_RQ_PSN |
544 IB_QP_ALT_PATH |
545 IB_QP_MIN_RNR_TIMER |
546 IB_QP_SQ_PSN |
547 IB_QP_PATH_MIG_STATE |
548 IB_QP_CAP |
549 IB_QP_DEST_QPN,
550 &init_attr);
551
552 lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
553 lnk->qp_attr.cap.max_send_wr);
554 lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
555 lnk->qp_attr.cap.max_recv_wr);
556 }
557
smc_wr_init_sge(struct smc_link * lnk)558 static void smc_wr_init_sge(struct smc_link *lnk)
559 {
560 int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
561 u32 i;
562
563 for (i = 0; i < lnk->wr_tx_cnt; i++) {
564 lnk->wr_tx_sges[i].addr =
565 lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
566 lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
567 lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
568 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
569 lnk->roce_pd->local_dma_lkey;
570 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
571 lnk->roce_pd->local_dma_lkey;
572 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
573 lnk->roce_pd->local_dma_lkey;
574 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
575 lnk->roce_pd->local_dma_lkey;
576 lnk->wr_tx_ibs[i].next = NULL;
577 lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
578 lnk->wr_tx_ibs[i].num_sge = 1;
579 lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
580 lnk->wr_tx_ibs[i].send_flags =
581 IB_SEND_SIGNALED | IB_SEND_SOLICITED;
582 lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
583 lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
584 lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
585 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
586 lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
587 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
588 }
589
590 if (lnk->lgr->smc_version == SMC_V2) {
591 lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
592 lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
593 lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
594
595 lnk->wr_tx_v2_ib->next = NULL;
596 lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
597 lnk->wr_tx_v2_ib->num_sge = 1;
598 lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
599 lnk->wr_tx_v2_ib->send_flags =
600 IB_SEND_SIGNALED | IB_SEND_SOLICITED;
601 }
602
603 /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
604 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
605 * and the same buffer for all sges. When a larger message arrived then
606 * the content of the first small sge is copied to the beginning of
607 * the larger spillover buffer, allowing easy data mapping.
608 */
609 for (i = 0; i < lnk->wr_rx_cnt; i++) {
610 int x = i * sges_per_buf;
611
612 lnk->wr_rx_sges[x].addr =
613 lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
614 lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
615 lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
616 if (lnk->lgr->smc_version == SMC_V2) {
617 lnk->wr_rx_sges[x + 1].addr =
618 lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
619 lnk->wr_rx_sges[x + 1].length =
620 SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
621 lnk->wr_rx_sges[x + 1].lkey =
622 lnk->roce_pd->local_dma_lkey;
623 }
624 lnk->wr_rx_ibs[i].next = NULL;
625 lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
626 lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
627 }
628 lnk->wr_reg.wr.next = NULL;
629 lnk->wr_reg.wr.num_sge = 0;
630 lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
631 lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
632 lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
633 }
634
/* Quiesce work request processing on @lnk and release its DMA mappings.
 *
 * Does nothing if the link has no IB device. Otherwise wakes the waiters on
 * the registration and tx wait queues, waits until no tx work request is
 * pending and both reference counters have dropped to zero, and then unmaps
 * every buffer whose DMA address is non-zero, resetting that address to 0.
 */
void smc_wr_free_link(struct smc_link *lnk)
{
	struct ib_device *ibdev;

	if (!lnk->smcibdev)
		return;
	ibdev = lnk->smcibdev->ibdev;

	smc_wr_wakeup_reg_wait(lnk);
	smc_wr_wakeup_tx_wait(lnk);

	/* buffers must stay mapped until nobody uses them any more */
	smc_wr_tx_wait_no_pending_sends(lnk);
	wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
	wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));

	/* a DMA address of 0 means "not mapped" */
	if (lnk->wr_rx_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
				    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_dma_addr = 0;
	}
	if (lnk->wr_rx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_v2_dma_addr = 0;
	}
	if (lnk->wr_tx_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
				    DMA_TO_DEVICE);
		lnk->wr_tx_dma_addr = 0;
	}
	if (lnk->wr_tx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_TO_DEVICE);
		lnk->wr_tx_v2_dma_addr = 0;
	}
}
675
smc_wr_free_lgr_mem(struct smc_link_group * lgr)676 void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
677 {
678 if (lgr->smc_version < SMC_V2)
679 return;
680
681 kfree(lgr->wr_rx_buf_v2);
682 lgr->wr_rx_buf_v2 = NULL;
683 kfree(lgr->wr_tx_buf_v2);
684 lgr->wr_tx_buf_v2 = NULL;
685 }
686
/* Free all work request related memory of @lnk that was allocated by
 * smc_wr_alloc_link_mem(). Every pointer is reset to NULL after freeing, so
 * calling this again is harmless (kfree(NULL) is a no-op).
 */
void smc_wr_free_link_mem(struct smc_link *lnk)
{
	/* SMC-Rv2 only members; NULL for other versions */
	kfree(lnk->wr_tx_v2_ib);
	lnk->wr_tx_v2_ib = NULL;
	kfree(lnk->wr_tx_v2_sge);
	lnk->wr_tx_v2_sge = NULL;
	kfree(lnk->wr_tx_v2_pend);
	lnk->wr_tx_v2_pend = NULL;
	/* tx tracking data */
	kfree(lnk->wr_tx_compl);
	lnk->wr_tx_compl = NULL;
	kfree(lnk->wr_tx_pends);
	lnk->wr_tx_pends = NULL;
	kfree(lnk->wr_tx_mask);
	lnk->wr_tx_mask = NULL;
	/* scatter/gather entries and work requests */
	kfree(lnk->wr_tx_sges);
	lnk->wr_tx_sges = NULL;
	kfree(lnk->wr_tx_rdma_sges);
	lnk->wr_tx_rdma_sges = NULL;
	kfree(lnk->wr_rx_sges);
	lnk->wr_rx_sges = NULL;
	kfree(lnk->wr_tx_rdmas);
	lnk->wr_tx_rdmas = NULL;
	kfree(lnk->wr_rx_ibs);
	lnk->wr_rx_ibs = NULL;
	kfree(lnk->wr_tx_ibs);
	lnk->wr_tx_ibs = NULL;
	/* message buffers */
	kfree(lnk->wr_tx_bufs);
	lnk->wr_tx_bufs = NULL;
	kfree(lnk->wr_rx_bufs);
	lnk->wr_rx_bufs = NULL;
}
718
smc_wr_alloc_lgr_mem(struct smc_link_group * lgr)719 int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
720 {
721 if (lgr->smc_version < SMC_V2)
722 return 0;
723
724 lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
725 if (!lgr->wr_rx_buf_v2)
726 return -ENOMEM;
727 lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
728 if (!lgr->wr_tx_buf_v2) {
729 kfree(lgr->wr_rx_buf_v2);
730 return -ENOMEM;
731 }
732 return 0;
733 }
734
/* Allocate all work request related memory of @link.
 *
 * Tx arrays are sized for SMC_WR_BUF_CNT entries, rx arrays for
 * SMC_WR_BUF_CNT * 3 entries; SMC-Rv2 links get two rx sges per buffer plus
 * the single v2 send WR, sge and pending entry.
 *
 * Return: 0 on success, -ENOMEM if any allocation failed. On failure
 * everything allocated so far is freed again in reverse order.
 * NOTE(review): the error path does not reset the freed pointers to NULL.
 */
int smc_wr_alloc_link_mem(struct smc_link *link)
{
	int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;

	/* allocate link related memory */
	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
	if (!link->wr_tx_bufs)
		goto no_mem;
	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
				   GFP_KERNEL);
	if (!link->wr_rx_bufs)
		goto no_mem_wr_tx_bufs;
	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
				  GFP_KERNEL);
	if (!link->wr_tx_ibs)
		goto no_mem_wr_rx_bufs;
	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
				  sizeof(link->wr_rx_ibs[0]),
				  GFP_KERNEL);
	if (!link->wr_rx_ibs)
		goto no_mem_wr_tx_ibs;
	link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_rdmas[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_rdmas)
		goto no_mem_wr_rx_ibs;
	link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
					sizeof(link->wr_tx_rdma_sges[0]),
					GFP_KERNEL);
	if (!link->wr_tx_rdma_sges)
		goto no_mem_wr_tx_rdmas;
	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
				   GFP_KERNEL);
	if (!link->wr_tx_sges)
		goto no_mem_wr_tx_rdma_sges;
	/* one or two sges per rx buffer, depending on the SMC version */
	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
				   sizeof(link->wr_rx_sges[0]) * sges_per_buf,
				   GFP_KERNEL);
	if (!link->wr_rx_sges)
		goto no_mem_wr_tx_sges;
	/* bitmap with one bit per tx slot */
	link->wr_tx_mask = kcalloc(BITS_TO_LONGS(SMC_WR_BUF_CNT),
				   sizeof(*link->wr_tx_mask),
				   GFP_KERNEL);
	if (!link->wr_tx_mask)
		goto no_mem_wr_rx_sges;
	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_pends[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_pends)
		goto no_mem_wr_tx_mask;
	link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_compl[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_compl)
		goto no_mem_wr_tx_pends;

	if (link->lgr->smc_version == SMC_V2) {
		link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib),
					    GFP_KERNEL);
		if (!link->wr_tx_v2_ib)
			goto no_mem_tx_compl;
		link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge),
					     GFP_KERNEL);
		if (!link->wr_tx_v2_sge)
			goto no_mem_v2_ib;
		link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend),
					      GFP_KERNEL);
		if (!link->wr_tx_v2_pend)
			goto no_mem_v2_sge;
	}
	return 0;

	/* unwind: each label frees what was allocated before the failure */
no_mem_v2_sge:
	kfree(link->wr_tx_v2_sge);
no_mem_v2_ib:
	kfree(link->wr_tx_v2_ib);
no_mem_tx_compl:
	kfree(link->wr_tx_compl);
no_mem_wr_tx_pends:
	kfree(link->wr_tx_pends);
no_mem_wr_tx_mask:
	kfree(link->wr_tx_mask);
no_mem_wr_rx_sges:
	kfree(link->wr_rx_sges);
no_mem_wr_tx_sges:
	kfree(link->wr_tx_sges);
no_mem_wr_tx_rdma_sges:
	kfree(link->wr_tx_rdma_sges);
no_mem_wr_tx_rdmas:
	kfree(link->wr_tx_rdmas);
no_mem_wr_rx_ibs:
	kfree(link->wr_rx_ibs);
no_mem_wr_tx_ibs:
	kfree(link->wr_tx_ibs);
no_mem_wr_rx_bufs:
	kfree(link->wr_rx_bufs);
no_mem_wr_tx_bufs:
	kfree(link->wr_tx_bufs);
no_mem:
	return -ENOMEM;
}
836
/* Stop the receive and send tasklets of @smcibdev; counterpart of
 * smc_wr_add_dev().
 */
void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
{
	tasklet_kill(&smcibdev->recv_tasklet);
	tasklet_kill(&smcibdev->send_tasklet);
}
842
/* Set up the receive and send tasklets of @smcibdev with their CQ processing
 * functions. The tasklets are scheduled from the CQ handlers.
 */
void smc_wr_add_dev(struct smc_ib_device *smcibdev)
{
	tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
	tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
}
848
smc_wr_create_link(struct smc_link * lnk)849 int smc_wr_create_link(struct smc_link *lnk)
850 {
851 struct ib_device *ibdev = lnk->smcibdev->ibdev;
852 int rc = 0;
853
854 smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
855 lnk->wr_rx_id = 0;
856 lnk->wr_rx_dma_addr = ib_dma_map_single(
857 ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
858 DMA_FROM_DEVICE);
859 if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
860 lnk->wr_rx_dma_addr = 0;
861 rc = -EIO;
862 goto out;
863 }
864 if (lnk->lgr->smc_version == SMC_V2) {
865 lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
866 lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
867 DMA_FROM_DEVICE);
868 if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
869 lnk->wr_rx_v2_dma_addr = 0;
870 rc = -EIO;
871 goto dma_unmap;
872 }
873 lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
874 lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
875 DMA_TO_DEVICE);
876 if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) {
877 lnk->wr_tx_v2_dma_addr = 0;
878 rc = -EIO;
879 goto dma_unmap;
880 }
881 }
882 lnk->wr_tx_dma_addr = ib_dma_map_single(
883 ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
884 DMA_TO_DEVICE);
885 if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
886 rc = -EIO;
887 goto dma_unmap;
888 }
889 smc_wr_init_sge(lnk);
890 memset(lnk->wr_tx_mask, 0,
891 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
892 init_waitqueue_head(&lnk->wr_tx_wait);
893 atomic_set(&lnk->wr_tx_refcnt, 0);
894 init_waitqueue_head(&lnk->wr_reg_wait);
895 atomic_set(&lnk->wr_reg_refcnt, 0);
896 return rc;
897
898 dma_unmap:
899 if (lnk->wr_rx_v2_dma_addr) {
900 ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
901 SMC_WR_BUF_V2_SIZE,
902 DMA_FROM_DEVICE);
903 lnk->wr_rx_v2_dma_addr = 0;
904 }
905 if (lnk->wr_tx_v2_dma_addr) {
906 ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
907 SMC_WR_BUF_V2_SIZE,
908 DMA_TO_DEVICE);
909 lnk->wr_tx_v2_dma_addr = 0;
910 }
911 ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
912 SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
913 DMA_FROM_DEVICE);
914 lnk->wr_rx_dma_addr = 0;
915 out:
916 return rc;
917 }
918