diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index a97284ad3c372e86bb29ca0e3987a0062c48cbbb..497af518816bede7306cd79c63cbe5bbae243555 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -46,7 +46,7 @@ wmem - INTEGER Initial size of send buffer used by SMC sockets. The minimum value is 256KiB and there is no hard limit for max value, but - only allowed 512KiB for SMC-R and 1MiB for SMC-D. + only 2MiB is allowed for both SMC-R and SMC-D. Default: 256K @@ -54,6 +54,6 @@ rmem - INTEGER Initial size of receive buffer (RMB) used by SMC sockets. The minimum value is 256KiB and there is no hard limit for max value, but - only allowed 512KiB for SMC-R and 1MiB for SMC-D. + only 2MiB is allowed for both SMC-R and SMC-D. Default: 256K diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index f6158ebea3613f42f5e2afc4b8f958854d0adf86..c5ddcde4dc018a6f66cb45c54ceb0dc9548d7c6a 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -12,7 +12,7 @@ struct netns_smc { /* per cpu counters for SMC */ struct smc_stats __percpu *smc_stats; /* protect fback_rsn */ - struct mutex mutex_fback_rsn; + spinlock_t mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; int limit_smc_hs; /* constraint on handshake */ atomic_t iwarp_cnt; diff --git a/include/net/smc.h b/include/net/smc.h index 8018c3a0143a3cc3ed465a703dfad3653555c83d..97756483760bb962ebb161f57da70660c61ff77f 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -280,11 +280,13 @@ struct smc_connection { struct smc_sock { /* smc sock container */ union { + struct tcp6_sock tp6sk; struct tcp_sock tpsk; struct sock sk; }; struct socket *clcsock; /* internal tcp socket */ unsigned char smc_state; /* smc state used in smc via inet_sk */ + unsigned long smc_sk_flags; unsigned int isck_smc_negotiation; struct socket accompany_socket; struct request_sock *tail_0; @@ -310,6 +312,7 @@ struct smc_sock { /* smc sock container */ bool limit_smc_hs; /* put constraint on handshake */ bool use_fallback; /* fallback to tcp */ bool under_presure; /* under presure */ + bool sent_confirm_accept; /* already sent confirm_accept */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ atomic_t queued_smc_hs; /* queued smc handshakes */ @@ -341,6 +344,8 @@ struct smc_sock { /* smc sock container */ /* protects clcsock of a listen * socket */ + /* ipv6_pinfo has to be the last member of tcp6_sock, see inet6_sk_generic */ + struct ipv6_pinfo inet6; }; #define SMC_NEGOTIATOR_NAME_MAX (16) @@ -376,14 +381,18 @@ int smc_sock_register_negotiator_ops(struct smc_sock_negotiator_ops *ops); int smc_sock_update_negotiator_ops(struct smc_sock_negotiator_ops *ops, struct smc_sock_negotiator_ops *old_ops); void smc_sock_unregister_negotiator_ops(struct smc_sock_negotiator_ops *ops); -int smc_sock_assign_negotiator_ops(struct smc_sock *smc, const char *name); #ifdef CONFIG_BPF_SYSCALL void smc_sock_cleanup_negotiator_ops(struct smc_sock *smc, int in_release); void smc_sock_clone_negotiator_ops(struct sock *parent, struct sock *child); +int smc_sock_assign_negotiator_ops(struct smc_sock *smc, const char *name); #else static inline void smc_sock_cleanup_negotiator_ops(struct smc_sock *smc, int in_release) {} static inline void smc_sock_clone_negotiator_ops(struct sock *parent, struct sock *child) {} +static inline int smc_sock_assign_negotiator_ops(struct smc_sock *smc, const char *name) +{ + return -EOPNOTSUPP; +} #endif #endif 
/* _SMC_H */ diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index fa4719ce5d62a75b2cebb00bfd3acd049baf08a8..c166de136028088d2ece19eba1b047583e85151a 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -212,7 +212,8 @@ enum { SMC_NLA_STATS_PLOAD_256K, /* u64 */ SMC_NLA_STATS_PLOAD_512K, /* u64 */ SMC_NLA_STATS_PLOAD_1024K, /* u64 */ - SMC_NLA_STATS_PLOAD_G_1024K, /* u64 */ + SMC_NLA_STATS_PLOAD_2048K, /* u64 */ + SMC_NLA_STATS_PLOAD_G_2048K, /* u64 */ __SMC_NLA_STATS_PLOAD_MAX, SMC_NLA_STATS_PLOAD_MAX = __SMC_NLA_STATS_PLOAD_MAX - 1 }; @@ -275,7 +276,7 @@ enum { SMC_NLA_FBACK_STATS_SRV_CNT, /* u64 */ SMC_NLA_FBACK_STATS_CLNT_CNT, /* u64 */ SMC_NLA_FBACK_STATS_RSN_CODE, /* u32 */ - SMC_NLA_FBACK_STATS_RSN_CNT, /* u16 */ + SMC_NLA_FBACK_STATS_RSN_CNT, /* u64 */ __SMC_NLA_FBACK_STATS_MAX, SMC_NLA_FBACK_STATS_MAX = __SMC_NLA_FBACK_STATS_MAX - 1 }; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b8d57c7ed895b8b8bd944660f6d912e780375a17..9bda27253af0457d62e3df3e9d4f7e428bc45eb4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -73,7 +73,6 @@ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); -static void smc_inet_sock_state_change(struct sock *sk); static int smc_inet_sock_do_handshake(struct sock *sk, bool sk_locked, bool sync); static void __smc_inet_sock_sort_csk_queue(struct sock *parent, int *tcp_cnt, int *smc_cnt); @@ -81,8 +80,6 @@ static int smc_inet_sock_sort_csk_queue(struct sock *parent); /* default use reserve_mode */ bool reserve_mode = true; -module_param(reserve_mode, bool, 0444); -MODULE_PARM_DESC(reserve_mode, "reserve mode support and keep-first-contact disable"); /* rsvd_ports_base must less than (u16 MAX - 8) */ u16 rsvd_ports_base = SMC_IWARP_RSVD_PORTS_BASE; @@ -361,7 +358,7 @@ static int __smc_release(struct smc_sock *smc) if (!smc->use_fallback) { rc = smc_close_active(smc); - sock_set_flag(sk, SOCK_DEAD); + smc_sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } else { if (smc_sk_state(sk) != SMC_CLOSED) { @@ -448,11 +445,17 @@ static void smc_destruct(struct sock *sk) if (smc_sk(sk)->original_sk_destruct) smc_sk(sk)->original_sk_destruct(sk); + /* for inet sock, sk here MUST be non accepted */ + if (smc_sock_is_inet_sock(sk) && !smc_inet_sock_is_active_open(sk) && + (isck_smc_negotiation_load(smc_sk(sk)) == SMC_NEGOTIATION_TBD)) + goto out; + smc_sock_cleanup_negotiator_ops(smc_sk(sk), /* in release */ 1); +out: if (smc_sk_state(sk) != SMC_CLOSED) return; - if (!sock_flag(sk, SOCK_DEAD)) + if (!smc_sock_flag(sk, SOCK_DEAD)) return; sk_refcnt_debug_dec(sk); @@ -500,6 +503,7 @@ static void smc_sock_init(struct sock *sk, struct net *net) WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); smc->limit_smc_hs = net->smc.limit_smc_hs; + smc_sock_assign_negotiator_ops(smc, "anolis"); /* already set (for inet sock), save the original */ if (sk->sk_destruct) @@ -888,7 +892,7 @@ static void smc_stat_fallback(struct smc_sock *smc) { struct net *net = sock_net(&smc->sk); - mutex_lock(&net->smc.mutex_fback_rsn); + spin_lock_bh(&net->smc.mutex_fback_rsn); if (smc->listen_smc) { smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv); net->smc.fback_rsn->srv_fback_cnt++; @@ -896,7 +900,7 @@ static void smc_stat_fallback(struct smc_sock *smc) smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt); net->smc.fback_rsn->clnt_fback_cnt++; } 
- mutex_unlock(&net->smc.mutex_fback_rsn); + spin_unlock_bh(&net->smc.mutex_fback_rsn); } /* must be called under rcu read lock */ @@ -1036,11 +1040,8 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) /* inet sock */ if (smc_sock_is_inet_sock(&smc->sk)) { - write_lock_bh(&smc->sk.sk_callback_lock); - smc_inet_sock_switch_negotiation_state_locked(&smc->sk, - isck_smc_negotiation_load(smc), - SMC_NEGOTIATION_NO_SMC); - write_unlock_bh(&smc->sk.sk_callback_lock); + smc_inet_sock_move_state(&smc->sk, SMC_NEGOTIATION_TBD, + SMC_NEGOTIATION_NO_SMC); return 0; } @@ -1077,7 +1078,7 @@ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) rc = smc_switch_to_fallback(smc, reason_code); if (rc) { /* fallback fails */ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); - if (smc_sk_state(&smc->sk) == SMC_INIT && !smc_sock_is_inet_sock(&smc->sk)) + if (smc_sk_state(&smc->sk) == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; } @@ -1097,7 +1098,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, if (reason_code < 0) { /* error, fallback is not possible */ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); - if (smc_sk_state(&smc->sk) == SMC_INIT && !smc_sock_is_inet_sock(&smc->sk)) + if (smc_sk_state(&smc->sk) == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return reason_code; } @@ -1106,7 +1107,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, rc = smc_clc_send_decline(smc, reason_code, version); if (rc < 0) { this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); - if (smc_sk_state(&smc->sk) == SMC_INIT && !smc_sock_is_inet_sock(&smc->sk)) + if (smc_sk_state(&smc->sk) == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; } @@ -1797,7 +1798,7 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_err = -rc; out: - if (!sock_flag(&smc->sk, SOCK_DEAD)) { + if (!smc_sock_flag(&smc->sk, SOCK_DEAD)) { if (smc->sk.sk_err) { smc->sk.sk_state_change(&smc->sk); } else { /* allow polling before and after fallback decision */ @@ -1814,7 +1815,6 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, struct sock *sk = sock->sk; struct smc_sock *smc; int rc = -EINVAL; - int cur; smc = smc_sk(sk); @@ -1875,20 +1875,8 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, } if (smc_sock_is_inet_sock(sk)) { - if (smc_inet_sock_set_syn_smc(sk)) { - if (flags & O_NONBLOCK) { - smc->connect_nonblock = 1; - /* To ensure that userspace will not be awakened by TCP sock events - * before the SMC handshake is completed or totaly fallback/failed. - */ - sk->sk_wq = &smc->accompany_socket.wq; - smc_clcsock_replace_cb(&sk->sk_state_change, - smc_inet_sock_state_change, - &smc->clcsk_state_change); - } - } else { + if (!smc_inet_sock_set_syn_smc(sk, flags)) smc_switch_to_fallback(smc, SMC_CLC_DECL_ACTIVE); - } } else { tcp_sk(smc->clcsock->sk)->syn_smc = 1; } @@ -1906,17 +1894,41 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, /* for inet sock */ if (smc_sock_is_inet_sock(sk)) { if (flags & O_NONBLOCK) { - rc = -EINPROGRESS; + write_lock_bh(&sk->sk_callback_lock); + if (smc_inet_sock_check_smc(sk) || smc_inet_sock_check_fallback(sk)) { + rc = 0; + } else { + smc->connect_nonblock = 1; + rc = -EINPROGRESS; + } + write_unlock_bh(&sk->sk_callback_lock); } else { - rc = 0; - cur = smc_inet_sock_switch_negotiation_state(sk, SMC_NEGOTIATION_TBD, - tcp_sk(sk)->syn_smc ? 
- SMC_NEGOTIATION_PREPARE_SMC : - SMC_NEGOTIATION_NO_SMC); - if (cur == SMC_NEGOTIATION_PREPARE_SMC) + write_lock_bh(&sk->sk_callback_lock); +again: + switch (isck_smc_negotiation_load(smc)) { + case SMC_NEGOTIATION_TBD: + smc_inet_sock_move_state_locked(sk, SMC_NEGOTIATION_TBD, + SMC_NEGOTIATION_PREPARE_SMC); + write_unlock_bh(&sk->sk_callback_lock); +do_handshake: rc = smc_inet_sock_do_handshake(sk, /* sk_locked */ true, true); - if (rc) + write_lock_bh(&sk->sk_callback_lock); + break; + case SMC_NEGOTIATION_PREPARE_SMC: + write_unlock_bh(&sk->sk_callback_lock); + /* cancel success */ + if (cancel_work_sync(&smc->connect_work)) + goto do_handshake; + write_lock_bh(&sk->sk_callback_lock); + goto again; + case SMC_NEGOTIATION_NO_SMC: + case SMC_NEGOTIATION_SMC: + rc = 0; + break; + } + write_unlock_bh(&sk->sk_callback_lock); + if (!rc) goto connected; } goto out; @@ -1975,7 +1987,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) if (new_clcsock) sock_release(new_clcsock); smc_sk_set_state(new_sk, SMC_CLOSED); - sock_set_flag(new_sk, SOCK_DEAD); + smc_sock_set_flag(new_sk, SOCK_DEAD); mutex_unlock(&lsmc->clcsock_release_lock); sock_put(new_sk); /* final */ *new_smc = NULL; @@ -2084,13 +2096,15 @@ void smc_close_non_accepted(struct sock *sk) sock_hold(sk); /* sock_put below */ lock_sock(sk); if (smc_sock_is_inet_sock(sk)) { - if (!smc_inet_sock_check_fallback(sk) && smc_sk_state(sk) != SMC_CLOSED) { + if (!smc_inet_sock_check_fallback(sk)) smc_close_active(smc); - sock_set_flag(sk, SOCK_DEAD); - if (smc_sk_state(sk) == SMC_CLOSED) - smc_conn_free(&smc->conn); - } - } else { + smc_sock_set_flag(sk, SOCK_DEAD); + release_sock(sk); + tcp_close(sk, 0); + lock_sock(sk); + if (smc_sk_state(sk) == SMC_CLOSED) + smc_conn_free(&smc->conn); + } else { if (!sk->sk_lingertime) /* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; @@ -2098,10 +2112,7 @@ void smc_close_non_accepted(struct sock *sk) } release_sock(sk); sock_put(sk); /* sock_hold above */ - - if (smc_sock_is_inet_sock(sk)) - tcp_close(sk, 0); - else + if (!smc_sock_is_inet_sock(sk)) sock_put(sk); /* final sock_put */ } @@ -2167,9 +2178,12 @@ static void smc_listen_out(struct smc_sock *new_smc) atomic_dec(&lsmc->queued_smc_hs); if (smc_sock_is_inet_sock(newsmcsk)) - smc_inet_sock_switch_negotiation_state(newsmcsk, - SMC_NEGOTIATION_PREPARE_SMC, - SMC_NEGOTIATION_SMC); + smc_inet_sock_move_state(newsmcsk, + SMC_NEGOTIATION_PREPARE_SMC, + new_smc->use_fallback && + smc_sk_state(newsmcsk) == SMC_ACTIVE ? 
+ SMC_NEGOTIATION_NO_SMC : + SMC_NEGOTIATION_SMC); if (smc_sk_state(&lsmc->sk) == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); @@ -2189,8 +2203,10 @@ static void smc_listen_out_connected(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; + lock_sock(newsmcsk); if (smc_sk_state(newsmcsk) == SMC_INIT) smc_sk_set_state(newsmcsk, SMC_ACTIVE); + release_sock(newsmcsk); smc_listen_out(new_smc); } @@ -2202,9 +2218,13 @@ static void smc_listen_out_err(struct smc_sock *new_smc) struct net *net = sock_net(newsmcsk); this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt); - if (smc_sk_state(newsmcsk) == SMC_INIT) + + lock_sock(newsmcsk); + if (smc_sk_state(newsmcsk) == SMC_INIT) { sock_put(&new_smc->sk); /* passive closing */ - smc_sk_set_state(newsmcsk, SMC_CLOSED); + smc_sk_set_state(newsmcsk, SMC_CLOSED); + } + release_sock(newsmcsk); smc_listen_out(new_smc); } @@ -3819,29 +3839,18 @@ static int __smc_inet_connect_work_locked(struct smc_sock *smc) if (rc < 0) smc->sk.sk_err = -rc; - smc_inet_sock_switch_negotiation_state(&smc->sk, SMC_NEGOTIATION_PREPARE_SMC, - (smc->use_fallback || - smc_sk_state(&smc->sk) == SMC_INIT) ? - SMC_NEGOTIATION_NO_SMC : SMC_NEGOTIATION_SMC); + smc_inet_sock_move_state(&smc->sk, SMC_NEGOTIATION_PREPARE_SMC, + (smc->use_fallback && + smc_sk_state(&smc->sk) == SMC_ACTIVE) ? + SMC_NEGOTIATION_NO_SMC : SMC_NEGOTIATION_SMC); - /* reset to this */ - if (smc->sk.sk_socket) { - wake_up_interruptible_all(&smc->accompany_socket.wq.wait); - smc->sk.sk_wq = &smc->sk.sk_socket->wq; - } - - /* make smc_negotiation can be seen */ - smp_wmb(); - - if (!sock_flag(&smc->sk, SOCK_DEAD)) { + if (!smc_sock_flag(&smc->sk, SOCK_DEAD)) { if (smc->sk.sk_err) smc->sk.sk_state_change(&smc->sk); else smc->sk.sk_write_space(&smc->sk); } - /* sock hold in smc_inet_sock_state_change() or smc_inet_connect() */ - sock_put(&smc->sk); return rc; } @@ -3850,9 +3859,11 @@ static void smc_inet_connect_work(struct work_struct *work) { struct smc_sock *smc = container_of(work, struct smc_sock, connect_work); + sock_hold(&smc->sk); /* sock put below */ lock_sock(&smc->sk); __smc_inet_connect_work_locked(smc); release_sock(&smc->sk); + sock_put(&smc->sk); /* sock hold above */ } static void smc_inet_listen_work(struct work_struct *work) @@ -3868,10 +3879,11 @@ sk->sk_wq = &smc_sk(sk)->accompany_socket.wq; smc_listen_work(work); - /* sock hold in smc_inet_sock_do_handshake() */ - sock_put(&smc->sk); } +/* caller MUST not access sk after smc_inet_sock_do_handshake + * is invoked unless a sock_hold() has been performed beforehand. 
+ */ static int smc_inet_sock_do_handshake(struct sock *sk, bool sk_locked, bool sync) { struct smc_sock *smc = smc_sk(sk); @@ -3879,11 +3891,8 @@ static int smc_inet_sock_do_handshake(struct sock *sk, bool sk_locked, bool sync if (smc_inet_sock_is_active_open(sk)) { INIT_WORK(&smc->connect_work, smc_inet_connect_work); - /* protected sk during smc_inet_connect_work/__smc_inet_connect_work_locked */ - sock_hold(sk); if (!sync) { - if (unlikely(!queue_work(smc_hs_wq, &smc->connect_work))) - sock_put(sk); /* sock hold above */ + queue_work(smc_hs_wq, &smc->connect_work); return 0; } if (sk_locked) @@ -3895,14 +3904,11 @@ } INIT_WORK(&smc->smc_listen_work, smc_inet_listen_work); - /* protected sk during smc_inet_listen_work */ - sock_hold(sk); /* protected listen_smc during smc_inet_listen_work */ sock_hold(&smc->listen_smc->sk); if (!sync) { - if (unlikely(!queue_work(smc_hs_wq, &smc->smc_listen_work))) - sock_put(sk); /* sock hold above */ + queue_work(smc_hs_wq, &smc->smc_listen_work); } else { smc_inet_listen_work(&smc->smc_listen_work); } @@ -3910,7 +3916,7 @@ return 0; } -static void smc_inet_sock_state_change(struct sock *sk) +void smc_inet_sock_state_change(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); int cur; @@ -3918,27 +3924,31 @@ if (sk->sk_err || (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_ESTABLISHED)) { write_lock_bh(&sk->sk_callback_lock); - /* cause by release */ - if (unlikely(sk->sk_state_change != smc_inet_sock_state_change)) + /* resume sk_state_change */ + sk->sk_state_change = smc->clcsk_state_change; + + /* caused by abort */ + if (isck_smc_negotiation_get_flags(smc_sk(sk)) & SMC_NEGOTIATION_ABORT_FLAG) goto out_unlock; - cur = smc_inet_sock_switch_negotiation_state_locked(sk, SMC_NEGOTIATION_TBD, - (tcp_sk(sk)->syn_smc && - !sk->sk_err) ? - SMC_NEGOTIATION_PREPARE_SMC : - SMC_NEGOTIATION_NO_SMC); + if (isck_smc_negotiation_load(smc) != SMC_NEGOTIATION_TBD) + goto out_unlock; - /* resume sk_state_change when cur changed */ - if (cur != SMC_NEGOTIATION_TBD) - sk->sk_state_change = smc->clcsk_state_change; + cur = smc_inet_sock_move_state_locked(sk, SMC_NEGOTIATION_TBD, + (tcp_sk(sk)->syn_smc && + !sk->sk_err) ? 
+ SMC_NEGOTIATION_PREPARE_SMC : + SMC_NEGOTIATION_NO_SMC); if (cur == SMC_NEGOTIATION_PREPARE_SMC) { smc_inet_sock_do_handshake(sk, /* not locked */ false, /* async */ false); } else if (cur == SMC_NEGOTIATION_NO_SMC) { - /* resume sk_wq */ - sk->sk_wq = &sk->sk_socket->wq; - /* flush all sleeper on accompany_socket.wq */ - wake_up_interruptible_all(&smc->accompany_socket.wq.wait); + smc->use_fallback = true; + smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; + smc_stat_fallback(smc); + trace_smc_switch_to_fallback(smc, SMC_CLC_DECL_PEERNOSMC); + smc->connect_nonblock = 0; + smc_sk_set_state(&smc->sk, SMC_ACTIVE); } out_unlock: write_unlock_bh(&sk->sk_callback_lock); @@ -3952,6 +3962,15 @@ int smc_inet_init_sock(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); int rc; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == PF_INET6) { + memcpy(&((struct tcp6_sock *)sk)->inet6, inet_sk(sk)->pinet6, + sizeof(struct ipv6_pinfo)); + inet_sk(sk)->pinet6 = (struct ipv6_pinfo *)(((u8 *)sk) + + sizeof(struct tcp6_sock) - sizeof(struct ipv6_pinfo)); + } +#endif + /* Call tcp init sock first */ rc = smc_inet_get_tcp_prot(sk->sk_family)->init(sk); if (rc) @@ -4240,7 +4259,8 @@ int smc_inet_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; - int old_state; + int old_state, rc; + bool do_free = false; if (!sk) return 0; @@ -4250,19 +4270,21 @@ int smc_inet_release(struct socket *sock) /* trigger info gathering if needed.*/ smc_sock_perform_collecting_info(smc, SMC_SOCK_CLOSED_TIMING); - if (!smc_inet_sock_try_fallback_fast(sk, /* force it to no_smc */ 1)) - goto out; - old_state = smc_sk_state(sk); - /* cleanup for a dangling non-blocking connect */ - if (smc->connect_nonblock && old_state == SMC_INIT) { - sk->sk_err = ECONNABORTED; - sk->sk_error_report(sk); + sock_hold(sk); /* sock put below */ + + /* check fallback ? 
*/ + if (!smc_inet_sock_try_fallback_fast(sk, /* force it to no_smc */ 1)) { + if (smc_sk_state(sk) == SMC_ACTIVE) + sock_put(sk); /* sock put for passive closing */ + smc_sock_set_flag(sk, SOCK_DEAD); + smc_sk_set_state(sk, SMC_CLOSED); + goto out; } if (smc->connect_nonblock && cancel_work_sync(&smc->connect_work)) - sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ + sock_put(&smc->sk); /* sock_hold for passive closing */ if (smc_sk_state(sk) == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires @@ -4272,21 +4294,33 @@ int smc_inet_release(struct socket *sock) else lock_sock(sk); - if ((old_state == SMC_INIT || smc->conn.killed) && - smc_sk_state(sk) == SMC_ACTIVE && !smc->use_fallback) + if (smc->conn.killed && !smc->use_fallback) smc_close_active_abort(smc); - /* ret of smc_close_active do not need return to userspace */ - smc_close_active(smc); - sock_set_flag(sk, SOCK_DEAD); - - if (smc_sk_state(sk) == SMC_CLOSED) - smc_conn_free(&smc->conn); + if (!smc->use_fallback) { + /* ret of smc_close_active do not need return to userspace */ + smc_close_active(smc); + do_free = true; + } else { + if (smc_sk_state(sk) == SMC_ACTIVE) + sock_put(sk); /* sock put for passive closing */ + smc_sk_set_state(sk, SMC_CLOSED); + } + smc_sock_set_flag(sk, SOCK_DEAD); release_sock(sk); out: /* release tcp sock */ - return smc_call_inet_sock_ops(sk, inet_release, inet6_release, sock); + rc = smc_call_inet_sock_ops(sk, inet_release, inet6_release, sock); + + if (do_free) { + lock_sock(sk); + if (smc_sk_state(sk) == SMC_CLOSED) + smc_conn_free(&smc->conn); + release_sock(sk); + } + sock_put(sk); /* sock hold above */ + return rc; } static inline struct request_sock *smc_inet_reqsk_get_safe_tail_0(struct sock *parent) @@ -4423,15 +4457,11 @@ static inline void __smc_inet_sock_sort_csk_queue(struct sock *parent, int *tcp_ /* join queue */ smc_reqsk_queue_join_locked(queue, &queue_smc); - if (par->tail_0) { - smc_sk(par->tail_0->sk)->queued_cnt += cnt0; - cnt0 = smc_sk(par->tail_0->sk)->queued_cnt; - } + if (par->tail_0) + smc_sk(par->tail_0->sk)->queued_cnt = cnt0; - if (par->tail_1) { - smc_sk(par->tail_1->sk)->queued_cnt += cnt1; - cnt1 = smc_sk(par->tail_1->sk)->queued_cnt; - } + if (par->tail_1) + smc_sk(par->tail_1->sk)->queued_cnt = cnt1; *tcp_cnt = cnt0; *smc_cnt = cnt1; @@ -4453,35 +4483,6 @@ static int smc_inet_sock_sort_csk_queue(struct sock *parent) return mask; } -static int smc_inet_sock_reverse_ordered_csk_queue(struct sock *parent) -{ - struct request_sock_queue *queue, queue_smc, queue_free; - struct smc_sock *par = smc_sk(parent); - int mask = SMC_REQSK_TCP; - - queue = &inet_csk(parent)->icsk_accept_queue; - spin_lock_bh(&queue->rskq_lock); - - par->tail_0 = smc_inet_reqsk_get_safe_tail_0(parent); - par->tail_1 = smc_inet_reqsk_get_safe_tail_1(parent); - - smc_reqsk_queue_cut_locked(queue, par->tail_0, &queue_smc); - smc_reqsk_queue_cut_locked(&queue_smc, par->tail_1, &queue_free); - - /* has smc reqsk */ - if (!reqsk_queue_empty(&queue_smc)) - mask = SMC_REQSK_SMC; - - smc_reqsk_queue_join_locked(&queue_smc, queue); - smc_reqsk_queue_join_locked(&queue_smc, &queue_free); - smc_reqsk_queue_join_locked(queue, &queue_smc); - - if (par->tail_0) - par->tail_1 = NULL; - spin_unlock_bh(&queue->rskq_lock); - return mask; -} - /* Wait for an incoming connection, avoid race conditions. This must be called * with the socket locked. 
*/ @@ -4541,17 +4542,20 @@ struct sock *__smc_inet_csk_accept(struct sock *sk, int flags, int *err, bool ke child = inet_csk_accept(sk, flags | O_NONBLOCK, err, kern); if (child) { + smc_sk(child)->listen_smc = smc_sk(sk); + /* depends on syn_smc if next_state not specify */ if (next_state == SMC_NEGOTIATION_TBD) next_state = tcp_sk(child)->syn_smc ? SMC_NEGOTIATION_PREPARE_SMC : SMC_NEGOTIATION_NO_SMC; - cur = smc_inet_sock_switch_negotiation_state(child, SMC_NEGOTIATION_TBD, - next_state); + cur = smc_inet_sock_move_state(child, SMC_NEGOTIATION_TBD, + next_state); switch (cur) { case SMC_NEGOTIATION_NO_SMC: smc_sk_set_state(child, SMC_ACTIVE); smc_switch_to_fallback(smc_sk(child), SMC_CLC_DECL_PEERNOSMC); + smc_sock_clone_negotiator_ops(sk, child); break; case SMC_NEGOTIATION_PREPARE_SMC: /* init as passive open smc sock */ @@ -4608,7 +4612,7 @@ static void smc_inet_tcp_listen_work(struct work_struct *work) int error = 0; while (smc_sk_state(lsk) == SMC_LISTEN && - (smc_inet_sock_reverse_ordered_csk_queue(lsk) & SMC_REQSK_SMC)) { + (smc_inet_sock_sort_csk_queue(lsk) & SMC_REQSK_SMC)) { child = __smc_inet_csk_accept(lsk, O_NONBLOCK, &error, 1, SMC_NEGOTIATION_PREPARE_SMC); if (!child || error) @@ -4620,10 +4624,6 @@ static void smc_inet_tcp_listen_work(struct work_struct *work) */ smc_inet_sock_do_handshake(child, /* sk not locked */ false, !tcp_sk(child)->syn_smc); - - /* Minimize handling fallback connections in workqueue as much as possible */ - if (!tcp_sk(child)->syn_smc) - break; } } @@ -4789,10 +4789,23 @@ static int __init smc_init(void) } /* no return value */ inet_register_protosw(&smc_inet_protosw); +#if IS_ENABLED(CONFIG_IPV6) + /* register smc inet6 proto */ + rc = proto_register(&smc_inet6_prot, 1); + if (rc) { + pr_err("%s: proto_register smc_inet6_prot fails with %d\n", __func__, rc); + goto out_proto_register; + } + /* no return value */ + inet6_register_protosw(&smc_inet6_protosw); +#endif } static_branch_enable(&tcp_have_smc); return 0; +out_proto_register: + inet_unregister_protosw(&smc_inet_protosw); + proto_unregister(&smc_inet_prot); out_proc: smc_proc_exit(); out_ib: @@ -4830,6 +4843,9 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); inet_unregister_protosw(&smc_inet_protosw); +#if IS_ENABLED(CONFIG_IPV6) + inet6_unregister_protosw(&smc_inet6_protosw); +#endif smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); @@ -4841,6 +4857,9 @@ static void __exit smc_exit(void) proto_unregister(&smc_proto6); proto_unregister(&smc_proto); proto_unregister(&smc_inet_prot); +#if IS_ENABLED(CONFIG_IPV6) + proto_unregister(&smc_inet6_prot); +#endif smc_pnet_exit(); smc_nl_exit(); smc_clc_exit(); @@ -4860,4 +4879,5 @@ MODULE_ALIAS_NETPROTO(PF_SMC); * understanding of enum type(IPPROTO_SMC or SOCK_STREAM) */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 263, 1); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 263, 1); MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME); diff --git a/net/smc/bpf_smc.c b/net/smc/bpf_smc.c index 5c569b1f0df916b7710dd48455a5c0abf064b9bd..525c6f1c669bcfb767ee2217cd73a0eaea2736f2 100644 --- a/net/smc/bpf_smc.c +++ b/net/smc/bpf_smc.c @@ -77,7 +77,6 @@ int smc_sock_register_negotiator_ops(struct smc_sock_negotiator_ops *ops) spin_unlock(&smc_sock_negotiator_list_lock); return ret; } -EXPORT_SYMBOL_GPL(smc_sock_register_negotiator_ops); /* unregister ops */ void smc_sock_unregister_negotiator_ops(struct smc_sock_negotiator_ops *ops) @@ -91,7 +90,6 @@ void smc_sock_unregister_negotiator_ops(struct smc_sock_negotiator_ops *ops) */ 
synchronize_rcu(); } -EXPORT_SYMBOL_GPL(smc_sock_unregister_negotiator_ops); int smc_sock_update_negotiator_ops(struct smc_sock_negotiator_ops *ops, struct smc_sock_negotiator_ops *old_ops) @@ -126,7 +124,6 @@ int smc_sock_update_negotiator_ops(struct smc_sock_negotiator_ops *ops, synchronize_rcu(); return 0; } -EXPORT_SYMBOL_GPL(smc_sock_update_negotiator_ops); /* assign ops to sock */ int smc_sock_assign_negotiator_ops(struct smc_sock *smc, const char *name) diff --git a/net/smc/smc.h b/net/smc/smc.h index e91c5ef1c12d0749466879584d7b22b2d94b3b63..ac1c0a2cf532b343265006575acd85a37890fe21 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -35,7 +35,7 @@ extern struct proto smc_proto6; extern bool reserve_mode; extern u16 rsvd_ports_base; -static __always_inline bool smc_sock_is_inet_sock(struct sock *sk) +static __always_inline bool smc_sock_is_inet_sock(const struct sock *sk) { return inet_sk(sk)->is_icsk; } @@ -175,4 +175,33 @@ int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb); int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info); int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info); +static inline bool smc_sock_flag(const struct sock *sk, enum sock_flags flag) +{ + if (smc_sock_is_inet_sock(sk)) { + switch (flag) { + case SOCK_DEAD: + case SOCK_DONE: + return test_bit(flag, &smc_sk(sk)->smc_sk_flags); + default: + break; + } + } + return sock_flag(sk, flag); +} + +static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag) +{ + if (smc_sock_is_inet_sock(sk)) { + switch (flag) { + case SOCK_DEAD: + case SOCK_DONE: + __set_bit(flag, &smc_sk(sk)->smc_sk_flags); + return; + default: + break; + } + } + sock_set_flag(sk, flag); +} + #endif /* __SMC_H */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index c25bc9b64f3ac2f3836ff7f8fe9f52e6c61d30b4..390701c17ed3892794631da841405200edd5a2ea 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -91,12 +91,12 @@ void smc_cdc_tx_handler_rwwi(struct ib_wc *wc) wr_id.data = wc->wr_id; read_lock_bh(&lgr->conns_lock); - conn = smc_lgr_find_conn(wr_id.token, lgr); + smc = smc_lgr_get_sock(wr_id.token, lgr); read_unlock_bh(&lgr->conns_lock); - if (!conn) + if (!smc) return; - smc = container_of(conn, struct smc_sock, conn); + conn = &smc->conn; bh_lock_sock(&smc->sk); if (!wc->status) { @@ -125,6 +125,7 @@ void smc_cdc_tx_handler_rwwi(struct ib_wc *wc) smc_tx_sndbuf_nonfull(smc); bh_unlock_sock(&smc->sk); + sock_put(&smc->sk); /* sock_hold in smc_lgr_get_sock */ } int smc_cdc_get_free_slot(struct smc_connection *conn, @@ -354,7 +355,7 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, /* new data included urgent business */ smc_curs_copy(&conn->urg_curs, &conn->local_rx_ctrl.prod, conn); conn->urg_state = SMC_URG_VALID; - if (!sock_flag(&smc->sk, SOCK_URGINLINE)) + if (!smc_sock_flag(&smc->sk, SOCK_URGINLINE)) /* we'll skip the urgent byte, so don't account for it */ (*diff_prod)--; base = (char *)conn->rmb_desc->cpu_addr + conn->rx_off; @@ -443,7 +444,7 @@ static void __smc_cdc_msg_recv_action(struct smc_sock *smc, smc->sk.sk_shutdown |= RCV_SHUTDOWN; if (smc->clcsock && smc->clcsock->sk) smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; - sock_set_flag(&smc->sk, SOCK_DONE); + smc_sock_set_flag(&smc->sk, SOCK_DONE); sock_hold(&smc->sk); /* sock_put in close_work */ if (!queue_work(smc_close_wq, &conn->close_work)) sock_put(&smc->sk); @@ -670,14 +671,12 @@ void smc_cdc_rx_handler_rwwi(struct ib_wc *wc) imm_msg.imm_data = 
be32_to_cpu(wc->ex.imm_data); read_lock_bh(&lgr->conns_lock); - conn = smc_lgr_find_conn(imm_msg.hdr.token, lgr); + smc = smc_lgr_get_sock(imm_msg.hdr.token, lgr); read_unlock_bh(&lgr->conns_lock); - if (!conn) + if (!smc) return; - smc = container_of(conn, struct smc_sock, conn); - - sock_hold(&smc->sk); + conn = &smc->conn; bh_lock_sock(&smc->sk); diff_prod = wc->byte_len; if (diff_prod) @@ -702,7 +701,7 @@ } bh_unlock_sock(&smc->sk); - sock_put(&smc->sk); /* no free sk in softirq-context */ + sock_put(&smc->sk); /* sock_hold in smc_lgr_get_sock */ } static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = { diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 48945395ae524e6addca2c241e263a4fa05f9d19..2f87618380cbe32d4539fa87aefa2e79dfeae8c5 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -791,6 +791,12 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) int len, send_len; struct kvec vec; + if (smc->sent_confirm_accept) { + pr_warn_once("smc: decline [%d] dropped due to confirm_accept already sent\n", + peer_diag_info); + return -EPROTO; + } + dclc_v1 = (struct smc_clc_msg_decline *)&dclc; memset(&dclc, 0, sizeof(dclc)); memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -1174,6 +1180,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, } vec[i].iov_base = &trl; vec[i++].iov_len = sizeof(trl); + smc->sent_confirm_accept = true; return kernel_sendmsg(smc->clcsock, &msg, vec, 1, ntohs(clc->hdr.length)); } diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index ebc2d040cc6919b9a47b27eaf1f206a656131c0f..880fa2121070f378bfcd8a860bb79a528b364f99 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -27,11 +27,12 @@ void smc_clcsock_release(struct smc_sock *smc) { struct socket *tcp; + if (smc->listen_smc && current_work() != &smc->smc_listen_work) + cancel_work_sync(&smc->smc_listen_work); + if (smc_sock_is_inet_sock(&smc->sk)) return; - if (smc->listen_smc && current_work() != &smc->smc_listen_work) - cancel_work_sync(&smc->smc_listen_work); mutex_lock(&smc->clcsock_release_lock); if (smc->clcsock) { tcp = smc->clcsock; @@ -130,7 +131,8 @@ static void smc_close_cancel_work(struct smc_sock *smc) struct sock *sk = &smc->sk; release_sock(sk); - cancel_work_sync(&smc->conn.close_work); + if (cancel_work_sync(&smc->conn.close_work)) + sock_put(sk); cancel_delayed_work_sync(&smc->conn.tx_work); lock_sock(sk); } @@ -146,10 +148,7 @@ void smc_close_active_abort(struct smc_sock *smc) if (smc_sk_state(sk) != SMC_INIT) { /* sock locked */ if (smc_sock_is_inet_sock(sk)) { - sk->sk_err = ECONNABORTED; - /* This barrier is coupled with smp_rmb() in tcp_poll() */ - smp_wmb(); - sk->sk_error_report(sk); + smc_inet_sock_abort(sk); } else if (smc->clcsock && smc->clcsock->sk) { sk->sk_err = ECONNABORTED; tcp_abort(smc->clcsock->sk, ECONNABORTED); @@ -165,6 +164,7 @@ if (smc_sk_state(sk) != SMC_PEERABORTWAIT) break; smc_sk_set_state(sk, SMC_CLOSED); + smc_conn_free(&smc->conn); sock_put(sk); /* (postponed) passive closing */ break; case SMC_PEERCLOSEWAIT1: @@ -195,7 +195,8 @@ break; } - sock_set_flag(sk, SOCK_DEAD); + smc_sock_set_flag(sk, SOCK_DEAD); + sk->sk_state_change(sk); if (release_clcsock) { @@ -223,7 +224,7 @@ int smc_close_active(struct smc_sock *smc) int rc1 = 0; timeout = current->flags & PF_EXITING ? - 0 : sock_flag(sk, SOCK_LINGER) ? 
+ 0 : smc_sock_flag(sk, SOCK_LINGER) ? sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; old_state = smc_sk_state(sk); @@ -319,6 +320,7 @@ break; case SMC_PEERABORTWAIT: smc_sk_set_state(sk, SMC_CLOSED); + sock_put(sk); /* (postponed) passive closing */ break; case SMC_CLOSED: /* nothing to do, add tracing in future patch */ @@ -363,6 +365,7 @@ static void smc_close_passive_abort_received(struct smc_sock *smc) break; case SMC_PEERABORTWAIT: smc_sk_set_state(sk, SMC_CLOSED); + sock_put(sk); /* passive closing */ break; case SMC_PROCESSABORT: /* nothing to do, add tracing in future patch */ @@ -416,7 +419,7 @@ static void smc_close_passive_work(struct work_struct *work) case SMC_PEERCLOSEWAIT2: if (!smc_cdc_rxed_any_close(conn)) break; - if (sock_flag(sk, SOCK_DEAD) && + if (smc_sock_flag(sk, SOCK_DEAD) && smc_close_sent_any_close(conn)) { /* smc_release has already been called locally */ smc_sk_set_state(sk, SMC_CLOSED); @@ -453,7 +456,7 @@ if (old_state != smc_sk_state(sk)) { sk->sk_state_change(sk); if ((smc_sk_state(sk) == SMC_CLOSED) && - (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) { + (smc_sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) { smc_conn_free(conn); if (smc->clcsock) release_clcsock = true; @@ -474,7 +477,7 @@ int smc_close_shutdown_write(struct smc_sock *smc) int rc = 0; timeout = current->flags & PF_EXITING ? - 0 : sock_flag(sk, SOCK_LINGER) ? + 0 : smc_sock_flag(sk, SOCK_LINGER) ? sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; old_state = smc_sk_state(sk); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e97cf71cebcaf15ad0f1c50c22c737587b7bfe1f..a4a4117d5d06a7597bfd6af6fc0310972071e1b4 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -40,6 +40,8 @@ #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) +#define SMC_RTOKEN_UNINITIALIZED -1 + struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), .list = LIST_HEAD_INIT(smc_lgr_list.list), @@ -813,10 +815,13 @@ int smcr_iw_net_reserve_ports(struct net *net) goto release; } } - pr_info_ratelimited("smc: netns %pK reserved ports for eRDMA OOB\n", net); + pr_info_ratelimited("smc: netns %pK reserved ports [%d ~ %d] for eRDMA OOB\n", + net, ports_base, ports_base + SMC_IWARP_RSVD_PORTS_NUM - 1); return 0; release: + pr_warn_ratelimited("smc: netns %pK failed to reserve port %d for eRDMA OOB\n", + net, ports_base + i); for (j = 0; j < i; j++) { sock_release(net->smc.rsvd_sock[j]); net->smc.rsvd_sock[j] = NULL; } @@ -832,7 +837,9 @@ void smcr_iw_net_release_ports(struct net *net) sock_release(net->smc.rsvd_sock[i]); net->smc.rsvd_sock[i] = NULL; } - pr_info_ratelimited("smc: netns %pK released ports used by eRDMA OOB\n", net); + pr_info_ratelimited("smc: netns %pK released ports [%d ~ %d] used by eRDMA OOB\n", + net, rsvd_ports_base, + rsvd_ports_base + SMC_IWARP_RSVD_PORTS_NUM - 1); } static void smcr_link_iw_extension(struct iw_ext_conn_param *iw_param, struct sock *clcsk) @@ -2032,6 +2039,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) &smc_lgr_list.lock; ini->first_contact_local = 1; role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; + conn->rtoken_idx = SMC_RTOKEN_UNINITIALIZED; if (role == SMC_CLNT && ini->first_contact_peer) /* create new link group as well */ goto create; @@ -2051,7 +2059,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || - (lgr->conns_num < lgr->max_conns && + (lgr->conns_num < lgr->max_conns && !lgr->terminating && !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ ini->first_contact_local = 0; @@ -2114,8 +2122,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) return rc; } -#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ -#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ +#define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 7 -> 2MB */ +#define SMCR_RMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 7 -> 2MB */ /* convert the RMB size into the compressed notation (minimum 16K, see * SMCD/R_DMBE_SIZES. */ static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { - const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; u8 compressed; if (size <= SMC_BUF_MIN_SIZE) @@ -2134,10 +2141,6 @@ compressed = min_t(u8, ilog2(size) + 1, is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); - if (!is_smcd && is_rmb) - /* RMBs are backed by & limited to max size of scatterlists */ - compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); - return compressed; } @@ -2693,17 +2696,40 @@ int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { struct smc_link_group *lgr = smc_get_lgr(lnk); + struct smc_sock *smc = NULL; + struct smc_connection *conn; u32 rkey = ntohl(nw_rkey); int i, j; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { + read_lock_bh(&lgr->conns_lock); + smc = smc_lgr_get_sock_by_rtoken(i, lgr); + read_unlock_bh(&lgr->conns_lock); + if (smc) + spin_lock_bh(&smc->conn.send_lock); + for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) { lgr->rtokens[i][j].rkey = 0; lgr->rtokens[i][j].dma_addr = 0; } clear_bit(i, lgr->rtokens_used_mask); + + if (smc) { + smc->conn.rtoken_idx = SMC_RTOKEN_UNINITIALIZED; + conn = &smc->conn; + if (!smc_cdc_rxed_any_close(&smc->conn)) { + /* make peer_conn_abort */ + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort = 1; + sock_hold(&smc->sk); /* sock_put in close_work */ + if (!queue_work(smc_close_wq, &smc->conn.close_work)) + sock_put(&smc->sk); + } + spin_unlock_bh(&smc->conn.send_lock); + /* sock_hold in smc_lgr_get_sock_by_rtoken */ + sock_put(&smc->sk); + } return 0; } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 57605e0563705dbbe66d9920eed2e77145068257..6338d762c7d4ff5bffc8ce668fdc5386eea07ac5 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -50,7 +50,12 @@ enum smc_link_state { /* possible states of a link */ #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ #define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */ - +#define SMC_WR_BUF_CNT 64 /* # of ctrl buffers per link, SMC_WR_BUF_CNT + * should not be less than 2 * SMC_RMBS_PER_LGR_MAX, + * since on average every connection holds at least + * two rq/sq credits; otherwise senders may stall + * waiting for credits 
in sending process. + */ struct smc_wr_buf { u8 raw[SMC_WR_BUF_SIZE]; }; @@ -123,11 +128,13 @@ struct smc_link { struct completion tx_ref_comp; atomic_t tx_inflight_credit; - struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ + struct smc_wr_buf *wr_rx_bufs[SMC_WR_BUF_CNT]; + /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ - dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ + dma_addr_t wr_rx_dma_addr[SMC_WR_BUF_CNT]; + /* DMA address of wr_rx_bufs */ u64 wr_rx_id; /* seq # of last recv WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ @@ -469,6 +476,52 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +/* Find the smc sock associated with the given alert token in the link group. + * Requires @conns_lock + * @token alert token to search for + * @lgr link group to search in + * Returns smc_sock associated with token if found, NULL otherwise. + * sock_put(&smc->sk) must be called after using the smc_sock. + */ +static inline struct smc_sock * +smc_lgr_get_sock(u32 token, struct smc_link_group *lgr) +{ + struct smc_connection *conn = NULL; + struct smc_sock *smc = NULL; + + conn = smc_lgr_find_conn(token, lgr); + if (!conn) + return NULL; + + smc = container_of(conn, struct smc_sock, conn); + sock_hold(&smc->sk); + return smc; +} + +/* Find the smc sock associated with the given rtoken_idx in the link group. + * Requires @conns_lock + * @rtoken_idx rtoken index to search for + * @lgr link group to search in + * Returns smc_sock associated with rtoken_idx if found, NULL otherwise. + * sock_put(&smc->sk) must be called after using the smc_sock. + */ +static inline struct smc_sock * +smc_lgr_get_sock_by_rtoken(int rtoken_idx, struct smc_link_group *lgr) +{ + struct smc_connection *cur, *tmp; + struct smc_sock *res = NULL; + + rbtree_postorder_for_each_entry_safe(cur, tmp, &lgr->conns_all, alert_node) { + if (cur->rtoken_idx == rtoken_idx) { + res = container_of(cur, struct smc_sock, conn); + sock_hold(&res->sk); + break; + } + } + + return res; +} + static inline bool smc_conn_lgr_valid(struct smc_connection *conn) { return conn->lgr && conn->alert_token_local; diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index a07f18379ac67cbe9529a223d2dd89551c6307d3..e2738d983b7077f9c9f00ebbc4d71c1a9d2ae079 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -25,6 +25,7 @@ struct smc_diag_dump_ctx { int pos[2]; + int inet_pos[2]; }; static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb) @@ -213,11 +214,18 @@ static int smc_diag_dump_inet_proto(struct inet_hashinfo *hashinfo, struct sk_bu { struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb); struct net *net = sock_net(skb->sk); - int snum = cb_ctx->pos[p_type]; + int snum = cb_ctx->inet_pos[p_type]; struct nlattr *bc = NULL; int rc = 0, num = 0, i; + struct proto *target_proto; struct sock *sk; +#if IS_ENABLED(CONFIG_IPV6) + target_proto = (p_type == SMCPROTO_SMC6) ? 
&smc_inet6_prot : &smc_inet_prot; +#else + target_proto = &smc_inet_prot; +#endif + for (i = 0; i < INET_LHTABLE_SIZE; i++) { struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; @@ -227,7 +235,7 @@ static int smc_diag_dump_inet_proto(struct inet_hashinfo *hashinfo, struct sk_bu sk_nulls_for_each(sk, node, &ilb->nulls_head) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_prot != &smc_inet_prot) + if (sk->sk_prot != target_proto) continue; if (num < snum) goto next_ls; @@ -258,7 +266,7 @@ static int smc_diag_dump_inet_proto(struct inet_hashinfo *hashinfo, struct sk_bu continue; if (sk->sk_state == TCP_NEW_SYN_RECV) continue; - if (sk->sk_prot != &smc_inet_prot) + if (sk->sk_prot != target_proto) continue; if (num < snum) goto next; @@ -273,7 +281,7 @@ static int smc_diag_dump_inet_proto(struct inet_hashinfo *hashinfo, struct sk_bu spin_unlock_bh(lock); } out: - cb_ctx->pos[p_type] += num; + cb_ctx->inet_pos[p_type] = num; return rc; } @@ -308,7 +316,7 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, out: read_unlock(&prot->h.smc_hash->lock); - cb_ctx->pos[p_type] += num; + cb_ctx->pos[p_type] = num; return rc; } @@ -317,13 +325,20 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) int rc = 0; rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC); - if (!rc) - rc = smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6); - if (!rc) - rc = smc_diag_dump_inet_proto(smc_inet_prot.h.hashinfo, skb, cb, SMCPROTO_SMC); + if (rc) + return rc; +#if IS_ENABLED(CONFIG_IPV6) + rc = smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6); + if (rc) + return rc; +#endif + rc = smc_diag_dump_inet_proto(smc_inet_prot.h.hashinfo, skb, cb, SMCPROTO_SMC); + if (rc) + return rc; #if IS_ENABLED(CONFIG_IPV6) - if (!rc) - rc = smc_diag_dump_inet_proto(smc_inet6_prot.h.hashinfo, skb, cb, SMCPROTO_SMC6); + rc = smc_diag_dump_inet_proto(smc_inet6_prot.h.hashinfo, skb, cb, SMCPROTO_SMC6); + if (rc) + return rc; #endif return skb->len; } diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 78d9cc6463dbcf2bad0ffd93cf47a02830d101ec..f333c09c3d6bb2734d54c7a53bf8a7a7eb361b03 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -23,7 +23,7 @@ #define SMC_GID_SIZE sizeof(union ib_gid) #define SMC_IB_MAX_SEND_SGE 2 -#define SMC_IWARP_RSVD_PORTS_BASE 33800 +#define SMC_IWARP_RSVD_PORTS_BASE 65500 struct smc_ib_devices { /* list of smc ib devices definition */ struct list_head list; diff --git a/net/smc/smc_inet.c b/net/smc/smc_inet.c index badd6036ebd6683feffc12af14bcb59ca3b41071..d430890a2314df3adf7e5343385640bdc94624e1 100644 --- a/net/smc/smc_inet.c +++ b/net/smc/smc_inet.c @@ -25,11 +25,17 @@ static struct timewait_sock_ops smc_timewait_sock_ops = { .twsk_destructor = tcp_twsk_destructor, }; +static struct timewait_sock_ops smc6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_unique = tcp_twsk_unique, + .twsk_destructor = tcp_twsk_destructor, +}; + struct proto smc_inet_prot = { .name = "SMC", .owner = THIS_MODULE, .close = tcp_close, - .pre_connect = NULL, + .pre_connect = NULL, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = smc_inet_csk_accept, @@ -43,7 +49,7 @@ struct proto smc_inet_prot = { .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, - .backlog_rcv = tcp_v4_do_rcv, + .backlog_rcv = tcp_v4_do_rcv, .release_cb = smc_inet_sock_proto_release_cb, .hash = inet_hash, .unhash = inet_unhash, @@ -62,7 +68,6 @@ struct proto smc_inet_prot = { .obj_size = 
sizeof(struct smc_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &smc_timewait_sock_ops, - /* tcp_conn_request will use tcp_request_sock_ops */ .rsk_prot = NULL, .h.hashinfo = &tcp_hashinfo, .no_autobind = true, @@ -117,7 +122,7 @@ struct proto smc_inet6_prot = { .name = "SMCv6", .owner = THIS_MODULE, .close = tcp_close, - .pre_connect = NULL, + .pre_connect = NULL, .connect = NULL, .disconnect = tcp_disconnect, .accept = smc_inet_csk_accept, @@ -149,8 +154,7 @@ struct proto smc_inet6_prot = { .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct smc_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, - .twsk_prot = &smc_timewait_sock_ops, - /* tcp_conn_request will use tcp_request_sock_ops */ + .twsk_prot = &smc6_timewait_sock_ops, .rsk_prot = NULL, .h.hashinfo = &tcp_hashinfo, .no_autobind = true, @@ -201,7 +205,7 @@ struct inet_protosw smc_inet6_protosw = { }; #endif -int smc_inet_sock_switch_negotiation_state_locked(struct sock *sk, int except, int target) +int smc_inet_sock_move_state_locked(struct sock *sk, int except, int target) { struct smc_sock *smc = smc_sk(sk); int cur; @@ -214,11 +218,9 @@ int smc_inet_sock_switch_negotiation_state_locked(struct sock *sk, int except, i case SMC_NEGOTIATION_TBD: switch (target) { case SMC_NEGOTIATION_PREPARE_SMC: - /* same as passive closing */ - sock_hold(sk); - fallthrough; case SMC_NEGOTIATION_NO_SMC: isck_smc_negotiation_store(smc, target); + sock_hold(sk); /* sock hold for passive closing */ return target; default: break; @@ -227,8 +229,6 @@ int smc_inet_sock_switch_negotiation_state_locked(struct sock *sk, int except, i case SMC_NEGOTIATION_PREPARE_SMC: switch (target) { case SMC_NEGOTIATION_NO_SMC: - sock_put(sk); /* sock hold in SMC_NEGOTIATION_PREPARE_SMC */ - fallthrough; case SMC_NEGOTIATION_SMC: isck_smc_negotiation_store(smc, target); return target; @@ -276,7 +276,6 @@ int smc_inet_sock_init(void) #if IS_ENABLED(CONFIG_IPV6) smc_inet6_prot.pre_connect = tcp_v6prot->pre_connect; smc_inet6_prot.connect = tcp_v6prot->connect; - smc_inet6_prot.init = tcp_v6prot->init; smc_inet6_prot.destroy = tcp_v6prot->destroy; smc_inet6_prot.backlog_rcv = tcp_v6prot->backlog_rcv; smc_inet6_prot.hash = tcp_v6prot->hash; @@ -308,6 +307,23 @@ static int smc_inet_clcsock_sendmsg(struct socket *sock, struct msghdr *msg, siz return tcp_sendmsg_locked(sk, msg, len); } +int smc_sk_wait_tcp_data(struct sock *sk, long *timeo, const struct sk_buff *skb) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int rc; + + lock_sock(sk); + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb || + isck_smc_negotiation_get_flags(smc_sk(sk)) & SMC_NEGOTIATION_ABORT_FLAG, + &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + release_sock(sk); + return rc; +} + static int smc_inet_clcsock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { @@ -329,23 +345,32 @@ static int smc_inet_clcsock_recvmsg(struct socket *sock, struct msghdr *msg, siz timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - if (current_work() == &smc->smc_listen_work) { - err = tcp_recvmsg(sk, msg, len, flags & MSG_DONTWAIT, - flags & ~MSG_DONTWAIT, &addr_len); - } else { - /* Locked, see more details in smc_inet_clcsock_sendmsg() */ + /* Locked, see more details in smc_inet_clcsock_sendmsg() */ + if (current_work() != &smc->smc_listen_work) release_sock(sock->sk); - err = tcp_recvmsg(sk, msg, len, flags & MSG_DONTWAIT, - flags & 
~MSG_DONTWAIT, &addr_len); +again: + /* recv nonblock */ + err = tcp_recvmsg(sk, msg, len, /* non block */1, flags & ~MSG_DONTWAIT, &addr_len); + if (err != -EAGAIN || !timeo) + goto out; + + smc_sk_wait_tcp_data(sk, &timeo, NULL); + if (isck_smc_negotiation_get_flags(smc_sk(sk)) & SMC_NEGOTIATION_ABORT_FLAG) { + /* TODO: THIS SHOULD NOT report as handshake error */ + pr_warn_once("smc: THIS SHOULD NOT report as handshake error\n"); + err = -ECONNABORTED; + goto out; + } + goto again; +out: + if (current_work() != &smc->smc_listen_work) { lock_sock(sock->sk); /* since we release sock before, there might be state changed */ - if (smc_sk_state(&smc->sk) != SMC_INIT) + if (err >= 0 && smc_sk_state(&smc->sk) != SMC_INIT) err = -EPIPE; } - if (err >= 0) msg->msg_namelen = addr_len; - return err; } diff --git a/net/smc/smc_inet.h b/net/smc/smc_inet.h index ec235d6646fe836c1b8ac6e7cd57dc808fc90367..c96aa522c968e874537962449d677371ea5db3cb 100644 --- a/net/smc/smc_inet.h +++ b/net/smc/smc_inet.h @@ -31,6 +31,8 @@ extern struct inet_protosw smc_inet6_protosw; extern const struct proto_ops smc_inet_clcsock_ops; +void smc_inet_sock_state_change(struct sock *sk); + enum smc_inet_sock_negotiation_state { /* When creating an AF_SMC sock, the state field will be initialized to 0 by default, * which is only for logical compatibility with that situation */ @@ -64,17 +66,19 @@ /* flags */ SMC_NEGOTIATION_LISTEN_FLAG = 0x01, + SMC_NEGOTIATION_ABORT_FLAG = 0x02, }; static __always_inline void isck_smc_negotiation_store(struct smc_sock *smc, enum smc_inet_sock_negotiation_state state) { - smc->isck_smc_negotiation = (state | (smc->isck_smc_negotiation & 0x0f)); + WRITE_ONCE(smc->isck_smc_negotiation, + state | (READ_ONCE(smc->isck_smc_negotiation) & 0x0f)); } static __always_inline int isck_smc_negotiation_load(struct smc_sock *smc) { - return smc->isck_smc_negotiation & 0xf0; + return READ_ONCE(smc->isck_smc_negotiation) & 0xf0; } static __always_inline void isck_smc_negotiation_set_flags(struct smc_sock *smc, int flags) @@ -87,7 +91,7 @@ static __always_inline int isck_smc_negotiation_get_flags(struct smc_sock *smc) { return smc->isck_smc_negotiation & 0x0f; } -static inline int smc_inet_sock_set_syn_smc(struct sock *sk) +static inline int smc_inet_sock_set_syn_smc(struct sock *sk, int flags) { int rc = 0; @@ -102,12 +106,30 @@ */ if (isck_smc_negotiation_load(smc_sk(sk)) == SMC_NEGOTIATION_TBD) { tcp_sk(sk)->syn_smc = 1; + if (flags & O_NONBLOCK) + smc_clcsock_replace_cb(&sk->sk_state_change, + smc_inet_sock_state_change, + &smc_sk(sk)->clcsk_state_change); rc = 1; } read_unlock_bh(&sk->sk_callback_lock); return rc; } +static inline void smc_inet_sock_abort(struct sock *sk) +{ + write_lock_bh(&sk->sk_callback_lock); + if (isck_smc_negotiation_get_flags(smc_sk(sk)) & SMC_NEGOTIATION_ABORT_FLAG) { + write_unlock_bh(&sk->sk_callback_lock); + return; + } + isck_smc_negotiation_set_flags(smc_sk(sk), SMC_NEGOTIATION_ABORT_FLAG); + write_unlock_bh(&sk->sk_callback_lock); + sk->sk_error_report(sk); +} + +int smc_inet_sock_move_state_locked(struct sock *sk, int except, int target); + static inline int smc_inet_sock_try_fallback_fast(struct sock *sk, int abort) { struct smc_sock *smc = smc_sk(sk); @@ -116,20 +138,21 @@ write_lock_bh(&sk->sk_callback_lock); switch (isck_smc_negotiation_load(smc)) { case SMC_NEGOTIATION_TBD: - if (!abort && 
tcp_sk(sk)->syn_smc) - break; /* fallback is meanless for listen socks */ if (unlikely(inet_sk_state_load(sk) == TCP_LISTEN)) break; + if (abort) + isck_smc_negotiation_set_flags(smc_sk(sk), SMC_NEGOTIATION_ABORT_FLAG); + else if (tcp_sk(sk)->syn_smc) + break; /* In the implementation of INET sock, syn_smc will only be determined after * smc_inet_connect or smc_inet_listen, which means that if there is * no syn_smc set, we can easily fallback. */ - isck_smc_negotiation_store(smc, SMC_NEGOTIATION_NO_SMC); + smc_inet_sock_move_state_locked(sk, SMC_NEGOTIATION_TBD, SMC_NEGOTIATION_NO_SMC); + smc_sk_set_state(sk, SMC_ACTIVE); fallthrough; case SMC_NEGOTIATION_NO_SMC: - if (smc->clcsk_state_change) - sk->sk_state_change = smc->clcsk_state_change; syn_smc = 0; default: break; @@ -188,15 +211,13 @@ static __always_inline struct proto *smc_inet_get_tcp_prot(int family) return NULL; } -int smc_inet_sock_switch_negotiation_state_locked(struct sock *sk, int except, int target); - -static __always_inline int smc_inet_sock_switch_negotiation_state(struct sock *sk, - int except, int target) +static __always_inline int smc_inet_sock_move_state(struct sock *sk, + int except, int target) { int rc; write_lock_bh(&sk->sk_callback_lock); - rc = smc_inet_sock_switch_negotiation_state_locked(sk, except, target); + rc = smc_inet_sock_move_state_locked(sk, except, target); write_unlock_bh(&sk->sk_callback_lock); return rc; } diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 9856ec6cde302a0148877041d219940178eaaed0..f0e4b081642c3c5c3e59dafdcce71fa0cff82fef 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -67,7 +67,7 @@ static int smc_rx_update_consumer(struct smc_sock *smc, if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) { diff = smc_curs_comp(conn->rmb_desc->len, &cons, &conn->urg_curs); - if (sock_flag(sk, SOCK_URGINLINE)) { + if (smc_sock_flag(sk, SOCK_URGINLINE)) { if (diff == 0) { force = true; rc = 1; @@ -283,7 +283,7 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, struct sock *sk = &smc->sk; int rc = 0; - if (sock_flag(sk, SOCK_URGINLINE) || + if (smc_sock_flag(sk, SOCK_URGINLINE) || !(conn->urg_state == SMC_URG_VALID) || conn->urg_state == SMC_URG_READ) return -EINVAL; @@ -405,7 +405,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, break; } if (smc_sk_state(sk) == SMC_CLOSED) { - if (!sock_flag(sk, SOCK_DONE)) { + if (!smc_sock_flag(sk, SOCK_DONE)) { /* This occurs when user tries to read * from never connected socket. 
diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c
index ed99d2f49195eb1fdc954a0893883c76e6c8fbcd..3d8dfa15ab511a691825e0dad75ad41b730619c6 100644
--- a/net/smc/smc_stats.c
+++ b/net/smc/smc_stats.c
@@ -32,7 +32,7 @@ int smc_stats_init(struct net *net)
 	net->smc.smc_stats = alloc_percpu(struct smc_stats);
 	if (!net->smc.smc_stats)
 		goto err_stats;
-	mutex_init(&net->smc.mutex_fback_rsn);
+	spin_lock_init(&net->smc.mutex_fback_rsn);
 	return 0;
 
 err_stats:
@@ -194,8 +194,12 @@ static int smc_nl_fill_stats_bufsize_data(struct sk_buff *skb,
 			      stats_pload->buf[SMC_BUF_1024K],
 			      SMC_NLA_STATS_PLOAD_PAD))
 		goto errattr;
-	if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_G_1024K,
-			      stats_pload->buf[SMC_BUF_G_1024K],
+	if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_2048K,
+			      stats_pload->buf[SMC_BUF_2048K],
+			      SMC_NLA_STATS_PLOAD_PAD))
+		goto errattr;
+	if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_G_2048K,
+			      stats_pload->buf[SMC_BUF_G_2048K],
 			      SMC_NLA_STATS_PLOAD_PAD))
 		goto errattr;
 
@@ -407,8 +411,8 @@ static int smc_nl_get_fback_details(struct sk_buff *skb,
 	if (nla_put_u32(skb, SMC_NLA_FBACK_STATS_RSN_CODE,
 			trgt_arr[pos].fback_code))
 		goto errattr;
-	if (nla_put_u16(skb, SMC_NLA_FBACK_STATS_RSN_CNT,
-			trgt_arr[pos].count))
+	if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_RSN_CNT,
+			      trgt_arr[pos].count, SMC_NLA_FBACK_STATS_PAD))
 		goto errattr;
 
 	cb_ctx->pos[2] = cnt_reported;
@@ -433,7 +437,7 @@ int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb)
 	int snum = cb_ctx->pos[0];
 	bool is_srv = true;
 
-	mutex_lock(&net->smc.mutex_fback_rsn);
+	spin_lock_bh(&net->smc.mutex_fback_rsn);
 	for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) {
 		if (k < snum)
 			continue;
@@ -452,7 +456,7 @@ int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb)
 		if (rc_clnt == -ENODATA && rc_srv == -ENODATA)
 			break;
 	}
-	mutex_unlock(&net->smc.mutex_fback_rsn);
+	spin_unlock_bh(&net->smc.mutex_fback_rsn);
 	cb_ctx->pos[1] = skip_serv;
 	cb_ctx->pos[0] = k;
 	return skb->len;
diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h
index d63fb1297501f8fc511bc40aa6bd039c36666177..3328a32bec9f5f228471bb3baf16b4667f7d454d 100644
--- a/net/smc/smc_stats.h
+++ b/net/smc/smc_stats.h
@@ -30,13 +30,14 @@ enum {
 	SMC_BUF_256K,
 	SMC_BUF_512K,
 	SMC_BUF_1024K,
-	SMC_BUF_G_1024K,
+	SMC_BUF_2048K,
+	SMC_BUF_G_2048K,
 	SMC_BUF_MAX,
 };
 
 struct smc_stats_fback {
 	int	fback_code;
-	u16	count;
+	u64	count;
 };
 
 struct smc_stats_rsn {
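With 2MiB buffers now allowed, the histogram gains a dedicated 2048K bucket and the old catch-all G_1024K becomes G_2048K; the per-reason fallback counters also widen from u16 to u64 so they cannot wrap, which forces nla_put_u64_64bit() on the netlink side. The mutex_fback_rsn lock becoming a BH-safe spinlock suggests the fallback accounting can now run in softirq context, where sleeping on a mutex is not allowed. The bucketing rule itself is not in this hunk; below is a sketch under the assumption that buckets simply double from 8KiB up (names hypothetical):

#include <stdint.h>
#include <stdio.h>

enum {
	BUF_8K, BUF_16K, BUF_32K, BUF_64K, BUF_128K,
	BUF_256K, BUF_512K, BUF_1024K, BUF_2048K,
	BUF_G_2048K,		/* anything above 2MiB */
	BUF_MAX,
};

/* map a payload size to its power-of-two bucket, saturating above 2MiB */
static int pload_bucket(uint64_t size)
{
	uint64_t cap = 8 * 1024;	/* first bucket covers <= 8KiB */
	int idx = BUF_8K;

	while (size > cap && idx < BUF_2048K) {
		cap <<= 1;
		idx++;
	}
	return size > cap ? BUF_G_2048K : idx;
}

int main(void)
{
	uint64_t counts[BUF_MAX] = { 0 };	/* u64: wide enough not to wrap */
	uint64_t samples[] = { 4096, 300 * 1024, 2 * 1024 * 1024, 3 * 1024 * 1024 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		counts[pload_bucket(samples[i])]++;
	for (int b = 0; b < BUF_MAX; b++)
		printf("bucket %d: %llu\n", b, (unsigned long long)counts[b]);
	return 0;
}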
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 645abd73453cd029f73125513b0e5c9632797902..22104664bbc8938b58fff828f1e7f30ef1fee541 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -341,6 +341,11 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
 	if (!lgr->use_rwwi)
 		rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link);
 	rdma_wr->wr.num_sge = num_sges;
+	/* rtoken might be deleted if peer freed connection */
+	if (conn->rtoken_idx < 0) {
+		pr_warn_ratelimited("smc: unexpected sends during connection termination flow (rtoken idx invalid)\n");
+		return -EINVAL;
+	}
 	rdma_wr->remote_addr =
 		lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr +
 		/* RMBE within RMB */
@@ -349,9 +354,8 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
 		peer_rmbe_offset;
 	rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey;
 	/* rtoken might be deleted if peer freed connection */
-	if (!rdma_wr->rkey ||
-	    (rdma_wr->remote_addr == (conn->tx_off + peer_rmbe_offset))) {
-		pr_warn_ratelimited("smc: unexpected sends during connection termination flow\n");
+	if (rdma_wr->remote_addr == (conn->tx_off + peer_rmbe_offset)) {
+		pr_warn_ratelimited("smc: unexpected sends during connection termination flow (addr invalid)\n");
 		return -EINVAL;
 	}
 	rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL);
@@ -376,7 +380,8 @@ static int __smcr_tx_rdma_writes_rwwi(struct smc_connection *conn, int dst_off,
 	u8 saved_credits = 0;
 	bool cr_flag = false;
 	u8 conn_state_flags;
-	int diff_cons, rc;
+	int diff_cons = 0;
+	int rc;
 
 	BUILD_BUG_ON_MSG(sizeof(union smc_wr_imm_msg) > sizeof(__be32),
 			 "sizeof(union smc_wr_imm_msg) can not exceed the size of imm_data(__be32)");
@@ -409,10 +414,11 @@ static int __smcr_tx_rdma_writes_rwwi(struct smc_connection *conn, int dst_off,
 	if (conn_state_flags || prod_flags != urg_flags)
 		imm_msg.hdr.opcode = SMC_WR_OP_CTRL;
 
-	smc_curs_copy(&cfed, &conn->local_tx_ctrl.cons, conn);
-	smc_curs_copy(&cons_old, &conn->rx_curs_confirmed, conn);
-	diff_cons = smc_curs_diff(conn->rmb_desc->len, &cons_old,
-				  &conn->local_tx_ctrl.cons);
+	if (conn->rmb_desc) {
+		smc_curs_copy(&cfed, &conn->local_tx_ctrl.cons, conn);
+		smc_curs_copy(&cons_old, &conn->rx_curs_confirmed, conn);
+		diff_cons = smc_curs_diff(conn->rmb_desc->len, &cons_old, &cfed);
+	}
 
 	switch (imm_msg.hdr.opcode) {
 	case SMC_WR_OP_DATA:
 		if (diff_cons > SMC_DATA_MAX_DIFF_CONS)
@@ -462,7 +468,7 @@ static int __smcr_tx_rdma_writes_rwwi(struct smc_connection *conn, int dst_off,
 		/* do not update rx_curs_confirmed if all flags equal to 0,
 		 * since diff_cons will not be carried by imm_data in this case.
 		 */
-		if (update_rx_curs_confirmed)
+		if (update_rx_curs_confirmed && conn->rmb_desc)
 			smc_curs_add(conn->rmb_desc->len, &conn->rx_curs_confirmed, diff_cons);
 	} else {
 		smc_wr_rx_put_credits(link, saved_credits);
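Both smc_tx.c fixes follow the same rule: validate before dereferencing. conn->rtoken_idx is now tested for a negative value before it is used as an array index (sanity-checking rkey after the lookup was already too late, since indexing with a negative value is undefined behavior), and the cursor arithmetic only runs while conn->rmb_desc is still present. A standalone sketch of the check-before-index pattern, names hypothetical:

#include <stdio.h>

#define NTOKENS 4

struct rtoken { unsigned long dma_addr; unsigned int rkey; };

struct conn_like {
	int rtoken_idx;		/* set to -1 when the peer frees the connection */
	struct rtoken rtokens[NTOKENS];
};

/* validate the index first: a negative index must never reach the array */
static int post_write(struct conn_like *c, unsigned long *addr_out)
{
	if (c->rtoken_idx < 0) {
		fprintf(stderr, "send during termination (rtoken idx invalid)\n");
		return -1;
	}
	*addr_out = c->rtokens[c->rtoken_idx].dma_addr;
	return 0;
}

int main(void)
{
	struct conn_like c = { .rtoken_idx = -1 };
	unsigned long addr;

	if (post_write(&c, &addr))
		fprintf(stderr, "write rejected, as expected\n");
	return 0;
}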
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index cede8da85bc880ec3130db480b73082782a0d553..85435426aaff22de6dbdca72818c21a4eee5f8f6 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -419,8 +419,6 @@ int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
 static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
 {
 	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
-	int rx_buf_size = (link->lgr->smc_version == SMC_V2) ?
-			  SMC_WR_BUF_V2_SIZE : SMC_WR_BUF_SIZE;
 	struct smc_wr_rx_handler *handler;
 	struct smc_wr_rx_hdr *wr_rx;
 	u64 temp_wr_id;
@@ -430,7 +428,7 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
 		return; /* short message */
 	temp_wr_id = wc->wr_id / 2;
 	index = do_div(temp_wr_id, link->wr_rx_cnt);
-	wr_rx = (struct smc_wr_rx_hdr *)((u8 *)link->wr_rx_bufs + index * rx_buf_size);
+	wr_rx = (struct smc_wr_rx_hdr *)(link->wr_rx_bufs[index]);
 	hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
 		if (handler->type == wr_rx->type)
 			handler->handler(wc, wr_rx);
@@ -633,11 +631,8 @@ static void smc_wr_init_sge(struct smc_link *lnk)
 	for (i = 0; i < lnk->wr_rx_cnt; i++) {
 		int rx_msg_size = (lnk->lgr->smc_version == SMC_V2) ?
 				  SMC_WR_BUF_V2_SIZE : SMC_WR_TX_SIZE;
-		int rx_buf_size = (lnk->lgr->smc_version == SMC_V2) ?
-				  SMC_WR_BUF_V2_SIZE : SMC_WR_BUF_SIZE;
 
-		lnk->wr_rx_sges[i].addr =
-			lnk->wr_rx_dma_addr + i * rx_buf_size;
+		lnk->wr_rx_sges[i].addr = lnk->wr_rx_dma_addr[i];
 		lnk->wr_rx_sges[i].length = rx_msg_size;
 		lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
 		lnk->wr_rx_ibs[i].next = NULL;
@@ -656,6 +651,7 @@ void smc_wr_free_link(struct smc_link *lnk)
 	int rx_buf_size = (lnk->lgr->smc_version == SMC_V2) ?
 			  SMC_WR_BUF_V2_SIZE : SMC_WR_BUF_SIZE;
 	struct ib_device *ibdev;
+	int i;
 
 	if (!lnk->smcibdev)
 		return;
@@ -670,11 +666,12 @@ void smc_wr_free_link(struct smc_link *lnk)
 	percpu_ref_kill(&lnk->wr_tx_refs);
 	wait_for_completion(&lnk->tx_ref_comp);
 
-	if (lnk->wr_rx_dma_addr) {
-		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
-				    rx_buf_size * lnk->wr_rx_cnt,
-				    DMA_FROM_DEVICE);
-		lnk->wr_rx_dma_addr = 0;
+	for (i = 0; i < lnk->wr_rx_cnt; i++) {
+		if (lnk->wr_rx_dma_addr[i]) {
+			ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr[i],
+					    rx_buf_size, DMA_FROM_DEVICE);
+			lnk->wr_rx_dma_addr[i] = 0;
+		}
 	}
 	if (lnk->wr_tx_dma_addr) {
 		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
@@ -701,6 +698,8 @@ void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
 
 void smc_wr_free_link_mem(struct smc_link *lnk)
 {
+	int i;
+
 	kfree(lnk->wr_tx_v2_ib);
 	lnk->wr_tx_v2_ib = NULL;
 	kfree(lnk->wr_tx_v2_sge);
@@ -727,8 +726,10 @@ void smc_wr_free_link_mem(struct smc_link *lnk)
 	lnk->wr_tx_ibs = NULL;
 	kfree(lnk->wr_tx_bufs);
 	lnk->wr_tx_bufs = NULL;
-	kfree(lnk->wr_rx_bufs);
-	lnk->wr_rx_bufs = NULL;
+	for (i = 0; i < SMC_WR_BUF_CNT; i++) {
+		kfree(lnk->wr_rx_bufs[i]);
+		lnk->wr_rx_bufs[i] = NULL;
+	}
 }
 
 int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
@@ -746,15 +747,23 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
 {
 	int rx_buf_size = (link->lgr->smc_version == SMC_V2) ?
 			  SMC_WR_BUF_V2_SIZE : SMC_WR_BUF_SIZE;
+	int i, j;
 
 	/* allocate link related memory */
 	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
 	if (!link->wr_tx_bufs)
 		goto no_mem;
-	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT, rx_buf_size,
-				   GFP_KERNEL);
-	if (!link->wr_rx_bufs)
-		goto no_mem_wr_tx_bufs;
+
+	for (i = 0; i < SMC_WR_BUF_CNT; i++) {
+		link->wr_rx_bufs[i] = kzalloc(rx_buf_size, GFP_KERNEL);
+		if (!link->wr_rx_bufs[i]) {
+			for (j = i - 1; j >= 0; j--) {
+				kfree(link->wr_rx_bufs[j]);
+				link->wr_rx_bufs[j] = NULL;
+			}
+			goto no_mem_wr_tx_bufs;
+		}
+	}
 	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
 				  GFP_KERNEL);
 	if (!link->wr_tx_ibs)
@@ -835,7 +844,8 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
 no_mem_wr_tx_ibs:
 	kfree(link->wr_tx_ibs);
 no_mem_wr_rx_bufs:
-	kfree(link->wr_rx_bufs);
+	for (i = 0; i < SMC_WR_BUF_CNT; i++)
+		kfree(link->wr_rx_bufs[i]);
 no_mem_wr_tx_bufs:
 	kfree(link->wr_tx_bufs);
 no_mem:
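smc_wr_alloc_link_mem() now allocates wr_rx_bufs as SMC_WR_BUF_CNT separate buffers rather than one contiguous kcalloc() region, so a failure in the middle of the loop must free exactly the buffers already allocated before jumping to the shared no_mem_* exit. The same unwind pattern in standalone form (names hypothetical):

#include <stdlib.h>
#include <string.h>

#define NBUFS 64

/* allocate nbufs buffers of the given size; on failure free what was
 * already allocated and leave the array all-NULL again */
static int alloc_bufs(void *bufs[], int nbufs, size_t size)
{
	int i, j;

	for (i = 0; i < nbufs; i++) {
		bufs[i] = calloc(1, size);
		if (!bufs[i]) {
			for (j = i - 1; j >= 0; j--) {
				free(bufs[j]);
				bufs[j] = NULL;
			}
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	void *bufs[NBUFS];

	memset(bufs, 0, sizeof(bufs));
	if (alloc_bufs(bufs, NBUFS, 4096))
		return 1;
	for (int i = 0; i < NBUFS; i++)
		free(bufs[i]);
	return 0;
}

Resetting each freed slot to NULL keeps the rollback idempotent, which is what lets the later error labels (and smc_wr_free_link_mem() itself) walk the whole array unconditionally.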
@@ -879,17 +889,25 @@ int smc_wr_create_link(struct smc_link *lnk)
 	int rx_buf_size = (lnk->lgr->smc_version == SMC_V2) ?
 			  SMC_WR_BUF_V2_SIZE : SMC_WR_BUF_SIZE;
 	struct ib_device *ibdev = lnk->smcibdev->ibdev;
-	int rc = 0;
+	int i, j, rc = 0;
 
 	smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
 	lnk->wr_rx_id = 1;
-	lnk->wr_rx_dma_addr = ib_dma_map_single(
-		ibdev, lnk->wr_rx_bufs, rx_buf_size * lnk->wr_rx_cnt,
-		DMA_FROM_DEVICE);
-	if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
-		lnk->wr_rx_dma_addr = 0;
-		rc = -EIO;
-		goto out;
+
+	for (i = 0; i < lnk->wr_rx_cnt; i++) {
+		lnk->wr_rx_dma_addr[i] =
+			ib_dma_map_single(ibdev, lnk->wr_rx_bufs[i],
+					  rx_buf_size, DMA_FROM_DEVICE);
+		if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr[i])) {
+			lnk->wr_rx_dma_addr[i] = 0;
+			for (j = i - 1; j >= 0; j--) {
+				ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr[j],
+						    rx_buf_size, DMA_FROM_DEVICE);
+				lnk->wr_rx_dma_addr[j] = 0;
+			}
+			rc = -EIO;
+			goto out;
+		}
 	}
 	if (lnk->lgr->smc_version == SMC_V2) {
 		lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
@@ -940,10 +958,11 @@ int smc_wr_create_link(struct smc_link *lnk)
 				    DMA_TO_DEVICE);
 		lnk->wr_tx_v2_dma_addr = 0;
 	}
-	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
-			    rx_buf_size * lnk->wr_rx_cnt,
-			    DMA_FROM_DEVICE);
-	lnk->wr_rx_dma_addr = 0;
+	for (i = 0; i < lnk->wr_rx_cnt; i++) {
+		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr[i],
+				    rx_buf_size, DMA_FROM_DEVICE);
+		lnk->wr_rx_dma_addr[i] = 0;
+	}
 out:
 	return rc;
 }
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 18700de14e76378e6ea67ca664478076377cd12e..3b32fa8936900c02d6ed11120fc55951fad95446 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -19,13 +19,6 @@
 #include "smc.h"
 #include "smc_core.h"
 
-#define SMC_WR_BUF_CNT 64	/* # of ctrl buffers per link, SMC_WR_BUF_CNT
-				 * should not be less than 2 * SMC_RMBS_PER_LGR_MAX,
-				 * since every connection at least has two rq/sq
-				 * credits in average, otherwise may result in
-				 * waiting for credits in sending process.
-				 */
-
 #define SMC_WR_TX_WAIT_FREE_SLOT_TIME	(10 * HZ)
 #define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
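smc_wr_create_link() applies the matching change on the DMA side: each rx buffer is mapped individually, a mapping failure unmaps exactly the slots mapped so far, and the later error paths unmap per slot as well. Reduced to its skeleton, the shape is per-item rollback inside one stage plus goto-based unwind across stages, sketched below with hypothetical stand-ins for ib_dma_map_single()/ib_dma_unmap_single():

#include <stdio.h>

#define NSLOTS 8

/* hypothetical stand-ins for the IB DMA mapping helpers */
static long map_one(void *buf) { return buf ? (long)buf : 0; }
static void unmap_one(long addr) { (void)addr; }

static int create_link_like(void *bufs[], long addrs[])
{
	int i, j, rc = 0;

	/* stage 1: map every rx slot, unwinding partial work on failure */
	for (i = 0; i < NSLOTS; i++) {
		addrs[i] = map_one(bufs[i]);
		if (!addrs[i]) {
			for (j = i - 1; j >= 0; j--) {
				unmap_one(addrs[j]);
				addrs[j] = 0;
			}
			rc = -1;
			goto out;
		}
	}
	/* later stages would go here; their error paths unmap all of
	 * stage 1 per slot before falling through to out */
out:
	return rc;
}

int main(void)
{
	char a, b, c;
	void *bufs[NSLOTS] = { &a, &b, &c };	/* slot 3 is NULL: forces unwind */
	long addrs[NSLOTS] = { 0 };

	if (create_link_like(bufs, addrs))
		fprintf(stderr, "mapping failed and was cleanly unwound\n");
	return 0;
}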