summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/6lowpan_i.h16
-rw-r--r--net/6lowpan/core.c8
-rw-r--r--net/6lowpan/debugfs.c97
-rw-r--r--net/8021q/vlan_dev.c1
-rw-r--r--net/Kconfig2
-rw-r--r--net/batman-adv/bat_algo.h7
-rw-r--r--net/batman-adv/bat_iv_ogm.c4
-rw-r--r--net/batman-adv/bat_v.c3
-rw-r--r--net/batman-adv/bat_v_elp.h4
-rw-r--r--net/batman-adv/bat_v_ogm.h3
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h9
-rw-r--r--net/batman-adv/debugfs.c99
-rw-r--r--net/batman-adv/debugfs.h9
-rw-r--r--net/batman-adv/distributed-arp-table.h7
-rw-r--r--net/batman-adv/fragmentation.h3
-rw-r--r--net/batman-adv/gateway_client.h9
-rw-r--r--net/batman-adv/gateway_common.c1
-rw-r--r--net/batman-adv/gateway_common.h3
-rw-r--r--net/batman-adv/hard-interface.c10
-rw-r--r--net/batman-adv/hard-interface.h5
-rw-r--r--net/batman-adv/hash.h3
-rw-r--r--net/batman-adv/icmp_socket.c20
-rw-r--r--net/batman-adv/icmp_socket.h5
-rw-r--r--net/batman-adv/log.c17
-rw-r--r--net/batman-adv/log.h1
-rw-r--r--net/batman-adv/main.h12
-rw-r--r--net/batman-adv/multicast.c1080
-rw-r--r--net/batman-adv/multicast.h6
-rw-r--r--net/batman-adv/netlink.c4
-rw-r--r--net/batman-adv/netlink.h3
-rw-r--r--net/batman-adv/network-coding.c29
-rw-r--r--net/batman-adv/network-coding.h14
-rw-r--r--net/batman-adv/originator.c4
-rw-r--r--net/batman-adv/originator.h7
-rw-r--r--net/batman-adv/routing.h3
-rw-r--r--net/batman-adv/send.h3
-rw-r--r--net/batman-adv/soft-interface.c6
-rw-r--r--net/batman-adv/soft-interface.h7
-rw-r--r--net/batman-adv/sysfs.c1
-rw-r--r--net/batman-adv/sysfs.h5
-rw-r--r--net/batman-adv/tp_meter.c1
-rw-r--r--net/batman-adv/tp_meter.h3
-rw-r--r--net/batman-adv/translation-table.c2
-rw-r--r--net/batman-adv/translation-table.h9
-rw-r--r--net/batman-adv/tvlv.h3
-rw-r--r--net/batman-adv/types.h72
-rw-r--r--net/bluetooth/6lowpan.c41
-rw-r--r--net/bluetooth/hci_conn.c5
-rw-r--r--net/bluetooth/hci_core.c4
-rw-r--r--net/bluetooth/hci_debugfs.c31
-rw-r--r--net/bluetooth/hci_event.c77
-rw-r--r--net/bluetooth/hci_request.c40
-rw-r--r--net/bluetooth/hci_request.h2
-rw-r--r--net/bluetooth/hidp/core.c2
-rw-r--r--net/bluetooth/hidp/sock.c1
-rw-r--r--net/bluetooth/l2cap_core.c29
-rw-r--r--net/bluetooth/smp.c13
-rw-r--r--net/bpfilter/main.c2
-rw-r--r--net/bridge/br_device.c1
-rw-r--r--net/bridge/br_input.c10
-rw-r--r--net/bridge/br_multicast.c23
-rw-r--r--net/bridge/br_netfilter_hooks.c247
-rw-r--r--net/bridge/br_netfilter_ipv6.c2
-rw-r--r--net/bridge/br_private.h1
-rw-r--r--net/bridge/br_stp_bpdu.c3
-rw-r--r--net/bridge/br_vlan.c29
-rw-r--r--net/bridge/netfilter/Kconfig20
-rw-r--r--net/bridge/netfilter/Makefile4
-rw-r--r--net/bridge/netfilter/ebt_dnat.c2
-rw-r--r--net/bridge/netfilter/ebt_redirect.c2
-rw-r--r--net/bridge/netfilter/ebt_snat.c2
-rw-r--r--net/bridge/netfilter/nf_conntrack_bridge.c435
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c163
-rw-r--r--net/core/bpf_sk_storage.c12
-rw-r--r--net/core/dev.c20
-rw-r--r--net/core/devlink.c398
-rw-r--r--net/core/dst.c2
-rw-r--r--net/core/ethtool.c24
-rw-r--r--net/core/filter.c382
-rw-r--r--net/core/flow_dissector.c70
-rw-r--r--net/core/flow_offload.c128
-rw-r--r--net/core/hwbm.c15
-rw-r--r--net/core/link_watch.c13
-rw-r--r--net/core/neighbour.c2
-rw-r--r--net/core/net-traces.c4
-rw-r--r--net/core/net_namespace.c28
-rw-r--r--net/core/netpoll.c10
-rw-r--r--net/core/page_pool.c103
-rw-r--r--net/core/pktgen.c8
-rw-r--r--net/core/rtnetlink.c9
-rw-r--r--net/core/skbuff.c376
-rw-r--r--net/core/sock.c6
-rw-r--r--net/core/sock_map.c9
-rw-r--r--net/core/sock_reuseport.c24
-rw-r--r--net/core/xdp.c123
-rw-r--r--net/dccp/ipv6.c2
-rw-r--r--net/dsa/Kconfig1
-rw-r--r--net/dsa/dsa2.c92
-rw-r--r--net/dsa/dsa_priv.h19
-rw-r--r--net/dsa/port.c178
-rw-r--r--net/dsa/slave.c218
-rw-r--r--net/dsa/tag_8021q.c57
-rw-r--r--net/dsa/tag_sja1105.c213
-rw-r--r--net/ethernet/eth.c14
-rw-r--r--net/hsr/hsr_device.c29
-rw-r--r--net/hsr/hsr_device.h1
-rw-r--r--net/hsr/hsr_framereg.c11
-rw-r--r--net/hsr/hsr_framereg.h3
-rw-r--r--net/hsr/hsr_netlink.c7
-rw-r--r--net/hsr/hsr_slave.c1
-rw-r--r--net/ieee802154/6lowpan/reassembly.c51
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/af_inet.c31
-rw-r--r--net/ipv4/ah4.c3
-rw-r--r--net/ipv4/devinet.c168
-rw-r--r--net/ipv4/esp4.c30
-rw-r--r--net/ipv4/esp4_offload.c4
-rw-r--r--net/ipv4/fib_frontend.c73
-rw-r--r--net/ipv4/fib_lookup.h1
-rw-r--r--net/ipv4/fib_rules.c8
-rw-r--r--net/ipv4/fib_semantics.c364
-rw-r--r--net/ipv4/fib_trie.c169
-rw-r--r--net/ipv4/gre_demux.c2
-rw-r--r--net/ipv4/icmp.c2
-rw-r--r--net/ipv4/igmp.c13
-rw-r--r--net/ipv4/inet_connection_sock.c5
-rw-r--r--net/ipv4/inet_fragment.c130
-rw-r--r--net/ipv4/inet_hashtables.c2
-rw-r--r--net/ipv4/ip_fragment.c81
-rw-r--r--net/ipv4/ip_options.c1
-rw-r--r--net/ipv4/ip_output.c350
-rw-r--r--net/ipv4/ipcomp.c3
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c2
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c4
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c395
-rw-r--r--net/ipv4/netfilter/iptable_raw.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic_main.c2
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c9
-rw-r--r--net/ipv4/nexthop.c1828
-rw-r--r--net/ipv4/proc.c5
-rw-r--r--net/ipv4/raw_diag.c3
-rw-r--r--net/ipv4/route.c182
-rw-r--r--net/ipv4/sysctl_net_ipv4.c96
-rw-r--r--net/ipv4/tcp.c54
-rw-r--r--net/ipv4/tcp_fastopen.c201
-rw-r--r--net/ipv4/tcp_input.c6
-rw-r--r--net/ipv4/tcp_ipv4.c24
-rw-r--r--net/ipv4/tcp_minisocks.c3
-rw-r--r--net/ipv4/tcp_output.c23
-rw-r--r--net/ipv4/udp.c27
-rw-r--r--net/ipv4/udp_offload.c2
-rw-r--r--net/ipv4/xfrm4_state.c45
-rw-r--r--net/ipv4/xfrm4_tunnel.c3
-rw-r--r--net/ipv6/addrconf.c19
-rw-r--r--net/ipv6/addrconf_core.c6
-rw-r--r--net/ipv6/af_inet6.c46
-rw-r--r--net/ipv6/ah6.c4
-rw-r--r--net/ipv6/esp6.c23
-rw-r--r--net/ipv6/esp6_offload.c4
-rw-r--r--net/ipv6/fib6_rules.c12
-rw-r--r--net/ipv6/icmp.c7
-rw-r--r--net/ipv6/inet6_hashtables.c2
-rw-r--r--net/ipv6/ip6_fib.c214
-rw-r--r--net/ipv6/ip6_flowlabel.c27
-rw-r--r--net/ipv6/ip6_output.c340
-rw-r--r--net/ipv6/ipcomp6.c3
-rw-r--r--net/ipv6/mip6.c6
-rw-r--r--net/ipv6/ndisc.c11
-rw-r--r--net/ipv6/netfilter.c129
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c420
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c2
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c53
-rw-r--r--net/ipv6/proc.c4
-rw-r--r--net/ipv6/raw.c4
-rw-r--r--net/ipv6/reassembly.c52
-rw-r--r--net/ipv6/route.c1471
-rw-r--r--net/ipv6/sysctl_net_ipv6.c5
-rw-r--r--net/ipv6/tcp_ipv6.c31
-rw-r--r--net/ipv6/udp.c33
-rw-r--r--net/ipv6/xfrm6_state.c137
-rw-r--r--net/key/af_key.c14
-rw-r--r--net/l2tp/l2tp_debugfs.c21
-rw-r--r--net/l2tp/l2tp_ip6.c4
-rw-r--r--net/l3mdev/l3mdev.c7
-rw-r--r--net/lapb/lapb_iface.c3
-rw-r--r--net/mac80211/cfg.c7
-rw-r--r--net/mac80211/debugfs.c1
-rw-r--r--net/mac80211/debugfs_key.c3
-rw-r--r--net/mac80211/debugfs_netdev.c10
-rw-r--r--net/mac80211/debugfs_sta.c2
-rw-r--r--net/mac80211/key.c100
-rw-r--r--net/mac80211/main.c4
-rw-r--r--net/mac80211/mlme.c25
-rw-r--r--net/mac80211/offchannel.c4
-rw-r--r--net/mac80211/rate.c27
-rw-r--r--net/mac80211/rc80211_minstrel.c4
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c3
-rw-r--r--net/mac80211/sta_info.c43
-rw-r--r--net/netfilter/Kconfig11
-rw-r--r--net/netfilter/Makefile3
-rw-r--r--net/netfilter/core.c24
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h3
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c4
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c3
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c5
-rw-r--r--net/netfilter/ipset/ip_set_core.c97
-rw-r--r--net/netfilter/ipset/ip_set_getport.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h5
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c4
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_mac.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c2
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c5
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c3
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c131
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c88
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c134
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c215
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c9
-rw-r--r--net/netfilter/nf_conntrack_core.c25
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c2
-rw-r--r--net/netfilter/nf_conntrack_netlink.c7
-rw-r--r--net/netfilter/nf_conntrack_proto.c126
-rw-r--r--net/netfilter/nf_conntrack_proto_icmp.c2
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c2
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c2
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c4
-rw-r--r--net/netfilter/nf_flow_table_core.c1
-rw-r--r--net/netfilter/nf_log.c2
-rw-r--r--net/netfilter/nf_nat_helper.c4
-rw-r--r--net/netfilter/nf_nat_proto.c26
-rw-r--r--net/netfilter/nf_nat_redirect.c12
-rw-r--r--net/netfilter/nf_nat_sip.c2
-rw-r--r--net/netfilter/nf_queue.c14
-rw-r--r--net/netfilter/nf_synproxy_core.c898
-rw-r--r--net/netfilter/nf_tables_api.c127
-rw-r--r--net/netfilter/nf_tables_core.c1
-rw-r--r--net/netfilter/nf_tables_offload.c267
-rw-r--r--net/netfilter/nfnetlink_osf.c5
-rw-r--r--net/netfilter/nfnetlink_queue.c2
-rw-r--r--net/netfilter/nft_cmp.c53
-rw-r--r--net/netfilter/nft_ct.c142
-rw-r--r--net/netfilter/nft_dynset.c2
-rw-r--r--net/netfilter/nft_exthdr.c136
-rw-r--r--net/netfilter/nft_immediate.c31
-rw-r--r--net/netfilter/nft_meta.c112
-rw-r--r--net/netfilter/nft_payload.c193
-rw-r--r--net/netfilter/nft_synproxy.c287
-rw-r--r--net/netfilter/utils.c5
-rw-r--r--net/netfilter/xt_DSCP.c8
-rw-r--r--net/netfilter/xt_HL.c4
-rw-r--r--net/netfilter/xt_TCPMSS.c2
-rw-r--r--net/netfilter/xt_TCPOPTSTRIP.c28
-rw-r--r--net/netfilter/xt_iprange.c4
-rw-r--r--net/netfilter/xt_owner.c26
-rw-r--r--net/netfilter/xt_set.c45
-rw-r--r--net/netlink/af_netlink.c20
-rw-r--r--net/netrom/af_netrom.c3
-rw-r--r--net/nfc/nci/data.c2
-rw-r--r--net/openvswitch/actions.c83
-rw-r--r--net/openvswitch/datapath.c2
-rw-r--r--net/openvswitch/dp_notify.c2
-rw-r--r--net/openvswitch/vport-netdev.c6
-rw-r--r--net/openvswitch/vport.c2
-rw-r--r--net/packet/af_packet.c99
-rw-r--r--net/packet/internal.h1
-rw-r--r--net/rds/ib.c2
-rw-r--r--net/rxrpc/af_rxrpc.c4
-rw-r--r--net/rxrpc/output.c3
-rw-r--r--net/sched/Kconfig47
-rw-r--r--net/sched/Makefile3
-rw-r--r--net/sched/act_api.c9
-rw-r--r--net/sched/act_ct.c984
-rw-r--r--net/sched/act_ctinfo.c407
-rw-r--r--net/sched/act_mirred.c23
-rw-r--r--net/sched/act_mpls.c406
-rw-r--r--net/sched/cls_api.c216
-rw-r--r--net/sched/cls_flower.c195
-rw-r--r--net/sched/cls_fw.c13
-rw-r--r--net/sched/cls_matchall.c9
-rw-r--r--net/sched/cls_u32.c15
-rw-r--r--net/sched/em_ipt.c48
-rw-r--r--net/sched/sch_etf.c10
-rw-r--r--net/sched/sch_ingress.c8
-rw-r--r--net/sched/sch_taprio.c421
-rw-r--r--net/sctp/associola.c2
-rw-r--r--net/sctp/bind_addr.c13
-rw-r--r--net/sctp/ipv6.c2
-rw-r--r--net/sctp/offload.c7
-rw-r--r--net/sctp/output.c3
-rw-r--r--net/sctp/protocol.c2
-rw-r--r--net/sctp/sm_make_chunk.c21
-rw-r--r--net/sctp/socket.c43
-rw-r--r--net/sctp/stream.c9
-rw-r--r--net/sctp/stream_interleave.c4
-rw-r--r--net/sctp/stream_sched.c2
-rw-r--r--net/smc/af_smc.c73
-rw-r--r--net/smc/smc_clc.c11
-rw-r--r--net/socket.c78
-rw-r--r--net/strparser/strparser.c8
-rw-r--r--net/tipc/bcast.c4
-rw-r--r--net/tipc/bearer.c14
-rw-r--r--net/tipc/link.c124
-rw-r--r--net/tipc/msg.h4
-rw-r--r--net/tipc/netlink.c2
-rw-r--r--net/tipc/netlink_compat.c10
-rw-r--r--net/tipc/node.c2
-rw-r--r--net/tipc/udp_media.c93
-rw-r--r--net/tls/tls_device.c184
-rw-r--r--net/tls/tls_device_fallback.c16
-rw-r--r--net/tls/tls_main.c4
-rw-r--r--net/tls/tls_sw.c29
-rw-r--r--net/unix/diag.c12
-rw-r--r--net/vmw_vsock/af_vsock.c38
-rw-r--r--net/vmw_vsock/hyperv_transport.c93
-rw-r--r--net/vmw_vsock/virtio_transport.c134
-rw-r--r--net/wireless/core.c13
-rw-r--r--net/wireless/core.h4
-rw-r--r--net/wireless/nl80211.c77
-rw-r--r--net/wireless/scan.c33
-rw-r--r--net/wireless/sme.c32
-rw-r--r--net/wireless/trace.h18
-rw-r--r--net/xdp/xdp_umem.c21
-rw-r--r--net/xdp/xdp_umem.h1
-rw-r--r--net/xdp/xsk.c154
-rw-r--r--net/xdp/xsk_queue.h16
-rw-r--r--net/xfrm/Kconfig2
-rw-r--r--net/xfrm/xfrm_device.c5
-rw-r--r--net/xfrm/xfrm_input.c25
-rw-r--r--net/xfrm/xfrm_interface.c104
-rw-r--r--net/xfrm/xfrm_policy.c17
-rw-r--r--net/xfrm/xfrm_state.c437
-rw-r--r--net/xfrm/xfrm_user.c19
345 files changed, 16460 insertions, 5206 deletions
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h
index 53cf446ce2e3..01853cec0209 100644
--- a/net/6lowpan/6lowpan_i.h
+++ b/net/6lowpan/6lowpan_i.h
@@ -18,24 +18,16 @@ extern const struct ndisc_ops lowpan_ndisc_ops;
int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev);
#ifdef CONFIG_6LOWPAN_DEBUGFS
-int lowpan_dev_debugfs_init(struct net_device *dev);
+void lowpan_dev_debugfs_init(struct net_device *dev);
void lowpan_dev_debugfs_exit(struct net_device *dev);
-int __init lowpan_debugfs_init(void);
+void __init lowpan_debugfs_init(void);
void lowpan_debugfs_exit(void);
#else
-static inline int lowpan_dev_debugfs_init(struct net_device *dev)
-{
- return 0;
-}
-
+static inline void lowpan_dev_debugfs_init(struct net_device *dev) { }
static inline void lowpan_dev_debugfs_exit(struct net_device *dev) { }
-static inline int __init lowpan_debugfs_init(void)
-{
- return 0;
-}
-
+static inline void __init lowpan_debugfs_init(void) { }
static inline void lowpan_debugfs_exit(void) { }
#endif /* CONFIG_6LOWPAN_DEBUGFS */
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index 2d68351f1ac4..a068757eabaf 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -42,9 +42,7 @@ int lowpan_register_netdevice(struct net_device *dev,
if (ret < 0)
return ret;
- ret = lowpan_dev_debugfs_init(dev);
- if (ret < 0)
- unregister_netdevice(dev);
+ lowpan_dev_debugfs_init(dev);
return ret;
}
@@ -152,9 +150,7 @@ static int __init lowpan_module_init(void)
{
int ret;
- ret = lowpan_debugfs_init();
- if (ret < 0)
- return ret;
+ lowpan_debugfs_init();
ret = register_netdevice_notifier(&lowpan_notifier);
if (ret < 0) {
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c
index f5a8eec9d7a3..1c140af06d52 100644
--- a/net/6lowpan/debugfs.c
+++ b/net/6lowpan/debugfs.c
@@ -163,11 +163,11 @@ static const struct file_operations lowpan_ctx_pfx_fops = {
.release = single_release,
};
-static int lowpan_dev_debugfs_ctx_init(struct net_device *dev,
- struct dentry *ctx, u8 id)
+static void lowpan_dev_debugfs_ctx_init(struct net_device *dev,
+ struct dentry *ctx, u8 id)
{
struct lowpan_dev *ldev = lowpan_dev(dev);
- struct dentry *dentry, *root;
+ struct dentry *root;
char buf[32];
WARN_ON_ONCE(id > LOWPAN_IPHC_CTX_TABLE_SIZE);
@@ -175,34 +175,18 @@ static int lowpan_dev_debugfs_ctx_init(struct net_device *dev,
sprintf(buf, "%d", id);
root = debugfs_create_dir(buf, ctx);
- if (!root)
- return -EINVAL;
- dentry = debugfs_create_file_unsafe("active", 0644, root,
- &ldev->ctx.table[id],
- &lowpan_ctx_flag_active_fops);
- if (!dentry)
- return -EINVAL;
+ debugfs_create_file("active", 0644, root, &ldev->ctx.table[id],
+ &lowpan_ctx_flag_active_fops);
- dentry = debugfs_create_file_unsafe("compression", 0644, root,
- &ldev->ctx.table[id],
- &lowpan_ctx_flag_c_fops);
- if (!dentry)
- return -EINVAL;
-
- dentry = debugfs_create_file("prefix", 0644, root,
- &ldev->ctx.table[id],
- &lowpan_ctx_pfx_fops);
- if (!dentry)
- return -EINVAL;
+ debugfs_create_file("compression", 0644, root, &ldev->ctx.table[id],
+ &lowpan_ctx_flag_c_fops);
- dentry = debugfs_create_file_unsafe("prefix_len", 0644, root,
- &ldev->ctx.table[id],
- &lowpan_ctx_plen_fops);
- if (!dentry)
- return -EINVAL;
+ debugfs_create_file("prefix", 0644, root, &ldev->ctx.table[id],
+ &lowpan_ctx_pfx_fops);
- return 0;
+ debugfs_create_file("prefix_len", 0644, root, &ldev->ctx.table[id],
+ &lowpan_ctx_plen_fops);
}
static int lowpan_context_show(struct seq_file *file, void *offset)
@@ -242,64 +226,39 @@ static int lowpan_short_addr_get(void *data, u64 *val)
DEFINE_DEBUGFS_ATTRIBUTE(lowpan_short_addr_fops, lowpan_short_addr_get, NULL,
"0x%04llx\n");
-static int lowpan_dev_debugfs_802154_init(const struct net_device *dev,
+static void lowpan_dev_debugfs_802154_init(const struct net_device *dev,
struct lowpan_dev *ldev)
{
- struct dentry *dentry, *root;
+ struct dentry *root;
if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
- return 0;
+ return;
root = debugfs_create_dir("ieee802154", ldev->iface_debugfs);
- if (!root)
- return -EINVAL;
-
- dentry = debugfs_create_file_unsafe("short_addr", 0444, root,
- lowpan_802154_dev(dev)->wdev->ieee802154_ptr,
- &lowpan_short_addr_fops);
- if (!dentry)
- return -EINVAL;
- return 0;
+ debugfs_create_file("short_addr", 0444, root,
+ lowpan_802154_dev(dev)->wdev->ieee802154_ptr,
+ &lowpan_short_addr_fops);
}
-int lowpan_dev_debugfs_init(struct net_device *dev)
+void lowpan_dev_debugfs_init(struct net_device *dev)
{
struct lowpan_dev *ldev = lowpan_dev(dev);
- struct dentry *contexts, *dentry;
- int ret, i;
+ struct dentry *contexts;
+ int i;
/* creating the root */
ldev->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs);
- if (!ldev->iface_debugfs)
- goto fail;
contexts = debugfs_create_dir("contexts", ldev->iface_debugfs);
- if (!contexts)
- goto remove_root;
-
- dentry = debugfs_create_file("show", 0644, contexts,
- &lowpan_dev(dev)->ctx,
- &lowpan_context_fops);
- if (!dentry)
- goto remove_root;
-
- for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
- ret = lowpan_dev_debugfs_ctx_init(dev, contexts, i);
- if (ret < 0)
- goto remove_root;
- }
- ret = lowpan_dev_debugfs_802154_init(dev, ldev);
- if (ret < 0)
- goto remove_root;
+ debugfs_create_file("show", 0644, contexts, &lowpan_dev(dev)->ctx,
+ &lowpan_context_fops);
- return 0;
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
+ lowpan_dev_debugfs_ctx_init(dev, contexts, i);
-remove_root:
- lowpan_dev_debugfs_exit(dev);
-fail:
- return -EINVAL;
+ lowpan_dev_debugfs_802154_init(dev, ldev);
}
void lowpan_dev_debugfs_exit(struct net_device *dev)
@@ -307,13 +266,9 @@ void lowpan_dev_debugfs_exit(struct net_device *dev)
debugfs_remove_recursive(lowpan_dev(dev)->iface_debugfs);
}
-int __init lowpan_debugfs_init(void)
+void __init lowpan_debugfs_init(void)
{
lowpan_debugfs = debugfs_create_dir("6lowpan", NULL);
- if (!lowpan_debugfs)
- return -EINVAL;
-
- return 0;
}
void lowpan_debugfs_exit(void)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index a0b2d8b9def7..93eadf179123 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -580,6 +580,7 @@ static int vlan_dev_init(struct net_device *dev)
dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE;
dev->hw_enc_features = vlan_tnl_features(real_dev);
+ dev->mpls_features = real_dev->mpls_features;
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
diff --git a/net/Kconfig b/net/Kconfig
index d122f53c6fa2..57f51a279ad6 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -67,8 +67,6 @@ source "net/xdp/Kconfig"
config INET
bool "TCP/IP networking"
- select CRYPTO
- select CRYPTO_AES
---help---
These are the protocols used on the Internet and on most local
Ethernets. It is highly recommended to say Y here (this will enlarge
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index cb7d57d16c9d..37898da8ad48 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -9,12 +9,11 @@
#include "main.h"
+#include <linux/netlink.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
-struct netlink_callback;
-struct seq_file;
-struct sk_buff;
-
extern char batadv_routing_algo[];
extern struct list_head batadv_hardif_list;
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index bd4138ddf7e0..240ed70912d6 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -2337,7 +2337,7 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1,
return ret;
}
-static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface)
+static void batadv_iv_iface_enabled(struct batadv_hard_iface *hard_iface)
{
/* begin scheduling originator messages on that interface */
batadv_iv_ogm_schedule(hard_iface);
@@ -2683,8 +2683,8 @@ unlock:
static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
.name = "BATMAN_IV",
.iface = {
- .activate = batadv_iv_iface_activate,
.enable = batadv_iv_ogm_iface_enable,
+ .enabled = batadv_iv_iface_enabled,
.disable = batadv_iv_ogm_iface_disable,
.update_mac = batadv_iv_ogm_iface_update_mac,
.primary_set = batadv_iv_ogm_primary_iface_set,
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 231b4aab4d8d..22672cb3e25d 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -21,6 +21,7 @@
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/types.h>
@@ -41,8 +42,6 @@
#include "netlink.h"
#include "originator.h"
-struct sk_buff;
-
static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface)
{
struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index bb3d40f73bfe..1a29505f4f66 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -9,8 +9,8 @@
#include "main.h"
-struct sk_buff;
-struct work_struct;
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index 616bf2ea8755..2a50df7fc2bf 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -9,10 +9,9 @@
#include "main.h"
+#include <linux/skbuff.h>
#include <linux/types.h>
-struct sk_buff;
-
int batadv_v_ogm_init(struct batadv_priv *bat_priv);
void batadv_v_ogm_free(struct batadv_priv *bat_priv);
int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface);
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 012d72c8d064..02b24a861a85 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -10,14 +10,13 @@
#include "main.h"
#include <linux/compiler.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/stddef.h>
#include <linux/types.h>
-struct net_device;
-struct netlink_callback;
-struct seq_file;
-struct sk_buff;
-
/**
* batadv_bla_is_loopdetect_mac() - check if the mac address is from a loop
* detect frame sent by bridge loop avoidance
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index d38d70ccdd5a..38c4d8e51155 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -10,7 +10,6 @@
#include <asm/current.h>
#include <linux/dcache.h>
#include <linux/debugfs.h>
-#include <linux/err.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/fs.h>
@@ -293,31 +292,13 @@ static struct batadv_debuginfo *batadv_hardif_debuginfos[] = {
void batadv_debugfs_init(void)
{
struct batadv_debuginfo **bat_debug;
- struct dentry *file;
batadv_debugfs = debugfs_create_dir(BATADV_DEBUGFS_SUBDIR, NULL);
- if (batadv_debugfs == ERR_PTR(-ENODEV))
- batadv_debugfs = NULL;
-
- if (!batadv_debugfs)
- goto err;
-
- for (bat_debug = batadv_general_debuginfos; *bat_debug; ++bat_debug) {
- file = debugfs_create_file(((*bat_debug)->attr).name,
- S_IFREG | ((*bat_debug)->attr).mode,
- batadv_debugfs, NULL,
- &(*bat_debug)->fops);
- if (!file) {
- pr_err("Can't add general debugfs file: %s\n",
- ((*bat_debug)->attr).name);
- goto err;
- }
- }
- return;
-err:
- debugfs_remove_recursive(batadv_debugfs);
- batadv_debugfs = NULL;
+ for (bat_debug = batadv_general_debuginfos; *bat_debug; ++bat_debug)
+ debugfs_create_file(((*bat_debug)->attr).name,
+ S_IFREG | ((*bat_debug)->attr).mode,
+ batadv_debugfs, NULL, &(*bat_debug)->fops);
}
/**
@@ -333,42 +314,23 @@ void batadv_debugfs_destroy(void)
* batadv_debugfs_add_hardif() - creates the base directory for a hard interface
* in debugfs.
* @hard_iface: hard interface which should be added.
- *
- * Return: 0 on success or negative error number in case of failure
*/
-int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
+void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
{
struct net *net = dev_net(hard_iface->net_dev);
struct batadv_debuginfo **bat_debug;
- struct dentry *file;
-
- if (!batadv_debugfs)
- goto out;
if (net != &init_net)
- return 0;
+ return;
hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name,
batadv_debugfs);
- if (!hard_iface->debug_dir)
- goto out;
-
- for (bat_debug = batadv_hardif_debuginfos; *bat_debug; ++bat_debug) {
- file = debugfs_create_file(((*bat_debug)->attr).name,
- S_IFREG | ((*bat_debug)->attr).mode,
- hard_iface->debug_dir,
- hard_iface->net_dev,
- &(*bat_debug)->fops);
- if (!file)
- goto rem_attr;
- }
- return 0;
-rem_attr:
- debugfs_remove_recursive(hard_iface->debug_dir);
- hard_iface->debug_dir = NULL;
-out:
- return -ENOMEM;
+ for (bat_debug = batadv_hardif_debuginfos; *bat_debug; ++bat_debug)
+ debugfs_create_file(((*bat_debug)->attr).name,
+ S_IFREG | ((*bat_debug)->attr).mode,
+ hard_iface->debug_dir, hard_iface->net_dev,
+ &(*bat_debug)->fops);
}
/**
@@ -379,15 +341,12 @@ void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface)
{
const char *name = hard_iface->net_dev->name;
struct dentry *dir;
- struct dentry *d;
dir = hard_iface->debug_dir;
if (!dir)
return;
- d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
- if (!d)
- pr_err("Can't rename debugfs dir to %s\n", name);
+ debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
}
/**
@@ -419,44 +378,29 @@ int batadv_debugfs_add_meshif(struct net_device *dev)
struct batadv_priv *bat_priv = netdev_priv(dev);
struct batadv_debuginfo **bat_debug;
struct net *net = dev_net(dev);
- struct dentry *file;
-
- if (!batadv_debugfs)
- goto out;
if (net != &init_net)
return 0;
bat_priv->debug_dir = debugfs_create_dir(dev->name, batadv_debugfs);
- if (!bat_priv->debug_dir)
- goto out;
- if (batadv_socket_setup(bat_priv) < 0)
- goto rem_attr;
+ batadv_socket_setup(bat_priv);
if (batadv_debug_log_setup(bat_priv) < 0)
goto rem_attr;
- for (bat_debug = batadv_mesh_debuginfos; *bat_debug; ++bat_debug) {
- file = debugfs_create_file(((*bat_debug)->attr).name,
- S_IFREG | ((*bat_debug)->attr).mode,
- bat_priv->debug_dir,
- dev, &(*bat_debug)->fops);
- if (!file) {
- batadv_err(dev, "Can't add debugfs file: %s/%s\n",
- dev->name, ((*bat_debug)->attr).name);
- goto rem_attr;
- }
- }
+ for (bat_debug = batadv_mesh_debuginfos; *bat_debug; ++bat_debug)
+ debugfs_create_file(((*bat_debug)->attr).name,
+ S_IFREG | ((*bat_debug)->attr).mode,
+ bat_priv->debug_dir, dev,
+ &(*bat_debug)->fops);
- if (batadv_nc_init_debugfs(bat_priv) < 0)
- goto rem_attr;
+ batadv_nc_init_debugfs(bat_priv);
return 0;
rem_attr:
debugfs_remove_recursive(bat_priv->debug_dir);
bat_priv->debug_dir = NULL;
-out:
return -ENOMEM;
}
@@ -469,15 +413,12 @@ void batadv_debugfs_rename_meshif(struct net_device *dev)
struct batadv_priv *bat_priv = netdev_priv(dev);
const char *name = dev->name;
struct dentry *dir;
- struct dentry *d;
dir = bat_priv->debug_dir;
if (!dir)
return;
- d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
- if (!d)
- pr_err("Can't rename debugfs dir to %s\n", name);
+ debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
}
/**
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 7fac680cf740..1c5afd301ce9 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -9,8 +9,8 @@
#include "main.h"
-struct file;
-struct net_device;
+#include <linux/fs.h>
+#include <linux/netdevice.h>
#define BATADV_DEBUGFS_SUBDIR "batman_adv"
@@ -22,7 +22,7 @@ void batadv_debugfs_destroy(void);
int batadv_debugfs_add_meshif(struct net_device *dev);
void batadv_debugfs_rename_meshif(struct net_device *dev);
void batadv_debugfs_del_meshif(struct net_device *dev);
-int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface);
+void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface);
void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface);
void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface);
@@ -54,9 +54,8 @@ static inline void batadv_debugfs_del_meshif(struct net_device *dev)
}
static inline
-int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
+void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
{
- return 0;
}
static inline
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 110c27447d70..67c7729add55 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -11,15 +11,14 @@
#include <linux/compiler.h>
#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
#include <uapi/linux/batadv_packet.h>
#include "originator.h"
-struct netlink_callback;
-struct seq_file;
-struct sk_buff;
-
#ifdef CONFIG_BATMAN_ADV_DAT
/* BATADV_DAT_ADDR_MAX - maximum address value in the DHT space */
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index d6074ba2ada7..abfe8c6556de 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -11,11 +11,10 @@
#include <linux/compiler.h>
#include <linux/list.h>
+#include <linux/skbuff.h>
#include <linux/stddef.h>
#include <linux/types.h>
-struct sk_buff;
-
void batadv_frag_purge_orig(struct batadv_orig_node *orig,
bool (*check_cb)(struct batadv_frag_table_entry *));
bool batadv_frag_skb_fwd(struct sk_buff *skb,
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 0e14026feebd..0be8e7178ec7 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -9,12 +9,11 @@
#include "main.h"
+#include <linux/netlink.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
-
-struct batadv_tvlv_gateway_data;
-struct netlink_callback;
-struct seq_file;
-struct sk_buff;
+#include <uapi/linux/batadv_packet.h>
void batadv_gw_check_client_stop(struct batadv_priv *bat_priv);
void batadv_gw_reselect(struct batadv_priv *bat_priv);
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index dac097f9be03..fc55750542e4 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -11,6 +11,7 @@
#include <linux/byteorder/generic.h>
#include <linux/errno.h>
#include <linux/kernel.h>
+#include <linux/limits.h>
#include <linux/math64.h>
#include <linux/netdevice.h>
#include <linux/stddef.h>
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 5cf50736c635..211b14b37db8 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -9,10 +9,9 @@
#include "main.h"
+#include <linux/netdevice.h>
#include <linux/types.h>
-struct net_device;
-
/**
* enum batadv_bandwidth_units - bandwidth unit types
*/
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 79d1731b8306..c90e47342bb0 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -16,6 +16,7 @@
#include <linux/if_ether.h>
#include <linux/kernel.h>
#include <linux/kref.h>
+#include <linux/limits.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
@@ -795,6 +796,9 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
batadv_hardif_recalc_extra_skbroom(soft_iface);
+ if (bat_priv->algo_ops->iface.enabled)
+ bat_priv->algo_ops->iface.enabled(hard_iface);
+
out:
return 0;
@@ -920,9 +924,7 @@ batadv_hardif_add_interface(struct net_device *net_dev)
hard_iface->soft_iface = NULL;
hard_iface->if_status = BATADV_IF_NOT_IN_USE;
- ret = batadv_debugfs_add_hardif(hard_iface);
- if (ret)
- goto free_sysfs;
+ batadv_debugfs_add_hardif(hard_iface);
INIT_LIST_HEAD(&hard_iface->list);
INIT_HLIST_HEAD(&hard_iface->neigh_list);
@@ -944,8 +946,6 @@ batadv_hardif_add_interface(struct net_device *net_dev)
return hard_iface;
-free_sysfs:
- batadv_sysfs_del_hardif(&hard_iface->hardif_obj);
free_if:
kfree(hard_iface);
release_dev:
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index c8ef6aa0e865..bbb8a6f18d6b 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -11,13 +11,12 @@
#include <linux/compiler.h>
#include <linux/kref.h>
+#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/stddef.h>
#include <linux/types.h>
-
-struct net_device;
-struct net;
+#include <net/net_namespace.h>
/**
* enum batadv_hard_if_state - State of a hard interface
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index ceef171f7f98..57877f0b78e0 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -12,13 +12,12 @@
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/list.h>
+#include <linux/lockdep.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/types.h>
-struct lock_class_key;
-
/* callback to a compare function. should compare 2 element datas for their
* keys
*
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 0a91c8661357..0a70b66e8770 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -314,25 +314,11 @@ static const struct file_operations batadv_fops = {
/**
* batadv_socket_setup() - Create debugfs "socket" file
* @bat_priv: the bat priv with all the soft interface information
- *
- * Return: 0 on success or negative error number in case of failure
*/
-int batadv_socket_setup(struct batadv_priv *bat_priv)
+void batadv_socket_setup(struct batadv_priv *bat_priv)
{
- struct dentry *d;
-
- if (!bat_priv->debug_dir)
- goto err;
-
- d = debugfs_create_file(BATADV_ICMP_SOCKET, 0600, bat_priv->debug_dir,
- bat_priv, &batadv_fops);
- if (!d)
- goto err;
-
- return 0;
-
-err:
- return -ENOMEM;
+ debugfs_create_file(BATADV_ICMP_SOCKET, 0600, bat_priv->debug_dir,
+ bat_priv, &batadv_fops);
}
/**
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index 35eecbfd2e65..27fafff586df 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -10,12 +10,11 @@
#include "main.h"
#include <linux/types.h>
-
-struct batadv_icmp_header;
+#include <uapi/linux/batadv_packet.h>
#define BATADV_ICMP_SOCKET "socket"
-int batadv_socket_setup(struct batadv_priv *bat_priv);
+void batadv_socket_setup(struct batadv_priv *bat_priv);
#ifdef CONFIG_BATMAN_ADV_DEBUGFS
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index f79ebd5b46e9..11941cf1adcc 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -190,27 +190,16 @@ static const struct file_operations batadv_log_fops = {
*/
int batadv_debug_log_setup(struct batadv_priv *bat_priv)
{
- struct dentry *d;
-
- if (!bat_priv->debug_dir)
- goto err;
-
bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC);
if (!bat_priv->debug_log)
- goto err;
+ return -ENOMEM;
spin_lock_init(&bat_priv->debug_log->lock);
init_waitqueue_head(&bat_priv->debug_log->queue_wait);
- d = debugfs_create_file("log", 0400, bat_priv->debug_dir, bat_priv,
- &batadv_log_fops);
- if (!d)
- goto err;
-
+ debugfs_create_file("log", 0400, bat_priv->debug_dir, bat_priv,
+ &batadv_log_fops);
return 0;
-
-err:
- return -ENOMEM;
}
/**
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 5504637e63d8..741cfa3719ff 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -9,6 +9,7 @@
#include "main.h"
+#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/printk.h>
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index c59afcba31e0..3d4c04d87ff3 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -13,7 +13,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2019.2"
+#define BATADV_SOURCE_VERSION "2019.3"
#endif
/* B.A.T.M.A.N. parameters */
@@ -205,20 +205,20 @@ enum batadv_uev_type {
/* Kernel headers */
+#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/etherdevice.h>
#include <linux/if_vlan.h>
#include <linux/jiffies.h>
+#include <linux/netdevice.h>
#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
#include <uapi/linux/batadv_packet.h>
#include "types.h"
-
-struct net_device;
-struct packet_type;
-struct seq_file;
-struct sk_buff;
+#include "main.h"
/**
* batadv_print_vid() - return printable version of vid information
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index ec54e236e345..67d7f83009ae 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -20,6 +20,7 @@
#include <linux/igmp.h>
#include <linux/in.h>
#include <linux/in6.h>
+#include <linux/inetdevice.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/jiffies.h>
@@ -98,69 +99,307 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface)
}
/**
- * batadv_mcast_addr_is_ipv4() - check if multicast MAC is IPv4
- * @addr: the MAC address to check
+ * batadv_mcast_mla_rtr_flags_softif_get_ipv4() - get mcast router flags from
+ * node for IPv4
+ * @dev: the interface to check
*
- * Return: True, if MAC address is one reserved for IPv4 multicast, false
- * otherwise.
+ * Checks the presence of an IPv4 multicast router on this node.
+ *
+ * Caller needs to hold rcu read lock.
+ *
+ * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise.
*/
-static bool batadv_mcast_addr_is_ipv4(const u8 *addr)
+static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv4(struct net_device *dev)
{
- static const u8 prefix[] = {0x01, 0x00, 0x5E};
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
- return memcmp(prefix, addr, sizeof(prefix)) == 0;
+ if (in_dev && IN_DEV_MFORWARD(in_dev))
+ return BATADV_NO_FLAGS;
+ else
+ return BATADV_MCAST_WANT_NO_RTR4;
}
/**
- * batadv_mcast_addr_is_ipv6() - check if multicast MAC is IPv6
- * @addr: the MAC address to check
+ * batadv_mcast_mla_rtr_flags_softif_get_ipv6() - get mcast router flags from
+ * node for IPv6
+ * @dev: the interface to check
*
- * Return: True, if MAC address is one reserved for IPv6 multicast, false
- * otherwise.
+ * Checks the presence of an IPv6 multicast router on this node.
+ *
+ * Caller needs to hold rcu read lock.
+ *
+ * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise.
*/
-static bool batadv_mcast_addr_is_ipv6(const u8 *addr)
+#if IS_ENABLED(CONFIG_IPV6_MROUTE)
+static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev)
{
- static const u8 prefix[] = {0x33, 0x33};
+ struct inet6_dev *in6_dev = __in6_dev_get(dev);
- return memcmp(prefix, addr, sizeof(prefix)) == 0;
+ if (in6_dev && in6_dev->cnf.mc_forwarding)
+ return BATADV_NO_FLAGS;
+ else
+ return BATADV_MCAST_WANT_NO_RTR6;
+}
+#else
+static inline u8
+batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev)
+{
+ return BATADV_MCAST_WANT_NO_RTR6;
}
+#endif
/**
- * batadv_mcast_mla_softif_get() - get softif multicast listeners
+ * batadv_mcast_mla_rtr_flags_softif_get() - get mcast router flags from node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @bridge: bridge interface on top of the soft_iface if present,
+ * otherwise pass NULL
+ *
+ * Checks the presence of IPv4 and IPv6 multicast routers on this
+ * node.
+ *
+ * Return:
+ * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present
+ * The former two OR'd: no multicast router is present
+ */
+static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv,
+ struct net_device *bridge)
+{
+ struct net_device *dev = bridge ? bridge : bat_priv->soft_iface;
+ u8 flags = BATADV_NO_FLAGS;
+
+ rcu_read_lock();
+
+ flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv4(dev);
+ flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv6(dev);
+
+ rcu_read_unlock();
+
+ return flags;
+}
+
+/**
+ * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge
+ * @bat_priv: the bat priv with all the soft interface information
+ * @bridge: bridge interface on top of the soft_iface if present,
+ * otherwise pass NULL
+ *
+ * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge.
+ *
+ * Return:
+ * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present
+ * The former two OR'd: no multicast router is present
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv,
+ struct net_device *bridge)
+{
+ struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list);
+ struct net_device *dev = bat_priv->soft_iface;
+ struct br_ip_list *br_ip_entry, *tmp;
+ u8 flags = BATADV_MCAST_WANT_NO_RTR6;
+ int ret;
+
+ if (!bridge)
+ return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6;
+
+ /* TODO: ask the bridge if a multicast router is present (the bridge
+ * is capable of performing proper RFC4286 multicast multicast router
+ * discovery) instead of searching for a ff02::2 listener here
+ */
+ ret = br_multicast_list_adjacent(dev, &bridge_mcast_list);
+ if (ret < 0)
+ return BATADV_NO_FLAGS;
+
+ list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) {
+ /* the bridge snooping does not maintain IPv4 link-local
+ * addresses - therefore we won't find any IPv4 multicast router
+ * address here, only IPv6 ones
+ */
+ if (br_ip_entry->addr.proto == htons(ETH_P_IPV6) &&
+ ipv6_addr_is_ll_all_routers(&br_ip_entry->addr.u.ip6))
+ flags &= ~BATADV_MCAST_WANT_NO_RTR6;
+
+ list_del(&br_ip_entry->list);
+ kfree(br_ip_entry);
+ }
+
+ return flags;
+}
+#else
+static inline u8
+batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv,
+ struct net_device *bridge)
+{
+ if (bridge)
+ return BATADV_NO_FLAGS;
+ else
+ return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6;
+}
+#endif
+
+/**
+ * batadv_mcast_mla_rtr_flags_get() - get multicast router flags
+ * @bat_priv: the bat priv with all the soft interface information
+ * @bridge: bridge interface on top of the soft_iface if present,
+ * otherwise pass NULL
+ *
+ * Checks the presence of IPv4 and IPv6 multicast routers on this
+ * node or behind its bridge.
+ *
+ * Return:
+ * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present
+ * The former two OR'd: no multicast router is present
+ */
+static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv,
+ struct net_device *bridge)
+{
+ u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6;
+
+ flags &= batadv_mcast_mla_rtr_flags_softif_get(bat_priv, bridge);
+ flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge);
+
+ return flags;
+}
+
+/**
+ * batadv_mcast_mla_flags_get() - get the new multicast flags
* @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: A set of flags for the current/next TVLV, querier and
+ * bridge state.
+ */
+static struct batadv_mcast_mla_flags
+batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv)
+{
+ struct net_device *dev = bat_priv->soft_iface;
+ struct batadv_mcast_querier_state *qr4, *qr6;
+ struct batadv_mcast_mla_flags mla_flags;
+ struct net_device *bridge;
+
+ bridge = batadv_mcast_get_bridge(dev);
+
+ memset(&mla_flags, 0, sizeof(mla_flags));
+ mla_flags.enabled = 1;
+ mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv,
+ bridge);
+
+ if (!bridge)
+ return mla_flags;
+
+ dev_put(bridge);
+
+ mla_flags.bridged = 1;
+ qr4 = &mla_flags.querier_ipv4;
+ qr6 = &mla_flags.querier_ipv6;
+
+ if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING))
+ pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n");
+
+ qr4->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP);
+ qr4->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP);
+
+ qr6->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6);
+ qr6->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6);
+
+ mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES;
+
+ /* 1) If no querier exists at all, then multicast listeners on
+ * our local TT clients behind the bridge will keep silent.
+ * 2) If the selected querier is on one of our local TT clients,
+ * behind the bridge, then this querier might shadow multicast
+ * listeners on our local TT clients, behind this bridge.
+ *
+ * In both cases, we will signalize other batman nodes that
+ * we need all multicast traffic of the according protocol.
+ */
+ if (!qr4->exists || qr4->shadowing) {
+ mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV4;
+ mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR4;
+ }
+
+ if (!qr6->exists || qr6->shadowing) {
+ mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV6;
+ mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR6;
+ }
+
+ return mla_flags;
+}
+
+/**
+ * batadv_mcast_mla_is_duplicate() - check whether an address is in a list
+ * @mcast_addr: the multicast address to check
+ * @mcast_list: the list with multicast addresses to search in
+ *
+ * Return: true if the given address is already in the given list.
+ * Otherwise returns false.
+ */
+static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr,
+ struct hlist_head *mcast_list)
+{
+ struct batadv_hw_addr *mcast_entry;
+
+ hlist_for_each_entry(mcast_entry, mcast_list, list)
+ if (batadv_compare_eth(mcast_entry->addr, mcast_addr))
+ return true;
+
+ return false;
+}
+
+/**
+ * batadv_mcast_mla_softif_get_ipv4() - get softif IPv4 multicast listeners
* @dev: the device to collect multicast addresses from
* @mcast_list: a list to put found addresses into
+ * @flags: flags indicating the new multicast state
*
- * Collects multicast addresses of multicast listeners residing
+ * Collects multicast addresses of IPv4 multicast listeners residing
* on this kernel on the given soft interface, dev, in
* the given mcast_list. In general, multicast listeners provided by
* your multicast receiving applications run directly on this node.
*
- * If there is a bridge interface on top of dev, collects from that one
- * instead. Just like with IP addresses and routes, multicast listeners
- * will(/should) register to the bridge interface instead of an
- * enslaved bat0.
- *
* Return: -ENOMEM on memory allocation error or the number of
* items added to the mcast_list otherwise.
*/
-static int batadv_mcast_mla_softif_get(struct batadv_priv *bat_priv,
- struct net_device *dev,
- struct hlist_head *mcast_list)
+static int
+batadv_mcast_mla_softif_get_ipv4(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
{
- bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4;
- bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6;
- struct net_device *bridge = batadv_mcast_get_bridge(dev);
- struct netdev_hw_addr *mc_list_entry;
struct batadv_hw_addr *new;
+ struct in_device *in_dev;
+ u8 mcast_addr[ETH_ALEN];
+ struct ip_mc_list *pmc;
int ret = 0;
- netif_addr_lock_bh(bridge ? bridge : dev);
- netdev_for_each_mc_addr(mc_list_entry, bridge ? bridge : dev) {
- if (all_ipv4 && batadv_mcast_addr_is_ipv4(mc_list_entry->addr))
+ if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4)
+ return 0;
+
+ rcu_read_lock();
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for (pmc = rcu_dereference(in_dev->mc_list); pmc;
+ pmc = rcu_dereference(pmc->next_rcu)) {
+ if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ ipv4_is_local_multicast(pmc->multiaddr))
+ continue;
+
+ if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) &&
+ !ipv4_is_local_multicast(pmc->multiaddr))
continue;
- if (all_ipv6 && batadv_mcast_addr_is_ipv6(mc_list_entry->addr))
+ ip_eth_mc_map(pmc->multiaddr, mcast_addr);
+
+ if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
continue;
new = kmalloc(sizeof(*new), GFP_ATOMIC);
@@ -169,36 +408,142 @@ static int batadv_mcast_mla_softif_get(struct batadv_priv *bat_priv,
break;
}
- ether_addr_copy(new->addr, mc_list_entry->addr);
+ ether_addr_copy(new->addr, mcast_addr);
hlist_add_head(&new->list, mcast_list);
ret++;
}
- netif_addr_unlock_bh(bridge ? bridge : dev);
+ rcu_read_unlock();
- if (bridge)
- dev_put(bridge);
+ return ret;
+}
+
+/**
+ * batadv_mcast_mla_softif_get_ipv6() - get softif IPv6 multicast listeners
+ * @dev: the device to collect multicast addresses from
+ * @mcast_list: a list to put found addresses into
+ * @flags: flags indicating the new multicast state
+ *
+ * Collects multicast addresses of IPv6 multicast listeners residing
+ * on this kernel on the given soft interface, dev, in
+ * the given mcast_list. In general, multicast listeners provided by
+ * your multicast receiving applications run directly on this node.
+ *
+ * Return: -ENOMEM on memory allocation error or the number of
+ * items added to the mcast_list otherwise.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+static int
+batadv_mcast_mla_softif_get_ipv6(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
+{
+ struct batadv_hw_addr *new;
+ struct inet6_dev *in6_dev;
+ u8 mcast_addr[ETH_ALEN];
+ struct ifmcaddr6 *pmc6;
+ int ret = 0;
+
+ if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6)
+ return 0;
+
+ rcu_read_lock();
+
+ in6_dev = __in6_dev_get(dev);
+ if (!in6_dev) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ read_lock_bh(&in6_dev->lock);
+ for (pmc6 = in6_dev->mc_list; pmc6; pmc6 = pmc6->next) {
+ if (IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) <
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ continue;
+
+ if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ ipv6_addr_is_ll_all_nodes(&pmc6->mca_addr))
+ continue;
+
+ if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) &&
+ IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) >
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ continue;
+
+ ipv6_eth_mc_map(&pmc6->mca_addr, mcast_addr);
+
+ if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
+ continue;
+
+ new = kmalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ether_addr_copy(new->addr, mcast_addr);
+ hlist_add_head(&new->list, mcast_list);
+ ret++;
+ }
+ read_unlock_bh(&in6_dev->lock);
+ rcu_read_unlock();
return ret;
}
+#else
+static inline int
+batadv_mcast_mla_softif_get_ipv6(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
+{
+ return 0;
+}
+#endif
/**
- * batadv_mcast_mla_is_duplicate() - check whether an address is in a list
- * @mcast_addr: the multicast address to check
- * @mcast_list: the list with multicast addresses to search in
+ * batadv_mcast_mla_softif_get() - get softif multicast listeners
+ * @dev: the device to collect multicast addresses from
+ * @mcast_list: a list to put found addresses into
+ * @flags: flags indicating the new multicast state
*
- * Return: true if the given address is already in the given list.
- * Otherwise returns false.
+ * Collects multicast addresses of multicast listeners residing
+ * on this kernel on the given soft interface, dev, in
+ * the given mcast_list. In general, multicast listeners provided by
+ * your multicast receiving applications run directly on this node.
+ *
+ * If there is a bridge interface on top of dev, collects from that one
+ * instead. Just like with IP addresses and routes, multicast listeners
+ * will(/should) register to the bridge interface instead of an
+ * enslaved bat0.
+ *
+ * Return: -ENOMEM on memory allocation error or the number of
+ * items added to the mcast_list otherwise.
*/
-static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr,
- struct hlist_head *mcast_list)
+static int
+batadv_mcast_mla_softif_get(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
{
- struct batadv_hw_addr *mcast_entry;
+ struct net_device *bridge = batadv_mcast_get_bridge(dev);
+ int ret4, ret6 = 0;
- hlist_for_each_entry(mcast_entry, mcast_list, list)
- if (batadv_compare_eth(mcast_entry->addr, mcast_addr))
- return true;
+ if (bridge)
+ dev = bridge;
- return false;
+ ret4 = batadv_mcast_mla_softif_get_ipv4(dev, mcast_list, flags);
+ if (ret4 < 0)
+ goto out;
+
+ ret6 = batadv_mcast_mla_softif_get_ipv6(dev, mcast_list, flags);
+ if (ret6 < 0) {
+ ret4 = 0;
+ goto out;
+ }
+
+out:
+ if (bridge)
+ dev_put(bridge);
+
+ return ret4 + ret6;
}
/**
@@ -227,9 +572,9 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src)
/**
* batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners
- * @bat_priv: the bat priv with all the soft interface information
* @dev: a bridge slave whose bridge to collect multicast addresses from
* @mcast_list: a list to put found addresses into
+ * @flags: flags indicating the new multicast state
*
* Collects multicast addresses of multicast listeners residing
* on foreign, non-mesh devices which we gave access to our mesh via
@@ -239,14 +584,13 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src)
* Return: -ENOMEM on memory allocation error or the number of
* items added to the mcast_list otherwise.
*/
-static int batadv_mcast_mla_bridge_get(struct batadv_priv *bat_priv,
- struct net_device *dev,
- struct hlist_head *mcast_list)
+static int batadv_mcast_mla_bridge_get(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
{
struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list);
- bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4;
- bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6;
struct br_ip_list *br_ip_entry, *tmp;
+ u8 tvlv_flags = flags->tvlv_flags;
struct batadv_hw_addr *new;
u8 mcast_addr[ETH_ALEN];
int ret;
@@ -259,11 +603,34 @@ static int batadv_mcast_mla_bridge_get(struct batadv_priv *bat_priv,
goto out;
list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) {
- if (all_ipv4 && br_ip_entry->addr.proto == htons(ETH_P_IP))
- continue;
+ if (br_ip_entry->addr.proto == htons(ETH_P_IP)) {
+ if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4)
+ continue;
- if (all_ipv6 && br_ip_entry->addr.proto == htons(ETH_P_IPV6))
- continue;
+ if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ ipv4_is_local_multicast(br_ip_entry->addr.u.ip4))
+ continue;
+
+ if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) &&
+ !ipv4_is_local_multicast(br_ip_entry->addr.u.ip4))
+ continue;
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (br_ip_entry->addr.proto == htons(ETH_P_IPV6)) {
+ if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6)
+ continue;
+
+ if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.u.ip6))
+ continue;
+
+ if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) &&
+ IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.u.ip6) >
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ continue;
+ }
+#endif
batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr);
if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
@@ -370,27 +737,6 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
}
/**
- * batadv_mcast_has_bridge() - check whether the soft-iface is bridged
- * @bat_priv: the bat priv with all the soft interface information
- *
- * Checks whether there is a bridge on top of our soft interface.
- *
- * Return: true if there is a bridge, false otherwise.
- */
-static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv)
-{
- struct net_device *upper = bat_priv->soft_iface;
-
- rcu_read_lock();
- do {
- upper = netdev_master_upper_dev_get_rcu(upper);
- } while (upper && !(upper->priv_flags & IFF_EBRIDGE));
- rcu_read_unlock();
-
- return upper;
-}
-
-/**
* batadv_mcast_querier_log() - debug output regarding the querier status on
* link
* @bat_priv: the bat priv with all the soft interface information
@@ -424,7 +770,7 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto,
batadv_info(bat_priv->soft_iface,
"%s Querier disappeared - multicast optimizations disabled\n",
str_proto);
- else if (!bat_priv->mcast.bridged && !new_state->exists)
+ else if (!bat_priv->mcast.mla_flags.bridged && !new_state->exists)
batadv_info(bat_priv->soft_iface,
"No %s Querier present - multicast optimizations disabled\n",
str_proto);
@@ -446,9 +792,7 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto,
* batadv_mcast_bridge_log() - debug output for topology changes in bridged
* setups
* @bat_priv: the bat priv with all the soft interface information
- * @bridged: a flag about whether the soft interface is currently bridged or not
- * @querier_ipv4: (maybe) new status of a potential, selected IGMP querier
- * @querier_ipv6: (maybe) new status of a potential, selected MLD querier
+ * @new_flags: flags indicating the new multicast state
*
* If no bridges are ever used on this node, then this function does nothing.
*
@@ -461,126 +805,86 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto,
* multicast flags this node is going to set.
*/
static void
-batadv_mcast_bridge_log(struct batadv_priv *bat_priv, bool bridged,
- struct batadv_mcast_querier_state *querier_ipv4,
- struct batadv_mcast_querier_state *querier_ipv6)
+batadv_mcast_bridge_log(struct batadv_priv *bat_priv,
+ struct batadv_mcast_mla_flags *new_flags)
{
- if (!bat_priv->mcast.bridged && bridged)
+ struct batadv_mcast_mla_flags *old_flags = &bat_priv->mcast.mla_flags;
+
+ if (!old_flags->bridged && new_flags->bridged)
batadv_dbg(BATADV_DBG_MCAST, bat_priv,
"Bridge added: Setting Unsnoopables(U)-flag\n");
- else if (bat_priv->mcast.bridged && !bridged)
+ else if (old_flags->bridged && !new_flags->bridged)
batadv_dbg(BATADV_DBG_MCAST, bat_priv,
"Bridge removed: Unsetting Unsnoopables(U)-flag\n");
- if (bridged) {
+ if (new_flags->bridged) {
batadv_mcast_querier_log(bat_priv, "IGMP",
- &bat_priv->mcast.querier_ipv4,
- querier_ipv4);
+ &old_flags->querier_ipv4,
+ &new_flags->querier_ipv4);
batadv_mcast_querier_log(bat_priv, "MLD",
- &bat_priv->mcast.querier_ipv6,
- querier_ipv6);
+ &old_flags->querier_ipv6,
+ &new_flags->querier_ipv6);
}
}
/**
* batadv_mcast_flags_logs() - output debug information about mcast flag changes
* @bat_priv: the bat priv with all the soft interface information
- * @flags: flags indicating the new multicast state
+ * @flags: TVLV flags indicating the new multicast state
*
- * Whenever the multicast flags this nodes announces changes (@mcast_flags vs.
- * bat_priv->mcast.flags), this notifies userspace via the 'mcast' log level.
+ * Whenever the multicast TVLV flags this nodes announces change this notifies
+ * userspace via the 'mcast' log level.
*/
static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags)
{
- u8 old_flags = bat_priv->mcast.flags;
- char str_old_flags[] = "[...]";
+ bool old_enabled = bat_priv->mcast.mla_flags.enabled;
+ u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags;
+ char str_old_flags[] = "[.... . ]";
- sprintf(str_old_flags, "[%c%c%c]",
+ sprintf(str_old_flags, "[%c%c%c%s%s]",
(old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
(old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
- (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.');
+ (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.',
+ !(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ",
+ !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ");
batadv_dbg(BATADV_DBG_MCAST, bat_priv,
- "Changing multicast flags from '%s' to '[%c%c%c]'\n",
- bat_priv->mcast.enabled ? str_old_flags : "<undefined>",
+ "Changing multicast flags from '%s' to '[%c%c%c%s%s]'\n",
+ old_enabled ? str_old_flags : "<undefined>",
(flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
(flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
- (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.');
+ (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.',
+ !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ",
+ !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ");
}
/**
- * batadv_mcast_mla_tvlv_update() - update multicast tvlv
+ * batadv_mcast_mla_flags_update() - update multicast flags
* @bat_priv: the bat priv with all the soft interface information
+ * @flags: flags indicating the new multicast state
*
* Updates the own multicast tvlv with our current multicast related settings,
* capabilities and inabilities.
- *
- * Return: false if we want all IPv4 && IPv6 multicast traffic and true
- * otherwise.
*/
-static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv)
+static void
+batadv_mcast_mla_flags_update(struct batadv_priv *bat_priv,
+ struct batadv_mcast_mla_flags *flags)
{
struct batadv_tvlv_mcast_data mcast_data;
- struct batadv_mcast_querier_state querier4 = {false, false};
- struct batadv_mcast_querier_state querier6 = {false, false};
- struct net_device *dev = bat_priv->soft_iface;
- bool bridged;
-
- mcast_data.flags = BATADV_NO_FLAGS;
- memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved));
-
- bridged = batadv_mcast_has_bridge(bat_priv);
- if (!bridged)
- goto update;
-
- if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING))
- pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n");
-
- querier4.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP);
- querier4.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP);
- querier6.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6);
- querier6.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6);
-
- mcast_data.flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES;
-
- /* 1) If no querier exists at all, then multicast listeners on
- * our local TT clients behind the bridge will keep silent.
- * 2) If the selected querier is on one of our local TT clients,
- * behind the bridge, then this querier might shadow multicast
- * listeners on our local TT clients, behind this bridge.
- *
- * In both cases, we will signalize other batman nodes that
- * we need all multicast traffic of the according protocol.
- */
- if (!querier4.exists || querier4.shadowing)
- mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV4;
-
- if (!querier6.exists || querier6.shadowing)
- mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV6;
-
-update:
- batadv_mcast_bridge_log(bat_priv, bridged, &querier4, &querier6);
-
- bat_priv->mcast.querier_ipv4.exists = querier4.exists;
- bat_priv->mcast.querier_ipv4.shadowing = querier4.shadowing;
+ if (!memcmp(flags, &bat_priv->mcast.mla_flags, sizeof(*flags)))
+ return;
- bat_priv->mcast.querier_ipv6.exists = querier6.exists;
- bat_priv->mcast.querier_ipv6.shadowing = querier6.shadowing;
+ batadv_mcast_bridge_log(bat_priv, flags);
+ batadv_mcast_flags_log(bat_priv, flags->tvlv_flags);
- bat_priv->mcast.bridged = bridged;
+ mcast_data.flags = flags->tvlv_flags;
+ memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved));
- if (!bat_priv->mcast.enabled ||
- mcast_data.flags != bat_priv->mcast.flags) {
- batadv_mcast_flags_log(bat_priv, mcast_data.flags);
- batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2,
- &mcast_data, sizeof(mcast_data));
- bat_priv->mcast.flags = mcast_data.flags;
- bat_priv->mcast.enabled = true;
- }
+ batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2,
+ &mcast_data, sizeof(mcast_data));
- return !(mcast_data.flags & BATADV_MCAST_WANT_ALL_IPV4 &&
- mcast_data.flags & BATADV_MCAST_WANT_ALL_IPV6);
+ bat_priv->mcast.mla_flags = *flags;
}
/**
@@ -599,22 +903,24 @@ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv)
{
struct net_device *soft_iface = bat_priv->soft_iface;
struct hlist_head mcast_list = HLIST_HEAD_INIT;
+ struct batadv_mcast_mla_flags flags;
int ret;
- if (!batadv_mcast_mla_tvlv_update(bat_priv))
- goto update;
+ flags = batadv_mcast_mla_flags_get(bat_priv);
- ret = batadv_mcast_mla_softif_get(bat_priv, soft_iface, &mcast_list);
+ ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list, &flags);
if (ret < 0)
goto out;
- ret = batadv_mcast_mla_bridge_get(bat_priv, soft_iface, &mcast_list);
+ ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list, &flags);
if (ret < 0)
goto out;
-update:
+ spin_lock(&bat_priv->mcast.mla_lock);
batadv_mcast_mla_tt_retract(bat_priv, &mcast_list);
batadv_mcast_mla_tt_add(bat_priv, &mcast_list);
+ batadv_mcast_mla_flags_update(bat_priv, &flags);
+ spin_unlock(&bat_priv->mcast.mla_lock);
out:
batadv_mcast_mla_list_free(&mcast_list);
@@ -639,10 +945,7 @@ static void batadv_mcast_mla_update(struct work_struct *work)
priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work);
bat_priv = container_of(priv_mcast, struct batadv_priv, mcast);
- spin_lock(&bat_priv->mcast.mla_lock);
__batadv_mcast_mla_update(bat_priv);
- spin_unlock(&bat_priv->mcast.mla_lock);
-
batadv_mcast_start_timer(bat_priv);
}
@@ -677,6 +980,7 @@ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb)
* @bat_priv: the bat priv with all the soft interface information
* @skb: the IPv4 packet to check
* @is_unsnoopable: stores whether the destination is snoopable
+ * @is_routable: stores whether the destination is routable
*
* Checks whether the given IPv4 packet has the potential to be forwarded with a
* mode more optimal than classic flooding.
@@ -686,7 +990,8 @@ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb)
*/
static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
struct sk_buff *skb,
- bool *is_unsnoopable)
+ bool *is_unsnoopable,
+ int *is_routable)
{
struct iphdr *iphdr;
@@ -699,16 +1004,13 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
iphdr = ip_hdr(skb);
- /* TODO: Implement Multicast Router Discovery (RFC4286),
- * then allow scope > link local, too
- */
- if (!ipv4_is_local_multicast(iphdr->daddr))
- return -EINVAL;
-
/* link-local multicast listeners behind a bridge are
* not snoopable (see RFC4541, section 2.1.2.2)
*/
- *is_unsnoopable = true;
+ if (ipv4_is_local_multicast(iphdr->daddr))
+ *is_unsnoopable = true;
+ else
+ *is_routable = ETH_P_IP;
return 0;
}
@@ -743,6 +1045,7 @@ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb)
* @bat_priv: the bat priv with all the soft interface information
* @skb: the IPv6 packet to check
* @is_unsnoopable: stores whether the destination is snoopable
+ * @is_routable: stores whether the destination is routable
*
* Checks whether the given IPv6 packet has the potential to be forwarded with a
* mode more optimal than classic flooding.
@@ -751,7 +1054,8 @@ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb)
*/
static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
struct sk_buff *skb,
- bool *is_unsnoopable)
+ bool *is_unsnoopable,
+ int *is_routable)
{
struct ipv6hdr *ip6hdr;
@@ -764,10 +1068,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
ip6hdr = ipv6_hdr(skb);
- /* TODO: Implement Multicast Router Discovery (RFC4286),
- * then allow scope > link local, too
- */
- if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) != IPV6_ADDR_SCOPE_LINKLOCAL)
+ if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) < IPV6_ADDR_SCOPE_LINKLOCAL)
return -EINVAL;
/* link-local-all-nodes multicast listeners behind a bridge are
@@ -775,6 +1076,8 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
*/
if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr))
*is_unsnoopable = true;
+ else if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) > IPV6_ADDR_SCOPE_LINKLOCAL)
+ *is_routable = ETH_P_IPV6;
return 0;
}
@@ -784,6 +1087,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
* @bat_priv: the bat priv with all the soft interface information
* @skb: the multicast frame to check
* @is_unsnoopable: stores whether the destination is snoopable
+ * @is_routable: stores whether the destination is routable
*
* Checks whether the given multicast ethernet frame has the potential to be
* forwarded with a mode more optimal than classic flooding.
@@ -792,7 +1096,8 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
*/
static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
struct sk_buff *skb,
- bool *is_unsnoopable)
+ bool *is_unsnoopable,
+ int *is_routable)
{
struct ethhdr *ethhdr = eth_hdr(skb);
@@ -802,13 +1107,15 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
switch (ntohs(ethhdr->h_proto)) {
case ETH_P_IP:
return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb,
- is_unsnoopable);
+ is_unsnoopable,
+ is_routable);
case ETH_P_IPV6:
if (!IS_ENABLED(CONFIG_IPV6))
return -EINVAL;
return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb,
- is_unsnoopable);
+ is_unsnoopable,
+ is_routable);
default:
return -EINVAL;
}
@@ -839,6 +1146,29 @@ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv,
}
/**
+ * batadv_mcast_forw_rtr_count() - count nodes with a multicast router
+ * @bat_priv: the bat priv with all the soft interface information
+ * @protocol: the ethernet protocol type to count multicast routers for
+ *
+ * Return: the number of nodes which want all routable IPv4 multicast traffic
+ * if the protocol is ETH_P_IP or the number of nodes which want all routable
+ * IPv6 traffic if the protocol is ETH_P_IPV6. Otherwise returns 0.
+ */
+
+static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv,
+ int protocol)
+{
+ switch (protocol) {
+ case ETH_P_IP:
+ return atomic_read(&bat_priv->mcast.num_want_all_rtr4);
+ case ETH_P_IPV6:
+ return atomic_read(&bat_priv->mcast.num_want_all_rtr6);
+ default:
+ return 0;
+ }
+}
+
+/**
* batadv_mcast_forw_tt_node_get() - get a multicast tt node
* @bat_priv: the bat priv with all the soft interface information
* @ethhdr: the ether header containing the multicast destination
@@ -960,6 +1290,84 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv)
}
/**
+ * batadv_mcast_forw_rtr4_node_get() - get a node with an ipv4 mcast router flag
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: an orig_node which has the BATADV_MCAST_WANT_NO_RTR4 flag unset and
+ * increases its refcount.
+ */
+static struct batadv_orig_node *
+batadv_mcast_forw_rtr4_node_get(struct batadv_priv *bat_priv)
+{
+ struct batadv_orig_node *tmp_orig_node, *orig_node = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_orig_node,
+ &bat_priv->mcast.want_all_rtr4_list,
+ mcast_want_all_rtr4_node) {
+ if (!kref_get_unless_zero(&tmp_orig_node->refcount))
+ continue;
+
+ orig_node = tmp_orig_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ return orig_node;
+}
+
+/**
+ * batadv_mcast_forw_rtr6_node_get() - get a node with an ipv6 mcast router flag
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: an orig_node which has the BATADV_MCAST_WANT_NO_RTR6 flag unset
+ * and increases its refcount.
+ */
+static struct batadv_orig_node *
+batadv_mcast_forw_rtr6_node_get(struct batadv_priv *bat_priv)
+{
+ struct batadv_orig_node *tmp_orig_node, *orig_node = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_orig_node,
+ &bat_priv->mcast.want_all_rtr6_list,
+ mcast_want_all_rtr6_node) {
+ if (!kref_get_unless_zero(&tmp_orig_node->refcount))
+ continue;
+
+ orig_node = tmp_orig_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ return orig_node;
+}
+
+/**
+ * batadv_mcast_forw_rtr_node_get() - get a node with an ipv4/ipv6 router flag
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ethhdr: an ethernet header to determine the protocol family from
+ *
+ * Return: an orig_node which has no BATADV_MCAST_WANT_NO_RTR4 or
+ * BATADV_MCAST_WANT_NO_RTR6 flag, depending on the provided ethhdr, set and
+ * increases its refcount.
+ */
+static struct batadv_orig_node *
+batadv_mcast_forw_rtr_node_get(struct batadv_priv *bat_priv,
+ struct ethhdr *ethhdr)
+{
+ switch (ntohs(ethhdr->h_proto)) {
+ case ETH_P_IP:
+ return batadv_mcast_forw_rtr4_node_get(bat_priv);
+ case ETH_P_IPV6:
+ return batadv_mcast_forw_rtr6_node_get(bat_priv);
+ default:
+ /* we shouldn't be here... */
+ return NULL;
+ }
+}
+
+/**
* batadv_mcast_forw_mode() - check on how to forward a multicast packet
* @bat_priv: the bat priv with all the soft interface information
* @skb: The multicast packet to check
@@ -977,8 +1385,11 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
bool is_unsnoopable = false;
unsigned int mcast_fanout;
struct ethhdr *ethhdr;
+ int is_routable = 0;
+ int rtr_count = 0;
- ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable);
+ ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable,
+ &is_routable);
if (ret == -ENOMEM)
return BATADV_FORW_NONE;
else if (ret < 0)
@@ -991,8 +1402,9 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr);
unsnoop_count = !is_unsnoopable ? 0 :
atomic_read(&bat_priv->mcast.num_want_all_unsnoopables);
+ rtr_count = batadv_mcast_forw_rtr_count(bat_priv, is_routable);
- total_count = tt_count + ip_count + unsnoop_count;
+ total_count = tt_count + ip_count + unsnoop_count + rtr_count;
switch (total_count) {
case 1:
@@ -1002,6 +1414,9 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
*orig = batadv_mcast_forw_ip_node_get(bat_priv, ethhdr);
else if (unsnoop_count)
*orig = batadv_mcast_forw_unsnoop_node_get(bat_priv);
+ else if (rtr_count)
+ *orig = batadv_mcast_forw_rtr_node_get(bat_priv,
+ ethhdr);
if (*orig)
return BATADV_FORW_SINGLE;
@@ -1173,6 +1588,111 @@ batadv_mcast_forw_want_all(struct batadv_priv *bat_priv,
}
/**
+ * batadv_mcast_forw_want_all_rtr4() - forward to nodes with want-all-rtr4
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_NO_RTR4 flag unset. A transmission is performed via a
+ * batman-adv unicast packet for each such destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS
+ * otherwise.
+ */
+static int
+batadv_mcast_forw_want_all_rtr4(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ struct batadv_orig_node *orig_node;
+ int ret = NET_XMIT_SUCCESS;
+ struct sk_buff *newskb;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node,
+ &bat_priv->mcast.want_all_rtr4_list,
+ mcast_want_all_rtr4_node) {
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb) {
+ ret = NET_XMIT_DROP;
+ break;
+ }
+
+ batadv_send_skb_unicast(bat_priv, newskb, BATADV_UNICAST, 0,
+ orig_node, vid);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * batadv_mcast_forw_want_all_rtr6() - forward to nodes with want-all-rtr6
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: The multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_NO_RTR6 flag unset. A transmission is performed via a
+ * batman-adv unicast packet for each such destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS
+ * otherwise.
+ */
+static int
+batadv_mcast_forw_want_all_rtr6(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ struct batadv_orig_node *orig_node;
+ int ret = NET_XMIT_SUCCESS;
+ struct sk_buff *newskb;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node,
+ &bat_priv->mcast.want_all_rtr6_list,
+ mcast_want_all_rtr6_node) {
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb) {
+ ret = NET_XMIT_DROP;
+ break;
+ }
+
+ batadv_send_skb_unicast(bat_priv, newskb, BATADV_UNICAST, 0,
+ orig_node, vid);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * batadv_mcast_forw_want_rtr() - forward packet to nodes in a want-all-rtr list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_NO_RTR4 or BATADV_MCAST_WANT_NO_RTR6 flag unset. A
+ * transmission is performed via a batman-adv unicast packet for each such
+ * destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family
+ * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise.
+ */
+static int
+batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ switch (ntohs(eth_hdr(skb)->h_proto)) {
+ case ETH_P_IP:
+ return batadv_mcast_forw_want_all_rtr4(bat_priv, skb, vid);
+ case ETH_P_IPV6:
+ return batadv_mcast_forw_want_all_rtr6(bat_priv, skb, vid);
+ default:
+ /* we shouldn't be here... */
+ return NET_XMIT_DROP;
+ }
+}
+
+/**
* batadv_mcast_forw_send() - send packet to any detected multicast recpient
* @bat_priv: the bat priv with all the soft interface information
* @skb: the multicast packet to transmit
@@ -1205,6 +1725,12 @@ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb,
return ret;
}
+ ret = batadv_mcast_forw_want_rtr(bat_priv, skb, vid);
+ if (ret != NET_XMIT_SUCCESS) {
+ kfree_skb(skb);
+ return ret;
+ }
+
consume_skb(skb);
return ret;
}
@@ -1345,6 +1871,127 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv,
}
/**
+ * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has
+ * toggled then this method updates counter and list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
+ */
+static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 mcast_flags)
+{
+ struct hlist_node *node = &orig->mcast_want_all_rtr4_node;
+ struct hlist_head *head = &bat_priv->mcast.want_all_rtr4_list;
+
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
+ /* switched from flag set to unset */
+ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR4) &&
+ orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4) {
+ atomic_inc(&bat_priv->mcast.num_want_all_rtr4);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(!hlist_unhashed(node));
+
+ hlist_add_head_rcu(node, head);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ /* switched from flag unset to set */
+ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR4 &&
+ !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4)) {
+ atomic_dec(&bat_priv->mcast.num_want_all_rtr4);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(hlist_unhashed(node));
+
+ hlist_del_init_rcu(node);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ }
+}
+
+/**
+ * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has
+ * toggled then this method updates counter and list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
+ */
+static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 mcast_flags)
+{
+ struct hlist_node *node = &orig->mcast_want_all_rtr6_node;
+ struct hlist_head *head = &bat_priv->mcast.want_all_rtr6_list;
+
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
+ /* switched from flag set to unset */
+ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR6) &&
+ orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6) {
+ atomic_inc(&bat_priv->mcast.num_want_all_rtr6);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(!hlist_unhashed(node));
+
+ hlist_add_head_rcu(node, head);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ /* switched from flag unset to set */
+ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR6 &&
+ !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6)) {
+ atomic_dec(&bat_priv->mcast.num_want_all_rtr6);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(hlist_unhashed(node));
+
+ hlist_del_init_rcu(node);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ }
+}
+
+/**
+ * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV
+ * @enabled: whether the originator has multicast TVLV support enabled
+ * @tvlv_value: tvlv buffer containing the multicast flags
+ * @tvlv_value_len: tvlv buffer length
+ *
+ * Return: multicast flags for the given tvlv buffer
+ */
+static u8
+batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len)
+{
+ u8 mcast_flags = BATADV_NO_FLAGS;
+
+ if (enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags))
+ mcast_flags = *(u8 *)tvlv_value;
+
+ if (!enabled) {
+ mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4;
+ mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6;
+ }
+
+ /* remove redundant flags to avoid sending duplicate packets later */
+ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)
+ mcast_flags |= BATADV_MCAST_WANT_NO_RTR4;
+
+ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)
+ mcast_flags |= BATADV_MCAST_WANT_NO_RTR6;
+
+ return mcast_flags;
+}
+
+/**
* batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container
* @bat_priv: the bat priv with all the soft interface information
* @orig: the orig_node of the ogm
@@ -1359,16 +2006,10 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
u16 tvlv_value_len)
{
bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
- u8 mcast_flags = BATADV_NO_FLAGS;
+ u8 mcast_flags;
- if (orig_mcast_enabled && tvlv_value &&
- tvlv_value_len >= sizeof(mcast_flags))
- mcast_flags = *(u8 *)tvlv_value;
-
- if (!orig_mcast_enabled) {
- mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4;
- mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6;
- }
+ mcast_flags = batadv_mcast_tvlv_flags_get(orig_mcast_enabled,
+ tvlv_value, tvlv_value_len);
spin_lock_bh(&orig->mcast_handler_lock);
@@ -1385,6 +2026,8 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags);
batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags);
batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags);
+ batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags);
+ batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags);
orig->mcast_flags = mcast_flags;
spin_unlock_bh(&orig->mcast_handler_lock);
@@ -1417,15 +2060,16 @@ void batadv_mcast_init(struct batadv_priv *bat_priv)
static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv,
struct seq_file *seq)
{
- u8 flags = bat_priv->mcast.flags;
+ struct batadv_mcast_mla_flags *mla_flags = &bat_priv->mcast.mla_flags;
char querier4, querier6, shadowing4, shadowing6;
- bool bridged = bat_priv->mcast.bridged;
+ bool bridged = mla_flags->bridged;
+ u8 flags = mla_flags->tvlv_flags;
if (bridged) {
- querier4 = bat_priv->mcast.querier_ipv4.exists ? '.' : '4';
- querier6 = bat_priv->mcast.querier_ipv6.exists ? '.' : '6';
- shadowing4 = bat_priv->mcast.querier_ipv4.shadowing ? '4' : '.';
- shadowing6 = bat_priv->mcast.querier_ipv6.shadowing ? '6' : '.';
+ querier4 = mla_flags->querier_ipv4.exists ? '.' : '4';
+ querier6 = mla_flags->querier_ipv6.exists ? '.' : '6';
+ shadowing4 = mla_flags->querier_ipv4.shadowing ? '4' : '.';
+ shadowing6 = mla_flags->querier_ipv6.shadowing ? '6' : '.';
} else {
querier4 = '?';
querier6 = '?';
@@ -1433,10 +2077,12 @@ static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv,
shadowing6 = '?';
}
- seq_printf(seq, "Multicast flags (own flags: [%c%c%c])\n",
+ seq_printf(seq, "Multicast flags (own flags: [%c%c%c%s%s])\n",
(flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
(flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
- (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.');
+ (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.',
+ !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ",
+ !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ");
seq_printf(seq, "* Bridged [U]\t\t\t\t%c\n", bridged ? 'U' : '.');
seq_printf(seq, "* No IGMP/MLD Querier [4/6]:\t\t%c/%c\n",
querier4, querier6);
@@ -1490,13 +2136,17 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
flags = orig_node->mcast_flags;
- seq_printf(seq, "%pM [%c%c%c]\n", orig_node->orig,
+ seq_printf(seq, "%pM [%c%c%c%s%s]\n", orig_node->orig,
(flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)
? 'U' : '.',
(flags & BATADV_MCAST_WANT_ALL_IPV4)
? '4' : '.',
(flags & BATADV_MCAST_WANT_ALL_IPV6)
- ? '6' : '.');
+ ? '6' : '.',
+ !(flags & BATADV_MCAST_WANT_NO_RTR4)
+ ? "R4" : ". ",
+ !(flags & BATADV_MCAST_WANT_NO_RTR6)
+ ? "R6" : ". ");
}
rcu_read_unlock();
}
@@ -1517,19 +2167,19 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
int batadv_mcast_mesh_info_put(struct sk_buff *msg,
struct batadv_priv *bat_priv)
{
- u32 flags = bat_priv->mcast.flags;
+ u32 flags = bat_priv->mcast.mla_flags.tvlv_flags;
u32 flags_priv = BATADV_NO_FLAGS;
- if (bat_priv->mcast.bridged) {
+ if (bat_priv->mcast.mla_flags.bridged) {
flags_priv |= BATADV_MCAST_FLAGS_BRIDGED;
- if (bat_priv->mcast.querier_ipv4.exists)
+ if (bat_priv->mcast.mla_flags.querier_ipv4.exists)
flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS;
- if (bat_priv->mcast.querier_ipv6.exists)
+ if (bat_priv->mcast.mla_flags.querier_ipv6.exists)
flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS;
- if (bat_priv->mcast.querier_ipv4.shadowing)
+ if (bat_priv->mcast.mla_flags.querier_ipv4.shadowing)
flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING;
- if (bat_priv->mcast.querier_ipv6.shadowing)
+ if (bat_priv->mcast.mla_flags.querier_ipv6.shadowing)
flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING;
}
@@ -1770,6 +2420,8 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig)
batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS);
batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS);
batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS);
+ batadv_mcast_want_rtr4_update(bat_priv, orig, BATADV_NO_FLAGS);
+ batadv_mcast_want_rtr6_update(bat_priv, orig, BATADV_NO_FLAGS);
spin_unlock_bh(&orig->mcast_handler_lock);
}
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 653b9b76fabe..5d9e2bb29c97 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -9,9 +9,9 @@
#include "main.h"
-struct netlink_callback;
-struct seq_file;
-struct sk_buff;
+#include <linux/netlink.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
/**
* enum batadv_forw_mode - the way a packet should be forwarded as
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index a67720fad46c..6f08fd122a8d 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -21,6 +21,7 @@
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/limits.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
@@ -30,6 +31,7 @@
#include <linux/stddef.h>
#include <linux/types.h>
#include <net/genetlink.h>
+#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/sock.h>
#include <uapi/linux/batadv_packet.h>
@@ -49,8 +51,6 @@
#include "tp_meter.h"
#include "translation-table.h"
-struct net;
-
struct genl_family batadv_netlink_family;
/* multicast groups */
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index d1e0681b8743..ddc674e47dbb 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -9,11 +9,10 @@
#include "main.h"
+#include <linux/netlink.h>
#include <linux/types.h>
#include <net/genetlink.h>
-struct nlmsghdr;
-
void batadv_netlink_register(void);
void batadv_netlink_unregister(void);
int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype);
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index c5e7906045f3..580609389f0f 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1951,34 +1951,19 @@ out:
/**
* batadv_nc_init_debugfs() - create nc folder and related files in debugfs
* @bat_priv: the bat priv with all the soft interface information
- *
- * Return: 0 on success or negative error number in case of failure
*/
-int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
+void batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
{
- struct dentry *nc_dir, *file;
+ struct dentry *nc_dir;
nc_dir = debugfs_create_dir("nc", bat_priv->debug_dir);
- if (!nc_dir)
- goto out;
- file = debugfs_create_u8("min_tq", 0644, nc_dir, &bat_priv->nc.min_tq);
- if (!file)
- goto out;
+ debugfs_create_u8("min_tq", 0644, nc_dir, &bat_priv->nc.min_tq);
- file = debugfs_create_u32("max_fwd_delay", 0644, nc_dir,
- &bat_priv->nc.max_fwd_delay);
- if (!file)
- goto out;
+ debugfs_create_u32("max_fwd_delay", 0644, nc_dir,
+ &bat_priv->nc.max_fwd_delay);
- file = debugfs_create_u32("max_buffer_time", 0644, nc_dir,
- &bat_priv->nc.max_buffer_time);
- if (!file)
- goto out;
-
- return 0;
-
-out:
- return -ENOMEM;
+ debugfs_create_u32("max_buffer_time", 0644, nc_dir,
+ &bat_priv->nc.max_buffer_time);
}
#endif
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index 74f56113a5d0..753fa49723cf 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -9,12 +9,11 @@
#include "main.h"
+#include <linux/netdevice.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
-
-struct batadv_ogm_packet;
-struct net_device;
-struct seq_file;
-struct sk_buff;
+#include <uapi/linux/batadv_packet.h>
#ifdef CONFIG_BATMAN_ADV_NC
@@ -40,7 +39,7 @@ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
struct sk_buff *skb);
int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset);
-int batadv_nc_init_debugfs(struct batadv_priv *bat_priv);
+void batadv_nc_init_debugfs(struct batadv_priv *bat_priv);
#else /* ifdef CONFIG_BATMAN_ADV_NC */
@@ -111,9 +110,8 @@ static inline int batadv_nc_nodes_seq_print_text(struct seq_file *seq,
return 0;
}
-static inline int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
+static inline void batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
{
- return 0;
}
#endif /* ifdef CONFIG_BATMAN_ADV_NC */
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 45db798a7297..38613487fb1b 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -27,6 +27,7 @@
#include <linux/stddef.h>
#include <linux/workqueue.h>
#include <net/sock.h>
+#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
#include "bat_algo.h"
@@ -1043,7 +1044,8 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
orig_node->bcast_seqno_reset = reset_time;
#ifdef CONFIG_BATMAN_ADV_MCAST
- orig_node->mcast_flags = BATADV_NO_FLAGS;
+ orig_node->mcast_flags = BATADV_MCAST_WANT_NO_RTR4;
+ orig_node->mcast_flags |= BATADV_MCAST_WANT_NO_RTR6;
INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node);
INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node);
INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node);
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 3829e26f9c5d..512a1f99dd75 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -12,12 +12,11 @@
#include <linux/compiler.h>
#include <linux/if_ether.h>
#include <linux/jhash.h>
+#include <linux/netlink.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
-struct netlink_callback;
-struct seq_file;
-struct sk_buff;
-
bool batadv_compare_orig(const struct hlist_node *node, const void *data2);
int batadv_originator_init(struct batadv_priv *bat_priv);
void batadv_originator_free(struct batadv_priv *bat_priv);
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index b96c6d06d188..c20feac95107 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -9,10 +9,9 @@
#include "main.h"
+#include <linux/skbuff.h>
#include <linux/types.h>
-struct sk_buff;
-
bool batadv_check_management_packet(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface,
int header_len);
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 5921ee4e107c..5fc0fd1e5d08 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -10,12 +10,11 @@
#include "main.h"
#include <linux/compiler.h>
+#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <uapi/linux/batadv_packet.h>
-struct sk_buff;
-
void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet,
bool dropped);
struct batadv_forw_packet *
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index a7677e1d000f..c7a2e77ca1da 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -24,6 +24,7 @@
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
+#include <linux/netlink.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/random.h>
@@ -803,11 +804,6 @@ static int batadv_softif_init_late(struct net_device *dev)
atomic_set(&bat_priv->distributed_arp_table, 1);
#endif
#ifdef CONFIG_BATMAN_ADV_MCAST
- bat_priv->mcast.querier_ipv4.exists = false;
- bat_priv->mcast.querier_ipv4.shadowing = false;
- bat_priv->mcast.querier_ipv6.exists = false;
- bat_priv->mcast.querier_ipv6.shadowing = false;
- bat_priv->mcast.flags = BATADV_NO_FLAGS;
atomic_set(&bat_priv->multicast_mode, 1);
atomic_set(&bat_priv->multicast_fanout, 16);
atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0);
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index 275442a7acb6..29139ad769fe 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -9,13 +9,12 @@
#include "main.h"
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
+#include <net/net_namespace.h>
#include <net/rtnetlink.h>
-struct net_device;
-struct net;
-struct sk_buff;
-
int batadv_skb_head_push(struct sk_buff *skb, unsigned int len);
void batadv_interface_rx(struct net_device *soft_iface,
struct sk_buff *skb, int hdr_size,
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 80fc3253c336..1efcb97039cd 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -18,6 +18,7 @@
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/kref.h>
+#include <linux/limits.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
#include <linux/rculist.h>
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index 83fa808b1871..5e466093dfa5 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -9,12 +9,11 @@
#include "main.h"
+#include <linux/kobject.h>
+#include <linux/netdevice.h>
#include <linux/sysfs.h>
#include <linux/types.h>
-struct kobject;
-struct net_device;
-
#define BATADV_SYSFS_IF_MESH_SUBDIR "mesh"
#define BATADV_SYSFS_IF_BAT_SUBDIR "batman_adv"
/**
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 820392146249..dd6a9a40dbb9 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -21,6 +21,7 @@
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/kthread.h>
+#include <linux/limits.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/param.h>
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index 604b3799c972..78d310da0ad3 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -9,10 +9,9 @@
#include "main.h"
+#include <linux/skbuff.h>
#include <linux/types.h>
-struct sk_buff;
-
void batadv_tp_meter_init(void);
void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
u32 test_length, u32 *cookie);
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 1ddfd5e011ee..8a482c5ec67b 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -3813,6 +3813,8 @@ static void batadv_tt_purge(struct work_struct *work)
*/
void batadv_tt_free(struct batadv_priv *bat_priv)
{
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_ROAM, 1);
+
batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_TT, 1);
batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_TT, 1);
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index c8c48d62a430..4a98860d7f0e 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -9,13 +9,12 @@
#include "main.h"
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
-struct netlink_callback;
-struct net_device;
-struct seq_file;
-struct sk_buff;
-
int batadv_tt_init(struct batadv_priv *bat_priv);
bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
unsigned short vid, int ifindex, u32 mark);
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index 114ac01e06af..36985000a0a8 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -10,8 +10,7 @@
#include "main.h"
#include <linux/types.h>
-
-struct batadv_ogm_packet;
+#include <uapi/linux/batadv_packet.h>
void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
u8 type, u8 version,
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 74b644738a36..6ae139d74e0f 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -14,20 +14,22 @@
#include <linux/average.h>
#include <linux/bitops.h>
#include <linux/compiler.h>
+#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/kref.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/sched.h> /* for linux/wait.h */
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/spinlock.h>
+#include <linux/timer.h>
#include <linux/types.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
-struct seq_file;
-
#ifdef CONFIG_BATMAN_ADV_DAT
/**
@@ -402,6 +404,17 @@ struct batadv_orig_node {
* list
*/
struct hlist_node mcast_want_all_ipv6_node;
+
+ /**
+ * @mcast_want_all_rtr4_node: a list node for the mcast.want_all_rtr4
+ * list
+ */
+ struct hlist_node mcast_want_all_rtr4_node;
+ /**
+ * @mcast_want_all_rtr6_node: a list node for the mcast.want_all_rtr6
+ * list
+ */
+ struct hlist_node mcast_want_all_rtr6_node;
#endif
/** @capabilities: announced capabilities of this originator */
@@ -1169,6 +1182,26 @@ struct batadv_mcast_querier_state {
};
/**
+ * struct batadv_mcast_mla_flags - flags for the querier, bridge and tvlv state
+ */
+struct batadv_mcast_mla_flags {
+ /** @querier_ipv4: the current state of an IGMP querier in the mesh */
+ struct batadv_mcast_querier_state querier_ipv4;
+
+ /** @querier_ipv6: the current state of an MLD querier in the mesh */
+ struct batadv_mcast_querier_state querier_ipv6;
+
+ /** @enabled: whether the multicast tvlv is currently enabled */
+ unsigned char enabled:1;
+
+ /** @bridged: whether the soft interface has a bridge on top */
+ unsigned char bridged:1;
+
+ /** @tvlv_flags: the flags we have last sent in our mcast tvlv */
+ u8 tvlv_flags;
+};
+
+/**
* struct batadv_priv_mcast - per mesh interface mcast data
*/
struct batadv_priv_mcast {
@@ -1196,20 +1229,22 @@ struct batadv_priv_mcast {
*/
struct hlist_head want_all_ipv6_list;
- /** @querier_ipv4: the current state of an IGMP querier in the mesh */
- struct batadv_mcast_querier_state querier_ipv4;
-
- /** @querier_ipv6: the current state of an MLD querier in the mesh */
- struct batadv_mcast_querier_state querier_ipv6;
-
- /** @flags: the flags we have last sent in our mcast tvlv */
- u8 flags;
+ /**
+ * @want_all_rtr4_list: a list of orig_nodes wanting all routable IPv4
+ * multicast traffic
+ */
+ struct hlist_head want_all_rtr4_list;
- /** @enabled: whether the multicast tvlv is currently enabled */
- unsigned char enabled:1;
+ /**
+ * @want_all_rtr6_list: a list of orig_nodes wanting all routable IPv6
+ * multicast traffic
+ */
+ struct hlist_head want_all_rtr6_list;
- /** @bridged: whether the soft interface has a bridge on top */
- unsigned char bridged:1;
+ /**
+ * @mla_flags: flags for the querier, bridge and tvlv state
+ */
+ struct batadv_mcast_mla_flags mla_flags;
/**
* @mla_lock: a lock protecting mla_list and mla_flags
@@ -1228,6 +1263,12 @@ struct batadv_priv_mcast {
/** @num_want_all_ipv6: counter for items in want_all_ipv6_list */
atomic_t num_want_all_ipv6;
+ /** @num_want_all_rtr4: counter for items in want_all_rtr4_list */
+ atomic_t num_want_all_rtr4;
+
+ /** @num_want_all_rtr6: counter for items in want_all_rtr6_list */
+ atomic_t num_want_all_rtr6;
+
/**
* @want_lists_lock: lock for protecting modifications to mcasts
* want_all_{unsnoopables,ipv4,ipv6}_list (traversals are rcu-locked)
@@ -2129,6 +2170,9 @@ struct batadv_algo_iface_ops {
/** @enable: init routing info when hard-interface is enabled */
int (*enable)(struct batadv_hard_iface *hard_iface);
+ /** @enabled: notification when hard-interface was enabled (optional) */
+ void (*enabled)(struct batadv_hard_iface *hard_iface);
+
/** @disable: de-init routing info when hard-interface is disabled */
void (*disable)(struct batadv_hard_iface *hard_iface);
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 1555b0c6f7ec..9d41de1ec90f 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -164,26 +164,21 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev,
int count = atomic_read(&dev->peer_count);
const struct in6_addr *nexthop;
struct lowpan_peer *peer;
+ struct neighbour *neigh;
BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt);
- /* If we have multiple 6lowpan peers, then check where we should
- * send the packet. If only one peer exists, then we can send the
- * packet right away.
- */
- if (count == 1) {
- rcu_read_lock();
- peer = list_first_or_null_rcu(&dev->peers, struct lowpan_peer,
- list);
- rcu_read_unlock();
- return peer;
- }
-
if (!rt) {
- nexthop = &lowpan_cb(skb)->gw;
-
- if (ipv6_addr_any(nexthop))
- return NULL;
+ if (ipv6_addr_any(&lowpan_cb(skb)->gw)) {
+ /* There is neither route nor gateway,
+ * probably the destination is a direct peer.
+ */
+ nexthop = daddr;
+ } else {
+ /* There is a known gateway
+ */
+ nexthop = &lowpan_cb(skb)->gw;
+ }
} else {
nexthop = rt6_nexthop(rt, daddr);
@@ -209,6 +204,20 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev,
}
}
+ /* use the neighbour cache for matching addresses assigned by SLAAC
+ */
+ neigh = __ipv6_neigh_lookup(dev->netdev, nexthop);
+ if (neigh) {
+ list_for_each_entry_rcu(peer, &dev->peers, list) {
+ if (!memcmp(neigh->ha, peer->lladdr, ETH_ALEN)) {
+ neigh_release(neigh);
+ rcu_read_unlock();
+ return peer;
+ }
+ }
+ neigh_release(neigh);
+ }
+
rcu_read_unlock();
return NULL;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 15d1cb5aee18..ad5b0ac1f9ce 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -520,6 +520,9 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
set_bit(HCI_CONN_POWER_SAVE, &conn->flags);
conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+ /* Set Default Authenticated payload timeout to 30s */
+ conn->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT;
+
if (conn->role == HCI_ROLE_MASTER)
conn->out = true;
@@ -912,7 +915,7 @@ static void hci_req_directed_advertising(struct hci_request *req,
sizeof(cp), &cp);
}
- __hci_req_enable_ext_advertising(req);
+ __hci_req_enable_ext_advertising(req, 0x00);
} else {
struct hci_cp_le_set_adv_param cp;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index b81bf53c5ac4..b9585e7d9d2e 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2827,7 +2827,7 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
memset(adv_instance->scan_rsp_data, 0,
sizeof(adv_instance->scan_rsp_data));
} else {
- if (hdev->adv_instance_cnt >= HCI_MAX_ADV_INSTANCES ||
+ if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets ||
instance < 1 || instance > HCI_MAX_ADV_INSTANCES)
return -EOVERFLOW;
@@ -3195,11 +3195,13 @@ struct hci_dev *hci_alloc_dev(void)
hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE;
hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M;
hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
+ hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES;
hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT;
hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT;
hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE;
hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE;
+ hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT;
mutex_init(&hdev->lock);
mutex_init(&hdev->req_lock);
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 51f5b1efc3a5..bb67f4a5479a 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -941,6 +941,35 @@ static int adv_max_interval_get(void *data, u64 *val)
DEFINE_SIMPLE_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get,
adv_max_interval_set, "%llu\n");
+static int auth_payload_timeout_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 0x0001 || val > 0xffff)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->auth_payload_timeout = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int auth_payload_timeout_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->auth_payload_timeout;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(auth_payload_timeout_fops,
+ auth_payload_timeout_get,
+ auth_payload_timeout_set, "%llu\n");
+
DEFINE_QUIRK_ATTRIBUTE(quirk_strict_duplicate_filter,
HCI_QUIRK_STRICT_DUPLICATE_FILTER);
DEFINE_QUIRK_ATTRIBUTE(quirk_simultaneous_discovery,
@@ -994,6 +1023,8 @@ void hci_debugfs_create_le(struct hci_dev *hdev)
&adv_max_interval_fops);
debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs,
&hdev->discov_interleaved_timeout);
+ debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev,
+ &auth_payload_timeout_fops);
debugfs_create_file("quirk_strict_duplicate_filter", 0644,
hdev->debugfs, hdev,
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 9e4fcf406d9c..cdb00c2ef242 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -579,6 +579,51 @@ static void hci_cc_read_local_commands(struct hci_dev *hdev,
memcpy(hdev->commands, rp->commands, sizeof(hdev->commands));
}
+static void hci_cc_read_auth_payload_timeout(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_auth_payload_to *rp = (void *)skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn)
+ conn->auth_payload_timeout = __le16_to_cpu(rp->timeout);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_write_auth_payload_timeout(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_write_auth_payload_to *rp = (void *)skb->data;
+ struct hci_conn *conn;
+ void *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO);
+ if (!sent)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn)
+ conn->auth_payload_timeout = get_unaligned_le16(sent + 2);
+
+ hci_dev_unlock(hdev);
+}
+
static void hci_cc_read_local_features(struct hci_dev *hdev,
struct sk_buff *skb)
{
@@ -2975,6 +3020,25 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
goto unlock;
}
+ /* Set the default Authenticated Payload Timeout after
+ * an LE Link is established. As per Core Spec v5.0, Vol 2, Part B
+ * Section 3.3, the HCI command WRITE_AUTH_PAYLOAD_TIMEOUT should be
+ * sent when the link is active and Encryption is enabled, the conn
+ * type can be either LE or ACL and controller must support LMP Ping.
+ * Ensure for AES-CCM encryption as well.
+ */
+ if (test_bit(HCI_CONN_ENCRYPT, &conn->flags) &&
+ test_bit(HCI_CONN_AES_CCM, &conn->flags) &&
+ ((conn->type == ACL_LINK && lmp_ping_capable(hdev)) ||
+ (conn->type == LE_LINK && (hdev->le_features[0] & HCI_LE_PING)))) {
+ struct hci_cp_write_auth_payload_to cp;
+
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.timeout = cpu_to_le16(hdev->auth_payload_timeout);
+ hci_send_cmd(conn->hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO,
+ sizeof(cp), &cp);
+ }
+
notify:
if (conn->state == BT_CONFIG) {
if (!ev->status)
@@ -3170,6 +3234,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cc_write_sc_support(hdev, skb);
break;
+ case HCI_OP_READ_AUTH_PAYLOAD_TO:
+ hci_cc_read_auth_payload_timeout(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_AUTH_PAYLOAD_TO:
+ hci_cc_write_auth_payload_timeout(hdev, skb);
+ break;
+
case HCI_OP_READ_LOCAL_VERSION:
hci_cc_read_local_version(hdev, skb);
break;
@@ -5588,6 +5660,11 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev,
return send_conn_param_neg_reply(hdev, handle,
HCI_ERROR_UNKNOWN_CONN_ID);
+ if (min < hcon->le_conn_min_interval ||
+ max > hcon->le_conn_max_interval)
+ return send_conn_param_neg_reply(hdev, handle,
+ HCI_ERROR_INVALID_LL_PARAMS);
+
if (hci_check_conn_params(min, max, latency, timeout))
return send_conn_param_neg_reply(hdev, handle,
HCI_ERROR_INVALID_LL_PARAMS);
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index e9a95ed65491..621f1a97d803 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -1601,7 +1601,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
cp.own_addr_type = own_addr_type;
cp.channel_map = hdev->le_adv_channel_map;
cp.tx_power = 127;
- cp.handle = 0;
+ cp.handle = instance;
if (flags & MGMT_ADV_FLAG_SEC_2M) {
cp.primary_phy = HCI_ADV_PHY_1M;
@@ -1643,11 +1643,21 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
return 0;
}
-void __hci_req_enable_ext_advertising(struct hci_request *req)
+int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance)
{
+ struct hci_dev *hdev = req->hdev;
struct hci_cp_le_set_ext_adv_enable *cp;
struct hci_cp_ext_adv_set *adv_set;
u8 data[sizeof(*cp) + sizeof(*adv_set) * 1];
+ struct adv_info *adv_instance;
+
+ if (instance > 0) {
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return -EINVAL;
+ } else {
+ adv_instance = NULL;
+ }
cp = (void *) data;
adv_set = (void *) cp->data;
@@ -1659,11 +1669,23 @@ void __hci_req_enable_ext_advertising(struct hci_request *req)
memset(adv_set, 0, sizeof(*adv_set));
- adv_set->handle = 0;
+ adv_set->handle = instance;
+
+ /* Set duration per instance since controller is responsible for
+ * scheduling it.
+ */
+ if (adv_instance && adv_instance->duration) {
+ u16 duration = adv_instance->duration * MSEC_PER_SEC;
+
+ /* Time = N * 10 ms */
+ adv_set->duration = cpu_to_le16(duration / 10);
+ }
hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE,
sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets,
data);
+
+ return 0;
}
int __hci_req_start_ext_adv(struct hci_request *req, u8 instance)
@@ -1679,7 +1701,7 @@ int __hci_req_start_ext_adv(struct hci_request *req, u8 instance)
return err;
__hci_req_update_scan_rsp_data(req, instance);
- __hci_req_enable_ext_advertising(req);
+ __hci_req_enable_ext_advertising(req, instance);
return 0;
}
@@ -1723,10 +1745,13 @@ int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
adv_instance->remaining_time =
adv_instance->remaining_time - timeout;
- hdev->adv_instance_timeout = timeout;
- queue_delayed_work(hdev->req_workqueue,
+ /* Only use work for scheduling instances with legacy advertising */
+ if (!ext_adv_capable(hdev)) {
+ hdev->adv_instance_timeout = timeout;
+ queue_delayed_work(hdev->req_workqueue,
&hdev->adv_instance_expire,
msecs_to_jiffies(timeout * 1000));
+ }
/* If we're just re-scheduling the same instance again then do not
* execute any HCI commands. This happens when a single instance is
@@ -2744,7 +2769,8 @@ static int powered_update_hci(struct hci_request *req, unsigned long opt)
if (!ext_adv_capable(hdev))
__hci_req_enable_advertising(req);
else if (!err)
- __hci_req_enable_ext_advertising(req);
+ __hci_req_enable_ext_advertising(req,
+ 0x00);
}
} else if (!list_empty(&hdev->adv_instances)) {
struct adv_info *adv_instance;
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 55b2050cc9ff..a7019fbeadd3 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -83,7 +83,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance);
int __hci_req_start_ext_adv(struct hci_request *req, u8 instance);
-void __hci_req_enable_ext_advertising(struct hci_request *req);
+int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance);
void __hci_req_clear_ext_adv_sets(struct hci_request *req);
int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
bool use_rpa, struct adv_info *adv_instance,
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index a442e21f3894..5abd423b55fa 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -775,7 +775,7 @@ static int hidp_setup_hid(struct hidp_session *session,
hid->version = req->version;
hid->country = req->country;
- strncpy(hid->name, req->name, sizeof(hid->name));
+ strscpy(hid->name, req->name, sizeof(hid->name));
snprintf(hid->phys, sizeof(hid->phys), "%pMR",
&l2cap_pi(session->ctrl_sock->sk)->chan->src);
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index 2151913892ce..03be6a4baef3 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -192,6 +192,7 @@ static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne
ca.version = ca32.version;
ca.flags = ca32.flags;
ca.idle_to = ca32.idle_to;
+ ca32.name[sizeof(ca32.name) - 1] = '\0';
memcpy(ca.name, ca32.name, 128);
csock = sockfd_lookup(ca.ctrl_sock, &err);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 5406d7cd46ad..cc506fe99b4d 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -168,11 +168,18 @@ static struct l2cap_chan *l2cap_get_chan_by_ident(struct l2cap_conn *conn,
return c;
}
-static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src)
+static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src,
+ u8 src_type)
{
struct l2cap_chan *c;
list_for_each_entry(c, &chan_list, global_l) {
+ if (src_type == BDADDR_BREDR && c->src_type != BDADDR_BREDR)
+ continue;
+
+ if (src_type != BDADDR_BREDR && c->src_type == BDADDR_BREDR)
+ continue;
+
if (c->sport == psm && !bacmp(&c->src, src))
return c;
}
@@ -185,7 +192,7 @@ int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm)
write_lock(&chan_list_lock);
- if (psm && __l2cap_global_chan_by_addr(psm, src)) {
+ if (psm && __l2cap_global_chan_by_addr(psm, src, chan->src_type)) {
err = -EADDRINUSE;
goto done;
}
@@ -209,7 +216,8 @@ int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm)
err = -EINVAL;
for (p = start; p <= end; p += incr)
- if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src)) {
+ if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src,
+ chan->src_type)) {
chan->psm = cpu_to_le16(p);
chan->sport = cpu_to_le16(p);
err = 0;
@@ -4394,6 +4402,12 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn,
l2cap_chan_lock(chan);
+ if (chan->state != BT_DISCONN) {
+ l2cap_chan_unlock(chan);
+ mutex_unlock(&conn->chan_lock);
+ return 0;
+ }
+
l2cap_chan_hold(chan);
l2cap_chan_del(chan, 0);
@@ -5291,7 +5305,14 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn,
memset(&rsp, 0, sizeof(rsp));
- err = hci_check_conn_params(min, max, latency, to_multiplier);
+ if (min < hcon->le_conn_min_interval ||
+ max > hcon->le_conn_max_interval) {
+ BT_DBG("requested connection interval exceeds current bounds.");
+ err = -EINVAL;
+ } else {
+ err = hci_check_conn_params(min, max, latency, to_multiplier);
+ }
+
if (err)
rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED);
else
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index e68c715f8d37..6c2b4e6e87ba 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -2579,6 +2579,19 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn,
goto distribute;
}
+ /* Drop IRK if peer is using identity address during pairing but is
+ * providing different address as identity information.
+ *
+ * Microsoft Surface Precision Mouse is known to have this bug.
+ */
+ if (hci_is_identity_address(&hcon->dst, hcon->dst_type) &&
+ (bacmp(&info->bdaddr, &hcon->dst) ||
+ info->addr_type != hcon->dst_type)) {
+ bt_dev_err(hcon->hdev,
+ "ignoring IRK with invalid identity address");
+ goto distribute;
+ }
+
bacpy(&smp->id_addr, &info->bdaddr);
smp->id_addr_type = info->addr_type;
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
index 61ce8454a88e..77396a098fbe 100644
--- a/net/bpfilter/main.c
+++ b/net/bpfilter/main.c
@@ -55,7 +55,7 @@ static void loop(void)
int main(void)
{
- debug_fd = open("/dev/console", 00000002);
+ debug_fd = open("/dev/kmsg", 00000002);
dprintf(debug_fd, "Started bpfilter\n");
loop();
close(debug_fd);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index c05def8fd9cd..681b72862c16 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -52,6 +52,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
br_switchdev_frame_unmark(skb);
BR_INPUT_SKB_CB(skb)->brdev = dev;
+ BR_INPUT_SKB_CB(skb)->frag_max_size = 0;
skb_reset_mac_header(skb);
eth = eth_hdr(skb);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 21b74e7a7b2f..09b1dd8cd853 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -74,7 +74,6 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
struct net_bridge_fdb_entry *dst = NULL;
struct net_bridge_mdb_entry *mdst;
bool local_rcv, mcast_hit = false;
- const unsigned char *dest;
struct net_bridge *br;
u16 vid = 0;
@@ -92,10 +91,9 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
local_rcv = !!(br->dev->flags & IFF_PROMISC);
- dest = eth_hdr(skb)->h_dest;
- if (is_multicast_ether_addr(dest)) {
+ if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) {
/* by definition the broadcast is also a multicast address */
- if (is_broadcast_ether_addr(dest)) {
+ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) {
pkt_type = BR_PKT_BROADCAST;
local_rcv = true;
} else {
@@ -145,7 +143,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
}
break;
case BR_PKT_UNICAST:
- dst = br_fdb_find_rcu(br, dest, vid);
+ dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid);
default:
break;
}
@@ -234,7 +232,7 @@ static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb)
kfree_skb(skb);
return RX_HANDLER_CONSUMED;
case NF_QUEUE:
- ret = nf_queue(skb, &state, e, i, verdict);
+ ret = nf_queue(skb, &state, i, verdict);
if (ret == 1)
continue;
return RX_HANDLER_CONSUMED;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index de22c8fbbb15..3d8deac2353d 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -911,6 +911,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
int type;
int err = 0;
__be32 group;
+ u16 nsrcs;
ih = igmpv3_report_hdr(skb);
num = ntohs(ih->ngrec);
@@ -924,8 +925,9 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
grec = (void *)(skb->data + len - sizeof(*grec));
group = grec->grec_mca;
type = grec->grec_type;
+ nsrcs = ntohs(grec->grec_nsrcs);
- len += ntohs(grec->grec_nsrcs) * 4;
+ len += nsrcs * 4;
if (!ip_mc_may_pull(skb, len))
return -EINVAL;
@@ -946,7 +948,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
src = eth_hdr(skb)->h_source;
if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
type == IGMPV3_MODE_IS_INCLUDE) &&
- ntohs(grec->grec_nsrcs) == 0) {
+ nsrcs == 0) {
br_ip4_multicast_leave_group(br, port, group, vid, src);
} else {
err = br_ip4_multicast_add_group(br, port, group, vid,
@@ -983,7 +985,8 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
len = skb_transport_offset(skb) + sizeof(*icmp6h);
for (i = 0; i < num; i++) {
- __be16 *nsrcs, _nsrcs;
+ __be16 *_nsrcs, __nsrcs;
+ u16 nsrcs;
nsrcs_offset = len + offsetof(struct mld2_grec, grec_nsrcs);
@@ -991,12 +994,13 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
nsrcs_offset + sizeof(_nsrcs))
return -EINVAL;
- nsrcs = skb_header_pointer(skb, nsrcs_offset,
- sizeof(_nsrcs), &_nsrcs);
- if (!nsrcs)
+ _nsrcs = skb_header_pointer(skb, nsrcs_offset,
+ sizeof(__nsrcs), &__nsrcs);
+ if (!_nsrcs)
return -EINVAL;
- grec_len = struct_size(grec, grec_src, ntohs(*nsrcs));
+ nsrcs = ntohs(*_nsrcs);
+ grec_len = struct_size(grec, grec_src, nsrcs);
if (!ipv6_mc_may_pull(skb, len + grec_len))
return -EINVAL;
@@ -1021,7 +1025,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
src = eth_hdr(skb)->h_source;
if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
- ntohs(*nsrcs) == 0) {
+ nsrcs == 0) {
br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
vid, src);
} else {
@@ -1275,7 +1279,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
u16 vid)
{
unsigned int transport_len = ipv6_transport_len(skb);
- const struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct mld_msg *mld;
struct net_bridge_mdb_entry *mp;
struct mld2_query *mld2q;
@@ -1319,7 +1322,7 @@ static int br_ip6_multicast_query(struct net_bridge *br,
if (is_general_query) {
saddr.proto = htons(ETH_P_IPV6);
- saddr.u.ip6 = ip6h->saddr;
+ saddr.u.ip6 = ipv6_hdr(skb)->saddr;
br_multicast_query_received(br, port, &br->ip6_other_query,
&saddr, max_delay);
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 34fa72c72ad8..d3f9592f4ff8 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -47,25 +47,22 @@ static unsigned int brnf_net_id __read_mostly;
struct brnf_net {
bool enabled;
-};
#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *brnf_sysctl_header;
-static int brnf_call_iptables __read_mostly = 1;
-static int brnf_call_ip6tables __read_mostly = 1;
-static int brnf_call_arptables __read_mostly = 1;
-static int brnf_filter_vlan_tagged __read_mostly;
-static int brnf_filter_pppoe_tagged __read_mostly;
-static int brnf_pass_vlan_indev __read_mostly;
-#else
-#define brnf_call_iptables 1
-#define brnf_call_ip6tables 1
-#define brnf_call_arptables 1
-#define brnf_filter_vlan_tagged 0
-#define brnf_filter_pppoe_tagged 0
-#define brnf_pass_vlan_indev 0
+ struct ctl_table_header *ctl_hdr;
#endif
+ /* default value is 1 */
+ int call_iptables;
+ int call_ip6tables;
+ int call_arptables;
+
+ /* default value is 0 */
+ int filter_vlan_tagged;
+ int filter_pppoe_tagged;
+ int pass_vlan_indev;
+};
+
#define IS_IP(skb) \
(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
@@ -85,17 +82,28 @@ static inline __be16 vlan_proto(const struct sk_buff *skb)
return 0;
}
-#define IS_VLAN_IP(skb) \
- (vlan_proto(skb) == htons(ETH_P_IP) && \
- brnf_filter_vlan_tagged)
+static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged;
+}
-#define IS_VLAN_IPV6(skb) \
- (vlan_proto(skb) == htons(ETH_P_IPV6) && \
- brnf_filter_vlan_tagged)
+static inline bool is_vlan_ipv6(const struct sk_buff *skb,
+ const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
-#define IS_VLAN_ARP(skb) \
- (vlan_proto(skb) == htons(ETH_P_ARP) && \
- brnf_filter_vlan_tagged)
+ return vlan_proto(skb) == htons(ETH_P_IPV6) &&
+ brnet->filter_vlan_tagged;
+}
+
+static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged;
+}
static inline __be16 pppoe_proto(const struct sk_buff *skb)
{
@@ -103,15 +111,23 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
sizeof(struct pppoe_hdr)));
}
-#define IS_PPPOE_IP(skb) \
- (skb->protocol == htons(ETH_P_PPP_SES) && \
- pppoe_proto(skb) == htons(PPP_IP) && \
- brnf_filter_pppoe_tagged)
+static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ return skb->protocol == htons(ETH_P_PPP_SES) &&
+ pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged;
+}
+
+static inline bool is_pppoe_ipv6(const struct sk_buff *skb,
+ const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
-#define IS_PPPOE_IPV6(skb) \
- (skb->protocol == htons(ETH_P_PPP_SES) && \
- pppoe_proto(skb) == htons(PPP_IPV6) && \
- brnf_filter_pppoe_tagged)
+ return skb->protocol == htons(ETH_P_PPP_SES) &&
+ pppoe_proto(skb) == htons(PPP_IPV6) &&
+ brnet->filter_pppoe_tagged;
+}
/* largest possible L2 header, see br_nf_dev_queue_xmit() */
#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
@@ -408,12 +424,16 @@ bridged_dnat:
return 0;
}
-static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
+ const struct net_device *dev,
+ const struct net *net)
{
struct net_device *vlan, *br;
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
br = bridge_parent(dev);
- if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+
+ if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
return br;
vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
@@ -423,7 +443,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct
}
/* Some common code for IPv4/IPv6 */
-struct net_device *setup_pre_routing(struct sk_buff *skb)
+struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
@@ -434,7 +454,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb)
nf_bridge->in_prerouting = 1;
nf_bridge->physindev = skb->dev;
- skb->dev = brnf_get_logical_dev(skb, skb->dev);
+ skb->dev = brnf_get_logical_dev(skb, skb->dev, net);
if (skb->protocol == htons(ETH_P_8021Q))
nf_bridge->orig_proto = BRNF_PROTO_8021Q;
@@ -460,6 +480,7 @@ static unsigned int br_nf_pre_routing(void *priv,
struct net_bridge_port *p;
struct net_bridge *br;
__u32 len = nf_bridge_encap_header_len(skb);
+ struct brnf_net *brnet;
if (unlikely(!pskb_may_pull(skb, len)))
return NF_DROP;
@@ -469,8 +490,10 @@ static unsigned int br_nf_pre_routing(void *priv,
return NF_DROP;
br = p->br;
- if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
- if (!brnf_call_ip6tables &&
+ brnet = net_generic(state->net, brnf_net_id);
+ if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+ is_pppoe_ipv6(skb, state->net)) {
+ if (!brnet->call_ip6tables &&
!br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
return NF_ACCEPT;
@@ -478,10 +501,11 @@ static unsigned int br_nf_pre_routing(void *priv,
return br_nf_pre_routing_ipv6(priv, skb, state);
}
- if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
+ if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
return NF_ACCEPT;
- if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
+ if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) &&
+ !is_pppoe_ip(skb, state->net))
return NF_ACCEPT;
nf_bridge_pull_encap_header_rcsum(skb);
@@ -491,7 +515,7 @@ static unsigned int br_nf_pre_routing(void *priv,
if (!nf_bridge_alloc(skb))
return NF_DROP;
- if (!setup_pre_routing(skb))
+ if (!setup_pre_routing(skb, state->net))
return NF_DROP;
nf_bridge = nf_bridge_info_get(skb);
@@ -514,7 +538,7 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
struct net_device *in;
- if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
+ if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) {
if (skb->protocol == htons(ETH_P_IP))
nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
@@ -569,9 +593,11 @@ static unsigned int br_nf_forward_ip(void *priv,
if (!parent)
return NF_DROP;
- if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+ if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+ is_pppoe_ip(skb, state->net))
pf = NFPROTO_IPV4;
- else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+ else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+ is_pppoe_ipv6(skb, state->net))
pf = NFPROTO_IPV6;
else
return NF_ACCEPT;
@@ -602,7 +628,7 @@ static unsigned int br_nf_forward_ip(void *priv,
skb->protocol = htons(ETH_P_IPV6);
NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
- brnf_get_logical_dev(skb, state->in),
+ brnf_get_logical_dev(skb, state->in, state->net),
parent, br_nf_forward_finish);
return NF_STOLEN;
@@ -615,23 +641,25 @@ static unsigned int br_nf_forward_arp(void *priv,
struct net_bridge_port *p;
struct net_bridge *br;
struct net_device **d = (struct net_device **)(skb->cb);
+ struct brnf_net *brnet;
p = br_port_get_rcu(state->out);
if (p == NULL)
return NF_ACCEPT;
br = p->br;
- if (!brnf_call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
+ brnet = net_generic(state->net, brnf_net_id);
+ if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
return NF_ACCEPT;
if (!IS_ARP(skb)) {
- if (!IS_VLAN_ARP(skb))
+ if (!is_vlan_arp(skb, state->net))
return NF_ACCEPT;
nf_bridge_pull_encap_header(skb);
}
if (arp_hdr(skb)->ar_pln != 4) {
- if (IS_VLAN_ARP(skb))
+ if (is_vlan_arp(skb, state->net))
nf_bridge_push_encap_header(skb);
return NF_ACCEPT;
}
@@ -791,9 +819,11 @@ static unsigned int br_nf_post_routing(void *priv,
if (!realoutdev)
return NF_DROP;
- if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+ if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+ is_pppoe_ip(skb, state->net))
pf = NFPROTO_IPV4;
- else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+ else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+ is_pppoe_ipv6(skb, state->net))
pf = NFPROTO_IPV6;
else
return NF_ACCEPT;
@@ -946,23 +976,6 @@ static int brnf_device_event(struct notifier_block *unused, unsigned long event,
return NOTIFY_OK;
}
-static void __net_exit brnf_exit_net(struct net *net)
-{
- struct brnf_net *brnet = net_generic(net, brnf_net_id);
-
- if (!brnet->enabled)
- return;
-
- nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
- brnet->enabled = false;
-}
-
-static struct pernet_operations brnf_net_ops __read_mostly = {
- .exit = brnf_exit_net,
- .id = &brnf_net_id,
- .size = sizeof(struct brnf_net),
-};
-
static struct notifier_block brnf_notifier __read_mostly = {
.notifier_call = brnf_device_event,
};
@@ -1021,49 +1034,124 @@ int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
static struct ctl_table brnf_table[] = {
{
.procname = "bridge-nf-call-arptables",
- .data = &brnf_call_arptables,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-call-iptables",
- .data = &brnf_call_iptables,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-call-ip6tables",
- .data = &brnf_call_ip6tables,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-filter-vlan-tagged",
- .data = &brnf_filter_vlan_tagged,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-filter-pppoe-tagged",
- .data = &brnf_filter_pppoe_tagged,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-pass-vlan-input-dev",
- .data = &brnf_pass_vlan_indev,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{ }
};
+
+static inline void br_netfilter_sysctl_default(struct brnf_net *brnf)
+{
+ brnf->call_iptables = 1;
+ brnf->call_ip6tables = 1;
+ brnf->call_arptables = 1;
+ brnf->filter_vlan_tagged = 0;
+ brnf->filter_pppoe_tagged = 0;
+ brnf->pass_vlan_indev = 0;
+}
+
+static int br_netfilter_sysctl_init_net(struct net *net)
+{
+ struct ctl_table *table = brnf_table;
+ struct brnf_net *brnet;
+
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+ }
+
+ brnet = net_generic(net, brnf_net_id);
+ table[0].data = &brnet->call_arptables;
+ table[1].data = &brnet->call_iptables;
+ table[2].data = &brnet->call_ip6tables;
+ table[3].data = &brnet->filter_vlan_tagged;
+ table[4].data = &brnet->filter_pppoe_tagged;
+ table[5].data = &brnet->pass_vlan_indev;
+
+ br_netfilter_sysctl_default(brnet);
+
+ brnet->ctl_hdr = register_net_sysctl(net, "net/bridge", table);
+ if (!brnet->ctl_hdr) {
+ if (!net_eq(net, &init_net))
+ kfree(table);
+
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void br_netfilter_sysctl_exit_net(struct net *net,
+ struct brnf_net *brnet)
+{
+ struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg;
+
+ unregister_net_sysctl_table(brnet->ctl_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static int __net_init brnf_init_net(struct net *net)
+{
+ return br_netfilter_sysctl_init_net(net);
+}
+#endif
+
+static void __net_exit brnf_exit_net(struct net *net)
+{
+ struct brnf_net *brnet;
+
+ brnet = net_generic(net, brnf_net_id);
+ if (brnet->enabled) {
+ nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ brnet->enabled = false;
+ }
+
+#ifdef CONFIG_SYSCTL
+ br_netfilter_sysctl_exit_net(net, brnet);
#endif
+}
+
+static struct pernet_operations brnf_net_ops __read_mostly = {
+#ifdef CONFIG_SYSCTL
+ .init = brnf_init_net,
+#endif
+ .exit = brnf_exit_net,
+ .id = &brnf_net_id,
+ .size = sizeof(struct brnf_net),
+};
static int __init br_netfilter_init(void)
{
@@ -1079,16 +1167,6 @@ static int __init br_netfilter_init(void)
return ret;
}
-#ifdef CONFIG_SYSCTL
- brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
- if (brnf_sysctl_header == NULL) {
- printk(KERN_WARNING
- "br_netfilter: can't register to sysctl.\n");
- unregister_netdevice_notifier(&brnf_notifier);
- unregister_pernet_subsys(&brnf_net_ops);
- return -ENOMEM;
- }
-#endif
RCU_INIT_POINTER(nf_br_ops, &br_ops);
printk(KERN_NOTICE "Bridge firewalling registered\n");
return 0;
@@ -1099,9 +1177,6 @@ static void __exit br_netfilter_fini(void)
RCU_INIT_POINTER(nf_br_ops, NULL);
unregister_netdevice_notifier(&brnf_notifier);
unregister_pernet_subsys(&brnf_net_ops);
-#ifdef CONFIG_SYSCTL
- unregister_net_sysctl_table(brnf_sysctl_header);
-#endif
}
module_init(br_netfilter_init);
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 0e63e5dc5ac4..e4e0c836c3f5 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -224,7 +224,7 @@ unsigned int br_nf_pre_routing_ipv6(void *priv,
nf_bridge = nf_bridge_alloc(skb);
if (!nf_bridge)
return NF_DROP;
- if (!setup_pre_routing(skb))
+ if (!setup_pre_routing(skb, state->net))
return NF_DROP;
nf_bridge = nf_bridge_info_get(skb);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 159a0e2cb0f6..e8cf03b43b7d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -421,6 +421,7 @@ struct net_bridge {
struct br_input_skb_cb {
struct net_device *brdev;
+ u16 frag_max_size;
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
u8 igmp;
u8 mrouters_only:1;
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
index 68a6922b4141..7796dd9d42d7 100644
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -143,7 +143,6 @@ void br_send_tcn_bpdu(struct net_bridge_port *p)
void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
struct net_device *dev)
{
- const unsigned char *dest = eth_hdr(skb)->h_dest;
struct net_bridge_port *p;
struct net_bridge *br;
const unsigned char *buf;
@@ -172,7 +171,7 @@ void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
if (p->state == BR_STATE_DISABLED)
goto out;
- if (!ether_addr_equal(dest, br->group_addr))
+ if (!ether_addr_equal(eth_hdr(skb)->h_dest, br->group_addr))
goto out;
if (p->flags & BR_BPDU_GUARD) {
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index f47f526b4f19..021cc9f66804 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -797,6 +797,16 @@ bool br_vlan_enabled(const struct net_device *dev)
}
EXPORT_SYMBOL_GPL(br_vlan_enabled);
+int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ *p_proto = ntohs(br->vlan_proto);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_proto);
+
int __br_vlan_set_proto(struct net_bridge *br, __be16 proto)
{
int err = 0;
@@ -1227,13 +1237,11 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
}
}
-int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+static int __br_vlan_get_pvid(const struct net_device *dev,
+ struct net_bridge_port *p, u16 *p_pvid)
{
struct net_bridge_vlan_group *vg;
- struct net_bridge_port *p;
- ASSERT_RTNL();
- p = br_port_get_check_rtnl(dev);
if (p)
vg = nbp_vlan_group(p);
else if (netif_is_bridge_master(dev))
@@ -1244,8 +1252,21 @@ int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
*p_pvid = br_get_pvid(vg);
return 0;
}
+
+int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+{
+ ASSERT_RTNL();
+
+ return __br_vlan_get_pvid(dev, br_port_get_check_rtnl(dev), p_pvid);
+}
EXPORT_SYMBOL_GPL(br_vlan_get_pvid);
+int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid)
+{
+ return __br_vlan_get_pvid(dev, br_port_get_check_rcu(dev), p_pvid);
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu);
+
int br_vlan_get_info(const struct net_device *dev, u16 vid,
struct bridge_vlan_info *p_vinfo)
{
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 36a98d36d339..154fa558bb90 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -9,6 +9,12 @@ menuconfig NF_TABLES_BRIDGE
bool "Ethernet Bridge nf_tables support"
if NF_TABLES_BRIDGE
+
+config NFT_BRIDGE_META
+ tristate "Netfilter nf_table bridge meta support"
+ help
+ Add support for bridge dedicated meta key.
+
config NFT_BRIDGE_REJECT
tristate "Netfilter nf_tables bridge reject support"
depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6
@@ -19,6 +25,20 @@ config NF_LOG_BRIDGE
tristate "Bridge packet logging"
select NF_LOG_COMMON
+config NF_CONNTRACK_BRIDGE
+ tristate "IPv4/IPV6 bridge connection tracking support"
+ depends on NF_CONNTRACK
+ default n
+ help
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are related
+ into connections. This is used to enhance packet filtering via
+ stateful policies. Enable this if you want native tracking from
+ the bridge. This provides a replacement for the `br_netfilter'
+ infrastructure.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
endif # NF_TABLES_BRIDGE
menuconfig BRIDGE_NF_EBTABLES
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 9b868861f21a..8e2c5759d964 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -3,8 +3,12 @@
# Makefile for the netfilter modules for Link Layer filtering on a bridge.
#
+obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o
obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o
+# connection tracking
+obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o
+
# packet logging
obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index eeae23a73c6a..ed91ea31978a 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -22,7 +22,7 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par)
const struct ebt_nat_info *info = par->targinfo;
struct net_device *dev;
- if (!skb_make_writable(skb, 0))
+ if (skb_ensure_writable(skb, ETH_ALEN))
return EBT_DROP;
ether_addr_copy(eth_hdr(skb)->h_dest, info->mac);
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 53ef08e6765f..0cad62a4052b 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -21,7 +21,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_redirect_info *info = par->targinfo;
- if (!skb_make_writable(skb, 0))
+ if (skb_ensure_writable(skb, ETH_ALEN))
return EBT_DROP;
if (xt_hooknum(par) != NF_BR_BROUTING)
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index 700d338d5ddb..27443bf229a3 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -22,7 +22,7 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_nat_info *info = par->targinfo;
- if (!skb_make_writable(skb, 0))
+ if (skb_ensure_writable(skb, ETH_ALEN * 2))
return EBT_DROP;
ether_addr_copy(eth_hdr(skb)->h_source, info->mac);
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
new file mode 100644
index 000000000000..4f5444d2a526
--- /dev/null
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -0,0 +1,435 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netfilter/nf_tables.h>
+
+#include "../br_private.h"
+
+/* Best effort variant of ip_do_fragment which preserves geometry, unless skbuff
+ * has been linearized or cloned.
+ */
+static int nf_br_ip_fragment(struct net *net, struct sock *sk,
+ struct sk_buff *skb,
+ struct nf_ct_bridge_frag_data *data,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_ct_bridge_frag_data *data,
+ struct sk_buff *))
+{
+ int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+ unsigned int hlen, ll_rs, mtu;
+ struct ip_frag_state state;
+ struct iphdr *iph;
+ int err;
+
+ /* for offloaded checksums cleanup checksum before fragmentation */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto blackhole;
+
+ iph = ip_hdr(skb);
+
+ /*
+ * Setup starting values
+ */
+
+ hlen = iph->ihl * 4;
+ frag_max_size -= hlen;
+ ll_rs = LL_RESERVED_SPACE(skb->dev);
+ mtu = skb->dev->mtu;
+
+ if (skb_has_frag_list(skb)) {
+ unsigned int first_len = skb_pagelen(skb);
+ struct ip_fraglist_iter iter;
+ struct sk_buff *frag;
+
+ if (first_len - hlen > mtu ||
+ skb_headroom(skb) < ll_rs)
+ goto blackhole;
+
+ if (skb_cloned(skb))
+ goto slow_path;
+
+ skb_walk_frags(skb, frag) {
+ if (frag->len > mtu ||
+ skb_headroom(frag) < hlen + ll_rs)
+ goto blackhole;
+
+ if (skb_shared(frag))
+ goto slow_path;
+ }
+
+ ip_fraglist_init(skb, iph, hlen, &iter);
+
+ for (;;) {
+ if (iter.frag)
+ ip_fraglist_prepare(skb, &iter);
+
+ err = output(net, sk, data, skb);
+ if (err || !iter.frag)
+ break;
+
+ skb = ip_fraglist_next(&iter);
+ }
+ return err;
+ }
+slow_path:
+ /* This is a linearized skbuff, the original geometry is lost for us.
+ * This may also be a clone skbuff, we could preserve the geometry for
+ * the copies but probably not worth the effort.
+ */
+ ip_frag_init(skb, hlen, ll_rs, frag_max_size, &state);
+
+ while (state.left > 0) {
+ struct sk_buff *skb2;
+
+ skb2 = ip_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
+ goto blackhole;
+ }
+
+ err = output(net, sk, data, skb2);
+ if (err)
+ goto blackhole;
+ }
+ consume_skb(skb);
+ return err;
+
+blackhole:
+ kfree_skb(skb);
+ return 0;
+}
+
+/* ip_defrag() expects IPCB() in place. */
+static void br_skb_cb_save(struct sk_buff *skb, struct br_input_skb_cb *cb,
+ size_t inet_skb_parm_size)
+{
+ memcpy(cb, skb->cb, sizeof(*cb));
+ memset(skb->cb, 0, inet_skb_parm_size);
+}
+
+static void br_skb_cb_restore(struct sk_buff *skb,
+ const struct br_input_skb_cb *cb,
+ u16 fragsz)
+{
+ memcpy(skb->cb, cb, sizeof(*cb));
+ BR_INPUT_SKB_CB(skb)->frag_max_size = fragsz;
+}
+
+static unsigned int nf_ct_br_defrag4(struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+ enum ip_conntrack_info ctinfo;
+ struct br_input_skb_cb cb;
+ const struct nf_conn *ct;
+ int err;
+
+ if (!ip_is_fragment(ip_hdr(skb)))
+ return NF_ACCEPT;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct)
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+ br_skb_cb_save(skb, &cb, sizeof(struct inet_skb_parm));
+ local_bh_disable();
+ err = ip_defrag(state->net, skb,
+ IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+ local_bh_enable();
+ if (!err) {
+ br_skb_cb_restore(skb, &cb, IPCB(skb)->frag_max_size);
+ skb->ignore_df = 1;
+ return NF_ACCEPT;
+ }
+
+ return NF_STOLEN;
+}
+
+static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+ enum ip_conntrack_info ctinfo;
+ struct br_input_skb_cb cb;
+ const struct nf_conn *ct;
+ int err;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct)
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+ br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm));
+
+ err = nf_ipv6_br_defrag(state->net, skb,
+ IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+ /* queued */
+ if (err == -EINPROGRESS)
+ return NF_STOLEN;
+
+ br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size);
+ return err == 0 ? NF_ACCEPT : NF_DROP;
+}
+
+static int nf_ct_br_ip_check(const struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ int nhoff, len;
+
+ nhoff = skb_network_offset(skb);
+ iph = ip_hdr(skb);
+ if (iph->ihl < 5 ||
+ iph->version != 4)
+ return -1;
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < nhoff + len ||
+ len < (iph->ihl * 4))
+ return -1;
+
+ return 0;
+}
+
+static int nf_ct_br_ipv6_check(const struct sk_buff *skb)
+{
+ const struct ipv6hdr *hdr;
+ int nhoff, len;
+
+ nhoff = skb_network_offset(skb);
+ hdr = ipv6_hdr(skb);
+ if (hdr->version != 6)
+ return -1;
+
+ len = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr) + nhoff;
+ if (skb->len < len)
+ return -1;
+
+ return 0;
+}
+
+static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_hook_state bridge_state = *state;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ u32 len;
+ int ret;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if ((ct && !nf_ct_is_template(ct)) ||
+ ctinfo == IP_CT_UNTRACKED)
+ return NF_ACCEPT;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return NF_ACCEPT;
+
+ len = ntohs(ip_hdr(skb)->tot_len);
+ if (pskb_trim_rcsum(skb, len))
+ return NF_ACCEPT;
+
+ if (nf_ct_br_ip_check(skb))
+ return NF_ACCEPT;
+
+ bridge_state.pf = NFPROTO_IPV4;
+ ret = nf_ct_br_defrag4(skb, &bridge_state);
+ break;
+ case htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ return NF_ACCEPT;
+
+ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+ if (pskb_trim_rcsum(skb, len))
+ return NF_ACCEPT;
+
+ if (nf_ct_br_ipv6_check(skb))
+ return NF_ACCEPT;
+
+ bridge_state.pf = NFPROTO_IPV6;
+ ret = nf_ct_br_defrag6(skb, &bridge_state);
+ break;
+ default:
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+ return NF_ACCEPT;
+ }
+
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ return nf_conntrack_in(skb, &bridge_state);
+}
+
+static void nf_ct_bridge_frag_save(struct sk_buff *skb,
+ struct nf_ct_bridge_frag_data *data)
+{
+ if (skb_vlan_tag_present(skb)) {
+ data->vlan_present = true;
+ data->vlan_tci = skb->vlan_tci;
+ data->vlan_proto = skb->vlan_proto;
+ } else {
+ data->vlan_present = false;
+ }
+ skb_copy_from_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN);
+}
+
+static unsigned int
+nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_ct_bridge_frag_data *data,
+ struct sk_buff *))
+{
+ struct nf_ct_bridge_frag_data data;
+
+ if (!BR_INPUT_SKB_CB(skb)->frag_max_size)
+ return NF_ACCEPT;
+
+ nf_ct_bridge_frag_save(skb, &data);
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ nf_br_ip_fragment(state->net, state->sk, skb, &data, output);
+ break;
+ case htons(ETH_P_IPV6):
+ nf_br_ip6_fragment(state->net, state->sk, skb, &data, output);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return NF_DROP;
+ }
+
+ return NF_STOLEN;
+}
+
+/* Actually only slow path refragmentation needs this. */
+static int nf_ct_bridge_frag_restore(struct sk_buff *skb,
+ const struct nf_ct_bridge_frag_data *data)
+{
+ int err;
+
+ err = skb_cow_head(skb, ETH_HLEN);
+ if (err) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ if (data->vlan_present)
+ __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci);
+ else if (skb_vlan_tag_present(skb))
+ __vlan_hwaccel_clear_tag(skb);
+
+ skb_copy_to_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN);
+ skb_reset_mac_header(skb);
+
+ return 0;
+}
+
+static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk,
+ const struct nf_ct_bridge_frag_data *data,
+ struct sk_buff *skb)
+{
+ int err;
+
+ err = nf_ct_bridge_frag_restore(skb, data);
+ if (err < 0)
+ return err;
+
+ return br_dev_queue_push_xmit(net, sk, skb);
+}
+
+static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ int protoff;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ return nf_conntrack_confirm(skb);
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ protoff = skb_network_offset(skb) + ip_hdrlen(skb);
+ break;
+ case htons(ETH_P_IPV6): {
+ unsigned char pnum = ipv6_hdr(skb)->nexthdr;
+ __be16 frag_off;
+
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
+ &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
+ return nf_conntrack_confirm(skb);
+ }
+ break;
+ default:
+ return NF_ACCEPT;
+ }
+ return nf_confirm(skb, protoff, ct, ctinfo);
+}
+
+static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ int ret;
+
+ ret = nf_ct_bridge_confirm(skb);
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ return nf_ct_bridge_refrag(skb, state, nf_ct_bridge_refrag_post);
+}
+
+static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = {
+ {
+ .hook = nf_ct_bridge_pre,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
+ {
+ .hook = nf_ct_bridge_post,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+};
+
+static struct nf_ct_bridge_info bridge_info = {
+ .ops = nf_ct_bridge_hook_ops,
+ .ops_size = ARRAY_SIZE(nf_ct_bridge_hook_ops),
+ .me = THIS_MODULE,
+};
+
+static int __init nf_conntrack_l3proto_bridge_init(void)
+{
+ nf_ct_bridge_register(&bridge_info);
+
+ return 0;
+}
+
+static void __exit nf_conntrack_l3proto_bridge_fini(void)
+{
+ nf_ct_bridge_unregister(&bridge_info);
+}
+
+module_init(nf_conntrack_l3proto_bridge_init);
+module_exit(nf_conntrack_l3proto_bridge_fini);
+
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_BRIDGE));
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
new file mode 100644
index 000000000000..bed66f536b34
--- /dev/null
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_meta.h>
+#include <linux/if_bridge.h>
+
+static const struct net_device *
+nft_meta_get_bridge(const struct net_device *dev)
+{
+ if (dev && netif_is_bridge_port(dev))
+ return netdev_master_upper_dev_get_rcu((struct net_device *)dev);
+
+ return NULL;
+}
+
+static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
+ u32 *dest = &regs->data[priv->dreg];
+ const struct net_device *br_dev;
+
+ switch (priv->key) {
+ case NFT_META_BRI_IIFNAME:
+ br_dev = nft_meta_get_bridge(in);
+ if (!br_dev)
+ goto err;
+ break;
+ case NFT_META_BRI_OIFNAME:
+ br_dev = nft_meta_get_bridge(out);
+ if (!br_dev)
+ goto err;
+ break;
+ case NFT_META_BRI_IIFPVID: {
+ u16 p_pvid;
+
+ br_dev = nft_meta_get_bridge(in);
+ if (!br_dev || !br_vlan_enabled(br_dev))
+ goto err;
+
+ br_vlan_get_pvid_rcu(in, &p_pvid);
+ nft_reg_store16(dest, p_pvid);
+ return;
+ }
+ case NFT_META_BRI_IIFVPROTO: {
+ u16 p_proto;
+
+ br_dev = nft_meta_get_bridge(in);
+ if (!br_dev || !br_vlan_enabled(br_dev))
+ goto err;
+
+ br_vlan_get_proto(br_dev, &p_proto);
+ nft_reg_store16(dest, p_proto);
+ return;
+ }
+ default:
+ goto out;
+ }
+
+ strncpy((char *)dest, br_dev->name, IFNAMSIZ);
+ return;
+out:
+ return nft_meta_get_eval(expr, regs, pkt);
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int len;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_BRI_IIFNAME:
+ case NFT_META_BRI_OIFNAME:
+ len = IFNAMSIZ;
+ break;
+ case NFT_META_BRI_IIFPVID:
+ case NFT_META_BRI_IIFVPROTO:
+ len = sizeof(u16);
+ break;
+ default:
+ return nft_meta_get_init(ctx, expr, tb);
+ }
+
+ priv->dreg = nft_parse_register(tb[NFTA_META_DREG]);
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
+}
+
+static struct nft_expr_type nft_meta_bridge_type;
+static const struct nft_expr_ops nft_meta_bridge_get_ops = {
+ .type = &nft_meta_bridge_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .eval = nft_meta_bridge_get_eval,
+ .init = nft_meta_bridge_get_init,
+ .dump = nft_meta_get_dump,
+};
+
+static const struct nft_expr_ops nft_meta_bridge_set_ops = {
+ .type = &nft_meta_bridge_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .eval = nft_meta_set_eval,
+ .init = nft_meta_set_init,
+ .destroy = nft_meta_set_destroy,
+ .dump = nft_meta_set_dump,
+ .validate = nft_meta_set_validate,
+};
+
+static const struct nft_expr_ops *
+nft_meta_bridge_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_META_KEY] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_META_DREG])
+ return &nft_meta_bridge_get_ops;
+
+ if (tb[NFTA_META_SREG])
+ return &nft_meta_bridge_set_ops;
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct nft_expr_type nft_meta_bridge_type __read_mostly = {
+ .family = NFPROTO_BRIDGE,
+ .name = "meta",
+ .select_ops = nft_meta_bridge_select_ops,
+ .policy = nft_meta_policy,
+ .maxattr = NFTA_META_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_meta_bridge_module_init(void)
+{
+ return nft_register_expr(&nft_meta_bridge_type);
+}
+
+static void __exit nft_meta_bridge_module_exit(void)
+{
+ nft_unregister_expr(&nft_meta_bridge_type);
+}
+
+module_init(nft_meta_bridge_module_init);
+module_exit(nft_meta_bridge_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("wenxu <wenxu@ucloud.cn>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta");
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index d1c4e1f3be2c..94c7f77ecb6b 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -627,6 +627,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
unsigned int i;
u32 nbuckets;
u64 cost;
+ int ret;
smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
if (!smap)
@@ -636,13 +637,21 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus())));
nbuckets = 1U << smap->bucket_log;
+ cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
+
+ ret = bpf_map_charge_init(&smap->map.memory, cost);
+ if (ret < 0) {
+ kfree(smap);
+ return ERR_PTR(ret);
+ }
+
smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
+ bpf_map_charge_finish(&smap->map.memory);
kfree(smap);
return ERR_PTR(-ENOMEM);
}
- cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
for (i = 0; i < nbuckets; i++) {
INIT_HLIST_HEAD(&smap->buckets[i].list);
@@ -652,7 +661,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) %
BPF_SK_STORAGE_CACHE_SIZE;
- smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
return &smap->map;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index d6edd218babd..fc676b2610e3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2900,12 +2900,10 @@ static void skb_warn_bad_offload(const struct sk_buff *skb)
else
name = netdev_name(dev);
}
- WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
- "gso_type=%d ip_summed=%d\n",
+ skb_dump(KERN_WARNING, skb, false);
+ WARN(1, "%s: caps=(%pNF, %pNF)\n",
name, dev ? &dev->features : &null_features,
- skb->sk ? &skb->sk->sk_route_caps : &null_features,
- skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
- skb_shinfo(skb)->gso_type, skb->ip_summed);
+ skb->sk ? &skb->sk->sk_route_caps : &null_features);
}
/*
@@ -3124,13 +3122,7 @@ void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
if (net_ratelimit()) {
pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
- if (dev)
- pr_err("dev features: %pNF\n", &dev->features);
- pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n",
- skb->len, skb->data_len, skb->pkt_type,
- skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type,
- skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum,
- skb->csum_complete_sw, skb->csum_valid, skb->csum_level);
+ skb_dump(KERN_ERR, skb, true);
dump_stack();
}
}
@@ -4689,9 +4681,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
__skb_push(skb, skb->mac_len);
skb_do_redirect(skb);
return NULL;
- case TC_ACT_REINSERT:
- /* this does not scrub the packet, and updates stats on error */
- skb_tc_reinsert(skb, &cl_res);
+ case TC_ACT_CONSUMED:
return NULL;
default:
break;
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 132f4b757963..4f40aeace902 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -17,6 +17,7 @@
#include <linux/netdevice.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
+#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>
#include <net/netlink.h>
#include <net/genetlink.h>
@@ -514,14 +515,31 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
return 0;
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
return -EMSGSIZE;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
+ if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_PF) {
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
+ attrs->pci_pf.pf))
+ return -EMSGSIZE;
+ } else if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_VF) {
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
+ attrs->pci_vf.pf) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER,
+ attrs->pci_vf.vf))
+ return -EMSGSIZE;
+ }
+ if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PHYSICAL &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA)
+ return 0;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER,
+ attrs->phys.port_number))
return -EMSGSIZE;
if (!attrs->split)
return 0;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number))
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+ attrs->phys.port_number))
return -EMSGSIZE;
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
- attrs->split_subport_number))
+ attrs->phys.split_subport_number))
return -EMSGSIZE;
return 0;
}
@@ -1548,7 +1566,8 @@ static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
u32 seq, int flags)
{
const struct devlink_ops *ops = devlink->ops;
- u8 inline_mode, encap_mode;
+ enum devlink_eswitch_encap_mode encap_mode;
+ u8 inline_mode;
void *hdr;
int err = 0;
u16 mode;
@@ -1624,7 +1643,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
{
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
- u8 inline_mode, encap_mode;
+ enum devlink_eswitch_encap_mode encap_mode;
+ u8 inline_mode;
int err = 0;
u16 mode;
@@ -2668,6 +2688,108 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
return devlink->ops->reload(devlink, info->extack);
}
+static int devlink_nl_flash_update_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ enum devlink_command cmd,
+ const char *status_msg,
+ const char *component,
+ unsigned long done, unsigned long total)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS)
+ goto out;
+
+ if (status_msg &&
+ nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
+ status_msg))
+ goto nla_put_failure;
+ if (component &&
+ nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
+ component))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
+ done, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
+ total, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+out:
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void __devlink_flash_update_notify(struct devlink *devlink,
+ enum devlink_command cmd,
+ const char *status_msg,
+ const char *component,
+ unsigned long done,
+ unsigned long total)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE &&
+ cmd != DEVLINK_CMD_FLASH_UPDATE_END &&
+ cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg,
+ component, done, total);
+ if (err)
+ goto out_free_msg;
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+ return;
+
+out_free_msg:
+ nlmsg_free(msg);
+}
+
+void devlink_flash_update_begin_notify(struct devlink *devlink)
+{
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE,
+ NULL, NULL, 0, 0);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify);
+
+void devlink_flash_update_end_notify(struct devlink *devlink)
+{
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_END,
+ NULL, NULL, 0, 0);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify);
+
+void devlink_flash_update_status_notify(struct devlink *devlink,
+ const char *status_msg,
+ const char *component,
+ unsigned long done,
+ unsigned long total)
+{
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_STATUS,
+ status_msg, component, done, total);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
+
static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
struct genl_info *info)
{
@@ -4415,6 +4537,35 @@ nla_put_failure:
return err;
}
+static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ enum devlink_command cmd)
+{
+ int index = cb->args[0];
+ int tmp_index = index;
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ err = devlink_fmsg_prepare_skb(fmsg, skb, &index);
+ if ((err && err != -EMSGSIZE) || tmp_index == index)
+ goto nla_put_failure;
+
+ cb->args[0] = index;
+ genlmsg_end(skb, hdr);
+ return skb->len;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return err;
+}
+
struct devlink_health_reporter {
struct list_head list;
void *priv;
@@ -4647,17 +4798,16 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
EXPORT_SYMBOL_GPL(devlink_health_report);
static struct devlink_health_reporter *
-devlink_health_reporter_get_from_info(struct devlink *devlink,
- struct genl_info *info)
+devlink_health_reporter_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
{
struct devlink_health_reporter *reporter;
char *reporter_name;
- if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
+ if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
return NULL;
- reporter_name =
- nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
+ reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
mutex_lock(&devlink->reporters_lock);
reporter = devlink_health_reporter_find_by_name(devlink, reporter_name);
if (reporter)
@@ -4666,6 +4816,48 @@ devlink_health_reporter_get_from_info(struct devlink *devlink,
return reporter;
}
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_health_reporter_get_from_attrs(devlink, info->attrs);
+}
+
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_cb(struct netlink_callback *cb)
+{
+ struct devlink_health_reporter *reporter;
+ struct devlink *devlink;
+ struct nlattr **attrs;
+ int err;
+
+ attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return NULL;
+
+ err = nlmsg_parse_deprecated(cb->nlh,
+ GENL_HDRLEN + devlink_nl_family.hdrsize,
+ attrs, DEVLINK_ATTR_MAX,
+ devlink_nl_family.policy, cb->extack);
+ if (err)
+ goto free;
+
+ mutex_lock(&devlink_mutex);
+ devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
+ if (IS_ERR(devlink))
+ goto unlock;
+
+ reporter = devlink_health_reporter_get_from_attrs(devlink, attrs);
+ mutex_unlock(&devlink_mutex);
+ kfree(attrs);
+ return reporter;
+unlock:
+ mutex_unlock(&devlink_mutex);
+free:
+ kfree(attrs);
+ return NULL;
+}
+
static void
devlink_health_reporter_put(struct devlink_health_reporter *reporter)
{
@@ -4901,32 +5093,40 @@ out:
return err;
}
-static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb,
- struct genl_info *info)
+static int
+devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
{
- struct devlink *devlink = info->user_ptr[0];
struct devlink_health_reporter *reporter;
+ u64 start = cb->args[0];
int err;
- reporter = devlink_health_reporter_get_from_info(devlink, info);
+ reporter = devlink_health_reporter_get_from_cb(cb);
if (!reporter)
return -EINVAL;
if (!reporter->ops->dump) {
- devlink_health_reporter_put(reporter);
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto out;
}
-
mutex_lock(&reporter->dump_lock);
- err = devlink_health_do_dump(reporter, NULL);
- if (err)
- goto out;
-
- err = devlink_fmsg_snd(reporter->dump_fmsg, info,
- DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, 0);
+ if (!start) {
+ err = devlink_health_do_dump(reporter, NULL);
+ if (err)
+ goto unlock;
+ cb->args[1] = reporter->dump_ts;
+ }
+ if (!reporter->dump_fmsg || cb->args[1] != reporter->dump_ts) {
+ NL_SET_ERR_MSG_MOD(cb->extack, "Dump trampled, please retry");
+ err = -EAGAIN;
+ goto unlock;
+ }
-out:
+ err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb,
+ DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET);
+unlock:
mutex_unlock(&reporter->dump_lock);
+out:
devlink_health_reporter_put(reporter);
return err;
}
@@ -5263,7 +5463,7 @@ static const struct genl_ops devlink_nl_ops[] = {
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .doit = devlink_nl_cmd_health_reporter_dump_get_doit,
+ .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
DEVLINK_NL_FLAG_NO_LOCK,
@@ -5386,6 +5586,38 @@ void devlink_free(struct devlink *devlink)
}
EXPORT_SYMBOL_GPL(devlink_free);
+static void devlink_port_type_warn(struct work_struct *work)
+{
+ WARN(true, "Type was not set for devlink port.");
+}
+
+static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
+{
+ /* Ignore CPU and DSA flavours. */
+ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA;
+}
+
+#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 30)
+
+static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port)
+{
+ if (!devlink_port_type_should_warn(devlink_port))
+ return;
+ /* Schedule a work to WARN in case driver does not set port
+ * type within timeout.
+ */
+ schedule_delayed_work(&devlink_port->type_warn_dw,
+ DEVLINK_PORT_TYPE_WARN_TIMEOUT);
+}
+
+static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
+{
+ if (!devlink_port_type_should_warn(devlink_port))
+ return;
+ cancel_delayed_work_sync(&devlink_port->type_warn_dw);
+}
+
/**
* devlink_port_register - Register devlink port
*
@@ -5415,6 +5647,8 @@ int devlink_port_register(struct devlink *devlink,
list_add_tail(&devlink_port->list, &devlink->port_list);
INIT_LIST_HEAD(&devlink_port->param_list);
mutex_unlock(&devlink->lock);
+ INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
+ devlink_port_type_warn_schedule(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
return 0;
}
@@ -5429,6 +5663,7 @@ void devlink_port_unregister(struct devlink_port *devlink_port)
{
struct devlink *devlink = devlink_port->devlink;
+ devlink_port_type_warn_cancel(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
mutex_lock(&devlink->lock);
list_del(&devlink_port->list);
@@ -5442,6 +5677,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
{
if (WARN_ON(!devlink_port->registered))
return;
+ devlink_port_type_warn_cancel(devlink_port);
spin_lock(&devlink_port->type_lock);
devlink_port->type = type;
devlink_port->type_dev = type_dev;
@@ -5515,9 +5751,33 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
void devlink_port_type_clear(struct devlink_port *devlink_port)
{
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
+ devlink_port_type_warn_schedule(devlink_port);
}
EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
+ enum devlink_port_flavour flavour,
+ const unsigned char *switch_id,
+ unsigned char switch_id_len)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ if (WARN_ON(devlink_port->registered))
+ return -EEXIST;
+ attrs->set = true;
+ attrs->flavour = flavour;
+ if (switch_id) {
+ attrs->switch_port = true;
+ if (WARN_ON(switch_id_len > MAX_PHYS_ITEM_ID_LEN))
+ switch_id_len = MAX_PHYS_ITEM_ID_LEN;
+ memcpy(attrs->switch_id.id, switch_id, switch_id_len);
+ attrs->switch_id.id_len = switch_id_len;
+ } else {
+ attrs->switch_port = false;
+ }
+ return 0;
+}
+
/**
* devlink_port_attrs_set - Set port attributes
*
@@ -5540,26 +5800,72 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
unsigned char switch_id_len)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int ret;
- if (WARN_ON(devlink_port->registered))
+ ret = __devlink_port_attrs_set(devlink_port, flavour,
+ switch_id, switch_id_len);
+ if (ret)
return;
- attrs->set = true;
- attrs->flavour = flavour;
- attrs->port_number = port_number;
attrs->split = split;
- attrs->split_subport_number = split_subport_number;
- if (switch_id) {
- attrs->switch_port = true;
- if (WARN_ON(switch_id_len > MAX_PHYS_ITEM_ID_LEN))
- switch_id_len = MAX_PHYS_ITEM_ID_LEN;
- memcpy(attrs->switch_id.id, switch_id, switch_id_len);
- attrs->switch_id.id_len = switch_id_len;
- } else {
- attrs->switch_port = false;
- }
+ attrs->phys.port_number = port_number;
+ attrs->phys.split_subport_number = split_subport_number;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
+/**
+ * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes
+ *
+ * @devlink_port: devlink port
+ * @pf: associated PF for the devlink port instance
+ * @switch_id: if the port is part of switch, this is buffer with ID,
+ * otherwise this is NULL
+ * @switch_id_len: length of the switch_id buffer
+ */
+void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port,
+ const unsigned char *switch_id,
+ unsigned char switch_id_len, u16 pf)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int ret;
+
+ ret = __devlink_port_attrs_set(devlink_port,
+ DEVLINK_PORT_FLAVOUR_PCI_PF,
+ switch_id, switch_id_len);
+ if (ret)
+ return;
+
+ attrs->pci_pf.pf = pf;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
+
+/**
+ * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes
+ *
+ * @devlink_port: devlink port
+ * @pf: associated PF for the devlink port instance
+ * @vf: associated VF of a PF for the devlink port instance
+ * @switch_id: if the port is part of switch, this is buffer with ID,
+ * otherwise this is NULL
+ * @switch_id_len: length of the switch_id buffer
+ */
+void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port,
+ const unsigned char *switch_id,
+ unsigned char switch_id_len,
+ u16 pf, u16 vf)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int ret;
+
+ ret = __devlink_port_attrs_set(devlink_port,
+ DEVLINK_PORT_FLAVOUR_PCI_VF,
+ switch_id, switch_id_len);
+ if (ret)
+ return;
+ attrs->pci_vf.pf = pf;
+ attrs->pci_vf.vf = vf;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);
+
static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
char *name, size_t len)
{
@@ -5572,10 +5878,11 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
switch (attrs->flavour) {
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
if (!attrs->split)
- n = snprintf(name, len, "p%u", attrs->port_number);
+ n = snprintf(name, len, "p%u", attrs->phys.port_number);
else
- n = snprintf(name, len, "p%us%u", attrs->port_number,
- attrs->split_subport_number);
+ n = snprintf(name, len, "p%us%u",
+ attrs->phys.port_number,
+ attrs->phys.split_subport_number);
break;
case DEVLINK_PORT_FLAVOUR_CPU:
case DEVLINK_PORT_FLAVOUR_DSA:
@@ -5584,6 +5891,13 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
*/
WARN_ON(1);
return -EINVAL;
+ case DEVLINK_PORT_FLAVOUR_PCI_PF:
+ n = snprintf(name, len, "pf%u", attrs->pci_pf.pf);
+ break;
+ case DEVLINK_PORT_FLAVOUR_PCI_VF:
+ n = snprintf(name, len, "pf%uvf%u",
+ attrs->pci_vf.pf, attrs->pci_vf.vf);
+ break;
}
if (n >= len)
diff --git a/net/core/dst.c b/net/core/dst.c
index e46366228eaf..1325316d9eab 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -160,7 +160,7 @@ void dst_dev_put(struct dst_entry *dst)
dst->ops->ifdown(dst, dev, true);
dst->input = dst_discard;
dst->output = dst_discard_out;
- dst->dev = dev_net(dst->dev)->loopback_dev;
+ dst->dev = blackhole_netdev;
dev_hold(dst->dev);
dev_put(dev);
}
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 4d1011b2e24f..6288e69e94fc 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -2883,6 +2883,30 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
match->mask.basic.n_proto = htons(0xffff);
switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) {
+ case ETHER_FLOW: {
+ const struct ethhdr *ether_spec, *ether_m_spec;
+
+ ether_spec = &fs->h_u.ether_spec;
+ ether_m_spec = &fs->m_u.ether_spec;
+
+ if (!is_zero_ether_addr(ether_m_spec->h_source)) {
+ ether_addr_copy(match->key.eth_addrs.src,
+ ether_spec->h_source);
+ ether_addr_copy(match->mask.eth_addrs.src,
+ ether_m_spec->h_source);
+ }
+ if (!is_zero_ether_addr(ether_m_spec->h_dest)) {
+ ether_addr_copy(match->key.eth_addrs.dst,
+ ether_spec->h_dest);
+ ether_addr_copy(match->mask.eth_addrs.dst,
+ ether_m_spec->h_dest);
+ }
+ if (ether_m_spec->h_proto) {
+ match->key.basic.n_proto = ether_spec->h_proto;
+ match->mask.basic.n_proto = ether_m_spec->h_proto;
+ }
+ }
+ break;
case TCP_V4_FLOW:
case UDP_V4_FLOW: {
const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec;
diff --git a/net/core/filter.c b/net/core/filter.c
index f615e42cf4ef..47f6386fb17a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -62,6 +62,7 @@
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/flow.h>
#include <net/arp.h>
#include <net/ipv6.h>
@@ -2157,8 +2158,8 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
if (unlikely(flags & ~(BPF_F_INGRESS)))
return TC_ACT_SHOT;
- ri->ifindex = ifindex;
ri->flags = flags;
+ ri->tgt_index = ifindex;
return TC_ACT_REDIRECT;
}
@@ -2168,8 +2169,8 @@ int skb_do_redirect(struct sk_buff *skb)
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct net_device *dev;
- dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
- ri->ifindex = 0;
+ dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
+ ri->tgt_index = 0;
if (unlikely(!dev)) {
kfree_skb(skb);
return -EINVAL;
@@ -3487,11 +3488,11 @@ xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
{
struct net_device *fwd;
- u32 index = ri->ifindex;
+ u32 index = ri->tgt_index;
int err;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
- ri->ifindex = 0;
+ ri->tgt_index = 0;
if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
@@ -3522,7 +3523,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
err = dev_map_enqueue(dst, xdp, dev_rx);
if (unlikely(err))
return err;
- __dev_map_insert_ctx(map, index);
break;
}
case BPF_MAP_TYPE_CPUMAP: {
@@ -3531,7 +3531,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
err = cpu_map_enqueue(rcpu, xdp, dev_rx);
if (unlikely(err))
return err;
- __cpu_map_insert_ctx(map, index);
break;
}
case BPF_MAP_TYPE_XSKMAP: {
@@ -3605,18 +3604,14 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog, struct bpf_map *map,
struct bpf_redirect_info *ri)
{
- u32 index = ri->ifindex;
- void *fwd = NULL;
+ u32 index = ri->tgt_index;
+ void *fwd = ri->tgt_value;
int err;
- ri->ifindex = 0;
+ ri->tgt_index = 0;
+ ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL);
- fwd = __xdp_map_lookup_elem(map, index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
xdp_do_flush_map();
@@ -3652,19 +3647,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
struct bpf_map *map)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- u32 index = ri->ifindex;
- void *fwd = NULL;
+ u32 index = ri->tgt_index;
+ void *fwd = ri->tgt_value;
int err = 0;
- ri->ifindex = 0;
+ ri->tgt_index = 0;
+ ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL);
- fwd = __xdp_map_lookup_elem(map, index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
-
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
struct bpf_dtab_netdev *dst = fwd;
@@ -3696,14 +3686,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map = READ_ONCE(ri->map);
- u32 index = ri->ifindex;
+ u32 index = ri->tgt_index;
struct net_device *fwd;
int err = 0;
if (map)
return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
map);
- ri->ifindex = 0;
+ ri->tgt_index = 0;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
if (unlikely(!fwd)) {
err = -EINVAL;
@@ -3731,8 +3721,9 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
if (unlikely(flags))
return XDP_ABORTED;
- ri->ifindex = ifindex;
ri->flags = flags;
+ ri->tgt_index = ifindex;
+ ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL);
return XDP_REDIRECT;
@@ -3751,11 +3742,23 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- if (unlikely(flags))
+ /* Lower bits of the flags are used as return code on lookup failure */
+ if (unlikely(flags > XDP_TX))
return XDP_ABORTED;
- ri->ifindex = ifindex;
+ ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
+ if (unlikely(!ri->tgt_value)) {
+ /* If the lookup fails we want to clear out the state in the
+ * redirect_info struct completely, so that if an eBPF program
+ * performs multiple lookups, the last one always takes
+ * precedence.
+ */
+ WRITE_ONCE(ri->map, NULL);
+ return flags;
+ }
+
ri->flags = flags;
+ ri->tgt_index = ifindex;
WRITE_ONCE(ri->map, map);
return XDP_REDIRECT;
@@ -4670,7 +4673,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (res.type != RTN_UNICAST)
return BPF_FIB_LKUP_RET_NOT_FWDED;
- if (res.fi->fib_nhs > 1)
+ if (fib_info_num_path(res.fi) > 1)
fib_select_path(net, &res, &fl4, NULL);
if (check_mtu) {
@@ -4737,7 +4740,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
return -ENODEV;
idev = __in6_dev_get_safely(dev);
- if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
+ if (unlikely(!idev || !idev->cnf.forwarding))
return BPF_FIB_LKUP_RET_FWD_DISABLED;
if (flags & BPF_FIB_LOOKUP_OUTPUT) {
@@ -5191,54 +5194,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
};
#endif /* CONFIG_IPV6_SEG6_BPF */
-#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT) \
-do { \
- switch (si->off) { \
- case offsetof(md_type, snd_cwnd): \
- CONVERT(snd_cwnd); break; \
- case offsetof(md_type, srtt_us): \
- CONVERT(srtt_us); break; \
- case offsetof(md_type, snd_ssthresh): \
- CONVERT(snd_ssthresh); break; \
- case offsetof(md_type, rcv_nxt): \
- CONVERT(rcv_nxt); break; \
- case offsetof(md_type, snd_nxt): \
- CONVERT(snd_nxt); break; \
- case offsetof(md_type, snd_una): \
- CONVERT(snd_una); break; \
- case offsetof(md_type, mss_cache): \
- CONVERT(mss_cache); break; \
- case offsetof(md_type, ecn_flags): \
- CONVERT(ecn_flags); break; \
- case offsetof(md_type, rate_delivered): \
- CONVERT(rate_delivered); break; \
- case offsetof(md_type, rate_interval_us): \
- CONVERT(rate_interval_us); break; \
- case offsetof(md_type, packets_out): \
- CONVERT(packets_out); break; \
- case offsetof(md_type, retrans_out): \
- CONVERT(retrans_out); break; \
- case offsetof(md_type, total_retrans): \
- CONVERT(total_retrans); break; \
- case offsetof(md_type, segs_in): \
- CONVERT(segs_in); break; \
- case offsetof(md_type, data_segs_in): \
- CONVERT(data_segs_in); break; \
- case offsetof(md_type, segs_out): \
- CONVERT(segs_out); break; \
- case offsetof(md_type, data_segs_out): \
- CONVERT(data_segs_out); break; \
- case offsetof(md_type, lost_out): \
- CONVERT(lost_out); break; \
- case offsetof(md_type, sacked_out): \
- CONVERT(sacked_out); break; \
- case offsetof(md_type, bytes_received): \
- CONVERT(bytes_received); break; \
- case offsetof(md_type, bytes_acked): \
- CONVERT(bytes_acked); break; \
- } \
-} while (0)
-
#ifdef CONFIG_INET
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
int dif, int sdif, u8 family, u8 proto)
@@ -5589,7 +5544,8 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
- if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked))
+ if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
+ icsk_retransmits))
return false;
if (off % size != 0)
@@ -5620,8 +5576,19 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
offsetof(struct tcp_sock, FIELD)); \
} while (0)
- CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock,
- BPF_TCP_SOCK_GET_COMMON);
+#define BPF_INET_SOCK_GET_COMMON(FIELD) \
+ do { \
+ BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock, \
+ FIELD) > \
+ FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct inet_connection_sock, \
+ FIELD), \
+ si->dst_reg, si->src_reg, \
+ offsetof( \
+ struct inet_connection_sock, \
+ FIELD)); \
+ } while (0)
if (insn > insn_buf)
return insn - insn_buf;
@@ -5637,6 +5604,81 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
offsetof(struct tcp_sock, rtt_min) +
offsetof(struct minmax_sample, v));
break;
+ case offsetof(struct bpf_tcp_sock, snd_cwnd):
+ BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
+ break;
+ case offsetof(struct bpf_tcp_sock, srtt_us):
+ BPF_TCP_SOCK_GET_COMMON(srtt_us);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_ssthresh):
+ BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
+ break;
+ case offsetof(struct bpf_tcp_sock, rcv_nxt):
+ BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_nxt):
+ BPF_TCP_SOCK_GET_COMMON(snd_nxt);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_una):
+ BPF_TCP_SOCK_GET_COMMON(snd_una);
+ break;
+ case offsetof(struct bpf_tcp_sock, mss_cache):
+ BPF_TCP_SOCK_GET_COMMON(mss_cache);
+ break;
+ case offsetof(struct bpf_tcp_sock, ecn_flags):
+ BPF_TCP_SOCK_GET_COMMON(ecn_flags);
+ break;
+ case offsetof(struct bpf_tcp_sock, rate_delivered):
+ BPF_TCP_SOCK_GET_COMMON(rate_delivered);
+ break;
+ case offsetof(struct bpf_tcp_sock, rate_interval_us):
+ BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
+ break;
+ case offsetof(struct bpf_tcp_sock, packets_out):
+ BPF_TCP_SOCK_GET_COMMON(packets_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, retrans_out):
+ BPF_TCP_SOCK_GET_COMMON(retrans_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, total_retrans):
+ BPF_TCP_SOCK_GET_COMMON(total_retrans);
+ break;
+ case offsetof(struct bpf_tcp_sock, segs_in):
+ BPF_TCP_SOCK_GET_COMMON(segs_in);
+ break;
+ case offsetof(struct bpf_tcp_sock, data_segs_in):
+ BPF_TCP_SOCK_GET_COMMON(data_segs_in);
+ break;
+ case offsetof(struct bpf_tcp_sock, segs_out):
+ BPF_TCP_SOCK_GET_COMMON(segs_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, data_segs_out):
+ BPF_TCP_SOCK_GET_COMMON(data_segs_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, lost_out):
+ BPF_TCP_SOCK_GET_COMMON(lost_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, sacked_out):
+ BPF_TCP_SOCK_GET_COMMON(sacked_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, bytes_received):
+ BPF_TCP_SOCK_GET_COMMON(bytes_received);
+ break;
+ case offsetof(struct bpf_tcp_sock, bytes_acked):
+ BPF_TCP_SOCK_GET_COMMON(bytes_acked);
+ break;
+ case offsetof(struct bpf_tcp_sock, dsack_dups):
+ BPF_TCP_SOCK_GET_COMMON(dsack_dups);
+ break;
+ case offsetof(struct bpf_tcp_sock, delivered):
+ BPF_TCP_SOCK_GET_COMMON(delivered);
+ break;
+ case offsetof(struct bpf_tcp_sock, delivered_ce):
+ BPF_TCP_SOCK_GET_COMMON(delivered_ce);
+ break;
+ case offsetof(struct bpf_tcp_sock, icsk_retransmits):
+ BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
+ break;
}
return insn - insn_buf;
@@ -5650,7 +5692,7 @@ BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
return (unsigned long)NULL;
}
-static const struct bpf_func_proto bpf_tcp_sock_proto = {
+const struct bpf_func_proto bpf_tcp_sock_proto = {
.func = bpf_tcp_sock,
.gpl_only = false,
.ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL,
@@ -5694,6 +5736,46 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
return INET_ECN_set_ce(skb);
}
+bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ switch (off) {
+ default:
+ return size == sizeof(__u32);
+ }
+}
+
+u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+#define BPF_XDP_SOCK_GET(FIELD) \
+ do { \
+ BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) > \
+ FIELD_SIZEOF(struct bpf_xdp_sock, FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct xdp_sock, FIELD)); \
+ } while (0)
+
+ switch (si->off) {
+ case offsetof(struct bpf_xdp_sock, queue_id):
+ BPF_XDP_SOCK_GET(queue_id);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
.func = bpf_skb_ecn_set_ce,
.gpl_only = false,
@@ -5896,6 +5978,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skc_lookup_tcp:
return &bpf_sock_addr_skc_lookup_tcp_proto;
#endif /* CONFIG_INET */
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -5933,6 +6019,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case BPF_FUNC_skb_cgroup_id:
+ return &bpf_skb_cgroup_id_proto;
+#endif
#ifdef CONFIG_INET
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
@@ -6113,6 +6203,14 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_local_storage_proto;
case BPF_FUNC_perf_event_output:
return &bpf_sockopt_event_output_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+#endif /* CONFIG_INET */
default:
return bpf_base_func_proto(func_id);
}
@@ -6792,6 +6890,16 @@ static bool sock_addr_is_valid_access(int off, int size,
if (!bpf_ctx_narrow_access_ok(off, size, size_default))
return false;
} else {
+ if (bpf_ctx_wide_store_ok(off, size,
+ struct bpf_sock_addr,
+ user_ip6))
+ return true;
+
+ if (bpf_ctx_wide_store_ok(off, size,
+ struct bpf_sock_addr,
+ msg_src_ip6))
+ return true;
+
if (size != size_default)
return false;
}
@@ -6800,6 +6908,13 @@ static bool sock_addr_is_valid_access(int off, int size,
if (size != size_default)
return false;
break;
+ case offsetof(struct bpf_sock_addr, sk):
+ if (type != BPF_READ)
+ return false;
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
default:
if (type == BPF_READ) {
if (size != size_default)
@@ -6843,6 +6958,11 @@ static bool sock_ops_is_valid_access(int off, int size,
if (size != sizeof(__u64))
return false;
break;
+ case offsetof(struct bpf_sock_ops, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET_OR_NULL;
+ break;
default:
if (size != size_default)
return false;
@@ -7620,9 +7740,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
* SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
*
- * It doesn't support SIZE argument though since narrow stores are not
- * supported for now.
- *
* In addition it uses Temporary Field TF (member of struct S) as the 3rd
* "register" since two registers available in convert_ctx_access are not
* enough: we can't override neither SRC, since it contains value to store, nor
@@ -7630,7 +7747,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
* instructions. But we need a temporary place to save pointer to nested
* structure whose field we want to store to.
*/
-#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \
+#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \
do { \
int tmp_reg = BPF_REG_9; \
if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
@@ -7641,8 +7758,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
offsetof(S, TF)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \
si->dst_reg, offsetof(S, F)); \
- *insn++ = BPF_STX_MEM( \
- BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \
+ *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg, \
bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \
target_size) \
+ OFF); \
@@ -7654,8 +7770,8 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
TF) \
do { \
if (type == BPF_WRITE) { \
- SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \
- TF); \
+ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \
+ OFF, TF); \
} else { \
SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \
S, NS, F, NF, SIZE, OFF); \
@@ -7750,6 +7866,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
break;
+ case offsetof(struct bpf_sock_addr, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_addr_kern, sk));
+ break;
}
return insn - insn_buf;
@@ -7837,9 +7958,6 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
} while (0)
- CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops,
- SOCK_OPS_GET_TCP_SOCK_FIELD);
-
if (insn > insn_buf)
return insn - insn_buf;
@@ -8009,6 +8127,82 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
struct sock, type);
break;
+ case offsetof(struct bpf_sock_ops, snd_cwnd):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
+ break;
+ case offsetof(struct bpf_sock_ops, srtt_us):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_ssthresh):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
+ break;
+ case offsetof(struct bpf_sock_ops, rcv_nxt):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_nxt):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_una):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
+ break;
+ case offsetof(struct bpf_sock_ops, mss_cache):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
+ break;
+ case offsetof(struct bpf_sock_ops, ecn_flags):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
+ break;
+ case offsetof(struct bpf_sock_ops, rate_delivered):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
+ break;
+ case offsetof(struct bpf_sock_ops, rate_interval_us):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
+ break;
+ case offsetof(struct bpf_sock_ops, packets_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
+ break;
+ case offsetof(struct bpf_sock_ops, retrans_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
+ break;
+ case offsetof(struct bpf_sock_ops, total_retrans):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
+ break;
+ case offsetof(struct bpf_sock_ops, segs_in):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
+ break;
+ case offsetof(struct bpf_sock_ops, data_segs_in):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
+ break;
+ case offsetof(struct bpf_sock_ops, segs_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
+ break;
+ case offsetof(struct bpf_sock_ops, data_segs_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
+ break;
+ case offsetof(struct bpf_sock_ops, lost_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
+ break;
+ case offsetof(struct bpf_sock_ops, sacked_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
+ break;
+ case offsetof(struct bpf_sock_ops, bytes_received):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
+ break;
+ case offsetof(struct bpf_sock_ops, bytes_acked):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
+ break;
+ case offsetof(struct bpf_sock_ops, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern,
+ is_fullsock),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ is_fullsock));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ break;
}
return insn - insn_buf;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index edd622956083..3e6fedb57bc1 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -27,6 +27,10 @@
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
#include <linux/bpf.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#endif
static DEFINE_MUTEX(flow_dissector_mutex);
@@ -199,6 +203,22 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
}
EXPORT_SYMBOL(__skb_flow_get_ports);
+void skb_flow_dissect_meta(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_meta *meta;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META))
+ return;
+
+ meta = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_META,
+ target_container);
+ meta->ingress_ifindex = skb->skb_iif;
+}
+EXPORT_SYMBOL(skb_flow_dissect_meta);
+
static void
skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
struct flow_dissector *flow_dissector,
@@ -216,6 +236,46 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
}
void
+skb_flow_dissect_ct(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container,
+ u16 *ctinfo_map,
+ size_t mapsize)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ struct flow_dissector_key_ct *key;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_labels *cl;
+ struct nf_conn *ct;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT))
+ return;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return;
+
+ key = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_CT,
+ target_container);
+
+ if (ctinfo < mapsize)
+ key->ct_state = ctinfo_map[ctinfo];
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
+ key->ct_zone = ct->zone.id;
+#endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ key->ct_mark = ct->mark;
+#endif
+
+ cl = nf_ct_labels_find(ct);
+ if (cl)
+ memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels));
+#endif /* CONFIG_NF_CONNTRACK */
+}
+EXPORT_SYMBOL(skb_flow_dissect_ct);
+
+void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container)
@@ -757,7 +817,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
* @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
* @hlen: packet header length, if @data is NULL use skb_headlen(skb)
* @flags: flags that control the dissection process, e.g.
- * FLOW_DISSECTOR_F_STOP_AT_L3.
+ * FLOW_DISSECTOR_F_STOP_AT_ENCAP.
*
* The function will try to retrieve individual keys into target specified
* by flow_dissector from either the skbuff or a raw buffer specified by the
@@ -922,11 +982,6 @@ proto_again:
__skb_flow_dissect_ipv4(skb, flow_dissector,
target_container, data, iph);
- if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) {
- fdret = FLOW_DISSECT_RET_OUT_GOOD;
- break;
- }
-
break;
}
case htons(ETH_P_IPV6): {
@@ -975,9 +1030,6 @@ proto_again:
__skb_flow_dissect_ipv6(skb, flow_dissector,
target_container, data, iph);
- if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
- fdret = FLOW_DISSECT_RET_OUT_GOOD;
-
break;
}
case htons(ETH_P_8021AD):
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 5ce7d47a960e..76f8db3841d7 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -7,8 +7,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions)
{
struct flow_rule *rule;
- rule = kzalloc(sizeof(struct flow_rule) +
- sizeof(struct flow_action_entry) * num_actions,
+ rule = kzalloc(struct_size(rule, action.entries, num_actions),
GFP_KERNEL);
if (!rule)
return NULL;
@@ -26,6 +25,13 @@ EXPORT_SYMBOL(flow_rule_alloc);
(__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \
(__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \
+void flow_rule_match_meta(const struct flow_rule *rule,
+ struct flow_match_meta *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out);
+}
+EXPORT_SYMBOL(flow_rule_match_meta);
+
void flow_rule_match_basic(const struct flow_rule *rule,
struct flow_match_basic *out)
{
@@ -158,3 +164,121 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule,
FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_OPTS, out);
}
EXPORT_SYMBOL(flow_rule_match_enc_opts);
+
+struct flow_block_cb *flow_block_cb_alloc(struct net *net, tc_setup_cb_t *cb,
+ void *cb_ident, void *cb_priv,
+ void (*release)(void *cb_priv))
+{
+ struct flow_block_cb *block_cb;
+
+ block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
+ if (!block_cb)
+ return ERR_PTR(-ENOMEM);
+
+ block_cb->net = net;
+ block_cb->cb = cb;
+ block_cb->cb_ident = cb_ident;
+ block_cb->cb_priv = cb_priv;
+ block_cb->release = release;
+
+ return block_cb;
+}
+EXPORT_SYMBOL(flow_block_cb_alloc);
+
+void flow_block_cb_free(struct flow_block_cb *block_cb)
+{
+ if (block_cb->release)
+ block_cb->release(block_cb->cb_priv);
+
+ kfree(block_cb);
+}
+EXPORT_SYMBOL(flow_block_cb_free);
+
+struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *f,
+ tc_setup_cb_t *cb, void *cb_ident)
+{
+ struct flow_block_cb *block_cb;
+
+ list_for_each_entry(block_cb, f->driver_block_list, driver_list) {
+ if (block_cb->net == f->net &&
+ block_cb->cb == cb &&
+ block_cb->cb_ident == cb_ident)
+ return block_cb;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(flow_block_cb_lookup);
+
+void *flow_block_cb_priv(struct flow_block_cb *block_cb)
+{
+ return block_cb->cb_priv;
+}
+EXPORT_SYMBOL(flow_block_cb_priv);
+
+void flow_block_cb_incref(struct flow_block_cb *block_cb)
+{
+ block_cb->refcnt++;
+}
+EXPORT_SYMBOL(flow_block_cb_incref);
+
+unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb)
+{
+ return --block_cb->refcnt;
+}
+EXPORT_SYMBOL(flow_block_cb_decref);
+
+bool flow_block_cb_is_busy(tc_setup_cb_t *cb, void *cb_ident,
+ struct list_head *driver_block_list)
+{
+ struct flow_block_cb *block_cb;
+
+ list_for_each_entry(block_cb, driver_block_list, driver_list) {
+ if (block_cb->cb == cb &&
+ block_cb->cb_ident == cb_ident)
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(flow_block_cb_is_busy);
+
+int flow_block_cb_setup_simple(struct flow_block_offload *f,
+ struct list_head *driver_block_list,
+ tc_setup_cb_t *cb, void *cb_ident, void *cb_priv,
+ bool ingress_only)
+{
+ struct flow_block_cb *block_cb;
+
+ if (ingress_only &&
+ f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+ return -EOPNOTSUPP;
+
+ f->driver_block_list = driver_block_list;
+
+ switch (f->command) {
+ case FLOW_BLOCK_BIND:
+ if (flow_block_cb_is_busy(cb, cb_ident, driver_block_list))
+ return -EBUSY;
+
+ block_cb = flow_block_cb_alloc(f->net, cb, cb_ident,
+ cb_priv, NULL);
+ if (IS_ERR(block_cb))
+ return PTR_ERR(block_cb);
+
+ flow_block_cb_add(block_cb, f);
+ list_add_tail(&block_cb->driver_list, driver_block_list);
+ return 0;
+ case FLOW_BLOCK_UNBIND:
+ block_cb = flow_block_cb_lookup(f, cb, cb_ident);
+ if (!block_cb)
+ return -ENOENT;
+
+ flow_block_cb_remove(block_cb, f);
+ list_del(&block_cb->driver_list);
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+EXPORT_SYMBOL(flow_block_cb_setup_simple);
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
index fd822ca5a245..ac1a66df9adc 100644
--- a/net/core/hwbm.c
+++ b/net/core/hwbm.c
@@ -43,34 +43,33 @@ int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(hwbm_pool_refill);
-int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num)
{
int err, i;
- unsigned long flags;
- spin_lock_irqsave(&bm_pool->lock, flags);
+ mutex_lock(&bm_pool->buf_lock);
if (bm_pool->buf_num == bm_pool->size) {
pr_warn("pool already filled\n");
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return bm_pool->buf_num;
}
if (buf_num + bm_pool->buf_num > bm_pool->size) {
pr_warn("cannot allocate %d buffers for pool\n",
buf_num);
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return 0;
}
if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
buf_num, bm_pool->buf_num);
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return 0;
}
for (i = 0; i < buf_num; i++) {
- err = hwbm_pool_refill(bm_pool, gfp);
+ err = hwbm_pool_refill(bm_pool, GFP_KERNEL);
if (err < 0)
break;
}
@@ -79,7 +78,7 @@ int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
bm_pool->buf_num += i;
pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num);
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return i;
}
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 04fdc9535772..f153e0601838 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -163,9 +163,16 @@ static void linkwatch_do_dev(struct net_device *dev)
static void __linkwatch_run_queue(int urgent_only)
{
+#define MAX_DO_DEV_PER_LOOP 100
+
+ int do_dev = MAX_DO_DEV_PER_LOOP;
struct net_device *dev;
LIST_HEAD(wrk);
+ /* Give urgent case more budget */
+ if (urgent_only)
+ do_dev += MAX_DO_DEV_PER_LOOP;
+
/*
* Limit the number of linkwatch events to one
* per second so that a runaway driver does not
@@ -184,7 +191,7 @@ static void __linkwatch_run_queue(int urgent_only)
spin_lock_irq(&lweventlist_lock);
list_splice_init(&lweventlist, &wrk);
- while (!list_empty(&wrk)) {
+ while (!list_empty(&wrk) && do_dev > 0) {
dev = list_first_entry(&wrk, struct net_device, link_watch_list);
list_del_init(&dev->link_watch_list);
@@ -195,9 +202,13 @@ static void __linkwatch_run_queue(int urgent_only)
}
spin_unlock_irq(&lweventlist_lock);
linkwatch_do_dev(dev);
+ do_dev--;
spin_lock_irq(&lweventlist_lock);
}
+ /* Add the remaining work back to lweventlist */
+ list_splice_init(&wrk, &lweventlist);
+
if (!list_empty(&lweventlist))
linkwatch_schedule_work(0);
spin_unlock_irq(&lweventlist_lock);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 9e7fc929bc50..742cea4ce72e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -583,6 +583,8 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl,
int error;
struct neigh_hash_table *nht;
+ trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc);
+
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 470b179d599e..283ddb2dbc7d 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -43,6 +43,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
#endif
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+#include <trace/events/page_pool.h>
+#endif
+
#include <trace/events/neigh.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index f7b6dda798e0..a0e0d298c991 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -152,6 +152,17 @@ static void ops_free(const struct pernet_operations *ops, struct net *net)
}
}
+static void ops_pre_exit_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ if (ops->pre_exit) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops->pre_exit(net);
+ }
+}
+
static void ops_exit_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
@@ -337,6 +348,12 @@ out_undo:
list_add(&net->exit_list, &net_exit_list);
saved_ops = ops;
list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+ ops_pre_exit_list(ops, &net_exit_list);
+
+ synchronize_rcu();
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list);
ops = saved_ops;
@@ -560,10 +577,15 @@ static void cleanup_net(struct work_struct *work)
list_add_tail(&net->exit_list, &net_exit_list);
}
+ /* Run all of the network namespace pre_exit methods */
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_pre_exit_list(ops, &net_exit_list);
+
/*
* Another CPU might be rcu-iterating the list, wait for it.
* This needs to be before calling the exit() notifiers, so
* the rcu_barrier() below isn't sufficient alone.
+ * Also the pre_exit() and exit() methods need this barrier.
*/
synchronize_rcu();
@@ -1121,6 +1143,8 @@ static int __register_pernet_operations(struct list_head *list,
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
return error;
@@ -1135,6 +1159,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
/* See comment in __register_pernet_operations() */
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
}
@@ -1159,6 +1185,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
} else {
LIST_HEAD(net_exit_list);
list_add(&init_net.exit_list, &net_exit_list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index dd8b1a460d64..2cf27da1baeb 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -696,16 +696,22 @@ int netpoll_setup(struct netpoll *np)
if (!np->local_ip.ip) {
if (!np->ipv6) {
+ const struct in_ifaddr *ifa;
+
in_dev = __in_dev_get_rtnl(ndev);
+ if (!in_dev)
+ goto put_noaddr;
- if (!in_dev || !in_dev->ifa_list) {
+ ifa = rtnl_dereference(in_dev->ifa_list);
+ if (!ifa) {
+put_noaddr:
np_err(np, "no IP address for %s, aborting\n",
np->dev_name);
err = -EDESTADDRREQ;
goto put;
}
- np->local_ip.ip = in_dev->ifa_list->ifa_local;
+ np->local_ip.ip = ifa->ifa_local;
np_info(np, "local IP %pI4\n", &np->local_ip.ip);
} else {
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5b2252c6d49b..3272dc7a8c81 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -4,9 +4,11 @@
* Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
* Copyright (C) 2016 Red Hat, Inc.
*/
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/device.h>
#include <net/page_pool.h>
#include <linux/dma-direction.h>
@@ -14,6 +16,8 @@
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */
+#include <trace/events/page_pool.h>
+
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params)
{
@@ -43,6 +47,14 @@ static int page_pool_init(struct page_pool *pool,
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
return -ENOMEM;
+ atomic_set(&pool->pages_state_release_cnt, 0);
+
+ /* Driver calling page_pool_create() also call page_pool_destroy() */
+ refcount_set(&pool->user_cnt, 1);
+
+ if (pool->p.flags & PP_FLAG_DMA_MAP)
+ get_device(pool->p.dev);
+
return 0;
}
@@ -61,6 +73,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
kfree(pool);
return ERR_PTR(err);
}
+
return pool;
}
EXPORT_SYMBOL(page_pool_create);
@@ -151,6 +164,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
page->dma_addr = dma;
skip_dma_map:
+ /* Track how many pages are held 'in-flight' */
+ pool->pages_state_hold_cnt++;
+
+ trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+
/* When page just alloc'ed is should/must have refcnt 1. */
return page;
}
@@ -173,6 +191,33 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
}
EXPORT_SYMBOL(page_pool_alloc_pages);
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b) (s32)((a) - (b))
+
+static s32 page_pool_inflight(struct page_pool *pool)
+{
+ u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+ u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+ s32 distance;
+
+ distance = _distance(hold_cnt, release_cnt);
+
+ trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
+ return distance;
+}
+
+static bool __page_pool_safe_to_destroy(struct page_pool *pool)
+{
+ s32 inflight = page_pool_inflight(pool);
+
+ /* The distance should not be able to become negative */
+ WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+
+ return (inflight == 0);
+}
+
/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
struct page *page)
@@ -180,7 +225,7 @@ static void __page_pool_clean_page(struct page_pool *pool,
dma_addr_t dma;
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
- return;
+ goto skip_dma_unmap;
dma = page->dma_addr;
/* DMA unmap */
@@ -188,12 +233,27 @@ static void __page_pool_clean_page(struct page_pool *pool,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC);
page->dma_addr = 0;
+skip_dma_unmap:
+ atomic_inc(&pool->pages_state_release_cnt);
+ trace_page_pool_state_release(pool, page,
+ atomic_read(&pool->pages_state_release_cnt));
+}
+
+/* unmap the page and clean our state */
+void page_pool_unmap_page(struct page_pool *pool, struct page *page)
+{
+ /* When page is unmapped, this implies page will not be
+ * returned to page_pool.
+ */
+ __page_pool_clean_page(pool, page);
}
+EXPORT_SYMBOL(page_pool_unmap_page);
/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
__page_pool_clean_page(pool, page);
+
put_page(page);
/* An optimization would be to call __free_pages(page, pool->p.order)
* knowing page is not part of page-cache (thus avoiding a
@@ -285,21 +345,45 @@ static void __page_pool_empty_ring(struct page_pool *pool)
}
}
-static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+static void __warn_in_flight(struct page_pool *pool)
{
- struct page_pool *pool;
+ u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+ u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+ s32 distance;
- pool = container_of(rcu, struct page_pool, rcu);
+ distance = _distance(hold_cnt, release_cnt);
+
+ /* Drivers should fix this, but only problematic when DMA is used */
+ WARN(1, "Still in-flight pages:%d hold:%u released:%u",
+ distance, hold_cnt, release_cnt);
+}
+
+void __page_pool_free(struct page_pool *pool)
+{
+ /* Only last user actually free/release resources */
+ if (!page_pool_put(pool))
+ return;
WARN(pool->alloc.count, "API usage violation");
+ WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
+
+ /* Can happen due to forced shutdown */
+ if (!__page_pool_safe_to_destroy(pool))
+ __warn_in_flight(pool);
- __page_pool_empty_ring(pool);
ptr_ring_cleanup(&pool->ring, NULL);
+
+ if (pool->p.flags & PP_FLAG_DMA_MAP)
+ put_device(pool->p.dev);
+
kfree(pool);
}
+EXPORT_SYMBOL(__page_pool_free);
-/* Cleanup and release resources */
-void page_pool_destroy(struct page_pool *pool)
+/* Request to shutdown: release pages cached by page_pool, and check
+ * for in-flight pages
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool)
{
struct page *page;
@@ -317,7 +401,6 @@ void page_pool_destroy(struct page_pool *pool)
*/
__page_pool_empty_ring(pool);
- /* An xdp_mem_allocator can still ref page_pool pointer */
- call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+ return __page_pool_safe_to_destroy(pool);
}
-EXPORT_SYMBOL(page_pool_destroy);
+EXPORT_SYMBOL(__page_pool_request_shutdown);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f975c5e2a369..bb9915291644 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2118,9 +2118,11 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
rcu_read_lock();
in_dev = __in_dev_get_rcu(pkt_dev->odev);
if (in_dev) {
- if (in_dev->ifa_list) {
- pkt_dev->saddr_min =
- in_dev->ifa_list->ifa_address;
+ const struct in_ifaddr *ifa;
+
+ ifa = rcu_dereference(in_dev->ifa_list);
+ if (ifa) {
+ pkt_dev->saddr_min = ifa->ifa_address;
pkt_dev->saddr_max = pkt_dev->saddr_min;
}
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index cec60583931f..1ee6460f8275 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -751,6 +751,10 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
struct nlattr *mx;
int i, valid = 0;
+ /* nothing is dumped for dst_default_metrics, so just skip the loop */
+ if (metrics == dst_default_metrics.metrics)
+ return 0;
+
mx = nla_nest_start_noflag(skb, RTA_METRICS);
if (mx == NULL)
return -ENOBUFS;
@@ -908,6 +912,7 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
size += num_vfs *
(nla_total_size(0) +
nla_total_size(sizeof(struct ifla_vf_mac)) +
+ nla_total_size(sizeof(struct ifla_vf_broadcast)) +
nla_total_size(sizeof(struct ifla_vf_vlan)) +
nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */
nla_total_size(MAX_VLAN_LIST_LEN *
@@ -1197,6 +1202,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct ifla_vf_vlan vf_vlan;
struct ifla_vf_rate vf_rate;
struct ifla_vf_mac vf_mac;
+ struct ifla_vf_broadcast vf_broadcast;
struct ifla_vf_info ivi;
memset(&ivi, 0, sizeof(ivi));
@@ -1231,6 +1237,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
vf_trust.vf = ivi.vf;
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len);
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
vf_vlan_info.vlan = ivi.vlan;
@@ -1247,6 +1254,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
if (!vf)
goto nla_put_vfinfo_failure;
if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
+ nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) ||
nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
&vf_rate) ||
@@ -1753,6 +1761,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
+ [IFLA_VF_BROADCAST] = { .type = NLA_REJECT },
[IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
[IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED },
[IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c8cd99c3603f..6f1e31f674a3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -59,6 +59,7 @@
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
+#include <linux/mpls.h>
#include <net/protocol.h>
#include <net/dst.h>
@@ -66,12 +67,14 @@
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
+#include <net/mpls.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
+#include <linux/indirect_call_wrapper.h>
#include "datagram.h"
@@ -365,18 +368,20 @@ struct napi_alloc_cache {
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
-static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
- struct page_frag_cache *nc;
- unsigned long flags;
- void *data;
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- local_irq_save(flags);
- nc = this_cpu_ptr(&netdev_alloc_cache);
- data = page_frag_alloc(nc, fragsz, gfp_mask);
- local_irq_restore(flags);
- return data;
+ return page_frag_alloc(&nc->page, fragsz, gfp_mask);
+}
+
+void *napi_alloc_frag(unsigned int fragsz)
+{
+ fragsz = SKB_DATA_ALIGN(fragsz);
+
+ return __napi_alloc_frag(fragsz, GFP_ATOMIC);
}
+EXPORT_SYMBOL(napi_alloc_frag);
/**
* netdev_alloc_frag - allocate a page fragment
@@ -387,26 +392,21 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
*/
void *netdev_alloc_frag(unsigned int fragsz)
{
- fragsz = SKB_DATA_ALIGN(fragsz);
-
- return __netdev_alloc_frag(fragsz, GFP_ATOMIC);
-}
-EXPORT_SYMBOL(netdev_alloc_frag);
-
-static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
-{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
-
- return page_frag_alloc(&nc->page, fragsz, gfp_mask);
-}
+ struct page_frag_cache *nc;
+ void *data;
-void *napi_alloc_frag(unsigned int fragsz)
-{
fragsz = SKB_DATA_ALIGN(fragsz);
-
- return __napi_alloc_frag(fragsz, GFP_ATOMIC);
+ if (in_irq() || irqs_disabled()) {
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
+ } else {
+ local_bh_disable();
+ data = __napi_alloc_frag(fragsz, GFP_ATOMIC);
+ local_bh_enable();
+ }
+ return data;
}
-EXPORT_SYMBOL(napi_alloc_frag);
+EXPORT_SYMBOL(netdev_alloc_frag);
/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
@@ -425,7 +425,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
gfp_t gfp_mask)
{
struct page_frag_cache *nc;
- unsigned long flags;
struct sk_buff *skb;
bool pfmemalloc;
void *data;
@@ -446,13 +445,17 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- local_irq_save(flags);
-
- nc = this_cpu_ptr(&netdev_alloc_cache);
- data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
-
- local_irq_restore(flags);
+ if (in_irq() || irqs_disabled()) {
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = page_frag_alloc(nc, len, gfp_mask);
+ pfmemalloc = nc->pfmemalloc;
+ } else {
+ local_bh_disable();
+ nc = this_cpu_ptr(&napi_alloc_cache.page);
+ data = page_frag_alloc(nc, len, gfp_mask);
+ pfmemalloc = nc->pfmemalloc;
+ local_bh_enable();
+ }
if (unlikely(!data))
return NULL;
@@ -706,6 +709,105 @@ void kfree_skb_list(struct sk_buff *segs)
}
EXPORT_SYMBOL(kfree_skb_list);
+/* Dump skb information and contents.
+ *
+ * Must only be called from net_ratelimit()-ed paths.
+ *
+ * Dumps up to can_dump_full whole packets if full_pkt, headers otherwise.
+ */
+void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
+{
+ static atomic_t can_dump_full = ATOMIC_INIT(5);
+ struct skb_shared_info *sh = skb_shinfo(skb);
+ struct net_device *dev = skb->dev;
+ struct sock *sk = skb->sk;
+ struct sk_buff *list_skb;
+ bool has_mac, has_trans;
+ int headroom, tailroom;
+ int i, len, seg_len;
+
+ if (full_pkt)
+ full_pkt = atomic_dec_if_positive(&can_dump_full) >= 0;
+
+ if (full_pkt)
+ len = skb->len;
+ else
+ len = min_t(int, skb->len, MAX_HEADER + 128);
+
+ headroom = skb_headroom(skb);
+ tailroom = skb_tailroom(skb);
+
+ has_mac = skb_mac_header_was_set(skb);
+ has_trans = skb_transport_header_was_set(skb);
+
+ printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
+ "mac=(%d,%d) net=(%d,%d) trans=%d\n"
+ "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
+ "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
+ "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
+ level, skb->len, headroom, skb_headlen(skb), tailroom,
+ has_mac ? skb->mac_header : -1,
+ has_mac ? skb_mac_header_len(skb) : -1,
+ skb->network_header,
+ has_trans ? skb_network_header_len(skb) : -1,
+ has_trans ? skb->transport_header : -1,
+ sh->tx_flags, sh->nr_frags,
+ sh->gso_size, sh->gso_type, sh->gso_segs,
+ skb->csum, skb->ip_summed, skb->csum_complete_sw,
+ skb->csum_valid, skb->csum_level,
+ skb->hash, skb->sw_hash, skb->l4_hash,
+ ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
+
+ if (dev)
+ printk("%sdev name=%s feat=0x%pNF\n",
+ level, dev->name, &dev->features);
+ if (sk)
+ printk("%ssk family=%hu type=%hu proto=%hu\n",
+ level, sk->sk_family, sk->sk_type, sk->sk_protocol);
+
+ if (full_pkt && headroom)
+ print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb->head, headroom, false);
+
+ seg_len = min_t(int, skb_headlen(skb), len);
+ if (seg_len)
+ print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb->data, seg_len, false);
+ len -= seg_len;
+
+ if (full_pkt && tailroom)
+ print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb_tail_pointer(skb), tailroom, false);
+
+ for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ skb_frag_foreach_page(frag, frag->page_offset,
+ skb_frag_size(frag), p, p_off, p_len,
+ copied) {
+ seg_len = min_t(int, p_len, len);
+ vaddr = kmap_atomic(p);
+ print_hex_dump(level, "skb frag: ",
+ DUMP_PREFIX_OFFSET,
+ 16, 1, vaddr + p_off, seg_len, false);
+ kunmap_atomic(vaddr);
+ len -= seg_len;
+ if (!len)
+ break;
+ }
+ }
+
+ if (full_pkt && skb_has_frag_list(skb)) {
+ printk("skb fraglist:\n");
+ skb_walk_frags(skb, list_skb)
+ skb_dump(level, list_skb, true);
+ }
+}
+EXPORT_SYMBOL(skb_dump);
+
/**
* skb_tx_error - report an sk_buff xmit error
* @skb: buffer that triggered an error
@@ -909,6 +1011,31 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
}
/**
+ * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
+ * @first: first sk_buff of the msg
+ */
+struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
+{
+ struct sk_buff *n;
+
+ n = alloc_skb(0, GFP_ATOMIC);
+ if (!n)
+ return NULL;
+
+ n->len = first->len;
+ n->data_len = first->len;
+ n->truesize = first->truesize;
+
+ skb_shinfo(n)->frag_list = first;
+
+ __copy_skb_header(n, first);
+ n->destructor = NULL;
+
+ return n;
+}
+EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
+
+/**
* skb_morph - morph one skb into another
* @dst: the skb to receive the contents
* @src: the skb to supply the contents
@@ -2508,7 +2635,8 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
if (copy > 0) {
if (copy > len)
copy = len;
- csum = ops->update(skb->data + offset, copy, csum);
+ csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
+ skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -2535,9 +2663,13 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
frag->page_offset + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
- csum2 = ops->update(vaddr + p_off, p_len, 0);
+ csum2 = INDIRECT_CALL_1(ops->update,
+ csum_partial_ext,
+ vaddr + p_off, p_len, 0);
kunmap_atomic(vaddr);
- csum = ops->combine(csum, csum2, pos, p_len);
+ csum = INDIRECT_CALL_1(ops->combine,
+ csum_block_add_ext, csum,
+ csum2, pos, p_len);
pos += p_len;
}
@@ -2560,7 +2692,8 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
copy = len;
csum2 = __skb_checksum(frag_iter, offset - start,
copy, 0, ops);
- csum = ops->combine(csum, csum2, pos, copy);
+ csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
+ csum, csum2, pos, copy);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -5294,6 +5427,173 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
}
EXPORT_SYMBOL(skb_vlan_push);
+/* Update the ethertype of hdr and the skb csum value if required. */
+static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
+ __be16 ethertype)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be16 diff[] = { ~hdr->h_proto, ethertype };
+
+ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
+ }
+
+ hdr->h_proto = ethertype;
+}
+
+/**
+ * skb_mpls_push() - push a new MPLS header after the mac header
+ *
+ * @skb: buffer
+ * @mpls_lse: MPLS label stack entry to push
+ * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto)
+{
+ struct mpls_shim_hdr *lse;
+ int err;
+
+ if (unlikely(!eth_p_mpls(mpls_proto)))
+ return -EINVAL;
+
+ /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
+ if (skb->encapsulation)
+ return -EINVAL;
+
+ err = skb_cow_head(skb, MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ if (!skb->inner_protocol) {
+ skb_set_inner_network_header(skb, skb->mac_len);
+ skb_set_inner_protocol(skb, skb->protocol);
+ }
+
+ skb_push(skb, MPLS_HLEN);
+ memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
+ skb->mac_len);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb->mac_len);
+
+ lse = mpls_hdr(skb);
+ lse->label_stack_entry = mpls_lse;
+ skb_postpush_rcsum(skb, lse, MPLS_HLEN);
+
+ if (skb->dev && skb->dev->type == ARPHRD_ETHER)
+ skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
+ skb->protocol = mpls_proto;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_push);
+
+/**
+ * skb_mpls_pop() - pop the outermost MPLS header
+ *
+ * @skb: buffer
+ * @next_proto: ethertype of header after popped MPLS header
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto)
+{
+ int err;
+
+ if (unlikely(!eth_p_mpls(skb->protocol)))
+ return -EINVAL;
+
+ err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
+ memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
+ skb->mac_len);
+
+ __skb_pull(skb, MPLS_HLEN);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb->mac_len);
+
+ if (skb->dev && skb->dev->type == ARPHRD_ETHER) {
+ struct ethhdr *hdr;
+
+ /* use mpls_hdr() to get ethertype to account for VLANs. */
+ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
+ skb_mod_eth_type(skb, hdr, next_proto);
+ }
+ skb->protocol = next_proto;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_pop);
+
+/**
+ * skb_mpls_update_lse() - modify outermost MPLS header and update csum
+ *
+ * @skb: buffer
+ * @mpls_lse: new MPLS label stack entry to update to
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
+{
+ int err;
+
+ if (unlikely(!eth_p_mpls(skb->protocol)))
+ return -EINVAL;
+
+ err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse };
+
+ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
+ }
+
+ mpls_hdr(skb)->label_stack_entry = mpls_lse;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
+
+/**
+ * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
+ *
+ * @skb: buffer
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_dec_ttl(struct sk_buff *skb)
+{
+ u32 lse;
+ u8 ttl;
+
+ if (unlikely(!eth_p_mpls(skb->protocol)))
+ return -EINVAL;
+
+ lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
+ ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
+ if (!--ttl)
+ return -EINVAL;
+
+ lse &= ~MPLS_LS_TTL_MASK;
+ lse |= ttl << MPLS_LS_TTL_SHIFT;
+
+ return skb_mpls_update_lse(skb, cpu_to_be32(lse));
+}
+EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
+
/**
* alloc_skb_with_frags - allocate skb with page frags
*
diff --git a/net/core/sock.c b/net/core/sock.c
index aa4a00d381e3..3e073ca6138f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1039,6 +1039,10 @@ set_rcvbuf:
}
break;
+ case SO_DETACH_REUSEPORT_BPF:
+ ret = reuseport_detach_prog(sk);
+ break;
+
case SO_DETACH_FILTER:
ret = sk_detach_filter(sk);
break;
@@ -2843,7 +2847,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
if (sock) {
sk->sk_type = sock->type;
- RCU_INIT_POINTER(sk->sk_wq, sock->wq);
+ RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
sock->sk = sk;
sk->sk_uid = SOCK_INODE(sock)->i_uid;
} else {
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index be6092ac69f8..52d4faeee18b 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -44,13 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
/* Make sure page count doesn't overflow. */
cost = (u64) stab->map.max_entries * sizeof(struct sock *);
- if (cost >= U32_MAX - PAGE_SIZE) {
- err = -EINVAL;
- goto free_stab;
- }
-
- stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- err = bpf_map_precharge_memlock(stab->map.pages);
+ err = bpf_map_charge_init(&stab->map.memory, cost);
if (err)
goto free_stab;
@@ -60,6 +54,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
if (stab->sks)
return &stab->map;
err = -ENOMEM;
+ bpf_map_charge_finish(&stab->map.memory);
free_stab:
kfree(stab);
return ERR_PTR(err);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index dc4aefdf2a08..9408f9264d05 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -332,3 +332,27 @@ int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);
+
+int reuseport_detach_prog(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *old_prog;
+
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return sk->sk_reuseport ? -ENOENT : -EINVAL;
+
+ old_prog = NULL;
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ rcu_swap_protected(reuse->prog, old_prog,
+ lockdep_is_held(&reuseport_lock));
+ spin_unlock_bh(&reuseport_lock);
+
+ if (!old_prog)
+ return -ENOENT;
+
+ sk_reuseport_prog_free(old_prog);
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_detach_prog);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 8aab08b131d9..d7bf62ffbb5e 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -14,6 +14,8 @@
#include <net/page_pool.h>
#include <net/xdp.h>
+#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
+#include <trace/events/xdp.h>
#define REG_STATE_NEW 0x0
#define REG_STATE_REGISTERED 0x1
@@ -29,17 +31,6 @@ static int mem_id_next = MEM_ID_MIN;
static bool mem_id_init; /* false */
static struct rhashtable *mem_id_ht;
-struct xdp_mem_allocator {
- struct xdp_mem_info mem;
- union {
- void *allocator;
- struct page_pool *page_pool;
- struct zero_copy_allocator *zc_alloc;
- };
- struct rhash_head node;
- struct rcu_head rcu;
-};
-
static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
{
const u32 *k = data;
@@ -79,13 +70,13 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
xa = container_of(rcu, struct xdp_mem_allocator, rcu);
+ /* Allocator have indicated safe to remove before this is called */
+ if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+ page_pool_free(xa->page_pool);
+
/* Allow this ID to be reused */
ida_simple_remove(&mem_id_pool, xa->mem.id);
- /* Notice, driver is expected to free the *allocator,
- * e.g. page_pool, and MUST also use RCU free.
- */
-
/* Poison memory */
xa->mem.id = 0xFFFF;
xa->mem.type = 0xF0F0;
@@ -94,6 +85,64 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
kfree(xa);
}
+static bool __mem_id_disconnect(int id, bool force)
+{
+ struct xdp_mem_allocator *xa;
+ bool safe_to_remove = true;
+
+ mutex_lock(&mem_id_lock);
+
+ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+ WARN(1, "Request remove non-existing id(%d), driver bug?", id);
+ return true;
+ }
+ xa->disconnect_cnt++;
+
+ /* Detects in-flight packet-pages for page_pool */
+ if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+ safe_to_remove = page_pool_request_shutdown(xa->page_pool);
+
+ trace_mem_disconnect(xa, safe_to_remove, force);
+
+ if ((safe_to_remove || force) &&
+ !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+ mutex_unlock(&mem_id_lock);
+ return (safe_to_remove|force);
+}
+
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (30 * HZ)
+#define DEFER_MAX_RETRIES 120
+
+static void mem_id_disconnect_defer_retry(struct work_struct *wq)
+{
+ struct delayed_work *dwq = to_delayed_work(wq);
+ struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
+ bool force = false;
+
+ if (xa->disconnect_cnt > DEFER_MAX_RETRIES)
+ force = true;
+
+ if (__mem_id_disconnect(xa->mem.id, force))
+ return;
+
+ /* Periodic warning */
+ if (time_after_eq(jiffies, xa->defer_warn)) {
+ int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ;
+
+ pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n",
+ __func__, xa->mem.id, xa->disconnect_cnt, sec);
+ xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+ }
+
+ /* Still not ready to be disconnected, retry later */
+ schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+}
+
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
@@ -112,16 +161,30 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
if (id == 0)
return;
+ if (__mem_id_disconnect(id, false))
+ return;
+
+ /* Could not disconnect, defer new disconnect attempt to later */
mutex_lock(&mem_id_lock);
xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
- call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+ return;
+ }
+ xa->defer_start = jiffies;
+ xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+ INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry);
mutex_unlock(&mem_id_lock);
+ schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
+/* This unregister operation will also cleanup and destroy the
+ * allocator. The page_pool_free() operation is first called when it's
+ * safe to remove, possibly deferred to a workqueue.
+ */
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -301,12 +364,18 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
/* Insert allocator into ID lookup table */
ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
if (IS_ERR(ptr)) {
+ ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id);
+ xdp_rxq->mem.id = 0;
errno = PTR_ERR(ptr);
goto err;
}
+ if (type == MEM_TYPE_PAGE_POOL)
+ page_pool_get(xdp_alloc->page_pool);
+
mutex_unlock(&mem_id_lock);
+ trace_mem_connect(xdp_alloc, xdp_rxq);
return 0;
err:
mutex_unlock(&mem_id_lock);
@@ -333,10 +402,13 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
- if (xa) {
+ if (likely(xa)) {
napi_direct &= !xdp_return_frame_no_direct();
page_pool_put_page(xa->page_pool, page, napi_direct);
} else {
+ /* Hopefully stack show who to blame for late return */
+ WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id);
+ trace_mem_return_failed(mem, page);
put_page(page);
}
rcu_read_unlock();
@@ -379,6 +451,21 @@ void xdp_return_buff(struct xdp_buff *xdp)
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
+/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
+void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
+{
+ struct xdp_mem_allocator *xa;
+ struct page *page;
+
+ rcu_read_lock();
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ page = virt_to_head_page(data);
+ if (xa)
+ page_pool_release_page(xa->page_pool, page);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(__xdp_release_frame);
+
int xdp_attachment_query(struct xdp_attachment_info *info,
struct netdev_bpf *bpf)
{
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 85c10c8f50bd..1b7381ff787b 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -830,7 +830,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
struct ip6_flowlabel *flowlabel;
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (flowlabel == NULL)
+ if (IS_ERR(flowlabel))
return -EINVAL;
fl6_sock_release(flowlabel);
}
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index d449f78c1bd0..6e942dda1bcd 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -106,6 +106,7 @@ config NET_DSA_TAG_LAN9303
config NET_DSA_TAG_SJA1105
tristate "Tag driver for NXP SJA1105 switches"
select NET_DSA_TAG_8021Q
+ select PACKING
help
Say Y or M if you want to enable support for tagging frames with the
NXP SJA1105 switch family. Both the native tagging protocol (which
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 820dd8da57fc..3abd173ebacb 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -257,7 +257,7 @@ static int dsa_port_setup(struct dsa_port *dp)
enum devlink_port_flavour flavour;
struct dsa_switch *ds = dp->ds;
struct dsa_switch_tree *dst = ds->dst;
- int err;
+ int err = 0;
if (dp->type == DSA_PORT_TYPE_UNUSED)
return 0;
@@ -295,19 +295,15 @@ static int dsa_port_setup(struct dsa_port *dp)
break;
case DSA_PORT_TYPE_CPU:
err = dsa_port_link_register_of(dp);
- if (err) {
+ if (err)
dev_err(ds->dev, "failed to setup link for port %d.%d\n",
ds->index, dp->index);
- return err;
- }
break;
case DSA_PORT_TYPE_DSA:
err = dsa_port_link_register_of(dp);
- if (err) {
+ if (err)
dev_err(ds->dev, "failed to setup link for port %d.%d\n",
ds->index, dp->index);
- return err;
- }
break;
case DSA_PORT_TYPE_USER:
err = dsa_slave_create(dp);
@@ -319,7 +315,10 @@ static int dsa_port_setup(struct dsa_port *dp)
break;
}
- return 0;
+ if (err)
+ devlink_port_unregister(&dp->devlink_port);
+
+ return err;
}
static void dsa_port_teardown(struct dsa_port *dp)
@@ -347,7 +346,7 @@ static void dsa_port_teardown(struct dsa_port *dp)
static int dsa_switch_setup(struct dsa_switch *ds)
{
- int err;
+ int err = 0;
/* Initialize ds->phys_mii_mask before registering the slave MDIO bus
* driver and before ops->setup() has run, since the switch drivers and
@@ -365,29 +364,41 @@ static int dsa_switch_setup(struct dsa_switch *ds)
err = devlink_register(ds->devlink, ds->dev);
if (err)
- return err;
+ goto free_devlink;
err = dsa_switch_register_notifier(ds);
if (err)
- return err;
+ goto unregister_devlink;
err = ds->ops->setup(ds);
if (err < 0)
- return err;
+ goto unregister_notifier;
if (!ds->slave_mii_bus && ds->ops->phy_read) {
ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
- if (!ds->slave_mii_bus)
- return -ENOMEM;
+ if (!ds->slave_mii_bus) {
+ err = -ENOMEM;
+ goto unregister_notifier;
+ }
dsa_slave_mii_bus_init(ds);
err = mdiobus_register(ds->slave_mii_bus);
if (err < 0)
- return err;
+ goto unregister_notifier;
}
return 0;
+
+unregister_notifier:
+ dsa_switch_unregister_notifier(ds);
+unregister_devlink:
+ devlink_unregister(ds->devlink);
+free_devlink:
+ devlink_free(ds->devlink);
+ ds->devlink = NULL;
+
+ return err;
}
static void dsa_switch_teardown(struct dsa_switch *ds)
@@ -397,6 +408,9 @@ static void dsa_switch_teardown(struct dsa_switch *ds)
dsa_switch_unregister_notifier(ds);
+ if (ds->ops->teardown)
+ ds->ops->teardown(ds);
+
if (ds->devlink) {
devlink_unregister(ds->devlink);
devlink_free(ds->devlink);
@@ -409,8 +423,8 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
{
struct dsa_switch *ds;
struct dsa_port *dp;
- int device, port;
- int err;
+ int device, port, i;
+ int err = 0;
for (device = 0; device < DSA_MAX_SWITCHES; device++) {
ds = dst->ds[device];
@@ -419,18 +433,41 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
err = dsa_switch_setup(ds);
if (err)
- return err;
+ goto switch_teardown;
for (port = 0; port < ds->num_ports; port++) {
dp = &ds->ports[port];
err = dsa_port_setup(dp);
if (err)
- return err;
+ goto ports_teardown;
}
}
return 0;
+
+ports_teardown:
+ for (i = 0; i < port; i++)
+ dsa_port_teardown(&ds->ports[i]);
+
+ dsa_switch_teardown(ds);
+
+switch_teardown:
+ for (i = 0; i < device; i++) {
+ ds = dst->ds[i];
+ if (!ds)
+ continue;
+
+ for (port = 0; port < ds->num_ports; port++) {
+ dp = &ds->ports[port];
+
+ dsa_port_teardown(dp);
+ }
+
+ dsa_switch_teardown(ds);
+ }
+
+ return err;
}
static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst)
@@ -492,17 +529,24 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst)
err = dsa_tree_setup_switches(dst);
if (err)
- return err;
+ goto teardown_default_cpu;
err = dsa_tree_setup_master(dst);
if (err)
- return err;
+ goto teardown_switches;
dst->setup = true;
pr_info("DSA: tree %d setup\n", dst->index);
return 0;
+
+teardown_switches:
+ dsa_tree_teardown_switches(dst);
+teardown_default_cpu:
+ dsa_tree_teardown_default_cpu(dst);
+
+ return err;
}
static void dsa_tree_teardown(struct dsa_switch_tree *dst)
@@ -543,8 +587,10 @@ static int dsa_tree_add_switch(struct dsa_switch_tree *dst,
dst->ds[index] = ds;
err = dsa_tree_setup(dst);
- if (err)
- dsa_tree_remove_switch(dst, index);
+ if (err) {
+ dst->ds[index] = NULL;
+ dsa_tree_put(dst);
+ }
return err;
}
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 3986cedfafc0..12f8c7ee4dd8 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -150,6 +150,8 @@ int dsa_port_pre_bridge_flags(const struct dsa_port *dp, unsigned long flags,
struct switchdev_trans *trans);
int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags,
struct switchdev_trans *trans);
+int dsa_port_mrouter(struct dsa_port *dp, bool mrouter,
+ struct switchdev_trans *trans);
int dsa_port_vlan_add(struct dsa_port *dp,
const struct switchdev_obj_port_vlan *vlan,
struct switchdev_trans *trans);
@@ -159,6 +161,23 @@ int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags);
int dsa_port_vid_del(struct dsa_port *dp, u16 vid);
int dsa_port_link_register_of(struct dsa_port *dp);
void dsa_port_link_unregister_of(struct dsa_port *dp);
+void dsa_port_phylink_validate(struct phylink_config *config,
+ unsigned long *supported,
+ struct phylink_link_state *state);
+int dsa_port_phylink_mac_link_state(struct phylink_config *config,
+ struct phylink_link_state *state);
+void dsa_port_phylink_mac_config(struct phylink_config *config,
+ unsigned int mode,
+ const struct phylink_link_state *state);
+void dsa_port_phylink_mac_an_restart(struct phylink_config *config);
+void dsa_port_phylink_mac_link_down(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface);
+void dsa_port_phylink_mac_link_up(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface,
+ struct phy_device *phydev);
+extern const struct phylink_mac_ops dsa_port_phylink_mac_ops;
/* slave.c */
extern const struct dsa_device_ops notag_netdev_ops;
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 363eab6df51b..f071acf2842b 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -261,6 +261,18 @@ int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags,
return err;
}
+int dsa_port_mrouter(struct dsa_port *dp, bool mrouter,
+ struct switchdev_trans *trans)
+{
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+
+ if (switchdev_trans_ph_prepare(trans))
+ return ds->ops->port_egress_floods ? 0 : -EOPNOTSUPP;
+
+ return ds->ops->port_egress_floods(ds, port, true, mrouter);
+}
+
int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
u16 vid)
{
@@ -336,9 +348,6 @@ int dsa_port_vlan_add(struct dsa_port *dp,
.vlan = vlan,
};
- /* Can be called from dsa_slave_port_obj_add() or
- * dsa_slave_vlan_rx_add_vid()
- */
if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev))
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
@@ -354,12 +363,6 @@ int dsa_port_vlan_del(struct dsa_port *dp,
.vlan = vlan,
};
- if (vlan->obj.orig_dev && netif_is_bridge_master(vlan->obj.orig_dev))
- return -EOPNOTSUPP;
-
- /* Can be called from dsa_slave_port_obj_del() or
- * dsa_slave_vlan_rx_kill_vid()
- */
if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev))
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
@@ -418,6 +421,108 @@ static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
return phydev;
}
+void dsa_port_phylink_validate(struct phylink_config *config,
+ unsigned long *supported,
+ struct phylink_link_state *state)
+{
+ struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_validate)
+ return;
+
+ ds->ops->phylink_validate(ds, dp->index, supported, state);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_validate);
+
+int dsa_port_phylink_mac_link_state(struct phylink_config *config,
+ struct phylink_link_state *state)
+{
+ struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+ struct dsa_switch *ds = dp->ds;
+
+ /* Only called for SGMII and 802.3z */
+ if (!ds->ops->phylink_mac_link_state)
+ return -EOPNOTSUPP;
+
+ return ds->ops->phylink_mac_link_state(ds, dp->index, state);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_state);
+
+void dsa_port_phylink_mac_config(struct phylink_config *config,
+ unsigned int mode,
+ const struct phylink_link_state *state)
+{
+ struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_mac_config)
+ return;
+
+ ds->ops->phylink_mac_config(ds, dp->index, mode, state);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_config);
+
+void dsa_port_phylink_mac_an_restart(struct phylink_config *config)
+{
+ struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_mac_an_restart)
+ return;
+
+ ds->ops->phylink_mac_an_restart(ds, dp->index);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_an_restart);
+
+void dsa_port_phylink_mac_link_down(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface)
+{
+ struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+ struct phy_device *phydev = NULL;
+ struct dsa_switch *ds = dp->ds;
+
+ if (dsa_is_user_port(ds, dp->index))
+ phydev = dp->slave->phydev;
+
+ if (!ds->ops->phylink_mac_link_down) {
+ if (ds->ops->adjust_link && phydev)
+ ds->ops->adjust_link(ds, dp->index, phydev);
+ return;
+ }
+
+ ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_down);
+
+void dsa_port_phylink_mac_link_up(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface,
+ struct phy_device *phydev)
+{
+ struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_mac_link_up) {
+ if (ds->ops->adjust_link && phydev)
+ ds->ops->adjust_link(ds, dp->index, phydev);
+ return;
+ }
+
+ ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up);
+
+const struct phylink_mac_ops dsa_port_phylink_mac_ops = {
+ .validate = dsa_port_phylink_validate,
+ .mac_link_state = dsa_port_phylink_mac_link_state,
+ .mac_config = dsa_port_phylink_mac_config,
+ .mac_an_restart = dsa_port_phylink_mac_an_restart,
+ .mac_link_down = dsa_port_phylink_mac_link_down,
+ .mac_link_up = dsa_port_phylink_mac_link_up,
+};
+
static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
{
struct dsa_switch *ds = dp->ds;
@@ -495,8 +600,53 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp)
return 0;
}
+static int dsa_port_phylink_register(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct device_node *port_dn = dp->dn;
+ int mode, err;
+
+ mode = of_get_phy_mode(port_dn);
+ if (mode < 0)
+ mode = PHY_INTERFACE_MODE_NA;
+
+ dp->pl_config.dev = ds->dev;
+ dp->pl_config.type = PHYLINK_DEV;
+
+ dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn),
+ mode, &dsa_port_phylink_mac_ops);
+ if (IS_ERR(dp->pl)) {
+ pr_err("error creating PHYLINK: %ld\n", PTR_ERR(dp->pl));
+ return PTR_ERR(dp->pl);
+ }
+
+ err = phylink_of_phy_connect(dp->pl, port_dn, 0);
+ if (err && err != -ENODEV) {
+ pr_err("could not attach to PHY: %d\n", err);
+ goto err_phy_connect;
+ }
+
+ rtnl_lock();
+ phylink_start(dp->pl);
+ rtnl_unlock();
+
+ return 0;
+
+err_phy_connect:
+ phylink_destroy(dp->pl);
+ return err;
+}
+
int dsa_port_link_register_of(struct dsa_port *dp)
{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->adjust_link)
+ return dsa_port_phylink_register(dp);
+
+ dev_warn(ds->dev,
+ "Using legacy PHYLIB callbacks. Please migrate to PHYLINK!\n");
+
if (of_phy_is_fixed_link(dp->dn))
return dsa_port_fixed_link_register_of(dp);
else
@@ -505,6 +655,16 @@ int dsa_port_link_register_of(struct dsa_port *dp)
void dsa_port_link_unregister_of(struct dsa_port *dp)
{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->adjust_link) {
+ rtnl_lock();
+ phylink_disconnect_phy(dp->pl);
+ rtnl_unlock();
+ phylink_destroy(dp->pl);
+ return;
+ }
+
if (of_phy_is_fixed_link(dp->dn))
of_phy_deregister_fixed_link(dp->dn);
else
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8157be7e162d..614c38ece104 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -22,7 +22,7 @@
#include "dsa_priv.h"
-static bool dsa_slave_dev_check(struct net_device *dev);
+static bool dsa_slave_dev_check(const struct net_device *dev);
/* slave mii_bus handling ***************************************************/
static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
@@ -301,6 +301,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, trans);
break;
+ case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER:
+ ret = dsa_port_mrouter(dp->cpu_dp, attr->u.mrouter, trans);
+ break;
default:
ret = -EOPNOTSUPP;
break;
@@ -311,7 +314,8 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
static int dsa_slave_port_obj_add(struct net_device *dev,
const struct switchdev_obj *obj,
- struct switchdev_trans *trans)
+ struct switchdev_trans *trans,
+ struct netlink_ext_ack *extack)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
int err;
@@ -323,6 +327,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
switch (obj->id) {
case SWITCHDEV_OBJ_ID_PORT_MDB:
+ if (obj->orig_dev != dev)
+ return -EOPNOTSUPP;
err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans);
break;
case SWITCHDEV_OBJ_ID_HOST_MDB:
@@ -333,6 +339,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
trans);
break;
case SWITCHDEV_OBJ_ID_PORT_VLAN:
+ if (obj->orig_dev != dev)
+ return -EOPNOTSUPP;
err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj),
trans);
break;
@@ -352,6 +360,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
switch (obj->id) {
case SWITCHDEV_OBJ_ID_PORT_MDB:
+ if (obj->orig_dev != dev)
+ return -EOPNOTSUPP;
err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
break;
case SWITCHDEV_OBJ_ID_HOST_MDB:
@@ -361,6 +371,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj));
break;
case SWITCHDEV_OBJ_ID_PORT_VLAN:
+ if (obj->orig_dev != dev)
+ return -EOPNOTSUPP;
err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
break;
default:
@@ -423,6 +435,8 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p,
if (!clone)
return;
+ DSA_SKB_CB(skb)->clone = clone;
+
if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type))
return;
@@ -460,6 +474,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
u64_stats_update_end(&s->syncp);
DSA_SKB_CB(skb)->deferred_xmit = false;
+ DSA_SKB_CB(skb)->clone = NULL;
/* Identify PTP protocol packets, clone them, and pass them to the
* switch driver
@@ -930,23 +945,42 @@ static int dsa_slave_setup_tc_block_cb_eg(enum tc_setup_type type,
return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, false);
}
+static LIST_HEAD(dsa_slave_block_cb_list);
+
static int dsa_slave_setup_tc_block(struct net_device *dev,
- struct tc_block_offload *f)
+ struct flow_block_offload *f)
{
+ struct flow_block_cb *block_cb;
tc_setup_cb_t *cb;
- if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+ if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
cb = dsa_slave_setup_tc_block_cb_ig;
- else if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+ else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
cb = dsa_slave_setup_tc_block_cb_eg;
else
return -EOPNOTSUPP;
+ f->driver_block_list = &dsa_slave_block_cb_list;
+
switch (f->command) {
- case TC_BLOCK_BIND:
- return tcf_block_cb_register(f->block, cb, dev, dev, f->extack);
- case TC_BLOCK_UNBIND:
- tcf_block_cb_unregister(f->block, cb, dev);
+ case FLOW_BLOCK_BIND:
+ if (flow_block_cb_is_busy(cb, dev, &dsa_slave_block_cb_list))
+ return -EBUSY;
+
+ block_cb = flow_block_cb_alloc(f->net, cb, dev, dev, NULL);
+ if (IS_ERR(block_cb))
+ return PTR_ERR(block_cb);
+
+ flow_block_cb_add(block_cb, f);
+ list_add_tail(&block_cb->driver_list, &dsa_slave_block_cb_list);
+ return 0;
+ case FLOW_BLOCK_UNBIND:
+ block_cb = flow_block_cb_lookup(f, cb, dev);
+ if (!block_cb)
+ return -ENOENT;
+
+ flow_block_cb_remove(block_cb, f);
+ list_del(&block_cb->driver_list);
return 0;
default:
return -EOPNOTSUPP;
@@ -1160,98 +1194,6 @@ static struct device_type dsa_type = {
.name = "dsa",
};
-static void dsa_slave_phylink_validate(struct net_device *dev,
- unsigned long *supported,
- struct phylink_link_state *state)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_validate)
- return;
-
- ds->ops->phylink_validate(ds, dp->index, supported, state);
-}
-
-static int dsa_slave_phylink_mac_link_state(struct net_device *dev,
- struct phylink_link_state *state)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- /* Only called for SGMII and 802.3z */
- if (!ds->ops->phylink_mac_link_state)
- return -EOPNOTSUPP;
-
- return ds->ops->phylink_mac_link_state(ds, dp->index, state);
-}
-
-static void dsa_slave_phylink_mac_config(struct net_device *dev,
- unsigned int mode,
- const struct phylink_link_state *state)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_mac_config)
- return;
-
- ds->ops->phylink_mac_config(ds, dp->index, mode, state);
-}
-
-static void dsa_slave_phylink_mac_an_restart(struct net_device *dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_mac_an_restart)
- return;
-
- ds->ops->phylink_mac_an_restart(ds, dp->index);
-}
-
-static void dsa_slave_phylink_mac_link_down(struct net_device *dev,
- unsigned int mode,
- phy_interface_t interface)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_mac_link_down) {
- if (ds->ops->adjust_link && dev->phydev)
- ds->ops->adjust_link(ds, dp->index, dev->phydev);
- return;
- }
-
- ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
-}
-
-static void dsa_slave_phylink_mac_link_up(struct net_device *dev,
- unsigned int mode,
- phy_interface_t interface,
- struct phy_device *phydev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_mac_link_up) {
- if (ds->ops->adjust_link && dev->phydev)
- ds->ops->adjust_link(ds, dp->index, dev->phydev);
- return;
- }
-
- ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev);
-}
-
-static const struct phylink_mac_ops dsa_slave_phylink_mac_ops = {
- .validate = dsa_slave_phylink_validate,
- .mac_link_state = dsa_slave_phylink_mac_link_state,
- .mac_config = dsa_slave_phylink_mac_config,
- .mac_an_restart = dsa_slave_phylink_mac_an_restart,
- .mac_link_down = dsa_slave_phylink_mac_link_down,
- .mac_link_up = dsa_slave_phylink_mac_link_up,
-};
-
void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
{
const struct dsa_port *dp = dsa_to_port(ds, port);
@@ -1299,8 +1241,11 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev)
if (mode < 0)
mode = PHY_INTERFACE_MODE_NA;
- dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode,
- &dsa_slave_phylink_mac_ops);
+ dp->pl_config.dev = &slave_dev->dev;
+ dp->pl_config.type = PHYLINK_NETDEV;
+
+ dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode,
+ &dsa_port_phylink_mac_ops);
if (IS_ERR(dp->pl)) {
netdev_err(slave_dev,
"error creating PHYLINK: %ld\n", PTR_ERR(dp->pl));
@@ -1494,7 +1439,7 @@ void dsa_slave_destroy(struct net_device *slave_dev)
free_netdev(slave_dev);
}
-static bool dsa_slave_dev_check(struct net_device *dev)
+static bool dsa_slave_dev_check(const struct net_device *dev)
{
return dev->netdev_ops == &dsa_slave_netdev_ops;
}
@@ -1565,19 +1510,6 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
return NOTIFY_DONE;
}
-static int
-dsa_slave_switchdev_port_attr_set_event(struct net_device *netdev,
- struct switchdev_notifier_port_attr_info *port_attr_info)
-{
- int err;
-
- err = dsa_slave_port_attr_set(netdev, port_attr_info->attr,
- port_attr_info->trans);
-
- port_attr_info->handled = true;
- return notifier_from_errno(err);
-}
-
struct dsa_switchdev_event_work {
struct work_struct work;
struct switchdev_notifier_fdb_info fdb_info;
@@ -1652,13 +1584,18 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
{
struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
struct dsa_switchdev_event_work *switchdev_work;
+ int err;
+
+ if (event == SWITCHDEV_PORT_ATTR_SET) {
+ err = switchdev_handle_port_attr_set(dev, ptr,
+ dsa_slave_dev_check,
+ dsa_slave_port_attr_set);
+ return notifier_from_errno(err);
+ }
if (!dsa_slave_dev_check(dev))
return NOTIFY_DONE;
- if (event == SWITCHDEV_PORT_ATTR_SET)
- return dsa_slave_switchdev_port_attr_set_event(dev, ptr);
-
switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
if (!switchdev_work)
return NOTIFY_BAD;
@@ -1688,41 +1625,28 @@ err_fdb_work_init:
return NOTIFY_BAD;
}
-static int
-dsa_slave_switchdev_port_obj_event(unsigned long event,
- struct net_device *netdev,
- struct switchdev_notifier_port_obj_info *port_obj_info)
-{
- int err = -EOPNOTSUPP;
-
- switch (event) {
- case SWITCHDEV_PORT_OBJ_ADD:
- err = dsa_slave_port_obj_add(netdev, port_obj_info->obj,
- port_obj_info->trans);
- break;
- case SWITCHDEV_PORT_OBJ_DEL:
- err = dsa_slave_port_obj_del(netdev, port_obj_info->obj);
- break;
- }
-
- port_obj_info->handled = true;
- return notifier_from_errno(err);
-}
-
static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
-
- if (!dsa_slave_dev_check(dev))
- return NOTIFY_DONE;
+ int err;
switch (event) {
- case SWITCHDEV_PORT_OBJ_ADD: /* fall through */
+ case SWITCHDEV_PORT_OBJ_ADD:
+ err = switchdev_handle_port_obj_add(dev, ptr,
+ dsa_slave_dev_check,
+ dsa_slave_port_obj_add);
+ return notifier_from_errno(err);
case SWITCHDEV_PORT_OBJ_DEL:
- return dsa_slave_switchdev_port_obj_event(event, dev, ptr);
+ err = switchdev_handle_port_obj_del(dev, ptr,
+ dsa_slave_dev_check,
+ dsa_slave_port_obj_del);
+ return notifier_from_errno(err);
case SWITCHDEV_PORT_ATTR_SET:
- return dsa_slave_switchdev_port_attr_set_event(dev, ptr);
+ err = switchdev_handle_port_attr_set(dev, ptr,
+ dsa_slave_dev_check,
+ dsa_slave_port_attr_set);
+ return notifier_from_errno(err);
}
return NOTIFY_DONE;
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 65a35e976d7b..6ebbd799c4eb 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -235,31 +235,48 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
}
EXPORT_SYMBOL_GPL(dsa_8021q_xmit);
-struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev,
- struct packet_type *pt, u16 *tpid, u16 *tci)
+/* In the DSA packet_type handler, skb->data points in the middle of the VLAN
+ * tag, after tpid and before tci. This is because so far, ETH_HLEN
+ * (DMAC, SMAC, EtherType) bytes were pulled.
+ * There are 2 bytes of VLAN tag left in skb->data, and upper
+ * layers expect the 'real' EtherType to be consumed as well.
+ * Coincidentally, a VLAN header is also of the same size as
+ * the number of bytes that need to be pulled.
+ *
+ * skb_mac_header skb->data
+ * | |
+ * v v
+ * | | | | | | | | | | | | | | | | | | |
+ * +-----------------------+-----------------------+-------+-------+-------+
+ * | Destination MAC | Source MAC | TPID | TCI | EType |
+ * +-----------------------+-----------------------+-------+-------+-------+
+ * ^ | |
+ * |<--VLAN_HLEN-->to <---VLAN_HLEN--->
+ * from |
+ * >>>>>>> v
+ * >>>>>>> | | | | | | | | | | | | | | |
+ * >>>>>>> +-----------------------+-----------------------+-------+
+ * >>>>>>> | Destination MAC | Source MAC | EType |
+ * +-----------------------+-----------------------+-------+
+ * ^ ^
+ * (now part of | |
+ * skb->head) skb_mac_header skb->data
+ */
+struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb)
{
- struct vlan_ethhdr *tag;
-
- if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
- return NULL;
+ u8 *from = skb_mac_header(skb);
+ u8 *dest = from + VLAN_HLEN;
- tag = vlan_eth_hdr(skb);
- *tpid = ntohs(tag->h_vlan_proto);
- *tci = ntohs(tag->h_vlan_TCI);
-
- /* skb->data points in the middle of the VLAN tag,
- * after tpid and before tci. This is because so far,
- * ETH_HLEN (DMAC, SMAC, EtherType) bytes were pulled.
- * There are 2 bytes of VLAN tag left in skb->data, and upper
- * layers expect the 'real' EtherType to be consumed as well.
- * Coincidentally, a VLAN header is also of the same size as
- * the number of bytes that need to be pulled.
- */
- skb_pull_rcsum(skb, VLAN_HLEN);
+ memmove(dest, from, ETH_HLEN - VLAN_HLEN);
+ skb_pull(skb, VLAN_HLEN);
+ skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+ skb_pull_rcsum(skb, ETH_HLEN);
return skb;
}
-EXPORT_SYMBOL_GPL(dsa_8021q_rcv);
+EXPORT_SYMBOL_GPL(dsa_8021q_remove_header);
static const struct dsa_device_ops dsa_8021q_netdev_ops = {
.name = "8021q",
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index d43737e6c3fb..1d96c9d4a8e9 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -13,6 +13,8 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb)
const struct ethhdr *hdr = eth_hdr(skb);
u64 dmac = ether_addr_to_u64(hdr->h_dest);
+ if (ntohs(hdr->h_proto) == ETH_P_SJA1105_META)
+ return false;
if ((dmac & SJA1105_LINKLOCAL_FILTER_A_MASK) ==
SJA1105_LINKLOCAL_FILTER_A)
return true;
@@ -22,15 +24,61 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb)
return false;
}
+struct sja1105_meta {
+ u64 tstamp;
+ u64 dmac_byte_4;
+ u64 dmac_byte_3;
+ u64 source_port;
+ u64 switch_id;
+};
+
+static void sja1105_meta_unpack(const struct sk_buff *skb,
+ struct sja1105_meta *meta)
+{
+ u8 *buf = skb_mac_header(skb) + ETH_HLEN;
+
+ /* UM10944.pdf section 4.2.17 AVB Parameters:
+ * Structure of the meta-data follow-up frame.
+ * It is in network byte order, so there are no quirks
+ * while unpacking the meta frame.
+ *
+ * Also SJA1105 E/T only populates bits 23:0 of the timestamp
+ * whereas P/Q/R/S does 32 bits. Since the structure is the
+ * same and the E/T puts zeroes in the high-order byte, use
+ * a unified unpacking command for both device series.
+ */
+ packing(buf, &meta->tstamp, 31, 0, 4, UNPACK, 0);
+ packing(buf + 4, &meta->dmac_byte_4, 7, 0, 1, UNPACK, 0);
+ packing(buf + 5, &meta->dmac_byte_3, 7, 0, 1, UNPACK, 0);
+ packing(buf + 6, &meta->source_port, 7, 0, 1, UNPACK, 0);
+ packing(buf + 7, &meta->switch_id, 7, 0, 1, UNPACK, 0);
+}
+
+static inline bool sja1105_is_meta_frame(const struct sk_buff *skb)
+{
+ const struct ethhdr *hdr = eth_hdr(skb);
+ u64 smac = ether_addr_to_u64(hdr->h_source);
+ u64 dmac = ether_addr_to_u64(hdr->h_dest);
+
+ if (smac != SJA1105_META_SMAC)
+ return false;
+ if (dmac != SJA1105_META_DMAC)
+ return false;
+ if (ntohs(hdr->h_proto) != ETH_P_SJA1105_META)
+ return false;
+ return true;
+}
+
/* This is the first time the tagger sees the frame on RX.
- * Figure out if we can decode it, and if we can, annotate skb->cb with how we
- * plan to do that, so we don't need to check again in the rcv function.
+ * Figure out if we can decode it.
*/
static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev)
{
+ if (!dsa_port_is_vlan_filtering(dev->dsa_ptr))
+ return true;
if (sja1105_is_link_local(skb))
return true;
- if (!dsa_port_is_vlan_filtering(dev->dsa_ptr))
+ if (sja1105_is_meta_frame(skb))
return true;
return false;
}
@@ -62,25 +110,152 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
((pcp << VLAN_PRIO_SHIFT) | tx_vid));
}
+static void sja1105_transfer_meta(struct sk_buff *skb,
+ const struct sja1105_meta *meta)
+{
+ struct ethhdr *hdr = eth_hdr(skb);
+
+ hdr->h_dest[3] = meta->dmac_byte_3;
+ hdr->h_dest[4] = meta->dmac_byte_4;
+ SJA1105_SKB_CB(skb)->meta_tstamp = meta->tstamp;
+}
+
+/* This is a simple state machine which follows the hardware mechanism of
+ * generating RX timestamps:
+ *
+ * After each timestampable skb (all traffic for which send_meta1 and
+ * send_meta0 is true, aka all MAC-filtered link-local traffic) a meta frame
+ * containing a partial timestamp is immediately generated by the switch and
+ * sent as a follow-up to the link-local frame on the CPU port.
+ *
+ * The meta frames have no unique identifier (such as sequence number) by which
+ * one may pair them to the correct timestampable frame.
+ * Instead, the switch has internal logic that ensures no frames are sent on
+ * the CPU port between a link-local timestampable frame and its corresponding
+ * meta follow-up. It also ensures strict ordering between ports (lower ports
+ * have higher priority towards the CPU port). For this reason, a per-port
+ * data structure is not needed/desirable.
+ *
+ * This function pairs the link-local frame with its partial timestamp from the
+ * meta follow-up frame. The full timestamp will be reconstructed later in a
+ * work queue.
+ */
+static struct sk_buff
+*sja1105_rcv_meta_state_machine(struct sk_buff *skb,
+ struct sja1105_meta *meta,
+ bool is_link_local,
+ bool is_meta)
+{
+ struct sja1105_port *sp;
+ struct dsa_port *dp;
+
+ dp = dsa_slave_to_port(skb->dev);
+ sp = dp->priv;
+
+ /* Step 1: A timestampable frame was received.
+ * Buffer it until we get its meta frame.
+ */
+ if (is_link_local && sp->data->hwts_rx_en) {
+ spin_lock(&sp->data->meta_lock);
+ /* Was this a link-local frame instead of the meta
+ * that we were expecting?
+ */
+ if (sp->data->stampable_skb) {
+ dev_err_ratelimited(dp->ds->dev,
+ "Expected meta frame, is %12llx "
+ "in the DSA master multicast filter?\n",
+ SJA1105_META_DMAC);
+ }
+
+ /* Hold a reference to avoid dsa_switch_rcv
+ * from freeing the skb.
+ */
+ sp->data->stampable_skb = skb_get(skb);
+ spin_unlock(&sp->data->meta_lock);
+
+ /* Tell DSA we got nothing */
+ return NULL;
+
+ /* Step 2: The meta frame arrived.
+ * Time to take the stampable skb out of the closet, annotate it
+ * with the partial timestamp, and pretend that we received it
+ * just now (basically masquerade the buffered frame as the meta
+ * frame, which serves no further purpose).
+ */
+ } else if (is_meta) {
+ struct sk_buff *stampable_skb;
+
+ spin_lock(&sp->data->meta_lock);
+
+ stampable_skb = sp->data->stampable_skb;
+ sp->data->stampable_skb = NULL;
+
+ /* Was this a meta frame instead of the link-local
+ * that we were expecting?
+ */
+ if (!stampable_skb) {
+ dev_err_ratelimited(dp->ds->dev,
+ "Unexpected meta frame\n");
+ spin_unlock(&sp->data->meta_lock);
+ return NULL;
+ }
+
+ if (stampable_skb->dev != skb->dev) {
+ dev_err_ratelimited(dp->ds->dev,
+ "Meta frame on wrong port\n");
+ spin_unlock(&sp->data->meta_lock);
+ return NULL;
+ }
+
+ /* Free the meta frame and give DSA the buffered stampable_skb
+ * for further processing up the network stack.
+ */
+ kfree_skb(skb);
+
+ skb = skb_copy(stampable_skb, GFP_ATOMIC);
+ if (!skb) {
+ dev_err_ratelimited(dp->ds->dev,
+ "Failed to copy stampable skb\n");
+ return NULL;
+ }
+ sja1105_transfer_meta(skb, meta);
+ /* The cached copy will be freed now */
+ skb_unref(stampable_skb);
+
+ spin_unlock(&sp->data->meta_lock);
+ }
+
+ return skb;
+}
+
static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
struct net_device *netdev,
struct packet_type *pt)
{
- struct ethhdr *hdr = eth_hdr(skb);
- u64 source_port, switch_id;
- struct sk_buff *nskb;
+ struct sja1105_meta meta = {0};
+ int source_port, switch_id;
+ struct vlan_ethhdr *hdr;
u16 tpid, vid, tci;
+ bool is_link_local;
bool is_tagged;
+ bool is_meta;
- nskb = dsa_8021q_rcv(skb, netdev, pt, &tpid, &tci);
- is_tagged = (nskb && tpid == ETH_P_SJA1105);
-
- skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
- vid = tci & VLAN_VID_MASK;
+ hdr = vlan_eth_hdr(skb);
+ tpid = ntohs(hdr->h_vlan_proto);
+ is_tagged = (tpid == ETH_P_SJA1105);
+ is_link_local = sja1105_is_link_local(skb);
+ is_meta = sja1105_is_meta_frame(skb);
skb->offload_fwd_mark = 1;
- if (sja1105_is_link_local(skb)) {
+ if (is_tagged) {
+ /* Normal traffic path. */
+ tci = ntohs(hdr->h_vlan_TCI);
+ vid = tci & VLAN_VID_MASK;
+ source_port = dsa_8021q_rx_source_port(vid);
+ switch_id = dsa_8021q_rx_switch_id(vid);
+ skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+ } else if (is_link_local) {
/* Management traffic path. Switch embeds the switch ID and
* port ID into bytes of the destination MAC, courtesy of
* the incl_srcpt options.
@@ -90,10 +265,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
/* Clear the DMAC bytes that were mangled by the switch */
hdr->h_dest[3] = 0;
hdr->h_dest[4] = 0;
+ } else if (is_meta) {
+ sja1105_meta_unpack(skb, &meta);
+ source_port = meta.source_port;
+ switch_id = meta.switch_id;
} else {
- /* Normal traffic path. */
- source_port = dsa_8021q_rx_source_port(vid);
- switch_id = dsa_8021q_rx_switch_id(vid);
+ return NULL;
}
skb->dev = dsa_master_find_slave(netdev, switch_id, source_port);
@@ -106,10 +283,10 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
* it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN).
*/
if (is_tagged)
- memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - VLAN_HLEN,
- ETH_HLEN - VLAN_HLEN);
+ skb = dsa_8021q_remove_header(skb);
- return skb;
+ return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local,
+ is_meta);
}
static struct dsa_device_ops sja1105_netdev_ops = {
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 20b0bcb7e9e3..17374afee28f 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -545,17 +545,10 @@ unsigned char * __weak arch_get_platform_mac_address(void)
int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr)
{
- const unsigned char *addr;
- struct device_node *dp;
+ const unsigned char *addr = NULL;
- if (dev_is_pci(dev))
- dp = pci_device_to_OF_node(to_pci_dev(dev));
- else
- dp = dev->of_node;
-
- addr = NULL;
- if (dp)
- addr = of_get_mac_address(dp);
+ if (dev->of_node)
+ addr = of_get_mac_address(dev->of_node);
if (IS_ERR_OR_NULL(addr))
addr = arch_get_platform_mac_address();
@@ -563,6 +556,7 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr)
return -ENODEV;
ether_addr_copy(mac_addr, addr);
+
return 0;
}
EXPORT_SYMBOL(eth_platform_get_mac_address);
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 15c72065df79..f0f9b493c47b 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -227,9 +227,13 @@ static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
struct hsr_port *master;
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
- skb->dev = master->dev;
- hsr_forward_skb(skb, master);
-
+ if (master) {
+ skb->dev = master->dev;
+ hsr_forward_skb(skb, master);
+ } else {
+ atomic_long_inc(&dev->tx_dropped);
+ dev_kfree_skb_any(skb);
+ }
return NETDEV_TX_OK;
}
@@ -344,27 +348,26 @@ static void hsr_announce(struct timer_list *t)
rcu_read_unlock();
}
-/* According to comments in the declaration of struct net_device, this function
- * is "Called from unregister, can be used to call free_netdev". Ok then...
- */
-static void hsr_dev_destroy(struct net_device *hsr_dev)
+void hsr_dev_destroy(struct net_device *hsr_dev)
{
struct hsr_priv *hsr;
struct hsr_port *port;
+ struct hsr_port *tmp;
hsr = netdev_priv(hsr_dev);
hsr_debugfs_term(hsr);
- rtnl_lock();
- hsr_for_each_port(hsr, port)
+ list_for_each_entry_safe(port, tmp, &hsr->ports, port_list)
hsr_del_port(port);
- rtnl_unlock();
del_timer_sync(&hsr->prune_timer);
del_timer_sync(&hsr->announce_timer);
synchronize_rcu();
+
+ hsr_del_self_node(&hsr->self_node_db);
+ hsr_del_nodes(&hsr->node_db);
}
static const struct net_device_ops hsr_device_ops = {
@@ -391,7 +394,6 @@ void hsr_dev_setup(struct net_device *dev)
dev->priv_flags |= IFF_NO_QUEUE;
dev->needs_free_netdev = true;
- dev->priv_destructor = hsr_dev_destroy;
dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
NETIF_F_GSO_MASK | NETIF_F_HW_CSUM |
@@ -428,6 +430,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
{
struct hsr_priv *hsr;
struct hsr_port *port;
+ struct hsr_port *tmp;
int res;
hsr = netdev_priv(hsr_dev);
@@ -492,10 +495,10 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
return 0;
fail:
- hsr_for_each_port(hsr, port)
+ list_for_each_entry_safe(port, tmp, &hsr->ports, port_list)
hsr_del_port(port);
err_add_port:
- hsr_del_node(&hsr->self_node_db);
+ hsr_del_self_node(&hsr->self_node_db);
return res;
}
diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h
index 6d7759c4f5f9..d0fa6b0696d2 100644
--- a/net/hsr/hsr_device.h
+++ b/net/hsr/hsr_device.h
@@ -14,6 +14,7 @@
void hsr_dev_setup(struct net_device *dev);
int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
unsigned char multicast_spec, u8 protocol_version);
+void hsr_dev_destroy(struct net_device *hsr_dev);
void hsr_check_carrier_and_operstate(struct hsr_priv *hsr);
bool is_hsr_master(struct net_device *dev);
int hsr_get_max_mtu(struct hsr_priv *hsr);
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 2d7a19750436..292be446007b 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -104,7 +104,7 @@ int hsr_create_self_node(struct list_head *self_node_db,
return 0;
}
-void hsr_del_node(struct list_head *self_node_db)
+void hsr_del_self_node(struct list_head *self_node_db)
{
struct hsr_node *node;
@@ -117,6 +117,15 @@ void hsr_del_node(struct list_head *self_node_db)
}
}
+void hsr_del_nodes(struct list_head *node_db)
+{
+ struct hsr_node *node;
+ struct hsr_node *tmp;
+
+ list_for_each_entry_safe(node, tmp, node_db, mac_list)
+ kfree(node);
+}
+
/* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A;
* seq_out is used to initialize filtering of outgoing duplicate frames
* originating from the newly added node.
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
index a3bdcdab469d..89a3ce38151d 100644
--- a/net/hsr/hsr_framereg.h
+++ b/net/hsr/hsr_framereg.h
@@ -12,7 +12,8 @@
struct hsr_node;
-void hsr_del_node(struct list_head *self_node_db);
+void hsr_del_self_node(struct list_head *self_node_db);
+void hsr_del_nodes(struct list_head *node_db);
struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[],
u16 seq_out);
struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb,
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 8f8337f893ba..160edd24de4e 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -69,6 +69,12 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
return hsr_dev_finalize(dev, link, multicast_spec, hsr_version);
}
+static void hsr_dellink(struct net_device *hsr_dev, struct list_head *head)
+{
+ hsr_dev_destroy(hsr_dev);
+ unregister_netdevice_queue(hsr_dev, head);
+}
+
static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct hsr_priv *hsr;
@@ -113,6 +119,7 @@ static struct rtnl_link_ops hsr_link_ops __read_mostly = {
.priv_size = sizeof(struct hsr_priv),
.setup = hsr_dev_setup,
.newlink = hsr_newlink,
+ .dellink = hsr_dellink,
.fill_info = hsr_fill_info,
};
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index 88b6705ded83..ee561297d8a7 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -193,4 +193,5 @@ void hsr_del_port(struct hsr_port *port)
if (port != master)
dev_put(port->dev);
+ kfree(port);
}
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 9133e3cede77..e4aba5d485be 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -74,7 +74,7 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
key.src = *src;
key.dst = *dst;
- q = inet_frag_find(&ieee802154_lowpan->frags, &key);
+ q = inet_frag_find(ieee802154_lowpan->fqdir, &key);
if (!q)
return NULL;
@@ -134,7 +134,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
fq->q.flags |= INET_FRAG_FIRST_IN;
fq->q.meat += skb->len;
- add_frag_mem_limit(fq->q.net, skb->truesize);
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
fq->q.meat == fq->q.len) {
@@ -321,23 +321,18 @@ err:
static struct ctl_table lowpan_frags_ns_ctl_table[] = {
{
.procname = "6lowpanfrag_high_thresh",
- .data = &init_net.ieee802154_lowpan.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh
},
{
.procname = "6lowpanfrag_low_thresh",
- .data = &init_net.ieee802154_lowpan.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh
},
{
.procname = "6lowpanfrag_time",
- .data = &init_net.ieee802154_lowpan.frags.timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -372,17 +367,17 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net)
if (table == NULL)
goto err_alloc;
- table[0].data = &ieee802154_lowpan->frags.high_thresh;
- table[0].extra1 = &ieee802154_lowpan->frags.low_thresh;
- table[1].data = &ieee802154_lowpan->frags.low_thresh;
- table[1].extra2 = &ieee802154_lowpan->frags.high_thresh;
- table[2].data = &ieee802154_lowpan->frags.timeout;
-
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
table[0].procname = NULL;
}
+ table[0].data = &ieee802154_lowpan->fqdir->high_thresh;
+ table[0].extra1 = &ieee802154_lowpan->fqdir->low_thresh;
+ table[1].data = &ieee802154_lowpan->fqdir->low_thresh;
+ table[1].extra2 = &ieee802154_lowpan->fqdir->high_thresh;
+ table[2].data = &ieee802154_lowpan->fqdir->timeout;
+
hdr = register_net_sysctl(net, "net/ieee802154/6lowpan", table);
if (hdr == NULL)
goto err_reg;
@@ -449,32 +444,42 @@ static int __net_init lowpan_frags_init_net(struct net *net)
net_ieee802154_lowpan(net);
int res;
- ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
- ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
- ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
- ieee802154_lowpan->frags.f = &lowpan_frags;
- res = inet_frags_init_net(&ieee802154_lowpan->frags);
+ res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net);
if (res < 0)
return res;
+
+ ieee802154_lowpan->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ ieee802154_lowpan->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ ieee802154_lowpan->fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
res = lowpan_frags_ns_sysctl_register(net);
if (res < 0)
- inet_frags_exit_net(&ieee802154_lowpan->frags);
+ fqdir_exit(ieee802154_lowpan->fqdir);
return res;
}
+static void __net_exit lowpan_frags_pre_exit_net(struct net *net)
+{
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+
+ fqdir_pre_exit(ieee802154_lowpan->fqdir);
+}
+
static void __net_exit lowpan_frags_exit_net(struct net *net)
{
struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net);
lowpan_frags_ns_sysctl_unregister(net);
- inet_frags_exit_net(&ieee802154_lowpan->frags);
+ fqdir_exit(ieee802154_lowpan->fqdir);
}
static struct pernet_operations lowpan_frags_ops = {
- .init = lowpan_frags_init_net,
- .exit = lowpan_frags_exit_net,
+ .init = lowpan_frags_init_net,
+ .pre_exit = lowpan_frags_pre_exit_net,
+ .exit = lowpan_frags_exit_net,
};
static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
@@ -539,7 +544,7 @@ err_sysctl:
void lowpan_net_frag_exit(void)
{
- inet_frags_fini(&lowpan_frags);
lowpan_frags_sysctl_unregister();
unregister_pernet_subsys(&lowpan_frags_ops);
+ inet_frags_fini(&lowpan_frags);
}
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 000a61994c8f..d57ecfaf89d4 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -14,7 +14,7 @@ obj-y := route.o inetpeer.o protocol.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
- metrics.o netlink.o
+ metrics.o netlink.o nexthop.o
obj-$(CONFIG_BPFILTER) += bpfilter/
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 52bdb881a506..ed2301ef872e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -784,10 +784,8 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
}
EXPORT_SYMBOL(inet_getname);
-int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+int inet_send_prepare(struct sock *sk)
{
- struct sock *sk = sock->sk;
-
sock_rps_record_flow(sk);
/* We may need to bind the socket. */
@@ -795,7 +793,19 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
inet_autobind(sk))
return -EAGAIN;
- return sk->sk_prot->sendmsg(sk, msg, size);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(inet_send_prepare);
+
+int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+
+ if (unlikely(inet_send_prepare(sk)))
+ return -EAGAIN;
+
+ return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg,
+ sk, msg, size);
}
EXPORT_SYMBOL(inet_sendmsg);
@@ -804,11 +814,7 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
{
struct sock *sk = sock->sk;
- sock_rps_record_flow(sk);
-
- /* We may need to bind the socket. */
- if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
- inet_autobind(sk))
+ if (unlikely(inet_send_prepare(sk)))
return -EAGAIN;
if (sk->sk_prot->sendpage)
@@ -817,6 +823,8 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
}
EXPORT_SYMBOL(inet_sendpage);
+INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *,
+ size_t, int, int, int *));
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
@@ -827,8 +835,9 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
if (likely(!(flags & MSG_ERRQUEUE)))
sock_rps_record_flow(sk);
- err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
- flags & ~MSG_DONTWAIT, &addr_len);
+ err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
+ sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
return err;
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 9c3afd550612..974179b3b314 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -590,8 +590,7 @@ static void __exit ah4_fini(void)
{
if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&ah_type, AF_INET);
}
module_init(ah4_init);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c6bd0f7a020a..a4b5bd4d2c89 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -62,6 +62,11 @@
#include <net/net_namespace.h>
#include <net/addrconf.h>
+#define IPV6ONLY_FLAGS \
+ (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \
+ IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \
+ IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY)
+
static struct ipv4_devconf ipv4_devconf = {
.data = {
[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
@@ -190,7 +195,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain);
-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+static void inet_del_ifa(struct in_device *in_dev,
+ struct in_ifaddr __rcu **ifap,
int destroy);
#ifdef CONFIG_SYSCTL
static int devinet_sysctl_register(struct in_device *idev);
@@ -296,8 +302,8 @@ static void in_dev_rcu_put(struct rcu_head *head)
static void inetdev_destroy(struct in_device *in_dev)
{
- struct in_ifaddr *ifa;
struct net_device *dev;
+ struct in_ifaddr *ifa;
ASSERT_RTNL();
@@ -307,7 +313,7 @@ static void inetdev_destroy(struct in_device *in_dev)
ip_mc_destroy_dev(in_dev);
- while ((ifa = in_dev->ifa_list) != NULL) {
+ while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) {
inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
inet_free_ifa(ifa);
}
@@ -323,30 +329,35 @@ static void inetdev_destroy(struct in_device *in_dev)
int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
{
+ const struct in_ifaddr *ifa;
+
rcu_read_lock();
- for_primary_ifa(in_dev) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(a, ifa)) {
if (!b || inet_ifa_match(b, ifa)) {
rcu_read_unlock();
return 1;
}
}
- } endfor_ifa(in_dev);
+ }
rcu_read_unlock();
return 0;
}
-static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
- int destroy, struct nlmsghdr *nlh, u32 portid)
+static void __inet_del_ifa(struct in_device *in_dev,
+ struct in_ifaddr __rcu **ifap,
+ int destroy, struct nlmsghdr *nlh, u32 portid)
{
struct in_ifaddr *promote = NULL;
- struct in_ifaddr *ifa, *ifa1 = *ifap;
- struct in_ifaddr *last_prim = in_dev->ifa_list;
+ struct in_ifaddr *ifa, *ifa1;
+ struct in_ifaddr *last_prim;
struct in_ifaddr *prev_prom = NULL;
int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
ASSERT_RTNL();
+ ifa1 = rtnl_dereference(*ifap);
+ last_prim = rtnl_dereference(in_dev->ifa_list);
if (in_dev->dead)
goto no_promotions;
@@ -355,9 +366,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
**/
if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
- struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+ struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next;
- while ((ifa = *ifap1) != NULL) {
+ while ((ifa = rtnl_dereference(*ifap1)) != NULL) {
if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
ifa1->ifa_scope <= ifa->ifa_scope)
last_prim = ifa;
@@ -390,7 +401,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
* and later to add them back with new prefsrc. Do this
* while all addresses are on the device list.
*/
- for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+ for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) {
if (ifa1->ifa_mask == ifa->ifa_mask &&
inet_ifa_match(ifa1->ifa_address, ifa))
fib_del_ifaddr(ifa, ifa1);
@@ -416,19 +427,25 @@ no_promotions:
blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
if (promote) {
- struct in_ifaddr *next_sec = promote->ifa_next;
+ struct in_ifaddr *next_sec;
+ next_sec = rtnl_dereference(promote->ifa_next);
if (prev_prom) {
- prev_prom->ifa_next = promote->ifa_next;
- promote->ifa_next = last_prim->ifa_next;
- last_prim->ifa_next = promote;
+ struct in_ifaddr *last_sec;
+
+ rcu_assign_pointer(prev_prom->ifa_next, next_sec);
+
+ last_sec = rtnl_dereference(last_prim->ifa_next);
+ rcu_assign_pointer(promote->ifa_next, last_sec);
+ rcu_assign_pointer(last_prim->ifa_next, promote);
}
promote->ifa_flags &= ~IFA_F_SECONDARY;
rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain,
NETDEV_UP, promote);
- for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
+ for (ifa = next_sec; ifa;
+ ifa = rtnl_dereference(ifa->ifa_next)) {
if (ifa1->ifa_mask != ifa->ifa_mask ||
!inet_ifa_match(ifa1->ifa_address, ifa))
continue;
@@ -440,7 +457,8 @@ no_promotions:
inet_free_ifa(ifa1);
}
-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+static void inet_del_ifa(struct in_device *in_dev,
+ struct in_ifaddr __rcu **ifap,
int destroy)
{
__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
@@ -453,9 +471,10 @@ static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
u32 portid, struct netlink_ext_ack *extack)
{
+ struct in_ifaddr __rcu **last_primary, **ifap;
struct in_device *in_dev = ifa->ifa_dev;
- struct in_ifaddr *ifa1, **ifap, **last_primary;
struct in_validator_info ivi;
+ struct in_ifaddr *ifa1;
int ret;
ASSERT_RTNL();
@@ -468,8 +487,13 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
ifa->ifa_flags &= ~IFA_F_SECONDARY;
last_primary = &in_dev->ifa_list;
- for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
- ifap = &ifa1->ifa_next) {
+ /* Don't set IPv6 only flags to IPv4 addresses */
+ ifa->ifa_flags &= ~IPV6ONLY_FLAGS;
+
+ ifap = &in_dev->ifa_list;
+ ifa1 = rtnl_dereference(*ifap);
+
+ while (ifa1) {
if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
ifa->ifa_scope <= ifa1->ifa_scope)
last_primary = &ifa1->ifa_next;
@@ -485,6 +509,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
}
ifa->ifa_flags |= IFA_F_SECONDARY;
}
+
+ ifap = &ifa1->ifa_next;
+ ifa1 = rtnl_dereference(*ifap);
}
/* Allow any devices that wish to register ifaddr validtors to weigh
@@ -510,8 +537,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
ifap = last_primary;
}
- ifa->ifa_next = *ifap;
- *ifap = ifa;
+ rcu_assign_pointer(ifa->ifa_next, *ifap);
+ rcu_assign_pointer(*ifap, ifa);
inet_hash_insert(dev_net(in_dev->dev), ifa);
@@ -576,12 +603,14 @@ EXPORT_SYMBOL(inetdev_by_index);
struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
__be32 mask)
{
+ struct in_ifaddr *ifa;
+
ASSERT_RTNL();
- for_primary_ifa(in_dev) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
return ifa;
- } endfor_ifa(in_dev);
+ }
return NULL;
}
@@ -609,10 +638,12 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
+ struct in_ifaddr __rcu **ifap;
struct nlattr *tb[IFA_MAX+1];
struct in_device *in_dev;
struct ifaddrmsg *ifm;
- struct in_ifaddr *ifa, **ifap;
+ struct in_ifaddr *ifa;
+
int err = -EINVAL;
ASSERT_RTNL();
@@ -629,7 +660,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ for (ifap = &in_dev->ifa_list; (ifa = rtnl_dereference(*ifap)) != NULL;
ifap = &ifa->ifa_next) {
if (tb[IFA_LOCAL] &&
ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL]))
@@ -717,15 +748,19 @@ static void check_lifetime(struct work_struct *work)
if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
age >= ifa->ifa_valid_lft) {
- struct in_ifaddr **ifap;
+ struct in_ifaddr __rcu **ifap;
+ struct in_ifaddr *tmp;
- for (ifap = &ifa->ifa_dev->ifa_list;
- *ifap != NULL; ifap = &(*ifap)->ifa_next) {
- if (*ifap == ifa) {
+ ifap = &ifa->ifa_dev->ifa_list;
+ tmp = rtnl_dereference(*ifap);
+ while (tmp) {
+ if (tmp == ifa) {
inet_del_ifa(ifa->ifa_dev,
ifap, 1);
break;
}
+ ifap = &tmp->ifa_next;
+ tmp = rtnl_dereference(*ifap);
}
} else if (ifa->ifa_preferred_lft !=
INFINITY_LIFE_TIME &&
@@ -869,13 +904,12 @@ errout:
static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
{
struct in_device *in_dev = ifa->ifa_dev;
- struct in_ifaddr *ifa1, **ifap;
+ struct in_ifaddr *ifa1;
if (!ifa->ifa_local)
return NULL;
- for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
- ifap = &ifa1->ifa_next) {
+ in_dev_for_each_ifa_rtnl(ifa1, in_dev) {
if (ifa1->ifa_mask == ifa->ifa_mask &&
inet_ifa_match(ifa1->ifa_address, ifa) &&
ifa1->ifa_local == ifa->ifa_local)
@@ -970,8 +1004,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
{
struct sockaddr_in sin_orig;
struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr;
+ struct in_ifaddr __rcu **ifap = NULL;
struct in_device *in_dev;
- struct in_ifaddr **ifap = NULL;
struct in_ifaddr *ifa = NULL;
struct net_device *dev;
char *colon;
@@ -1042,7 +1076,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
/* note: we only do this for a limited set of ioctls
and only if the original address family was AF_INET.
This is checked above. */
- for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+
+ for (ifap = &in_dev->ifa_list;
+ (ifa = rtnl_dereference(*ifap)) != NULL;
ifap = &ifa->ifa_next) {
if (!strcmp(ifr->ifr_name, ifa->ifa_label) &&
sin_orig.sin_addr.s_addr ==
@@ -1055,7 +1091,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
4.3BSD-style and passed in junk so we fall back to
comparing just the label */
if (!ifa) {
- for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ for (ifap = &in_dev->ifa_list;
+ (ifa = rtnl_dereference(*ifap)) != NULL;
ifap = &ifa->ifa_next)
if (!strcmp(ifr->ifr_name, ifa->ifa_label))
break;
@@ -1204,7 +1241,7 @@ out:
static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
{
struct in_device *in_dev = __in_dev_get_rtnl(dev);
- struct in_ifaddr *ifa;
+ const struct in_ifaddr *ifa;
struct ifreq ifr;
int done = 0;
@@ -1214,7 +1251,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int s
if (!in_dev)
goto out;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (!buf) {
done += size;
continue;
@@ -1242,18 +1279,24 @@ out:
static __be32 in_dev_select_addr(const struct in_device *in_dev,
int scope)
{
- for_primary_ifa(in_dev) {
+ const struct in_ifaddr *ifa;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY)
+ continue;
if (ifa->ifa_scope != RT_SCOPE_LINK &&
ifa->ifa_scope <= scope)
return ifa->ifa_local;
- } endfor_ifa(in_dev);
+ }
return 0;
}
__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
{
+ const struct in_ifaddr *ifa;
__be32 addr = 0;
+ unsigned char localnet_scope = RT_SCOPE_HOST;
struct in_device *in_dev;
struct net *net = dev_net(dev);
int master_idx;
@@ -1263,8 +1306,13 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
if (!in_dev)
goto no_in_dev;
- for_primary_ifa(in_dev) {
- if (ifa->ifa_scope > scope)
+ if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
+ localnet_scope = RT_SCOPE_LINK;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY)
+ continue;
+ if (min(ifa->ifa_scope, localnet_scope) > scope)
continue;
if (!dst || inet_ifa_match(dst, ifa)) {
addr = ifa->ifa_local;
@@ -1272,7 +1320,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
}
if (!addr)
addr = ifa->ifa_local;
- } endfor_ifa(in_dev);
+ }
if (addr)
goto out_unlock;
@@ -1317,13 +1365,20 @@ EXPORT_SYMBOL(inet_select_addr);
static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
__be32 local, int scope)
{
- int same = 0;
+ unsigned char localnet_scope = RT_SCOPE_HOST;
+ const struct in_ifaddr *ifa;
__be32 addr = 0;
+ int same = 0;
+
+ if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
+ localnet_scope = RT_SCOPE_LINK;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ unsigned char min_scope = min(ifa->ifa_scope, localnet_scope);
- for_ifa(in_dev) {
if (!addr &&
(local == ifa->ifa_local || !local) &&
- ifa->ifa_scope <= scope) {
+ min_scope <= scope) {
addr = ifa->ifa_local;
if (same)
break;
@@ -1338,7 +1393,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
if (inet_ifa_match(addr, ifa))
break;
/* No, then can we use new local src? */
- if (ifa->ifa_scope <= scope) {
+ if (min_scope <= scope) {
addr = ifa->ifa_local;
break;
}
@@ -1346,7 +1401,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
same = 0;
}
}
- } endfor_ifa(in_dev);
+ }
return same ? addr : 0;
}
@@ -1420,7 +1475,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
struct in_ifaddr *ifa;
int named = 0;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
char old[IFNAMSIZ], *dot;
memcpy(old, ifa->ifa_label, IFNAMSIZ);
@@ -1450,10 +1505,9 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev,
struct in_device *in_dev)
{
- struct in_ifaddr *ifa;
+ const struct in_ifaddr *ifa;
- for (ifa = in_dev->ifa_list; ifa;
- ifa = ifa->ifa_next) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
arp_send(ARPOP_REQUEST, ETH_P_ARP,
ifa->ifa_local, dev,
ifa->ifa_local, NULL,
@@ -1723,15 +1777,17 @@ static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
int ip_idx = 0;
int err;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+ if (ip_idx < s_ip_idx) {
+ ip_idx++;
continue;
-
+ }
err = inet_fill_ifaddr(skb, ifa, fillargs);
if (err < 0)
goto done;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ ip_idx++;
}
err = 0;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b9ae95576084..5c967764041f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -33,8 +33,6 @@ struct esp_output_extra {
#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
-static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
-
/*
* Allocate an AEAD request structure with extra space for SG and IV.
*
@@ -506,7 +504,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
u32 padto;
- padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+ padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
if (skb->len < padto)
esp.tfclen = padto - skb->len;
}
@@ -788,28 +786,6 @@ out:
return err;
}
-static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
-{
- struct crypto_aead *aead = x->data;
- u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
- unsigned int net_adj;
-
- switch (x->props.mode) {
- case XFRM_MODE_TRANSPORT:
- case XFRM_MODE_BEET:
- net_adj = sizeof(struct iphdr);
- break;
- case XFRM_MODE_TUNNEL:
- net_adj = 0;
- break;
- default:
- BUG();
- }
-
- return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
- net_adj) & ~(blksize - 1)) + net_adj - 2;
-}
-
static int esp4_err(struct sk_buff *skb, u32 info)
{
struct net *net = dev_net(skb->dev);
@@ -1035,7 +1011,6 @@ static const struct xfrm_type esp_type =
.flags = XFRM_TYPE_REPLAY_PROT,
.init_state = esp_init_state,
.destructor = esp_destroy,
- .get_mtu = esp4_get_mtu,
.input = esp_input,
.output = esp_output,
};
@@ -1066,8 +1041,7 @@ static void __exit esp4_fini(void)
{
if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&esp_type, AF_INET);
}
module_init(esp4_init);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 2e5e377f50a1..0e4a7cf6bc87 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -312,9 +312,7 @@ static int __init esp4_offload_init(void)
static void __exit esp4_offload_exit(void)
{
- if (xfrm_unregister_type_offload(&esp_type_offload, AF_INET) < 0)
- pr_info("%s: can't remove xfrm type offload\n", __func__);
-
+ xfrm_unregister_type_offload(&esp_type_offload, AF_INET);
inet_del_offload(&esp4_offload, IPPROTO_ESP);
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e54c2bcbb465..317339cd7f03 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -39,6 +39,7 @@
#include <net/sock.h>
#include <net/arp.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/rtnetlink.h>
#include <net/xfrm.h>
#include <net/l3mdev.h>
@@ -188,7 +189,7 @@ int fib_unmerge(struct net *net)
return 0;
}
-static void fib_flush(struct net *net)
+void fib_flush(struct net *net)
{
int flushed = 0;
unsigned int h;
@@ -230,7 +231,9 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
if (table) {
ret = RTN_UNICAST;
if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
- if (!dev || dev == res.fi->fib_dev)
+ struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0);
+
+ if (!dev || dev == nhc->nhc_dev)
ret = res.type;
}
}
@@ -317,19 +320,19 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int ret;
- for (ret = 0; ret < fi->fib_nhs; ret++) {
- struct fib_nh *nh = &fi->fib_nh[ret];
+ for (ret = 0; ret < fib_info_num_path(fi); ret++) {
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
- if (nh->fib_nh_dev == dev) {
+ if (nhc->nhc_dev == dev) {
dev_match = true;
break;
- } else if (l3mdev_master_ifindex_rcu(nh->fib_nh_dev) == dev->ifindex) {
+ } else if (l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) {
dev_match = true;
break;
}
}
#else
- if (fi->fib_nh[0].fib_nh_dev == dev)
+ if (fib_info_nhc(fi, 0)->nhc_dev == dev)
dev_match = true;
#endif
@@ -536,14 +539,22 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
cfg->fc_oif = dev->ifindex;
cfg->fc_table = l3mdev_fib_table(dev);
if (colon) {
- struct in_ifaddr *ifa;
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ const struct in_ifaddr *ifa;
+ struct in_device *in_dev;
+
+ in_dev = __in_dev_get_rtnl(dev);
if (!in_dev)
return -ENODEV;
+
*colon = ':';
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+
+ rcu_read_lock();
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (strcmp(ifa->ifa_label, devname) == 0)
break;
+ }
+ rcu_read_unlock();
+
if (!ifa)
return -ENODEV;
cfg->fc_prefsrc = ifa->ifa_local;
@@ -641,6 +652,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
}
const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
+ [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 },
[RTA_DST] = { .type = NLA_U32 },
[RTA_SRC] = { .type = NLA_U32 },
[RTA_IIF] = { .type = NLA_U32 },
@@ -659,6 +671,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_IP_PROTO] = { .type = NLA_U8 },
[RTA_SPORT] = { .type = NLA_U16 },
[RTA_DPORT] = { .type = NLA_U16 },
+ [RTA_NH_ID] = { .type = NLA_U32 },
};
int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
@@ -796,6 +809,18 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
if (err < 0)
goto errout;
break;
+ case RTA_NH_ID:
+ cfg->fc_nh_id = nla_get_u32(attr);
+ break;
+ }
+ }
+
+ if (cfg->fc_nh_id) {
+ if (cfg->fc_oif || cfg->fc_gw_family ||
+ cfg->fc_encap || cfg->fc_mp) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop specification and nexthop id are mutually exclusive");
+ return -EINVAL;
}
}
@@ -822,6 +847,12 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout;
+ if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ err = -EINVAL;
+ goto errout;
+ }
+
tb = fib_get_table(net, cfg.fc_table);
if (!tb) {
NL_SET_ERR_MSG(extack, "FIB table does not exist");
@@ -881,10 +912,15 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
return -EINVAL;
}
+
if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
return -EINVAL;
}
+ if (rtm->rtm_flags & RTM_F_CLONED)
+ filter->dump_routes = false;
+ else
+ filter->dump_exceptions = false;
filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC);
filter->flags = rtm->rtm_flags;
@@ -931,9 +967,10 @@ EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct fib_dump_filter filter = { .dump_routes = true,
+ .dump_exceptions = true };
const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
- struct fib_dump_filter filter = {};
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib_table *tb;
@@ -950,8 +987,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
}
- /* fib entries are never clones and ipv4 does not use prefix flag */
- if (filter.flags & (RTM_F_PREFIX | RTM_F_CLONED))
+ /* ipv4 does not use prefix flag */
+ if (filter.flags & RTM_F_PREFIX)
return skb->len;
if (filter.table_id) {
@@ -1172,8 +1209,8 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
*
* Scan address list to be sure that addresses are really gone.
*/
-
- for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+ rcu_read_lock();
+ in_dev_for_each_ifa_rcu(ifa1, in_dev) {
if (ifa1 == ifa) {
/* promotion, keep the IP */
gone = 0;
@@ -1241,6 +1278,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
}
}
}
+ rcu_read_unlock();
no_promotions:
if (!(ok & BRD_OK))
@@ -1410,6 +1448,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
struct netdev_notifier_info_ext *info_ext = ptr;
struct in_device *in_dev;
struct net *net = dev_net(dev);
+ struct in_ifaddr *ifa;
unsigned int flags;
if (event == NETDEV_UNREGISTER) {
@@ -1424,9 +1463,9 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
switch (event) {
case NETDEV_UP:
- for_ifa(in_dev) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
fib_add_ifaddr(ifa);
- } endfor_ifa(in_dev);
+ }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev, RTNH_F_DEAD);
#endif
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 7945f0534db7..a68b5e21ec51 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/list.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
struct fib_alias {
struct hlist_node fa_list;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index a38e86b98e4f..b43a7ba5c6a4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -27,6 +27,7 @@
#include <net/route.h>
#include <net/tcp.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/fib_rules.h>
struct fib4_rule {
@@ -141,8 +142,11 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg
struct fib_result *result = (struct fib_result *) arg->result;
struct net_device *dev = NULL;
- if (result->fi)
- dev = result->fi->fib_dev;
+ if (result->fi) {
+ struct fib_nh_common *nhc = fib_info_nhc(result->fi, 0);
+
+ dev = nhc->nhc_dev;
+ }
/* do not accept result if the route does
* not meet the required prefix length
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index bfa49a88d03a..2db089e10ba0 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -38,6 +38,7 @@
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/ip6_fib.h>
+#include <net/nexthop.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
@@ -56,18 +57,21 @@ static unsigned int fib_info_cnt;
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
+/* for_nexthops and change_nexthops only used when nexthop object
+ * is not set in a fib_info. The logic within can reference fib_nh.
+ */
#ifdef CONFIG_IP_ROUTE_MULTIPATH
#define for_nexthops(fi) { \
int nhsel; const struct fib_nh *nh; \
for (nhsel = 0, nh = (fi)->fib_nh; \
- nhsel < (fi)->fib_nhs; \
+ nhsel < fib_info_num_path((fi)); \
nh++, nhsel++)
#define change_nexthops(fi) { \
int nhsel; struct fib_nh *nexthop_nh; \
for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
- nhsel < (fi)->fib_nhs; \
+ nhsel < fib_info_num_path((fi)); \
nexthop_nh++, nhsel++)
#else /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -228,9 +232,13 @@ static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
- change_nexthops(fi) {
- fib_nh_release(fi->fib_net, nexthop_nh);
- } endfor_nexthops(fi);
+ if (fi->nh) {
+ nexthop_put(fi->nh);
+ } else {
+ change_nexthops(fi) {
+ fib_nh_release(fi->fib_net, nexthop_nh);
+ } endfor_nexthops(fi);
+ }
ip_fib_metrics_put(fi->fib_metrics);
@@ -256,22 +264,34 @@ void fib_release_info(struct fib_info *fi)
hlist_del(&fi->fib_hash);
if (fi->fib_prefsrc)
hlist_del(&fi->fib_lhash);
- change_nexthops(fi) {
- if (!nexthop_nh->fib_nh_dev)
- continue;
- hlist_del(&nexthop_nh->nh_hash);
- } endfor_nexthops(fi)
+ if (fi->nh) {
+ list_del(&fi->nh_list);
+ } else {
+ change_nexthops(fi) {
+ if (!nexthop_nh->fib_nh_dev)
+ continue;
+ hlist_del(&nexthop_nh->nh_hash);
+ } endfor_nexthops(fi)
+ }
fi->fib_dead = 1;
fib_info_put(fi);
}
spin_unlock_bh(&fib_info_lock);
}
-static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi)
{
- const struct fib_nh *onh = ofi->fib_nh;
+ const struct fib_nh *onh;
+
+ if (fi->nh || ofi->nh)
+ return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1;
+
+ if (ofi->fib_nhs == 0)
+ return 0;
for_nexthops(fi) {
+ onh = fib_info_nh(ofi, nhsel);
+
if (nh->fib_nh_oif != onh->fib_nh_oif ||
nh->fib_nh_gw_family != onh->fib_nh_gw_family ||
nh->fib_nh_scope != onh->fib_nh_scope ||
@@ -292,8 +312,6 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
if (nh->fib_nh_gw_family == AF_INET6 &&
ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6))
return -1;
-
- onh++;
} endfor_nexthops(fi);
return 0;
}
@@ -307,22 +325,78 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}
-static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
+static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
+ u32 prefsrc, u32 priority)
{
- unsigned int mask = (fib_info_hash_size - 1);
- unsigned int val = fi->fib_nhs;
+ unsigned int val = init_val;
- val ^= (fi->fib_protocol << 8) | fi->fib_scope;
- val ^= (__force u32)fi->fib_prefsrc;
- val ^= fi->fib_priority;
- for_nexthops(fi) {
- val ^= fib_devindex_hashfn(nh->fib_nh_oif);
- } endfor_nexthops(fi)
+ val ^= (protocol << 8) | scope;
+ val ^= prefsrc;
+ val ^= priority;
+
+ return val;
+}
+
+static unsigned int fib_info_hashfn_result(unsigned int val)
+{
+ unsigned int mask = (fib_info_hash_size - 1);
return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}
-static struct fib_info *fib_find_info(const struct fib_info *nfi)
+static inline unsigned int fib_info_hashfn(struct fib_info *fi)
+{
+ unsigned int val;
+
+ val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol,
+ fi->fib_scope, (__force u32)fi->fib_prefsrc,
+ fi->fib_priority);
+
+ if (fi->nh) {
+ val ^= fib_devindex_hashfn(fi->nh->id);
+ } else {
+ for_nexthops(fi) {
+ val ^= fib_devindex_hashfn(nh->fib_nh_oif);
+ } endfor_nexthops(fi)
+ }
+
+ return fib_info_hashfn_result(val);
+}
+
+/* no metrics, only nexthop id */
+static struct fib_info *fib_find_info_nh(struct net *net,
+ const struct fib_config *cfg)
+{
+ struct hlist_head *head;
+ struct fib_info *fi;
+ unsigned int hash;
+
+ hash = fib_info_hashfn_1(fib_devindex_hashfn(cfg->fc_nh_id),
+ cfg->fc_protocol, cfg->fc_scope,
+ (__force u32)cfg->fc_prefsrc,
+ cfg->fc_priority);
+ hash = fib_info_hashfn_result(hash);
+ head = &fib_info_hash[hash];
+
+ hlist_for_each_entry(fi, head, fib_hash) {
+ if (!net_eq(fi->fib_net, net))
+ continue;
+ if (!fi->nh || fi->nh->id != cfg->fc_nh_id)
+ continue;
+ if (cfg->fc_protocol == fi->fib_protocol &&
+ cfg->fc_scope == fi->fib_scope &&
+ cfg->fc_prefsrc == fi->fib_prefsrc &&
+ cfg->fc_priority == fi->fib_priority &&
+ cfg->fc_type == fi->fib_type &&
+ cfg->fc_table == fi->fib_tb_id &&
+ !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK))
+ return fi;
+ }
+
+ return NULL;
+}
+
+static struct fib_info *fib_find_info(struct fib_info *nfi)
{
struct hlist_head *head;
struct fib_info *fi;
@@ -344,7 +418,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
memcmp(nfi->fib_metrics, fi->fib_metrics,
sizeof(u32) * RTAX_MAX) == 0 &&
!((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
- (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
+ nh_comp(fi, nfi) == 0)
return fi;
}
@@ -386,34 +460,40 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
+ nla_total_size(4) /* RTA_PRIORITY */
+ nla_total_size(4) /* RTA_PREFSRC */
+ nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
+ unsigned int nhs = fib_info_num_path(fi);
/* space for nested metrics */
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
- if (fi->fib_nhs) {
+ if (fi->nh)
+ payload += nla_total_size(4); /* RTA_NH_ID */
+
+ if (nhs) {
size_t nh_encapsize = 0;
- /* Also handles the special case fib_nhs == 1 */
+ /* Also handles the special case nhs == 1 */
/* each nexthop is packed in an attribute */
size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
+ unsigned int i;
/* may contain flow and gateway attribute */
nhsize += 2 * nla_total_size(4);
/* grab encap info */
- for_nexthops(fi) {
- if (nh->fib_nh_lws) {
+ for (i = 0; i < fib_info_num_path(fi); i++) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, i);
+
+ if (nhc->nhc_lwtstate) {
/* RTA_ENCAP_TYPE */
nh_encapsize += lwtunnel_get_encap_size(
- nh->fib_nh_lws);
+ nhc->nhc_lwtstate);
/* RTA_ENCAP */
nh_encapsize += nla_total_size(2);
}
- } endfor_nexthops(fi);
+ }
/* all nexthops are packed in a nested attribute */
- payload += nla_total_size((fi->fib_nhs * nhsize) +
- nh_encapsize);
+ payload += nla_total_size((nhs * nhsize) + nh_encapsize);
}
@@ -574,12 +654,14 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
return nhs;
}
+/* only called when fib_nh is integrated into fib_info */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
int remaining, struct fib_config *cfg,
struct netlink_ext_ack *extack)
{
struct net *net = fi->fib_net;
struct fib_config fib_cfg;
+ struct fib_nh *nh;
int ret;
change_nexthops(fi) {
@@ -642,24 +724,25 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
} endfor_nexthops(fi);
ret = -EINVAL;
- if (cfg->fc_oif && fi->fib_nh->fib_nh_oif != cfg->fc_oif) {
+ nh = fib_info_nh(fi, 0);
+ if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) {
NL_SET_ERR_MSG(extack,
"Nexthop device index does not match RTA_OIF");
goto errout;
}
if (cfg->fc_gw_family) {
- if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family ||
+ if (cfg->fc_gw_family != nh->fib_nh_gw_family ||
(cfg->fc_gw_family == AF_INET &&
- fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) ||
+ nh->fib_nh_gw4 != cfg->fc_gw4) ||
(cfg->fc_gw_family == AF_INET6 &&
- ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) {
+ ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) {
NL_SET_ERR_MSG(extack,
"Nexthop gateway does not match RTA_GATEWAY or RTA_VIA");
goto errout;
}
}
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) {
+ if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) {
NL_SET_ERR_MSG(extack,
"Nexthop class id does not match RTA_FLOW");
goto errout;
@@ -670,12 +753,13 @@ errout:
return ret;
}
+/* only called when fib_nh is integrated into fib_info */
static void fib_rebalance(struct fib_info *fi)
{
int total;
int w;
- if (fi->fib_nhs < 2)
+ if (fib_info_num_path(fi) < 2)
return;
total = 0;
@@ -756,28 +840,36 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
return 1;
+ if (cfg->fc_nh_id) {
+ if (fi->nh && cfg->fc_nh_id == fi->nh->id)
+ return 0;
+ return 1;
+ }
+
if (cfg->fc_oif || cfg->fc_gw_family) {
+ struct fib_nh *nh = fib_info_nh(fi, 0);
+
if (cfg->fc_encap) {
if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
- fi->fib_nh, cfg, extack))
+ nh, cfg, extack))
return 1;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
if (cfg->fc_flow &&
- cfg->fc_flow != fi->fib_nh->nh_tclassid)
+ cfg->fc_flow != nh->nh_tclassid)
return 1;
#endif
- if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) ||
+ if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) ||
(cfg->fc_gw_family &&
- cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family))
+ cfg->fc_gw_family != nh->fib_nh_gw_family))
return 1;
if (cfg->fc_gw_family == AF_INET &&
- cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4)
+ cfg->fc_gw4 != nh->fib_nh_gw4)
return 1;
if (cfg->fc_gw_family == AF_INET6 &&
- ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6))
+ ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6))
return 1;
return 0;
@@ -1088,15 +1180,13 @@ out:
return err;
}
-static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
- struct netlink_ext_ack *extack)
+int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
+ struct netlink_ext_ack *extack)
{
- struct net *net = cfg->fc_nlinfo.nl_net;
- u32 table = cfg->fc_table;
int err;
if (nh->fib_nh_gw_family == AF_INET)
- err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack);
+ err = fib_check_nh_v4_gw(net, nh, table, scope, extack);
else if (nh->fib_nh_gw_family == AF_INET6)
err = fib_check_nh_v6_gw(net, nh, table, extack);
else
@@ -1187,11 +1277,16 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
fib_info_hash_free(old_laddrhash, bytes);
}
-__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
+__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
+ unsigned char scope)
{
- nh->nh_saddr = inet_select_addr(nh->fib_nh_dev,
- nh->fib_nh_gw4,
- nh->nh_parent->fib_scope);
+ struct fib_nh *nh;
+
+ if (nhc->nhc_family != AF_INET)
+ return inet_select_addr(nhc->nhc_dev, 0, scope);
+
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope);
nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
return nh->nh_saddr;
@@ -1200,16 +1295,19 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
__be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
{
struct fib_nh_common *nhc = res->nhc;
- struct fib_nh *nh;
if (res->fi->fib_prefsrc)
return res->fi->fib_prefsrc;
- nh = container_of(nhc, struct fib_nh, nh_common);
- if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid))
- return nh->nh_saddr;
+ if (nhc->nhc_family == AF_INET) {
+ struct fib_nh *nh;
+
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid))
+ return nh->nh_saddr;
+ }
- return fib_info_update_nh_saddr(net, nh);
+ return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope);
}
static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
@@ -1241,6 +1339,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
{
int err;
struct fib_info *fi = NULL;
+ struct nexthop *nh = NULL;
struct fib_info *ofi;
int nhs = 1;
struct net *net = cfg->fc_nlinfo.nl_net;
@@ -1260,6 +1359,23 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto err_inval;
}
+ if (cfg->fc_nh_id) {
+ if (!cfg->fc_mx) {
+ fi = fib_find_info_nh(net, cfg);
+ if (fi) {
+ fi->fib_treeref++;
+ return fi;
+ }
+ }
+
+ nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+ if (!nh) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ goto err_inval;
+ }
+ nhs = 0;
+ }
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (cfg->fc_mp) {
nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack);
@@ -1295,7 +1411,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto failure;
fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
cfg->fc_mx_len, extack);
- if (unlikely(IS_ERR(fi->fib_metrics))) {
+ if (IS_ERR(fi->fib_metrics)) {
err = PTR_ERR(fi->fib_metrics);
kfree(fi);
return ERR_PTR(err);
@@ -1312,14 +1428,25 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
fi->fib_tb_id = cfg->fc_table;
fi->fib_nhs = nhs;
- change_nexthops(fi) {
- nexthop_nh->nh_parent = fi;
- } endfor_nexthops(fi)
+ if (nh) {
+ if (!nexthop_get(nh)) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ err = -EINVAL;
+ } else {
+ err = 0;
+ fi->nh = nh;
+ }
+ } else {
+ change_nexthops(fi) {
+ nexthop_nh->nh_parent = fi;
+ } endfor_nexthops(fi)
- if (cfg->fc_mp)
- err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
- else
- err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
+ if (cfg->fc_mp)
+ err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg,
+ extack);
+ else
+ err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
+ }
if (err != 0)
goto failure;
@@ -1350,7 +1477,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto err_inval;
}
- if (cfg->fc_scope == RT_SCOPE_HOST) {
+ if (fi->nh) {
+ err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack);
+ if (err)
+ goto failure;
+ } else if (cfg->fc_scope == RT_SCOPE_HOST) {
struct fib_nh *nh = fi->fib_nh;
/* Local address is added. */
@@ -1365,7 +1496,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto err_inval;
}
nh->fib_nh_scope = RT_SCOPE_NOWHERE;
- nh->fib_nh_dev = dev_get_by_index(net, fi->fib_nh->fib_nh_oif);
+ nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif);
err = -ENODEV;
if (!nh->fib_nh_dev)
goto failure;
@@ -1373,7 +1504,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
int linkdown = 0;
change_nexthops(fi) {
- err = fib_check_nh(cfg, nexthop_nh, extack);
+ err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh,
+ cfg->fc_table, cfg->fc_scope,
+ extack);
if (err != 0)
goto failure;
if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN)
@@ -1388,13 +1521,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto err_inval;
}
- change_nexthops(fi) {
- fib_info_update_nh_saddr(net, nexthop_nh);
- if (nexthop_nh->fib_nh_gw_family == AF_INET6)
- fi->fib_nh_is_v6 = true;
- } endfor_nexthops(fi)
+ if (!fi->nh) {
+ change_nexthops(fi) {
+ fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common,
+ fi->fib_scope);
+ if (nexthop_nh->fib_nh_gw_family == AF_INET6)
+ fi->fib_nh_is_v6 = true;
+ } endfor_nexthops(fi)
- fib_rebalance(fi);
+ fib_rebalance(fi);
+ }
link_it:
ofi = fib_find_info(fi);
@@ -1416,16 +1552,20 @@ link_it:
head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
hlist_add_head(&fi->fib_lhash, head);
}
- change_nexthops(fi) {
- struct hlist_head *head;
- unsigned int hash;
+ if (fi->nh) {
+ list_add(&fi->nh_list, &nh->fi_list);
+ } else {
+ change_nexthops(fi) {
+ struct hlist_head *head;
+ unsigned int hash;
- if (!nexthop_nh->fib_nh_dev)
- continue;
- hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex);
- head = &fib_info_devhash[hash];
- hlist_add_head(&nexthop_nh->nh_hash, head);
- } endfor_nexthops(fi)
+ if (!nexthop_nh->fib_nh_dev)
+ continue;
+ hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex);
+ head = &fib_info_devhash[hash];
+ hlist_add_head(&nexthop_nh->nh_hash, head);
+ } endfor_nexthops(fi)
+ }
spin_unlock_bh(&fib_info_lock);
return fi;
@@ -1552,6 +1692,12 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
if (!mp)
goto nla_put_failure;
+ if (unlikely(fi->nh)) {
+ if (nexthop_mpath_fill_node(skb, fi->nh) < 0)
+ goto nla_put_failure;
+ goto mp_end;
+ }
+
for_nexthops(fi) {
if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0)
goto nla_put_failure;
@@ -1562,6 +1708,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
#endif
} endfor_nexthops(fi);
+mp_end:
nla_nest_end(skb, mp);
return 0;
@@ -1580,6 +1727,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
struct fib_info *fi, unsigned int flags)
{
+ unsigned int nhs = fib_info_num_path(fi);
struct nlmsghdr *nlh;
struct rtmsg *rtm;
@@ -1615,18 +1763,31 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
if (fi->fib_prefsrc &&
nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
goto nla_put_failure;
- if (fi->fib_nhs == 1) {
- struct fib_nh *nh = &fi->fib_nh[0];
+
+ if (fi->nh) {
+ if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id))
+ goto nla_put_failure;
+ if (nexthop_is_blackhole(fi->nh))
+ rtm->rtm_type = RTN_BLACKHOLE;
+ }
+
+ if (nhs == 1) {
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
unsigned char flags = 0;
- if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0)
+ if (fib_nexthop_info(skb, nhc, &flags, false) < 0)
goto nla_put_failure;
rtm->rtm_flags = flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (nh->nh_tclassid &&
- nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
- goto nla_put_failure;
+ if (nhc->nhc_family == AF_INET) {
+ struct fib_nh *nh;
+
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ if (nh->nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
+ goto nla_put_failure;
+ }
#endif
} else {
if (fib_add_multipath(skb, fi) < 0)
@@ -1709,7 +1870,7 @@ static int call_fib_nh_notifiers(struct fib_nh *nh,
* - if the new MTU is greater than the PMTU, don't make any change
* - otherwise, unlock and set PMTU
*/
-static void nh_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
+void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
{
struct fnhe_hash_bucket *bucket;
int i;
@@ -1745,7 +1906,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
hlist_for_each_entry(nh, head, nh_hash) {
if (nh->fib_nh_dev == dev)
- nh_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
+ fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
}
}
@@ -1754,6 +1915,8 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
* NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host
* NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed
* NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed
+ *
+ * only used when fib_nh is built into fib_info
*/
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
@@ -1835,6 +1998,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
struct fib_info *next_fi = fa->fa_info;
+ struct fib_nh *nh;
if (fa->fa_slen != slen)
continue;
@@ -1856,8 +2020,9 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
if (next_fi->fib_scope != res->scope ||
fa->fa_type != RTN_UNICAST)
continue;
- if (!next_fi->fib_nh[0].fib_nh_gw4 ||
- next_fi->fib_nh[0].fib_nh_scope != RT_SCOPE_LINK)
+
+ nh = fib_info_nh(next_fi, 0);
+ if (!nh->fib_nh_gw4 || nh->fib_nh_scope != RT_SCOPE_LINK)
continue;
fib_alias_accessed(fa);
@@ -1899,6 +2064,8 @@ out:
/*
* Dead device goes up. We wake up dead nexthops.
* It takes sense only on multipath routes.
+ *
+ * only used when fib_nh is built into fib_info
*/
int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
{
@@ -1993,6 +2160,11 @@ void fib_select_multipath(struct fib_result *res, int hash)
struct net *net = fi->fib_net;
bool first = false;
+ if (unlikely(res->fi->nh)) {
+ nexthop_path_fib_result(res, hash);
+ return;
+ }
+
change_nexthops(fi) {
if (net->ipv4.sysctl_fib_multipath_use_neigh) {
if (!fib_good_nh(nexthop_nh))
@@ -2021,7 +2193,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
goto check_saddr;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi->fib_nhs > 1) {
+ if (fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(net, fl4, skb, NULL);
fib_select_multipath(res, h);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 868c74771fa9..2b2b3d291ab0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -338,12 +338,18 @@ static struct tnode *tnode_alloc(int bits)
static inline void empty_child_inc(struct key_vector *n)
{
- ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
+ tn_info(n)->empty_children++;
+
+ if (!tn_info(n)->empty_children)
+ tn_info(n)->full_children++;
}
static inline void empty_child_dec(struct key_vector *n)
{
- tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
+ if (!tn_info(n)->empty_children)
+ tn_info(n)->full_children--;
+
+ tn_info(n)->empty_children--;
}
static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
@@ -1449,6 +1455,7 @@ found:
fib_alias_accessed(fa);
err = fib_props[fa->fa_type].error;
if (unlikely(err < 0)) {
+out_reject:
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
@@ -1457,7 +1464,13 @@ found:
}
if (fi->fib_flags & RTNH_F_DEAD)
continue;
- for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+
+ if (unlikely(fi->nh && nexthop_is_blackhole(fi->nh))) {
+ err = fib_props[RTN_BLACKHOLE].error;
+ goto out_reject;
+ }
+
+ for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
if (nhc->nhc_flags & RTNH_F_DEAD)
@@ -1931,6 +1944,77 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
return found;
}
+/* derived from fib_trie_free */
+static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
+ struct nl_info *info)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct fib_alias *fa;
+
+ for (;;) {
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ if (IS_TRIE(pn))
+ break;
+
+ pn = node_parent(pn);
+ cindex = get_index(pkey, pn);
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
+
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry(fa, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id)
+ continue;
+
+ rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
+ KEYLENGTH - fa->fa_slen, tb->tb_id,
+ info, NLM_F_REPLACE);
+
+ /* call_fib_entry_notifiers will be removed when
+ * in-kernel notifier is implemented and supported
+ * for nexthop objects
+ */
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
+ n->key,
+ KEYLENGTH - fa->fa_slen, fa,
+ NULL);
+ }
+ }
+}
+
+void fib_info_notify_update(struct net *net, struct nl_info *info)
+{
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist)
+ __fib_info_notify_update(net, tb, info);
+ }
+}
+
static void fib_leaf_notify(struct net *net, struct key_vector *l,
struct fib_table *tb, struct notifier_block *nb)
{
@@ -2006,22 +2090,26 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
{
unsigned int flags = NLM_F_MULTI;
__be32 xkey = htonl(l->key);
+ int i, s_i, i_fa, s_fa, err;
struct fib_alias *fa;
- int i, s_i;
- if (filter->filter_set)
+ if (filter->filter_set ||
+ !filter->dump_exceptions || !filter->dump_routes)
flags |= NLM_F_DUMP_FILTERED;
s_i = cb->args[4];
+ s_fa = cb->args[5];
i = 0;
/* rcu_read_lock is hold by caller */
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
- int err;
+ struct fib_info *fi = fa->fa_info;
if (i < s_i)
goto next;
+ i_fa = 0;
+
if (tb->tb_id != fa->tb_id)
goto next;
@@ -2030,29 +2118,49 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
goto next;
if ((filter->protocol &&
- fa->fa_info->fib_protocol != filter->protocol))
+ fi->fib_protocol != filter->protocol))
goto next;
if (filter->dev &&
- !fib_info_nh_uses_dev(fa->fa_info, filter->dev))
+ !fib_info_nh_uses_dev(fi, filter->dev))
goto next;
}
- err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, RTM_NEWROUTE,
- tb->tb_id, fa->fa_type,
- xkey, KEYLENGTH - fa->fa_slen,
- fa->fa_tos, fa->fa_info, flags);
- if (err < 0) {
- cb->args[4] = i;
- return err;
+ if (filter->dump_routes) {
+ if (!s_fa) {
+ err = fib_dump_info(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWROUTE,
+ tb->tb_id, fa->fa_type,
+ xkey,
+ KEYLENGTH - fa->fa_slen,
+ fa->fa_tos, fi, flags);
+ if (err < 0)
+ goto stop;
+ }
+
+ i_fa++;
}
+
+ if (filter->dump_exceptions) {
+ err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
+ &i_fa, s_fa);
+ if (err < 0)
+ goto stop;
+ }
+
next:
i++;
}
cb->args[4] = i;
return skb->len;
+
+stop:
+ cb->args[4] = i;
+ cb->args[5] = i_fa;
+ return err;
}
/* rcu_read_lock needs to be hold by caller from readside */
@@ -2634,14 +2742,18 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
rcu_read_unlock();
}
-static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
{
unsigned int flags = 0;
if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
flags = RTF_REJECT;
- if (fi && fi->fib_nh->fib_nh_gw4)
- flags |= RTF_GATEWAY;
+ if (fi) {
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+
+ if (nhc->nhc_gw.ipv4)
+ flags |= RTF_GATEWAY;
+ }
if (mask == htonl(0xFFFFFFFF))
flags |= RTF_HOST;
flags |= RTF_UP;
@@ -2672,7 +2784,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
prefix = htonl(l->key);
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
- const struct fib_info *fi = fa->fa_info;
+ struct fib_info *fi = fa->fa_info;
__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
@@ -2685,26 +2797,31 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
seq_setwidth(seq, 127);
- if (fi)
+ if (fi) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+ __be32 gw = 0;
+
+ if (nhc->nhc_gw_family == AF_INET)
+ gw = nhc->nhc_gw.ipv4;
+
seq_printf(seq,
"%s\t%08X\t%08X\t%04X\t%d\t%u\t"
"%d\t%08X\t%d\t%u\t%u",
- fi->fib_dev ? fi->fib_dev->name : "*",
- prefix,
- fi->fib_nh->fib_nh_gw4, flags, 0, 0,
+ nhc->nhc_dev ? nhc->nhc_dev->name : "*",
+ prefix, gw, flags, 0, 0,
fi->fib_priority,
mask,
(fi->fib_advmss ?
fi->fib_advmss + 40 : 0),
fi->fib_window,
fi->fib_rtt >> 3);
- else
+ } else {
seq_printf(seq,
"*\t%08X\t%08X\t%04X\t%d\t%u\t"
"%d\t%08X\t%d\t%u\t%u",
prefix, 0, flags, 0, 0, 0,
mask, 0, 0, 0);
-
+ }
seq_pad(seq, '\n');
}
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 293acfb36376..44bfeecac33e 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -83,7 +83,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
options = (__be32 *)(greh + 1);
if (greh->flags & GRE_CSUM) {
if (!skb_checksum_simple_validate(skb)) {
- skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ skb_checksum_try_convert(skb, IPPROTO_GRE,
null_compute_pseudo);
} else if (csum_err) {
*csum_err = true;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 7c857c72aad1..1510e951f451 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -201,7 +201,7 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
*/
static struct sock *icmp_sk(struct net *net)
{
- return *this_cpu_ptr(net->ipv4.icmp_sk);
+ return this_cpu_read(*net->ipv4.icmp_sk);
}
/* Called with BH disabled */
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a57f0d69eadb..180f6896b98b 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -332,14 +332,15 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev,
const struct flowi4 *fl4)
{
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ const struct in_ifaddr *ifa;
if (!in_dev)
return htonl(INADDR_ANY);
- for_ifa(in_dev) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (fl4->saddr == ifa->ifa_local)
return fl4->saddr;
- } endfor_ifa(in_dev);
+ }
return htonl(INADDR_ANY);
}
@@ -1228,12 +1229,8 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
if (pmc) {
im->interface = pmc->interface;
if (im->sfmode == MCAST_INCLUDE) {
- im->tomb = pmc->tomb;
- pmc->tomb = NULL;
-
- im->sources = pmc->sources;
- pmc->sources = NULL;
-
+ swap(im->tomb, pmc->tomb);
+ swap(im->sources, pmc->sources);
for (psf = im->sources; psf; psf = psf->sf_next)
psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
} else {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7fd6db3fe366..f5c163d4771b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -649,8 +649,7 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
EXPORT_SYMBOL(inet_rtx_syn_ack);
/* return true if req was found in the ehash table */
-static bool reqsk_queue_unlink(struct request_sock_queue *queue,
- struct request_sock *req)
+static bool reqsk_queue_unlink(struct request_sock *req)
{
struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
bool found = false;
@@ -669,7 +668,7 @@ static bool reqsk_queue_unlink(struct request_sock_queue *queue,
void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
{
- if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
+ if (reqsk_queue_unlink(req)) {
reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
reqsk_put(req);
}
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5ce6969896f5..d666756be5f1 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -106,48 +106,90 @@ int inet_frags_init(struct inet_frags *f)
if (!f->frags_cachep)
return -ENOMEM;
+ refcount_set(&f->refcnt, 1);
+ init_completion(&f->completion);
return 0;
}
EXPORT_SYMBOL(inet_frags_init);
void inet_frags_fini(struct inet_frags *f)
{
- /* We must wait that all inet_frag_destroy_rcu() have completed. */
- rcu_barrier();
+ if (refcount_dec_and_test(&f->refcnt))
+ complete(&f->completion);
+
+ wait_for_completion(&f->completion);
kmem_cache_destroy(f->frags_cachep);
f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);
+/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
static void inet_frags_free_cb(void *ptr, void *arg)
{
struct inet_frag_queue *fq = ptr;
+ int count;
- /* If we can not cancel the timer, it means this frag_queue
- * is already disappearing, we have nothing to do.
- * Otherwise, we own a refcount until the end of this function.
- */
- if (!del_timer(&fq->timer))
- return;
+ count = del_timer_sync(&fq->timer) ? 1 : 0;
spin_lock_bh(&fq->lock);
if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq->flags |= INET_FRAG_COMPLETE;
- refcount_dec(&fq->refcnt);
+ count++;
+ } else if (fq->flags & INET_FRAG_HASH_DEAD) {
+ count++;
}
spin_unlock_bh(&fq->lock);
- inet_frag_put(fq);
+ if (refcount_sub_and_test(count, &fq->refcnt))
+ inet_frag_destroy(fq);
+}
+
+static void fqdir_work_fn(struct work_struct *work)
+{
+ struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
+ struct inet_frags *f = fqdir->f;
+
+ rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
+
+ /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
+ * have completed, since they need to dereference fqdir.
+ * Would it not be nice to have kfree_rcu_barrier() ? :)
+ */
+ rcu_barrier();
+
+ if (refcount_dec_and_test(&f->refcnt))
+ complete(&f->completion);
+
+ kfree(fqdir);
}
-void inet_frags_exit_net(struct netns_frags *nf)
+int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
{
- nf->high_thresh = 0; /* prevent creation of new frags */
+ struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
+ int res;
- rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
+ if (!fqdir)
+ return -ENOMEM;
+ fqdir->f = f;
+ fqdir->net = net;
+ res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
+ if (res < 0) {
+ kfree(fqdir);
+ return res;
+ }
+ refcount_inc(&f->refcnt);
+ *fqdirp = fqdir;
+ return 0;
}
-EXPORT_SYMBOL(inet_frags_exit_net);
+EXPORT_SYMBOL(fqdir_init);
+
+void fqdir_exit(struct fqdir *fqdir)
+{
+ INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
+ queue_work(system_wq, &fqdir->destroy_work);
+}
+EXPORT_SYMBOL(fqdir_exit);
void inet_frag_kill(struct inet_frag_queue *fq)
{
@@ -155,11 +197,23 @@ void inet_frag_kill(struct inet_frag_queue *fq)
refcount_dec(&fq->refcnt);
if (!(fq->flags & INET_FRAG_COMPLETE)) {
- struct netns_frags *nf = fq->net;
+ struct fqdir *fqdir = fq->fqdir;
fq->flags |= INET_FRAG_COMPLETE;
- rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
- refcount_dec(&fq->refcnt);
+ rcu_read_lock();
+ /* The RCU read lock provides a memory barrier
+ * guaranteeing that if fqdir->dead is false then
+ * the hash table destruction will not start until
+ * after we unlock. Paired with inet_frags_exit_net().
+ */
+ if (!fqdir->dead) {
+ rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
+ fqdir->f->rhash_params);
+ refcount_dec(&fq->refcnt);
+ } else {
+ fq->flags |= INET_FRAG_HASH_DEAD;
+ }
+ rcu_read_unlock();
}
}
EXPORT_SYMBOL(inet_frag_kill);
@@ -168,7 +222,7 @@ static void inet_frag_destroy_rcu(struct rcu_head *head)
{
struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
rcu);
- struct inet_frags *f = q->net->f;
+ struct inet_frags *f = q->fqdir->f;
if (f->destructor)
f->destructor(q);
@@ -199,7 +253,7 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge);
void inet_frag_destroy(struct inet_frag_queue *q)
{
- struct netns_frags *nf;
+ struct fqdir *fqdir;
unsigned int sum, sum_truesize = 0;
struct inet_frags *f;
@@ -207,18 +261,18 @@ void inet_frag_destroy(struct inet_frag_queue *q)
WARN_ON(del_timer(&q->timer) != 0);
/* Release all fragment data. */
- nf = q->net;
- f = nf->f;
+ fqdir = q->fqdir;
+ f = fqdir->f;
sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
sum = sum_truesize + f->qsize;
call_rcu(&q->rcu, inet_frag_destroy_rcu);
- sub_frag_mem_limit(nf, sum);
+ sub_frag_mem_limit(fqdir, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);
-static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
+static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
struct inet_frags *f,
void *arg)
{
@@ -228,9 +282,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
if (!q)
return NULL;
- q->net = nf;
+ q->fqdir = fqdir;
f->constructor(q, arg);
- add_frag_mem_limit(nf, f->qsize);
+ add_frag_mem_limit(fqdir, f->qsize);
timer_setup(&q->timer, f->frag_expire, 0);
spin_lock_init(&q->lock);
@@ -239,21 +293,21 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
return q;
}
-static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
+static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
void *arg,
struct inet_frag_queue **prev)
{
- struct inet_frags *f = nf->f;
+ struct inet_frags *f = fqdir->f;
struct inet_frag_queue *q;
- q = inet_frag_alloc(nf, f, arg);
+ q = inet_frag_alloc(fqdir, f, arg);
if (!q) {
*prev = ERR_PTR(-ENOMEM);
return NULL;
}
- mod_timer(&q->timer, jiffies + nf->timeout);
+ mod_timer(&q->timer, jiffies + fqdir->timeout);
- *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
+ *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
&q->node, f->rhash_params);
if (*prev) {
q->flags |= INET_FRAG_COMPLETE;
@@ -265,18 +319,18 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
}
/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
+struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
struct inet_frag_queue *fq = NULL, *prev;
- if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+ if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh)
return NULL;
rcu_read_lock();
- prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
+ prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
if (!prev)
- fq = inet_frag_create(nf, key, &prev);
+ fq = inet_frag_create(fqdir, key, &prev);
if (prev && !IS_ERR(prev)) {
fq = prev;
if (!refcount_inc_not_zero(&fq->refcnt))
@@ -387,7 +441,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
delta += head->truesize;
if (delta)
- add_frag_mem_limit(q->net, delta);
+ add_frag_mem_limit(q->fqdir, delta);
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
@@ -409,7 +463,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
head->truesize += clone->truesize;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
- add_frag_mem_limit(q->net, clone->truesize);
+ add_frag_mem_limit(q->fqdir, clone->truesize);
skb_shinfo(head)->frag_list = clone;
nextp = &clone->next;
} else {
@@ -462,7 +516,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
rbn = rbnext;
}
}
- sub_frag_mem_limit(q->net, head->truesize);
+ sub_frag_mem_limit(q->fqdir, head->truesize);
*nextp = NULL;
skb_mark_not_on_list(head);
@@ -490,7 +544,7 @@ struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
if (head == q->fragments_tail)
q->fragments_tail = NULL;
- sub_frag_mem_limit(q->net, head->truesize);
+ sub_frag_mem_limit(q->fqdir, head->truesize);
return head;
}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c4503073248b..97824864e40d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -316,7 +316,7 @@ struct sock *__inet_lookup_listener(struct net *net,
saddr, sport, htonl(INADDR_ANY), hnum,
dif, sdif);
done:
- if (unlikely(IS_ERR(result)))
+ if (IS_ERR(result))
return NULL;
return result;
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cf2b0a6a3337..4385eb9e781f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -82,15 +82,13 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
- struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
- frags);
- struct net *net = container_of(ipv4, struct net, ipv4);
+ struct net *net = q->fqdir->net;
const struct frag_v4_compare_key *key = a;
q->key.v4 = *key;
qp->ecn = 0;
- qp->peer = q->net->max_dist ?
+ qp->peer = q->fqdir->max_dist ?
inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
NULL;
}
@@ -142,9 +140,13 @@ static void ip_expire(struct timer_list *t)
int err;
qp = container_of(frag, struct ipq, q);
- net = container_of(qp->q.net, struct net, ipv4.frags);
+ net = qp->q.fqdir->net;
rcu_read_lock();
+
+ if (qp->q.fqdir->dead)
+ goto out_rcu_unlock;
+
spin_lock(&qp->q.lock);
if (qp->q.flags & INET_FRAG_COMPLETE)
@@ -211,7 +213,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
};
struct inet_frag_queue *q;
- q = inet_frag_find(&net->ipv4.frags, &key);
+ q = inet_frag_find(net->ipv4.fqdir, &key);
if (!q)
return NULL;
@@ -222,7 +224,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
static int ip_frag_too_far(struct ipq *qp)
{
struct inet_peer *peer = qp->peer;
- unsigned int max = qp->q.net->max_dist;
+ unsigned int max = qp->q.fqdir->max_dist;
unsigned int start, end;
int rc;
@@ -236,12 +238,8 @@ static int ip_frag_too_far(struct ipq *qp)
rc = qp->q.fragments_tail && (end - start) > max;
- if (rc) {
- struct net *net;
-
- net = container_of(qp->q.net, struct net, ipv4.frags);
- __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
- }
+ if (rc)
+ __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS);
return rc;
}
@@ -250,13 +248,13 @@ static int ip_frag_reinit(struct ipq *qp)
{
unsigned int sum_truesize = 0;
- if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
+ if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
refcount_inc(&qp->q.refcnt);
return -ETIMEDOUT;
}
sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
- sub_frag_mem_limit(qp->q.net, sum_truesize);
+ sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
qp->q.flags = 0;
qp->q.len = 0;
@@ -273,7 +271,7 @@ static int ip_frag_reinit(struct ipq *qp)
/* Add new segment to existing queue. */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct net *net = qp->q.fqdir->net;
int ihl, end, flags, offset;
struct sk_buff *prev_tail;
struct net_device *dev;
@@ -352,7 +350,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
- add_frag_mem_limit(qp->q.net, skb->truesize);
+ add_frag_mem_limit(qp->q.fqdir, skb->truesize);
if (offset == 0)
qp->q.flags |= INET_FRAG_FIRST_IN;
@@ -399,7 +397,7 @@ err:
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
struct sk_buff *prev_tail, struct net_device *dev)
{
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct net *net = qp->q.fqdir->net;
struct iphdr *iph;
void *reasm_data;
int len, err;
@@ -544,30 +542,24 @@ static int dist_min;
static struct ctl_table ip4_frags_ns_ctl_table[] = {
{
.procname = "ipfrag_high_thresh",
- .data = &init_net.ipv4.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.ipv4.frags.low_thresh
},
{
.procname = "ipfrag_low_thresh",
- .data = &init_net.ipv4.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.ipv4.frags.high_thresh
},
{
.procname = "ipfrag_time",
- .data = &init_net.ipv4.frags.timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ipfrag_max_dist",
- .data = &init_net.ipv4.frags.max_dist,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -600,13 +592,13 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
if (!table)
goto err_alloc;
- table[0].data = &net->ipv4.frags.high_thresh;
- table[0].extra1 = &net->ipv4.frags.low_thresh;
- table[1].data = &net->ipv4.frags.low_thresh;
- table[1].extra2 = &net->ipv4.frags.high_thresh;
- table[2].data = &net->ipv4.frags.timeout;
- table[3].data = &net->ipv4.frags.max_dist;
}
+ table[0].data = &net->ipv4.fqdir->high_thresh;
+ table[0].extra1 = &net->ipv4.fqdir->low_thresh;
+ table[1].data = &net->ipv4.fqdir->low_thresh;
+ table[1].extra2 = &net->ipv4.fqdir->high_thresh;
+ table[2].data = &net->ipv4.fqdir->timeout;
+ table[3].data = &net->ipv4.fqdir->max_dist;
hdr = register_net_sysctl(net, "net/ipv4", table);
if (!hdr)
@@ -654,6 +646,9 @@ static int __net_init ipv4_frags_init_net(struct net *net)
{
int res;
+ res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
+ if (res < 0)
+ return res;
/* Fragment cache limits.
*
* The fragment memory accounting code, (tries to) account for
@@ -668,36 +663,38 @@ static int __net_init ipv4_frags_init_net(struct net *net)
* we will prune down to 3MB, making room for approx 8 big 64K
* fragments 8x128k.
*/
- net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
- net->ipv4.frags.low_thresh = 3 * 1024 * 1024;
+ net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024;
+ net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024;
/*
* Important NOTE! Fragment queue must be destroyed before MSL expires.
* RFC791 is wrong proposing to prolongate timer each fragment arrival
* by TTL.
*/
- net->ipv4.frags.timeout = IP_FRAG_TIME;
+ net->ipv4.fqdir->timeout = IP_FRAG_TIME;
- net->ipv4.frags.max_dist = 64;
- net->ipv4.frags.f = &ip4_frags;
+ net->ipv4.fqdir->max_dist = 64;
- res = inet_frags_init_net(&net->ipv4.frags);
- if (res < 0)
- return res;
res = ip4_frags_ns_ctl_register(net);
if (res < 0)
- inet_frags_exit_net(&net->ipv4.frags);
+ fqdir_exit(net->ipv4.fqdir);
return res;
}
+static void __net_exit ipv4_frags_pre_exit_net(struct net *net)
+{
+ fqdir_pre_exit(net->ipv4.fqdir);
+}
+
static void __net_exit ipv4_frags_exit_net(struct net *net)
{
ip4_frags_ns_ctl_unregister(net);
- inet_frags_exit_net(&net->ipv4.frags);
+ fqdir_exit(net->ipv4.fqdir);
}
static struct pernet_operations ip4_frags_ops = {
- .init = ipv4_frags_init_net,
- .exit = ipv4_frags_exit_net,
+ .init = ipv4_frags_init_net,
+ .pre_exit = ipv4_frags_pre_exit_net,
+ .exit = ipv4_frags_exit_net,
};
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 3db31bb9df50..ddaa01ec2bce 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -473,6 +473,7 @@ error:
*info = htonl((pp_ptr-iph)<<24);
return -EINVAL;
}
+EXPORT_SYMBOL(__ip_options_compile);
int ip_options_compile(struct net *net,
struct ip_options *opt, struct sk_buff *skb)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8c2ec35b6512..cc7ef0d05bbd 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -287,16 +287,9 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
return ret;
}
-static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
- int ret;
-
- ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
- kfree_skb(skb);
- return ret;
- }
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
@@ -315,14 +308,37 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
return ip_finish_output2(net, sk, skb);
}
+static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ return __ip_finish_output(net, sk, skb);
+ case NET_XMIT_CN:
+ return __ip_finish_output(net, sk, skb) ? : ret;
+ default:
+ kfree_skb(skb);
+ return ret;
+ }
+}
+
static int ip_mc_finish_output(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
struct rtable *new_rt;
- int ret;
+ bool do_cn = false;
+ int ret, err;
ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
+ switch (ret) {
+ case NET_XMIT_CN:
+ do_cn = true;
+ /* fall through */
+ case NET_XMIT_SUCCESS:
+ break;
+ default:
kfree_skb(skb);
return ret;
}
@@ -338,7 +354,8 @@ static int ip_mc_finish_output(struct net *net, struct sock *sk,
skb_dst_set(skb, &new_rt->dst);
}
- return dev_loopback_xmit(net, sk, skb);
+ err = dev_loopback_xmit(net, sk, skb);
+ return (do_cn && err) ? ret : err;
}
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -537,9 +554,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_hash(to, from);
- /* Copy the flags to each fragment. */
- IPCB(to)->flags = IPCB(from)->flags;
-
#ifdef CONFIG_NET_SCHED
to->tc_index = from->tc_index;
#endif
@@ -573,6 +587,175 @@ static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
return ip_do_fragment(net, sk, skb, output);
}
+void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int hlen, struct ip_fraglist_iter *iter)
+{
+ unsigned int first_len = skb_pagelen(skb);
+
+ iter->frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+
+ iter->offset = 0;
+ iter->iph = iph;
+ iter->hlen = hlen;
+
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ iph->tot_len = htons(first_len);
+ iph->frag_off = htons(IP_MF);
+ ip_send_check(iph);
+}
+EXPORT_SYMBOL(ip_fraglist_init);
+
+static void ip_fraglist_ipcb_prepare(struct sk_buff *skb,
+ struct ip_fraglist_iter *iter)
+{
+ struct sk_buff *to = iter->frag;
+
+ /* Copy the flags to each fragment. */
+ IPCB(to)->flags = IPCB(skb)->flags;
+
+ if (iter->offset == 0)
+ ip_options_fragment(to);
+}
+
+void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
+{
+ unsigned int hlen = iter->hlen;
+ struct iphdr *iph = iter->iph;
+ struct sk_buff *frag;
+
+ frag = iter->frag;
+ frag->ip_summed = CHECKSUM_NONE;
+ skb_reset_transport_header(frag);
+ __skb_push(frag, hlen);
+ skb_reset_network_header(frag);
+ memcpy(skb_network_header(frag), iph, hlen);
+ iter->iph = ip_hdr(frag);
+ iph = iter->iph;
+ iph->tot_len = htons(frag->len);
+ ip_copy_metadata(frag, skb);
+ iter->offset += skb->len - hlen;
+ iph->frag_off = htons(iter->offset >> 3);
+ if (frag->next)
+ iph->frag_off |= htons(IP_MF);
+ /* Ready, complete checksum */
+ ip_send_check(iph);
+}
+EXPORT_SYMBOL(ip_fraglist_prepare);
+
+void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
+ unsigned int ll_rs, unsigned int mtu,
+ struct ip_frag_state *state)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ state->hlen = hlen;
+ state->ll_rs = ll_rs;
+ state->mtu = mtu;
+
+ state->left = skb->len - hlen; /* Space per frame */
+ state->ptr = hlen; /* Where to start from */
+
+ state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+ state->not_last_frag = iph->frag_off & htons(IP_MF);
+}
+EXPORT_SYMBOL(ip_frag_init);
+
+static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
+ bool first_frag, struct ip_frag_state *state)
+{
+ /* Copy the flags to each fragment. */
+ IPCB(to)->flags = IPCB(from)->flags;
+
+ if (IPCB(from)->flags & IPSKB_FRAG_PMTU)
+ state->iph->frag_off |= htons(IP_DF);
+
+ /* ANK: dirty, but effective trick. Upgrade options only if
+ * the segment to be fragmented was THE FIRST (otherwise,
+ * options are already fixed) and make it ONCE
+ * on the initial skb, so that all the following fragments
+ * will inherit fixed options.
+ */
+ if (first_frag)
+ ip_options_fragment(from);
+}
+
+struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
+{
+ unsigned int len = state->left;
+ struct sk_buff *skb2;
+ struct iphdr *iph;
+
+ len = state->left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > state->mtu)
+ len = state->mtu;
+ /* IF: we are not sending up to and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < state->left) {
+ len &= ~7;
+ }
+
+ /* Allocate buffer */
+ skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
+ if (!skb2)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Set up data on packet
+ */
+
+ ip_copy_metadata(skb2, skb);
+ skb_reserve(skb2, state->ll_rs);
+ skb_put(skb2, len + state->hlen);
+ skb_reset_network_header(skb2);
+ skb2->transport_header = skb2->network_header + state->hlen;
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+
+ skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
+ BUG();
+ state->left -= len;
+
+ /*
+ * Fill in the new header fields.
+ */
+ iph = ip_hdr(skb2);
+ iph->frag_off = htons((state->offset >> 3));
+
+ /*
+ * Added AC : If we are fragmenting a fragment that's not the
+ * last fragment then keep MF on each bit
+ */
+ if (state->left > 0 || state->not_last_frag)
+ iph->frag_off |= htons(IP_MF);
+ state->ptr += len;
+ state->offset += len;
+
+ iph->tot_len = htons(len + state->hlen);
+
+ ip_send_check(iph);
+
+ return skb2;
+}
+EXPORT_SYMBOL(ip_frag_next);
+
/*
* This IP datagram is too large to be sent in one piece. Break it up into
* smaller pieces (each of size equal to IP header plus
@@ -584,12 +767,11 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph;
- int ptr;
struct sk_buff *skb2;
- unsigned int mtu, hlen, left, len, ll_rs;
- int offset;
- __be16 not_last_frag;
struct rtable *rt = skb_rtable(skb);
+ unsigned int mtu, hlen, ll_rs;
+ struct ip_fraglist_iter iter;
+ struct ip_frag_state state;
int err = 0;
/* for offloaded checksums cleanup checksum before fragmentation */
@@ -654,49 +836,24 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
}
/* Everything is OK. Generate! */
-
- err = 0;
- offset = 0;
- frag = skb_shinfo(skb)->frag_list;
- skb_frag_list_init(skb);
- skb->data_len = first_len - skb_headlen(skb);
- skb->len = first_len;
- iph->tot_len = htons(first_len);
- iph->frag_off = htons(IP_MF);
- ip_send_check(iph);
+ ip_fraglist_init(skb, iph, hlen, &iter);
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
- if (frag) {
- frag->ip_summed = CHECKSUM_NONE;
- skb_reset_transport_header(frag);
- __skb_push(frag, hlen);
- skb_reset_network_header(frag);
- memcpy(skb_network_header(frag), iph, hlen);
- iph = ip_hdr(frag);
- iph->tot_len = htons(frag->len);
- ip_copy_metadata(frag, skb);
- if (offset == 0)
- ip_options_fragment(frag);
- offset += skb->len - hlen;
- iph->frag_off = htons(offset>>3);
- if (frag->next)
- iph->frag_off |= htons(IP_MF);
- /* Ready, complete checksum */
- ip_send_check(iph);
+ if (iter.frag) {
+ ip_fraglist_ipcb_prepare(skb, &iter);
+ ip_fraglist_prepare(skb, &iter);
}
err = output(net, sk, skb);
if (!err)
IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
- if (err || !frag)
+ if (err || !iter.frag)
break;
- skb = frag;
- frag = skb->next;
- skb_mark_not_on_list(skb);
+ skb = ip_fraglist_next(&iter);
}
if (err == 0) {
@@ -704,7 +861,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
return 0;
}
- kfree_skb_list(frag);
+ kfree_skb_list(iter.frag);
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
@@ -720,105 +877,29 @@ slow_path_clean:
}
slow_path:
- iph = ip_hdr(skb);
-
- left = skb->len - hlen; /* Space per frame */
- ptr = hlen; /* Where to start from */
-
/*
* Fragment the datagram.
*/
- offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
- not_last_frag = iph->frag_off & htons(IP_MF);
+ ip_frag_init(skb, hlen, ll_rs, mtu, &state);
/*
* Keep copying data until we run out.
*/
- while (left > 0) {
- len = left;
- /* IF: it doesn't fit, use 'mtu' - the data space left */
- if (len > mtu)
- len = mtu;
- /* IF: we are not sending up to and including the packet end
- then align the next start on an eight byte boundary */
- if (len < left) {
- len &= ~7;
- }
+ while (state.left > 0) {
+ bool first_frag = (state.offset == 0);
- /* Allocate buffer */
- skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
- if (!skb2) {
- err = -ENOMEM;
+ skb2 = ip_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
goto fail;
}
-
- /*
- * Set up data on packet
- */
-
- ip_copy_metadata(skb2, skb);
- skb_reserve(skb2, ll_rs);
- skb_put(skb2, len + hlen);
- skb_reset_network_header(skb2);
- skb2->transport_header = skb2->network_header + hlen;
-
- /*
- * Charge the memory for the fragment to any owner
- * it might possess
- */
-
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
-
- /*
- * Copy the packet header into the new buffer.
- */
-
- skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
-
- /*
- * Copy a block of the IP datagram.
- */
- if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
- BUG();
- left -= len;
-
- /*
- * Fill in the new header fields.
- */
- iph = ip_hdr(skb2);
- iph->frag_off = htons((offset >> 3));
-
- if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
- iph->frag_off |= htons(IP_DF);
-
- /* ANK: dirty, but effective trick. Upgrade options only if
- * the segment to be fragmented was THE FIRST (otherwise,
- * options are already fixed) and make it ONCE
- * on the initial skb, so that all the following fragments
- * will inherit fixed options.
- */
- if (offset == 0)
- ip_options_fragment(skb);
-
- /*
- * Added AC : If we are fragmenting a fragment that's not the
- * last fragment then keep MF on each bit
- */
- if (left > 0 || not_last_frag)
- iph->frag_off |= htons(IP_MF);
- ptr += len;
- offset += len;
+ ip_frag_ipcb(skb, skb2, first_frag, &state);
/*
* Put this fragment into the sending queue.
*/
- iph->tot_len = htons(len + hlen);
-
- ip_send_check(iph);
-
err = output(net, sk, skb2);
if (err)
goto fail;
@@ -1568,7 +1649,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
const struct ip_options *sopt,
__be32 daddr, __be32 saddr,
const struct ip_reply_arg *arg,
- unsigned int len)
+ unsigned int len, u64 transmit_time)
{
struct ip_options_data replyopts;
struct ipcm_cookie ipc;
@@ -1584,6 +1665,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
ipcm_init(&ipc);
ipc.addr = daddr;
+ ipc.sockc.transmit_time = transmit_time;
if (replyopts.opt.opt.optlen) {
ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 2f4cdcc13d53..59bfa3825810 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -186,8 +186,7 @@ static void __exit ipcomp4_fini(void)
{
if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&ipcomp_type, AF_INET);
}
module_init(ipcomp4_init);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index 87ca2c42359b..a4e07e5e9c11 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -17,7 +17,7 @@ target(struct sk_buff *skb, const struct xt_action_param *par)
unsigned char *arpptr;
int pln, hln;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return NF_DROP;
arp = arp_hdr(skb);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index 5f116c3749b4..5930d3b02555 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -29,7 +29,7 @@ set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
__u8 oldtos;
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return false;
iph = ip_hdr(skb);
oldtos = iph->tos;
@@ -58,7 +58,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
tcph->cwr == einfo->proto.tcp.cwr))
return true;
- if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
+ if (skb_ensure_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
return false;
tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb);
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 64d9563c0218..8e7f84ec783d 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -3,258 +3,11 @@
* Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
*/
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/tcp.h>
-
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_SYNPROXY.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_synproxy.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-
-static struct iphdr *
-synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
- __be32 daddr)
-{
- struct iphdr *iph;
-
- skb_reset_network_header(skb);
- iph = skb_put(skb, sizeof(*iph));
- iph->version = 4;
- iph->ihl = sizeof(*iph) / 4;
- iph->tos = 0;
- iph->id = 0;
- iph->frag_off = htons(IP_DF);
- iph->ttl = net->ipv4.sysctl_ip_default_ttl;
- iph->protocol = IPPROTO_TCP;
- iph->check = 0;
- iph->saddr = saddr;
- iph->daddr = daddr;
-
- return iph;
-}
-
-static void
-synproxy_send_tcp(struct net *net,
- const struct sk_buff *skb, struct sk_buff *nskb,
- struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
- struct iphdr *niph, struct tcphdr *nth,
- unsigned int tcp_hdr_size)
-{
- nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
- nskb->ip_summed = CHECKSUM_PARTIAL;
- nskb->csum_start = (unsigned char *)nth - nskb->head;
- nskb->csum_offset = offsetof(struct tcphdr, check);
-
- skb_dst_set_noref(nskb, skb_dst(skb));
- nskb->protocol = htons(ETH_P_IP);
- if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
- goto free_nskb;
-
- if (nfct) {
- nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
- nf_conntrack_get(nfct);
- }
-
- ip_local_out(net, nskb->sk, nskb);
- return;
-
-free_nskb:
- kfree_skb(nskb);
-}
-
-static void
-synproxy_send_client_synack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct iphdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
- u16 mss = opts->mss;
-
- iph = ip_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE;
- nth->doff = tcp_hdr_size / 4;
- nth->window = 0;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_syn(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- struct sk_buff *nskb;
- struct iphdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ip_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(recv_seq - 1);
- /* ack_seq is used to relay our ISN to the synproxy hook to initialize
- * sequence number translation once a connection tracking entry exists.
- */
- nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
- nth->doff = tcp_hdr_size / 4;
- nth->window = th->window;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
- niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_ack(struct net *net,
- const struct ip_ct_tcp *state,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct iphdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ip_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(ntohl(th->ack_seq));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct iphdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ip_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(ntohl(th->seq) + 1);
- nth->ack_seq = th->ack_seq;
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(ntohs(th->window) >> opts->wscale);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static bool
-synproxy_recv_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- int mss;
-
- mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
- if (mss == 0) {
- this_cpu_inc(snet->stats->cookie_invalid);
- return false;
- }
-
- this_cpu_inc(snet->stats->cookie_valid);
- opts->mss = mss;
- opts->options |= XT_SYNPROXY_OPT_MSS;
-
- if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy_check_timestamp_cookie(opts);
-
- synproxy_send_server_syn(net, skb, th, opts, recv_seq);
- return true;
-}
+#include <net/netfilter/nf_synproxy.h>
static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
@@ -306,135 +59,6 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv4_synproxy_hook(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *nhs)
-{
- struct net *net = nhs->net;
- struct synproxy_net *snet = synproxy_pernet(net);
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
- struct nf_conn_synproxy *synproxy;
- struct synproxy_options opts = {};
- const struct ip_ct_tcp *state;
- struct tcphdr *th, _th;
- unsigned int thoff;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL)
- return NF_ACCEPT;
-
- synproxy = nfct_synproxy(ct);
- if (synproxy == NULL)
- return NF_ACCEPT;
-
- if (nf_is_loopback_packet(skb) ||
- ip_hdr(skb)->protocol != IPPROTO_TCP)
- return NF_ACCEPT;
-
- thoff = ip_hdrlen(skb);
- th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
- if (th == NULL)
- return NF_DROP;
-
- state = &ct->proto.tcp;
- switch (state->state) {
- case TCP_CONNTRACK_CLOSE:
- if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
- ntohl(th->seq) + 1);
- break;
- }
-
- if (!th->syn || th->ack ||
- CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- break;
-
- /* Reopened connection - reset the sequence number and timestamp
- * adjustments, they will get initialized once the connection is
- * reestablished.
- */
- nf_ct_seqadj_init(ct, ctinfo, 0);
- synproxy->tsoff = 0;
- this_cpu_inc(snet->stats->conn_reopened);
-
- /* fall through */
- case TCP_CONNTRACK_SYN_SENT:
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (!th->syn && th->ack &&
- CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
- /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
- * therefore we need to add 1 to make the SYN sequence
- * number match the one of first SYN.
- */
- if (synproxy_recv_client_ack(net, skb, th, &opts,
- ntohl(th->seq) + 1)) {
- this_cpu_inc(snet->stats->cookie_retrans);
- consume_skb(skb);
- return NF_STOLEN;
- } else {
- return NF_DROP;
- }
- }
-
- synproxy->isn = ntohl(th->ack_seq);
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy->its = opts.tsecr;
-
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- break;
- case TCP_CONNTRACK_SYN_RECV:
- if (!th->syn || !th->ack)
- break;
-
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) {
- synproxy->tsoff = opts.tsval - synproxy->its;
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- }
-
- opts.options &= ~(XT_SYNPROXY_OPT_MSS |
- XT_SYNPROXY_OPT_WSCALE |
- XT_SYNPROXY_OPT_SACK_PERM);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_server_ack(net, state, skb, th, &opts);
-
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
- nf_conntrack_event_cache(IPCT_SEQADJ, ct);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_client_ack(net, skb, th, &opts);
-
- consume_skb(skb);
- return NF_STOLEN;
- default:
- break;
- }
-
- synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
- return NF_ACCEPT;
-}
-
-static const struct nf_hook_ops ipv4_synproxy_ops[] = {
- {
- .hook = ipv4_synproxy_hook,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
- {
- .hook = ipv4_synproxy_hook,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
-};
-
static int synproxy_tg4_check(const struct xt_tgchk_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
@@ -449,16 +73,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par)
if (err)
return err;
- if (snet->hook_ref4 == 0) {
- err = nf_register_net_hooks(par->net, ipv4_synproxy_ops,
- ARRAY_SIZE(ipv4_synproxy_ops));
- if (err) {
- nf_ct_netns_put(par->net, par->family);
- return err;
- }
+ err = nf_synproxy_ipv4_init(snet, par->net);
+ if (err) {
+ nf_ct_netns_put(par->net, par->family);
+ return err;
}
- snet->hook_ref4++;
return err;
}
@@ -466,10 +86,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
- snet->hook_ref4--;
- if (snet->hook_ref4 == 0)
- nf_unregister_net_hooks(par->net, ipv4_synproxy_ops,
- ARRAY_SIZE(ipv4_synproxy_ops));
+ nf_synproxy_ipv4_fini(snet, par->net);
nf_ct_netns_put(par->net, par->family);
}
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 6eefde5bc468..69697eb4bfc6 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -2,7 +2,7 @@
/*
* 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT .
*
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index dfea10f13878..87b711fd5a44 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -6,7 +6,7 @@
* Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* Based on the 'brute force' H.323 NAT module by
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Jozsef Kadlecsik <kadlec@netfilter.org>
*/
#include <linux/module.h>
@@ -58,7 +58,7 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff,
net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
return -1;
}
- /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
+ /* nf_nat_mangle_udp_packet uses skb_ensure_writable() to copy
* or pull everything in a linear buffer, so we can safely
* use the skb pointers now */
*data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr);
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
index 657d2dcec3cc..717b726504fe 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
@@ -186,7 +186,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
return NF_DROP;
}
- if (!skb_make_writable(skb, skb->len)) {
+ if (skb_ensure_writable(skb, skb->len)) {
nf_ct_helper_log(skb, ct, "cannot mangle packet");
return NF_DROP;
}
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index b6dd39636bea..b2bae0b0e42a 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -49,6 +49,7 @@ EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4);
__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
{
+ const struct in_ifaddr *ifa;
struct in_device *indev;
__be32 laddr;
@@ -57,10 +58,14 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
laddr = 0;
indev = __in_dev_get_rcu(skb->dev);
- for_primary_ifa(indev) {
+
+ in_dev_for_each_ifa_rcu(ifa, indev) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY)
+ continue;
+
laddr = ifa->ifa_local;
break;
- } endfor_ifa(indev);
+ }
return laddr ? laddr : daddr;
}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
new file mode 100644
index 000000000000..5fe5a3981d43
--- /dev/null
+++ b/net/ipv4/nexthop.c
@@ -0,0 +1,1828 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Generic nexthop implementation
+ *
+ * Copyright (c) 2017-19 Cumulus Networks
+ * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
+ */
+
+#include <linux/nexthop.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/arp.h>
+#include <net/ipv6_stubs.h>
+#include <net/lwtunnel.h>
+#include <net/ndisc.h>
+#include <net/nexthop.h>
+#include <net/route.h>
+#include <net/sock.h>
+
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo);
+
+#define NH_DEV_HASHBITS 8
+#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
+
+static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
+ [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 },
+ [NHA_ID] = { .type = NLA_U32 },
+ [NHA_GROUP] = { .type = NLA_BINARY },
+ [NHA_GROUP_TYPE] = { .type = NLA_U16 },
+ [NHA_BLACKHOLE] = { .type = NLA_FLAG },
+ [NHA_OIF] = { .type = NLA_U32 },
+ [NHA_GATEWAY] = { .type = NLA_BINARY },
+ [NHA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [NHA_ENCAP] = { .type = NLA_NESTED },
+ [NHA_GROUPS] = { .type = NLA_FLAG },
+ [NHA_MASTER] = { .type = NLA_U32 },
+};
+
+static unsigned int nh_dev_hashfn(unsigned int val)
+{
+ unsigned int mask = NH_DEV_HASHSIZE - 1;
+
+ return (val ^
+ (val >> NH_DEV_HASHBITS) ^
+ (val >> (NH_DEV_HASHBITS * 2))) & mask;
+}
+
+static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
+{
+ struct net_device *dev = nhi->fib_nhc.nhc_dev;
+ struct hlist_head *head;
+ unsigned int hash;
+
+ WARN_ON(!dev);
+
+ hash = nh_dev_hashfn(dev->ifindex);
+ head = &net->nexthop.devhash[hash];
+ hlist_add_head(&nhi->dev_hash, head);
+}
+
+static void nexthop_free_mpath(struct nexthop *nh)
+{
+ struct nh_group *nhg;
+ int i;
+
+ nhg = rcu_dereference_raw(nh->nh_grp);
+ for (i = 0; i < nhg->num_nh; ++i)
+ WARN_ON(nhg->nh_entries[i].nh);
+
+ kfree(nhg);
+}
+
+static void nexthop_free_single(struct nexthop *nh)
+{
+ struct nh_info *nhi;
+
+ nhi = rcu_dereference_raw(nh->nh_info);
+ switch (nhi->family) {
+ case AF_INET:
+ fib_nh_release(nh->net, &nhi->fib_nh);
+ break;
+ case AF_INET6:
+ ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
+ break;
+ }
+ kfree(nhi);
+}
+
+void nexthop_free_rcu(struct rcu_head *head)
+{
+ struct nexthop *nh = container_of(head, struct nexthop, rcu);
+
+ if (nh->is_group)
+ nexthop_free_mpath(nh);
+ else
+ nexthop_free_single(nh);
+
+ kfree(nh);
+}
+EXPORT_SYMBOL_GPL(nexthop_free_rcu);
+
+static struct nexthop *nexthop_alloc(void)
+{
+ struct nexthop *nh;
+
+ nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
+ if (nh) {
+ INIT_LIST_HEAD(&nh->fi_list);
+ INIT_LIST_HEAD(&nh->f6i_list);
+ INIT_LIST_HEAD(&nh->grp_list);
+ }
+ return nh;
+}
+
+static struct nh_group *nexthop_grp_alloc(u16 num_nh)
+{
+ size_t sz = offsetof(struct nexthop, nh_grp)
+ + sizeof(struct nh_group)
+ + sizeof(struct nh_grp_entry) * num_nh;
+ struct nh_group *nhg;
+
+ nhg = kzalloc(sz, GFP_KERNEL);
+ if (nhg)
+ nhg->num_nh = num_nh;
+
+ return nhg;
+}
+
+static void nh_base_seq_inc(struct net *net)
+{
+ while (++net->nexthop.seq == 0)
+ ;
+}
+
+/* no reference taken; rcu lock or rtnl must be held */
+struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
+{
+ struct rb_node **pp, *parent = NULL, *next;
+
+ pp = &net->nexthop.rb_root.rb_node;
+ while (1) {
+ struct nexthop *nh;
+
+ next = rcu_dereference_raw(*pp);
+ if (!next)
+ break;
+ parent = next;
+
+ nh = rb_entry(parent, struct nexthop, rb_node);
+ if (id < nh->id)
+ pp = &next->rb_left;
+ else if (id > nh->id)
+ pp = &next->rb_right;
+ else
+ return nh;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_find_by_id);
+
+/* used for auto id allocation; called with rtnl held */
+static u32 nh_find_unused_id(struct net *net)
+{
+ u32 id_start = net->nexthop.last_id_allocated;
+
+ while (1) {
+ net->nexthop.last_id_allocated++;
+ if (net->nexthop.last_id_allocated == id_start)
+ break;
+
+ if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
+ return net->nexthop.last_id_allocated;
+ }
+ return 0;
+}
+
+static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
+{
+ struct nexthop_grp *p;
+ size_t len = nhg->num_nh * sizeof(*p);
+ struct nlattr *nla;
+ u16 group_type = 0;
+ int i;
+
+ if (nhg->mpath)
+ group_type = NEXTHOP_GRP_TYPE_MPATH;
+
+ if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
+ goto nla_put_failure;
+
+ nla = nla_reserve(skb, NHA_GROUP, len);
+ if (!nla)
+ goto nla_put_failure;
+
+ p = nla_data(nla);
+ for (i = 0; i < nhg->num_nh; ++i) {
+ p->id = nhg->nh_entries[i].nh->id;
+ p->weight = nhg->nh_entries[i].weight - 1;
+ p += 1;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
+ int event, u32 portid, u32 seq, unsigned int nlflags)
+{
+ struct fib6_nh *fib6_nh;
+ struct fib_nh *fib_nh;
+ struct nlmsghdr *nlh;
+ struct nh_info *nhi;
+ struct nhmsg *nhm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ nhm = nlmsg_data(nlh);
+ nhm->nh_family = AF_UNSPEC;
+ nhm->nh_flags = nh->nh_flags;
+ nhm->nh_protocol = nh->protocol;
+ nhm->nh_scope = 0;
+ nhm->resvd = 0;
+
+ if (nla_put_u32(skb, NHA_ID, nh->id))
+ goto nla_put_failure;
+
+ if (nh->is_group) {
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+ if (nla_put_nh_group(skb, nhg))
+ goto nla_put_failure;
+ goto out;
+ }
+
+ nhi = rtnl_dereference(nh->nh_info);
+ nhm->nh_family = nhi->family;
+ if (nhi->reject_nh) {
+ if (nla_put_flag(skb, NHA_BLACKHOLE))
+ goto nla_put_failure;
+ goto out;
+ } else {
+ const struct net_device *dev;
+
+ dev = nhi->fib_nhc.nhc_dev;
+ if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
+ goto nla_put_failure;
+ }
+
+ nhm->nh_scope = nhi->fib_nhc.nhc_scope;
+ switch (nhi->family) {
+ case AF_INET:
+ fib_nh = &nhi->fib_nh;
+ if (fib_nh->fib_nh_gw_family &&
+ nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
+ goto nla_put_failure;
+ break;
+
+ case AF_INET6:
+ fib6_nh = &nhi->fib6_nh;
+ if (fib6_nh->fib_nh_gw_family &&
+ nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
+ goto nla_put_failure;
+ break;
+ }
+
+ if (nhi->fib_nhc.nhc_lwtstate &&
+ lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
+ NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
+
+out:
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static size_t nh_nlmsg_size_grp(struct nexthop *nh)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+ size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
+
+ return nla_total_size(sz) +
+ nla_total_size(2); /* NHA_GROUP_TYPE */
+}
+
+static size_t nh_nlmsg_size_single(struct nexthop *nh)
+{
+ struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+ size_t sz;
+
+ /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
+ * are mutually exclusive
+ */
+ sz = nla_total_size(4); /* NHA_OIF */
+
+ switch (nhi->family) {
+ case AF_INET:
+ if (nhi->fib_nh.fib_nh_gw_family)
+ sz += nla_total_size(4); /* NHA_GATEWAY */
+ break;
+
+ case AF_INET6:
+ /* NHA_GATEWAY */
+ if (nhi->fib6_nh.fib_nh_gw_family)
+ sz += nla_total_size(sizeof(const struct in6_addr));
+ break;
+ }
+
+ if (nhi->fib_nhc.nhc_lwtstate) {
+ sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
+ sz += nla_total_size(2); /* NHA_ENCAP_TYPE */
+ }
+
+ return sz;
+}
+
+static size_t nh_nlmsg_size(struct nexthop *nh)
+{
+ size_t sz = nla_total_size(4); /* NHA_ID */
+
+ if (nh->is_group)
+ sz += nh_nlmsg_size_grp(nh);
+ else
+ sz += nh_nlmsg_size_single(nh);
+
+ return sz;
+}
+
+static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
+{
+ unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
+ if (!skb)
+ goto errout;
+
+ err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in nh_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
+ info->nlh, gfp_any());
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
+}
+
+static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
+ struct netlink_ext_ack *extack)
+{
+ if (nh->is_group) {
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+ /* nested multipath (group within a group) is not
+ * supported
+ */
+ if (nhg->mpath) {
+ NL_SET_ERR_MSG(extack,
+ "Multipath group can not be a nexthop within a group");
+ return false;
+ }
+ } else {
+ struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+ if (nhi->reject_nh && npaths > 1) {
+ NL_SET_ERR_MSG(extack,
+ "Blackhole nexthop can not be used in a group with more than 1 path");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ unsigned int len = nla_len(tb[NHA_GROUP]);
+ struct nexthop_grp *nhg;
+ unsigned int i, j;
+
+ if (len & (sizeof(struct nexthop_grp) - 1)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid length for nexthop group attribute");
+ return -EINVAL;
+ }
+
+ /* convert len to number of nexthop ids */
+ len /= sizeof(*nhg);
+
+ nhg = nla_data(tb[NHA_GROUP]);
+ for (i = 0; i < len; ++i) {
+ if (nhg[i].resvd1 || nhg[i].resvd2) {
+ NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
+ return -EINVAL;
+ }
+ if (nhg[i].weight > 254) {
+ NL_SET_ERR_MSG(extack, "Invalid value for weight");
+ return -EINVAL;
+ }
+ for (j = i + 1; j < len; ++j) {
+ if (nhg[i].id == nhg[j].id) {
+ NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
+ return -EINVAL;
+ }
+ }
+ }
+
+ nhg = nla_data(tb[NHA_GROUP]);
+ for (i = 0; i < len; ++i) {
+ struct nexthop *nh;
+
+ nh = nexthop_find_by_id(net, nhg[i].id);
+ if (!nh) {
+ NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+ return -EINVAL;
+ }
+ if (!valid_group_nh(nh, len, extack))
+ return -EINVAL;
+ }
+ for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ NL_SET_ERR_MSG(extack,
+ "No other attributes can be set in nexthop groups");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool ipv6_good_nh(const struct fib6_nh *nh)
+{
+ int state = NUD_REACHABLE;
+ struct neighbour *n;
+
+ rcu_read_lock_bh();
+
+ n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
+ if (n)
+ state = n->nud_state;
+
+ rcu_read_unlock_bh();
+
+ return !!(state & NUD_VALID);
+}
+
+static bool ipv4_good_nh(const struct fib_nh *nh)
+{
+ int state = NUD_REACHABLE;
+ struct neighbour *n;
+
+ rcu_read_lock_bh();
+
+ n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
+ (__force u32)nh->fib_nh_gw4);
+ if (n)
+ state = n->nud_state;
+
+ rcu_read_unlock_bh();
+
+ return !!(state & NUD_VALID);
+}
+
+struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
+{
+ struct nexthop *rc = NULL;
+ struct nh_group *nhg;
+ int i;
+
+ if (!nh->is_group)
+ return nh;
+
+ nhg = rcu_dereference(nh->nh_grp);
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ struct nh_info *nhi;
+
+ if (hash > atomic_read(&nhge->upper_bound))
+ continue;
+
+ /* nexthops always check if it is good and does
+ * not rely on a sysctl for this behavior
+ */
+ nhi = rcu_dereference(nhge->nh->nh_info);
+ switch (nhi->family) {
+ case AF_INET:
+ if (ipv4_good_nh(&nhi->fib_nh))
+ return nhge->nh;
+ break;
+ case AF_INET6:
+ if (ipv6_good_nh(&nhi->fib6_nh))
+ return nhge->nh;
+ break;
+ }
+
+ if (!rc)
+ rc = nhge->nh;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(nexthop_select_path);
+
+int nexthop_for_each_fib6_nh(struct nexthop *nh,
+ int (*cb)(struct fib6_nh *nh, void *arg),
+ void *arg)
+{
+ struct nh_info *nhi;
+ int err;
+
+ if (nh->is_group) {
+ struct nh_group *nhg;
+ int i;
+
+ nhg = rcu_dereference_rtnl(nh->nh_grp);
+ for (i = 0; i < nhg->num_nh; i++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
+ err = cb(&nhi->fib6_nh, arg);
+ if (err)
+ return err;
+ }
+ } else {
+ nhi = rcu_dereference_rtnl(nh->nh_info);
+ err = cb(&nhi->fib6_nh, arg);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);
+
+static int check_src_addr(const struct in6_addr *saddr,
+ struct netlink_ext_ack *extack)
+{
+ if (!ipv6_addr_any(saddr)) {
+ NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *nhi;
+
+ /* fib6_src is unique to a fib6_info and limits the ability to cache
+ * routes in fib6_nh within a nexthop that is potentially shared
+ * across multiple fib entries. If the config wants to use source
+ * routing it can not use nexthop objects. mlxsw also does not allow
+ * fib6_src on routes.
+ */
+ if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
+ return -EINVAL;
+
+ if (nh->is_group) {
+ struct nh_group *nhg;
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ if (nhg->has_v4)
+ goto no_v4_nh;
+ } else {
+ nhi = rtnl_dereference(nh->nh_info);
+ if (nhi->family == AF_INET)
+ goto no_v4_nh;
+ }
+
+ return 0;
+no_v4_nh:
+ NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(fib6_check_nexthop);
+
+/* if existing nexthop has ipv6 routes linked to it, need
+ * to verify this new spec works with ipv6
+ */
+static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_info *f6i;
+
+ if (list_empty(&old->f6i_list))
+ return 0;
+
+ list_for_each_entry(f6i, &old->f6i_list, nh_list) {
+ if (check_src_addr(&f6i->fib6_src.addr, extack) < 0)
+ return -EINVAL;
+ }
+
+ return fib6_check_nexthop(new, NULL, extack);
+}
+
+static int nexthop_check_scope(struct nexthop *nh, u8 scope,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nh->nh_info);
+ if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
+ NL_SET_ERR_MSG(extack,
+ "Route with host scope can not have a gateway");
+ return -EINVAL;
+ }
+
+ if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
+ NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Invoked by fib add code to verify nexthop by id is ok with
+ * config for prefix; parts of fib_check_nh not done when nexthop
+ * object is used.
+ */
+int fib_check_nexthop(struct nexthop *nh, u8 scope,
+ struct netlink_ext_ack *extack)
+{
+ int err = 0;
+
+ if (nh->is_group) {
+ struct nh_group *nhg;
+
+ if (scope == RT_SCOPE_HOST) {
+ NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
+ err = -EINVAL;
+ goto out;
+ }
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ /* all nexthops in a group have the same scope */
+ err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack);
+ } else {
+ err = nexthop_check_scope(nh, scope, extack);
+ }
+out:
+ return err;
+}
+
+static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_info *fi;
+
+ list_for_each_entry(fi, &old->fi_list, nh_list) {
+ int err;
+
+ err = fib_check_nexthop(new, fi->fib_scope, extack);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static void nh_group_rebalance(struct nh_group *nhg)
+{
+ int total = 0;
+ int w = 0;
+ int i;
+
+ for (i = 0; i < nhg->num_nh; ++i)
+ total += nhg->nh_entries[i].weight;
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ int upper_bound;
+
+ w += nhge->weight;
+ upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
+ atomic_set(&nhge->upper_bound, upper_bound);
+ }
+}
+
+static void remove_nh_grp_entry(struct nh_grp_entry *nhge,
+ struct nh_group *nhg,
+ struct nl_info *nlinfo)
+{
+ struct nexthop *nh = nhge->nh;
+ struct nh_grp_entry *nhges;
+ bool found = false;
+ int i;
+
+ WARN_ON(!nh);
+
+ nhges = nhg->nh_entries;
+ for (i = 0; i < nhg->num_nh; ++i) {
+ if (found) {
+ nhges[i-1].nh = nhges[i].nh;
+ nhges[i-1].weight = nhges[i].weight;
+ list_del(&nhges[i].nh_list);
+ list_add(&nhges[i-1].nh_list, &nhges[i-1].nh->grp_list);
+ } else if (nhg->nh_entries[i].nh == nh) {
+ found = true;
+ }
+ }
+
+ if (WARN_ON(!found))
+ return;
+
+ nhg->num_nh--;
+ nhg->nh_entries[nhg->num_nh].nh = NULL;
+
+ nh_group_rebalance(nhg);
+
+ nexthop_put(nh);
+
+ if (nlinfo)
+ nexthop_notify(RTM_NEWNEXTHOP, nhge->nh_parent, nlinfo);
+}
+
+static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo)
+{
+ struct nh_grp_entry *nhge, *tmp;
+
+ list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) {
+ struct nh_group *nhg;
+
+ list_del(&nhge->nh_list);
+ nhg = rtnl_dereference(nhge->nh_parent->nh_grp);
+ remove_nh_grp_entry(nhge, nhg, nlinfo);
+
+ /* if this group has no more entries then remove it */
+ if (!nhg->num_nh)
+ remove_nexthop(net, nhge->nh_parent, nlinfo);
+ }
+}
+
+static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
+{
+ struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
+ int i, num_nh = nhg->num_nh;
+
+ for (i = 0; i < num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ if (WARN_ON(!nhge->nh))
+ continue;
+
+ list_del(&nhge->nh_list);
+ nexthop_put(nhge->nh);
+ nhge->nh = NULL;
+ nhg->num_nh--;
+ }
+}
+
+/* not called for nexthop replace */
+static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
+{
+ struct fib6_info *f6i, *tmp;
+ bool do_flush = false;
+ struct fib_info *fi;
+
+ list_for_each_entry(fi, &nh->fi_list, nh_list) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ do_flush = true;
+ }
+ if (do_flush)
+ fib_flush(net);
+
+ /* ip6_del_rt removes the entry from this list hence the _safe */
+ list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
+ /* __ip6_del_rt does a release, so do a hold here */
+ fib6_info_hold(f6i);
+ ipv6_stub->ip6_del_rt(net, f6i);
+ }
+}
+
+static void __remove_nexthop(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo)
+{
+ __remove_nexthop_fib(net, nh);
+
+ if (nh->is_group) {
+ remove_nexthop_group(nh, nlinfo);
+ } else {
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nh->nh_info);
+ if (nhi->fib_nhc.nhc_dev)
+ hlist_del(&nhi->dev_hash);
+
+ remove_nexthop_from_groups(net, nh, nlinfo);
+ }
+}
+
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo)
+{
+ /* remove from the tree */
+ rb_erase(&nh->rb_node, &net->nexthop.rb_root);
+
+ if (nlinfo)
+ nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
+
+ __remove_nexthop(net, nh, nlinfo);
+ nh_base_seq_inc(net);
+
+ nexthop_put(nh);
+}
+
+/* if any FIB entries reference this nexthop, any dst entries
+ * need to be regenerated
+ */
+static void nh_rt_cache_flush(struct net *net, struct nexthop *nh)
+{
+ struct fib6_info *f6i;
+
+ if (!list_empty(&nh->fi_list))
+ rt_cache_flush(net);
+
+ list_for_each_entry(f6i, &nh->f6i_list, nh_list)
+ ipv6_stub->fib6_update_sernum(net, f6i);
+}
+
+static int replace_nexthop_grp(struct net *net, struct nexthop *old,
+ struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_group *oldg, *newg;
+ int i;
+
+ if (!new->is_group) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
+ return -EINVAL;
+ }
+
+ oldg = rtnl_dereference(old->nh_grp);
+ newg = rtnl_dereference(new->nh_grp);
+
+ /* update parents - used by nexthop code for cleanup */
+ for (i = 0; i < newg->num_nh; i++)
+ newg->nh_entries[i].nh_parent = old;
+
+ rcu_assign_pointer(old->nh_grp, newg);
+
+ for (i = 0; i < oldg->num_nh; i++)
+ oldg->nh_entries[i].nh_parent = new;
+
+ rcu_assign_pointer(new->nh_grp, oldg);
+
+ return 0;
+}
+
+static int replace_nexthop_single(struct net *net, struct nexthop *old,
+ struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *oldi, *newi;
+
+ if (new->is_group) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
+ return -EINVAL;
+ }
+
+ oldi = rtnl_dereference(old->nh_info);
+ newi = rtnl_dereference(new->nh_info);
+
+ newi->nh_parent = old;
+ oldi->nh_parent = new;
+
+ old->protocol = new->protocol;
+ old->nh_flags = new->nh_flags;
+
+ rcu_assign_pointer(old->nh_info, newi);
+ rcu_assign_pointer(new->nh_info, oldi);
+
+ return 0;
+}
+
+static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
+ struct nl_info *info)
+{
+ struct fib6_info *f6i;
+
+ if (!list_empty(&nh->fi_list)) {
+ struct fib_info *fi;
+
+ /* expectation is a few fib_info per nexthop and then
+ * a lot of routes per fib_info. So mark the fib_info
+ * and then walk the fib tables once
+ */
+ list_for_each_entry(fi, &nh->fi_list, nh_list)
+ fi->nh_updated = true;
+
+ fib_info_notify_update(net, info);
+
+ list_for_each_entry(fi, &nh->fi_list, nh_list)
+ fi->nh_updated = false;
+ }
+
+ list_for_each_entry(f6i, &nh->f6i_list, nh_list)
+ ipv6_stub->fib6_rt_update(net, f6i, info);
+}
+
+/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
+ * linked to this nexthop and for all groups that the nexthop
+ * is a member of
+ */
+static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
+ struct nl_info *info)
+{
+ struct nh_grp_entry *nhge;
+
+ __nexthop_replace_notify(net, nh, info);
+
+ list_for_each_entry(nhge, &nh->grp_list, nh_list)
+ __nexthop_replace_notify(net, nhge->nh_parent, info);
+}
+
+static int replace_nexthop(struct net *net, struct nexthop *old,
+ struct nexthop *new, struct netlink_ext_ack *extack)
+{
+ bool new_is_reject = false;
+ struct nh_grp_entry *nhge;
+ int err;
+
+ /* check that existing FIB entries are ok with the
+ * new nexthop definition
+ */
+ err = fib_check_nh_list(old, new, extack);
+ if (err)
+ return err;
+
+ err = fib6_check_nh_list(old, new, extack);
+ if (err)
+ return err;
+
+ if (!new->is_group) {
+ struct nh_info *nhi = rtnl_dereference(new->nh_info);
+
+ new_is_reject = nhi->reject_nh;
+ }
+
+ list_for_each_entry(nhge, &old->grp_list, nh_list) {
+ /* if new nexthop is a blackhole, any groups using this
+ * nexthop cannot have more than 1 path
+ */
+ if (new_is_reject &&
+ nexthop_num_path(nhge->nh_parent) > 1) {
+ NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
+ return -EINVAL;
+ }
+
+ err = fib_check_nh_list(nhge->nh_parent, new, extack);
+ if (err)
+ return err;
+
+ err = fib6_check_nh_list(nhge->nh_parent, new, extack);
+ if (err)
+ return err;
+ }
+
+ if (old->is_group)
+ err = replace_nexthop_grp(net, old, new, extack);
+ else
+ err = replace_nexthop_single(net, old, new, extack);
+
+ if (!err) {
+ nh_rt_cache_flush(net, old);
+
+ __remove_nexthop(net, new, NULL);
+ nexthop_put(new);
+ }
+
+ return err;
+}
+
+/* called with rtnl_lock held */
+static int insert_nexthop(struct net *net, struct nexthop *new_nh,
+ struct nh_config *cfg, struct netlink_ext_ack *extack)
+{
+ struct rb_node **pp, *parent = NULL, *next;
+ struct rb_root *root = &net->nexthop.rb_root;
+ bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
+ bool create = !!(cfg->nlflags & NLM_F_CREATE);
+ u32 new_id = new_nh->id;
+ int replace_notify = 0;
+ int rc = -EEXIST;
+
+ pp = &root->rb_node;
+ while (1) {
+ struct nexthop *nh;
+
+ next = rtnl_dereference(*pp);
+ if (!next)
+ break;
+
+ parent = next;
+
+ nh = rb_entry(parent, struct nexthop, rb_node);
+ if (new_id < nh->id) {
+ pp = &next->rb_left;
+ } else if (new_id > nh->id) {
+ pp = &next->rb_right;
+ } else if (replace) {
+ rc = replace_nexthop(net, nh, new_nh, extack);
+ if (!rc) {
+ new_nh = nh; /* send notification with old nh */
+ replace_notify = 1;
+ }
+ goto out;
+ } else {
+ /* id already exists and not a replace */
+ goto out;
+ }
+ }
+
+ if (replace && !create) {
+ NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
+ rc = -ENOENT;
+ goto out;
+ }
+
+ rb_link_node_rcu(&new_nh->rb_node, parent, pp);
+ rb_insert_color(&new_nh->rb_node, root);
+ rc = 0;
+out:
+ if (!rc) {
+ nh_base_seq_inc(net);
+ nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
+ if (replace_notify)
+ nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
+ }
+
+ return rc;
+}
+
+/* rtnl */
+/* remove all nexthops tied to a device being deleted */
+static void nexthop_flush_dev(struct net_device *dev)
+{
+ unsigned int hash = nh_dev_hashfn(dev->ifindex);
+ struct net *net = dev_net(dev);
+ struct hlist_head *head = &net->nexthop.devhash[hash];
+ struct hlist_node *n;
+ struct nh_info *nhi;
+
+ hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+ if (nhi->fib_nhc.nhc_dev != dev)
+ continue;
+
+ remove_nexthop(net, nhi->nh_parent, NULL);
+ }
+}
+
+/* rtnl; called when net namespace is deleted */
+static void flush_all_nexthops(struct net *net)
+{
+ struct rb_root *root = &net->nexthop.rb_root;
+ struct rb_node *node;
+ struct nexthop *nh;
+
+ while ((node = rb_first(root))) {
+ nh = rb_entry(node, struct nexthop, rb_node);
+ remove_nexthop(net, nh, NULL);
+ cond_resched();
+ }
+}
+
+static struct nexthop *nexthop_create_group(struct net *net,
+ struct nh_config *cfg)
+{
+ struct nlattr *grps_attr = cfg->nh_grp;
+ struct nexthop_grp *entry = nla_data(grps_attr);
+ struct nh_group *nhg;
+ struct nexthop *nh;
+ int i;
+
+ nh = nexthop_alloc();
+ if (!nh)
+ return ERR_PTR(-ENOMEM);
+
+ nh->is_group = 1;
+
+ nhg = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry));
+ if (!nhg) {
+ kfree(nh);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nexthop *nhe;
+ struct nh_info *nhi;
+
+ nhe = nexthop_find_by_id(net, entry[i].id);
+ if (!nexthop_get(nhe))
+ goto out_no_nh;
+
+ nhi = rtnl_dereference(nhe->nh_info);
+ if (nhi->family == AF_INET)
+ nhg->has_v4 = true;
+
+ nhg->nh_entries[i].nh = nhe;
+ nhg->nh_entries[i].weight = entry[i].weight + 1;
+ list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
+ nhg->nh_entries[i].nh_parent = nh;
+ }
+
+ if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
+ nhg->mpath = 1;
+ nh_group_rebalance(nhg);
+ }
+
+ rcu_assign_pointer(nh->nh_grp, nhg);
+
+ return nh;
+
+out_no_nh:
+ for (; i >= 0; --i)
+ nexthop_put(nhg->nh_entries[i].nh);
+
+ kfree(nhg);
+ kfree(nh);
+
+ return ERR_PTR(-ENOENT);
+}
+
+static int nh_create_ipv4(struct net *net, struct nexthop *nh,
+ struct nh_info *nhi, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_nh *fib_nh = &nhi->fib_nh;
+ struct fib_config fib_cfg = {
+ .fc_oif = cfg->nh_ifindex,
+ .fc_gw4 = cfg->gw.ipv4,
+ .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
+ .fc_flags = cfg->nh_flags,
+ .fc_encap = cfg->nh_encap,
+ .fc_encap_type = cfg->nh_encap_type,
+ };
+ u32 tb_id = l3mdev_fib_table(cfg->dev);
+ int err = -EINVAL;
+
+ err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
+ if (err) {
+ fib_nh_release(net, fib_nh);
+ goto out;
+ }
+
+ /* sets nh_dev if successful */
+ err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
+ if (!err) {
+ nh->nh_flags = fib_nh->fib_nh_flags;
+ fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
+ fib_nh->fib_nh_scope);
+ } else {
+ fib_nh_release(net, fib_nh);
+ }
+out:
+ return err;
+}
+
+static int nh_create_ipv6(struct net *net, struct nexthop *nh,
+ struct nh_info *nhi, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_nh *fib6_nh = &nhi->fib6_nh;
+ struct fib6_config fib6_cfg = {
+ .fc_table = l3mdev_fib_table(cfg->dev),
+ .fc_ifindex = cfg->nh_ifindex,
+ .fc_gateway = cfg->gw.ipv6,
+ .fc_flags = cfg->nh_flags,
+ .fc_encap = cfg->nh_encap,
+ .fc_encap_type = cfg->nh_encap_type,
+ };
+ int err;
+
+ if (!ipv6_addr_any(&cfg->gw.ipv6))
+ fib6_cfg.fc_flags |= RTF_GATEWAY;
+
+ /* sets nh_dev if successful */
+ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
+ extack);
+ if (err)
+ ipv6_stub->fib6_nh_release(fib6_nh);
+ else
+ nh->nh_flags = fib6_nh->fib_nh_flags;
+
+ return err;
+}
+
+static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *nhi;
+ struct nexthop *nh;
+ int err = 0;
+
+ nh = nexthop_alloc();
+ if (!nh)
+ return ERR_PTR(-ENOMEM);
+
+ nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
+ if (!nhi) {
+ kfree(nh);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ nh->nh_flags = cfg->nh_flags;
+ nh->net = net;
+
+ nhi->nh_parent = nh;
+ nhi->family = cfg->nh_family;
+ nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
+
+ if (cfg->nh_blackhole) {
+ nhi->reject_nh = 1;
+ cfg->nh_ifindex = net->loopback_dev->ifindex;
+ }
+
+ switch (cfg->nh_family) {
+ case AF_INET:
+ err = nh_create_ipv4(net, nh, nhi, cfg, extack);
+ break;
+ case AF_INET6:
+ err = nh_create_ipv6(net, nh, nhi, cfg, extack);
+ break;
+ }
+
+ if (err) {
+ kfree(nhi);
+ kfree(nh);
+ return ERR_PTR(err);
+ }
+
+ /* add the entry to the device based hash */
+ nexthop_devhash_add(net, nhi);
+
+ rcu_assign_pointer(nh->nh_info, nhi);
+
+ return nh;
+}
+
+/* called with rtnl lock held */
+static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nexthop *nh;
+ int err;
+
+ if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
+ NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!cfg->nh_id) {
+ cfg->nh_id = nh_find_unused_id(net);
+ if (!cfg->nh_id) {
+ NL_SET_ERR_MSG(extack, "No unused id");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ if (cfg->nh_grp)
+ nh = nexthop_create_group(net, cfg);
+ else
+ nh = nexthop_create(net, cfg, extack);
+
+ if (IS_ERR(nh))
+ return nh;
+
+ refcount_set(&nh->refcnt, 1);
+ nh->id = cfg->nh_id;
+ nh->protocol = cfg->nh_protocol;
+ nh->net = net;
+
+ err = insert_nexthop(net, nh, cfg, extack);
+ if (err) {
+ __remove_nexthop(net, nh, NULL);
+ nexthop_put(nh);
+ nh = ERR_PTR(err);
+ }
+
+ return nh;
+}
+
+static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nhmsg *nhm = nlmsg_data(nlh);
+ struct nlattr *tb[NHA_MAX + 1];
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ err = -EINVAL;
+ if (nhm->resvd || nhm->nh_scope) {
+ NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
+ goto out;
+ }
+ if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
+ NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
+ goto out;
+ }
+
+ switch (nhm->nh_family) {
+ case AF_INET:
+ case AF_INET6:
+ break;
+ case AF_UNSPEC:
+ if (tb[NHA_GROUP])
+ break;
+ /* fallthrough */
+ default:
+ NL_SET_ERR_MSG(extack, "Invalid address family");
+ goto out;
+ }
+
+ if (tb[NHA_GROUPS] || tb[NHA_MASTER]) {
+ NL_SET_ERR_MSG(extack, "Invalid attributes in request");
+ goto out;
+ }
+
+ memset(cfg, 0, sizeof(*cfg));
+ cfg->nlflags = nlh->nlmsg_flags;
+ cfg->nlinfo.portid = NETLINK_CB(skb).portid;
+ cfg->nlinfo.nlh = nlh;
+ cfg->nlinfo.nl_net = net;
+
+ cfg->nh_family = nhm->nh_family;
+ cfg->nh_protocol = nhm->nh_protocol;
+ cfg->nh_flags = nhm->nh_flags;
+
+ if (tb[NHA_ID])
+ cfg->nh_id = nla_get_u32(tb[NHA_ID]);
+
+ if (tb[NHA_GROUP]) {
+ if (nhm->nh_family != AF_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Invalid family for group");
+ goto out;
+ }
+ cfg->nh_grp = tb[NHA_GROUP];
+
+ cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
+ if (tb[NHA_GROUP_TYPE])
+ cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
+
+ if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid group type");
+ goto out;
+ }
+ err = nh_check_attr_group(net, tb, extack);
+
+ /* no other attributes should be set */
+ goto out;
+ }
+
+ if (tb[NHA_BLACKHOLE]) {
+ if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
+ tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
+ NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif");
+ goto out;
+ }
+
+ cfg->nh_blackhole = 1;
+ err = 0;
+ goto out;
+ }
+
+ if (!tb[NHA_OIF]) {
+ NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops");
+ goto out;
+ }
+
+ cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
+ if (cfg->nh_ifindex)
+ cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
+
+ if (!cfg->dev) {
+ NL_SET_ERR_MSG(extack, "Invalid device index");
+ goto out;
+ } else if (!(cfg->dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+ err = -ENETDOWN;
+ goto out;
+ } else if (!netif_carrier_ok(cfg->dev)) {
+ NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
+ err = -ENETDOWN;
+ goto out;
+ }
+
+ err = -EINVAL;
+ if (tb[NHA_GATEWAY]) {
+ struct nlattr *gwa = tb[NHA_GATEWAY];
+
+ switch (cfg->nh_family) {
+ case AF_INET:
+ if (nla_len(gwa) != sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid gateway");
+ goto out;
+ }
+ cfg->gw.ipv4 = nla_get_be32(gwa);
+ break;
+ case AF_INET6:
+ if (nla_len(gwa) != sizeof(struct in6_addr)) {
+ NL_SET_ERR_MSG(extack, "Invalid gateway");
+ goto out;
+ }
+ cfg->gw.ipv6 = nla_get_in6_addr(gwa);
+ break;
+ default:
+ NL_SET_ERR_MSG(extack,
+ "Unknown address family for gateway");
+ goto out;
+ }
+ } else {
+ /* device only nexthop (no gateway) */
+ if (cfg->nh_flags & RTNH_F_ONLINK) {
+ NL_SET_ERR_MSG(extack,
+ "ONLINK flag can not be set for nexthop without a gateway");
+ goto out;
+ }
+ }
+
+ if (tb[NHA_ENCAP]) {
+ cfg->nh_encap = tb[NHA_ENCAP];
+
+ if (!tb[NHA_ENCAP_TYPE]) {
+ NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
+ goto out;
+ }
+
+ cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
+ err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
+ if (err < 0)
+ goto out;
+
+ } else if (tb[NHA_ENCAP_TYPE]) {
+ NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
+ goto out;
+ }
+
+
+ err = 0;
+out:
+ return err;
+}
+
+/* rtnl */
+static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nh_config cfg;
+ struct nexthop *nh;
+ int err;
+
+ err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
+ if (!err) {
+ nh = nexthop_add(net, &cfg, extack);
+ if (IS_ERR(nh))
+ err = PTR_ERR(nh);
+ }
+
+ return err;
+}
+
+static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id,
+ struct netlink_ext_ack *extack)
+{
+ struct nhmsg *nhm = nlmsg_data(nlh);
+ struct nlattr *tb[NHA_MAX + 1];
+ int err, i;
+
+ err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ err = -EINVAL;
+ for (i = 0; i < __NHA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NHA_ID:
+ break;
+ default:
+ NL_SET_ERR_MSG_ATTR(extack, tb[i],
+ "Unexpected attribute in request");
+ goto out;
+ }
+ }
+ if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header");
+ goto out;
+ }
+
+ if (!tb[NHA_ID]) {
+ NL_SET_ERR_MSG(extack, "Nexthop id is missing");
+ goto out;
+ }
+
+ *id = nla_get_u32(tb[NHA_ID]);
+ if (!(*id))
+ NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+ else
+ err = 0;
+out:
+ return err;
+}
+
+/* rtnl */
+static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nl_info nlinfo = {
+ .nlh = nlh,
+ .nl_net = net,
+ .portid = NETLINK_CB(skb).portid,
+ };
+ struct nexthop *nh;
+ int err;
+ u32 id;
+
+ err = nh_valid_get_del_req(nlh, &id, extack);
+ if (err)
+ return err;
+
+ nh = nexthop_find_by_id(net, id);
+ if (!nh)
+ return -ENOENT;
+
+ remove_nexthop(net, nh, &nlinfo);
+
+ return 0;
+}
+
+/* rtnl */
+static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *skb = NULL;
+ struct nexthop *nh;
+ int err;
+ u32 id;
+
+ err = nh_valid_get_del_req(nlh, &id, extack);
+ if (err)
+ return err;
+
+ err = -ENOBUFS;
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ goto out;
+
+ err = -ENOENT;
+ nh = nexthop_find_by_id(net, id);
+ if (!nh)
+ goto errout_free;
+
+ err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ goto errout_free;
+ }
+
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+out:
+ return err;
+errout_free:
+ kfree_skb(skb);
+ goto out;
+}
+
+static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx,
+ bool group_filter, u8 family)
+{
+ const struct net_device *dev;
+ const struct nh_info *nhi;
+
+ if (group_filter && !nh->is_group)
+ return true;
+
+ if (!dev_idx && !master_idx && !family)
+ return false;
+
+ if (nh->is_group)
+ return true;
+
+ nhi = rtnl_dereference(nh->nh_info);
+ if (family && nhi->family != family)
+ return true;
+
+ dev = nhi->fib_nhc.nhc_dev;
+ if (dev_idx && (!dev || dev->ifindex != dev_idx))
+ return true;
+
+ if (master_idx) {
+ struct net_device *master;
+
+ if (!dev)
+ return true;
+
+ master = netdev_master_upper_dev_get((struct net_device *)dev);
+ if (!master || master->ifindex != master_idx)
+ return true;
+ }
+
+ return false;
+}
+
+static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
+ int *master_idx, bool *group_filter,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[NHA_MAX + 1];
+ struct nhmsg *nhm;
+ int err, i;
+ u32 idx;
+
+ err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
+ NULL);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= NHA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NHA_OIF:
+ idx = nla_get_u32(tb[i]);
+ if (idx > INT_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid device index");
+ return -EINVAL;
+ }
+ *dev_idx = idx;
+ break;
+ case NHA_MASTER:
+ idx = nla_get_u32(tb[i]);
+ if (idx > INT_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid master device index");
+ return -EINVAL;
+ }
+ *master_idx = idx;
+ break;
+ case NHA_GROUPS:
+ *group_filter = true;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ nhm = nlmsg_data(nlh);
+ if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* rtnl */
+static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nhmsg *nhm = nlmsg_data(cb->nlh);
+ int dev_filter_idx = 0, master_idx = 0;
+ struct net *net = sock_net(skb->sk);
+ struct rb_root *root = &net->nexthop.rb_root;
+ bool group_filter = false;
+ struct rb_node *node;
+ int idx = 0, s_idx;
+ int err;
+
+ err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx,
+ &group_filter, cb);
+ if (err < 0)
+ return err;
+
+ s_idx = cb->args[0];
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ struct nexthop *nh;
+
+ if (idx < s_idx)
+ goto cont;
+
+ nh = rb_entry(node, struct nexthop, rb_node);
+ if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
+ group_filter, nhm->nh_family))
+ goto cont;
+
+ err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ if (err < 0) {
+ if (likely(skb->len))
+ goto out;
+
+ goto out_err;
+ }
+cont:
+ idx++;
+ }
+
+out:
+ err = skb->len;
+out_err:
+ cb->args[0] = idx;
+ cb->seq = net->nexthop.seq;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+
+ return err;
+}
+
+static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+ unsigned int hash = nh_dev_hashfn(dev->ifindex);
+ struct net *net = dev_net(dev);
+ struct hlist_head *head = &net->nexthop.devhash[hash];
+ struct hlist_node *n;
+ struct nh_info *nhi;
+
+ hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+ if (nhi->fib_nhc.nhc_dev == dev) {
+ if (nhi->family == AF_INET)
+ fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
+ orig_mtu);
+ }
+ }
+}
+
+/* rtnl */
+static int nh_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_info_ext *info_ext;
+
+ switch (event) {
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ nexthop_flush_dev(dev);
+ break;
+ case NETDEV_CHANGE:
+ if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
+ nexthop_flush_dev(dev);
+ break;
+ case NETDEV_CHANGEMTU:
+ info_ext = ptr;
+ nexthop_sync_mtu(dev, info_ext->ext.mtu);
+ rt_cache_flush(dev_net(dev));
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nh_netdev_notifier = {
+ .notifier_call = nh_netdev_event,
+};
+
+static void __net_exit nexthop_net_exit(struct net *net)
+{
+ rtnl_lock();
+ flush_all_nexthops(net);
+ rtnl_unlock();
+ kfree(net->nexthop.devhash);
+}
+
+static int __net_init nexthop_net_init(struct net *net)
+{
+ size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
+
+ net->nexthop.rb_root = RB_ROOT;
+ net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
+ if (!net->nexthop.devhash)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static struct pernet_operations nexthop_net_ops = {
+ .init = nexthop_net_init,
+ .exit = nexthop_net_exit,
+};
+
+static int __init nexthop_init(void)
+{
+ register_pernet_subsys(&nexthop_net_ops);
+
+ register_netdevice_notifier(&nh_netdev_notifier);
+
+ rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
+ rtm_dump_nexthop, 0);
+
+ rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+ rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
+
+ rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+ rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
+
+ return 0;
+}
+subsys_initcall(nexthop_init);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 073273b751f8..cc90243ccf76 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -68,8 +68,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "RAW: inuse %d\n",
sock_prot_inuse_get(net, &raw_prot));
seq_printf(seq, "FRAG: inuse %u memory %lu\n",
- atomic_read(&net->ipv4.frags.rhashtable.nelems),
- frag_mem_limit(&net->ipv4.frags));
+ atomic_read(&net->ipv4.fqdir->rhashtable.nelems),
+ frag_mem_limit(net->ipv4.fqdir));
return 0;
}
@@ -288,6 +288,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
+ SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index 899e34ceb560..e35736b99300 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -24,9 +24,6 @@ raw_get_hashinfo(const struct inet_diag_req_v2 *r)
return &raw_v6_hashinfo;
#endif
} else {
- pr_warn_once("Unexpected inet family %d\n",
- r->sdiag_family);
- WARN_ON_ONCE(1);
return ERR_PTR(-EINVAL);
}
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8ea0735a6754..517300d587a7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -95,6 +95,7 @@
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
@@ -447,7 +448,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
n = ip_neigh_gw4(dev, pkey);
}
- if (n && !refcount_inc_not_zero(&n->refcnt))
+ if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
n = NULL;
rcu_read_unlock_bh();
@@ -1531,7 +1532,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
void rt_flush_dev(struct net_device *dev)
{
- struct net *net = dev_net(dev);
struct rtable *rt;
int cpu;
@@ -1542,7 +1542,7 @@ void rt_flush_dev(struct net_device *dev)
list_for_each_entry(rt, &ul->head, rt_uncached) {
if (rt->dst.dev != dev)
continue;
- rt->dst.dev = net->loopback_dev;
+ rt->dst.dev = blackhole_netdev;
dev_hold(rt->dst.dev);
dev_put(dev);
}
@@ -1580,7 +1580,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
#ifdef CONFIG_IP_ROUTE_CLASSID
- {
+ if (nhc->nhc_family == AF_INET) {
struct fib_nh *nh;
nh = container_of(nhc, struct fib_nh, nh_common);
@@ -1962,6 +1962,36 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
hash_keys.basic.ip_proto = fl4->flowi4_proto;
}
break;
+ case 2:
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ /* skb is currently provided only when forwarding */
+ if (skb) {
+ struct flow_keys keys;
+
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+ /* Inner can be v4 or v6 */
+ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+ hash_keys.tags.flow_label = keys.tags.flow_label;
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ ip_multipath_l3_keys(skb, &hash_keys);
+ }
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = fl4->saddr;
+ hash_keys.addrs.v4addrs.dst = fl4->daddr;
+ }
+ break;
}
mhash = flow_hash_from_keys(&hash_keys);
@@ -1979,7 +2009,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1) {
+ if (res->fi && fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
fib_select_multipath(res, h);
@@ -2714,7 +2744,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
r->rtm_family = AF_INET;
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
- r->rtm_tos = fl4->flowi4_tos;
+ r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
if (nla_put_u32(skb, RTA_TABLE, table_id))
goto nla_put_failure;
@@ -2742,7 +2772,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
goto nla_put_failure;
#endif
- if (!rt_is_input_route(rt) &&
+ if (fl4 && !rt_is_input_route(rt) &&
fl4->saddr != src) {
if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
goto nla_put_failure;
@@ -2782,36 +2812,40 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
if (rtnetlink_put_metrics(skb, metrics) < 0)
goto nla_put_failure;
- if (fl4->flowi4_mark &&
- nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
- goto nla_put_failure;
-
- if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
- nla_put_u32(skb, RTA_UID,
- from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
- goto nla_put_failure;
+ if (fl4) {
+ if (fl4->flowi4_mark &&
+ nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
+ goto nla_put_failure;
- error = rt->dst.error;
+ if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
+ nla_put_u32(skb, RTA_UID,
+ from_kuid_munged(current_user_ns(),
+ fl4->flowi4_uid)))
+ goto nla_put_failure;
- if (rt_is_input_route(rt)) {
+ if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
- if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
- IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
- int err = ipmr_get_route(net, skb,
- fl4->saddr, fl4->daddr,
- r, portid);
-
- if (err <= 0) {
- if (err == 0)
- return 0;
- goto nla_put_failure;
- }
- } else
+ if (ipv4_is_multicast(dst) &&
+ !ipv4_is_local_multicast(dst) &&
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+ int err = ipmr_get_route(net, skb,
+ fl4->saddr, fl4->daddr,
+ r, portid);
+
+ if (err <= 0) {
+ if (err == 0)
+ return 0;
+ goto nla_put_failure;
+ }
+ } else
#endif
- if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
- goto nla_put_failure;
+ if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
+ goto nla_put_failure;
+ }
}
+ error = rt->dst.error;
+
if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
goto nla_put_failure;
@@ -2823,6 +2857,80 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, u32 table_id,
+ struct fnhe_hash_bucket *bucket, int genid,
+ int *fa_index, int fa_start)
+{
+ int i;
+
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ struct rtable *rt;
+ int err;
+
+ if (*fa_index < fa_start)
+ goto next;
+
+ if (fnhe->fnhe_genid != genid)
+ goto next;
+
+ if (fnhe->fnhe_expires &&
+ time_after(jiffies, fnhe->fnhe_expires))
+ goto next;
+
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (!rt)
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (!rt)
+ goto next;
+
+ err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
+ table_id, NULL, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq);
+ if (err)
+ return err;
+next:
+ (*fa_index)++;
+ }
+ }
+
+ return 0;
+}
+
+int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
+ u32 table_id, struct fib_info *fi,
+ int *fa_index, int fa_start)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ int nhsel, genid = fnhe_genid(net);
+
+ for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
+ struct fnhe_hash_bucket *bucket;
+ int err;
+
+ if (nhc->nhc_flags & RTNH_F_DEAD)
+ continue;
+
+ rcu_read_lock();
+ bucket = rcu_dereference(nhc->nhc_exceptions);
+ err = 0;
+ if (bucket)
+ err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
+ genid, fa_index, fa_start);
+ rcu_read_unlock();
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
u8 ip_proto, __be16 sport,
__be16 dport)
@@ -3230,9 +3338,11 @@ static struct ctl_table ipv4_route_table[] = {
{ }
};
+static const char ipv4_route_flush_procname[] = "flush";
+
static struct ctl_table ipv4_route_flush_table[] = {
{
- .procname = "flush",
+ .procname = ipv4_route_flush_procname,
.maxlen = sizeof(int),
.mode = 0200,
.proc_handler = ipv4_sysctl_rtcache_flush,
@@ -3250,9 +3360,11 @@ static __net_init int sysctl_route_net_init(struct net *net)
if (!tbl)
goto err_dup;
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- tbl[0].procname = NULL;
+ /* Don't export non-whitelisted sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns) {
+ if (tbl[0].procname != ipv4_route_flush_procname)
+ tbl[0].procname = NULL;
+ }
}
tbl[0].extra1 = net;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b6f14af926fa..7d66306b5f39 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -279,55 +279,96 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl,
return ret;
}
+static int sscanf_key(char *buf, __le32 *key)
+{
+ u32 user_key[4];
+ int i, ret = 0;
+
+ if (sscanf(buf, "%x-%x-%x-%x", user_key, user_key + 1,
+ user_key + 2, user_key + 3) != 4) {
+ ret = -EINVAL;
+ } else {
+ for (i = 0; i < ARRAY_SIZE(user_key); i++)
+ key[i] = cpu_to_le32(user_key[i]);
+ }
+ pr_debug("proc TFO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
+ user_key[0], user_key[1], user_key[2], user_key[3], buf, ret);
+
+ return ret;
+}
+
static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
ipv4.sysctl_tcp_fastopen);
- struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
- struct tcp_fastopen_context *ctxt;
- u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
- __le32 key[4];
- int ret, i;
+ /* maxlen to print the list of keys in hex (*2), with dashes
+ * separating doublewords and a comma in between keys.
+ */
+ struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH *
+ 2 * TCP_FASTOPEN_KEY_MAX) +
+ (TCP_FASTOPEN_KEY_MAX * 5)) };
+ struct tcp_fastopen_context *ctx;
+ u32 user_key[TCP_FASTOPEN_KEY_MAX * 4];
+ __le32 key[TCP_FASTOPEN_KEY_MAX * 4];
+ char *backup_data;
+ int ret, i = 0, off = 0, n_keys = 0;
tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
if (!tbl.data)
return -ENOMEM;
rcu_read_lock();
- ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
- if (ctxt)
- memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
- else
- memset(key, 0, sizeof(key));
+ ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
+ if (ctx) {
+ n_keys = tcp_fastopen_context_len(ctx);
+ memcpy(&key[0], &ctx->key[0], TCP_FASTOPEN_KEY_LENGTH * n_keys);
+ }
rcu_read_unlock();
- for (i = 0; i < ARRAY_SIZE(key); i++)
+ if (!n_keys) {
+ memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH);
+ n_keys = 1;
+ }
+
+ for (i = 0; i < n_keys * 4; i++)
user_key[i] = le32_to_cpu(key[i]);
- snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
- user_key[0], user_key[1], user_key[2], user_key[3]);
+ for (i = 0; i < n_keys; i++) {
+ off += snprintf(tbl.data + off, tbl.maxlen - off,
+ "%08x-%08x-%08x-%08x",
+ user_key[i * 4],
+ user_key[i * 4 + 1],
+ user_key[i * 4 + 2],
+ user_key[i * 4 + 3]);
+ if (i + 1 < n_keys)
+ off += snprintf(tbl.data + off, tbl.maxlen - off, ",");
+ }
+
ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
if (write && ret == 0) {
- if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
- user_key + 2, user_key + 3) != 4) {
+ backup_data = strchr(tbl.data, ',');
+ if (backup_data) {
+ *backup_data = '\0';
+ backup_data++;
+ }
+ if (sscanf_key(tbl.data, key)) {
ret = -EINVAL;
goto bad_key;
}
-
- for (i = 0; i < ARRAY_SIZE(user_key); i++)
- key[i] = cpu_to_le32(user_key[i]);
-
+ if (backup_data) {
+ if (sscanf_key(backup_data, key + 4)) {
+ ret = -EINVAL;
+ goto bad_key;
+ }
+ }
tcp_fastopen_reset_cipher(net, NULL, key,
- TCP_FASTOPEN_KEY_LENGTH);
+ backup_data ? key + 4 : NULL);
}
bad_key:
- pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
- user_key[0], user_key[1], user_key[2], user_key[3],
- (char *)tbl.data, ret);
kfree(tbl.data);
return ret;
}
@@ -956,7 +997,12 @@ static struct ctl_table ipv4_net_table[] = {
.procname = "tcp_fastopen_key",
.mode = 0600,
.data = &init_net.ipv4.sysctl_tcp_fastopen,
- .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+ /* maxlen to print the list of keys in hex (*2), with dashes
+ * separating doublewords and a comma in between keys.
+ */
+ .maxlen = ((TCP_FASTOPEN_KEY_LENGTH *
+ 2 * TCP_FASTOPEN_KEY_MAX) +
+ (TCP_FASTOPEN_KEY_MAX * 5)),
.proc_handler = proc_tcp_fastopen_key,
},
{
@@ -984,7 +1030,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_fib_multipath_hash_policy,
.extra1 = &zero,
- .extra2 = &one,
+ .extra2 = &two,
},
#endif
{
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7dc9ab84bb69..7846afacdf0b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2614,6 +2614,8 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
tp->bytes_sent = 0;
+ tp->bytes_acked = 0;
+ tp->bytes_received = 0;
tp->bytes_retrans = 0;
tp->duplicate_sack[0].start_seq = 0;
tp->duplicate_sack[0].end_seq = 0;
@@ -2741,6 +2743,21 @@ static int tcp_repair_options_est(struct sock *sk,
return 0;
}
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+EXPORT_SYMBOL(tcp_tx_delay_enabled);
+
+static void tcp_enable_tx_delay(void)
+{
+ if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+ static int __tcp_tx_delay_enabled = 0;
+
+ if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
+ static_branch_enable(&tcp_tx_delay_enabled);
+ pr_info("TCP_TX_DELAY enabled\n");
+ }
+ }
+}
+
/*
* Socket option code for TCP.
*/
@@ -2791,15 +2808,23 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
return err;
}
case TCP_FASTOPEN_KEY: {
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
+ __u8 *backup_key = NULL;
- if (optlen != sizeof(key))
+ /* Allow a backup key as well to facilitate key rotation
+ * First key is the active one.
+ */
+ if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
+ optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
return -EINVAL;
if (copy_from_user(key, optval, optlen))
return -EFAULT;
- return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
+ if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
+ backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
+
+ return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
}
default:
/* fallthru */
@@ -3083,6 +3108,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
tp->recvmsg_inq = val;
break;
+ case TCP_TX_DELAY:
+ if (val)
+ tcp_enable_tx_delay();
+ tp->tcp_tx_delay = val;
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -3453,21 +3483,23 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return 0;
case TCP_FASTOPEN_KEY: {
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
struct tcp_fastopen_context *ctx;
+ unsigned int key_len = 0;
if (get_user(len, optlen))
return -EFAULT;
rcu_read_lock();
ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
- if (ctx)
- memcpy(key, ctx->key, sizeof(key));
- else
- len = 0;
+ if (ctx) {
+ key_len = tcp_fastopen_context_len(ctx) *
+ TCP_FASTOPEN_KEY_LENGTH;
+ memcpy(&key[0], &ctx->key[0], key_len);
+ }
rcu_read_unlock();
- len = min_t(unsigned int, len, sizeof(key));
+ len = min_t(unsigned int, len, key_len);
if (put_user(len, optlen))
return -EFAULT;
if (copy_to_user(optval, key, len))
@@ -3540,6 +3572,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->fastopen_no_cookie;
break;
+ case TCP_TX_DELAY:
+ val = tp->tcp_tx_delay;
+ break;
+
case TCP_TIMESTAMP:
val = tcp_time_stamp_raw() + tp->tsoffset;
break;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f5a45e1e1182..3fd451271a70 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -30,15 +30,15 @@ void tcp_fastopen_init_key_once(struct net *net)
* for a valid cookie, so this is an acceptable risk.
*/
get_random_bytes(key, sizeof(key));
- tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key));
+ tcp_fastopen_reset_cipher(net, NULL, key, NULL);
}
static void tcp_fastopen_ctx_free(struct rcu_head *head)
{
struct tcp_fastopen_context *ctx =
container_of(head, struct tcp_fastopen_context, rcu);
- crypto_free_cipher(ctx->tfm);
- kfree(ctx);
+
+ kzfree(ctx);
}
void tcp_fastopen_destroy_cipher(struct sock *sk)
@@ -67,31 +67,27 @@ void tcp_fastopen_ctx_destroy(struct net *net)
}
int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
- void *key, unsigned int len)
+ void *primary_key, void *backup_key)
{
struct tcp_fastopen_context *ctx, *octx;
struct fastopen_queue *q;
- int err;
+ int err = 0;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
- ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
-
- if (IS_ERR(ctx->tfm)) {
- err = PTR_ERR(ctx->tfm);
-error: kfree(ctx);
- pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
- return err;
+ if (!ctx) {
+ err = -ENOMEM;
+ goto out;
}
- err = crypto_cipher_setkey(ctx->tfm, key, len);
- if (err) {
- pr_err("TCP: TFO cipher key error: %d\n", err);
- crypto_free_cipher(ctx->tfm);
- goto error;
- }
- memcpy(ctx->key, key, len);
+ ctx->key[0].key[0] = get_unaligned_le64(primary_key);
+ ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8);
+ if (backup_key) {
+ ctx->key[1].key[0] = get_unaligned_le64(backup_key);
+ ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8);
+ ctx->num = 2;
+ } else {
+ ctx->num = 1;
+ }
spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
if (sk) {
@@ -108,66 +104,58 @@ error: kfree(ctx);
if (octx)
call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+out:
return err;
}
-static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path,
- struct tcp_fastopen_cookie *foc)
+static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
+ struct sk_buff *syn,
+ const siphash_key_t *key,
+ struct tcp_fastopen_cookie *foc)
{
- struct tcp_fastopen_context *ctx;
- bool ok = false;
-
- rcu_read_lock();
-
- ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);
- if (!ctx)
- ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
+ BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64));
- if (ctx) {
- crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
- foc->len = TCP_FASTOPEN_COOKIE_SIZE;
- ok = true;
- }
- rcu_read_unlock();
- return ok;
-}
-
-/* Generate the fastopen cookie by doing aes128 encryption on both
- * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
- * addresses. For the longer IPv6 addresses use CBC-MAC.
- *
- * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
- */
-static bool tcp_fastopen_cookie_gen(struct sock *sk,
- struct request_sock *req,
- struct sk_buff *syn,
- struct tcp_fastopen_cookie *foc)
-{
if (req->rsk_ops->family == AF_INET) {
const struct iphdr *iph = ip_hdr(syn);
- __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
- return __tcp_fastopen_cookie_gen(sk, path, foc);
+ foc->val[0] = cpu_to_le64(siphash(&iph->saddr,
+ sizeof(iph->saddr) +
+ sizeof(iph->daddr),
+ key));
+ foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ return true;
}
-
#if IS_ENABLED(CONFIG_IPV6)
if (req->rsk_ops->family == AF_INET6) {
const struct ipv6hdr *ip6h = ipv6_hdr(syn);
- struct tcp_fastopen_cookie tmp;
-
- if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) {
- struct in6_addr *buf = &tmp.addr;
- int i;
- for (i = 0; i < 4; i++)
- buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
- return __tcp_fastopen_cookie_gen(sk, buf, foc);
- }
+ foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr,
+ sizeof(ip6h->saddr) +
+ sizeof(ip6h->daddr),
+ key));
+ foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ return true;
}
#endif
return false;
}
+/* Generate the fastopen cookie by applying SipHash to both the source and
+ * destination addresses.
+ */
+static void tcp_fastopen_cookie_gen(struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *foc)
+{
+ struct tcp_fastopen_context *ctx;
+
+ rcu_read_lock();
+ ctx = tcp_fastopen_get_ctx(sk);
+ if (ctx)
+ __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc);
+ rcu_read_unlock();
+}
/* If an incoming SYN or SYNACK frame contains a payload and/or FIN,
* queue this additional data / FIN.
@@ -212,6 +200,35 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
tcp_fin(sk);
}
+/* returns 0 - no key match, 1 for primary, 2 for backup */
+static int tcp_fastopen_cookie_gen_check(struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *orig,
+ struct tcp_fastopen_cookie *valid_foc)
+{
+ struct tcp_fastopen_cookie search_foc = { .len = -1 };
+ struct tcp_fastopen_cookie *foc = valid_foc;
+ struct tcp_fastopen_context *ctx;
+ int i, ret = 0;
+
+ rcu_read_lock();
+ ctx = tcp_fastopen_get_ctx(sk);
+ if (!ctx)
+ goto out;
+ for (i = 0; i < tcp_fastopen_context_len(ctx); i++) {
+ __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc);
+ if (tcp_fastopen_cookie_match(foc, orig)) {
+ ret = i + 1;
+ goto out;
+ }
+ foc = &search_foc;
+ }
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
static struct sock *tcp_fastopen_create_child(struct sock *sk,
struct sk_buff *skb,
struct request_sock *req)
@@ -327,6 +344,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
struct tcp_fastopen_cookie valid_foc = { .len = -1 };
struct sock *child;
+ int ret = 0;
if (foc->len == 0) /* Client requests a cookie */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
@@ -342,31 +360,44 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
goto fastopen;
- if (foc->len >= 0 && /* Client presents or requests a cookie */
- tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) &&
- foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
- foc->len == valid_foc.len &&
- !memcmp(foc->val, valid_foc.val, foc->len)) {
- /* Cookie is valid. Create a (full) child socket to accept
- * the data in SYN before returning a SYN-ACK to ack the
- * data. If we fail to create the socket, fall back and
- * ack the ISN only but includes the same cookie.
- *
- * Note: Data-less SYN with valid cookie is allowed to send
- * data in SYN_RECV state.
- */
+ if (foc->len == 0) {
+ /* Client requests a cookie. */
+ tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc);
+ } else if (foc->len > 0) {
+ ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc,
+ &valid_foc);
+ if (!ret) {
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ } else {
+ /* Cookie is valid. Create a (full) child socket to
+ * accept the data in SYN before returning a SYN-ACK to
+ * ack the data. If we fail to create the socket, fall
+ * back and ack the ISN only but includes the same
+ * cookie.
+ *
+ * Note: Data-less SYN with valid cookie is allowed to
+ * send data in SYN_RECV state.
+ */
fastopen:
- child = tcp_fastopen_create_child(sk, skb, req);
- if (child) {
- foc->len = -1;
+ child = tcp_fastopen_create_child(sk, skb, req);
+ if (child) {
+ if (ret == 2) {
+ valid_foc.exp = foc->exp;
+ *foc = valid_foc;
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVEALTKEY);
+ } else {
+ foc->len = -1;
+ }
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVE);
+ return child;
+ }
NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENPASSIVE);
- return child;
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
}
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
- } else if (foc->len > 0) /* Client presents an invalid cookie */
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
-
+ }
valid_foc.exp = foc->exp;
*foc = valid_foc;
return NULL;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d95ee40df6c2..c21e8a22fb3b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -119,7 +119,7 @@ void clean_acked_data_enable(struct inet_connection_sock *icsk,
void (*cad)(struct sock *sk, u32 ack_seq))
{
icsk->icsk_clean_acked = cad;
- static_branch_inc(&clean_acked_data_enabled.key);
+ static_branch_deferred_inc(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_enable);
@@ -778,6 +778,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
tp->rtt_seq = tp->snd_nxt;
tp->mdev_max_us = tcp_rto_min_us(sk);
+
+ tcp_bpf_rtt(sk);
}
} else {
/* no previous measure. */
@@ -786,6 +788,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
tp->mdev_max_us = tp->rttvar_us;
tp->rtt_seq = tp->snd_nxt;
+
+ tcp_bpf_rtt(sk);
}
tp->srtt_us = max(1U, srtt);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cfa81190a1b1..d57641cb3477 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -662,8 +662,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
int genhash;
struct sock *sk1 = NULL;
#endif
- struct net *net;
+ u64 transmit_time = 0;
struct sock *ctl_sk;
+ struct net *net;
/* Never send a reset in response to a reset. */
if (th->rst)
@@ -766,14 +767,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
- if (sk)
+ ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+ if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
+ transmit_time = tcp_transmit_time(sk);
+ }
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
- &arg, arg.iov[0].iov_len);
+ &arg, arg.iov[0].iov_len,
+ transmit_time);
ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
@@ -808,6 +812,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
struct net *net = sock_net(sk);
struct ip_reply_arg arg;
struct sock *ctl_sk;
+ u64 transmit_time;
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -858,14 +863,15 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
- if (sk)
- ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+ ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_mark : sk->sk_mark;
+ transmit_time = tcp_transmit_time(sk);
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
- &arg, arg.iov[0].iov_len);
+ &arg, arg.iov[0].iov_len,
+ transmit_time);
ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7c35731816e2..8bcaf2586b68 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -274,7 +274,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
tcptw->tw_ts_offset = tp->tsoffset;
tcptw->tw_last_oow_ack_time = 0;
-
+ tcptw->tw_tx_delay = tp->tcp_tx_delay;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -283,6 +283,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
tw->tw_tclass = np->tclass;
tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
+ tw->tw_txhash = sk->sk_txhash;
tw->tw_ipv6only = sk->sk_ipv6only;
}
#endif
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0ebc33d1c9e5..4af1f5dae9d3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1153,6 +1153,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
sizeof(struct inet6_skb_parm)));
+ tcp_add_tx_delay(skb, tp);
+
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
if (unlikely(err > 0)) {
@@ -2239,6 +2241,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
+ if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
+ tcp_sk(sk)->tcp_tx_delay) {
+ u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
+
+ /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
+ * approximate our needs assuming an ~100% skb->truesize overhead.
+ * USEC_PER_SEC is approximated by 2^20.
+ * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
+ */
+ extra_bytes >>= (20 - 1);
+ limit += extra_bytes;
+ }
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
/* Always send skb if rtx queue is empty.
* No need to wait for TX completion to call us back,
@@ -3217,6 +3231,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
int tcp_header_size;
struct tcphdr *th;
int mss;
+ u64 now;
skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (unlikely(!skb)) {
@@ -3248,13 +3263,14 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
memset(&opts, 0, sizeof(opts));
+ now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
skb->skb_mstamp_ns = cookie_init_timestamp(req);
else
#endif
{
- skb->skb_mstamp_ns = tcp_clock_ns();
+ skb->skb_mstamp_ns = now;
if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
}
@@ -3297,8 +3313,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
rcu_read_unlock();
#endif
- /* Do not fool tcpdump (if any), clean our debris */
- skb->tstamp = 0;
+ skb->skb_mstamp_ns = now;
+ tcp_add_tx_delay(skb, tp);
+
return skb;
}
EXPORT_SYMBOL(tcp_make_synack);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index eed59c847722..c21862ba9c02 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -125,17 +125,6 @@ EXPORT_SYMBOL(udp_memory_allocated);
#define MAX_UDP_PORTS 65536
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
-/* IPCB reference means this can not be used from early demux */
-static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
-{
-#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
- if (!net->ipv4.sysctl_udp_l3mdev_accept &&
- skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
- return true;
-#endif
- return false;
-}
-
static int udp_lib_lport_inuse(struct net *net, __u16 num,
const struct udp_hslot *hslot,
unsigned long *bitmap,
@@ -364,7 +353,7 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
static int compute_score(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned short hnum,
- int dif, int sdif, bool exact_dif)
+ int dif, int sdif)
{
int score;
struct inet_sock *inet;
@@ -420,7 +409,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum,
- int dif, int sdif, bool exact_dif,
+ int dif, int sdif,
struct udp_hslot *hslot2,
struct sk_buff *skb)
{
@@ -432,7 +421,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif);
+ daddr, hnum, dif, sdif);
if (score > badness) {
if (sk->sk_reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
@@ -460,7 +449,6 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2;
struct udp_hslot *hslot2;
- bool exact_dif = udp_lib_exact_dif_match(net, skb);
hash2 = ipv4_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
@@ -468,7 +456,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif, sdif,
- exact_dif, hslot2, skb);
+ hslot2, skb);
if (!result) {
hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
@@ -476,9 +464,9 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
htonl(INADDR_ANY), hnum, dif, sdif,
- exact_dif, hslot2, skb);
+ hslot2, skb);
}
- if (unlikely(IS_ERR(result)))
+ if (IS_ERR(result))
return NULL;
return result;
}
@@ -2236,8 +2224,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
int ret;
if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
- skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
- inet_compute_pseudo);
+ skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);
ret = udp_queue_rcv_skb(sk, skb);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 9763464a75d7..a3908e55ed89 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -208,7 +208,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
gso_skb->destructor = NULL;
segs = skb_segment(gso_skb, features);
- if (unlikely(IS_ERR_OR_NULL(segs))) {
+ if (IS_ERR_OR_NULL(segs)) {
if (copy_dtor)
gso_skb->destructor = sock_wfree;
return segs;
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 80c40b4981bb..f8ed3c3bb928 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -15,46 +15,6 @@
#include <linux/netfilter_ipv4.h>
#include <linux/export.h>
-static int xfrm4_init_flags(struct xfrm_state *x)
-{
- if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)
- x->props.flags |= XFRM_STATE_NOPMTUDISC;
- return 0;
-}
-
-static void
-__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
-{
- const struct flowi4 *fl4 = &fl->u.ip4;
-
- sel->daddr.a4 = fl4->daddr;
- sel->saddr.a4 = fl4->saddr;
- sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
- sel->dport_mask = htons(0xffff);
- sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
- sel->sport_mask = htons(0xffff);
- sel->family = AF_INET;
- sel->prefixlen_d = 32;
- sel->prefixlen_s = 32;
- sel->proto = fl4->flowi4_proto;
- sel->ifindex = fl4->flowi4_oif;
-}
-
-static void
-xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
- const xfrm_address_t *daddr, const xfrm_address_t *saddr)
-{
- x->id = tmpl->id;
- if (x->id.daddr.a4 == 0)
- x->id.daddr.a4 = daddr->a4;
- x->props.saddr = tmpl->saddr;
- if (x->props.saddr.a4 == 0)
- x->props.saddr.a4 = saddr->a4;
- x->props.mode = tmpl->mode;
- x->props.reqid = tmpl->reqid;
- x->props.family = AF_INET;
-}
-
int xfrm4_extract_header(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
@@ -74,11 +34,6 @@ int xfrm4_extract_header(struct sk_buff *skb)
static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.family = AF_INET,
.proto = IPPROTO_IPIP,
- .eth_proto = htons(ETH_P_IP),
- .owner = THIS_MODULE,
- .init_flags = xfrm4_init_flags,
- .init_tempsel = __xfrm4_init_tempsel,
- .init_temprop = xfrm4_init_temprop,
.output = xfrm4_output,
.output_finish = xfrm4_output_finish,
.extract_input = xfrm4_extract_input,
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 5d00e54cd319..dc19aff7c2e0 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -108,8 +108,7 @@ static void __exit ipip_fini(void)
if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET))
pr_info("%s: can't remove xfrm handler for AF_INET\n",
__func__);
- if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&ipip_type, AF_INET);
}
module_init(ipip_init);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 081bb517e40d..521e3203e83a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2417,9 +2417,13 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
goto out;
for_each_fib6_node_rt_rcu(fn) {
- if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex)
+ /* prefix routes only use builtin fib6_nh */
+ if (rt->nh)
continue;
- if (no_gw && rt->fib6_nh.fib_nh_gw_family)
+
+ if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex)
+ continue;
+ if (no_gw && rt->fib6_nh->fib_nh_gw_family)
continue;
if ((rt->fib6_flags & flags) != flags)
continue;
@@ -3123,11 +3127,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (in_dev && (dev->flags & IFF_UP)) {
struct in_ifaddr *ifa;
-
int flag = scope;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
-
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
addr.s6_addr32[3] = ifa->ifa_local;
if (ifa->ifa_scope == RT_SCOPE_LINK)
@@ -6350,16 +6352,17 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
list_for_each_entry(ifa, &idev->addr_list, if_list) {
spin_lock(&ifa->lock);
if (ifa->rt) {
- struct fib6_info *rt = ifa->rt;
+ /* host routes only use builtin fib6_nh */
+ struct fib6_nh *nh = ifa->rt->fib6_nh;
int cpu;
rcu_read_lock();
ifa->rt->dst_nopolicy = val ? true : false;
- if (rt->rt6i_pcpu) {
+ if (nh->rt6i_pcpu) {
for_each_possible_cpu(cpu) {
struct rt6_info **rtp;
- rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu);
+ rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu);
addrconf_set_nopolicy(*rtp, val);
}
}
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 5b1246635e02..783f3c1466da 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -183,6 +183,11 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
return -EAFNOSUPPORT;
}
+static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt)
+{
+ return -EAFNOSUPPORT;
+}
+
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
.ipv6_route_input = eafnosupport_ipv6_route_input,
@@ -192,6 +197,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.fib6_select_path = eafnosupport_fib6_select_path,
.ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
.fib6_nh_init = eafnosupport_fib6_nh_init,
+ .ip6_del_rt = eafnosupport_ip6_del_rt,
};
EXPORT_SYMBOL_GPL(ipv6_stub);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 5352708b7b2d..ef37e0574f54 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -208,7 +208,7 @@ lookup_protocol:
np->mc_loop = 1;
np->mc_all = 1;
np->pmtudisc = IPV6_PMTUDISC_WANT;
- np->repflow = net->ipv6.sysctl.flowlabel_reflect;
+ np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED;
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
/* Init the ipv4 part of the socket since we can have sockets
@@ -564,6 +564,39 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
EXPORT_SYMBOL(inet6_ioctl);
+INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *,
+ size_t));
+int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+
+ if (unlikely(inet_send_prepare(sk)))
+ return -EAGAIN;
+
+ return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udpv6_sendmsg,
+ sk, msg, size);
+}
+
+INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *,
+ size_t, int, int, int *));
+int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+
+ if (likely(!(flags & MSG_ERRQUEUE)))
+ sock_rps_record_flow(sk);
+
+ err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udpv6_recvmsg,
+ sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+
const struct proto_ops inet6_stream_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
@@ -580,8 +613,8 @@ const struct proto_ops inet6_stream_ops = {
.shutdown = inet_shutdown, /* ok */
.setsockopt = sock_common_setsockopt, /* ok */
.getsockopt = sock_common_getsockopt, /* ok */
- .sendmsg = inet_sendmsg, /* ok */
- .recvmsg = inet_recvmsg, /* ok */
+ .sendmsg = inet6_sendmsg, /* retpoline's sake */
+ .recvmsg = inet6_recvmsg, /* retpoline's sake */
#ifdef CONFIG_MMU
.mmap = tcp_mmap,
#endif
@@ -614,8 +647,8 @@ const struct proto_ops inet6_dgram_ops = {
.shutdown = inet_shutdown, /* ok */
.setsockopt = sock_common_setsockopt, /* ok */
.getsockopt = sock_common_getsockopt, /* ok */
- .sendmsg = inet_sendmsg, /* ok */
- .recvmsg = inet_recvmsg, /* ok */
+ .sendmsg = inet6_sendmsg, /* retpoline's sake */
+ .recvmsg = inet6_recvmsg, /* retpoline's sake */
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
.set_peek_off = sk_set_peek_off,
@@ -922,6 +955,9 @@ static const struct ipv6_stub ipv6_stub_impl = {
.ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
.fib6_nh_init = fib6_nh_init,
.fib6_nh_release = fib6_nh_release,
+ .fib6_update_sernum = fib6_update_sernum_stub,
+ .fib6_rt_update = fib6_rt_update,
+ .ip6_del_rt = ip6_del_rt,
.udpv6_encap_enable = udpv6_encap_enable,
.ndisc_send_na = ndisc_send_na,
.nd_tbl = &nd_tbl,
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 68b9e92e469e..25e1172fd1c3 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -793,9 +793,7 @@ static void __exit ah6_fini(void)
if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
-
+ xfrm_unregister_type(&ah6_type, AF_INET6);
}
module_init(ah6_init);
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index ae6a739c5f52..a3b403ba8f8f 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -41,8 +41,6 @@ struct esp_skb_cb {
#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
-static u32 esp6_get_mtu(struct xfrm_state *x, int mtu);
-
/*
* Allocate an AEAD request structure with extra space for SG and IV.
*
@@ -447,7 +445,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
u32 padto;
- padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached));
+ padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
if (skb->len < padto)
esp.tfclen = padto - skb->len;
}
@@ -687,21 +685,6 @@ out:
return ret;
}
-static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
-{
- struct crypto_aead *aead = x->data;
- u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
- unsigned int net_adj;
-
- if (x->props.mode != XFRM_MODE_TUNNEL)
- net_adj = sizeof(struct ipv6hdr);
- else
- net_adj = 0;
-
- return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
- net_adj) & ~(blksize - 1)) + net_adj - 2;
-}
-
static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
@@ -919,7 +902,6 @@ static const struct xfrm_type esp6_type = {
.flags = XFRM_TYPE_REPLAY_PROT,
.init_state = esp6_init_state,
.destructor = esp6_destroy,
- .get_mtu = esp6_get_mtu,
.input = esp6_input,
.output = esp6_output,
.hdr_offset = xfrm6_find_1stfragopt,
@@ -951,8 +933,7 @@ static void __exit esp6_fini(void)
{
if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&esp6_type, AF_INET6);
}
module_init(esp6_init);
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index d0d8528b294a..e31626ffccd1 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -336,9 +336,7 @@ static int __init esp6_offload_init(void)
static void __exit esp6_offload_exit(void)
{
- if (xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type offload\n", __func__);
-
+ xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6);
inet6_del_offload(&esp6_offload, IPPROTO_ESP);
}
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index bcfae13409b5..d22b6c140f23 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -113,14 +113,15 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags);
if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN)
return &rt->dst;
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
if (rt->dst.error != -EAGAIN)
return &rt->dst;
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
}
- dst_hold(&net->ipv6.ip6_null_entry->dst);
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&net->ipv6.ip6_null_entry->dst);
return &net->ipv6.ip6_null_entry->dst;
}
@@ -237,13 +238,14 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
goto out;
}
again:
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
err = -EAGAIN;
rt = NULL;
goto out;
discard_pkt:
- dst_hold(&rt->dst);
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&rt->dst);
out:
res->rt6 = rt;
return err;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 375b4b4f9bf5..62c997201970 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -75,9 +75,9 @@
*
* On SMP we have one ICMP socket per-cpu.
*/
-static inline struct sock *icmpv6_sk(struct net *net)
+static struct sock *icmpv6_sk(struct net *net)
{
- return *this_cpu_ptr(net->ipv6.icmp_sk);
+ return this_cpu_read(*net->ipv6.icmp_sk);
}
static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
@@ -703,6 +703,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY;
memset(&fl6, 0, sizeof(fl6));
+ if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES)
+ fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb));
+
fl6.flowi6_proto = IPPROTO_ICMPV6;
fl6.daddr = ipv6_hdr(skb)->saddr;
if (saddr)
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b2a55f300318..cf60fae9533b 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -174,7 +174,7 @@ struct sock *inet6_lookup_listener(struct net *net,
saddr, sport, &in6addr_any, hnum,
dif, sdif);
done:
- if (unlikely(IS_ERR(result)))
+ if (IS_ERR(result))
return NULL;
return result;
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 9180c8b6f764..49884f96232b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -143,20 +143,19 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
addr[fn_bit >> 5];
}
-struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
{
struct fib6_info *f6i;
+ size_t sz = sizeof(*f6i);
- f6i = kzalloc(sizeof(*f6i), gfp_flags);
+ if (with_fib6_nh)
+ sz += sizeof(struct fib6_nh);
+
+ f6i = kzalloc(sz, gfp_flags);
if (!f6i)
return NULL;
- f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
- if (!f6i->rt6i_pcpu) {
- kfree(f6i);
- return NULL;
- }
-
+ /* fib6_siblings is a union with nh_list, so this initializes both */
INIT_LIST_HEAD(&f6i->fib6_siblings);
refcount_set(&f6i->fib6_ref, 1);
@@ -166,36 +165,15 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
void fib6_info_destroy_rcu(struct rcu_head *head)
{
struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
- struct rt6_exception_bucket *bucket;
WARN_ON(f6i->fib6_node);
- bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
- kfree(bucket);
-
- if (f6i->rt6i_pcpu) {
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct rt6_info **ppcpu_rt;
- struct rt6_info *pcpu_rt;
-
- ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
- pcpu_rt = *ppcpu_rt;
- if (pcpu_rt) {
- dst_dev_put(&pcpu_rt->dst);
- dst_release(&pcpu_rt->dst);
- *ppcpu_rt = NULL;
- }
- }
-
- free_percpu(f6i->rt6i_pcpu);
- }
-
- fib6_nh_release(&f6i->fib6_nh);
+ if (f6i->nh)
+ nexthop_put(f6i->nh);
+ else
+ fib6_nh_release(f6i->fib6_nh);
ip_fib_metrics_put(f6i->fib6_metrics);
-
kfree(f6i);
}
EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
@@ -338,9 +316,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
if (rt->dst.error == -EAGAIN) {
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
rt = net->ipv6.ip6_null_entry;
- dst_hold(&rt->dst);
+ if (!(flags | RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&rt->dst);
}
return &rt->dst;
@@ -389,14 +368,30 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
return call_fib6_notifier(nb, net, event_type, &info.info);
}
-static int call_fib6_entry_notifiers(struct net *net,
- enum fib_event_type event_type,
- struct fib6_info *rt,
- struct netlink_ext_ack *extack)
+int call_fib6_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_entry_notifier_info info = {
+ .info.extack = extack,
+ .rt = rt,
+ };
+
+ rt->fib6_table->fib_seq++;
+ return call_fib6_notifiers(net, event_type, &info.info);
+}
+
+int call_fib6_multipath_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ unsigned int nsiblings,
+ struct netlink_ext_ack *extack)
{
struct fib6_entry_notifier_info info = {
.info.extack = extack,
.rt = rt,
+ .nsiblings = nsiblings,
};
rt->fib6_table->fib_seq++;
@@ -469,12 +464,19 @@ static int fib6_dump_node(struct fib6_walker *w)
struct fib6_info *rt;
for_each_fib6_walker_rt(w) {
- res = rt6_dump_route(rt, w->args);
- if (res < 0) {
+ res = rt6_dump_route(rt, w->args, w->skip_in_node);
+ if (res >= 0) {
/* Frame is full, suspend walking */
w->leaf = rt;
+
+ /* We'll restart from this node, so if some routes were
+ * already dumped, skip them next time.
+ */
+ w->skip_in_node += res;
+
return 1;
}
+ w->skip_in_node = 0;
/* Multipath routes are dumped in one route with the
* RTA_MULTIPATH attribute. Jump 'rt' to point to the
@@ -526,6 +528,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
if (cb->args[4] == 0) {
w->count = 0;
w->skip = 0;
+ w->skip_in_node = 0;
spin_lock_bh(&table->tb6_lock);
res = fib6_walk(net, w);
@@ -541,6 +544,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
w->state = FWS_INIT;
w->node = w->root;
w->skip = w->count;
+ w->skip_in_node = 0;
} else
w->skip = 0;
@@ -558,9 +562,10 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true,
+ .filter.dump_routes = true };
const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
- struct rt6_rtnl_dump_arg arg = {};
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib6_walker *w;
@@ -577,13 +582,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
struct rtmsg *rtm = nlmsg_data(nlh);
- arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX|RTM_F_CLONED);
+ if (rtm->rtm_flags & RTM_F_PREFIX)
+ arg.filter.flags = RTM_F_PREFIX;
}
- /* fib entries are never clones */
- if (arg.filter.flags & RTM_F_CLONED)
- goto out;
-
w = (void *)cb->args[2];
if (!w) {
/* New dump:
@@ -895,16 +897,14 @@ insert_above:
return ln;
}
-static void fib6_drop_pcpu_from(struct fib6_info *f6i,
- const struct fib6_table *table)
+static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
+ const struct fib6_info *match,
+ const struct fib6_table *table)
{
int cpu;
- /* Make sure rt6_make_pcpu_route() wont add other percpu routes
- * while we are cleaning them here.
- */
- f6i->fib6_destroying = 1;
- mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
+ if (!fib6_nh->rt6i_pcpu)
+ return;
/* release the reference to this fib entry from
* all of its cached pcpu routes
@@ -913,9 +913,15 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
struct rt6_info **ppcpu_rt;
struct rt6_info *pcpu_rt;
- ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
pcpu_rt = *ppcpu_rt;
- if (pcpu_rt) {
+
+ /* only dropping the 'from' reference if the cached route
+ * is using 'match'. The cached pcpu_rt->from only changes
+ * from a fib6_info to NULL (ip6_dst_destroy); it can never
+ * change from one fib6_info reference to another
+ */
+ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
struct fib6_info *from;
from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
@@ -924,13 +930,53 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
}
}
+struct fib6_nh_pcpu_arg {
+ struct fib6_info *from;
+ const struct fib6_table *table;
+};
+
+static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_pcpu_arg *arg = _arg;
+
+ __fib6_drop_pcpu_from(nh, arg->from, arg->table);
+ return 0;
+}
+
+static void fib6_drop_pcpu_from(struct fib6_info *f6i,
+ const struct fib6_table *table)
+{
+ /* Make sure rt6_make_pcpu_route() wont add other percpu routes
+ * while we are cleaning them here.
+ */
+ f6i->fib6_destroying = 1;
+ mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
+
+ if (f6i->nh) {
+ struct fib6_nh_pcpu_arg arg = {
+ .from = f6i,
+ .table = table
+ };
+
+ nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from,
+ &arg);
+ } else {
+ struct fib6_nh *fib6_nh;
+
+ fib6_nh = f6i->fib6_nh;
+ __fib6_drop_pcpu_from(fib6_nh, f6i, table);
+ }
+}
+
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
struct net *net)
{
struct fib6_table *table = rt->fib6_table;
- if (rt->rt6i_pcpu)
- fib6_drop_pcpu_from(rt, table);
+ fib6_drop_pcpu_from(rt, table);
+
+ if (rt->nh && !list_empty(&rt->nh_list))
+ list_del_init(&rt->nh_list);
if (refcount_read(&rt->fib6_ref) != 1) {
/* This route is used as dummy address holder in some split
@@ -1101,11 +1147,13 @@ next_iter:
add:
nlflags |= NLM_F_CREATE;
- err = call_fib6_entry_notifiers(info->nl_net,
- FIB_EVENT_ENTRY_ADD,
- rt, extack);
- if (err)
- return err;
+ if (!info->skip_notify_kernel) {
+ err = call_fib6_entry_notifiers(info->nl_net,
+ FIB_EVENT_ENTRY_ADD,
+ rt, extack);
+ if (err)
+ return err;
+ }
rcu_assign_pointer(rt->fib6_next, iter);
fib6_info_hold(rt);
@@ -1130,11 +1178,13 @@ add:
return -ENOENT;
}
- err = call_fib6_entry_notifiers(info->nl_net,
- FIB_EVENT_ENTRY_REPLACE,
- rt, extack);
- if (err)
- return err;
+ if (!info->skip_notify_kernel) {
+ err = call_fib6_entry_notifiers(info->nl_net,
+ FIB_EVENT_ENTRY_REPLACE,
+ rt, extack);
+ if (err)
+ return err;
+ }
fib6_info_hold(rt);
rcu_assign_pointer(rt->fib6_node, fn);
@@ -1218,6 +1268,14 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
}
+/* allow ipv4 to update sernum via ipv6_stub */
+void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
+{
+ spin_lock_bh(&f6i->fib6_table->tb6_lock);
+ fib6_update_sernum_upto_root(net, f6i);
+ spin_unlock_bh(&f6i->fib6_table->tb6_lock);
+}
+
/*
* Add routing information to the routing tree.
* <destination addr>/<source addr>
@@ -1331,6 +1389,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
err = fib6_add_rt2node(fn, rt, info, extack);
if (!err) {
+ if (rt->nh)
+ list_add(&rt->nh_list, &rt->nh->f6i_list);
__fib6_update_sernum_upto_root(rt, sernum);
fib6_start_gc(info->nl_net, rt);
}
@@ -1536,7 +1596,8 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root,
if (plen == fn->fn_bit)
return fn;
- prev = fn;
+ if (fn->fn_flags & RTN_RTINFO)
+ prev = fn;
next:
/*
@@ -1807,9 +1868,11 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
fib6_purge_rt(rt, fn, net);
- call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
+ if (!info->skip_notify_kernel)
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
if (!info->skip_notify)
inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
+
fib6_info_release(rt);
}
@@ -2041,6 +2104,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
c.w.func = fib6_clean_node;
c.w.count = 0;
c.w.skip = 0;
+ c.w.skip_in_node = 0;
c.func = func;
c.sernum = sernum;
c.arg = arg;
@@ -2292,9 +2356,13 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
struct fib6_info *rt = v;
struct ipv6_route_iter *iter = seq->private;
+ struct fib6_nh *fib6_nh = rt->fib6_nh;
unsigned int flags = rt->fib6_flags;
const struct net_device *dev;
+ if (rt->nh)
+ fib6_nh = nexthop_fib6_nh(rt->nh);
+
seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
#ifdef CONFIG_IPV6_SUBTREES
@@ -2302,14 +2370,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
#else
seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
- if (rt->fib6_nh.fib_nh_gw_family) {
+ if (fib6_nh->fib_nh_gw_family) {
flags |= RTF_GATEWAY;
- seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6);
+ seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
} else {
seq_puts(seq, "00000000000000000000000000000000");
}
- dev = rt->fib6_nh.fib_nh_dev;
+ dev = fib6_nh->fib_nh_dev;
seq_printf(seq, " %08x %08x %08x %08x %8s\n",
rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
flags, dev ? dev->name : "");
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 545e339b8c4f..ad284b1fd308 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/pid_namespace.h>
+#include <linux/jump_label_ratelimit.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -53,6 +54,9 @@ static DEFINE_SPINLOCK(ip6_fl_lock);
static DEFINE_SPINLOCK(ip6_sk_fl_lock);
+DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ);
+EXPORT_SYMBOL(ipv6_flowlabel_exclusive);
+
#define for_each_fl_rcu(hash, fl) \
for (fl = rcu_dereference_bh(fl_ht[(hash)]); \
fl != NULL; \
@@ -90,6 +94,13 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label)
return fl;
}
+static bool fl_shared_exclusive(struct ip6_flowlabel *fl)
+{
+ return fl->share == IPV6_FL_S_EXCL ||
+ fl->share == IPV6_FL_S_PROCESS ||
+ fl->share == IPV6_FL_S_USER;
+}
+
static void fl_free_rcu(struct rcu_head *head)
{
struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu);
@@ -103,8 +114,13 @@ static void fl_free_rcu(struct rcu_head *head)
static void fl_free(struct ip6_flowlabel *fl)
{
- if (fl)
- call_rcu(&fl->rcu, fl_free_rcu);
+ if (!fl)
+ return;
+
+ if (fl_shared_exclusive(fl) || fl->opt)
+ static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive);
+
+ call_rcu(&fl->rcu, fl_free_rcu);
}
static void fl_release(struct ip6_flowlabel *fl)
@@ -240,7 +256,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
/* Socket flowlabel lists */
-struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label)
+struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label)
{
struct ipv6_fl_socklist *sfl;
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -260,7 +276,7 @@ struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label)
rcu_read_unlock_bh();
return NULL;
}
-EXPORT_SYMBOL_GPL(fl6_sock_lookup);
+EXPORT_SYMBOL_GPL(__fl6_sock_lookup);
void fl6_free_socklist(struct sock *sk)
{
@@ -419,6 +435,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
}
fl->dst = freq->flr_dst;
atomic_set(&fl->users, 1);
+ if (fl_shared_exclusive(fl) || fl->opt)
+ static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
switch (fl->share) {
case IPV6_FL_S_EXCL:
case IPV6_FL_S_ANY:
@@ -854,6 +872,7 @@ int ip6_flowlabel_init(void)
void ip6_flowlabel_cleanup(void)
{
+ static_key_deferred_flush(&ipv6_flowlabel_exclusive);
del_timer(&ip6_fl_gc_timer);
unregister_pernet_subsys(&ip6_flowlabel_net_ops);
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 21efcd02f337..8e49fd62eea9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -124,16 +124,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
return -EINVAL;
}
-static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- int ret;
-
- ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
- kfree_skb(skb);
- return ret;
- }
-
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
@@ -150,6 +142,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
return ip6_finish_output2(net, sk, skb);
}
+static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ return __ip6_finish_output(net, sk, skb);
+ case NET_XMIT_CN:
+ return __ip6_finish_output(net, sk, skb) ? : ret;
+ default:
+ kfree_skb(skb);
+ return ret;
+ }
+}
+
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
@@ -588,6 +596,169 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}
+int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
+ u8 nexthdr, __be32 frag_id,
+ struct ip6_fraglist_iter *iter)
+{
+ unsigned int first_len;
+ struct frag_hdr *fh;
+
+ /* BUILD HEADER */
+ *prevhdr = NEXTHDR_FRAGMENT;
+ iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
+ if (!iter->tmp_hdr)
+ return -ENOMEM;
+
+ iter->frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+
+ iter->offset = 0;
+ iter->hlen = hlen;
+ iter->frag_id = frag_id;
+ iter->nexthdr = nexthdr;
+
+ __skb_pull(skb, hlen);
+ fh = __skb_push(skb, sizeof(struct frag_hdr));
+ __skb_push(skb, hlen);
+ skb_reset_network_header(skb);
+ memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
+
+ fh->nexthdr = nexthdr;
+ fh->reserved = 0;
+ fh->frag_off = htons(IP6_MF);
+ fh->identification = frag_id;
+
+ first_len = skb_pagelen(skb);
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
+
+ return 0;
+}
+EXPORT_SYMBOL(ip6_fraglist_init);
+
+void ip6_fraglist_prepare(struct sk_buff *skb,
+ struct ip6_fraglist_iter *iter)
+{
+ struct sk_buff *frag = iter->frag;
+ unsigned int hlen = iter->hlen;
+ struct frag_hdr *fh;
+
+ frag->ip_summed = CHECKSUM_NONE;
+ skb_reset_transport_header(frag);
+ fh = __skb_push(frag, sizeof(struct frag_hdr));
+ __skb_push(frag, hlen);
+ skb_reset_network_header(frag);
+ memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
+ iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
+ fh->nexthdr = iter->nexthdr;
+ fh->reserved = 0;
+ fh->frag_off = htons(iter->offset);
+ if (frag->next)
+ fh->frag_off |= htons(IP6_MF);
+ fh->identification = iter->frag_id;
+ ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
+ ip6_copy_metadata(frag, skb);
+}
+EXPORT_SYMBOL(ip6_fraglist_prepare);
+
+void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
+ unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
+ u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
+{
+ state->prevhdr = prevhdr;
+ state->nexthdr = nexthdr;
+ state->frag_id = frag_id;
+
+ state->hlen = hlen;
+ state->mtu = mtu;
+
+ state->left = skb->len - hlen; /* Space per frame */
+ state->ptr = hlen; /* Where to start from */
+
+ state->hroom = hdr_room;
+ state->troom = needed_tailroom;
+
+ state->offset = 0;
+}
+EXPORT_SYMBOL(ip6_frag_init);
+
+struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
+{
+ u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
+ struct sk_buff *frag;
+ struct frag_hdr *fh;
+ unsigned int len;
+
+ len = state->left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > state->mtu)
+ len = state->mtu;
+ /* IF: we are not sending up to and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < state->left)
+ len &= ~7;
+
+ /* Allocate buffer */
+ frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
+ state->hroom + state->troom, GFP_ATOMIC);
+ if (!frag)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Set up data on packet
+ */
+
+ ip6_copy_metadata(frag, skb);
+ skb_reserve(frag, state->hroom);
+ skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
+ skb_reset_network_header(frag);
+ fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
+ frag->transport_header = (frag->network_header + state->hlen +
+ sizeof(struct frag_hdr));
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+ if (skb->sk)
+ skb_set_owner_w(frag, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+ skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
+
+ fragnexthdr_offset = skb_network_header(frag);
+ fragnexthdr_offset += prevhdr - skb_network_header(skb);
+ *fragnexthdr_offset = NEXTHDR_FRAGMENT;
+
+ /*
+ * Build fragment header.
+ */
+ fh->nexthdr = state->nexthdr;
+ fh->reserved = 0;
+ fh->identification = state->frag_id;
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
+ len));
+ state->left -= len;
+
+ fh->frag_off = htons(state->offset);
+ if (state->left > 0)
+ fh->frag_off |= htons(IP6_MF);
+ ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
+
+ state->ptr += len;
+ state->offset += len;
+
+ return frag;
+}
+EXPORT_SYMBOL(ip6_frag_next);
+
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
int (*output)(struct net *, struct sock *, struct sk_buff *))
{
@@ -595,12 +766,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
inet6_sk(skb->sk) : NULL;
- struct ipv6hdr *tmp_hdr;
- struct frag_hdr *fh;
- unsigned int mtu, hlen, left, len, nexthdr_offset;
- int hroom, troom;
+ struct ip6_frag_state state;
+ unsigned int mtu, hlen, nexthdr_offset;
+ int hroom, err = 0;
__be32 frag_id;
- int ptr, offset = 0, err = 0;
u8 *prevhdr, nexthdr = 0;
err = ip6_find_1stfragopt(skb, &prevhdr);
@@ -647,6 +816,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
hroom = LL_RESERVED_SPACE(rt->dst.dev);
if (skb_has_frag_list(skb)) {
unsigned int first_len = skb_pagelen(skb);
+ struct ip6_fraglist_iter iter;
struct sk_buff *frag2;
if (first_len - hlen > mtu ||
@@ -674,74 +844,29 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
skb->truesize -= frag->truesize;
}
- err = 0;
- offset = 0;
- /* BUILD HEADER */
-
- *prevhdr = NEXTHDR_FRAGMENT;
- tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
- if (!tmp_hdr) {
- err = -ENOMEM;
+ err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+ &iter);
+ if (err < 0)
goto fail;
- }
- frag = skb_shinfo(skb)->frag_list;
- skb_frag_list_init(skb);
-
- __skb_pull(skb, hlen);
- fh = __skb_push(skb, sizeof(struct frag_hdr));
- __skb_push(skb, hlen);
- skb_reset_network_header(skb);
- memcpy(skb_network_header(skb), tmp_hdr, hlen);
-
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->frag_off = htons(IP6_MF);
- fh->identification = frag_id;
-
- first_len = skb_pagelen(skb);
- skb->data_len = first_len - skb_headlen(skb);
- skb->len = first_len;
- ipv6_hdr(skb)->payload_len = htons(first_len -
- sizeof(struct ipv6hdr));
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
- if (frag) {
- frag->ip_summed = CHECKSUM_NONE;
- skb_reset_transport_header(frag);
- fh = __skb_push(frag, sizeof(struct frag_hdr));
- __skb_push(frag, hlen);
- skb_reset_network_header(frag);
- memcpy(skb_network_header(frag), tmp_hdr,
- hlen);
- offset += skb->len - hlen - sizeof(struct frag_hdr);
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->frag_off = htons(offset);
- if (frag->next)
- fh->frag_off |= htons(IP6_MF);
- fh->identification = frag_id;
- ipv6_hdr(frag)->payload_len =
- htons(frag->len -
- sizeof(struct ipv6hdr));
- ip6_copy_metadata(frag, skb);
- }
+ if (iter.frag)
+ ip6_fraglist_prepare(skb, &iter);
err = output(net, sk, skb);
if (!err)
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGCREATES);
- if (err || !frag)
+ if (err || !iter.frag)
break;
- skb = frag;
- frag = skb->next;
- skb_mark_not_on_list(skb);
+ skb = ip6_fraglist_next(&iter);
}
- kfree(tmp_hdr);
+ kfree(iter.tmp_hdr);
if (err == 0) {
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
@@ -749,7 +874,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
return 0;
}
- kfree_skb_list(frag);
+ kfree_skb_list(iter.frag);
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGFAILS);
@@ -766,91 +891,26 @@ slow_path_clean:
}
slow_path:
- left = skb->len - hlen; /* Space per frame */
- ptr = hlen; /* Where to start from */
-
/*
* Fragment the datagram.
*/
- troom = rt->dst.dev->needed_tailroom;
+ ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
+ LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
+ &state);
/*
* Keep copying data until we run out.
*/
- while (left > 0) {
- u8 *fragnexthdr_offset;
-
- len = left;
- /* IF: it doesn't fit, use 'mtu' - the data space left */
- if (len > mtu)
- len = mtu;
- /* IF: we are not sending up to and including the packet end
- then align the next start on an eight byte boundary */
- if (len < left) {
- len &= ~7;
- }
- /* Allocate buffer */
- frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
- hroom + troom, GFP_ATOMIC);
- if (!frag) {
- err = -ENOMEM;
+ while (state.left > 0) {
+ frag = ip6_frag_next(skb, &state);
+ if (IS_ERR(frag)) {
+ err = PTR_ERR(frag);
goto fail;
}
/*
- * Set up data on packet
- */
-
- ip6_copy_metadata(frag, skb);
- skb_reserve(frag, hroom);
- skb_put(frag, len + hlen + sizeof(struct frag_hdr));
- skb_reset_network_header(frag);
- fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
- frag->transport_header = (frag->network_header + hlen +
- sizeof(struct frag_hdr));
-
- /*
- * Charge the memory for the fragment to any owner
- * it might possess
- */
- if (skb->sk)
- skb_set_owner_w(frag, skb->sk);
-
- /*
- * Copy the packet header into the new buffer.
- */
- skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
-
- fragnexthdr_offset = skb_network_header(frag);
- fragnexthdr_offset += prevhdr - skb_network_header(skb);
- *fragnexthdr_offset = NEXTHDR_FRAGMENT;
-
- /*
- * Build fragment header.
- */
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->identification = frag_id;
-
- /*
- * Copy a block of the IP datagram.
- */
- BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
- len));
- left -= len;
-
- fh->frag_off = htons(offset);
- if (left > 0)
- fh->frag_off |= htons(IP6_MF);
- ipv6_hdr(frag)->payload_len = htons(frag->len -
- sizeof(struct ipv6hdr));
-
- ptr += len;
- offset += len;
-
- /*
* Put this fragment into the sending queue.
*/
err = output(net, sk, frag);
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 51fd33294c7c..3752bd3e92ce 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -206,8 +206,7 @@ static void __exit ipcomp6_fini(void)
{
if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&ipcomp6_type, AF_INET6);
}
module_init(ipcomp6_init);
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index 91801432878c..878fcec14949 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -499,10 +499,8 @@ static void __exit mip6_fini(void)
{
if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0)
pr_info("%s: can't remove rawv6 mh filter\n", __func__);
- if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type(rthdr)\n", __func__);
- if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type(destopt)\n", __func__);
+ xfrm_unregister_type(&mip6_rthdr_type, AF_INET6);
+ xfrm_unregister_type(&mip6_destopt_type, AF_INET6);
}
module_init(mip6_init);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 09dd2edfb868..083cc1c94cd3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1285,12 +1285,11 @@ static void ndisc_router_discovery(struct sk_buff *skb)
!in6_dev->cnf.accept_ra_rtr_pref)
pref = ICMPV6_ROUTER_PREF_MEDIUM;
#endif
-
+ /* routes added from RAs do not use nexthop objects */
rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
-
if (rt) {
- neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6,
- rt->fib6_nh.fib_nh_dev, NULL,
+ neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
+ rt->fib6_nh->fib_nh_dev, NULL,
&ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
@@ -1319,8 +1318,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
- neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6,
- rt->fib6_nh.fib_nh_dev, NULL,
+ neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
+ rt->fib6_nh->fib_nh_dev, NULL,
&ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 1240ccd57f39..61819ed858b1 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -16,6 +16,9 @@
#include <net/ip6_route.h>
#include <net/xfrm.h>
#include <net/netfilter/nf_queue.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include "../bridge/br_private.h"
int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
{
@@ -109,16 +112,142 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst,
}
EXPORT_SYMBOL_GPL(__nf_ip6_route);
+int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ struct nf_ct_bridge_frag_data *data,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_ct_bridge_frag_data *data,
+ struct sk_buff *))
+{
+ int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+ struct ip6_frag_state state;
+ u8 *prevhdr, nexthdr = 0;
+ unsigned int mtu, hlen;
+ int hroom, err = 0;
+ __be32 frag_id;
+
+ err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0)
+ goto blackhole;
+ hlen = err;
+ nexthdr = *prevhdr;
+
+ mtu = skb->dev->mtu;
+ if (frag_max_size > mtu ||
+ frag_max_size < IPV6_MIN_MTU)
+ goto blackhole;
+
+ mtu = frag_max_size;
+ if (mtu < hlen + sizeof(struct frag_hdr) + 8)
+ goto blackhole;
+ mtu -= hlen + sizeof(struct frag_hdr);
+
+ frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
+ &ipv6_hdr(skb)->saddr);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto blackhole;
+
+ hroom = LL_RESERVED_SPACE(skb->dev);
+ if (skb_has_frag_list(skb)) {
+ unsigned int first_len = skb_pagelen(skb);
+ struct ip6_fraglist_iter iter;
+ struct sk_buff *frag2;
+
+ if (first_len - hlen > mtu ||
+ skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
+ goto blackhole;
+
+ if (skb_cloned(skb))
+ goto slow_path;
+
+ skb_walk_frags(skb, frag2) {
+ if (frag2->len > mtu ||
+ skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr)))
+ goto blackhole;
+
+ /* Partially cloned skb? */
+ if (skb_shared(frag2))
+ goto slow_path;
+ }
+
+ err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+ &iter);
+ if (err < 0)
+ goto blackhole;
+
+ for (;;) {
+ /* Prepare header of the next frame,
+ * before previous one went down.
+ */
+ if (iter.frag)
+ ip6_fraglist_prepare(skb, &iter);
+
+ err = output(net, sk, data, skb);
+ if (err || !iter.frag)
+ break;
+
+ skb = ip6_fraglist_next(&iter);
+ }
+
+ kfree(iter.tmp_hdr);
+ if (!err)
+ return 0;
+
+ kfree_skb_list(iter.frag);
+ return err;
+ }
+slow_path:
+ /* This is a linearized skbuff, the original geometry is lost for us.
+ * This may also be a clone skbuff, we could preserve the geometry for
+ * the copies but probably not worth the effort.
+ */
+ ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom,
+ LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id,
+ &state);
+
+ while (state.left > 0) {
+ struct sk_buff *skb2;
+
+ skb2 = ip6_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
+ goto blackhole;
+ }
+
+ err = output(net, sk, data, skb2);
+ if (err)
+ goto blackhole;
+ }
+ consume_skb(skb);
+ return err;
+
+blackhole:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_ip6_fragment);
+
static const struct nf_ipv6_ops ipv6ops = {
#if IS_MODULE(CONFIG_IPV6)
.chk_addr = ipv6_chk_addr,
.route_me_harder = ip6_route_me_harder,
.dev_get_saddr = ipv6_dev_get_saddr,
.route = __nf_ip6_route,
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
+ .cookie_init_sequence = __cookie_v6_init_sequence,
+ .cookie_v6_check = __cookie_v6_check,
+#endif
#endif
.route_input = ip6_route_input,
.fragment = ip6_fragment,
.reroute = nf_ip6_reroute,
+#if IS_MODULE(CONFIG_IPV6) && IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ .br_defrag = nf_ct_frag6_gather,
+#endif
+#if IS_MODULE(CONFIG_IPV6)
+ .br_fragment = br_ip6_fragment,
+#endif
};
int __init ipv6_netfilter_init(void)
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 41325d517478..e77ea1ed5edd 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -3,272 +3,11 @@
* Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
*/
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/ip6_checksum.h>
-#include <net/ip6_route.h>
-#include <net/tcp.h>
-
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_SYNPROXY.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_synproxy.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-
-static struct ipv6hdr *
-synproxy_build_ip(struct net *net, struct sk_buff *skb,
- const struct in6_addr *saddr,
- const struct in6_addr *daddr)
-{
- struct ipv6hdr *iph;
-
- skb_reset_network_header(skb);
- iph = skb_put(skb, sizeof(*iph));
- ip6_flow_hdr(iph, 0, 0);
- iph->hop_limit = net->ipv6.devconf_all->hop_limit;
- iph->nexthdr = IPPROTO_TCP;
- iph->saddr = *saddr;
- iph->daddr = *daddr;
-
- return iph;
-}
-
-static void
-synproxy_send_tcp(struct net *net,
- const struct sk_buff *skb, struct sk_buff *nskb,
- struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
- struct ipv6hdr *niph, struct tcphdr *nth,
- unsigned int tcp_hdr_size)
-{
- struct dst_entry *dst;
- struct flowi6 fl6;
-
- nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0);
- nskb->ip_summed = CHECKSUM_PARTIAL;
- nskb->csum_start = (unsigned char *)nth - nskb->head;
- nskb->csum_offset = offsetof(struct tcphdr, check);
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_TCP;
- fl6.saddr = niph->saddr;
- fl6.daddr = niph->daddr;
- fl6.fl6_sport = nth->source;
- fl6.fl6_dport = nth->dest;
- security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6));
- dst = ip6_route_output(net, NULL, &fl6);
- if (dst->error) {
- dst_release(dst);
- goto free_nskb;
- }
- dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
- if (IS_ERR(dst))
- goto free_nskb;
-
- skb_dst_set(nskb, dst);
-
- if (nfct) {
- nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
- nf_conntrack_get(nfct);
- }
-
- ip6_local_out(net, nskb->sk, nskb);
- return;
-
-free_nskb:
- kfree_skb(nskb);
-}
-
-static void
-synproxy_send_client_synack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
- u16 mss = opts->mss;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE;
- nth->doff = tcp_hdr_size / 4;
- nth->window = 0;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-static void
-synproxy_send_server_syn(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(recv_seq - 1);
- /* ack_seq is used to relay our ISN to the synproxy hook to initialize
- * sequence number translation once a connection tracking entry exists.
- */
- nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
- nth->doff = tcp_hdr_size / 4;
- nth->window = th->window;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
- niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_ack(struct net *net,
- const struct ip_ct_tcp *state,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(ntohl(th->ack_seq));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(ntohl(th->seq) + 1);
- nth->ack_seq = th->ack_seq;
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(ntohs(th->window) >> opts->wscale);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static bool
-synproxy_recv_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- int mss;
-
- mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
- if (mss == 0) {
- this_cpu_inc(snet->stats->cookie_invalid);
- return false;
- }
-
- this_cpu_inc(snet->stats->cookie_valid);
- opts->mss = mss;
- opts->options |= XT_SYNPROXY_OPT_MSS;
-
- if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy_check_timestamp_cookie(opts);
-
- synproxy_send_server_syn(net, skb, th, opts, recv_seq);
- return true;
-}
+#include <net/netfilter/nf_synproxy.h>
static unsigned int
synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
@@ -304,13 +43,14 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_SACK_PERM |
XT_SYNPROXY_OPT_ECN);
- synproxy_send_client_synack(net, skb, th, &opts);
+ synproxy_send_client_synack_ipv6(net, skb, th, &opts);
consume_skb(skb);
return NF_STOLEN;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
/* ACK from client */
- if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) {
+ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts,
+ ntohl(th->seq))) {
consume_skb(skb);
return NF_STOLEN;
} else {
@@ -321,141 +61,6 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv6_synproxy_hook(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *nhs)
-{
- struct net *net = nhs->net;
- struct synproxy_net *snet = synproxy_pernet(net);
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
- struct nf_conn_synproxy *synproxy;
- struct synproxy_options opts = {};
- const struct ip_ct_tcp *state;
- struct tcphdr *th, _th;
- __be16 frag_off;
- u8 nexthdr;
- int thoff;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL)
- return NF_ACCEPT;
-
- synproxy = nfct_synproxy(ct);
- if (synproxy == NULL)
- return NF_ACCEPT;
-
- if (nf_is_loopback_packet(skb))
- return NF_ACCEPT;
-
- nexthdr = ipv6_hdr(skb)->nexthdr;
- thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
- &frag_off);
- if (thoff < 0 || nexthdr != IPPROTO_TCP)
- return NF_ACCEPT;
-
- th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
- if (th == NULL)
- return NF_DROP;
-
- state = &ct->proto.tcp;
- switch (state->state) {
- case TCP_CONNTRACK_CLOSE:
- if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
- ntohl(th->seq) + 1);
- break;
- }
-
- if (!th->syn || th->ack ||
- CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- break;
-
- /* Reopened connection - reset the sequence number and timestamp
- * adjustments, they will get initialized once the connection is
- * reestablished.
- */
- nf_ct_seqadj_init(ct, ctinfo, 0);
- synproxy->tsoff = 0;
- this_cpu_inc(snet->stats->conn_reopened);
-
- /* fall through */
- case TCP_CONNTRACK_SYN_SENT:
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (!th->syn && th->ack &&
- CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
- /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
- * therefore we need to add 1 to make the SYN sequence
- * number match the one of first SYN.
- */
- if (synproxy_recv_client_ack(net, skb, th, &opts,
- ntohl(th->seq) + 1)) {
- this_cpu_inc(snet->stats->cookie_retrans);
- consume_skb(skb);
- return NF_STOLEN;
- } else {
- return NF_DROP;
- }
- }
-
- synproxy->isn = ntohl(th->ack_seq);
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy->its = opts.tsecr;
-
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- break;
- case TCP_CONNTRACK_SYN_RECV:
- if (!th->syn || !th->ack)
- break;
-
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) {
- synproxy->tsoff = opts.tsval - synproxy->its;
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- }
-
- opts.options &= ~(XT_SYNPROXY_OPT_MSS |
- XT_SYNPROXY_OPT_WSCALE |
- XT_SYNPROXY_OPT_SACK_PERM);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_server_ack(net, state, skb, th, &opts);
-
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
- nf_conntrack_event_cache(IPCT_SEQADJ, ct);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_client_ack(net, skb, th, &opts);
-
- consume_skb(skb);
- return NF_STOLEN;
- default:
- break;
- }
-
- synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
- return NF_ACCEPT;
-}
-
-static const struct nf_hook_ops ipv6_synproxy_ops[] = {
- {
- .hook = ipv6_synproxy_hook,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
- {
- .hook = ipv6_synproxy_hook,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
-};
-
static int synproxy_tg6_check(const struct xt_tgchk_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
@@ -471,16 +76,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par)
if (err)
return err;
- if (snet->hook_ref6 == 0) {
- err = nf_register_net_hooks(par->net, ipv6_synproxy_ops,
- ARRAY_SIZE(ipv6_synproxy_ops));
- if (err) {
- nf_ct_netns_put(par->net, par->family);
- return err;
- }
+ err = nf_synproxy_ipv6_init(snet, par->net);
+ if (err) {
+ nf_ct_netns_put(par->net, par->family);
+ return err;
}
- snet->hook_ref6++;
return err;
}
@@ -488,10 +89,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
- snet->hook_ref6--;
- if (snet->hook_ref6 == 0)
- nf_unregister_net_hooks(par->net, ipv6_synproxy_ops,
- ARRAY_SIZE(ipv6_synproxy_ops));
+ nf_synproxy_ipv6_fini(snet, par->net);
nf_ct_netns_put(par->net, par->family);
}
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 3f7d4691c423..a22100b1cf2c 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -2,7 +2,7 @@
/*
* IPv6 raw table, a port of the IPv4 raw table to IPv6
*
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 84322ce81d70..398e1df41406 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -54,26 +54,21 @@ static struct inet_frags nf_frags;
static struct ctl_table nf_ct_frag6_sysctl_table[] = {
{
.procname = "nf_conntrack_frag6_timeout",
- .data = &init_net.nf_frag.frags.timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_frag6_low_thresh",
- .data = &init_net.nf_frag.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.nf_frag.frags.high_thresh
},
{
.procname = "nf_conntrack_frag6_high_thresh",
- .data = &init_net.nf_frag.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.nf_frag.frags.low_thresh
},
{ }
};
@@ -89,15 +84,15 @@ static int nf_ct_frag6_sysctl_register(struct net *net)
GFP_KERNEL);
if (table == NULL)
goto err_alloc;
-
- table[0].data = &net->nf_frag.frags.timeout;
- table[1].data = &net->nf_frag.frags.low_thresh;
- table[1].extra2 = &net->nf_frag.frags.high_thresh;
- table[2].data = &net->nf_frag.frags.high_thresh;
- table[2].extra1 = &net->nf_frag.frags.low_thresh;
- table[2].extra2 = &init_net.nf_frag.frags.high_thresh;
}
+ table[0].data = &net->nf_frag.fqdir->timeout;
+ table[1].data = &net->nf_frag.fqdir->low_thresh;
+ table[1].extra2 = &net->nf_frag.fqdir->high_thresh;
+ table[2].data = &net->nf_frag.fqdir->high_thresh;
+ table[2].extra1 = &net->nf_frag.fqdir->low_thresh;
+ table[2].extra2 = &init_net.nf_frag.fqdir->high_thresh;
+
hdr = register_net_sysctl(net, "net/netfilter", table);
if (hdr == NULL)
goto err_reg;
@@ -144,12 +139,10 @@ static void nf_ct_frag6_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
struct frag_queue *fq;
- struct net *net;
fq = container_of(frag, struct frag_queue, q);
- net = container_of(fq->q.net, struct net, nf_frag.frags);
- ip6frag_expire_frag_queue(net, fq);
+ ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
}
/* Creation primitives. */
@@ -165,7 +158,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
};
struct inet_frag_queue *q;
- q = inet_frag_find(&net->nf_frag.frags, &key);
+ q = inet_frag_find(net->nf_frag.fqdir, &key);
if (!q)
return NULL;
@@ -278,7 +271,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
fq->ecn |= ecn;
if (payload_len > fq->q.max_size)
fq->q.max_size = payload_len;
- add_frag_mem_limit(fq->q.net, skb->truesize);
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
/* The first fragment.
* nhoffset is obtained from the first fragment, of course.
@@ -494,29 +487,35 @@ static int nf_ct_net_init(struct net *net)
{
int res;
- net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
- net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
- net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
- net->nf_frag.frags.f = &nf_frags;
-
- res = inet_frags_init_net(&net->nf_frag.frags);
+ res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net);
if (res < 0)
return res;
+
+ net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
res = nf_ct_frag6_sysctl_register(net);
if (res < 0)
- inet_frags_exit_net(&net->nf_frag.frags);
+ fqdir_exit(net->nf_frag.fqdir);
return res;
}
+static void nf_ct_net_pre_exit(struct net *net)
+{
+ fqdir_pre_exit(net->nf_frag.fqdir);
+}
+
static void nf_ct_net_exit(struct net *net)
{
nf_ct_frags6_sysctl_unregister(net);
- inet_frags_exit_net(&net->nf_frag.frags);
+ fqdir_exit(net->nf_frag.fqdir);
}
static struct pernet_operations nf_ct_net_ops = {
- .init = nf_ct_net_init,
- .exit = nf_ct_net_exit,
+ .init = nf_ct_net_init,
+ .pre_exit = nf_ct_net_pre_exit,
+ .exit = nf_ct_net_exit,
};
static const struct rhashtable_params nfct_rhash_params = {
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 4a8da679866e..bbff3e02e302 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -44,8 +44,8 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "RAW6: inuse %d\n",
sock_prot_inuse_get(net, &rawv6_prot));
seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
- atomic_read(&net->ipv6.frags.rhashtable.nelems),
- frag_mem_limit(&net->ipv6.frags));
+ atomic_read(&net->ipv6.fqdir->rhashtable.nelems),
+ frag_mem_limit(net->ipv6.fqdir));
return 0;
}
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 70693bc7ad9d..8a6131991e38 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -834,7 +834,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
}
@@ -876,7 +876,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
if (!(opt->opt_nflen|opt->opt_flen))
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index b2b2c0c38b87..ca05b16f1bb9 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -72,12 +72,10 @@ static void ip6_frag_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
struct frag_queue *fq;
- struct net *net;
fq = container_of(frag, struct frag_queue, q);
- net = container_of(fq->q.net, struct net, ipv6.frags);
- ip6frag_expire_frag_queue(net, fq);
+ ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
}
static struct frag_queue *
@@ -96,7 +94,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
IPV6_ADDR_LINKLOCAL)))
key.iif = 0;
- q = inet_frag_find(&net->ipv6.frags, &key);
+ q = inet_frag_find(net->ipv6.fqdir, &key);
if (!q)
return NULL;
@@ -196,7 +194,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
fq->q.stamp = skb->tstamp;
fq->q.meat += skb->len;
fq->ecn |= ecn;
- add_frag_mem_limit(fq->q.net, skb->truesize);
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
fragsize = -skb_network_offset(skb) + skb->len;
if (fragsize > fq->q.max_size)
@@ -250,7 +248,7 @@ err:
static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
struct sk_buff *prev_tail, struct net_device *dev)
{
- struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
+ struct net *net = fq->q.fqdir->net;
unsigned int nhoff;
void *reasm_data;
int payload_len;
@@ -397,23 +395,18 @@ static const struct inet6_protocol frag_protocol = {
static struct ctl_table ip6_frags_ns_ctl_table[] = {
{
.procname = "ip6frag_high_thresh",
- .data = &init_net.ipv6.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.ipv6.frags.low_thresh
},
{
.procname = "ip6frag_low_thresh",
- .data = &init_net.ipv6.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.ipv6.frags.high_thresh
},
{
.procname = "ip6frag_time",
- .data = &init_net.ipv6.frags.timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -445,12 +438,12 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
if (!table)
goto err_alloc;
- table[0].data = &net->ipv6.frags.high_thresh;
- table[0].extra1 = &net->ipv6.frags.low_thresh;
- table[1].data = &net->ipv6.frags.low_thresh;
- table[1].extra2 = &net->ipv6.frags.high_thresh;
- table[2].data = &net->ipv6.frags.timeout;
}
+ table[0].data = &net->ipv6.fqdir->high_thresh;
+ table[0].extra1 = &net->ipv6.fqdir->low_thresh;
+ table[1].data = &net->ipv6.fqdir->low_thresh;
+ table[1].extra2 = &net->ipv6.fqdir->high_thresh;
+ table[2].data = &net->ipv6.fqdir->timeout;
hdr = register_net_sysctl(net, "net/ipv6", table);
if (!hdr)
@@ -513,30 +506,35 @@ static int __net_init ipv6_frags_init_net(struct net *net)
{
int res;
- net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
- net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
- net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
- net->ipv6.frags.f = &ip6_frags;
-
- res = inet_frags_init_net(&net->ipv6.frags);
+ res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net);
if (res < 0)
return res;
+ net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
res = ip6_frags_ns_sysctl_register(net);
if (res < 0)
- inet_frags_exit_net(&net->ipv6.frags);
+ fqdir_exit(net->ipv6.fqdir);
return res;
}
+static void __net_exit ipv6_frags_pre_exit_net(struct net *net)
+{
+ fqdir_pre_exit(net->ipv6.fqdir);
+}
+
static void __net_exit ipv6_frags_exit_net(struct net *net)
{
ip6_frags_ns_sysctl_unregister(net);
- inet_frags_exit_net(&net->ipv6.frags);
+ fqdir_exit(net->ipv6.fqdir);
}
static struct pernet_operations ip6_frags_ops = {
- .init = ipv6_frags_init_net,
- .exit = ipv6_frags_exit_net,
+ .init = ipv6_frags_init_net,
+ .pre_exit = ipv6_frags_pre_exit_net,
+ .exit = ipv6_frags_exit_net,
};
static const struct rhashtable_params ip6_rhash_params = {
@@ -587,8 +585,8 @@ err_protocol:
void ipv6_frag_exit(void)
{
- inet_frags_fini(&ip6_frags);
ip6_frags_sysctl_unregister();
unregister_pernet_subsys(&ip6_frags_ops);
inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+ inet_frags_fini(&ip6_frags);
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 97a843cf164c..4d2e6b31a8d6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -100,7 +100,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
int strict);
-static size_t rt6_nlmsg_size(struct fib6_info *rt);
+static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
struct fib6_info *rt, struct dst_entry *dst,
struct in6_addr *dest, struct in6_addr *src,
@@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
}
if (rt_dev == dev) {
- rt->dst.dev = loopback_dev;
+ rt->dst.dev = blackhole_netdev;
dev_hold(rt->dst.dev);
dev_put(rt_dev);
}
@@ -429,21 +429,27 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
struct fib6_info *sibling, *next_sibling;
struct fib6_info *match = res->f6i;
- if (!match->fib6_nsiblings || have_oif_match)
+ if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
goto out;
/* We might have already computed the hash for ICMPv6 errors. In such
* case it will always be non-zero. Otherwise now is the time to do it.
*/
- if (!fl6->mp_hash)
+ if (!fl6->mp_hash &&
+ (!match->nh || nexthop_is_multipath(match->nh)))
fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
- if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
+ if (unlikely(match->nh)) {
+ nexthop_path_fib6_result(res, fl6->mp_hash);
+ return;
+ }
+
+ if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
goto out;
list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
fib6_siblings) {
- const struct fib6_nh *nh = &sibling->fib6_nh;
+ const struct fib6_nh *nh = sibling->fib6_nh;
int nh_upper_bound;
nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
@@ -457,7 +463,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
out:
res->f6i = match;
- res->nh = &match->fib6_nh;
+ res->nh = match->fib6_nh;
}
/*
@@ -485,6 +491,45 @@ static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
return false;
}
+struct fib6_nh_dm_arg {
+ struct net *net;
+ const struct in6_addr *saddr;
+ int oif;
+ int flags;
+ struct fib6_nh *nh;
+};
+
+static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_dm_arg *arg = _arg;
+
+ arg->nh = nh;
+ return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
+ arg->flags);
+}
+
+/* returns fib6_nh from nexthop or NULL */
+static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
+ struct fib6_result *res,
+ const struct in6_addr *saddr,
+ int oif, int flags)
+{
+ struct fib6_nh_dm_arg arg = {
+ .net = net,
+ .saddr = saddr,
+ .oif = oif,
+ .flags = flags,
+ };
+
+ if (nexthop_is_blackhole(nh))
+ return NULL;
+
+ if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
+ return arg.nh;
+
+ return NULL;
+}
+
static void rt6_device_match(struct net *net, struct fib6_result *res,
const struct in6_addr *saddr, int oif, int flags)
{
@@ -493,14 +538,31 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
struct fib6_nh *nh;
if (!oif && ipv6_addr_any(saddr)) {
- nh = &f6i->fib6_nh;
+ if (unlikely(f6i->nh)) {
+ nh = nexthop_fib6_nh(f6i->nh);
+ if (nexthop_is_blackhole(f6i->nh))
+ goto out_blackhole;
+ } else {
+ nh = f6i->fib6_nh;
+ }
if (!(nh->fib_nh_flags & RTNH_F_DEAD))
goto out;
}
for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
- nh = &spf6i->fib6_nh;
- if (__rt6_device_match(net, nh, saddr, oif, flags)) {
+ bool matched = false;
+
+ if (unlikely(spf6i->nh)) {
+ nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
+ oif, flags);
+ if (nh)
+ matched = true;
+ } else {
+ nh = spf6i->fib6_nh;
+ if (__rt6_device_match(net, nh, saddr, oif, flags))
+ matched = true;
+ }
+ if (matched) {
res->f6i = spf6i;
goto out;
}
@@ -508,19 +570,32 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
if (oif && flags & RT6_LOOKUP_F_IFACE) {
res->f6i = net->ipv6.fib6_null_entry;
- nh = &res->f6i->fib6_nh;
+ nh = res->f6i->fib6_nh;
goto out;
}
- nh = &f6i->fib6_nh;
+ if (unlikely(f6i->nh)) {
+ nh = nexthop_fib6_nh(f6i->nh);
+ if (nexthop_is_blackhole(f6i->nh))
+ goto out_blackhole;
+ } else {
+ nh = f6i->fib6_nh;
+ }
+
if (nh->fib_nh_flags & RTNH_F_DEAD) {
res->f6i = net->ipv6.fib6_null_entry;
- nh = &res->f6i->fib6_nh;
+ nh = res->f6i->fib6_nh;
}
out:
res->nh = nh;
res->fib6_type = res->f6i->fib6_type;
res->fib6_flags = res->f6i->fib6_flags;
+ return;
+
+out_blackhole:
+ res->fib6_flags |= RTF_REJECT;
+ res->fib6_type = RTN_BLACKHOLE;
+ res->nh = nh;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
@@ -691,6 +766,24 @@ out:
return rc;
}
+struct fib6_nh_frl_arg {
+ u32 flags;
+ int oif;
+ int strict;
+ int *mpri;
+ bool *do_rr;
+ struct fib6_nh *nh;
+};
+
+static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_frl_arg *arg = _arg;
+
+ arg->nh = nh;
+ return find_match(nh, arg->flags, arg->oif, arg->strict,
+ arg->mpri, arg->do_rr);
+}
+
static void __find_rr_leaf(struct fib6_info *f6i_start,
struct fib6_info *nomatch, u32 metric,
struct fib6_result *res, struct fib6_info **cont,
@@ -701,6 +794,7 @@ static void __find_rr_leaf(struct fib6_info *f6i_start,
for (f6i = f6i_start;
f6i && f6i != nomatch;
f6i = rcu_dereference(f6i->fib6_next)) {
+ bool matched = false;
struct fib6_nh *nh;
if (cont && f6i->fib6_metric != metric) {
@@ -711,8 +805,34 @@ static void __find_rr_leaf(struct fib6_info *f6i_start,
if (fib6_check_expired(f6i))
continue;
- nh = &f6i->fib6_nh;
- if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
+ if (unlikely(f6i->nh)) {
+ struct fib6_nh_frl_arg arg = {
+ .flags = f6i->fib6_flags,
+ .oif = oif,
+ .strict = strict,
+ .mpri = mpri,
+ .do_rr = do_rr
+ };
+
+ if (nexthop_is_blackhole(f6i->nh)) {
+ res->fib6_flags = RTF_REJECT;
+ res->fib6_type = RTN_BLACKHOLE;
+ res->f6i = f6i;
+ res->nh = nexthop_fib6_nh(f6i->nh);
+ return;
+ }
+ if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
+ &arg)) {
+ matched = true;
+ nh = arg.nh;
+ }
+ } else {
+ nh = f6i->fib6_nh;
+ if (find_match(nh, f6i->fib6_flags, oif, strict,
+ mpri, do_rr))
+ matched = true;
+ }
+ if (matched) {
res->f6i = f6i;
res->nh = nh;
res->fib6_flags = f6i->fib6_flags;
@@ -793,7 +913,7 @@ static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
out:
if (!res->f6i) {
res->f6i = net->ipv6.fib6_null_entry;
- res->nh = &res->f6i->fib6_nh;
+ res->nh = res->f6i->fib6_nh;
res->fib6_flags = res->f6i->fib6_flags;
res->fib6_type = res->f6i->fib6_type;
}
@@ -1114,6 +1234,8 @@ restart:
rt = net->ipv6.ip6_null_entry;
dst_hold(&rt->dst);
goto out;
+ } else if (res.fib6_flags & RTF_REJECT) {
+ goto do_create;
}
fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
@@ -1125,6 +1247,7 @@ restart:
if (ip6_hold_safe(net, &rt))
dst_use_noref(&rt->dst, jiffies);
} else {
+do_create:
rt = ip6_create_rt_rcu(&res);
}
@@ -1265,13 +1388,9 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
- struct rt6_info *pcpu_rt, **p;
-
- p = this_cpu_ptr(res->f6i->rt6i_pcpu);
- pcpu_rt = *p;
+ struct rt6_info *pcpu_rt;
- if (pcpu_rt)
- ip6_hold_safe(NULL, &pcpu_rt);
+ pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
return pcpu_rt;
}
@@ -1282,13 +1401,10 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net,
struct rt6_info *pcpu_rt, *prev, **p;
pcpu_rt = ip6_rt_pcpu_alloc(res);
- if (!pcpu_rt) {
- dst_hold(&net->ipv6.ip6_null_entry->dst);
- return net->ipv6.ip6_null_entry;
- }
+ if (!pcpu_rt)
+ return NULL;
- dst_hold(&pcpu_rt->dst);
- p = this_cpu_ptr(res->f6i->rt6i_pcpu);
+ p = this_cpu_ptr(res->nh->rt6i_pcpu);
prev = cmpxchg(p, NULL, pcpu_rt);
BUG_ON(prev);
@@ -1458,25 +1574,74 @@ static unsigned int fib6_mtu(const struct fib6_result *res)
return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
+#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL
+
+/* used when the flushed bit is not relevant, only access to the bucket
+ * (ie., all bucket users except rt6_insert_exception);
+ *
+ * called under rcu lock; sometimes called with rt6_exception_lock held
+ */
+static
+struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
+ spinlock_t *lock)
+{
+ struct rt6_exception_bucket *bucket;
+
+ if (lock)
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(lock));
+ else
+ bucket = rcu_dereference(nh->rt6i_exception_bucket);
+
+ /* remove bucket flushed bit if set */
+ if (bucket) {
+ unsigned long p = (unsigned long)bucket;
+
+ p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
+ bucket = (struct rt6_exception_bucket *)p;
+ }
+
+ return bucket;
+}
+
+static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
+{
+ unsigned long p = (unsigned long)bucket;
+
+ return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
+}
+
+/* called with rt6_exception_lock held */
+static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
+ spinlock_t *lock)
+{
+ struct rt6_exception_bucket *bucket;
+ unsigned long p;
+
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(lock));
+
+ p = (unsigned long)bucket;
+ p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
+ bucket = (struct rt6_exception_bucket *)p;
+ rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
+}
+
static int rt6_insert_exception(struct rt6_info *nrt,
const struct fib6_result *res)
{
struct net *net = dev_net(nrt->dst.dev);
struct rt6_exception_bucket *bucket;
+ struct fib6_info *f6i = res->f6i;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
- struct fib6_info *f6i = res->f6i;
+ struct fib6_nh *nh = res->nh;
int err = 0;
spin_lock_bh(&rt6_exception_lock);
- if (f6i->exception_bucket_flushed) {
- err = -EINVAL;
- goto out;
- }
-
- bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(&rt6_exception_lock));
if (!bucket) {
bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
GFP_ATOMIC);
@@ -1484,7 +1649,10 @@ static int rt6_insert_exception(struct rt6_info *nrt,
err = -ENOMEM;
goto out;
}
- rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
+ rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
+ } else if (fib6_nh_excptn_bucket_flushed(bucket)) {
+ err = -EINVAL;
+ goto out;
}
#ifdef CONFIG_IPV6_SUBTREES
@@ -1539,7 +1707,7 @@ out:
return err;
}
-void rt6_flush_exceptions(struct fib6_info *rt)
+static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
@@ -1547,25 +1715,46 @@ void rt6_flush_exceptions(struct fib6_info *rt)
int i;
spin_lock_bh(&rt6_exception_lock);
- /* Prevent rt6_insert_exception() to recreate the bucket list */
- rt->exception_bucket_flushed = 1;
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (!bucket)
goto out;
+ /* Prevent rt6_insert_exception() to recreate the bucket list */
+ if (!from)
+ fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
+
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
- hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
- rt6_remove_exception(bucket, rt6_ex);
- WARN_ON_ONCE(bucket->depth);
+ hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
+ if (!from ||
+ rcu_access_pointer(rt6_ex->rt6i->from) == from)
+ rt6_remove_exception(bucket, rt6_ex);
+ }
+ WARN_ON_ONCE(!from && bucket->depth);
bucket++;
}
-
out:
spin_unlock_bh(&rt6_exception_lock);
}
+static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
+{
+ struct fib6_info *f6i = arg;
+
+ fib6_nh_flush_exceptions(nh, f6i);
+
+ return 0;
+}
+
+void rt6_flush_exceptions(struct fib6_info *f6i)
+{
+ if (f6i->nh)
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
+ f6i);
+ else
+ fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
+}
+
/* Find cached rt in the hash table inside passed in rt
* Caller has to hold rcu_read_lock()
*/
@@ -1594,7 +1783,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
src_key = saddr;
find_ex:
#endif
- bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
+ bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
@@ -1612,25 +1801,20 @@ find_ex:
}
/* Remove the passed in cached rt from the hash table that contains it */
-static int rt6_remove_exception_rt(struct rt6_info *rt)
+static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
+ const struct rt6_info *rt)
{
+ const struct in6_addr *src_key = NULL;
struct rt6_exception_bucket *bucket;
- struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
- struct fib6_info *from;
int err;
- from = rcu_dereference(rt->from);
- if (!from ||
- !(rt->rt6i_flags & RTF_CACHE))
- return -EINVAL;
-
- if (!rcu_access_pointer(from->rt6i_exception_bucket))
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return -ENOENT;
spin_lock_bh(&rt6_exception_lock);
- bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
+
#ifdef CONFIG_IPV6_SUBTREES
/* rt6i_src.plen != 0 indicates 'from' is in subtree
* and exception table is indexed by a hash of
@@ -1638,7 +1822,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (from->fib6_src.plen)
+ if (plen)
src_key = &rt->rt6i_src.addr;
#endif
rt6_ex = __rt6_find_exception_spinlock(&bucket,
@@ -1655,23 +1839,60 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
return err;
}
-/* Find rt6_ex which contains the passed in rt cache and
- * refresh its stamp
- */
-static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+struct fib6_nh_excptn_arg {
+ struct rt6_info *rt;
+ int plen;
+};
+
+static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_excptn_arg *arg = _arg;
+ int err;
+
+ err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
+ if (err == 0)
+ return 1;
+
+ return 0;
+}
+
+static int rt6_remove_exception_rt(struct rt6_info *rt)
{
- struct rt6_exception_bucket *bucket;
- struct in6_addr *src_key = NULL;
- struct rt6_exception *rt6_ex;
struct fib6_info *from;
- rcu_read_lock();
from = rcu_dereference(rt->from);
if (!from || !(rt->rt6i_flags & RTF_CACHE))
- goto unlock;
+ return -EINVAL;
- bucket = rcu_dereference(from->rt6i_exception_bucket);
+ if (from->nh) {
+ struct fib6_nh_excptn_arg arg = {
+ .rt = rt,
+ .plen = from->fib6_src.plen
+ };
+ int rc;
+
+ /* rc = 1 means an entry was found */
+ rc = nexthop_for_each_fib6_nh(from->nh,
+ rt6_nh_remove_exception_rt,
+ &arg);
+ return rc ? 0 : -ENOENT;
+ }
+ return fib6_nh_remove_exception(from->fib6_nh,
+ from->fib6_src.plen, rt);
+}
+
+/* Find rt6_ex which contains the passed in rt cache and
+ * refresh its stamp
+ */
+static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
+ const struct rt6_info *rt)
+{
+ const struct in6_addr *src_key = NULL;
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+
+ bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
/* rt6i_src.plen != 0 indicates 'from' is in subtree
* and exception table is indexed by a hash of
@@ -1679,15 +1900,63 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (from->fib6_src.plen)
+ if (plen)
src_key = &rt->rt6i_src.addr;
#endif
- rt6_ex = __rt6_find_exception_rcu(&bucket,
- &rt->rt6i_dst.addr,
- src_key);
+ rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
if (rt6_ex)
rt6_ex->stamp = jiffies;
+}
+
+struct fib6_nh_match_arg {
+ const struct net_device *dev;
+ const struct in6_addr *gw;
+ struct fib6_nh *match;
+};
+
+/* determine if fib6_nh has given device and gateway */
+static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_match_arg *arg = _arg;
+
+ if (arg->dev != nh->fib_nh_dev ||
+ (arg->gw && !nh->fib_nh_gw_family) ||
+ (!arg->gw && nh->fib_nh_gw_family) ||
+ (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
+ return 0;
+
+ arg->match = nh;
+
+ /* found a match, break the loop */
+ return 1;
+}
+
+static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+{
+ struct fib6_info *from;
+ struct fib6_nh *fib6_nh;
+
+ rcu_read_lock();
+
+ from = rcu_dereference(rt->from);
+ if (!from || !(rt->rt6i_flags & RTF_CACHE))
+ goto unlock;
+ if (from->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = rt->dst.dev,
+ .gw = &rt->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);
+
+ if (!arg.match)
+ return;
+ fib6_nh = arg.match;
+ } else {
+ fib6_nh = from->fib6_nh;
+ }
+ fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
unlock:
rcu_read_unlock();
}
@@ -1715,15 +1984,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
}
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
- struct fib6_info *rt, int mtu)
+ const struct fib6_nh *nh, int mtu)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
int i;
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (!bucket)
return;
@@ -1745,21 +2012,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
-static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
- struct in6_addr *gateway)
+static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
+ const struct in6_addr *gateway)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
struct hlist_node *tmp;
int i;
- if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return;
spin_lock_bh(&rt6_exception_lock);
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp,
@@ -1824,23 +2089,21 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
gc_args->more++;
}
-void rt6_age_exceptions(struct fib6_info *rt,
- struct fib6_gc_args *gc_args,
- unsigned long now)
+static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
struct hlist_node *tmp;
int i;
- if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return;
rcu_read_lock_bh();
spin_lock(&rt6_exception_lock);
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp,
@@ -1855,6 +2118,36 @@ void rt6_age_exceptions(struct fib6_info *rt,
rcu_read_unlock_bh();
}
+struct fib6_nh_age_excptn_arg {
+ struct fib6_gc_args *gc_args;
+ unsigned long now;
+};
+
+static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_age_excptn_arg *arg = _arg;
+
+ fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
+ return 0;
+}
+
+void rt6_age_exceptions(struct fib6_info *f6i,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
+{
+ if (f6i->nh) {
+ struct fib6_nh_age_excptn_arg arg = {
+ .gc_args = gc_args,
+ .now = now
+ };
+
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
+ &arg);
+ } else {
+ fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
+ }
+}
+
/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
struct flowi6 *fl6, struct fib6_result *res, int strict)
@@ -1891,9 +2184,12 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
const struct sk_buff *skb, int flags)
{
struct fib6_result res = {};
- struct rt6_info *rt;
+ struct rt6_info *rt = NULL;
int strict = 0;
+ WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
+ !rcu_read_lock_held());
+
strict |= flags & RT6_LOOKUP_F_IFACE;
strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
if (net->ipv6.devconf_all->forwarding == 0)
@@ -1902,23 +2198,15 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
rcu_read_lock();
fib6_table_lookup(net, table, oif, fl6, &res, strict);
- if (res.f6i == net->ipv6.fib6_null_entry) {
- rt = net->ipv6.ip6_null_entry;
- rcu_read_unlock();
- dst_hold(&rt->dst);
- return rt;
- }
+ if (res.f6i == net->ipv6.fib6_null_entry)
+ goto out;
fib6_select_path(net, &res, fl6, oif, false, skb, strict);
/*Search through exception table */
rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
if (rt) {
- if (ip6_hold_safe(net, &rt))
- dst_use_noref(&rt->dst, jiffies);
-
- rcu_read_unlock();
- return rt;
+ goto out;
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
!res.nh->fib_nh_gw_family)) {
/* Create a RTF_CACHE clone which will not be
@@ -1926,40 +2214,38 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
* the daddr in the skb during the neighbor look-up is different
* from the fl6->daddr used to look-up route here.
*/
- struct rt6_info *uncached_rt;
-
- uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
+ rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
- rcu_read_unlock();
-
- if (uncached_rt) {
- /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
- * No need for another dst_hold()
+ if (rt) {
+ /* 1 refcnt is taken during ip6_rt_cache_alloc().
+ * As rt6_uncached_list_add() does not consume refcnt,
+ * this refcnt is always returned to the caller even
+ * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
*/
- rt6_uncached_list_add(uncached_rt);
+ rt6_uncached_list_add(rt);
atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
- } else {
- uncached_rt = net->ipv6.ip6_null_entry;
- dst_hold(&uncached_rt->dst);
- }
+ rcu_read_unlock();
- return uncached_rt;
+ return rt;
+ }
} else {
/* Get a percpu copy */
-
- struct rt6_info *pcpu_rt;
-
local_bh_disable();
- pcpu_rt = rt6_get_pcpu_route(&res);
+ rt = rt6_get_pcpu_route(&res);
- if (!pcpu_rt)
- pcpu_rt = rt6_make_pcpu_route(net, &res);
+ if (!rt)
+ rt = rt6_make_pcpu_route(net, &res);
local_bh_enable();
- rcu_read_unlock();
-
- return pcpu_rt;
}
+out:
+ if (!rt)
+ rt = net->ipv6.ip6_null_entry;
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ ip6_hold_safe(net, &rt);
+ rcu_read_unlock();
+
+ return rt;
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
@@ -2084,17 +2370,54 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
hash_keys.basic.ip_proto = fl6->flowi6_proto;
}
break;
+ case 2:
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (skb) {
+ struct flow_keys keys;
+
+ if (!flkeys) {
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+ flkeys = &keys;
+ }
+
+ /* Inner can be v4 or v6 */
+ if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
+ hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
+ } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
+ hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
+ hash_keys.tags.flow_label = flkeys->tags.flow_label;
+ hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
+ }
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = fl6->saddr;
+ hash_keys.addrs.v6addrs.dst = fl6->daddr;
+ hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
+ hash_keys.basic.ip_proto = fl6->flowi6_proto;
+ }
+ break;
}
mhash = flow_hash_from_keys(&hash_keys);
return mhash >> 1;
}
+/* Called with rcu held */
void ip6_route_input(struct sk_buff *skb)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = dev_net(skb->dev);
- int flags = RT6_LOOKUP_F_HAS_SADDR;
+ int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
struct ip_tunnel_info *tun_info;
struct flowi6 fl6 = {
.flowi6_iif = skb->dev->ifindex,
@@ -2116,8 +2439,8 @@ void ip6_route_input(struct sk_buff *skb)
if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
skb_dst_drop(skb);
- skb_dst_set(skb,
- ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
+ skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
+ &fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
@@ -2129,8 +2452,9 @@ static struct rt6_info *ip6_pol_route_output(struct net *net,
return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
-struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
- struct flowi6 *fl6, int flags)
+struct dst_entry *ip6_route_output_flags_noref(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6, int flags)
{
bool any_src;
@@ -2138,6 +2462,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
struct dst_entry *dst;
+ /* This function does not take refcnt on the dst */
dst = l3mdev_link_scope_lookup(net, fl6);
if (dst)
return dst;
@@ -2145,6 +2470,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
fl6->flowi6_iif = LOOPBACK_IFINDEX;
+ flags |= RT6_LOOKUP_F_DST_NOREF;
any_src = ipv6_addr_any(&fl6->saddr);
if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
(fl6->flowi6_oif && any_src))
@@ -2157,6 +2483,28 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
+EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref);
+
+struct dst_entry *ip6_route_output_flags(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6,
+ int flags)
+{
+ struct dst_entry *dst;
+ struct rt6_info *rt6;
+
+ rcu_read_lock();
+ dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
+ rt6 = (struct rt6_info *)dst;
+ /* For dst cached in uncached_list, refcnt is already taken. */
+ if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
+ dst = &net->ipv6.ip6_null_entry->dst;
+ dst_hold(dst);
+ }
+ rcu_read_unlock();
+
+ return dst;
+}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
@@ -2381,10 +2729,31 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
rcu_read_unlock();
return;
}
- res.nh = &res.f6i->fib6_nh;
res.fib6_flags = res.f6i->fib6_flags;
res.fib6_type = res.f6i->fib6_type;
+ if (res.f6i->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = dst->dev,
+ .gw = &rt6->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(res.f6i->nh,
+ fib6_nh_find_match, &arg);
+
+ /* fib6_info uses a nexthop that does not have fib6_nh
+ * using the dst->dev + gw. Should be impossible.
+ */
+ if (!arg.match) {
+ rcu_read_unlock();
+ return;
+ }
+
+ res.nh = arg.match;
+ } else {
+ res.nh = res.f6i->fib6_nh;
+ }
+
nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
if (nrt6) {
rt6_do_update_pmtu(nrt6, mtu);
@@ -2491,6 +2860,21 @@ static bool ip6_redirect_nh_match(const struct fib6_result *res,
return true;
}
+struct fib6_nh_rd_arg {
+ struct fib6_result *res;
+ struct flowi6 *fl6;
+ const struct in6_addr *gw;
+ struct rt6_info **ret;
+};
+
+static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_rd_arg *arg = _arg;
+
+ arg->res->nh = nh;
+ return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
+}
+
/* Handle redirects */
struct ip6rd_flowi {
struct flowi6 fl6;
@@ -2506,6 +2890,12 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
struct rt6_info *ret = NULL;
struct fib6_result res = {};
+ struct fib6_nh_rd_arg arg = {
+ .res = &res,
+ .fl6 = fl6,
+ .gw = &rdfl->gateway,
+ .ret = &ret
+ };
struct fib6_info *rt;
struct fib6_node *fn;
@@ -2530,14 +2920,24 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
restart:
for_each_fib6_node_rt_rcu(fn) {
res.f6i = rt;
- res.nh = &rt->fib6_nh;
-
if (fib6_check_expired(rt))
continue;
if (rt->fib6_flags & RTF_REJECT)
break;
- if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
- goto out;
+ if (unlikely(rt->nh)) {
+ if (nexthop_is_blackhole(rt->nh))
+ continue;
+ /* on match, res->nh is filled in and potentially ret */
+ if (nexthop_for_each_fib6_nh(rt->nh,
+ fib6_nh_redirect_match,
+ &arg))
+ goto out;
+ } else {
+ res.nh = rt->fib6_nh;
+ if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
+ &ret))
+ goto out;
+ }
}
if (!rt)
@@ -2554,7 +2954,7 @@ restart:
}
res.f6i = rt;
- res.nh = &rt->fib6_nh;
+ res.nh = rt->fib6_nh;
out:
if (ret) {
ip6_hold_safe(net, &ret);
@@ -2781,10 +3181,9 @@ out:
return entries > rt_max_size;
}
-static struct rt6_info *ip6_nh_lookup_table(struct net *net,
- struct fib6_config *cfg,
- const struct in6_addr *gw_addr,
- u32 tbid, int flags)
+static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
+ const struct in6_addr *gw_addr, u32 tbid,
+ int flags, struct fib6_result *res)
{
struct flowi6 fl6 = {
.flowi6_oif = cfg->fc_ifindex,
@@ -2792,25 +3191,23 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
.saddr = cfg->fc_prefsrc,
};
struct fib6_table *table;
- struct rt6_info *rt;
+ int err;
table = fib6_get_table(net, tbid);
if (!table)
- return NULL;
+ return -EINVAL;
if (!ipv6_addr_any(&cfg->fc_prefsrc))
flags |= RT6_LOOKUP_F_HAS_SADDR;
flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
- rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
- /* if table lookup failed, fall back to full lookup */
- if (rt == net->ipv6.ip6_null_entry) {
- ip6_rt_put(rt);
- rt = NULL;
- }
+ err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
+ if (!err && res->f6i != net->ipv6.fib6_null_entry)
+ fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
+ cfg->fc_ifindex != 0, NULL, flags);
- return rt;
+ return err;
}
static int ip6_route_check_nh_onlink(struct net *net,
@@ -2818,29 +3215,19 @@ static int ip6_route_check_nh_onlink(struct net *net,
const struct net_device *dev,
struct netlink_ext_ack *extack)
{
- u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
const struct in6_addr *gw_addr = &cfg->fc_gateway;
- u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
- struct fib6_info *from;
- struct rt6_info *grt;
+ struct fib6_result res = {};
int err;
- err = 0;
- grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
- if (grt) {
- rcu_read_lock();
- from = rcu_dereference(grt->from);
- if (!grt->dst.error &&
- /* ignore match if it is the default route */
- from && !ipv6_addr_any(&from->fib6_dst.addr) &&
- (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
- NL_SET_ERR_MSG(extack,
- "Nexthop has invalid gateway or device mismatch");
- err = -EINVAL;
- }
- rcu_read_unlock();
-
- ip6_rt_put(grt);
+ err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
+ if (!err && !(res.fib6_flags & RTF_REJECT) &&
+ /* ignore match if it is the default route */
+ !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
+ (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop has invalid gateway or device mismatch");
+ err = -EINVAL;
}
return err;
@@ -2853,47 +3240,50 @@ static int ip6_route_check_nh(struct net *net,
{
const struct in6_addr *gw_addr = &cfg->fc_gateway;
struct net_device *dev = _dev ? *_dev : NULL;
- struct rt6_info *grt = NULL;
+ int flags = RT6_LOOKUP_F_IFACE;
+ struct fib6_result res = {};
int err = -EHOSTUNREACH;
if (cfg->fc_table) {
- int flags = RT6_LOOKUP_F_IFACE;
-
- grt = ip6_nh_lookup_table(net, cfg, gw_addr,
- cfg->fc_table, flags);
- if (grt) {
- if (grt->rt6i_flags & RTF_GATEWAY ||
- (dev && dev != grt->dst.dev)) {
- ip6_rt_put(grt);
- grt = NULL;
- }
- }
+ err = ip6_nh_lookup_table(net, cfg, gw_addr,
+ cfg->fc_table, flags, &res);
+ /* gw_addr can not require a gateway or resolve to a reject
+ * route. If a device is given, it must match the result.
+ */
+ if (err || res.fib6_flags & RTF_REJECT ||
+ res.nh->fib_nh_gw_family ||
+ (dev && dev != res.nh->fib_nh_dev))
+ err = -EHOSTUNREACH;
}
- if (!grt)
- grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
+ if (err < 0) {
+ struct flowi6 fl6 = {
+ .flowi6_oif = cfg->fc_ifindex,
+ .daddr = *gw_addr,
+ };
- if (!grt)
- goto out;
+ err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
+ if (err || res.fib6_flags & RTF_REJECT ||
+ res.nh->fib_nh_gw_family)
+ err = -EHOSTUNREACH;
+ if (err)
+ return err;
+
+ fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
+ cfg->fc_ifindex != 0, NULL, flags);
+ }
+
+ err = 0;
if (dev) {
- if (dev != grt->dst.dev) {
- ip6_rt_put(grt);
- goto out;
- }
+ if (dev != res.nh->fib_nh_dev)
+ err = -EHOSTUNREACH;
} else {
- *_dev = dev = grt->dst.dev;
- *idev = grt->rt6i_idev;
+ *_dev = dev = res.nh->fib_nh_dev;
dev_hold(dev);
- in6_dev_hold(grt->rt6i_idev);
+ *idev = in6_dev_get(dev);
}
- if (!(grt->rt6i_flags & RTF_GATEWAY))
- err = 0;
-
- ip6_rt_put(grt);
-
-out:
return err;
}
@@ -2934,11 +3324,15 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
goto out;
}
+ rcu_read_lock();
+
if (cfg->fc_flags & RTNH_F_ONLINK)
err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
else
err = ip6_route_check_nh(net, cfg, _dev, idev);
+ rcu_read_unlock();
+
if (err)
goto out;
}
@@ -3039,7 +3433,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
goto out;
}
}
- goto set_dev;
+ goto pcpu_alloc;
}
if (cfg->fc_flags & RTF_GATEWAY) {
@@ -3075,7 +3469,14 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
cfg->fc_encap_type, cfg, gfp_flags, extack);
if (err)
goto out;
-set_dev:
+
+pcpu_alloc:
+ fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+ if (!fib6_nh->rt6i_pcpu) {
+ err = -ENOMEM;
+ goto out;
+ }
+
fib6_nh->fib_nh_dev = dev;
fib6_nh->fib_nh_oif = dev->ifindex;
err = 0;
@@ -3095,6 +3496,38 @@ out:
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
+ struct rt6_exception_bucket *bucket;
+
+ rcu_read_lock();
+
+ fib6_nh_flush_exceptions(fib6_nh, NULL);
+ bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
+ if (bucket) {
+ rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
+ kfree(bucket);
+ }
+
+ rcu_read_unlock();
+
+ if (fib6_nh->rt6i_pcpu) {
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **ppcpu_rt;
+ struct rt6_info *pcpu_rt;
+
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
+ pcpu_rt = *ppcpu_rt;
+ if (pcpu_rt) {
+ dst_dev_put(&pcpu_rt->dst);
+ dst_release(&pcpu_rt->dst);
+ *ppcpu_rt = NULL;
+ }
+ }
+
+ free_percpu(fib6_nh->rt6i_pcpu);
+ }
+
fib_nh_common_release(&fib6_nh->nh_common);
}
@@ -3104,7 +3537,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
{
struct net *net = cfg->fc_nlinfo.nl_net;
struct fib6_info *rt = NULL;
+ struct nexthop *nh = NULL;
struct fib6_table *table;
+ struct fib6_nh *fib6_nh;
int err = -EINVAL;
int addr_type;
@@ -3140,6 +3575,16 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
goto out;
}
#endif
+ if (cfg->fc_nh_id) {
+ nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+ if (!nh) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ goto out;
+ }
+ err = fib6_check_nexthop(nh, cfg, extack);
+ if (err)
+ goto out;
+ }
err = -ENOBUFS;
if (cfg->fc_nlinfo.nlh &&
@@ -3157,7 +3602,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
goto out;
err = -ENOMEM;
- rt = fib6_info_alloc(gfp_flags);
+ rt = fib6_info_alloc(gfp_flags, !nh);
if (!rt)
goto out;
@@ -3197,19 +3642,35 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
rt->fib6_src.plen = cfg->fc_src_len;
#endif
- err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
- if (err)
- goto out;
+ if (nh) {
+ if (!nexthop_get(nh)) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ goto out;
+ }
+ if (rt->fib6_src.plen) {
+ NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
+ goto out;
+ }
+ rt->nh = nh;
+ fib6_nh = nexthop_fib6_nh(rt->nh);
+ } else {
+ err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
+ if (err)
+ goto out;
- /* We cannot add true routes via loopback here,
- * they would result in kernel looping; promote them to reject routes
- */
- addr_type = ipv6_addr_type(&cfg->fc_dst);
- if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
- rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
+ fib6_nh = rt->fib6_nh;
+
+ /* We cannot add true routes via loopback here, they would
+ * result in kernel looping; promote them to reject routes
+ */
+ addr_type = ipv6_addr_type(&cfg->fc_dst);
+ if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
+ addr_type))
+ rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
+ }
if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
- struct net_device *dev = fib6_info_nh_dev(rt);
+ struct net_device *dev = fib6_nh->fib_nh_dev;
if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
NL_SET_ERR_MSG(extack, "Invalid source address");
@@ -3301,6 +3762,12 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
info->skip_notify = 1;
}
+ info->skip_notify_kernel = 1;
+ call_fib6_multipath_entry_notifiers(net,
+ FIB_EVENT_ENTRY_DEL,
+ rt,
+ rt->fib6_nsiblings,
+ NULL);
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings,
fib6_siblings) {
@@ -3323,7 +3790,7 @@ out_put:
return err;
}
-static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
+static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
int rc = -ESRCH;
@@ -3339,10 +3806,49 @@ out:
return rc;
}
+static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
+ struct fib6_nh *nh)
+{
+ struct fib6_result res = {
+ .f6i = rt,
+ .nh = nh,
+ };
+ struct rt6_info *rt_cache;
+
+ rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
+ if (rt_cache)
+ return __ip6_del_cached_rt(rt_cache, cfg);
+
+ return 0;
+}
+
+struct fib6_nh_del_cached_rt_arg {
+ struct fib6_config *cfg;
+ struct fib6_info *f6i;
+};
+
+static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_del_cached_rt_arg *arg = _arg;
+ int rc;
+
+ rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
+ return rc != -ESRCH ? rc : 0;
+}
+
+static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
+{
+ struct fib6_nh_del_cached_rt_arg arg = {
+ .cfg = cfg,
+ .f6i = f6i
+ };
+
+ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
+}
+
static int ip6_route_del(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
- struct rt6_info *rt_cache;
struct fib6_table *table;
struct fib6_info *rt;
struct fib6_node *fn;
@@ -3365,26 +3871,45 @@ static int ip6_route_del(struct fib6_config *cfg,
for_each_fib6_node_rt_rcu(fn) {
struct fib6_nh *nh;
+ if (rt->nh && cfg->fc_nh_id &&
+ rt->nh->id != cfg->fc_nh_id)
+ continue;
+
if (cfg->fc_flags & RTF_CACHE) {
- struct fib6_result res = {
- .f6i = rt,
- };
- int rc;
-
- rt_cache = rt6_find_cached_rt(&res,
- &cfg->fc_dst,
- &cfg->fc_src);
- if (rt_cache) {
- rc = ip6_del_cached_rt(rt_cache, cfg);
- if (rc != -ESRCH) {
- rcu_read_unlock();
- return rc;
- }
+ int rc = 0;
+
+ if (rt->nh) {
+ rc = ip6_del_cached_rt_nh(cfg, rt);
+ } else if (cfg->fc_nh_id) {
+ continue;
+ } else {
+ nh = rt->fib6_nh;
+ rc = ip6_del_cached_rt(cfg, rt, nh);
+ }
+ if (rc != -ESRCH) {
+ rcu_read_unlock();
+ return rc;
}
continue;
}
- nh = &rt->fib6_nh;
+ if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
+ continue;
+ if (cfg->fc_protocol &&
+ cfg->fc_protocol != rt->fib6_protocol)
+ continue;
+
+ if (rt->nh) {
+ if (!fib6_info_hold_safe(rt))
+ continue;
+ rcu_read_unlock();
+
+ return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ }
+ if (cfg->fc_nh_id)
+ continue;
+
+ nh = rt->fib6_nh;
if (cfg->fc_ifindex &&
(!nh->fib_nh_dev ||
nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
@@ -3392,10 +3917,6 @@ static int ip6_route_del(struct fib6_config *cfg,
if (cfg->fc_flags & RTF_GATEWAY &&
!ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
continue;
- if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
- continue;
- if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
- continue;
if (!fib6_info_hold_safe(rt))
continue;
rcu_read_unlock();
@@ -3506,7 +4027,25 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
if (!res.f6i)
goto out;
- res.nh = &res.f6i->fib6_nh;
+ if (res.f6i->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = dst->dev,
+ .gw = &rt->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(res.f6i->nh,
+ fib6_nh_find_match, &arg);
+
+ /* fib6_info uses a nexthop that does not have fib6_nh
+ * using the dst->dev. Should be impossible
+ */
+ if (!arg.match)
+ goto out;
+ res.nh = arg.match;
+ } else {
+ res.nh = res.f6i->fib6_nh;
+ }
+
res.fib6_flags = res.f6i->fib6_flags;
res.fib6_type = res.f6i->fib6_type;
nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
@@ -3558,12 +4097,15 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
goto out;
for_each_fib6_node_rt_rcu(fn) {
- if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
+ /* these routes do not use nexthops */
+ if (rt->nh)
+ continue;
+ if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
continue;
if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
- !rt->fib6_nh.fib_nh_gw_family)
+ !rt->fib6_nh->fib_nh_gw_family)
continue;
- if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
+ if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
continue;
if (!fib6_info_hold_safe(rt))
continue;
@@ -3621,8 +4163,13 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
rcu_read_lock();
for_each_fib6_node_rt_rcu(&table->tb6_root) {
- struct fib6_nh *nh = &rt->fib6_nh;
+ struct fib6_nh *nh;
+
+ /* RA routes do not use nexthops */
+ if (rt->nh)
+ continue;
+ nh = rt->fib6_nh;
if (dev == nh->fib_nh_dev &&
((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
ipv6_addr_equal(&nh->fib_nh_gw6, addr))
@@ -3873,7 +4420,8 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
struct net *net = ((struct arg_dev_net_ip *)arg)->net;
struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
- if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
+ if (!rt->nh &&
+ ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
rt != net->ipv6.fib6_null_entry &&
ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
spin_lock_bh(&rt6_exception_lock);
@@ -3901,18 +4449,22 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
struct in6_addr *gateway = (struct in6_addr *)arg;
+ struct fib6_nh *nh;
+ /* RA routes do not use nexthops */
+ if (rt->nh)
+ return 0;
+
+ nh = rt->fib6_nh;
if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
- rt->fib6_nh.fib_nh_gw_family &&
- ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
+ nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
return -1;
- }
/* Further clean up cached routes in exception table.
* This is needed because cached route may have a different
* gateway than its 'parent' in the case of an ip redirect.
*/
- rt6_exceptions_clean_tohost(rt, gateway);
+ fib6_nh_exceptions_clean_tohost(nh, gateway);
return 0;
}
@@ -3950,11 +4502,12 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
return NULL;
}
+/* only called for fib entries with builtin fib6_nh */
static bool rt6_is_dead(const struct fib6_info *rt)
{
- if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
- (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
- ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
+ if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
+ (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
+ ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
return true;
return false;
@@ -3966,11 +4519,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt)
int total = 0;
if (!rt6_is_dead(rt))
- total += rt->fib6_nh.fib_nh_weight;
+ total += rt->fib6_nh->fib_nh_weight;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
if (!rt6_is_dead(iter))
- total += iter->fib6_nh.fib_nh_weight;
+ total += iter->fib6_nh->fib_nh_weight;
}
return total;
@@ -3981,11 +4534,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
int upper_bound = -1;
if (!rt6_is_dead(rt)) {
- *weight += rt->fib6_nh.fib_nh_weight;
+ *weight += rt->fib6_nh->fib_nh_weight;
upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
total) - 1;
}
- atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
+ atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
}
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
@@ -4028,9 +4581,9 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
const struct arg_netdev_event *arg = p_arg;
struct net *net = dev_net(arg->dev);
- if (rt != net->ipv6.fib6_null_entry &&
- rt->fib6_nh.fib_nh_dev == arg->dev) {
- rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
+ if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
+ rt->fib6_nh->fib_nh_dev == arg->dev) {
+ rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
fib6_update_sernum_upto_root(net, rt);
rt6_multipath_rebalance(rt);
}
@@ -4053,15 +4606,16 @@ void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
+/* only called for fib entries with inline fib6_nh */
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
const struct net_device *dev)
{
struct fib6_info *iter;
- if (rt->fib6_nh.fib_nh_dev == dev)
+ if (rt->fib6_nh->fib_nh_dev == dev)
return true;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
- if (iter->fib6_nh.fib_nh_dev == dev)
+ if (iter->fib6_nh->fib_nh_dev == dev)
return true;
return false;
@@ -4082,12 +4636,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
struct fib6_info *iter;
unsigned int dead = 0;
- if (rt->fib6_nh.fib_nh_dev == down_dev ||
- rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
+ if (rt->fib6_nh->fib_nh_dev == down_dev ||
+ rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
dead++;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
- if (iter->fib6_nh.fib_nh_dev == down_dev ||
- iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
+ if (iter->fib6_nh->fib_nh_dev == down_dev ||
+ iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
dead++;
return dead;
@@ -4099,11 +4653,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
{
struct fib6_info *iter;
- if (rt->fib6_nh.fib_nh_dev == dev)
- rt->fib6_nh.fib_nh_flags |= nh_flags;
+ if (rt->fib6_nh->fib_nh_dev == dev)
+ rt->fib6_nh->fib_nh_flags |= nh_flags;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
- if (iter->fib6_nh.fib_nh_dev == dev)
- iter->fib6_nh.fib_nh_flags |= nh_flags;
+ if (iter->fib6_nh->fib_nh_dev == dev)
+ iter->fib6_nh->fib_nh_flags |= nh_flags;
}
/* called with write lock held for table with rt */
@@ -4113,17 +4667,17 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
const struct net_device *dev = arg->dev;
struct net *net = dev_net(dev);
- if (rt == net->ipv6.fib6_null_entry)
+ if (rt == net->ipv6.fib6_null_entry || rt->nh)
return 0;
switch (arg->event) {
case NETDEV_UNREGISTER:
- return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
+ return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
case NETDEV_DOWN:
if (rt->should_flush)
return -1;
if (!rt->fib6_nsiblings)
- return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
+ return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
if (rt6_multipath_uses_dev(rt, dev)) {
unsigned int count;
@@ -4139,10 +4693,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
}
return -2;
case NETDEV_CHANGE:
- if (rt->fib6_nh.fib_nh_dev != dev ||
+ if (rt->fib6_nh->fib_nh_dev != dev ||
rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
break;
- rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
+ rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
rt6_multipath_rebalance(rt);
break;
}
@@ -4176,9 +4730,36 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event)
struct rt6_mtu_change_arg {
struct net_device *dev;
unsigned int mtu;
+ struct fib6_info *f6i;
};
-static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
+static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
+{
+ struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
+ struct fib6_info *f6i = arg->f6i;
+
+ /* For administrative MTU increase, there is no way to discover
+ * IPv6 PMTU increase, so PMTU increase should be updated here.
+ * Since RFC 1981 doesn't include administrative MTU increase
+ * update PMTU increase is a MUST. (i.e. jumbo frame)
+ */
+ if (nh->fib_nh_dev == arg->dev) {
+ struct inet6_dev *idev = __in6_dev_get(arg->dev);
+ u32 mtu = f6i->fib6_pmtu;
+
+ if (mtu >= arg->mtu ||
+ (mtu < arg->mtu && mtu == idev->cnf.mtu6))
+ fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
+
+ spin_lock_bh(&rt6_exception_lock);
+ rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
+ spin_unlock_bh(&rt6_exception_lock);
+ }
+
+ return 0;
+}
+
+static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
{
struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
struct inet6_dev *idev;
@@ -4193,24 +4774,17 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
if (!idev)
return 0;
- /* For administrative MTU increase, there is no way to discover
- IPv6 PMTU increase, so PMTU increase should be updated here.
- Since RFC 1981 doesn't include administrative MTU increase
- update PMTU increase is a MUST. (i.e. jumbo frame)
- */
- if (rt->fib6_nh.fib_nh_dev == arg->dev &&
- !fib6_metric_locked(rt, RTAX_MTU)) {
- u32 mtu = rt->fib6_pmtu;
-
- if (mtu >= arg->mtu ||
- (mtu < arg->mtu && mtu == idev->cnf.mtu6))
- fib6_metric_set(rt, RTAX_MTU, arg->mtu);
+ if (fib6_metric_locked(f6i, RTAX_MTU))
+ return 0;
- spin_lock_bh(&rt6_exception_lock);
- rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
- spin_unlock_bh(&rt6_exception_lock);
+ arg->f6i = f6i;
+ if (f6i->nh) {
+ /* fib6_nh_mtu_change only returns 0, so this is safe */
+ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
+ arg);
}
- return 0;
+
+ return fib6_nh_mtu_change(f6i->fib6_nh, arg);
}
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
@@ -4224,6 +4798,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
}
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
+ [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 },
[RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
[RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
[RTA_OIF] = { .type = NLA_U32 },
@@ -4241,6 +4816,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_IP_PROTO] = { .type = NLA_U8 },
[RTA_SPORT] = { .type = NLA_U16 },
[RTA_DPORT] = { .type = NLA_U16 },
+ [RTA_NH_ID] = { .type = NLA_U32 },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4287,6 +4863,16 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
+ if (tb[RTA_NH_ID]) {
+ if (tb[RTA_GATEWAY] || tb[RTA_OIF] ||
+ tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop specification and nexthop id are mutually exclusive");
+ goto errout;
+ }
+ cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
+ }
+
if (tb[RTA_GATEWAY]) {
cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
cfg->fc_flags |= RTF_GATEWAY;
@@ -4430,6 +5016,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
{
struct fib6_info *rt_notif = NULL, *rt_last = NULL;
struct nl_info *info = &cfg->fc_nlinfo;
+ enum fib_event_type event_type;
struct fib6_config r_cfg;
struct rtnexthop *rtnh;
struct fib6_info *rt;
@@ -4489,7 +5076,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
goto cleanup;
}
- rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
+ rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
rt, &r_cfg);
@@ -4501,12 +5088,23 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
rtnh = rtnh_next(rtnh, &remaining);
}
+ if (list_empty(&rt6_nh_list)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid nexthop configuration - no valid nexthops");
+ return -EINVAL;
+ }
+
/* for add and replace send one notification with all nexthops.
* Skip the notification in fib6_add_rt2node and send one with
* the full route when done
*/
info->skip_notify = 1;
+ /* For add and replace, send one notification with all nexthops. For
+ * append, send one notification with all appended nexthops.
+ */
+ info->skip_notify_kernel = 1;
+
err_nh = NULL;
list_for_each_entry(nh, &rt6_nh_list, next) {
err = __ip6_ins_rt(nh->fib6_info, info, extack);
@@ -4543,6 +5141,15 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
nhn++;
}
+ event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD;
+ err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type,
+ rt_notif, nhn - 1, extack);
+ if (err) {
+ /* Delete all the siblings that were just added */
+ err_nh = NULL;
+ goto add_errout;
+ }
+
/* success ... tell user about new route */
ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
goto cleanup;
@@ -4621,6 +5228,12 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
+ if (cfg.fc_nh_id &&
+ !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ return -EINVAL;
+ }
+
if (cfg.fc_mp)
return ip6_route_multipath_del(&cfg, extack);
else {
@@ -4648,17 +5261,46 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
-static size_t rt6_nlmsg_size(struct fib6_info *rt)
+/* add the overhead of this fib6_nh to nexthop_len */
+static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
{
- int nexthop_len = 0;
+ int *nexthop_len = arg;
- if (rt->fib6_nsiblings) {
- nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
- + NLA_ALIGN(sizeof(struct rtnexthop))
- + nla_total_size(16) /* RTA_GATEWAY */
- + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
+ *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */
+ + NLA_ALIGN(sizeof(struct rtnexthop))
+ + nla_total_size(16); /* RTA_GATEWAY */
- nexthop_len *= rt->fib6_nsiblings;
+ if (nh->fib_nh_lws) {
+ /* RTA_ENCAP_TYPE */
+ *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
+ /* RTA_ENCAP */
+ *nexthop_len += nla_total_size(2);
+ }
+
+ return 0;
+}
+
+static size_t rt6_nlmsg_size(struct fib6_info *f6i)
+{
+ int nexthop_len;
+
+ if (f6i->nh) {
+ nexthop_len = nla_total_size(4); /* RTA_NH_ID */
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
+ &nexthop_len);
+ } else {
+ struct fib6_nh *nh = f6i->fib6_nh;
+
+ nexthop_len = 0;
+ if (f6i->fib6_nsiblings) {
+ nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
+ + NLA_ALIGN(sizeof(struct rtnexthop))
+ + nla_total_size(16) /* RTA_GATEWAY */
+ + lwtunnel_get_encap_size(nh->fib_nh_lws);
+
+ nexthop_len *= f6i->fib6_nsiblings;
+ }
+ nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
}
return NLMSG_ALIGN(sizeof(struct rtmsg))
@@ -4674,10 +5316,38 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
+ nla_total_size(sizeof(struct rta_cacheinfo))
+ nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
+ nla_total_size(1) /* RTA_PREF */
- + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
+ nexthop_len;
}
+static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
+ unsigned char *flags)
+{
+ if (nexthop_is_multipath(nh)) {
+ struct nlattr *mp;
+
+ mp = nla_nest_start(skb, RTA_MULTIPATH);
+ if (!mp)
+ goto nla_put_failure;
+
+ if (nexthop_mpath_fill_node(skb, nh))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, mp);
+ } else {
+ struct fib6_nh *fib6_nh;
+
+ fib6_nh = nexthop_fib6_nh(nh);
+ if (fib_nexthop_info(skb, &fib6_nh->nh_common,
+ flags, false) < 0)
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
struct fib6_info *rt, struct dst_entry *dst,
struct in6_addr *dest, struct in6_addr *src,
@@ -4687,6 +5357,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
struct rt6_info *rt6 = (struct rt6_info *)dst;
struct rt6key *rt6_dst, *rt6_src;
u32 *pmetrics, table, rt6_flags;
+ unsigned char nh_flags = 0;
struct nlmsghdr *nlh;
struct rtmsg *rtm;
long expires = 0;
@@ -4794,22 +5465,31 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
if (!mp)
goto nla_put_failure;
- if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
- rt->fib6_nh.fib_nh_weight) < 0)
+ if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
+ rt->fib6_nh->fib_nh_weight) < 0)
goto nla_put_failure;
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings, fib6_siblings) {
- if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
- sibling->fib6_nh.fib_nh_weight) < 0)
+ if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
+ sibling->fib6_nh->fib_nh_weight) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, mp);
- } else {
- unsigned char nh_flags = 0;
+ } else if (rt->nh) {
+ if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
+ goto nla_put_failure;
- if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
+ if (nexthop_is_blackhole(rt->nh))
+ rtm->rtm_type = RTN_BLACKHOLE;
+
+ if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
+ goto nla_put_failure;
+
+ rtm->rtm_flags |= nh_flags;
+ } else {
+ if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
&nh_flags, false) < 0)
goto nla_put_failure;
@@ -4836,10 +5516,28 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
+{
+ const struct net_device *dev = arg;
+
+ if (nh->fib_nh_dev == dev)
+ return 1;
+
+ return 0;
+}
+
static bool fib6_info_uses_dev(const struct fib6_info *f6i,
const struct net_device *dev)
{
- if (f6i->fib6_nh.fib_nh_dev == dev)
+ if (f6i->nh) {
+ struct net_device *_dev = (struct net_device *)dev;
+
+ return !!nexthop_for_each_fib6_nh(f6i->nh,
+ fib6_info_nh_uses_dev,
+ _dev);
+ }
+
+ if (f6i->fib6_nh->fib_nh_dev == dev)
return true;
if (f6i->fib6_nsiblings) {
@@ -4847,7 +5545,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i,
list_for_each_entry_safe(sibling, next_sibling,
&f6i->fib6_siblings, fib6_siblings) {
- if (sibling->fib6_nh.fib_nh_dev == dev)
+ if (sibling->fib6_nh->fib_nh_dev == dev)
return true;
}
}
@@ -4855,33 +5553,131 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i,
return false;
}
-int rt6_dump_route(struct fib6_info *rt, void *p_arg)
+struct fib6_nh_exception_dump_walker {
+ struct rt6_rtnl_dump_arg *dump;
+ struct fib6_info *rt;
+ unsigned int flags;
+ unsigned int skip;
+ unsigned int count;
+};
+
+static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
+{
+ struct fib6_nh_exception_dump_walker *w = arg;
+ struct rt6_rtnl_dump_arg *dump = w->dump;
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ int i, err;
+
+ bucket = fib6_nh_get_excptn_bucket(nh, NULL);
+ if (!bucket)
+ return 0;
+
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+ if (w->skip) {
+ w->skip--;
+ continue;
+ }
+
+ /* Expiration of entries doesn't bump sernum, insertion
+ * does. Removal is triggered by insertion, so we can
+ * rely on the fact that if entries change between two
+ * partial dumps, this node is scanned again completely,
+ * see rt6_insert_exception() and fib6_dump_table().
+ *
+ * Count expired entries we go through as handled
+ * entries that we'll skip next time, in case of partial
+ * node dump. Otherwise, if entries expire meanwhile,
+ * we'll skip the wrong amount.
+ */
+ if (rt6_check_expired(rt6_ex->rt6i)) {
+ w->count++;
+ continue;
+ }
+
+ err = rt6_fill_node(dump->net, dump->skb, w->rt,
+ &rt6_ex->rt6i->dst, NULL, NULL, 0,
+ RTM_NEWROUTE,
+ NETLINK_CB(dump->cb->skb).portid,
+ dump->cb->nlh->nlmsg_seq, w->flags);
+ if (err)
+ return err;
+
+ w->count++;
+ }
+ bucket++;
+ }
+
+ return 0;
+}
+
+/* Return -1 if done with node, number of handled routes on partial dump */
+int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
{
struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
struct fib_dump_filter *filter = &arg->filter;
unsigned int flags = NLM_F_MULTI;
struct net *net = arg->net;
+ int count = 0;
if (rt == net->ipv6.fib6_null_entry)
- return 0;
+ return -1;
if ((filter->flags & RTM_F_PREFIX) &&
!(rt->fib6_flags & RTF_PREFIX_RT)) {
/* success since this is not a prefix route */
- return 1;
+ return -1;
}
- if (filter->filter_set) {
- if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
- (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
- (filter->protocol && rt->fib6_protocol != filter->protocol)) {
- return 1;
- }
+ if (filter->filter_set &&
+ ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
+ (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
+ (filter->protocol && rt->fib6_protocol != filter->protocol))) {
+ return -1;
+ }
+
+ if (filter->filter_set ||
+ !filter->dump_routes || !filter->dump_exceptions) {
flags |= NLM_F_DUMP_FILTERED;
}
- return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
- RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
- arg->cb->nlh->nlmsg_seq, flags);
+ if (filter->dump_routes) {
+ if (skip) {
+ skip--;
+ } else {
+ if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
+ 0, RTM_NEWROUTE,
+ NETLINK_CB(arg->cb->skb).portid,
+ arg->cb->nlh->nlmsg_seq, flags)) {
+ return 0;
+ }
+ count++;
+ }
+ }
+
+ if (filter->dump_exceptions) {
+ struct fib6_nh_exception_dump_walker w = { .dump = arg,
+ .rt = rt,
+ .flags = flags,
+ .skip = skip,
+ .count = 0 };
+ int err;
+
+ rcu_read_lock();
+ if (rt->nh) {
+ err = nexthop_for_each_fib6_nh(rt->nh,
+ rt6_nh_dump_exceptions,
+ &w);
+ } else {
+ err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
+ }
+ rcu_read_unlock();
+
+ if (err)
+ return count += w.count;
+ }
+
+ return -1;
}
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
@@ -5126,6 +5922,38 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
+void fib6_rt_update(struct net *net, struct fib6_info *rt,
+ struct nl_info *info)
+{
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ /* call_fib6_entry_notifiers will be removed when in-kernel notifier
+ * is implemented and supported for nexthop objects
+ */
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL);
+
+ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+ if (!skb)
+ goto errout;
+
+ err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
+ RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
+ info->nlh, gfp_any());
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
+}
+
static int ip6_route_dev_notify(struct notifier_block *this,
unsigned long event, void *ptr)
{
@@ -5136,7 +5964,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
return NOTIFY_OK;
if (event == NETDEV_REGISTER) {
- net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
+ net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
net->ipv6.ip6_null_entry->dst.dev = dev;
net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -5330,11 +6158,11 @@ static int __net_init ip6_route_net_init(struct net *net)
if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
goto out_ip6_dst_ops;
- net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
- sizeof(*net->ipv6.fib6_null_entry),
- GFP_KERNEL);
+ net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
if (!net->ipv6.fib6_null_entry)
goto out_ip6_dst_entries;
+ memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
+ sizeof(*net->ipv6.fib6_null_entry));
net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
sizeof(*net->ipv6.ip6_null_entry),
@@ -5344,6 +6172,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
net->ipv6.fib6_has_custom_rules = false;
@@ -5355,6 +6184,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);
net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
sizeof(*net->ipv6.ip6_blk_hole_entry),
@@ -5364,6 +6194,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
#endif
net->ipv6.sysctl.flush_delay = 0;
@@ -5471,7 +6302,7 @@ void __init ip6_route_init_special_entries(void)
/* Registering of the loopback is done before this portion of code,
* the loopback reference in rt6_info will not be taken, do it
* manually for init_net */
- init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
+ init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index e15cd37024fd..dc4c91e0bfb8 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -23,6 +23,7 @@
static int zero;
static int one = 1;
+static int flowlabel_reflect_max = 0x7;
static int auto_flowlabels_min;
static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
@@ -113,7 +114,9 @@ static struct ctl_table ipv6_table_template[] = {
.data = &init_net.ipv6.sysctl.flowlabel_reflect,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &flowlabel_reflect_max,
},
{
.procname = "max_dst_opts_number",
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7a14ea37d2df..d56a9019a0fe 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -171,7 +171,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
struct ip6_flowlabel *flowlabel;
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
fl6_sock_release(flowlabel);
}
@@ -883,9 +883,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowi6_oif = oif;
}
- if (sk)
- mark = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_mark : sk->sk_mark;
+ if (sk) {
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ mark = inet_twsk(sk)->tw_mark;
+ /* autoflowlabel relies on buff->hash */
+ skb_set_hash(buff, inet_twsk(sk)->tw_txhash,
+ PKT_HASH_TYPE_L4);
+ } else {
+ mark = sk->sk_mark;
+ }
+ buff->tstamp = tcp_transmit_time(sk);
+ }
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
@@ -912,15 +920,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
u32 seq = 0, ack_seq = 0;
struct tcp_md5sig_key *key = NULL;
#ifdef CONFIG_TCP_MD5SIG
const __u8 *hash_location = NULL;
- struct ipv6hdr *ipv6h = ipv6_hdr(skb);
unsigned char newhash[16];
int genhash;
struct sock *sk1 = NULL;
#endif
+ __be32 label = 0;
+ struct net *net;
int oif = 0;
if (th->rst)
@@ -932,6 +942,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
if (!sk && !ipv6_unicast_destination(skb))
return;
+ net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
@@ -945,7 +956,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
+ sk1 = inet6_lookup_listener(net,
&tcp_hashinfo, NULL, 0,
&ipv6h->saddr,
th->source, &ipv6h->daddr,
@@ -975,9 +986,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
oif = sk->sk_bound_dev_if;
if (sk_fullsock(sk))
trace_tcp_send_reset(sk, skb);
+ if (sk->sk_state == TCP_TIME_WAIT)
+ label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
+ } else {
+ if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET)
+ label = ip6_flowlabel(ipv6h);
}
- tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
+ tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0,
+ label);
#ifdef CONFIG_TCP_MD5SIG
out:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 70b01bd95022..827fe7385078 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -54,16 +54,6 @@
#include <trace/events/skb.h>
#include "udp_impl.h"
-static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
-{
-#if defined(CONFIG_NET_L3_MASTER_DEV)
- if (!net->ipv4.sysctl_udp_l3mdev_accept &&
- skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
- return true;
-#endif
- return false;
-}
-
static u32 udp6_ehashfn(const struct net *net,
const struct in6_addr *laddr,
const u16 lport,
@@ -111,7 +101,7 @@ void udp_v6_rehash(struct sock *sk)
static int compute_score(struct sock *sk, struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned short hnum,
- int dif, int sdif, bool exact_dif)
+ int dif, int sdif)
{
int score;
struct inet_sock *inet;
@@ -155,8 +145,8 @@ static int compute_score(struct sock *sk, struct net *net,
static struct sock *udp6_lib_lookup2(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned int hnum,
- int dif, int sdif, bool exact_dif,
- struct udp_hslot *hslot2, struct sk_buff *skb)
+ int dif, int sdif, struct udp_hslot *hslot2,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
int score, badness;
@@ -166,7 +156,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
badness = -1;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif);
+ daddr, hnum, dif, sdif);
if (score > badness) {
if (sk->sk_reuseport) {
hash = udp6_ehashfn(net, daddr, hnum,
@@ -195,14 +185,13 @@ struct sock *__udp6_lib_lookup(struct net *net,
unsigned int hash2, slot2;
struct udp_hslot *hslot2;
struct sock *result;
- bool exact_dif = udp6_lib_exact_dif_match(net, skb);
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
result = udp6_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif,
+ daddr, hnum, dif, sdif,
hslot2, skb);
if (!result) {
hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
@@ -212,10 +201,9 @@ struct sock *__udp6_lib_lookup(struct net *net,
result = udp6_lib_lookup2(net, saddr, sport,
&in6addr_any, hnum, dif, sdif,
- exact_dif, hslot2,
- skb);
+ hslot2, skb);
}
- if (unlikely(IS_ERR(result)))
+ if (IS_ERR(result))
return NULL;
return result;
}
@@ -838,8 +826,7 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
int ret;
if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
- skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
- ip6_compute_pseudo);
+ skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo);
ret = udpv6_queue_rcv_skb(sk, skb);
@@ -1332,7 +1319,7 @@ do_udp_sendmsg:
fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
}
@@ -1384,7 +1371,7 @@ do_udp_sendmsg:
}
if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
if (!(opt->opt_nflen|opt->opt_flen))
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 5bdca3d5d6b7..78daadecbdef 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -21,137 +21,6 @@
#include <net/ipv6.h>
#include <net/addrconf.h>
-static void
-__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
-{
- const struct flowi6 *fl6 = &fl->u.ip6;
-
- /* Initialize temporary selector matching only
- * to current session. */
- *(struct in6_addr *)&sel->daddr = fl6->daddr;
- *(struct in6_addr *)&sel->saddr = fl6->saddr;
- sel->dport = xfrm_flowi_dport(fl, &fl6->uli);
- sel->dport_mask = htons(0xffff);
- sel->sport = xfrm_flowi_sport(fl, &fl6->uli);
- sel->sport_mask = htons(0xffff);
- sel->family = AF_INET6;
- sel->prefixlen_d = 128;
- sel->prefixlen_s = 128;
- sel->proto = fl6->flowi6_proto;
- sel->ifindex = fl6->flowi6_oif;
-}
-
-static void
-xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
- const xfrm_address_t *daddr, const xfrm_address_t *saddr)
-{
- x->id = tmpl->id;
- if (ipv6_addr_any((struct in6_addr *)&x->id.daddr))
- memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
- memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
- if (ipv6_addr_any((struct in6_addr *)&x->props.saddr))
- memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
- x->props.mode = tmpl->mode;
- x->props.reqid = tmpl->reqid;
- x->props.family = AF_INET6;
-}
-
-/* distribution counting sort function for xfrm_state and xfrm_tmpl */
-static int
-__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
-{
- int count[XFRM_MAX_DEPTH] = { };
- int class[XFRM_MAX_DEPTH];
- int i;
-
- for (i = 0; i < n; i++) {
- int c;
- class[i] = c = cmp(src[i]);
- count[c]++;
- }
-
- for (i = 2; i < maxclass; i++)
- count[i] += count[i - 1];
-
- for (i = 0; i < n; i++) {
- dst[count[class[i] - 1]++] = src[i];
- src[i] = NULL;
- }
-
- return 0;
-}
-
-/*
- * Rule for xfrm_state:
- *
- * rule 1: select IPsec transport except AH
- * rule 2: select MIPv6 RO or inbound trigger
- * rule 3: select IPsec transport AH
- * rule 4: select IPsec tunnel
- * rule 5: others
- */
-static int __xfrm6_state_sort_cmp(void *p)
-{
- struct xfrm_state *v = p;
-
- switch (v->props.mode) {
- case XFRM_MODE_TRANSPORT:
- if (v->id.proto != IPPROTO_AH)
- return 1;
- else
- return 3;
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- case XFRM_MODE_ROUTEOPTIMIZATION:
- case XFRM_MODE_IN_TRIGGER:
- return 2;
-#endif
- case XFRM_MODE_TUNNEL:
- case XFRM_MODE_BEET:
- return 4;
- }
- return 5;
-}
-
-static int
-__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n)
-{
- return __xfrm6_sort((void **)dst, (void **)src, n,
- __xfrm6_state_sort_cmp, 6);
-}
-
-/*
- * Rule for xfrm_tmpl:
- *
- * rule 1: select IPsec transport
- * rule 2: select MIPv6 RO or inbound trigger
- * rule 3: select IPsec tunnel
- * rule 4: others
- */
-static int __xfrm6_tmpl_sort_cmp(void *p)
-{
- struct xfrm_tmpl *v = p;
- switch (v->mode) {
- case XFRM_MODE_TRANSPORT:
- return 1;
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- case XFRM_MODE_ROUTEOPTIMIZATION:
- case XFRM_MODE_IN_TRIGGER:
- return 2;
-#endif
- case XFRM_MODE_TUNNEL:
- case XFRM_MODE_BEET:
- return 3;
- }
- return 4;
-}
-
-static int
-__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n)
-{
- return __xfrm6_sort((void **)dst, (void **)src, n,
- __xfrm6_tmpl_sort_cmp, 5);
-}
-
int xfrm6_extract_header(struct sk_buff *skb)
{
struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -171,12 +40,6 @@ int xfrm6_extract_header(struct sk_buff *skb)
static struct xfrm_state_afinfo xfrm6_state_afinfo = {
.family = AF_INET6,
.proto = IPPROTO_IPV6,
- .eth_proto = htons(ETH_P_IPV6),
- .owner = THIS_MODULE,
- .init_tempsel = __xfrm6_init_tempsel,
- .init_temprop = xfrm6_init_temprop,
- .tmpl_sort = __xfrm6_tmpl_sort,
- .state_sort = __xfrm6_state_sort,
.output = xfrm6_output,
.output_finish = xfrm6_output_finish,
.extract_input = xfrm6_extract_input,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index a50dd6f34b91..b67ed3a8486c 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -928,8 +928,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x,
pfkey_sockaddr_fill(&x->props.saddr, 0,
(struct sockaddr *) (addr + 1),
x->props.family);
- if (!addr->sadb_address_prefixlen)
- BUG();
+ BUG_ON(!addr->sadb_address_prefixlen);
/* dst address */
addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size);
@@ -944,8 +943,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x,
pfkey_sockaddr_fill(&x->id.daddr, 0,
(struct sockaddr *) (addr + 1),
x->props.family);
- if (!addr->sadb_address_prefixlen)
- BUG();
+ BUG_ON(!addr->sadb_address_prefixlen);
if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr,
x->props.family)) {
@@ -2438,8 +2436,10 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc
goto out;
}
err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
- if (err < 0)
+ if (err < 0) {
+ kfree_skb(out_skb);
goto out;
+ }
out_hdr = (struct sadb_msg *) out_skb->data;
out_hdr->sadb_msg_version = hdr->sadb_msg_version;
@@ -2690,8 +2690,10 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr)
return PTR_ERR(out_skb);
err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
- if (err < 0)
+ if (err < 0) {
+ kfree_skb(out_skb);
return err;
+ }
out_hdr = (struct sadb_msg *) out_skb->data;
out_hdr->sadb_msg_version = pfk->dump.msg_version;
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 6e2b4b9267e1..35bb4f3bdbe0 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -31,7 +31,6 @@
#include "l2tp_core.h"
static struct dentry *rootdir;
-static struct dentry *tunnels;
struct l2tp_dfs_seq_data {
struct net *net;
@@ -326,32 +325,18 @@ static const struct file_operations l2tp_dfs_fops = {
static int __init l2tp_debugfs_init(void)
{
- int rc = 0;
-
rootdir = debugfs_create_dir("l2tp", NULL);
- if (IS_ERR(rootdir)) {
- rc = PTR_ERR(rootdir);
- rootdir = NULL;
- goto out;
- }
- tunnels = debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops);
- if (tunnels == NULL)
- rc = -EIO;
+ debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops);
pr_info("L2TP debugfs support\n");
-out:
- if (rc)
- pr_warn("unable to init\n");
-
- return rc;
+ return 0;
}
static void __exit l2tp_debugfs_exit(void)
{
- debugfs_remove(tunnels);
- debugfs_remove(rootdir);
+ debugfs_remove_recursive(rootdir);
}
module_init(l2tp_debugfs_init);
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 1a76a0a4e3ab..687e23a8b326 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -536,7 +536,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK;
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (flowlabel == NULL)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
}
@@ -577,7 +577,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
if ((fl6.flowlabel & IPV6_FLOWLABEL_MASK) && !flowlabel) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (flowlabel == NULL)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
if (!(opt->opt_nflen|opt->opt_flen))
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index cfc9fcb97465..f35899d45a9a 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -118,6 +118,8 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
* local and multicast addresses
* @net: network namespace for device index lookup
* @fl6: IPv6 flow struct for lookup
+ * This function does not hold refcnt on the returned dst.
+ * Caller must hold rcu_read_lock().
*/
struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
@@ -126,9 +128,8 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
struct dst_entry *dst = NULL;
struct net_device *dev;
+ WARN_ON_ONCE(!rcu_read_lock_held());
if (fl6->flowi6_oif) {
- rcu_read_lock();
-
dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
if (dev && netif_is_l3_slave(dev))
dev = netdev_master_upper_dev_get_rcu(dev);
@@ -136,8 +137,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
if (dev && netif_is_l3_master(dev) &&
dev->l3mdev_ops->l3mdev_link_scope_lookup)
dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6);
-
- rcu_read_unlock();
}
return dst;
diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c
index 5d2d1f746b91..3c03f6512c5f 100644
--- a/net/lapb/lapb_iface.c
+++ b/net/lapb/lapb_iface.c
@@ -68,7 +68,6 @@ static void __lapb_remove_cb(struct lapb_cb *lapb)
lapb_put(lapb);
}
}
-EXPORT_SYMBOL(lapb_register);
/*
* Add a socket to the bound sockets list.
@@ -115,7 +114,6 @@ static struct lapb_cb *lapb_create_cb(void)
{
struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC);
-
if (!lapb)
goto out;
@@ -167,6 +165,7 @@ out:
write_unlock_bh(&lapb_list_lock);
return rc;
}
+EXPORT_SYMBOL(lapb_register);
int lapb_unregister(struct net_device *dev)
{
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 3fae902937fd..76cc9e967fa6 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -5,6 +5,7 @@
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2015 Intel Mobile Communications GmbH
* Copyright (C) 2015-2017 Intel Deutschland GmbH
+ * Copyright (C) 2018-2019 Intel Corporation
* Copyright (C) 2018 Intel Corporation
*/
@@ -974,7 +975,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
BSS_CHANGED_BEACON |
BSS_CHANGED_SSID |
BSS_CHANGED_P2P_PS |
- BSS_CHANGED_TXPOWER;
+ BSS_CHANGED_TXPOWER |
+ BSS_CHANGED_TWT;
int err;
int prev_beacon_int;
@@ -1044,6 +1046,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
sdata->vif.bss_conf.dtim_period = params->dtim_period;
sdata->vif.bss_conf.enable_beacon = true;
sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p;
+ sdata->vif.bss_conf.twt_responder = params->twt_responder;
sdata->vif.bss_conf.ssid_len = params->ssid_len;
if (params->ssid_len)
@@ -1465,7 +1468,7 @@ static int sta_apply_parameters(struct ieee80211_local *local,
return ret;
}
- if (params->supported_rates) {
+ if (params->supported_rates && params->supported_rates_len) {
ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef,
sband, params->supported_rates,
params->supported_rates_len,
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 271bc2b676a4..2e7f75938c51 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -272,6 +272,7 @@ static const char *hw_flag_names[] = {
FLAG(SUPPORTS_MULTI_BSSID),
FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID),
FLAG(EXT_KEY_ID_NATIVE),
+ FLAG(NO_AMPDU_KEYBORDER_SUPPORT),
#undef FLAG
};
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index 3509ce0daea3..7b8735ced2a1 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -339,9 +339,6 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key)
key->debugfs.dir = debugfs_create_dir(buf,
key->local->debugfs.keys);
- if (!key->debugfs.dir)
- return;
-
sta = key->sta;
if (sta) {
sprintf(buf, "../../netdev:%s/stations/%pM",
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index f1f2e1c7ac0c..b1438fd4d876 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -815,9 +815,8 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata)
sprintf(buf, "netdev:%s", sdata->name);
sdata->vif.debugfs_dir = debugfs_create_dir(buf,
sdata->local->hw.wiphy->debugfsdir);
- if (sdata->vif.debugfs_dir)
- sdata->debugfs.subdir_stations = debugfs_create_dir("stations",
- sdata->vif.debugfs_dir);
+ sdata->debugfs.subdir_stations = debugfs_create_dir("stations",
+ sdata->vif.debugfs_dir);
add_files(sdata);
}
@@ -842,8 +841,5 @@ void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata)
return;
sprintf(buf, "netdev:%s", sdata->name);
- if (!debugfs_rename(dir->d_parent, dir, dir->d_parent, buf))
- sdata_err(sdata,
- "debugfs: failed to rename debugfs dir to %s\n",
- buf);
+ debugfs_rename(dir->d_parent, dir, dir->d_parent, buf);
}
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 3fd79ccb293b..c8ad20c28c43 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -957,8 +957,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
* dir might still be around.
*/
sta->debugfs_dir = debugfs_create_dir(mac, stations_dir);
- if (!sta->debugfs_dir)
- return;
DEBUGFS_ADD(flags);
DEBUGFS_ADD(aid);
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 157ff5f890d2..dd60f6428049 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -269,50 +269,61 @@ int ieee80211_set_tx_key(struct ieee80211_key *key)
assert_key_lock(local);
sta->ptk_idx = key->conf.keyidx;
+
+ if (ieee80211_hw_check(&local->hw, NO_AMPDU_KEYBORDER_SUPPORT))
+ clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
ieee80211_check_fast_xmit(sta);
return 0;
}
-static int ieee80211_hw_key_replace(struct ieee80211_key *old_key,
- struct ieee80211_key *new_key,
- bool pairwise)
+static void ieee80211_pairwise_rekey(struct ieee80211_key *old,
+ struct ieee80211_key *new)
{
- struct ieee80211_sub_if_data *sdata;
- struct ieee80211_local *local;
- struct sta_info *sta;
- int ret;
-
- /* Aggregation sessions are OK when running on SW crypto.
- * A broken remote STA may cause issues not observed with HW
- * crypto, though.
- */
- if (!(old_key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
- return 0;
+ struct ieee80211_local *local = new->local;
+ struct sta_info *sta = new->sta;
+ int i;
- assert_key_lock(old_key->local);
- sta = old_key->sta;
+ assert_key_lock(local);
- /* Unicast rekey without Extended Key ID needs special handling */
- if (new_key && sta && pairwise &&
- rcu_access_pointer(sta->ptk[sta->ptk_idx]) == old_key) {
- local = old_key->local;
- sdata = old_key->sdata;
+ if (new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) {
+ /* Extended Key ID key install, initial one or rekey */
+
+ if (sta->ptk_idx != INVALID_PTK_KEYIDX &&
+ ieee80211_hw_check(&local->hw,
+ NO_AMPDU_KEYBORDER_SUPPORT)) {
+ /* Aggregation Sessions with Extended Key ID must not
+ * mix MPDUs with different keyIDs within one A-MPDU.
+ * Tear down any running Tx aggregation and all new
+ * Rx/Tx aggregation request during rekey if the driver
+ * asks us to do so. (Blocking Tx only would be
+ * sufficient but WLAN_STA_BLOCK_BA gets the job done
+ * for the few ms we need it.)
+ */
+ set_sta_flag(sta, WLAN_STA_BLOCK_BA);
+ mutex_lock(&sta->ampdu_mlme.mtx);
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++)
+ ___ieee80211_stop_tx_ba_session(sta, i,
+ AGG_STOP_LOCAL_REQUEST);
+ mutex_unlock(&sta->ampdu_mlme.mtx);
+ }
+ } else if (old) {
+ /* Rekey without Extended Key ID.
+ * Aggregation sessions are OK when running on SW crypto.
+ * A broken remote STA may cause issues not observed with HW
+ * crypto, though.
+ */
+ if (!(old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
+ return;
- /* Stop TX till we are on the new key */
- old_key->flags |= KEY_FLAG_TAINTED;
+ /* Stop Tx till we are on the new key */
+ old->flags |= KEY_FLAG_TAINTED;
ieee80211_clear_fast_xmit(sta);
-
- /* Aggregation sessions during rekey are complicated due to the
- * reorder buffer and retransmits. Side step that by blocking
- * aggregation during rekey and tear down running sessions.
- */
if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) {
set_sta_flag(sta, WLAN_STA_BLOCK_BA);
ieee80211_sta_tear_down_BA_sessions(sta,
AGG_STOP_LOCAL_REQUEST);
}
-
if (!wiphy_ext_feature_isset(local->hw.wiphy,
NL80211_EXT_FEATURE_CAN_REPLACE_PTK0)) {
pr_warn_ratelimited("Rekeying PTK for STA %pM but driver can't safely do that.",
@@ -320,18 +331,9 @@ static int ieee80211_hw_key_replace(struct ieee80211_key *old_key,
/* Flushing the driver queues *may* help prevent
* the clear text leaks and freezes.
*/
- ieee80211_flush_queues(local, sdata, false);
+ ieee80211_flush_queues(local, old->sdata, false);
}
}
-
- ieee80211_key_disable_hw_accel(old_key);
-
- if (new_key)
- ret = ieee80211_key_enable_hw_accel(new_key);
- else
- ret = 0;
-
- return ret;
}
static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
@@ -389,7 +391,6 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
mutex_unlock(&sdata->local->key_mtx);
}
-
static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta,
bool pairwise,
@@ -397,7 +398,7 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
struct ieee80211_key *new)
{
int idx;
- int ret;
+ int ret = 0;
bool defunikey, defmultikey, defmgmtkey;
/* caller must provide at least one old/new */
@@ -409,16 +410,27 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx);
+ if (new && sta && pairwise) {
+ /* Unicast rekey needs special handling. With Extended Key ID
+ * old is still NULL for the first rekey.
+ */
+ ieee80211_pairwise_rekey(old, new);
+ }
+
if (old) {
idx = old->conf.keyidx;
- ret = ieee80211_hw_key_replace(old, new, pairwise);
+
+ if (old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) {
+ ieee80211_key_disable_hw_accel(old);
+
+ if (new)
+ ret = ieee80211_key_enable_hw_accel(new);
+ }
} else {
/* new must be provided in case old is not */
idx = new->conf.keyidx;
if (!new->local->wowlan)
ret = ieee80211_key_enable_hw_accel(new);
- else
- ret = 0;
}
if (ret)
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 087c90e461b5..4c2702f128f3 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -352,11 +352,11 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
sdata_lock(sdata);
/* Copy the addresses to the bss_conf list */
- ifa = idev->ifa_list;
+ ifa = rtnl_dereference(idev->ifa_list);
while (ifa) {
if (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN)
bss_conf->arp_addr_list[c] = ifa->ifa_address;
- ifa = ifa->ifa_next;
+ ifa = rtnl_dereference(ifa->ifa_next);
c++;
}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index a00b703bfdfd..a99ad0325309 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3156,6 +3156,19 @@ static bool ieee80211_twt_req_supported(const struct sta_info *sta,
IEEE80211_HE_MAC_CAP0_TWT_RES;
}
+static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee802_11_elems *elems)
+{
+ bool twt = ieee80211_twt_req_supported(sta, elems);
+
+ if (sdata->vif.bss_conf.twt_requester != twt) {
+ sdata->vif.bss_conf.twt_requester = twt;
+ return BSS_CHANGED_TWT;
+ }
+ return 0;
+}
+
static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
struct cfg80211_bss *cbss,
struct ieee80211_mgmt *mgmt, size_t len)
@@ -3338,8 +3351,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
sta);
bss_conf->he_support = sta->sta.he_cap.has_he;
- bss_conf->twt_requester =
- ieee80211_twt_req_supported(sta, &elems);
+ changed |= ieee80211_recalc_twt_req(sdata, sta, &elems);
} else {
bss_conf->he_support = false;
bss_conf->twt_requester = false;
@@ -3999,6 +4011,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
mutex_lock(&local->sta_mtx);
sta = sta_info_get(sdata, bssid);
+ changed |= ieee80211_recalc_twt_req(sdata, sta, &elems);
+
if (ieee80211_config_bw(sdata, sta,
elems.ht_cap_elem, elems.ht_operation,
elems.vht_operation, elems.he_operation,
@@ -4949,7 +4963,12 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
basic_rates = BIT(min_rate_index);
}
- new_sta->sta.supp_rates[cbss->channel->band] = rates;
+ if (rates)
+ new_sta->sta.supp_rates[cbss->channel->band] = rates;
+ else
+ sdata_info(sdata,
+ "No rates found, keeping mandatory only\n");
+
sdata->vif.bss_conf.basic_rates = basic_rates;
/* cf. IEEE 802.11 9.2.12 */
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index 6e5961d7f639..60ef8972b254 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -199,6 +199,10 @@ static void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc)
cfg80211_remain_on_channel_expired(&roc->sdata->wdev,
roc->cookie, roc->chan,
GFP_KERNEL);
+ else
+ cfg80211_tx_mgmt_expired(&roc->sdata->wdev,
+ roc->mgmt_tx_cookie,
+ roc->chan, GFP_KERNEL);
list_del(&roc->list);
kfree(roc);
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index 47ee36677c2b..a1e9fc7878aa 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -354,8 +354,10 @@ static void __rate_control_send_low(struct ieee80211_hw *hw,
break;
}
WARN_ONCE(i == sband->n_bitrates,
- "no supported rates (0x%x) in rate_mask 0x%x with flags 0x%x\n",
+ "no supported rates for sta %pM (0x%x, band %d) in rate_mask 0x%x with flags 0x%x\n",
+ sta ? sta->addr : NULL,
sta ? sta->supp_rates[sband->band] : -1,
+ sband->band,
rate_mask, rate_flags);
info->control.rates[0].count =
@@ -366,9 +368,8 @@ static void __rate_control_send_low(struct ieee80211_hw *hw,
}
-bool rate_control_send_low(struct ieee80211_sta *pubsta,
- void *priv_sta,
- struct ieee80211_tx_rate_control *txrc)
+static bool rate_control_send_low(struct ieee80211_sta *pubsta,
+ struct ieee80211_tx_rate_control *txrc)
{
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
struct ieee80211_supported_band *sband = txrc->sband;
@@ -376,7 +377,7 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta,
int mcast_rate;
bool use_basicrate = false;
- if (!pubsta || !priv_sta || rc_no_data_or_no_ack_use_min(txrc)) {
+ if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) {
__rate_control_send_low(txrc->hw, sband, pubsta, info,
txrc->rate_idx_mask);
@@ -402,7 +403,6 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta,
}
return false;
}
-EXPORT_SYMBOL(rate_control_send_low);
static bool rate_idx_match_legacy_mask(s8 *rate_idx, int n_bitrates, u32 mask)
{
@@ -885,26 +885,29 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
int i;
- if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) {
- ista = &sta->sta;
- priv_sta = sta->rate_ctrl_priv;
- }
-
for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
info->control.rates[i].idx = -1;
info->control.rates[i].flags = 0;
info->control.rates[i].count = 0;
}
+ if (rate_control_send_low(sta ? &sta->sta : NULL, txrc))
+ return;
+
if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL))
return;
+ if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) {
+ ista = &sta->sta;
+ priv_sta = sta->rate_ctrl_priv;
+ }
+
if (ista) {
spin_lock_bh(&sta->rate_ctrl_lock);
ref->ops->get_rate(ref->priv, ista, priv_sta, txrc);
spin_unlock_bh(&sta->rate_ctrl_lock);
} else {
- ref->ops->get_rate(ref->priv, NULL, NULL, txrc);
+ rate_control_send_low(NULL, txrc);
}
if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE))
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index a34e9c2ca626..ee86c3333999 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -340,10 +340,6 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
int delta;
int sampling_ratio;
- /* management/no-ack frames do not use rate control */
- if (rate_control_send_low(sta, priv_sta, txrc))
- return;
-
/* check multi-rate-retry capabilities & adjust lookaround_rate */
mrr_capable = mp->has_mrr &&
!txrc->rts &&
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 298a1acb3ce5..5a882da82f0e 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -1093,9 +1093,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
struct minstrel_priv *mp = priv;
int sample_idx;
- if (rate_control_send_low(sta, priv_sta, txrc))
- return;
-
if (!msp->is_ht)
return mac80211_minstrel.get_rate(priv, sta, &msp->legacy, txrc);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 187f62a48b2b..95eb8220e2e4 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -4,7 +4,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018 Intel Corporation
+ * Copyright (C) 2018-2019 Intel Corporation
*/
#include <linux/module.h>
@@ -401,6 +401,47 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
for (i = 0; i < IEEE80211_NUM_TIDS; i++)
sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX);
+ for (i = 0; i < NUM_NL80211_BANDS; i++) {
+ u32 mandatory = 0;
+ int r;
+
+ if (!hw->wiphy->bands[i])
+ continue;
+
+ switch (i) {
+ case NL80211_BAND_2GHZ:
+ /*
+ * We use both here, even if we cannot really know for
+ * sure the station will support both, but the only use
+ * for this is when we don't know anything yet and send
+ * management frames, and then we'll pick the lowest
+ * possible rate anyway.
+ * If we don't include _G here, we cannot find a rate
+ * in P2P, and thus trigger the WARN_ONCE() in rate.c
+ */
+ mandatory = IEEE80211_RATE_MANDATORY_B |
+ IEEE80211_RATE_MANDATORY_G;
+ break;
+ case NL80211_BAND_5GHZ:
+ mandatory = IEEE80211_RATE_MANDATORY_A;
+ break;
+ case NL80211_BAND_60GHZ:
+ WARN_ON(1);
+ mandatory = 0;
+ break;
+ }
+
+ for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) {
+ struct ieee80211_rate *rate;
+
+ rate = &hw->wiphy->bands[i]->bitrates[r];
+
+ if (!(rate->flags & mandatory))
+ continue;
+ sta->sta.supp_rates[i] |= BIT(r);
+ }
+ }
+
sta->sta.smps_mode = IEEE80211_SMPS_OFF;
if (sdata->vif.type == NL80211_IFTYPE_AP ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 1837734ce85b..32a45c03786e 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -651,6 +651,17 @@ config NFT_TPROXY
help
This makes transparent proxy support available in nftables.
+config NFT_SYNPROXY
+ tristate "Netfilter nf_tables SYNPROXY expression support"
+ depends on NF_CONNTRACK && NETFILTER_ADVANCED
+ select NETFILTER_SYNPROXY
+ select SYN_COOKIES
+ help
+ The SYNPROXY expression allows you to intercept TCP connections and
+ establish them using syncookies before they are passed on to the
+ server. This allows to avoid conntrack and server resource usage
+ during SYN-flood attacks.
+
if NF_TABLES_NETDEV
config NF_DUP_NETDEV
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 72cca6b48960..9270a7fae484 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -78,7 +78,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \
- nft_chain_route.o
+ nft_chain_route.o nf_tables_offload.o
nf_tables_set-objs := nf_tables_set_core.o \
nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o
@@ -110,6 +110,7 @@ obj-$(CONFIG_NFT_SOCKET) += nft_socket.o
obj-$(CONFIG_NFT_OSF) += nft_osf.o
obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o
obj-$(CONFIG_NFT_XFRM) += nft_xfrm.o
+obj-$(CONFIG_NFT_SYNPROXY) += nft_synproxy.o
obj-$(CONFIG_NFT_NAT) += nft_chain_nat.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index b96fd3f54705..5d5bdf450091 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -520,7 +520,7 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
ret = -EPERM;
return ret;
case NF_QUEUE:
- ret = nf_queue(skb, state, e, s, verdict);
+ ret = nf_queue(skb, state, s, verdict);
if (ret == 1)
continue;
return ret;
@@ -536,28 +536,6 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
}
EXPORT_SYMBOL(nf_hook_slow);
-
-int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)
-{
- if (writable_len > skb->len)
- return 0;
-
- /* Not exclusive use of packet? Must copy. */
- if (!skb_cloned(skb)) {
- if (writable_len <= skb_headlen(skb))
- return 1;
- } else if (skb_clone_writable(skb, writable_len))
- return 1;
-
- if (writable_len <= skb_headlen(skb))
- writable_len = 0;
- else
- writable_len -= skb_headlen(skb);
-
- return !!__pskb_pull_tail(skb, writable_len);
-}
-EXPORT_SYMBOL(skb_make_writable);
-
/* This needs to be compiled in any case to avoid dependencies between the
* nfnetlink_queue code and nf_conntrack.
*/
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 8acc4e173167..063df74b4647 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
#ifndef __IP_SET_BITMAP_IP_GEN_H
#define __IP_SET_BITMAP_IP_GEN_H
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index e3884b0cca91..11ff9d4a7006 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
/* Kernel module implementing an IP set type: the bitmap:ip type */
@@ -28,7 +28,7 @@
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:ip");
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index b73c37b3a791..ca7ac4a25ada 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -2,7 +2,6 @@
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
* Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*/
/* Kernel module implementing an IP set type: the bitmap:ip,mac type */
@@ -28,7 +27,7 @@
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:ip,mac");
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index d8c140553379..704a0dda1609 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the bitmap:port type */
@@ -23,7 +22,7 @@
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:port");
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 3cdf171cd468..2e151856ad99 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
/* Kernel module for IP set management */
@@ -48,7 +48,7 @@ static unsigned int max_sets;
module_param(max_sets, int, 0600);
MODULE_PARM_DESC(max_sets, "maximal number of sets");
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
MODULE_DESCRIPTION("core IP set support");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
@@ -1290,11 +1290,13 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
struct nlattr *attr = (void *)nlh + min_len;
u32 dump_type;
ip_set_id_t index;
+ int ret;
- /* Second pass, so parser can't fail */
- nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr,
- nlh->nlmsg_len - min_len, ip_set_setname_policy,
- NULL);
+ ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr,
+ nlh->nlmsg_len - min_len,
+ ip_set_setname_policy, NULL);
+ if (ret)
+ return ret;
cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]);
if (cda[IPSET_ATTR_SETNAME]) {
@@ -1541,10 +1543,14 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
cmdattr = (void *)&errmsg->msg + min_len;
- nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr,
- nlh->nlmsg_len - min_len,
- ip_set_adt_policy, NULL);
+ ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr,
+ nlh->nlmsg_len - min_len,
+ ip_set_adt_policy, NULL);
+ if (ret) {
+ nlmsg_free(skb2);
+ return ret;
+ }
errline = nla_data(cda[IPSET_ATTR_LINENO]);
*errline = lineno;
@@ -1558,10 +1564,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
return ret;
}
-static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_ad(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ enum ipset_adt adt,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[],
+ struct netlink_ext_ack *extack)
{
struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *set;
@@ -1590,18 +1598,17 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb,
if (attr[IPSET_ATTR_DATA]) {
if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags,
+ ret = call_ad(ctnl, skb, set, tb, adt, flags,
use_lineno);
} else {
int nla_rem;
nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
- memset(tb, 0, sizeof(tb));
if (nla_type(nla) != IPSET_ATTR_DATA ||
!flag_nested(nla) ||
nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, IPSET_ADD,
+ ret = call_ad(ctnl, skb, set, tb, adt,
flags, use_lineno);
if (ret < 0)
return ret;
@@ -1610,56 +1617,22 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb,
return ret;
}
-static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int ip_set_uadd(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const attr[],
struct netlink_ext_ack *extack)
{
- struct ip_set_net *inst = ip_set_pernet(net);
- struct ip_set *set;
- struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
- const struct nlattr *nla;
- u32 flags = flag_exist(nlh);
- bool use_lineno;
- int ret = 0;
-
- if (unlikely(protocol_min_failed(attr) ||
- !attr[IPSET_ATTR_SETNAME] ||
- !((attr[IPSET_ATTR_DATA] != NULL) ^
- (attr[IPSET_ATTR_ADT] != NULL)) ||
- (attr[IPSET_ATTR_DATA] &&
- !flag_nested(attr[IPSET_ATTR_DATA])) ||
- (attr[IPSET_ATTR_ADT] &&
- (!flag_nested(attr[IPSET_ATTR_ADT]) ||
- !attr[IPSET_ATTR_LINENO]))))
- return -IPSET_ERR_PROTOCOL;
-
- set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (!set)
- return -ENOENT;
-
- use_lineno = !!attr[IPSET_ATTR_LINENO];
- if (attr[IPSET_ATTR_DATA]) {
- if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL))
- return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags,
- use_lineno);
- } else {
- int nla_rem;
+ return ip_set_ad(net, ctnl, skb,
+ IPSET_ADD, nlh, attr, extack);
+}
- nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
- memset(tb, 0, sizeof(*tb));
- if (nla_type(nla) != IPSET_ATTR_DATA ||
- !flag_nested(nla) ||
- nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL))
- return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, IPSET_DEL,
- flags, use_lineno);
- if (ret < 0)
- return ret;
- }
- }
- return ret;
+static int ip_set_udel(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[],
+ struct netlink_ext_ack *extack)
+{
+ return ip_set_ad(net, ctnl, skb,
+ IPSET_DEL, nlh, attr, extack);
}
static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
index 2384e36aef5c..2b8f959574b4 100644
--- a/net/netfilter/ipset/ip_set_getport.c
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -1,5 +1,9 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
*/
/* Get Layer-4 data from the packets */
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 10f619625abd..0feb77fa9edc 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
#ifndef _IP_SET_HASH_GEN_H
#define _IP_SET_HASH_GEN_H
@@ -622,7 +621,7 @@ retry:
goto cleanup;
}
m->size = AHASH_INIT_SIZE;
- extsize = ext_size(AHASH_INIT_SIZE, dsize);
+ extsize += ext_size(AHASH_INIT_SIZE, dsize);
RCU_INIT_POINTER(hbucket(t, key), m);
} else if (m->pos >= m->size) {
struct hbucket *ht;
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 69d7576be2e6..f4432d9fcad0 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip type */
@@ -27,7 +26,7 @@
#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip");
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index 6fe1ec0d2154..7a1734aad0c5 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,mark type */
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 74ec7e097e34..32e240658334 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,port type */
@@ -29,7 +28,7 @@
#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip,port");
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index ced57d63b01f..15d419353179 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,port,ip type */
@@ -29,7 +28,7 @@
#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip,port,ip");
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 905f6cf0f55e..7a4d7afd4121 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,port,net type */
@@ -31,7 +30,7 @@
#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip,port,net");
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index 853e772ab4d9..d94c585d33c5 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:mac type */
@@ -20,7 +19,7 @@
#define IPSET_TYPE_REV_MAX 0
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:mac");
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 06c91e49bf25..c259cbc3ef45 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:net type */
@@ -28,7 +27,7 @@
#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net");
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 0a8cbcdfb42b..87b29f971226 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:net,iface type */
@@ -29,7 +28,7 @@
#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net,iface");
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index 832e4f5491cb..a3ae69bfee66 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
* Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
*/
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index a4f3f15b874a..799f2272cc65 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:net,port type */
@@ -30,7 +29,7 @@
#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net,port");
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index e54d415405f3..a82b70e8b9a6 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,port,net type */
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 8ada318bf09d..6f9ead6319e0 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
+/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the list:set type */
@@ -19,7 +18,7 @@
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_list:set");
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index bfd4365a8d73..4515056ef1c2 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -358,7 +358,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
struct tcphdr *th;
__u32 seq;
- if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+ if (skb_ensure_writable(skb, tcp_offset + sizeof(*th)))
return 0;
th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
@@ -435,7 +435,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
struct tcphdr *th;
__u32 seq;
- if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+ if (skb_ensure_writable(skb, tcp_offset + sizeof(*th)))
return 0;
th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 7138556b206b..46f06f92ab8f 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -34,6 +34,8 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
+#include <net/gue.h>
+#include <net/gre.h>
#include <net/route.h>
#include <net/ip6_checksum.h>
#include <net/netns/generic.h> /* net_generic() */
@@ -892,7 +894,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
IPPROTO_SCTP == protocol)
offset += 2 * sizeof(__u16);
- if (!skb_make_writable(skb, offset))
+ if (skb_ensure_writable(skb, offset))
goto out;
#ifdef CONFIG_IP_VS_IPV6
@@ -1282,7 +1284,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");
- if (!skb_make_writable(skb, iph->len))
+ if (skb_ensure_writable(skb, iph->len))
goto drop;
/* mangle the packet */
@@ -1574,6 +1576,73 @@ ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
return 1;
}
+/* Check the UDP tunnel and return its header length */
+static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
+ unsigned int offset, __u16 af,
+ const union nf_inet_addr *daddr, __u8 *proto)
+{
+ struct udphdr _udph, *udph;
+ struct ip_vs_dest *dest;
+
+ udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
+ if (!udph)
+ goto unk;
+ offset += sizeof(struct udphdr);
+ dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest);
+ if (!dest)
+ goto unk;
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ struct guehdr _gueh, *gueh;
+
+ gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh);
+ if (!gueh)
+ goto unk;
+ if (gueh->control != 0 || gueh->version != 0)
+ goto unk;
+ /* Later we can support also IPPROTO_IPV6 */
+ if (gueh->proto_ctype != IPPROTO_IPIP)
+ goto unk;
+ *proto = gueh->proto_ctype;
+ return sizeof(struct udphdr) + sizeof(struct guehdr) +
+ (gueh->hlen << 2);
+ }
+
+unk:
+ return 0;
+}
+
+/* Check the GRE tunnel and return its header length */
+static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
+ unsigned int offset, __u16 af,
+ const union nf_inet_addr *daddr, __u8 *proto)
+{
+ struct gre_base_hdr _greh, *greh;
+ struct ip_vs_dest *dest;
+
+ greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh);
+ if (!greh)
+ goto unk;
+ dest = ip_vs_find_tunnel(ipvs, af, daddr, 0);
+ if (!dest)
+ goto unk;
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ __be16 type;
+
+ /* Only support version 0 and C (csum) */
+ if ((greh->flags & ~GRE_CSUM) != 0)
+ goto unk;
+ type = greh->protocol;
+ /* Later we can support also IPPROTO_IPV6 */
+ if (type != htons(ETH_P_IP))
+ goto unk;
+ *proto = IPPROTO_IPIP;
+ return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags));
+ }
+
+unk:
+ return 0;
+}
+
/*
* Handle ICMP messages in the outside-to-inside direction (incoming).
* Find any that might be relevant, check against existing connections,
@@ -1593,6 +1662,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
struct ip_vs_proto_data *pd;
unsigned int offset, offset2, ihl, verdict;
bool ipip, new_cp = false;
+ union nf_inet_addr *raddr;
*related = 1;
@@ -1631,20 +1701,56 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
+ raddr = (union nf_inet_addr *)&cih->daddr;
/* Special case for errors for IPIP packets */
ipip = false;
if (cih->protocol == IPPROTO_IPIP) {
+ struct ip_vs_dest *dest;
+
if (unlikely(cih->frag_off & htons(IP_OFFSET)))
return NF_ACCEPT;
/* Error for our IPIP must arrive at LOCAL_IN */
if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
return NF_ACCEPT;
+ dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0);
+ /* Only for known tunnel */
+ if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP)
+ return NF_ACCEPT;
offset += cih->ihl * 4;
cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
ipip = true;
+ } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */
+ cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */
+ /* Error for our tunnel must arrive at LOCAL_IN */
+ (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) {
+ __u8 iproto;
+ int ulen;
+
+ /* Non-first fragment has no UDP header */
+ if (unlikely(cih->frag_off & htons(IP_OFFSET)))
+ return NF_ACCEPT;
+ offset2 = offset + cih->ihl * 4;
+ if (cih->protocol == IPPROTO_UDP)
+ ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET,
+ raddr, &iproto);
+ else
+ ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET,
+ raddr, &iproto);
+ if (ulen > 0) {
+ /* Skip IP and UDP/GRE tunnel headers */
+ offset = offset2 + ulen;
+ /* Now we should be at the original IP header */
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph),
+ &_ciph);
+ if (cih && cih->version == 4 && cih->ihl >= 5 &&
+ iproto == IPPROTO_IPIP)
+ ipip = true;
+ else
+ return NF_ACCEPT;
+ }
}
pd = ip_vs_proto_data_get(ipvs, cih->protocol);
@@ -2245,7 +2351,6 @@ static const struct nf_hook_ops ip_vs_ops[] = {
static int __net_init __ip_vs_init(struct net *net)
{
struct netns_ipvs *ipvs;
- int ret;
ipvs = net_generic(net, ip_vs_net_id);
if (ipvs == NULL)
@@ -2277,17 +2382,11 @@ static int __net_init __ip_vs_init(struct net *net)
if (ip_vs_sync_net_init(ipvs) < 0)
goto sync_fail;
- ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
- if (ret < 0)
- goto hook_fail;
-
return 0;
/*
* Error handling
*/
-hook_fail:
- ip_vs_sync_net_cleanup(ipvs);
sync_fail:
ip_vs_conn_net_cleanup(ipvs);
conn_fail:
@@ -2317,6 +2416,19 @@ static void __net_exit __ip_vs_cleanup(struct net *net)
net->ipvs = NULL;
}
+static int __net_init __ip_vs_dev_init(struct net *net)
+{
+ int ret;
+
+ ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ if (ret < 0)
+ goto hook_fail;
+ return 0;
+
+hook_fail:
+ return ret;
+}
+
static void __net_exit __ip_vs_dev_cleanup(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
@@ -2336,6 +2448,7 @@ static struct pernet_operations ipvs_core_ops = {
};
static struct pernet_operations ipvs_core_dev_ops = {
+ .init = __ip_vs_dev_init,
.exit = __ip_vs_dev_cleanup,
};
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 776c87ed4813..07e0967bf129 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -510,15 +510,37 @@ static inline unsigned int ip_vs_rs_hashkey(int af,
static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
unsigned int hash;
+ __be16 port;
if (dest->in_rs_table)
return;
+ switch (IP_VS_DFWD_METHOD(dest)) {
+ case IP_VS_CONN_F_MASQ:
+ port = dest->port;
+ break;
+ case IP_VS_CONN_F_TUNNEL:
+ switch (dest->tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ port = dest->tun_port;
+ break;
+ case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
+ case IP_VS_CONN_F_TUNNEL_TYPE_GRE:
+ port = 0;
+ break;
+ default:
+ return;
+ }
+ break;
+ default:
+ return;
+ }
+
/*
* Hash by proto,addr,port,
* which are the parameters of the real service.
*/
- hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
+ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port);
hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
dest->in_rs_table = 1;
@@ -550,7 +572,8 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
if (dest->port == dport &&
dest->af == af &&
ip_vs_addr_equal(af, &dest->addr, daddr) &&
- (dest->protocol == protocol || dest->vfwmark)) {
+ (dest->protocol == protocol || dest->vfwmark) &&
+ IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
/* HIT */
return true;
}
@@ -580,7 +603,37 @@ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
if (dest->port == dport &&
dest->af == af &&
ip_vs_addr_equal(af, &dest->addr, daddr) &&
- (dest->protocol == protocol || dest->vfwmark)) {
+ (dest->protocol == protocol || dest->vfwmark) &&
+ IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
+ /* HIT */
+ return dest;
+ }
+ }
+
+ return NULL;
+}
+
+/* Find real service record by <af,addr,tun_port>.
+ * In case of multiple records with the same <af,addr,tun_port>, only
+ * the first found record is returned.
+ *
+ * To be called under RCU lock.
+ */
+struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
+ const union nf_inet_addr *daddr,
+ __be16 tun_port)
+{
+ struct ip_vs_dest *dest;
+ unsigned int hash;
+
+ /* Check for "full" addressed entries */
+ hash = ip_vs_rs_hashkey(af, daddr, tun_port);
+
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+ if (dest->tun_port == tun_port &&
+ dest->af == af &&
+ ip_vs_addr_equal(af, &dest->addr, daddr) &&
+ IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) {
/* HIT */
return dest;
}
@@ -826,24 +879,29 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
conn_flags |= IP_VS_CONN_F_INACTIVE;
+ /* Need to rehash? */
+ if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) !=
+ IP_VS_DFWD_METHOD(dest) ||
+ udest->tun_type != dest->tun_type ||
+ udest->tun_port != dest->tun_port)
+ ip_vs_rs_unhash(dest);
+
/* set the tunnel info */
dest->tun_type = udest->tun_type;
dest->tun_port = udest->tun_port;
+ dest->tun_flags = udest->tun_flags;
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
- /*
- * Put the real service in rs_table if not present.
- * For now only for NAT!
- */
- ip_vs_rs_hash(ipvs, dest);
/* FTP-NAT requires conntrack for mangling */
if (svc->port == FTPPORT)
ip_vs_register_conntrack(svc);
}
atomic_set(&dest->conn_flags, conn_flags);
+ /* Put the real service in rs_table if not present. */
+ ip_vs_rs_hash(ipvs, dest);
/* bind the service */
old_svc = rcu_dereference_protected(dest->svc, 1);
@@ -2396,9 +2454,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
cfg.syncid = dm->syncid;
ret = start_sync_thread(ipvs, &cfg, dm->state);
} else {
- mutex_lock(&ipvs->sync_mutex);
ret = stop_sync_thread(ipvs, dm->state);
- mutex_unlock(&ipvs->sync_mutex);
}
goto out_dec;
}
@@ -2906,6 +2962,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
[IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
[IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 },
[IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 },
+ [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 },
};
static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
@@ -3212,6 +3269,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
dest->tun_type) ||
nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
dest->tun_port) ||
+ nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS,
+ dest->tun_flags) ||
nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
@@ -3332,7 +3391,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
- *nla_l_thresh, *nla_tun_type, *nla_tun_port;
+ *nla_l_thresh, *nla_tun_type, *nla_tun_port,
+ *nla_tun_flags;
nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
@@ -3340,6 +3400,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE];
nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT];
+ nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS];
if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
return -EINVAL;
@@ -3355,6 +3416,9 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
if (nla_tun_port)
udest->tun_port = nla_get_be16(nla_tun_port);
+
+ if (nla_tun_flags)
+ udest->tun_flags = nla_get_u16(nla_tun_flags);
}
return 0;
@@ -3515,10 +3579,8 @@ static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
if (!attrs[IPVS_DAEMON_ATTR_STATE])
return -EINVAL;
- mutex_lock(&ipvs->sync_mutex);
ret = stop_sync_thread(ipvs,
nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
- mutex_unlock(&ipvs->sync_mutex);
return ret;
}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index c244b2545e24..cf925906f59b 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -267,7 +267,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
return 1;
/* Linear packets are much easier to deal with. */
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return 0;
if (cp->app_data == (void *) IP_VS_FTP_PASV) {
@@ -433,7 +433,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
return 1;
/* Linear packets are much easier to deal with. */
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return 0;
data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh);
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index b58ddb7dffd1..a0921adc31a9 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -101,7 +101,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
#endif
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
+ if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph)))
return 0;
if (unlikely(cp->app != NULL)) {
@@ -148,7 +148,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
#endif
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
+ if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph)))
return 0;
if (unlikely(cp->app != NULL)) {
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 915ac8206076..000d961b97e4 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -159,7 +159,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
oldlen = skb->len - tcphoff;
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
return 0;
if (unlikely(cp->app != NULL)) {
@@ -237,7 +237,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
oldlen = skb->len - tcphoff;
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
return 0;
if (unlikely(cp->app != NULL)) {
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 379140075e95..153d89647c87 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -148,7 +148,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
oldlen = skb->len - udphoff;
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+ if (skb_ensure_writable(skb, udphoff + sizeof(*udph)))
return 0;
if (unlikely(cp->app != NULL)) {
@@ -231,7 +231,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
oldlen = skb->len - udphoff;
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+ if (skb_ensure_writable(skb, udphoff + sizeof(*udph)))
return 0;
if (unlikely(cp->app != NULL)) {
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 2526be6b3d90..a4a78c4b06de 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -195,6 +195,7 @@ union ip_vs_sync_conn {
#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
struct ip_vs_sync_thread_data {
+ struct task_struct *task;
struct netns_ipvs *ipvs;
struct socket *sock;
char *buf;
@@ -374,8 +375,11 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs,
max(IPVS_SYNC_SEND_DELAY, 1));
ms->sync_queue_len++;
list_add_tail(&sb->list, &ms->sync_queue);
- if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
- wake_up_process(ms->master_thread);
+ if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) {
+ int id = (int)(ms - ipvs->ms);
+
+ wake_up_process(ipvs->master_tinfo[id].task);
+ }
} else
ip_vs_sync_buff_release(sb);
spin_unlock(&ipvs->sync_lock);
@@ -1636,8 +1640,10 @@ static void master_wakeup_work_handler(struct work_struct *work)
spin_lock_bh(&ipvs->sync_lock);
if (ms->sync_queue_len &&
ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
+ int id = (int)(ms - ipvs->ms);
+
ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
- wake_up_process(ms->master_thread);
+ wake_up_process(ipvs->master_tinfo[id].task);
}
spin_unlock_bh(&ipvs->sync_lock);
}
@@ -1703,10 +1709,6 @@ done:
if (sb)
ip_vs_sync_buff_release(sb);
- /* release the sending multicast socket */
- sock_release(tinfo->sock);
- kfree(tinfo);
-
return 0;
}
@@ -1740,11 +1742,6 @@ static int sync_thread_backup(void *data)
}
}
- /* release the sending multicast socket */
- sock_release(tinfo->sock);
- kfree(tinfo->buf);
- kfree(tinfo);
-
return 0;
}
@@ -1752,8 +1749,8 @@ static int sync_thread_backup(void *data)
int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
int state)
{
- struct ip_vs_sync_thread_data *tinfo = NULL;
- struct task_struct **array = NULL, *task;
+ struct ip_vs_sync_thread_data *ti = NULL, *tinfo;
+ struct task_struct *task;
struct net_device *dev;
char *name;
int (*threadfn)(void *data);
@@ -1822,7 +1819,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
threadfn = sync_thread_master;
} else if (state == IP_VS_STATE_BACKUP) {
result = -EEXIST;
- if (ipvs->backup_threads)
+ if (ipvs->backup_tinfo)
goto out_early;
ipvs->bcfg = *c;
@@ -1849,28 +1846,22 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
master_wakeup_work_handler);
ms->ipvs = ipvs;
}
- } else {
- array = kcalloc(count, sizeof(struct task_struct *),
- GFP_KERNEL);
- result = -ENOMEM;
- if (!array)
- goto out;
}
+ result = -ENOMEM;
+ ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data),
+ GFP_KERNEL);
+ if (!ti)
+ goto out;
for (id = 0; id < count; id++) {
- result = -ENOMEM;
- tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
- if (!tinfo)
- goto out;
+ tinfo = &ti[id];
tinfo->ipvs = ipvs;
- tinfo->sock = NULL;
if (state == IP_VS_STATE_BACKUP) {
+ result = -ENOMEM;
tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
GFP_KERNEL);
if (!tinfo->buf)
goto out;
- } else {
- tinfo->buf = NULL;
}
tinfo->id = id;
if (state == IP_VS_STATE_MASTER)
@@ -1885,17 +1876,15 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
result = PTR_ERR(task);
goto out;
}
- tinfo = NULL;
- if (state == IP_VS_STATE_MASTER)
- ipvs->ms[id].master_thread = task;
- else
- array[id] = task;
+ tinfo->task = task;
}
/* mark as active */
- if (state == IP_VS_STATE_BACKUP)
- ipvs->backup_threads = array;
+ if (state == IP_VS_STATE_MASTER)
+ ipvs->master_tinfo = ti;
+ else
+ ipvs->backup_tinfo = ti;
spin_lock_bh(&ipvs->sync_buff_lock);
ipvs->sync_state |= state;
spin_unlock_bh(&ipvs->sync_buff_lock);
@@ -1910,29 +1899,31 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
out:
/* We do not need RTNL lock anymore, release it here so that
- * sock_release below and in the kthreads can use rtnl_lock
- * to leave the mcast group.
+ * sock_release below can use rtnl_lock to leave the mcast group.
*/
rtnl_unlock();
- count = id;
- while (count-- > 0) {
- if (state == IP_VS_STATE_MASTER)
- kthread_stop(ipvs->ms[count].master_thread);
- else
- kthread_stop(array[count]);
+ id = min(id, count - 1);
+ if (ti) {
+ for (tinfo = ti + id; tinfo >= ti; tinfo--) {
+ if (tinfo->task)
+ kthread_stop(tinfo->task);
+ }
}
if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
kfree(ipvs->ms);
ipvs->ms = NULL;
}
mutex_unlock(&ipvs->sync_mutex);
- if (tinfo) {
- if (tinfo->sock)
- sock_release(tinfo->sock);
- kfree(tinfo->buf);
- kfree(tinfo);
+
+ /* No more mutexes, release socks */
+ if (ti) {
+ for (tinfo = ti + id; tinfo >= ti; tinfo--) {
+ if (tinfo->sock)
+ sock_release(tinfo->sock);
+ kfree(tinfo->buf);
+ }
+ kfree(ti);
}
- kfree(array);
return result;
out_early:
@@ -1944,15 +1935,18 @@ out_early:
int stop_sync_thread(struct netns_ipvs *ipvs, int state)
{
- struct task_struct **array;
+ struct ip_vs_sync_thread_data *ti, *tinfo;
int id;
int retc = -EINVAL;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
+ mutex_lock(&ipvs->sync_mutex);
if (state == IP_VS_STATE_MASTER) {
+ retc = -ESRCH;
if (!ipvs->ms)
- return -ESRCH;
+ goto err;
+ ti = ipvs->master_tinfo;
/*
* The lock synchronizes with sb_queue_tail(), so that we don't
@@ -1971,38 +1965,56 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state)
struct ipvs_master_sync_state *ms = &ipvs->ms[id];
int ret;
+ tinfo = &ti[id];
pr_info("stopping master sync thread %d ...\n",
- task_pid_nr(ms->master_thread));
+ task_pid_nr(tinfo->task));
cancel_delayed_work_sync(&ms->master_wakeup_work);
- ret = kthread_stop(ms->master_thread);
+ ret = kthread_stop(tinfo->task);
if (retc >= 0)
retc = ret;
}
kfree(ipvs->ms);
ipvs->ms = NULL;
+ ipvs->master_tinfo = NULL;
} else if (state == IP_VS_STATE_BACKUP) {
- if (!ipvs->backup_threads)
- return -ESRCH;
+ retc = -ESRCH;
+ if (!ipvs->backup_tinfo)
+ goto err;
+ ti = ipvs->backup_tinfo;
ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
- array = ipvs->backup_threads;
retc = 0;
for (id = ipvs->threads_mask; id >= 0; id--) {
int ret;
+ tinfo = &ti[id];
pr_info("stopping backup sync thread %d ...\n",
- task_pid_nr(array[id]));
- ret = kthread_stop(array[id]);
+ task_pid_nr(tinfo->task));
+ ret = kthread_stop(tinfo->task);
if (retc >= 0)
retc = ret;
}
- kfree(array);
- ipvs->backup_threads = NULL;
+ ipvs->backup_tinfo = NULL;
+ } else {
+ goto err;
}
+ id = ipvs->threads_mask;
+ mutex_unlock(&ipvs->sync_mutex);
+
+ /* No more mutexes, release socks */
+ for (tinfo = ti + id; tinfo >= ti; tinfo--) {
+ if (tinfo->sock)
+ sock_release(tinfo->sock);
+ kfree(tinfo->buf);
+ }
+ kfree(ti);
/* decrease the module use count */
ip_vs_use_count_dec();
+ return retc;
+err:
+ mutex_unlock(&ipvs->sync_mutex);
return retc;
}
@@ -2021,7 +2033,6 @@ void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
{
int retc;
- mutex_lock(&ipvs->sync_mutex);
retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
if (retc && retc != -ESRCH)
pr_err("Failed to stop Master Daemon\n");
@@ -2029,5 +2040,4 @@ void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
if (retc && retc != -ESRCH)
pr_err("Failed to stop Backup Daemon\n");
- mutex_unlock(&ipvs->sync_mutex);
}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index e101eda05d55..9c464d24beec 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -29,6 +29,7 @@
#include <linux/tcp.h> /* for tcphdr */
#include <net/ip.h>
#include <net/gue.h>
+#include <net/gre.h>
#include <net/tcp.h> /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
@@ -36,6 +37,7 @@
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/ip_tunnels.h>
+#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
@@ -275,7 +277,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
}
/* don't propagate ttl change to cloned packets */
- if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
return false;
ipv6_hdr(skb)->hop_limit--;
@@ -290,7 +292,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
}
/* don't propagate ttl change to cloned packets */
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return false;
/* Decrease ttl */
@@ -381,8 +383,19 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
if (!dest)
goto err_put;
- if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
+ if ((dest->tun_flags &
+ IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL)
+ mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ __be16 tflags = 0;
+
+ if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ tflags |= TUNNEL_CSUM;
+ mtu -= gre_calc_hlen(tflags);
+ }
if (mtu < 68) {
IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
goto err_put;
@@ -536,8 +549,19 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
if (!dest)
goto err_put;
- if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
+ if ((dest->tun_flags &
+ IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL)
+ mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ __be16 tflags = 0;
+
+ if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ tflags |= TUNNEL_CSUM;
+ mtu -= gre_calc_hlen(tflags);
+ }
if (mtu < IPV6_MIN_MTU) {
IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
IPV6_MIN_MTU);
@@ -792,7 +816,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
}
/* copy-on-write the packet before mangling it */
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
@@ -881,7 +905,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
}
/* copy-on-write the packet before mangling it */
- if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
@@ -1002,17 +1026,56 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb,
__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
struct udphdr *udph; /* Our new UDP header */
struct guehdr *gueh; /* Our new GUE header */
+ size_t hdrlen, optlen = 0;
+ void *data;
+ bool need_priv = false;
+
+ if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ need_priv = true;
+ }
- skb_push(skb, sizeof(struct guehdr));
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ skb_push(skb, hdrlen);
gueh = (struct guehdr *)skb->data;
gueh->control = 0;
gueh->version = 0;
- gueh->hlen = 0;
+ gueh->hlen = optlen >> 2;
gueh->flags = 0;
gueh->proto_ctype = *next_protocol;
+ data = &gueh[1];
+
+ if (need_priv) {
+ __be32 *flags = data;
+ u16 csum_start = skb_checksum_start_offset(skb);
+ __be16 *pd;
+
+ gueh->flags |= GUE_FLAG_PRIV;
+ *flags = 0;
+ data += GUE_LEN_PRIV;
+
+ if (csum_start < hdrlen)
+ return -EINVAL;
+
+ csum_start -= hdrlen;
+ pd = data;
+ pd[0] = htons(csum_start);
+ pd[1] = htons(csum_start + skb->csum_offset);
+
+ if (!skb_is_gso(skb)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->encapsulation = 0;
+ }
+
+ *flags |= GUE_PFLAG_REMCSUM;
+ data += GUE_PLEN_REMCSUM;
+ }
+
skb_push(skb, sizeof(struct udphdr));
skb_reset_transport_header(skb);
@@ -1029,6 +1092,24 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb,
return 0;
}
+static void
+ipvs_gre_encap(struct net *net, struct sk_buff *skb,
+ struct ip_vs_conn *cp, __u8 *next_protocol)
+{
+ __be16 proto = *next_protocol == IPPROTO_IPIP ?
+ htons(ETH_P_IP) : htons(ETH_P_IPV6);
+ __be16 tflags = 0;
+ size_t hdrlen;
+
+ if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ tflags |= TUNNEL_CSUM;
+
+ hdrlen = gre_calc_hlen(tflags);
+ gre_build_header(skb, hdrlen, tflags, proto, 0, 0);
+
+ *next_protocol = IPPROTO_GRE;
+}
+
/*
* IP Tunneling transmitter
*
@@ -1066,6 +1147,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
int tun_type, gso_type;
+ int tun_flags;
EnterFunction(10);
@@ -1088,9 +1170,28 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
tun_type = cp->dest->tun_type;
+ tun_flags = cp->dest->tun_flags;
+
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ size_t gue_hdrlen, gue_optlen = 0;
+
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ }
+ gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
+
+ max_headroom += sizeof(struct udphdr) + gue_hdrlen;
+ } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ size_t gre_hdrlen;
+ __be16 tflags = 0;
- if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
- max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
+ if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ tflags |= TUNNEL_CSUM;
+ gre_hdrlen = gre_calc_hlen(tflags);
+
+ max_headroom += gre_hdrlen;
+ }
/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
@@ -1101,8 +1202,22 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error;
gso_type = __tun_gso_type_mask(AF_INET, cp->af);
- if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
- gso_type |= SKB_GSO_UDP_TUNNEL;
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+ (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+ gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ else
+ gso_type |= SKB_GSO_UDP_TUNNEL;
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ gso_type |= SKB_GSO_TUNNEL_REMCSUM;
+ }
+ } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ gso_type |= SKB_GSO_GRE_CSUM;
+ else
+ gso_type |= SKB_GSO_GRE;
+ }
if (iptunnel_handle_offloads(skb, gso_type))
goto tx_error;
@@ -1111,8 +1226,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_set_inner_ipproto(skb, next_protocol);
- if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
- ipvs_gue_encap(net, skb, cp, &next_protocol);
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ bool check = false;
+
+ if (ipvs_gue_encap(net, skb, cp, &next_protocol))
+ goto tx_error;
+
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+ (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+ check = true;
+
+ udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
+ } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
+ ipvs_gre_encap(net, skb, cp, &next_protocol);
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
@@ -1170,6 +1296,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
int tun_type, gso_type;
+ int tun_flags;
EnterFunction(10);
@@ -1193,9 +1320,28 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
tun_type = cp->dest->tun_type;
+ tun_flags = cp->dest->tun_flags;
+
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ size_t gue_hdrlen, gue_optlen = 0;
+
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ }
+ gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
+
+ max_headroom += sizeof(struct udphdr) + gue_hdrlen;
+ } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ size_t gre_hdrlen;
+ __be16 tflags = 0;
+
+ if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ tflags |= TUNNEL_CSUM;
+ gre_hdrlen = gre_calc_hlen(tflags);
- if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
- max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
+ max_headroom += gre_hdrlen;
+ }
skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
&next_protocol, &payload_len,
@@ -1204,8 +1350,22 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error;
gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
- if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
- gso_type |= SKB_GSO_UDP_TUNNEL;
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+ (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+ gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ else
+ gso_type |= SKB_GSO_UDP_TUNNEL;
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ gso_type |= SKB_GSO_TUNNEL_REMCSUM;
+ }
+ } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ gso_type |= SKB_GSO_GRE_CSUM;
+ else
+ gso_type |= SKB_GSO_GRE;
+ }
if (iptunnel_handle_offloads(skb, gso_type))
goto tx_error;
@@ -1214,8 +1374,19 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_set_inner_ipproto(skb, next_protocol);
- if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
- ipvs_gue_encap(net, skb, cp, &next_protocol);
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ bool check = false;
+
+ if (ipvs_gue_encap(net, skb, cp, &next_protocol))
+ goto tx_error;
+
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+ (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+ check = true;
+
+ udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
+ } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
+ ipvs_gre_encap(net, skb, cp, &next_protocol);
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
@@ -1400,7 +1571,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
}
/* copy-on-write the packet before mangling it */
- if (!skb_make_writable(skb, offset))
+ if (skb_ensure_writable(skb, offset))
goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
@@ -1489,7 +1660,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
}
/* copy-on-write the packet before mangling it */
- if (!skb_make_writable(skb, offset))
+ if (skb_ensure_writable(skb, offset))
goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index e52fcb9c9a96..921a7b95be68 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -37,12 +37,17 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
in_dev = __in_dev_get_rcu(rt->dst.dev);
if (in_dev != NULL) {
- for_primary_ifa(in_dev) {
+ const struct in_ifaddr *ifa;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY)
+ continue;
+
if (ifa->ifa_broadcast == iph->daddr) {
mask = ifa->ifa_mask;
break;
}
- } endfor_ifa(in_dev);
+ }
}
if (mask == 0)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f4f9b8344a32..bdfeacee0817 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -749,9 +749,6 @@ begin:
continue;
}
- if (nf_ct_is_dying(ct))
- continue;
-
if (nf_ct_key_equal(h, tuple, zone, net))
return h;
}
@@ -777,20 +774,24 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
struct nf_conn *ct;
rcu_read_lock();
-begin:
+
h = ____nf_conntrack_find(net, zone, tuple, hash);
if (h) {
+ /* We have a candidate that matches the tuple we're interested
+ * in, try to obtain a reference and re-check tuple
+ */
ct = nf_ct_tuplehash_to_ctrack(h);
- if (unlikely(nf_ct_is_dying(ct) ||
- !atomic_inc_not_zero(&ct->ct_general.use)))
- h = NULL;
- else {
- if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
- nf_ct_put(ct);
- goto begin;
- }
+ if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
+ if (likely(nf_ct_key_equal(h, tuple, zone, net)))
+ goto found;
+
+ /* TYPESAFE_BY_RCU recycled the candidate */
+ nf_ct_put(ct);
}
+
+ h = NULL;
}
+found:
rcu_read_unlock();
return h;
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index fac6986d37a8..6497e5fc0871 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -6,7 +6,7 @@
* Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* Based on the 'brute force' H.323 connection tracking module by
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Jozsef Kadlecsik <kadlec@netfilter.org>
*
* For more information, please see http://nath323.sourceforge.net/
*/
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 7db79c1b8084..1b77444d5b52 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1256,7 +1256,6 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
struct nf_conntrack_tuple tuple;
struct nf_conn *ct;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC;
struct nf_conntrack_zone zone;
int err;
@@ -1266,11 +1265,13 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
if (cda[CTA_TUPLE_ORIG])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
- u3, &zone);
+ nfmsg->nfgen_family, &zone);
else if (cda[CTA_TUPLE_REPLY])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
- u3, &zone);
+ nfmsg->nfgen_family, &zone);
else {
+ u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC;
+
return ctnetlink_flush_conntrack(net, cda,
NETLINK_CB(skb).portid,
nlmsg_report(nlh), u3);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 37bb530d848f..a0560d175a7f 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -16,6 +16,7 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
#include <net/netfilter/nf_log.h>
#include <linux/ip.h>
@@ -120,10 +121,8 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
};
EXPORT_SYMBOL_GPL(nf_ct_l4proto_find);
-static unsigned int nf_confirm(struct sk_buff *skb,
- unsigned int protoff,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
+unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
{
const struct nf_conn_help *help;
@@ -154,6 +153,7 @@ static unsigned int nf_confirm(struct sk_buff *skb,
/* We've seen it coming out the other side: confirm it */
return nf_conntrack_confirm(skb);
}
+EXPORT_SYMBOL_GPL(nf_confirm);
static unsigned int ipv4_confirm(void *priv,
struct sk_buff *skb,
@@ -442,12 +442,14 @@ static int nf_ct_tcp_fixup(struct nf_conn *ct, void *_nfproto)
return 0;
}
+static struct nf_ct_bridge_info *nf_ct_bridge_info;
+
static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
{
struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
- bool fixup_needed = false;
+ bool fixup_needed = false, retry = true;
int err = 0;
-
+retry:
mutex_lock(&nf_ct_proto_mutex);
switch (nfproto) {
@@ -487,6 +489,32 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
fixup_needed = true;
break;
#endif
+ case NFPROTO_BRIDGE:
+ if (!nf_ct_bridge_info) {
+ if (!retry) {
+ err = -EPROTO;
+ goto out_unlock;
+ }
+ mutex_unlock(&nf_ct_proto_mutex);
+ request_module("nf_conntrack_bridge");
+ retry = false;
+ goto retry;
+ }
+ if (!try_module_get(nf_ct_bridge_info->me)) {
+ err = -EPROTO;
+ goto out_unlock;
+ }
+ cnet->users_bridge++;
+ if (cnet->users_bridge > 1)
+ goto out_unlock;
+
+ err = nf_register_net_hooks(net, nf_ct_bridge_info->ops,
+ nf_ct_bridge_info->ops_size);
+ if (err)
+ cnet->users_bridge = 0;
+ else
+ fixup_needed = true;
+ break;
default:
err = -EPROTO;
break;
@@ -519,47 +547,99 @@ static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
ARRAY_SIZE(ipv6_conntrack_ops));
break;
#endif
+ case NFPROTO_BRIDGE:
+ if (!nf_ct_bridge_info)
+ break;
+ if (cnet->users_bridge && (--cnet->users_bridge == 0))
+ nf_unregister_net_hooks(net, nf_ct_bridge_info->ops,
+ nf_ct_bridge_info->ops_size);
+
+ module_put(nf_ct_bridge_info->me);
+ break;
}
-
mutex_unlock(&nf_ct_proto_mutex);
}
-int nf_ct_netns_get(struct net *net, u8 nfproto)
+static int nf_ct_netns_inet_get(struct net *net)
{
int err;
- if (nfproto == NFPROTO_INET) {
- err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
- if (err < 0)
- goto err1;
- err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
- if (err < 0)
- goto err2;
- } else {
- err = nf_ct_netns_do_get(net, nfproto);
- if (err < 0)
- goto err1;
- }
- return 0;
+ err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
+ if (err < 0)
+ goto err1;
+ err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
+ if (err < 0)
+ goto err2;
+ return err;
err2:
nf_ct_netns_put(net, NFPROTO_IPV4);
err1:
return err;
}
+
+int nf_ct_netns_get(struct net *net, u8 nfproto)
+{
+ int err;
+
+ switch (nfproto) {
+ case NFPROTO_INET:
+ err = nf_ct_netns_inet_get(net);
+ break;
+ case NFPROTO_BRIDGE:
+ err = nf_ct_netns_do_get(net, NFPROTO_BRIDGE);
+ if (err < 0)
+ return err;
+
+ err = nf_ct_netns_inet_get(net);
+ if (err < 0) {
+ nf_ct_netns_put(net, NFPROTO_BRIDGE);
+ return err;
+ }
+ break;
+ default:
+ err = nf_ct_netns_do_get(net, nfproto);
+ break;
+ }
+ return err;
+}
EXPORT_SYMBOL_GPL(nf_ct_netns_get);
void nf_ct_netns_put(struct net *net, uint8_t nfproto)
{
- if (nfproto == NFPROTO_INET) {
+ switch (nfproto) {
+ case NFPROTO_BRIDGE:
+ nf_ct_netns_do_put(net, NFPROTO_BRIDGE);
+ /* fall through */
+ case NFPROTO_INET:
nf_ct_netns_do_put(net, NFPROTO_IPV4);
nf_ct_netns_do_put(net, NFPROTO_IPV6);
- } else {
+ break;
+ default:
nf_ct_netns_do_put(net, nfproto);
+ break;
}
}
EXPORT_SYMBOL_GPL(nf_ct_netns_put);
+void nf_ct_bridge_register(struct nf_ct_bridge_info *info)
+{
+ WARN_ON(nf_ct_bridge_info);
+ mutex_lock(&nf_ct_proto_mutex);
+ nf_ct_bridge_info = info;
+ mutex_unlock(&nf_ct_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_ct_bridge_register);
+
+void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info)
+{
+ WARN_ON(!nf_ct_bridge_info);
+ mutex_lock(&nf_ct_proto_mutex);
+ nf_ct_bridge_info = NULL;
+ mutex_unlock(&nf_ct_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_ct_bridge_unregister);
+
int nf_conntrack_proto_init(void)
{
int ret;
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index a824367ed518..dd53e2b20f6b 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -218,7 +218,7 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
/* See ip_conntrack_proto_tcp.c */
if (state->net->ct.sysctl_checksum &&
state->hook == NF_INET_PRE_ROUTING &&
- nf_ip_checksum(skb, state->hook, dataoff, 0)) {
+ nf_ip_checksum(skb, state->hook, dataoff, IPPROTO_ICMP)) {
icmp_error_log(skb, state, "bad hw icmp checksum");
return -NF_ACCEPT;
}
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 522c08c23600..fce3d93f1541 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -336,7 +336,7 @@ static bool sctp_error(struct sk_buff *skb,
if (state->hook == NF_INET_PRE_ROUTING &&
state->net->ct.sysctl_checksum &&
skb->ip_summed == CHECKSUM_NONE) {
- if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) {
+ if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) {
logmsg = "nf_ct_sctp: failed to read header ";
goto out_invalid;
}
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 1e2cc83ff5da..d5fdfa00d683 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*/
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index dc21a43cd145..3066449f8bd8 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -126,7 +126,7 @@ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
optoff = protoff + sizeof(struct tcphdr);
optend = protoff + tcph->doff * 4;
- if (!skb_make_writable(skb, optend))
+ if (skb_ensure_writable(skb, optend))
return 0;
tcph = (void *)skb->data + protoff;
@@ -176,7 +176,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
this_way = &seqadj->seq[dir];
other_way = &seqadj->seq[!dir];
- if (!skb_make_writable(skb, protoff + sizeof(*tcph)))
+ if (skb_ensure_writable(skb, protoff + sizeof(*tcph)))
return 0;
tcph = (void *)skb->data + protoff;
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 948b4ebbe3fb..e3d797252a98 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -53,7 +53,6 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
ft->dst_port = ctt->dst.u.tcp.port;
ft->iifidx = other_dst->dev->ifindex;
- ft->oifidx = dst->dev->ifindex;
ft->dst_cache = dst;
}
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 3574a212bdc2..bb25d4c794c7 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -374,7 +374,7 @@ static int seq_show(struct seq_file *s, void *v)
continue;
logger = nft_log_dereference(loggers[*pos][i]);
- seq_printf(s, "%s", logger->name);
+ seq_puts(s, logger->name);
if (i == 0 && loggers[*pos][i + 1] != NULL)
seq_puts(s, ",");
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 98bf543e9891..a263505455fc 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -95,7 +95,7 @@ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
struct tcphdr *tcph;
int oldlen, datalen;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return false;
if (rep_len > match_len &&
@@ -145,7 +145,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
struct udphdr *udph;
int datalen, oldlen;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return false;
if (rep_len > match_len &&
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index 07da07788f6b..7ac733ebd060 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -70,7 +70,7 @@ static bool udp_manip_pkt(struct sk_buff *skb,
struct udphdr *hdr;
bool do_csum;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct udphdr *)(skb->data + hdroff);
@@ -88,7 +88,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb,
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
struct udphdr *hdr;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct udphdr *)(skb->data + hdroff);
@@ -114,7 +114,7 @@ sctp_manip_pkt(struct sk_buff *skb,
if (skb->len >= hdroff + sizeof(*hdr))
hdrsize = sizeof(*hdr);
- if (!skb_make_writable(skb, hdroff + hdrsize))
+ if (skb_ensure_writable(skb, hdroff + hdrsize))
return false;
hdr = (struct sctphdr *)(skb->data + hdroff);
@@ -155,7 +155,7 @@ tcp_manip_pkt(struct sk_buff *skb,
if (skb->len >= hdroff + sizeof(struct tcphdr))
hdrsize = sizeof(struct tcphdr);
- if (!skb_make_writable(skb, hdroff + hdrsize))
+ if (skb_ensure_writable(skb, hdroff + hdrsize))
return false;
hdr = (struct tcphdr *)(skb->data + hdroff);
@@ -195,7 +195,7 @@ dccp_manip_pkt(struct sk_buff *skb,
if (skb->len >= hdroff + sizeof(struct dccp_hdr))
hdrsize = sizeof(struct dccp_hdr);
- if (!skb_make_writable(skb, hdroff + hdrsize))
+ if (skb_ensure_writable(skb, hdroff + hdrsize))
return false;
hdr = (struct dccp_hdr *)(skb->data + hdroff);
@@ -229,7 +229,7 @@ icmp_manip_pkt(struct sk_buff *skb,
{
struct icmphdr *hdr;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct icmphdr *)(skb->data + hdroff);
@@ -247,7 +247,7 @@ icmpv6_manip_pkt(struct sk_buff *skb,
{
struct icmp6hdr *hdr;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct icmp6hdr *)(skb->data + hdroff);
@@ -275,7 +275,7 @@ gre_manip_pkt(struct sk_buff *skb,
/* pgreh includes two optional 32bit fields which are not required
* to be there. That's where the magic '8' comes from */
- if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8))
return false;
greh = (void *)skb->data + hdroff;
@@ -347,7 +347,7 @@ static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
struct iphdr *iph;
unsigned int hdroff;
- if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
+ if (skb_ensure_writable(skb, iphdroff + sizeof(*iph)))
return false;
iph = (void *)skb->data + iphdroff;
@@ -378,7 +378,7 @@ static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb,
int hdroff;
u8 nexthdr;
- if (!skb_make_writable(skb, iphdroff + sizeof(*ipv6h)))
+ if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h)))
return false;
ipv6h = (void *)skb->data + iphdroff;
@@ -562,9 +562,9 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
- if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
+ if (skb_ensure_writable(skb, hdrlen + sizeof(*inside)))
return 0;
- if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
+ if (nf_ip_checksum(skb, hooknum, hdrlen, IPPROTO_ICMP))
return 0;
inside = (void *)skb->data + hdrlen;
@@ -784,7 +784,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
- if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
+ if (skb_ensure_writable(skb, hdrlen + sizeof(*inside)))
return 0;
if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6))
return 0;
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 4ffe5e5e65ba..f91579c821e9 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -44,15 +44,17 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
if (hooknum == NF_INET_LOCAL_OUT) {
newdst = htonl(0x7F000001);
} else {
- struct in_device *indev;
- struct in_ifaddr *ifa;
+ const struct in_device *indev;
newdst = 0;
indev = __in_dev_get_rcu(skb->dev);
- if (indev && indev->ifa_list) {
- ifa = indev->ifa_list;
- newdst = ifa->ifa_local;
+ if (indev) {
+ const struct in_ifaddr *ifa;
+
+ ifa = rcu_dereference(indev->ifa_list);
+ if (ifa)
+ newdst = ifa->ifa_local;
}
if (!newdst)
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index 7de28fa0f14a..e338d91980d8 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -282,7 +282,7 @@ next:
if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) {
struct udphdr *uh;
- if (!skb_make_writable(skb, skb->len)) {
+ if (skb_ensure_writable(skb, skb->len)) {
nf_ct_helper_log(skb, ct, "cannot mangle packet");
return NF_DROP;
}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index b5b2be55ca82..a2b58de82600 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -156,7 +156,6 @@ static void nf_ip6_saveroute(const struct sk_buff *skb,
}
static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
- const struct nf_hook_entries *entries,
unsigned int index, unsigned int queuenum)
{
int status = -ENOENT;
@@ -190,6 +189,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
goto err;
}
+ if (!skb_dst_force(skb) && state->hook != NF_INET_PRE_ROUTING) {
+ status = -ENETDOWN;
+ goto err;
+ }
+
*entry = (struct nf_queue_entry) {
.skb = skb,
.state = *state,
@@ -198,7 +202,6 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
};
nf_queue_entry_get_refs(entry);
- skb_dst_force(skb);
switch (entry->state.pf) {
case AF_INET:
@@ -225,12 +228,11 @@ err:
/* Packets leaving via this function must come back through nf_reinject(). */
int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
- const struct nf_hook_entries *entries, unsigned int index,
- unsigned int verdict)
+ unsigned int index, unsigned int verdict)
{
int ret;
- ret = __nf_queue(skb, state, entries, index, verdict >> NF_VERDICT_QBITS);
+ ret = __nf_queue(skb, state, index, verdict >> NF_VERDICT_QBITS);
if (ret < 0) {
if (ret == -ESRCH &&
(verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
@@ -336,7 +338,7 @@ next_hook:
local_bh_enable();
break;
case NF_QUEUE:
- err = nf_queue(skb, &entry->state, hooks, i, verdict);
+ err = nf_queue(skb, &entry->state, i, verdict);
if (err == 1)
goto next_hook;
break;
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 8ce74ed985c0..b101f187eda8 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -10,16 +10,16 @@
#include <net/netns/generic.h>
#include <linux/proc_fs.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter/xt_tcpudp.h>
-#include <linux/netfilter/xt_SYNPROXY.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nf_synproxy.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_synproxy.h>
unsigned int synproxy_net_id;
EXPORT_SYMBOL_GPL(synproxy_net_id);
@@ -57,7 +57,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
case TCPOPT_MSS:
if (opsize == TCPOLEN_MSS) {
opts->mss = get_unaligned_be16(ptr);
- opts->options |= XT_SYNPROXY_OPT_MSS;
+ opts->options |= NF_SYNPROXY_OPT_MSS;
}
break;
case TCPOPT_WINDOW:
@@ -65,19 +65,19 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
opts->wscale = *ptr;
if (opts->wscale > TCP_MAX_WSCALE)
opts->wscale = TCP_MAX_WSCALE;
- opts->options |= XT_SYNPROXY_OPT_WSCALE;
+ opts->options |= NF_SYNPROXY_OPT_WSCALE;
}
break;
case TCPOPT_TIMESTAMP:
if (opsize == TCPOLEN_TIMESTAMP) {
opts->tsval = get_unaligned_be32(ptr);
opts->tsecr = get_unaligned_be32(ptr + 4);
- opts->options |= XT_SYNPROXY_OPT_TIMESTAMP;
+ opts->options |= NF_SYNPROXY_OPT_TIMESTAMP;
}
break;
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM)
- opts->options |= XT_SYNPROXY_OPT_SACK_PERM;
+ opts->options |= NF_SYNPROXY_OPT_SACK_PERM;
break;
}
@@ -89,36 +89,36 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
}
EXPORT_SYMBOL_GPL(synproxy_parse_options);
-unsigned int synproxy_options_size(const struct synproxy_options *opts)
+static unsigned int
+synproxy_options_size(const struct synproxy_options *opts)
{
unsigned int size = 0;
- if (opts->options & XT_SYNPROXY_OPT_MSS)
+ if (opts->options & NF_SYNPROXY_OPT_MSS)
size += TCPOLEN_MSS_ALIGNED;
- if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+ if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
size += TCPOLEN_TSTAMP_ALIGNED;
- else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+ else if (opts->options & NF_SYNPROXY_OPT_SACK_PERM)
size += TCPOLEN_SACKPERM_ALIGNED;
- if (opts->options & XT_SYNPROXY_OPT_WSCALE)
+ if (opts->options & NF_SYNPROXY_OPT_WSCALE)
size += TCPOLEN_WSCALE_ALIGNED;
return size;
}
-EXPORT_SYMBOL_GPL(synproxy_options_size);
-void
+static void
synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
{
__be32 *ptr = (__be32 *)(th + 1);
u8 options = opts->options;
- if (options & XT_SYNPROXY_OPT_MSS)
+ if (options & NF_SYNPROXY_OPT_MSS)
*ptr++ = htonl((TCPOPT_MSS << 24) |
(TCPOLEN_MSS << 16) |
opts->mss);
- if (options & XT_SYNPROXY_OPT_TIMESTAMP) {
- if (options & XT_SYNPROXY_OPT_SACK_PERM)
+ if (options & NF_SYNPROXY_OPT_TIMESTAMP) {
+ if (options & NF_SYNPROXY_OPT_SACK_PERM)
*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
(TCPOLEN_SACK_PERM << 16) |
(TCPOPT_TIMESTAMP << 8) |
@@ -131,58 +131,56 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
*ptr++ = htonl(opts->tsval);
*ptr++ = htonl(opts->tsecr);
- } else if (options & XT_SYNPROXY_OPT_SACK_PERM)
+ } else if (options & NF_SYNPROXY_OPT_SACK_PERM)
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
- if (options & XT_SYNPROXY_OPT_WSCALE)
+ if (options & NF_SYNPROXY_OPT_WSCALE)
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
opts->wscale);
}
-EXPORT_SYMBOL_GPL(synproxy_build_options);
-void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
+void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info,
struct synproxy_options *opts)
{
opts->tsecr = opts->tsval;
opts->tsval = tcp_time_stamp_raw() & ~0x3f;
- if (opts->options & XT_SYNPROXY_OPT_WSCALE) {
+ if (opts->options & NF_SYNPROXY_OPT_WSCALE) {
opts->tsval |= opts->wscale;
opts->wscale = info->wscale;
} else
opts->tsval |= 0xf;
- if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+ if (opts->options & NF_SYNPROXY_OPT_SACK_PERM)
opts->tsval |= 1 << 4;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
opts->tsval |= 1 << 5;
}
EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie);
-void synproxy_check_timestamp_cookie(struct synproxy_options *opts)
+static void
+synproxy_check_timestamp_cookie(struct synproxy_options *opts)
{
opts->wscale = opts->tsecr & 0xf;
if (opts->wscale != 0xf)
- opts->options |= XT_SYNPROXY_OPT_WSCALE;
+ opts->options |= NF_SYNPROXY_OPT_WSCALE;
- opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0;
+ opts->options |= opts->tsecr & (1 << 4) ? NF_SYNPROXY_OPT_SACK_PERM : 0;
- opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0;
+ opts->options |= opts->tsecr & (1 << 5) ? NF_SYNPROXY_OPT_ECN : 0;
}
-EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie);
-unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
- unsigned int protoff,
- struct tcphdr *th,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- const struct nf_conn_synproxy *synproxy)
+static unsigned int
+synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
+ struct tcphdr *th, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_conn_synproxy *synproxy)
{
unsigned int optoff, optend;
__be32 *ptr, old;
@@ -193,7 +191,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
optoff = protoff + sizeof(struct tcphdr);
optend = protoff + th->doff * 4;
- if (!skb_make_writable(skb, optend))
+ if (skb_ensure_writable(skb, optend))
return 0;
while (optoff < optend) {
@@ -232,7 +230,6 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
}
return 1;
}
-EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust);
static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
.len = sizeof(struct nf_conn_synproxy),
@@ -413,5 +410,830 @@ static void __exit synproxy_core_exit(void)
module_init(synproxy_core_init);
module_exit(synproxy_core_exit);
+static struct iphdr *
+synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
+ __be32 daddr)
+{
+ struct iphdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = skb_put(skb, sizeof(*iph));
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) / 4;
+ iph->tos = 0;
+ iph->id = 0;
+ iph->frag_off = htons(IP_DF);
+ iph->ttl = net->ipv4.sysctl_ip_default_ttl;
+ iph->protocol = IPPROTO_TCP;
+ iph->check = 0;
+ iph->saddr = saddr;
+ iph->daddr = daddr;
+
+ return iph;
+}
+
+static void
+synproxy_send_tcp(struct net *net,
+ const struct sk_buff *skb, struct sk_buff *nskb,
+ struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+ struct iphdr *niph, struct tcphdr *nth,
+ unsigned int tcp_hdr_size)
+{
+ nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)nth - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+
+ skb_dst_set_noref(nskb, skb_dst(skb));
+ nskb->protocol = htons(ETH_P_IP);
+ if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
+ goto free_nskb;
+
+ if (nfct) {
+ nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
+ nf_conntrack_get(nfct);
+ }
+
+ ip_local_out(net, nskb->sk, nskb);
+ return;
+
+free_nskb:
+ kfree_skb(nskb);
+}
+
+void
+synproxy_send_client_synack(struct net *net,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+ u16 mss = opts->mss;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = 0;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
+}
+EXPORT_SYMBOL_GPL(synproxy_send_client_synack);
+
+static void
+synproxy_send_server_syn(struct net *net,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts, u32 recv_seq)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(recv_seq - 1);
+ /* ack_seq is used to relay our ISN to the synproxy hook to initialize
+ * sequence number translation once a connection tracking entry exists.
+ */
+ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN;
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = th->window;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack(struct net *net,
+ const struct ip_ct_tcp *state,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(ntohl(th->ack_seq));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack(struct net *net,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(ntohl(th->seq) + 1);
+ nth->ack_seq = th->ack_seq;
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(ntohs(th->window) >> opts->wscale);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
+}
+
+bool
+synproxy_recv_client_ack(struct net *net,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ struct synproxy_options *opts, u32 recv_seq)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ int mss;
+
+ mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
+ if (mss == 0) {
+ this_cpu_inc(snet->stats->cookie_invalid);
+ return false;
+ }
+
+ this_cpu_inc(snet->stats->cookie_valid);
+ opts->mss = mss;
+ opts->options |= NF_SYNPROXY_OPT_MSS;
+
+ if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_check_timestamp_cookie(opts);
+
+ synproxy_send_server_syn(net, skb, th, opts, recv_seq);
+ return true;
+}
+EXPORT_SYMBOL_GPL(synproxy_recv_client_ack);
+
+unsigned int
+ipv4_synproxy_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *nhs)
+{
+ struct net *net = nhs->net;
+ struct synproxy_net *snet = synproxy_pernet(net);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct nf_conn_synproxy *synproxy;
+ struct synproxy_options opts = {};
+ const struct ip_ct_tcp *state;
+ struct tcphdr *th, _th;
+ unsigned int thoff;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return NF_ACCEPT;
+
+ synproxy = nfct_synproxy(ct);
+ if (!synproxy)
+ return NF_ACCEPT;
+
+ if (nf_is_loopback_packet(skb) ||
+ ip_hdr(skb)->protocol != IPPROTO_TCP)
+ return NF_ACCEPT;
+
+ thoff = ip_hdrlen(skb);
+ th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+ if (!th)
+ return NF_DROP;
+
+ state = &ct->proto.tcp;
+ switch (state->state) {
+ case TCP_CONNTRACK_CLOSE:
+ if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+ ntohl(th->seq) + 1);
+ break;
+ }
+
+ if (!th->syn || th->ack ||
+ CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ break;
+
+ /* Reopened connection - reset the sequence number and timestamp
+ * adjustments, they will get initialized once the connection is
+ * reestablished.
+ */
+ nf_ct_seqadj_init(ct, ctinfo, 0);
+ synproxy->tsoff = 0;
+ this_cpu_inc(snet->stats->conn_reopened);
+
+ /* fall through */
+ case TCP_CONNTRACK_SYN_SENT:
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (!th->syn && th->ack &&
+ CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+ * therefore we need to add 1 to make the SYN sequence
+ * number match the one of first SYN.
+ */
+ if (synproxy_recv_client_ack(net, skb, th, &opts,
+ ntohl(th->seq) + 1)) {
+ this_cpu_inc(snet->stats->cookie_retrans);
+ consume_skb(skb);
+ return NF_STOLEN;
+ } else {
+ return NF_DROP;
+ }
+ }
+
+ synproxy->isn = ntohl(th->ack_seq);
+ if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->its = opts.tsecr;
+
+ nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+ break;
+ case TCP_CONNTRACK_SYN_RECV:
+ if (!th->syn || !th->ack)
+ break;
+
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) {
+ synproxy->tsoff = opts.tsval - synproxy->its;
+ nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+ }
+
+ opts.options &= ~(NF_SYNPROXY_OPT_MSS |
+ NF_SYNPROXY_OPT_WSCALE |
+ NF_SYNPROXY_OPT_SACK_PERM);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_server_ack(net, state, skb, th, &opts);
+
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+ nf_conntrack_event_cache(IPCT_SEQADJ, ct);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_client_ack(net, skb, th, &opts);
+
+ consume_skb(skb);
+ return NF_STOLEN;
+ default:
+ break;
+ }
+
+ synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(ipv4_synproxy_hook);
+
+static const struct nf_hook_ops ipv4_synproxy_ops[] = {
+ {
+ .hook = ipv4_synproxy_hook,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+ {
+ .hook = ipv4_synproxy_hook,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+};
+
+int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net)
+{
+ int err;
+
+ if (snet->hook_ref4 == 0) {
+ err = nf_register_net_hooks(net, ipv4_synproxy_ops,
+ ARRAY_SIZE(ipv4_synproxy_ops));
+ if (err)
+ return err;
+ }
+
+ snet->hook_ref4++;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init);
+
+void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net)
+{
+ snet->hook_ref4--;
+ if (snet->hook_ref4 == 0)
+ nf_unregister_net_hooks(net, ipv4_synproxy_ops,
+ ARRAY_SIZE(ipv4_synproxy_ops));
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini);
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct ipv6hdr *
+synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
+{
+ struct ipv6hdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = skb_put(skb, sizeof(*iph));
+ ip6_flow_hdr(iph, 0, 0);
+ iph->hop_limit = net->ipv6.devconf_all->hop_limit;
+ iph->nexthdr = IPPROTO_TCP;
+ iph->saddr = *saddr;
+ iph->daddr = *daddr;
+
+ return iph;
+}
+
+static void
+synproxy_send_tcp_ipv6(struct net *net,
+ const struct sk_buff *skb, struct sk_buff *nskb,
+ struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+ struct ipv6hdr *niph, struct tcphdr *nth,
+ unsigned int tcp_hdr_size)
+{
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+ int err;
+
+ nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)nth - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ fl6.saddr = niph->saddr;
+ fl6.daddr = niph->daddr;
+ fl6.fl6_sport = nth->source;
+ fl6.fl6_dport = nth->dest;
+ security_skb_classify_flow((struct sk_buff *)skb,
+ flowi6_to_flowi(&fl6));
+ err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
+ if (err) {
+ goto free_nskb;
+ }
+
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst))
+ goto free_nskb;
+
+ skb_dst_set(nskb, dst);
+
+ if (nfct) {
+ nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
+ nf_conntrack_get(nfct);
+ }
+
+ ip6_local_out(net, nskb->sk, nskb);
+ return;
+
+free_nskb:
+ kfree_skb(nskb);
+}
+
+void
+synproxy_send_client_synack_ipv6(struct net *net,
+ const struct sk_buff *skb,
+ const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+ u16 mss = opts->mss;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(nf_ipv6_cookie_init_sequence(iph, th, &mss));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = 0;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth,
+ tcp_hdr_size);
+}
+EXPORT_SYMBOL_GPL(synproxy_send_client_synack_ipv6);
+
+static void
+synproxy_send_server_syn_ipv6(struct net *net, const struct sk_buff *skb,
+ const struct tcphdr *th,
+ const struct synproxy_options *opts, u32 recv_seq)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(recv_seq - 1);
+ /* ack_seq is used to relay our ISN to the synproxy hook to initialize
+ * sequence number translation once a connection tracking entry exists.
+ */
+ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN;
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = th->window;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp_ipv6(net, skb, nskb, &snet->tmpl->ct_general,
+ IP_CT_NEW, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack_ipv6(struct net *net, const struct ip_ct_tcp *state,
+ const struct sk_buff *skb,
+ const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(ntohl(th->ack_seq));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp_ipv6(net, skb, nskb, NULL, 0, niph, nth,
+ tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack_ipv6(struct net *net, const struct sk_buff *skb,
+ const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(ntohl(th->seq) + 1);
+ nth->ack_seq = th->ack_seq;
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(ntohs(th->window) >> opts->wscale);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth,
+ tcp_hdr_size);
+}
+
+bool
+synproxy_recv_client_ack_ipv6(struct net *net,
+ const struct sk_buff *skb,
+ const struct tcphdr *th,
+ struct synproxy_options *opts, u32 recv_seq)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ int mss;
+
+ mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
+ if (mss == 0) {
+ this_cpu_inc(snet->stats->cookie_invalid);
+ return false;
+ }
+
+ this_cpu_inc(snet->stats->cookie_valid);
+ opts->mss = mss;
+ opts->options |= NF_SYNPROXY_OPT_MSS;
+
+ if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_check_timestamp_cookie(opts);
+
+ synproxy_send_server_syn_ipv6(net, skb, th, opts, recv_seq);
+ return true;
+}
+EXPORT_SYMBOL_GPL(synproxy_recv_client_ack_ipv6);
+
+unsigned int
+ipv6_synproxy_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *nhs)
+{
+ struct net *net = nhs->net;
+ struct synproxy_net *snet = synproxy_pernet(net);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct nf_conn_synproxy *synproxy;
+ struct synproxy_options opts = {};
+ const struct ip_ct_tcp *state;
+ struct tcphdr *th, _th;
+ __be16 frag_off;
+ u8 nexthdr;
+ int thoff;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return NF_ACCEPT;
+
+ synproxy = nfct_synproxy(ct);
+ if (!synproxy)
+ return NF_ACCEPT;
+
+ if (nf_is_loopback_packet(skb))
+ return NF_ACCEPT;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (thoff < 0 || nexthdr != IPPROTO_TCP)
+ return NF_ACCEPT;
+
+ th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+ if (!th)
+ return NF_DROP;
+
+ state = &ct->proto.tcp;
+ switch (state->state) {
+ case TCP_CONNTRACK_CLOSE:
+ if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+ ntohl(th->seq) + 1);
+ break;
+ }
+
+ if (!th->syn || th->ack ||
+ CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ break;
+
+ /* Reopened connection - reset the sequence number and timestamp
+ * adjustments, they will get initialized once the connection is
+ * reestablished.
+ */
+ nf_ct_seqadj_init(ct, ctinfo, 0);
+ synproxy->tsoff = 0;
+ this_cpu_inc(snet->stats->conn_reopened);
+
+ /* fall through */
+ case TCP_CONNTRACK_SYN_SENT:
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (!th->syn && th->ack &&
+ CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+ * therefore we need to add 1 to make the SYN sequence
+ * number match the one of first SYN.
+ */
+ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts,
+ ntohl(th->seq) + 1)) {
+ this_cpu_inc(snet->stats->cookie_retrans);
+ consume_skb(skb);
+ return NF_STOLEN;
+ } else {
+ return NF_DROP;
+ }
+ }
+
+ synproxy->isn = ntohl(th->ack_seq);
+ if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->its = opts.tsecr;
+
+ nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+ break;
+ case TCP_CONNTRACK_SYN_RECV:
+ if (!th->syn || !th->ack)
+ break;
+
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) {
+ synproxy->tsoff = opts.tsval - synproxy->its;
+ nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+ }
+
+ opts.options &= ~(NF_SYNPROXY_OPT_MSS |
+ NF_SYNPROXY_OPT_WSCALE |
+ NF_SYNPROXY_OPT_SACK_PERM);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_server_ack_ipv6(net, state, skb, th, &opts);
+
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+ nf_conntrack_event_cache(IPCT_SEQADJ, ct);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_client_ack_ipv6(net, skb, th, &opts);
+
+ consume_skb(skb);
+ return NF_STOLEN;
+ default:
+ break;
+ }
+
+ synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(ipv6_synproxy_hook);
+
+static const struct nf_hook_ops ipv6_synproxy_ops[] = {
+ {
+ .hook = ipv6_synproxy_hook,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+ {
+ .hook = ipv6_synproxy_hook,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+};
+
+int
+nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net)
+{
+ int err;
+
+ if (snet->hook_ref6 == 0) {
+ err = nf_register_net_hooks(net, ipv6_synproxy_ops,
+ ARRAY_SIZE(ipv6_synproxy_ops));
+ if (err)
+ return err;
+ }
+
+ snet->hook_ref6++;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init);
+
+void
+nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net)
+{
+ snet->hook_ref6--;
+ if (snet->hook_ref6 == 0)
+ nf_unregister_net_hooks(net, ipv6_synproxy_ops,
+ ARRAY_SIZE(ipv6_synproxy_ops));
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini);
+#endif /* CONFIG_IPV6 */
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index bcf17fb46d96..ed17a7c29b86 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -18,6 +18,7 @@
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -97,6 +98,7 @@ static void nft_ctx_init(struct nft_ctx *ctx,
ctx->nla = nla;
ctx->portid = NETLINK_CB(skb).portid;
ctx->report = nlmsg_report(nlh);
+ ctx->flags = nlh->nlmsg_flags;
ctx->seq = nlh->nlmsg_seq;
}
@@ -1169,6 +1171,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
[NFTA_CHAIN_POLICY] = { .type = NLA_U32 },
[NFTA_CHAIN_TYPE] = { .type = NLA_STRING },
[NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED },
+ [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 },
};
static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
@@ -1446,25 +1449,18 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
return newstats;
}
-static void nft_chain_stats_replace(struct net *net,
- struct nft_base_chain *chain,
- struct nft_stats __percpu *newstats)
+static void nft_chain_stats_replace(struct nft_trans *trans)
{
- struct nft_stats __percpu *oldstats;
+ struct nft_base_chain *chain = nft_base_chain(trans->ctx.chain);
- if (newstats == NULL)
+ if (!nft_trans_chain_stats(trans))
return;
- if (rcu_access_pointer(chain->stats)) {
- oldstats = rcu_dereference_protected(chain->stats,
- lockdep_commit_lock_is_held(net));
- rcu_assign_pointer(chain->stats, newstats);
- synchronize_rcu();
- free_percpu(oldstats);
- } else {
- rcu_assign_pointer(chain->stats, newstats);
+ rcu_swap_protected(chain->stats, nft_trans_chain_stats(trans),
+ lockdep_commit_lock_is_held(trans->ctx.net));
+
+ if (!nft_trans_chain_stats(trans))
static_branch_inc(&nft_counters_enabled);
- }
}
static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
@@ -1610,7 +1606,7 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha
}
static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
- u8 policy)
+ u8 policy, u32 flags)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
@@ -1664,8 +1660,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
ops->hook = hook.type->hooks[ops->hooknum];
ops->dev = hook.dev;
- chain->flags |= NFT_BASE_CHAIN;
+ chain->flags |= NFT_BASE_CHAIN | flags;
basechain->policy = NF_ACCEPT;
+ INIT_LIST_HEAD(&basechain->cb_list);
} else {
chain = kzalloc(sizeof(*chain), GFP_KERNEL);
if (chain == NULL)
@@ -1725,7 +1722,8 @@ err1:
return err;
}
-static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy)
+static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
+ u32 flags)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
@@ -1737,6 +1735,9 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy)
struct nft_trans *trans;
int err;
+ if (chain->flags ^ flags)
+ return -EOPNOTSUPP;
+
if (nla[NFTA_CHAIN_HOOK]) {
if (!nft_is_base_chain(chain))
return -EBUSY;
@@ -1842,6 +1843,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
u8 policy = NF_ACCEPT;
struct nft_ctx ctx;
u64 handle = 0;
+ u32 flags = 0;
lockdep_assert_held(&net->nft.commit_mutex);
@@ -1896,6 +1898,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
}
}
+ if (nla[NFTA_CHAIN_FLAGS])
+ flags = ntohl(nla_get_be32(nla[NFTA_CHAIN_FLAGS]));
+
nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
if (chain != NULL) {
@@ -1906,10 +1911,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- return nf_tables_updchain(&ctx, genmask, policy);
+ return nf_tables_updchain(&ctx, genmask, policy, flags);
}
- return nf_tables_addchain(&ctx, family, genmask, policy);
+ return nf_tables_addchain(&ctx, family, genmask, policy, flags);
}
static int nf_tables_delchain(struct net *net, struct sock *nlsk,
@@ -2016,15 +2021,31 @@ EXPORT_SYMBOL_GPL(nft_unregister_expr);
static const struct nft_expr_type *__nft_expr_type_get(u8 family,
struct nlattr *nla)
{
- const struct nft_expr_type *type;
+ const struct nft_expr_type *type, *candidate = NULL;
list_for_each_entry(type, &nf_tables_expressions, list) {
- if (!nla_strcmp(nla, type->name) &&
- (!type->family || type->family == family))
- return type;
+ if (!nla_strcmp(nla, type->name)) {
+ if (!type->family && !candidate)
+ candidate = type;
+ else if (type->family == family)
+ candidate = type;
+ }
}
- return NULL;
+ return candidate;
+}
+
+#ifdef CONFIG_MODULES
+static int nft_expr_type_request_module(struct net *net, u8 family,
+ struct nlattr *nla)
+{
+ nft_request_module(net, "nft-expr-%u-%.*s", family,
+ nla_len(nla), (char *)nla_data(nla));
+ if (__nft_expr_type_get(family, nla))
+ return -EAGAIN;
+
+ return 0;
}
+#endif
static const struct nft_expr_type *nft_expr_type_get(struct net *net,
u8 family,
@@ -2042,9 +2063,7 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net,
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (type == NULL) {
- nft_request_module(net, "nft-expr-%u-%.*s", family,
- nla_len(nla), (char *)nla_data(nla));
- if (__nft_expr_type_get(family, nla))
+ if (nft_expr_type_request_module(net, family, nla) == -EAGAIN)
return ERR_PTR(-EAGAIN);
nft_request_module(net, "nft-expr-%.*s",
@@ -2137,6 +2156,12 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
(const struct nlattr * const *)info->tb);
if (IS_ERR(ops)) {
err = PTR_ERR(ops);
+#ifdef CONFIG_MODULES
+ if (err == -EAGAIN)
+ nft_expr_type_request_module(ctx->net,
+ ctx->family,
+ tb[NFTA_EXPR_NAME]);
+#endif
goto err1;
}
} else
@@ -2645,6 +2670,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
u8 genmask = nft_genmask_next(net);
struct nft_expr_info *info = NULL;
int family = nfmsg->nfgen_family;
+ struct nft_flow_rule *flow;
struct nft_table *table;
struct nft_chain *chain;
struct nft_rule *rule, *old_rule = NULL;
@@ -2791,7 +2817,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
list_add_tail_rcu(&rule->list, &old_rule->list);
} else {
- if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
+ trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
+ if (!trans) {
err = -ENOMEM;
goto err2;
}
@@ -2814,6 +2841,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
if (net->nft.validate_state == NFT_VALIDATE_DO)
return nft_table_validate(net, table);
+ if (chain->flags & NFT_CHAIN_HW_OFFLOAD) {
+ flow = nft_flow_rule_create(rule);
+ if (IS_ERR(flow))
+ return PTR_ERR(flow);
+
+ nft_trans_flow_rule(trans) = flow;
+ }
+
return 0;
err2:
nf_tables_rule_release(&ctx, rule);
@@ -3877,6 +3912,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
[NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED },
[NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 },
[NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 },
+ [NFTA_SET_ELEM_EXPIRATION] = { .type = NLA_U64 },
[NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY,
.len = NFT_USERDATA_MAXLEN },
[NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED },
@@ -4330,7 +4366,7 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
void *nft_set_elem_init(const struct nft_set *set,
const struct nft_set_ext_tmpl *tmpl,
const u32 *key, const u32 *data,
- u64 timeout, gfp_t gfp)
+ u64 timeout, u64 expiration, gfp_t gfp)
{
struct nft_set_ext *ext;
void *elem;
@@ -4345,9 +4381,11 @@ void *nft_set_elem_init(const struct nft_set *set,
memcpy(nft_set_ext_key(ext), key, set->klen);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
memcpy(nft_set_ext_data(ext), data, set->dlen);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION))
- *nft_set_ext_expiration(ext) =
- get_jiffies_64() + timeout;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
+ *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration;
+ if (expiration == 0)
+ *nft_set_ext_expiration(ext) += timeout;
+ }
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
*nft_set_ext_timeout(ext) = timeout;
@@ -4412,6 +4450,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_trans *trans;
u32 flags = 0;
u64 timeout;
+ u64 expiration;
u8 ulen;
int err;
@@ -4455,6 +4494,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
timeout = set->timeout;
}
+ expiration = 0;
+ if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) {
+ if (!(set->flags & NFT_SET_TIMEOUT))
+ return -EINVAL;
+ err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION],
+ &expiration);
+ if (err)
+ return err;
+ }
+
err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1,
nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
@@ -4537,7 +4586,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data,
- timeout, GFP_KERNEL);
+ timeout, expiration, GFP_KERNEL);
if (elem.priv == NULL)
goto err3;
@@ -4739,7 +4788,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0,
- GFP_KERNEL);
+ 0, GFP_KERNEL);
if (elem.priv == NULL)
goto err2;
@@ -6359,9 +6408,9 @@ static void nft_chain_commit_update(struct nft_trans *trans)
if (!nft_is_base_chain(trans->ctx.chain))
return;
+ nft_chain_stats_replace(trans);
+
basechain = nft_base_chain(trans->ctx.chain);
- nft_chain_stats_replace(trans->ctx.net, basechain,
- nft_trans_chain_stats(trans));
switch (nft_trans_chain_policy(trans)) {
case NF_DROP:
@@ -6378,6 +6427,7 @@ static void nft_commit_release(struct nft_trans *trans)
nf_tables_table_destroy(&trans->ctx);
break;
case NFT_MSG_NEWCHAIN:
+ free_percpu(nft_trans_chain_stats(trans));
kfree(nft_trans_chain_name(trans));
break;
case NFT_MSG_DELCHAIN:
@@ -6596,6 +6646,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
struct nft_trans_elem *te;
struct nft_chain *chain;
struct nft_table *table;
+ int err;
if (list_empty(&net->nft.commit_list)) {
mutex_unlock(&net->nft.commit_mutex);
@@ -6606,6 +6657,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
if (nf_tables_validate(net) < 0)
return -EAGAIN;
+ err = nft_flow_rule_offload_commit(net);
+ if (err < 0)
+ return err;
+
/* 1. Allocate space for next generation rules_gen_X[] */
list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
int ret;
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index b950cd31348b..96c74c4c7176 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -19,6 +19,7 @@
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_log.h>
+#include <net/netfilter/nft_meta.h>
static noinline void __nft_trace_packet(struct nft_traceinfo *info,
const struct nft_chain *chain,
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
new file mode 100644
index 000000000000..2c3302845f67
--- /dev/null
+++ b/net/netfilter/nf_tables_offload.c
@@ -0,0 +1,267 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <net/flow_offload.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
+#include <net/pkt_cls.h>
+
+static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions)
+{
+ struct nft_flow_rule *flow;
+
+ flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL);
+ if (!flow)
+ return NULL;
+
+ flow->rule = flow_rule_alloc(num_actions);
+ if (!flow->rule) {
+ kfree(flow);
+ return NULL;
+ }
+
+ flow->rule->match.dissector = &flow->match.dissector;
+ flow->rule->match.mask = &flow->match.mask;
+ flow->rule->match.key = &flow->match.key;
+
+ return flow;
+}
+
+struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule)
+{
+ struct nft_offload_ctx ctx = {
+ .dep = {
+ .type = NFT_OFFLOAD_DEP_UNSPEC,
+ },
+ };
+ struct nft_flow_rule *flow;
+ int num_actions = 0, err;
+ struct nft_expr *expr;
+
+ expr = nft_expr_first(rule);
+ while (expr->ops && expr != nft_expr_last(rule)) {
+ if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION)
+ num_actions++;
+
+ expr = nft_expr_next(expr);
+ }
+
+ flow = nft_flow_rule_alloc(num_actions);
+ if (!flow)
+ return ERR_PTR(-ENOMEM);
+
+ expr = nft_expr_first(rule);
+ while (expr->ops && expr != nft_expr_last(rule)) {
+ if (!expr->ops->offload) {
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+ err = expr->ops->offload(&ctx, flow, expr);
+ if (err < 0)
+ goto err_out;
+
+ expr = nft_expr_next(expr);
+ }
+ flow->proto = ctx.dep.l3num;
+
+ return flow;
+err_out:
+ nft_flow_rule_destroy(flow);
+
+ return ERR_PTR(err);
+}
+
+void nft_flow_rule_destroy(struct nft_flow_rule *flow)
+{
+ kfree(flow->rule);
+ kfree(flow);
+}
+
+void nft_offload_set_dependency(struct nft_offload_ctx *ctx,
+ enum nft_offload_dep_type type)
+{
+ ctx->dep.type = type;
+}
+
+void nft_offload_update_dependency(struct nft_offload_ctx *ctx,
+ const void *data, u32 len)
+{
+ switch (ctx->dep.type) {
+ case NFT_OFFLOAD_DEP_NETWORK:
+ WARN_ON(len != sizeof(__u16));
+ memcpy(&ctx->dep.l3num, data, sizeof(__u16));
+ break;
+ case NFT_OFFLOAD_DEP_TRANSPORT:
+ WARN_ON(len != sizeof(__u8));
+ memcpy(&ctx->dep.protonum, data, sizeof(__u8));
+ break;
+ default:
+ break;
+ }
+ ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC;
+}
+
+static void nft_flow_offload_common_init(struct flow_cls_common_offload *common,
+ __be16 proto,
+ struct netlink_ext_ack *extack)
+{
+ common->protocol = proto;
+ common->extack = extack;
+}
+
+static int nft_setup_cb_call(struct nft_base_chain *basechain,
+ enum tc_setup_type type, void *type_data)
+{
+ struct flow_block_cb *block_cb;
+ int err;
+
+ list_for_each_entry(block_cb, &basechain->cb_list, list) {
+ err = block_cb->cb(type, type_data, block_cb->cb_priv);
+ if (err < 0)
+ return err;
+ }
+ return 0;
+}
+
+static int nft_flow_offload_rule(struct nft_trans *trans,
+ enum flow_cls_command command)
+{
+ struct nft_flow_rule *flow = nft_trans_flow_rule(trans);
+ struct nft_rule *rule = nft_trans_rule(trans);
+ struct flow_cls_offload cls_flow = {};
+ struct nft_base_chain *basechain;
+ struct netlink_ext_ack extack;
+ __be16 proto = ETH_P_ALL;
+
+ if (!nft_is_base_chain(trans->ctx.chain))
+ return -EOPNOTSUPP;
+
+ basechain = nft_base_chain(trans->ctx.chain);
+
+ if (flow)
+ proto = flow->proto;
+
+ nft_flow_offload_common_init(&cls_flow.common, proto, &extack);
+ cls_flow.command = command;
+ cls_flow.cookie = (unsigned long) rule;
+ if (flow)
+ cls_flow.rule = flow->rule;
+
+ return nft_setup_cb_call(basechain, TC_SETUP_CLSFLOWER, &cls_flow);
+}
+
+static int nft_flow_offload_bind(struct flow_block_offload *bo,
+ struct nft_base_chain *basechain)
+{
+ list_splice(&bo->cb_list, &basechain->cb_list);
+ return 0;
+}
+
+static int nft_flow_offload_unbind(struct flow_block_offload *bo,
+ struct nft_base_chain *basechain)
+{
+ struct flow_block_cb *block_cb, *next;
+
+ list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+ list_del(&block_cb->list);
+ flow_block_cb_free(block_cb);
+ }
+
+ return 0;
+}
+
+#define FLOW_SETUP_BLOCK TC_SETUP_BLOCK
+
+static int nft_flow_offload_chain(struct nft_trans *trans,
+ enum flow_block_command cmd)
+{
+ struct nft_chain *chain = trans->ctx.chain;
+ struct netlink_ext_ack extack = {};
+ struct flow_block_offload bo = {};
+ struct nft_base_chain *basechain;
+ struct net_device *dev;
+ int err;
+
+ if (!nft_is_base_chain(chain))
+ return -EOPNOTSUPP;
+
+ basechain = nft_base_chain(chain);
+ dev = basechain->ops.dev;
+ if (!dev || !dev->netdev_ops->ndo_setup_tc)
+ return -EOPNOTSUPP;
+
+ /* Only default policy to accept is supported for now. */
+ if (cmd == FLOW_BLOCK_BIND &&
+ nft_trans_chain_policy(trans) != -1 &&
+ nft_trans_chain_policy(trans) != NF_ACCEPT)
+ return -EOPNOTSUPP;
+
+ bo.command = cmd;
+ bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ bo.extack = &extack;
+ INIT_LIST_HEAD(&bo.cb_list);
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, FLOW_SETUP_BLOCK, &bo);
+ if (err < 0)
+ return err;
+
+ switch (cmd) {
+ case FLOW_BLOCK_BIND:
+ err = nft_flow_offload_bind(&bo, basechain);
+ break;
+ case FLOW_BLOCK_UNBIND:
+ err = nft_flow_offload_unbind(&bo, basechain);
+ break;
+ }
+
+ return err;
+}
+
+int nft_flow_rule_offload_commit(struct net *net)
+{
+ struct nft_trans *trans;
+ int err = 0;
+
+ list_for_each_entry(trans, &net->nft.commit_list, list) {
+ if (trans->ctx.family != NFPROTO_NETDEV)
+ continue;
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWCHAIN:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ err = nft_flow_offload_chain(trans, FLOW_BLOCK_BIND);
+ break;
+ case NFT_MSG_DELCHAIN:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ err = nft_flow_offload_chain(trans, FLOW_BLOCK_UNBIND);
+ break;
+ case NFT_MSG_NEWRULE:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ if (trans->ctx.flags & NLM_F_REPLACE ||
+ !(trans->ctx.flags & NLM_F_APPEND))
+ return -EOPNOTSUPP;
+
+ err = nft_flow_offload_rule(trans, FLOW_CLS_REPLACE);
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
+ break;
+ case NFT_MSG_DELRULE:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ err = nft_flow_offload_rule(trans, FLOW_CLS_DESTROY);
+ break;
+ }
+
+ if (err)
+ return err;
+ }
+
+ return err;
+}
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index f42326b40d6f..9f5dea0064ea 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -33,6 +33,7 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
{
struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
const struct iphdr *ip = ip_hdr(skb);
+ const struct in_ifaddr *ifa;
int ret = 0;
if (ttl_check == NF_OSF_TTL_TRUE)
@@ -42,15 +43,13 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
else if (ip->ttl <= f_ttl)
return 1;
- for_ifa(in_dev) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(ip->saddr, ifa)) {
ret = (ip->ttl == f_ttl);
break;
}
}
- endfor_ifa(in_dev);
-
return ret;
}
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 89750f74e3a2..b6a7ce622c72 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -859,7 +859,7 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
}
skb_put(e->skb, diff);
}
- if (!skb_make_writable(e->skb, data_len))
+ if (skb_ensure_writable(e->skb, data_len))
return -ENOMEM;
skb_copy_to_linear_data(e->skb, data, data_len);
e->skb->ip_summed = CHECKSUM_NONE;
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 411c0cf741e3..bd173b1824c6 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -12,6 +12,7 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_tables.h>
struct nft_cmp_expr {
@@ -107,12 +108,44 @@ nla_put_failure:
return -1;
}
+static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_cmp_expr *priv)
+{
+ struct nft_offload_reg *reg = &ctx->regs[priv->sreg];
+ u8 *mask = (u8 *)&flow->match.mask;
+ u8 *key = (u8 *)&flow->match.key;
+
+ if (priv->op != NFT_CMP_EQ)
+ return -EOPNOTSUPP;
+
+ memcpy(key + reg->offset, &priv->data, priv->len);
+ memcpy(mask + reg->offset, &reg->mask, priv->len);
+
+ flow->match.dissector.used_keys |= BIT(reg->key);
+ flow->match.dissector.offset[reg->key] = reg->base_offset;
+
+ nft_offload_update_dependency(ctx, &priv->data, priv->len);
+
+ return 0;
+}
+
+static int nft_cmp_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+
+ return __nft_cmp_offload(ctx, flow, priv);
+}
+
static const struct nft_expr_ops nft_cmp_ops = {
.type = &nft_cmp_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)),
.eval = nft_cmp_eval,
.init = nft_cmp_init,
.dump = nft_cmp_dump,
+ .offload = nft_cmp_offload,
};
static int nft_cmp_fast_init(const struct nft_ctx *ctx,
@@ -143,6 +176,25 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx,
return 0;
}
+static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_cmp_expr cmp = {
+ .data = {
+ .data = {
+ [0] = priv->data,
+ },
+ },
+ .sreg = priv->sreg,
+ .len = priv->len / BITS_PER_BYTE,
+ .op = NFT_CMP_EQ,
+ };
+
+ return __nft_cmp_offload(ctx, flow, &cmp);
+}
+
static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
@@ -169,6 +221,7 @@ const struct nft_expr_ops nft_cmp_fast_ops = {
.eval = NULL, /* inlined */
.init = nft_cmp_fast_init,
.dump = nft_cmp_fast_dump,
+ .offload = nft_cmp_fast_offload,
};
static const struct nft_expr_ops *
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index dfcdea6619f1..827ab6196df9 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -21,6 +21,7 @@
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_expect.h>
struct nft_ct {
enum nft_ct_keys key:8;
@@ -1153,6 +1154,135 @@ static struct nft_object_type nft_ct_helper_obj_type __read_mostly = {
.owner = THIS_MODULE,
};
+struct nft_ct_expect_obj {
+ u16 l3num;
+ __be16 dport;
+ u8 l4proto;
+ u8 size;
+ u32 timeout;
+};
+
+static int nft_ct_expect_obj_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_ct_expect_obj *priv = nft_obj_data(obj);
+
+ if (!tb[NFTA_CT_EXPECT_L4PROTO] ||
+ !tb[NFTA_CT_EXPECT_DPORT] ||
+ !tb[NFTA_CT_EXPECT_TIMEOUT] ||
+ !tb[NFTA_CT_EXPECT_SIZE])
+ return -EINVAL;
+
+ priv->l3num = ctx->family;
+ if (tb[NFTA_CT_EXPECT_L3PROTO])
+ priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO]));
+
+ priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]);
+ priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]);
+ priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]);
+ priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]);
+
+ return nf_ct_netns_get(ctx->net, ctx->family);
+}
+
+static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ nf_ct_netns_put(ctx->net, ctx->family);
+}
+
+static int nft_ct_expect_obj_dump(struct sk_buff *skb,
+ struct nft_object *obj, bool reset)
+{
+ const struct nft_ct_expect_obj *priv = nft_obj_data(obj);
+
+ if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) ||
+ nla_put_u8(skb, NFTA_CT_EXPECT_L4PROTO, priv->l4proto) ||
+ nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) ||
+ nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) ||
+ nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size))
+ return -1;
+
+ return 0;
+}
+
+static void nft_ct_expect_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_ct_expect_obj *priv = nft_obj_data(obj);
+ struct nf_conntrack_expect *exp;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_help *help;
+ enum ip_conntrack_dir dir;
+ u16 l3num = priv->l3num;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_UNTRACKED) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ dir = CTINFO2DIR(ctinfo);
+
+ help = nfct_help(ct);
+ if (!help)
+ help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
+ if (!help) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+
+ if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ if (l3num == NFPROTO_INET)
+ l3num = nf_ct_l3num(ct);
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num,
+ &ct->tuplehash[!dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ priv->l4proto, NULL, &priv->dport);
+ exp->timeout.expires = jiffies + priv->timeout * HZ;
+
+ if (nf_ct_expect_related(exp) != 0)
+ regs->verdict.code = NF_DROP;
+}
+
+static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = {
+ [NFTA_CT_EXPECT_L3PROTO] = { .type = NLA_U16 },
+ [NFTA_CT_EXPECT_L4PROTO] = { .type = NLA_U8 },
+ [NFTA_CT_EXPECT_DPORT] = { .type = NLA_U16 },
+ [NFTA_CT_EXPECT_TIMEOUT] = { .type = NLA_U32 },
+ [NFTA_CT_EXPECT_SIZE] = { .type = NLA_U8 },
+};
+
+static struct nft_object_type nft_ct_expect_obj_type;
+
+static const struct nft_object_ops nft_ct_expect_obj_ops = {
+ .type = &nft_ct_expect_obj_type,
+ .size = sizeof(struct nft_ct_expect_obj),
+ .eval = nft_ct_expect_obj_eval,
+ .init = nft_ct_expect_obj_init,
+ .destroy = nft_ct_expect_obj_destroy,
+ .dump = nft_ct_expect_obj_dump,
+};
+
+static struct nft_object_type nft_ct_expect_obj_type __read_mostly = {
+ .type = NFT_OBJECT_CT_EXPECT,
+ .ops = &nft_ct_expect_obj_ops,
+ .maxattr = NFTA_CT_EXPECT_MAX,
+ .policy = nft_ct_expect_policy,
+ .owner = THIS_MODULE,
+};
+
static int __init nft_ct_module_init(void)
{
int err;
@@ -1170,17 +1300,23 @@ static int __init nft_ct_module_init(void)
err = nft_register_obj(&nft_ct_helper_obj_type);
if (err < 0)
goto err2;
+
+ err = nft_register_obj(&nft_ct_expect_obj_type);
+ if (err < 0)
+ goto err3;
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
err = nft_register_obj(&nft_ct_timeout_obj_type);
if (err < 0)
- goto err3;
+ goto err4;
#endif
return 0;
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+err4:
+ nft_unregister_obj(&nft_ct_expect_obj_type);
+#endif
err3:
nft_unregister_obj(&nft_ct_helper_obj_type);
-#endif
err2:
nft_unregister_expr(&nft_notrack_type);
err1:
@@ -1193,6 +1329,7 @@ static void __exit nft_ct_module_exit(void)
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
nft_unregister_obj(&nft_ct_timeout_obj_type);
#endif
+ nft_unregister_obj(&nft_ct_expect_obj_type);
nft_unregister_obj(&nft_ct_helper_obj_type);
nft_unregister_expr(&nft_notrack_type);
nft_unregister_expr(&nft_ct_type);
@@ -1207,3 +1344,4 @@ MODULE_ALIAS_NFT_EXPR("ct");
MODULE_ALIAS_NFT_EXPR("notrack");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER);
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT);
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT);
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 505bdfc66801..33833a0cb989 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -56,7 +56,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
elem = nft_set_elem_init(set, &priv->tmpl,
&regs->data[priv->sreg_key],
&regs->data[priv->sreg_data],
- timeout, GFP_ATOMIC);
+ timeout, 0, GFP_ATOMIC);
if (elem == NULL)
goto err1;
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a7aa6c5250a4..a5e8469859e3 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -59,6 +59,103 @@ err:
regs->verdict.code = NFT_BREAK;
}
+/* find the offset to specified option.
+ *
+ * If target header is found, its offset is set in *offset and return option
+ * number. Otherwise, return negative error.
+ *
+ * If the first fragment doesn't contain the End of Options it is considered
+ * invalid.
+ */
+static int ipv4_find_option(struct net *net, struct sk_buff *skb,
+ unsigned int *offset, int target)
+{
+ unsigned char optbuf[sizeof(struct ip_options) + 40];
+ struct ip_options *opt = (struct ip_options *)optbuf;
+ struct iphdr *iph, _iph;
+ unsigned int start;
+ bool found = false;
+ __be32 info;
+ int optlen;
+
+ iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (!iph)
+ return -EBADMSG;
+ start = sizeof(struct iphdr);
+
+ optlen = iph->ihl * 4 - (int)sizeof(struct iphdr);
+ if (optlen <= 0)
+ return -ENOENT;
+
+ memset(opt, 0, sizeof(struct ip_options));
+ /* Copy the options since __ip_options_compile() modifies
+ * the options.
+ */
+ if (skb_copy_bits(skb, start, opt->__data, optlen))
+ return -EBADMSG;
+ opt->optlen = optlen;
+
+ if (__ip_options_compile(net, opt, NULL, &info))
+ return -EBADMSG;
+
+ switch (target) {
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ if (!opt->srr)
+ break;
+ found = target == IPOPT_SSRR ? opt->is_strictroute :
+ !opt->is_strictroute;
+ if (found)
+ *offset = opt->srr + start;
+ break;
+ case IPOPT_RR:
+ if (!opt->rr)
+ break;
+ *offset = opt->rr + start;
+ found = true;
+ break;
+ case IPOPT_RA:
+ if (!opt->router_alert)
+ break;
+ *offset = opt->router_alert + start;
+ found = true;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return found ? target : -ENOENT;
+}
+
+static void nft_exthdr_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ struct sk_buff *skb = pkt->skb;
+ unsigned int offset;
+ int err;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ goto err;
+
+ err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type);
+ if (priv->flags & NFT_EXTHDR_F_PRESENT) {
+ *dest = (err >= 0);
+ return;
+ } else if (err < 0) {
+ goto err;
+ }
+ offset += priv->offset;
+
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+ if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0)
+ goto err;
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
static void *
nft_tcp_header_pointer(const struct nft_pktinfo *pkt,
unsigned int len, void *buffer, unsigned int *tcphdr_len)
@@ -153,7 +250,8 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
if (i + optl > tcphdr_len || priv->len + priv->offset > optl)
return;
- if (!skb_make_writable(pkt->skb, pkt->xt.thoff + i + priv->len))
+ if (skb_ensure_writable(pkt->skb,
+ pkt->xt.thoff + i + priv->len))
return;
tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff,
@@ -311,6 +409,28 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
return nft_validate_register_load(priv->sreg, priv->len);
}
+static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ int err = nft_exthdr_init(ctx, expr, tb);
+
+ if (err < 0)
+ return err;
+
+ switch (priv->type) {
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ case IPOPT_RR:
+ case IPOPT_RA:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv)
{
if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type))
@@ -357,6 +477,14 @@ static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
.dump = nft_exthdr_dump,
};
+static const struct nft_expr_ops nft_exthdr_ipv4_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_ipv4_eval,
+ .init = nft_exthdr_ipv4_init,
+ .dump = nft_exthdr_dump,
+};
+
static const struct nft_expr_ops nft_exthdr_tcp_ops = {
.type = &nft_exthdr_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
@@ -397,6 +525,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_ipv6_ops;
break;
+ case NFT_EXTHDR_OP_IPV4:
+ if (ctx->family != NFPROTO_IPV6) {
+ if (tb[NFTA_EXTHDR_DREG])
+ return &nft_exthdr_ipv4_ops;
+ }
+ break;
}
return ERR_PTR(-EOPNOTSUPP);
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index cb8547f97220..ca2ae4b95a8d 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -13,6 +13,7 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
void nft_immediate_eval(const struct nft_expr *expr,
struct nft_regs *regs,
@@ -124,6 +125,34 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
return 0;
}
+static int nft_immediate_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ struct flow_action_entry *entry;
+ const struct nft_data *data;
+
+ if (priv->dreg != NFT_REG_VERDICT)
+ return -EOPNOTSUPP;
+
+ entry = &flow->rule->action.entries[ctx->num_actions++];
+
+ data = &priv->data;
+ switch (data->verdict.code) {
+ case NF_ACCEPT:
+ entry->id = FLOW_ACTION_ACCEPT;
+ break;
+ case NF_DROP:
+ entry->id = FLOW_ACTION_DROP;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
static const struct nft_expr_ops nft_imm_ops = {
.type = &nft_imm_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
@@ -133,6 +162,8 @@ static const struct nft_expr_ops nft_imm_ops = {
.deactivate = nft_immediate_deactivate,
.dump = nft_immediate_dump,
.validate = nft_immediate_validate,
+ .offload = nft_immediate_offload,
+ .offload_flags = NFT_OFFLOAD_F_ACTION,
};
struct nft_expr_type nft_imm_type __read_mostly = {
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index a54329b8634a..76866f77e343 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -21,23 +21,13 @@
#include <net/tcp_states.h> /* for TCP_TIME_WAIT */
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nft_meta.h>
+#include <net/netfilter/nf_tables_offload.h>
#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
-struct nft_meta {
- enum nft_meta_keys key:8;
- union {
- enum nft_registers dreg:8;
- enum nft_registers sreg:8;
- };
-};
-
static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
-#ifdef CONFIG_NF_TABLES_BRIDGE
-#include "../bridge/br_private.h"
-#endif
-
void nft_meta_get_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -47,9 +37,6 @@ void nft_meta_get_eval(const struct nft_expr *expr,
const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
struct sock *sk;
u32 *dest = &regs->data[priv->dreg];
-#ifdef CONFIG_NF_TABLES_BRIDGE
- const struct net_bridge_port *p;
-#endif
switch (priv->key) {
case NFT_META_LEN:
@@ -229,18 +216,6 @@ void nft_meta_get_eval(const struct nft_expr *expr,
nft_reg_store8(dest, secpath_exists(skb));
break;
#endif
-#ifdef CONFIG_NF_TABLES_BRIDGE
- case NFT_META_BRI_IIFNAME:
- if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
- goto err;
- strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
- return;
- case NFT_META_BRI_OIFNAME:
- if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
- goto err;
- strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
- return;
-#endif
case NFT_META_IIFKIND:
if (in == NULL || in->rtnl_link_ops == NULL)
goto err;
@@ -260,10 +235,11 @@ void nft_meta_get_eval(const struct nft_expr *expr,
err:
regs->verdict.code = NFT_BREAK;
}
+EXPORT_SYMBOL_GPL(nft_meta_get_eval);
-static void nft_meta_set_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_meta_set_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_meta *meta = nft_expr_priv(expr);
struct sk_buff *skb = pkt->skb;
@@ -300,16 +276,18 @@ static void nft_meta_set_eval(const struct nft_expr *expr,
WARN_ON(1);
}
}
+EXPORT_SYMBOL_GPL(nft_meta_set_eval);
-static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
+const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
[NFTA_META_DREG] = { .type = NLA_U32 },
[NFTA_META_KEY] = { .type = NLA_U32 },
[NFTA_META_SREG] = { .type = NLA_U32 },
};
+EXPORT_SYMBOL_GPL(nft_meta_policy);
-static int nft_meta_get_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+int nft_meta_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int len;
@@ -360,14 +338,6 @@ static int nft_meta_get_init(const struct nft_ctx *ctx,
len = sizeof(u8);
break;
#endif
-#ifdef CONFIG_NF_TABLES_BRIDGE
- case NFT_META_BRI_IIFNAME:
- case NFT_META_BRI_OIFNAME:
- if (ctx->family != NFPROTO_BRIDGE)
- return -EOPNOTSUPP;
- len = IFNAMSIZ;
- break;
-#endif
default:
return -EOPNOTSUPP;
}
@@ -376,6 +346,7 @@ static int nft_meta_get_init(const struct nft_ctx *ctx,
return nft_validate_register_store(ctx, priv->dreg, NULL,
NFT_DATA_VALUE, len);
}
+EXPORT_SYMBOL_GPL(nft_meta_get_init);
static int nft_meta_get_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
@@ -409,9 +380,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
#endif
}
-static int nft_meta_set_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+int nft_meta_set_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int hooks;
@@ -437,10 +408,11 @@ static int nft_meta_set_validate(const struct nft_ctx *ctx,
return nft_chain_validate_hooks(ctx->chain, hooks);
}
+EXPORT_SYMBOL_GPL(nft_meta_set_validate);
-static int nft_meta_set_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+int nft_meta_set_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int len;
@@ -475,9 +447,10 @@ static int nft_meta_set_init(const struct nft_ctx *ctx,
return 0;
}
+EXPORT_SYMBOL_GPL(nft_meta_set_init);
-static int nft_meta_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+int nft_meta_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -490,8 +463,9 @@ static int nft_meta_get_dump(struct sk_buff *skb,
nla_put_failure:
return -1;
}
+EXPORT_SYMBOL_GPL(nft_meta_get_dump);
-static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -505,15 +479,42 @@ static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
nla_put_failure:
return -1;
}
+EXPORT_SYMBOL_GPL(nft_meta_set_dump);
-static void nft_meta_set_destroy(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+void nft_meta_set_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
const struct nft_meta *priv = nft_expr_priv(expr);
if (priv->key == NFT_META_NFTRACE)
static_branch_dec(&nft_trace_enabled);
}
+EXPORT_SYMBOL_GPL(nft_meta_set_destroy);
+
+static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ switch (priv->key) {
+ case NFT_META_PROTOCOL:
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto,
+ sizeof(__u16), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
+ break;
+ case NFT_META_L4PROTO:
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
+ sizeof(__u8), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
static const struct nft_expr_ops nft_meta_get_ops = {
.type = &nft_meta_type,
@@ -522,6 +523,7 @@ static const struct nft_expr_ops nft_meta_get_ops = {
.init = nft_meta_get_init,
.dump = nft_meta_get_dump,
.validate = nft_meta_get_validate,
+ .offload = nft_meta_get_offload,
};
static const struct nft_expr_ops nft_meta_set_ops = {
@@ -544,6 +546,10 @@ nft_meta_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
return ERR_PTR(-EINVAL);
+#ifdef CONFIG_NF_TABLES_BRIDGE
+ if (ctx->family == NFPROTO_BRIDGE)
+ return ERR_PTR(-EAGAIN);
+#endif
if (tb[NFTA_META_DREG])
return &nft_meta_get_ops;
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 680bd9f38a81..22a80eb60222 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -15,10 +15,13 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
/* For layer 4 checksum field offset. */
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmpv6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
/* add vlan header into the user buffer for if tag was removed by offloads */
static bool
@@ -150,12 +153,195 @@ nla_put_failure:
return -1;
}
+static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ switch (priv->offset) {
+ case offsetof(struct ethhdr, h_source):
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
+ src, ETH_ALEN, reg);
+ break;
+ case offsetof(struct ethhdr, h_dest):
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
+ dst, ETH_ALEN, reg);
+ break;
+ }
+
+ return 0;
+}
+
+static int nft_payload_offload_ip(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ switch (priv->offset) {
+ case offsetof(struct iphdr, saddr):
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src,
+ sizeof(struct in_addr), reg);
+ break;
+ case offsetof(struct iphdr, daddr):
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst,
+ sizeof(struct in_addr), reg);
+ break;
+ case offsetof(struct iphdr, protocol):
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
+ sizeof(__u8), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ switch (priv->offset) {
+ case offsetof(struct ipv6hdr, saddr):
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src,
+ sizeof(struct in6_addr), reg);
+ break;
+ case offsetof(struct ipv6hdr, daddr):
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst,
+ sizeof(struct in6_addr), reg);
+ break;
+ case offsetof(struct ipv6hdr, nexthdr):
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
+ sizeof(__u8), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int nft_payload_offload_nh(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ int err;
+
+ switch (ctx->dep.l3num) {
+ case htons(ETH_P_IP):
+ err = nft_payload_offload_ip(ctx, flow, priv);
+ break;
+ case htons(ETH_P_IPV6):
+ err = nft_payload_offload_ip6(ctx, flow, priv);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return err;
+}
+
+static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ switch (priv->offset) {
+ case offsetof(struct tcphdr, source):
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
+ sizeof(__be16), reg);
+ break;
+ case offsetof(struct tcphdr, dest):
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
+ sizeof(__be16), reg);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int nft_payload_offload_udp(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ switch (priv->offset) {
+ case offsetof(struct udphdr, source):
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
+ sizeof(__be16), reg);
+ break;
+ case offsetof(struct udphdr, dest):
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
+ sizeof(__be16), reg);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int nft_payload_offload_th(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ int err;
+
+ switch (ctx->dep.protonum) {
+ case IPPROTO_TCP:
+ err = nft_payload_offload_tcp(ctx, flow, priv);
+ break;
+ case IPPROTO_UDP:
+ err = nft_payload_offload_udp(ctx, flow, priv);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return err;
+}
+
+static int nft_payload_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ int err;
+
+ switch (priv->base) {
+ case NFT_PAYLOAD_LL_HEADER:
+ err = nft_payload_offload_ll(ctx, flow, priv);
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ err = nft_payload_offload_nh(ctx, flow, priv);
+ break;
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ err = nft_payload_offload_th(ctx, flow, priv);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+ return err;
+}
+
static const struct nft_expr_ops nft_payload_ops = {
.type = &nft_payload_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
.eval = nft_payload_eval,
.init = nft_payload_init,
.dump = nft_payload_dump,
+ .offload = nft_payload_offload,
};
const struct nft_expr_ops nft_payload_fast_ops = {
@@ -164,6 +350,7 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.eval = nft_payload_eval,
.init = nft_payload_init,
.dump = nft_payload_dump,
+ .offload = nft_payload_offload,
};
static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
@@ -240,7 +427,7 @@ static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt,
tsum));
}
- if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) ||
+ if (skb_ensure_writable(skb, l4csum_offset + sizeof(sum)) ||
skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0)
return -1;
@@ -256,7 +443,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
return -1;
nft_csum_replace(&sum, fsum, tsum);
- if (!skb_make_writable(skb, csum_offset + sizeof(sum)) ||
+ if (skb_ensure_writable(skb, csum_offset + sizeof(sum)) ||
skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
return -1;
@@ -309,7 +496,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
goto err;
}
- if (!skb_make_writable(skb, max(offset + priv->len, 0)) ||
+ if (skb_ensure_writable(skb, max(offset + priv->len, 0)) ||
skb_store_bits(skb, offset, src, priv->len) < 0)
goto err;
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
new file mode 100644
index 000000000000..80060ade8a5b
--- /dev/null
+++ b/net/netfilter/nft_synproxy.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/netlink.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_synproxy.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nf_synproxy.h>
+
+struct nft_synproxy {
+ struct nf_synproxy_info info;
+};
+
+static const struct nla_policy nft_synproxy_policy[NFTA_SYNPROXY_MAX + 1] = {
+ [NFTA_SYNPROXY_MSS] = { .type = NLA_U16 },
+ [NFTA_SYNPROXY_WSCALE] = { .type = NLA_U8 },
+ [NFTA_SYNPROXY_FLAGS] = { .type = NLA_U32 },
+};
+
+static void nft_synproxy_tcp_options(struct synproxy_options *opts,
+ const struct tcphdr *tcp,
+ struct synproxy_net *snet,
+ struct nf_synproxy_info *info,
+ struct nft_synproxy *priv)
+{
+ this_cpu_inc(snet->stats->syn_received);
+ if (tcp->ece && tcp->cwr)
+ opts->options |= NF_SYNPROXY_OPT_ECN;
+
+ opts->options &= priv->info.options;
+ if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_init_timestamp_cookie(info, opts);
+ else
+ opts->options &= ~(NF_SYNPROXY_OPT_WSCALE |
+ NF_SYNPROXY_OPT_SACK_PERM |
+ NF_SYNPROXY_OPT_ECN);
+}
+
+static void nft_synproxy_eval_v4(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ const struct tcphdr *tcp,
+ struct tcphdr *_tcph,
+ struct synproxy_options *opts)
+{
+ struct nft_synproxy *priv = nft_expr_priv(expr);
+ struct nf_synproxy_info info = priv->info;
+ struct net *net = nft_net(pkt);
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct sk_buff *skb = pkt->skb;
+
+ if (tcp->syn) {
+ /* Initial SYN from client */
+ nft_synproxy_tcp_options(opts, tcp, snet, &info, priv);
+ synproxy_send_client_synack(net, skb, tcp, opts);
+ consume_skb(skb);
+ regs->verdict.code = NF_STOLEN;
+ } else if (tcp->ack) {
+ /* ACK from client */
+ if (synproxy_recv_client_ack(net, skb, tcp, opts,
+ ntohl(tcp->seq))) {
+ consume_skb(skb);
+ regs->verdict.code = NF_STOLEN;
+ } else {
+ regs->verdict.code = NF_DROP;
+ }
+ }
+}
+
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+static void nft_synproxy_eval_v6(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ const struct tcphdr *tcp,
+ struct tcphdr *_tcph,
+ struct synproxy_options *opts)
+{
+ struct nft_synproxy *priv = nft_expr_priv(expr);
+ struct nf_synproxy_info info = priv->info;
+ struct net *net = nft_net(pkt);
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct sk_buff *skb = pkt->skb;
+
+ if (tcp->syn) {
+ /* Initial SYN from client */
+ nft_synproxy_tcp_options(opts, tcp, snet, &info, priv);
+ synproxy_send_client_synack_ipv6(net, skb, tcp, opts);
+ consume_skb(skb);
+ regs->verdict.code = NF_STOLEN;
+ } else if (tcp->ack) {
+ /* ACK from client */
+ if (synproxy_recv_client_ack_ipv6(net, skb, tcp, opts,
+ ntohl(tcp->seq))) {
+ consume_skb(skb);
+ regs->verdict.code = NF_STOLEN;
+ } else {
+ regs->verdict.code = NF_DROP;
+ }
+ }
+}
+#endif /* CONFIG_NF_TABLES_IPV6*/
+
+static void nft_synproxy_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct synproxy_options opts = {};
+ struct sk_buff *skb = pkt->skb;
+ int thoff = pkt->xt.thoff;
+ const struct tcphdr *tcp;
+ struct tcphdr _tcph;
+
+ if (pkt->tprot != IPPROTO_TCP) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ if (nf_ip_checksum(skb, nft_hook(pkt), thoff, IPPROTO_TCP)) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+
+ tcp = skb_header_pointer(skb, pkt->xt.thoff,
+ sizeof(struct tcphdr),
+ &_tcph);
+ if (!tcp) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+
+ if (!synproxy_parse_options(skb, thoff, tcp, &opts)) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ nft_synproxy_eval_v4(expr, regs, pkt, tcp, &_tcph, &opts);
+ return;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case htons(ETH_P_IPV6):
+ nft_synproxy_eval_v6(expr, regs, pkt, tcp, &_tcph, &opts);
+ return;
+#endif
+ }
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_synproxy_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct synproxy_net *snet = synproxy_pernet(ctx->net);
+ struct nft_synproxy *priv = nft_expr_priv(expr);
+ u32 flags;
+ int err;
+
+ if (tb[NFTA_SYNPROXY_MSS])
+ priv->info.mss = ntohs(nla_get_be16(tb[NFTA_SYNPROXY_MSS]));
+ if (tb[NFTA_SYNPROXY_WSCALE])
+ priv->info.wscale = nla_get_u8(tb[NFTA_SYNPROXY_WSCALE]);
+ if (tb[NFTA_SYNPROXY_FLAGS]) {
+ flags = ntohl(nla_get_be32(tb[NFTA_SYNPROXY_FLAGS]));
+ if (flags & ~NF_SYNPROXY_OPT_MASK)
+ return -EOPNOTSUPP;
+ priv->info.options = flags;
+ }
+
+ err = nf_ct_netns_get(ctx->net, ctx->family);
+ if (err)
+ return err;
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ err = nf_synproxy_ipv4_init(snet, ctx->net);
+ if (err)
+ goto nf_ct_failure;
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ err = nf_synproxy_ipv6_init(snet, ctx->net);
+ if (err)
+ goto nf_ct_failure;
+ break;
+#endif
+ case NFPROTO_INET:
+ case NFPROTO_BRIDGE:
+ err = nf_synproxy_ipv4_init(snet, ctx->net);
+ if (err)
+ goto nf_ct_failure;
+ err = nf_synproxy_ipv6_init(snet, ctx->net);
+ if (err)
+ goto nf_ct_failure;
+ break;
+ }
+
+ return 0;
+
+nf_ct_failure:
+ nf_ct_netns_put(ctx->net, ctx->family);
+ return err;
+}
+
+static void nft_synproxy_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct synproxy_net *snet = synproxy_pernet(ctx->net);
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ nf_synproxy_ipv4_fini(snet, ctx->net);
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ nf_synproxy_ipv6_fini(snet, ctx->net);
+ break;
+#endif
+ case NFPROTO_INET:
+ case NFPROTO_BRIDGE:
+ nf_synproxy_ipv4_fini(snet, ctx->net);
+ nf_synproxy_ipv6_fini(snet, ctx->net);
+ break;
+ }
+ nf_ct_netns_put(ctx->net, ctx->family);
+}
+
+static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_synproxy *priv = nft_expr_priv(expr);
+
+ if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) ||
+ nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) ||
+ nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_synproxy_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD));
+}
+
+static struct nft_expr_type nft_synproxy_type;
+static const struct nft_expr_ops nft_synproxy_ops = {
+ .eval = nft_synproxy_eval,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_synproxy)),
+ .init = nft_synproxy_init,
+ .destroy = nft_synproxy_destroy,
+ .dump = nft_synproxy_dump,
+ .type = &nft_synproxy_type,
+ .validate = nft_synproxy_validate,
+};
+
+static struct nft_expr_type nft_synproxy_type __read_mostly = {
+ .ops = &nft_synproxy_ops,
+ .name = "synproxy",
+ .owner = THIS_MODULE,
+ .policy = nft_synproxy_policy,
+ .maxattr = NFTA_SYNPROXY_MAX,
+};
+
+static int __init nft_synproxy_module_init(void)
+{
+ return nft_register_expr(&nft_synproxy_type);
+}
+
+static void __exit nft_synproxy_module_exit(void)
+{
+ return nft_unregister_expr(&nft_synproxy_type);
+}
+
+module_init(nft_synproxy_module_init);
+module_exit(nft_synproxy_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
+MODULE_ALIAS_NFT_EXPR("synproxy");
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 06dc55590441..51b454d8fa9c 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -17,7 +17,8 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
case CHECKSUM_COMPLETE:
if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
break;
- if ((protocol == 0 && !csum_fold(skb->csum)) ||
+ if ((protocol != IPPROTO_TCP && protocol != IPPROTO_UDP &&
+ !csum_fold(skb->csum)) ||
!csum_tcpudp_magic(iph->saddr, iph->daddr,
skb->len - dataoff, protocol,
skb->csum)) {
@@ -26,7 +27,7 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
}
/* fall through */
case CHECKSUM_NONE:
- if (protocol == 0)
+ if (protocol != IPPROTO_TCP && protocol != IPPROTO_UDP)
skb->csum = 0;
else
skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index b1054a3d18c5..eababc354ff1 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -31,7 +31,7 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
if (dscp != dinfo->dscp) {
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return NF_DROP;
ipv4_change_dsfield(ip_hdr(skb),
@@ -49,7 +49,7 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par)
u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
if (dscp != dinfo->dscp) {
- if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
return NF_DROP;
ipv6_change_dsfield(ipv6_hdr(skb),
@@ -79,7 +79,7 @@ tos_tg(struct sk_buff *skb, const struct xt_action_param *par)
nv = (orig & ~info->tos_mask) ^ info->tos_value;
if (orig != nv) {
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return NF_DROP;
iph = ip_hdr(skb);
ipv4_change_dsfield(iph, 0, nv);
@@ -99,7 +99,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par)
nv = (orig & ~info->tos_mask) ^ info->tos_value;
if (orig != nv) {
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return NF_DROP;
iph = ipv6_hdr(skb);
ipv6_change_dsfield(iph, 0, nv);
diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c
index 8221a5ce44bf..7873b834c300 100644
--- a/net/netfilter/xt_HL.c
+++ b/net/netfilter/xt_HL.c
@@ -29,7 +29,7 @@ ttl_tg(struct sk_buff *skb, const struct xt_action_param *par)
const struct ipt_TTL_info *info = par->targinfo;
int new_ttl;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, sizeof(*iph)))
return NF_DROP;
iph = ip_hdr(skb);
@@ -69,7 +69,7 @@ hl_tg6(struct sk_buff *skb, const struct xt_action_param *par)
const struct ip6t_HL_info *info = par->targinfo;
int new_hl;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, sizeof(*ip6h)))
return NF_DROP;
ip6h = ipv6_hdr(skb);
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 0b3a1b291c91..122db9fbb9f4 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -86,7 +86,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
if (par->fragoff != 0)
return 0;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return -1;
len = skb->len - tcphoff;
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 666f4ca9b15f..30e99464171b 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -28,33 +28,33 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset)
static unsigned int
tcpoptstrip_mangle_packet(struct sk_buff *skb,
const struct xt_action_param *par,
- unsigned int tcphoff, unsigned int minlen)
+ unsigned int tcphoff)
{
const struct xt_tcpoptstrip_target_info *info = par->targinfo;
+ struct tcphdr *tcph, _th;
unsigned int optl, i, j;
- struct tcphdr *tcph;
u_int16_t n, o;
u_int8_t *opt;
- int len, tcp_hdrlen;
+ int tcp_hdrlen;
/* This is a fragment, no TCP header is available */
if (par->fragoff != 0)
return XT_CONTINUE;
- if (!skb_make_writable(skb, skb->len))
+ tcph = skb_header_pointer(skb, tcphoff, sizeof(_th), &_th);
+ if (!tcph)
return NF_DROP;
- len = skb->len - tcphoff;
- if (len < (int)sizeof(struct tcphdr))
- return NF_DROP;
-
- tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
tcp_hdrlen = tcph->doff * 4;
+ if (tcp_hdrlen < sizeof(struct tcphdr))
+ return NF_DROP;
- if (len < tcp_hdrlen)
+ if (skb_ensure_writable(skb, tcphoff + tcp_hdrlen))
return NF_DROP;
- opt = (u_int8_t *)tcph;
+ /* must reload tcph, might have been moved */
+ tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
+ opt = (u8 *)tcph;
/*
* Walk through all TCP options - if we find some option to remove,
@@ -88,8 +88,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb,
static unsigned int
tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
- return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb),
- sizeof(struct iphdr) + sizeof(struct tcphdr));
+ return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb));
}
#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
@@ -106,8 +105,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
if (tcphoff < 0)
return NF_DROP;
- return tcpoptstrip_mangle_packet(skb, par, tcphoff,
- sizeof(*ipv6h) + sizeof(struct tcphdr));
+ return tcpoptstrip_mangle_packet(skb, par, tcphoff);
}
#endif
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
index 140ce6be639a..0c9e014e30b4 100644
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -2,7 +2,7 @@
/*
* xt_iprange - Netfilter module to match IP address ranges
*
- * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
* (C) CC Computer Consultants GmbH, 2008
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -130,7 +130,7 @@ static void __exit iprange_mt_exit(void)
module_init(iprange_mt_init);
module_exit(iprange_mt_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching");
MODULE_ALIAS("ipt_iprange");
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 95f64a99e425..e85ce69924ae 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -22,6 +22,9 @@ static int owner_check(const struct xt_mtchk_param *par)
struct xt_owner_match_info *info = par->matchinfo;
struct net *net = par->net;
+ if (info->match & ~XT_OWNER_MASK)
+ return -EINVAL;
+
/* Only allow the common case where the userns of the writer
* matches the userns of the network namespace.
*/
@@ -88,11 +91,28 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
if (info->match & XT_OWNER_GID) {
+ unsigned int i, match = false;
kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
- if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
- gid_lte(filp->f_cred->fsgid, gid_max)) ^
- !(info->invert & XT_OWNER_GID))
+ struct group_info *gi = filp->f_cred->group_info;
+
+ if (gid_gte(filp->f_cred->fsgid, gid_min) &&
+ gid_lte(filp->f_cred->fsgid, gid_max))
+ match = true;
+
+ if (!match && (info->match & XT_OWNER_SUPPL_GROUPS) && gi) {
+ for (i = 0; i < gi->ngroups; ++i) {
+ kgid_t group = gi->gid[i];
+
+ if (gid_gte(group, gid_min) &&
+ gid_lte(group, gid_max)) {
+ match = true;
+ break;
+ }
+ }
+ }
+
+ if (match ^ !(info->invert & XT_OWNER_GID))
return false;
}
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index f099228cb9c4..ecbfa291fb70 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -2,7 +2,7 @@
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
* Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
/* Kernel module which implements the set match and SET target
@@ -18,7 +18,7 @@
#include <uapi/linux/netfilter/xt_set.h>
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
MODULE_DESCRIPTION("Xtables: IP set match and target module");
MODULE_ALIAS("xt_SET");
MODULE_ALIAS("ipt_set");
@@ -436,6 +436,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
{
const struct xt_set_info_target_v3 *info = par->targinfo;
ip_set_id_t index;
+ int ret = 0;
if (info->add_set.index != IPSET_INVALID_ID) {
index = ip_set_nfnl_get_byindex(par->net,
@@ -453,17 +454,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
if (index == IPSET_INVALID_ID) {
pr_info_ratelimited("Cannot find del_set index %u as target\n",
info->del_set.index);
- if (info->add_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net,
- info->add_set.index);
- return -ENOENT;
+ ret = -ENOENT;
+ goto cleanup_add;
}
}
if (info->map_set.index != IPSET_INVALID_ID) {
if (strncmp(par->table, "mangle", 7)) {
pr_info_ratelimited("--map-set only usable from mangle table\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto cleanup_del;
}
if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) |
(info->flags & IPSET_FLAG_MAP_SKBQUEUE)) &&
@@ -471,20 +471,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
1 << NF_INET_LOCAL_OUT |
1 << NF_INET_POST_ROUTING))) {
pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto cleanup_del;
}
index = ip_set_nfnl_get_byindex(par->net,
info->map_set.index);
if (index == IPSET_INVALID_ID) {
pr_info_ratelimited("Cannot find map_set index %u as target\n",
info->map_set.index);
- if (info->add_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net,
- info->add_set.index);
- if (info->del_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net,
- info->del_set.index);
- return -ENOENT;
+ ret = -ENOENT;
+ goto cleanup_del;
}
}
@@ -492,16 +488,21 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
info->del_set.dim > IPSET_DIM_MAX ||
info->map_set.dim > IPSET_DIM_MAX) {
pr_info_ratelimited("SET target dimension over the limit!\n");
- if (info->add_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net, info->add_set.index);
- if (info->del_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net, info->del_set.index);
- if (info->map_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net, info->map_set.index);
- return -ERANGE;
+ ret = -ERANGE;
+ goto cleanup_mark;
}
return 0;
+cleanup_mark:
+ if (info->map_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->map_set.index);
+cleanup_del:
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+cleanup_add:
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ return ret;
}
static void
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e9ddfd782d16..90b2ab9dd449 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -241,13 +241,8 @@ static __net_init int netlink_tap_init_net(struct net *net)
return 0;
}
-static void __net_exit netlink_tap_exit_net(struct net *net)
-{
-}
-
static struct pernet_operations netlink_tap_net_ops = {
.init = netlink_tap_init_net,
- .exit = netlink_tap_exit_net,
.id = &netlink_tap_net_id,
.size = sizeof(struct netlink_tap_net),
};
@@ -2544,12 +2539,10 @@ struct nl_seq_iter {
int link;
};
-static int netlink_walk_start(struct nl_seq_iter *iter)
+static void netlink_walk_start(struct nl_seq_iter *iter)
{
rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti);
rhashtable_walk_start(&iter->hti);
-
- return 0;
}
static void netlink_walk_stop(struct nl_seq_iter *iter)
@@ -2565,8 +2558,6 @@ static void *__netlink_seq_next(struct seq_file *seq)
do {
for (;;) {
- int err;
-
nlk = rhashtable_walk_next(&iter->hti);
if (IS_ERR(nlk)) {
@@ -2583,9 +2574,7 @@ static void *__netlink_seq_next(struct seq_file *seq)
if (++iter->link >= MAX_LINKS)
return NULL;
- err = netlink_walk_start(iter);
- if (err)
- return ERR_PTR(err);
+ netlink_walk_start(iter);
}
} while (sock_net(&nlk->sk) != seq_file_net(seq));
@@ -2597,13 +2586,10 @@ static void *netlink_seq_start(struct seq_file *seq, loff_t *posp)
struct nl_seq_iter *iter = seq->private;
void *obj = SEQ_START_TOKEN;
loff_t pos;
- int err;
iter->link = 0;
- err = netlink_walk_start(iter);
- if (err)
- return ERR_PTR(err);
+ netlink_walk_start(iter);
for (pos = *posp; pos && obj && !IS_ERR(obj); pos--)
obj = __netlink_seq_next(seq);
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 86b87925ef34..96740d389377 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -869,7 +869,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev)
unsigned short frametype, flags, window, timeout;
int ret;
- skb->sk = NULL; /* Initially we don't know who it's for */
+ skb_orphan(skb);
/*
* skb->data points to the netrom frame start
@@ -968,6 +968,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev)
window = skb->data[20];
skb->sk = make;
+ skb->destructor = sock_efree;
make->sk_state = TCP_ESTABLISHED;
/* Fill in his circuit details */
diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c
index 0a0c265baaa4..ce3382be937f 100644
--- a/net/nfc/nci/data.c
+++ b/net/nfc/nci/data.c
@@ -107,7 +107,7 @@ static int nci_queue_tx_data_frags(struct nci_dev *ndev,
conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id);
if (!conn_info) {
rc = -EPROTO;
- goto free_exit;
+ goto exit;
}
__skb_queue_head_init(&frags_q);
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 151518dbabad..3572e11b6f21 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -160,50 +160,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
const struct nlattr *attr, int len);
-static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
- __be16 ethertype)
-{
- if (skb->ip_summed == CHECKSUM_COMPLETE) {
- __be16 diff[] = { ~(hdr->h_proto), ethertype };
-
- skb->csum = ~csum_partial((char *)diff, sizeof(diff),
- ~skb->csum);
- }
-
- hdr->h_proto = ethertype;
-}
-
static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
const struct ovs_action_push_mpls *mpls)
{
- struct mpls_shim_hdr *new_mpls_lse;
-
- /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
- if (skb->encapsulation)
- return -ENOTSUPP;
-
- if (skb_cow_head(skb, MPLS_HLEN) < 0)
- return -ENOMEM;
-
- if (!skb->inner_protocol) {
- skb_set_inner_network_header(skb, skb->mac_len);
- skb_set_inner_protocol(skb, skb->protocol);
- }
-
- skb_push(skb, MPLS_HLEN);
- memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
- skb->mac_len);
- skb_reset_mac_header(skb);
- skb_set_network_header(skb, skb->mac_len);
-
- new_mpls_lse = mpls_hdr(skb);
- new_mpls_lse->label_stack_entry = mpls->mpls_lse;
-
- skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
+ int err;
- if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET)
- update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
- skb->protocol = mpls->mpls_ethertype;
+ err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype);
+ if (err)
+ return err;
invalidate_flow_key(key);
return 0;
@@ -214,31 +178,10 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
{
int err;
- err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
- if (unlikely(err))
+ err = skb_mpls_pop(skb, ethertype);
+ if (err)
return err;
- skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
-
- memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
- skb->mac_len);
-
- __skb_pull(skb, MPLS_HLEN);
- skb_reset_mac_header(skb);
- skb_set_network_header(skb, skb->mac_len);
-
- if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) {
- struct ethhdr *hdr;
-
- /* mpls_hdr() is used to locate the ethertype field correctly in the
- * presence of VLAN tags.
- */
- hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
- update_ethertype(skb, hdr, ethertype);
- }
- if (eth_p_mpls(skb->protocol))
- skb->protocol = ethertype;
-
invalidate_flow_key(key);
return 0;
}
@@ -250,20 +193,12 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
__be32 lse;
int err;
- err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
- if (unlikely(err))
- return err;
-
stack = mpls_hdr(skb);
lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask);
- if (skb->ip_summed == CHECKSUM_COMPLETE) {
- __be32 diff[] = { ~(stack->label_stack_entry), lse };
-
- skb->csum = ~csum_partial((char *)diff, sizeof(diff),
- ~skb->csum);
- }
+ err = skb_mpls_update_lse(skb, lse);
+ if (err)
+ return err;
- stack->label_stack_entry = lse;
flow_key->mpls.top_lse = lse;
return 0;
}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 6747bc57b6fa..33b388103741 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1334,7 +1334,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts,
&flow->id, info, false, ufid_flags);
if (likely(reply)) {
- if (likely(!IS_ERR(reply))) {
+ if (!IS_ERR(reply)) {
rcu_read_lock(); /*To keep RCU checker happy. */
err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex,
reply, info->snd_portid,
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
index 53cf07d141b4..7af0cde8b293 100644
--- a/net/openvswitch/dp_notify.c
+++ b/net/openvswitch/dp_notify.c
@@ -48,7 +48,7 @@ void ovs_dp_notify_wq(struct work_struct *work)
if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL)
continue;
- if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH))
+ if (!(netif_is_ovs_port(vport->dev)))
dp_detach_port_notify(vport);
}
}
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 52a1ed9633ec..57d6436e6f6a 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -156,7 +156,7 @@ void ovs_netdev_detach_dev(struct vport *vport)
static void netdev_destroy(struct vport *vport)
{
rtnl_lock();
- if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ if (netif_is_ovs_port(vport->dev))
ovs_netdev_detach_dev(vport);
rtnl_unlock();
@@ -166,7 +166,7 @@ static void netdev_destroy(struct vport *vport)
void ovs_netdev_tunnel_destroy(struct vport *vport)
{
rtnl_lock();
- if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ if (netif_is_ovs_port(vport->dev))
ovs_netdev_detach_dev(vport);
/* We can be invoked by both explicit vport deletion and
@@ -186,7 +186,7 @@ EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy);
/* Returns null if this device is not attached to a datapath. */
struct vport *ovs_netdev_get_vport(struct net_device *dev)
{
- if (likely(dev->priv_flags & IFF_OVS_DATAPATH))
+ if (likely(netif_is_ovs_port(dev)))
return (struct vport *)
rcu_dereference_rtnl(dev->rx_handler_data);
else
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index f927de9bda0a..3fc38d16c456 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -248,8 +248,6 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options)
*/
void ovs_vport_del(struct vport *vport)
{
- ASSERT_OVSL();
-
hlist_del_rcu(&vport->hash_node);
module_put(vport->ops->owner);
vport->ops->destroy(vport);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 5f78df080573..8d54f3047768 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -384,7 +384,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
smp_wmb();
}
-static int __packet_get_status(struct packet_sock *po, void *frame)
+static int __packet_get_status(const struct packet_sock *po, void *frame)
{
union tpacket_uhdr h;
@@ -460,10 +460,10 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
return ts_status;
}
-static void *packet_lookup_frame(struct packet_sock *po,
- struct packet_ring_buffer *rb,
- unsigned int position,
- int status)
+static void *packet_lookup_frame(const struct packet_sock *po,
+ const struct packet_ring_buffer *rb,
+ unsigned int position,
+ int status)
{
unsigned int pg_vec_pos, frame_offset;
union tpacket_uhdr h;
@@ -758,7 +758,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1,
struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
struct sock *sk = &po->sk;
- if (po->stats.stats3.tp_drops)
+ if (atomic_read(&po->tp_drops))
status |= TP_STATUS_LOSING;
last_pkt = (struct tpacket3_hdr *)pkc1->prev;
@@ -1003,7 +1003,6 @@ static void prb_fill_curr_block(char *curr,
/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
struct sk_buff *skb,
- int status,
unsigned int len
)
{
@@ -1075,7 +1074,7 @@ static void *packet_current_rx_frame(struct packet_sock *po,
po->rx_ring.head, status);
return curr;
case TPACKET_V3:
- return __packet_lookup_frame_in_block(po, skb, status, len);
+ return __packet_lookup_frame_in_block(po, skb, len);
default:
WARN(1, "TPACKET version not supported\n");
BUG();
@@ -1083,10 +1082,10 @@ static void *packet_current_rx_frame(struct packet_sock *po,
}
}
-static void *prb_lookup_block(struct packet_sock *po,
- struct packet_ring_buffer *rb,
- unsigned int idx,
- int status)
+static void *prb_lookup_block(const struct packet_sock *po,
+ const struct packet_ring_buffer *rb,
+ unsigned int idx,
+ int status)
{
struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
@@ -1199,12 +1198,12 @@ static void packet_free_pending(struct packet_sock *po)
#define ROOM_LOW 0x1
#define ROOM_NORMAL 0x2
-static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
+static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
{
int idx, len;
- len = po->rx_ring.frame_max + 1;
- idx = po->rx_ring.head;
+ len = READ_ONCE(po->rx_ring.frame_max) + 1;
+ idx = READ_ONCE(po->rx_ring.head);
if (pow_off)
idx += len >> pow_off;
if (idx >= len)
@@ -1212,12 +1211,12 @@ static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
-static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
+static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
{
int idx, len;
- len = po->rx_ring.prb_bdqc.knum_blocks;
- idx = po->rx_ring.prb_bdqc.kactive_blk_num;
+ len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
+ idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
if (pow_off)
idx += len >> pow_off;
if (idx >= len)
@@ -1225,15 +1224,18 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
-static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+static int __packet_rcv_has_room(const struct packet_sock *po,
+ const struct sk_buff *skb)
{
- struct sock *sk = &po->sk;
+ const struct sock *sk = &po->sk;
int ret = ROOM_NONE;
if (po->prot_hook.func != tpacket_rcv) {
- int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
- - (skb ? skb->truesize : 0);
- if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
+ int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+ int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
+ - (skb ? skb->truesize : 0);
+
+ if (avail > (rcvbuf >> ROOM_POW_OFF))
return ROOM_NORMAL;
else if (avail > 0)
return ROOM_LOW;
@@ -1258,19 +1260,24 @@ static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
- int ret;
- bool has_room;
+ int pressure, ret;
- spin_lock_bh(&po->sk.sk_receive_queue.lock);
ret = __packet_rcv_has_room(po, skb);
- has_room = ret == ROOM_NORMAL;
- if (po->pressure == has_room)
- po->pressure = !has_room;
- spin_unlock_bh(&po->sk.sk_receive_queue.lock);
+ pressure = ret != ROOM_NORMAL;
+
+ if (READ_ONCE(po->pressure) != pressure)
+ WRITE_ONCE(po->pressure, pressure);
return ret;
}
+static void packet_rcv_try_clear_pressure(struct packet_sock *po)
+{
+ if (READ_ONCE(po->pressure) &&
+ __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
+ WRITE_ONCE(po->pressure, 0);
+}
+
static void packet_sock_destruct(struct sock *sk)
{
skb_queue_purge(&sk->sk_error_queue);
@@ -1351,7 +1358,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
i = j = min_t(int, po->rollover->sock, num - 1);
do {
po_next = pkt_sk(f->arr[i]);
- if (po_next != po_skip && !po_next->pressure &&
+ if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
if (i != j)
po->rollover->sock = i;
@@ -2126,10 +2133,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
drop_n_acct:
is_drop_n_account = true;
- spin_lock(&sk->sk_receive_queue.lock);
- po->stats.stats1.tp_drops++;
+ atomic_inc(&po->tp_drops);
atomic_inc(&sk->sk_drops);
- spin_unlock(&sk->sk_receive_queue.lock);
drop_n_restore:
if (skb_head != skb->data && skb_shared(skb)) {
@@ -2193,6 +2198,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (!res)
goto drop_n_restore;
+ /* If we are flooded, just give up */
+ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
+ atomic_inc(&po->tp_drops);
+ goto drop_n_restore;
+ }
+
if (skb->ip_summed == CHECKSUM_PARTIAL)
status |= TP_STATUS_CSUMNOTREADY;
else if (skb->pkt_type != PACKET_OUTGOING &&
@@ -2263,7 +2274,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
* Anyways, moving it for V1/V2 only as V3 doesn't need this
* at packet level.
*/
- if (po->stats.stats1.tp_drops)
+ if (atomic_read(&po->tp_drops))
status |= TP_STATUS_LOSING;
}
@@ -2379,9 +2390,9 @@ drop:
return 0;
drop_n_account:
- is_drop_n_account = true;
- po->stats.stats1.tp_drops++;
spin_unlock(&sk->sk_receive_queue.lock);
+ atomic_inc(&po->tp_drops);
+ is_drop_n_account = true;
sk->sk_data_ready(sk);
kfree_skb(copy_skb);
@@ -3318,8 +3329,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (skb == NULL)
goto out;
- if (pkt_sk(sk)->pressure)
- packet_rcv_has_room(pkt_sk(sk), NULL);
+ packet_rcv_try_clear_pressure(pkt_sk(sk));
if (pkt_sk(sk)->has_vnet_hdr) {
err = packet_rcv_vnet(msg, skb, &len);
@@ -3891,6 +3901,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
void *data = &val;
union tpacket_stats_u st;
struct tpacket_rollover_stats rstats;
+ int drops;
if (level != SOL_PACKET)
return -ENOPROTOOPT;
@@ -3907,14 +3918,17 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
memcpy(&st, &po->stats, sizeof(st));
memset(&po->stats, 0, sizeof(po->stats));
spin_unlock_bh(&sk->sk_receive_queue.lock);
+ drops = atomic_xchg(&po->tp_drops, 0);
if (po->tp_version == TPACKET_V3) {
lv = sizeof(struct tpacket_stats_v3);
- st.stats3.tp_packets += st.stats3.tp_drops;
+ st.stats3.tp_drops = drops;
+ st.stats3.tp_packets += drops;
data = &st.stats3;
} else {
lv = sizeof(struct tpacket_stats);
- st.stats1.tp_packets += st.stats1.tp_drops;
+ st.stats1.tp_drops = drops;
+ st.stats1.tp_packets += drops;
data = &st.stats1;
}
@@ -4133,8 +4147,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock,
TP_STATUS_KERNEL))
mask |= EPOLLIN | EPOLLRDNORM;
}
- if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
- po->pressure = 0;
+ packet_rcv_try_clear_pressure(po);
spin_unlock_bh(&sk->sk_receive_queue.lock);
spin_lock_bh(&sk->sk_write_queue.lock);
if (po->tx_ring.pg_vec) {
diff --git a/net/packet/internal.h b/net/packet/internal.h
index c70a2794456f..82fb2b10f790 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -132,6 +132,7 @@ struct packet_sock {
struct net_device __rcu *cached_dev;
int (*xmit)(struct sk_buff *skb);
struct packet_type prot_hook ____cacheline_aligned_in_smp;
+ atomic_t tp_drops ____cacheline_aligned_in_smp;
};
static struct packet_sock *pkt_sk(struct sock *sk)
diff --git a/net/rds/ib.c b/net/rds/ib.c
index b8d581b779b2..ec05d91aa9a2 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -318,6 +318,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
iinfo->max_send_sge = rds_ibdev->max_sge;
rds_ib_get_mr_info(rds_ibdev, iinfo);
+ iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs);
}
return 1;
}
@@ -351,6 +352,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
iinfo6->max_send_sge = rds_ibdev->max_sge;
rds6_ib_get_mr_info(rds_ibdev, iinfo6);
+ iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs);
}
return 1;
}
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index f9f4721cdfa7..d09eaf153544 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -545,6 +545,7 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
switch (rx->sk.sk_state) {
case RXRPC_UNBOUND:
+ case RXRPC_CLIENT_UNBOUND:
rx->srx.srx_family = AF_RXRPC;
rx->srx.srx_service = 0;
rx->srx.transport_type = SOCK_DGRAM;
@@ -569,10 +570,9 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
}
rx->local = local;
- rx->sk.sk_state = RXRPC_CLIENT_UNBOUND;
+ rx->sk.sk_state = RXRPC_CLIENT_BOUND;
/* Fall through */
- case RXRPC_CLIENT_UNBOUND:
case RXRPC_CLIENT_BOUND:
if (!m->msg_name &&
test_bit(RXRPC_SOCK_CONNECTED, &rx->flags)) {
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index a0b6abfbd277..948e3fe249ec 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -519,6 +519,9 @@ send_fragmentable:
}
break;
#endif
+
+ default:
+ BUG();
}
if (ret < 0)
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2c72d95c3050..dd55b9ac3a66 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -842,6 +842,17 @@ config NET_ACT_CSUM
To compile this code as a module, choose M here: the
module will be called act_csum.
+config NET_ACT_MPLS
+ tristate "MPLS manipulation"
+ depends on NET_CLS_ACT
+ help
+ Say Y here to push or pop MPLS headers.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_mpls.
+
config NET_ACT_VLAN
tristate "Vlan manipulation"
depends on NET_CLS_ACT
@@ -877,6 +888,23 @@ config NET_ACT_CONNMARK
To compile this code as a module, choose M here: the
module will be called act_connmark.
+config NET_ACT_CTINFO
+ tristate "Netfilter Connection Mark Actions"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ help
+ Say Y here to allow transfer of a connmark stored information.
+ Current actions transfer connmark stored DSCP into
+ ipv4/v6 diffserv and/or to transfer connmark to packet
+ mark. Both are useful for restoring egress based marks
+ back onto ingress connections for qdisc priority mapping
+ purposes.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ctinfo.
+
config NET_ACT_SKBMOD
tristate "skb data modification action"
depends on NET_CLS_ACT
@@ -912,6 +940,17 @@ config NET_ACT_TUNNEL_KEY
To compile this code as a module, choose M here: the
module will be called act_tunnel_key.
+config NET_ACT_CT
+ tristate "connection tracking tc action"
+ depends on NET_CLS_ACT && NF_CONNTRACK
+ help
+ Say Y here to allow sending the packets to conntrack module.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ct.
+
config NET_IFE_SKBMARK
tristate "Support to encoding decoding skb mark on IFE action"
depends on NET_ACT_IFE
@@ -924,14 +963,6 @@ config NET_IFE_SKBTCINDEX
tristate "Support to encoding decoding skb tcindex on IFE action"
depends on NET_ACT_IFE
-config NET_CLS_IND
- bool "Incoming device classification"
- depends on NET_CLS_U32 || NET_CLS_FW
- ---help---
- Say Y here to extend the u32 and fw classifier to support
- classification based on the incoming device. This option is
- likely to disappear in favour of the metadata ematch.
-
endif # NET_SCHED
config NET_SCH_FIFO
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8a40431d7b5c..415d1e1f237e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -18,15 +18,18 @@ obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o
obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o
obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
+obj-$(CONFIG_NET_ACT_MPLS) += act_mpls.o
obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
+obj-$(CONFIG_NET_ACT_CTINFO) += act_ctinfo.o
obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o
obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o
obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
+obj-$(CONFIG_NET_ACT_CT) += act_ct.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 4e5d2e9ace5d..339712296164 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -221,12 +221,13 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
struct idr *idr = &idrinfo->action_idr;
struct tc_action *p;
unsigned long id = 1;
+ unsigned long tmp;
mutex_lock(&idrinfo->lock);
s_i = cb->args[0];
- idr_for_each_entry_ul(idr, p, id) {
+ idr_for_each_entry_ul(idr, p, tmp, id) {
index++;
if (index < s_i)
continue;
@@ -292,6 +293,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
struct idr *idr = &idrinfo->action_idr;
struct tc_action *p;
unsigned long id = 1;
+ unsigned long tmp;
nest = nla_nest_start_noflag(skb, 0);
if (nest == NULL)
@@ -300,7 +302,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
goto nla_put_failure;
mutex_lock(&idrinfo->lock);
- idr_for_each_entry_ul(idr, p, id) {
+ idr_for_each_entry_ul(idr, p, tmp, id) {
ret = tcf_idr_release_unsafe(p);
if (ret == ACT_P_DELETED) {
module_put(ops->owner);
@@ -533,8 +535,9 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
struct tc_action *p;
int ret;
unsigned long id = 1;
+ unsigned long tmp;
- idr_for_each_entry_ul(idr, p, id) {
+ idr_for_each_entry_ul(idr, p, tmp, id) {
ret = __tcf_idr_release(p, false, true);
if (ret == ACT_P_DELETED)
module_put(ops->owner);
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
new file mode 100644
index 000000000000..b501ce0cf116
--- /dev/null
+++ b/net/sched/act_ct.c
@@ -0,0 +1,984 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* -
+ * net/sched/act_ct.c Connection Tracking action
+ *
+ * Authors: Paul Blakey <paulb@mellanox.com>
+ * Yossi Kuperman <yossiku@mellanox.com>
+ * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/act_api.h>
+#include <net/ip.h>
+#include <net/ipv6_frag.h>
+#include <uapi/linux/tc_act/tc_ct.h>
+#include <net/tc_act/tc_ct.h>
+
+#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+static struct tc_action_ops act_ct_ops;
+static unsigned int ct_net_id;
+
+struct tc_ct_action_net {
+ struct tc_action_net tn; /* Must be first */
+ bool labels;
+};
+
+/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
+static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
+ u16 zone_id, bool force)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return false;
+ if (!net_eq(net, read_pnet(&ct->ct_net)))
+ return false;
+ if (nf_ct_zone(ct)->id != zone_id)
+ return false;
+
+ /* Force conntrack entry direction. */
+ if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+ if (nf_ct_is_confirmed(ct))
+ nf_ct_kill(ct);
+
+ nf_conntrack_put(&ct->ct_general);
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+
+ return false;
+ }
+
+ return true;
+}
+
+/* Trim the skb to the length specified by the IP/IPv6 header,
+ * removing any trailing lower-layer padding. This prepares the skb
+ * for higher-layer processing that assumes skb->len excludes padding
+ * (such as nf_ip_checksum). The caller needs to pull the skb to the
+ * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
+ */
+static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family)
+{
+ unsigned int len;
+ int err;
+
+ switch (family) {
+ case NFPROTO_IPV4:
+ len = ntohs(ip_hdr(skb)->tot_len);
+ break;
+ case NFPROTO_IPV6:
+ len = sizeof(struct ipv6hdr)
+ + ntohs(ipv6_hdr(skb)->payload_len);
+ break;
+ default:
+ len = skb->len;
+ }
+
+ err = pskb_trim_rcsum(skb, len);
+
+ return err;
+}
+
+static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
+{
+ u8 family = NFPROTO_UNSPEC;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ family = NFPROTO_IPV4;
+ break;
+ case htons(ETH_P_IPV6):
+ family = NFPROTO_IPV6;
+ break;
+ default:
+ break;
+ }
+
+ return family;
+}
+
+static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag)
+{
+ unsigned int len;
+
+ len = skb_network_offset(skb) + sizeof(struct iphdr);
+ if (unlikely(skb->len < len))
+ return -EINVAL;
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return -ENOMEM;
+
+ *frag = ip_is_fragment(ip_hdr(skb));
+ return 0;
+}
+
+static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag)
+{
+ unsigned int flags = 0, len, payload_ofs = 0;
+ unsigned short frag_off;
+ int nexthdr;
+
+ len = skb_network_offset(skb) + sizeof(struct ipv6hdr);
+ if (unlikely(skb->len < len))
+ return -EINVAL;
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return -ENOMEM;
+
+ nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
+ if (unlikely(nexthdr < 0))
+ return -EPROTO;
+
+ *frag = flags & IP6_FH_F_FRAG;
+ return 0;
+}
+
+static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
+ u8 family, u16 zone)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ int err = 0;
+ bool frag;
+
+ /* Previously seen (loopback)? Ignore. */
+ ct = nf_ct_get(skb, &ctinfo);
+ if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
+ return 0;
+
+ if (family == NFPROTO_IPV4)
+ err = tcf_ct_ipv4_is_fragment(skb, &frag);
+ else
+ err = tcf_ct_ipv6_is_fragment(skb, &frag);
+ if (err || !frag)
+ return err;
+
+ skb_get(skb);
+
+ if (family == NFPROTO_IPV4) {
+ enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ local_bh_disable();
+ err = ip_defrag(net, skb, user);
+ local_bh_enable();
+ if (err && err != -EINPROGRESS)
+ goto out_free;
+ } else { /* NFPROTO_IPV6 */
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
+
+ memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+ err = nf_ct_frag6_gather(net, skb, user);
+ if (err && err != -EINPROGRESS)
+ goto out_free;
+#else
+ err = -EOPNOTSUPP;
+ goto out_free;
+#endif
+ }
+
+ skb_clear_hash(skb);
+ skb->ignore_df = 1;
+ return err;
+
+out_free:
+ kfree_skb(skb);
+ return err;
+}
+
+static void tcf_ct_params_free(struct rcu_head *head)
+{
+ struct tcf_ct_params *params = container_of(head,
+ struct tcf_ct_params, rcu);
+
+ if (params->tmpl)
+ nf_conntrack_put(&params->tmpl->ct_general);
+ kfree(params);
+}
+
+#if IS_ENABLED(CONFIG_NF_NAT)
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_nat_range2 *range,
+ enum nf_nat_manip_type maniptype)
+{
+ int hooknum, err = NF_ACCEPT;
+
+ /* See HOOK2MANIP(). */
+ if (maniptype == NF_NAT_MANIP_SRC)
+ hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+ else
+ hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ if (skb->protocol == htons(ETH_P_IP) &&
+ ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ hooknum))
+ err = NF_DROP;
+ goto out;
+ } else if (IS_ENABLED(CONFIG_IPV6) &&
+ skb->protocol == htons(ETH_P_IPV6)) {
+ __be16 frag_off;
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ int hdrlen = ipv6_skip_exthdr(skb,
+ sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
+
+ if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+ if (!nf_nat_icmpv6_reply_translation(skb, ct,
+ ctinfo,
+ hooknum,
+ hdrlen))
+ err = NF_DROP;
+ goto out;
+ }
+ }
+ /* Non-ICMP, fall thru to initialize if needed. */
+ /* fall through */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ /* Initialize according to the NAT action. */
+ err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+ /* Action is set up to establish a new
+ * mapping.
+ */
+ ? nf_nat_setup_info(ct, range, maniptype)
+ : nf_nat_alloc_null_binding(ct, hooknum);
+ if (err != NF_ACCEPT)
+ goto out;
+ }
+ break;
+
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ break;
+
+ default:
+ err = NF_DROP;
+ goto out;
+ }
+
+ err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+out:
+ return err;
+}
+#endif /* CONFIG_NF_NAT */
+
+static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ u32 new_mark;
+
+ if (!mask)
+ return;
+
+ new_mark = mark | (ct->mark & ~(mask));
+ if (ct->mark != new_mark) {
+ ct->mark = new_mark;
+ if (nf_ct_is_confirmed(ct))
+ nf_conntrack_event_cache(IPCT_MARK, ct);
+ }
+#endif
+}
+
+static void tcf_ct_act_set_labels(struct nf_conn *ct,
+ u32 *labels,
+ u32 *labels_m)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
+ size_t labels_sz = FIELD_SIZEOF(struct tcf_ct_params, labels);
+
+ if (!memchr_inv(labels_m, 0, labels_sz))
+ return;
+
+ nf_connlabels_replace(ct, labels, labels_m, 4);
+#endif
+}
+
+static int tcf_ct_act_nat(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ int ct_action,
+ struct nf_nat_range2 *range,
+ bool commit)
+{
+#if IS_ENABLED(CONFIG_NF_NAT)
+ enum nf_nat_manip_type maniptype;
+
+ if (!(ct_action & TCA_CT_ACT_NAT))
+ return NF_ACCEPT;
+
+ /* Add NAT extension if not confirmed yet. */
+ if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+ return NF_DROP; /* Can't NAT. */
+
+ if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) &&
+ (ctinfo != IP_CT_RELATED || commit)) {
+ /* NAT an established or related connection like before. */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+ /* This is the REPLY direction for a connection
+ * for which NAT was applied in the forward
+ * direction. Do the reverse NAT.
+ */
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+ else
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+ } else if (ct_action & TCA_CT_ACT_NAT_SRC) {
+ maniptype = NF_NAT_MANIP_SRC;
+ } else if (ct_action & TCA_CT_ACT_NAT_DST) {
+ maniptype = NF_NAT_MANIP_DST;
+ } else {
+ return NF_ACCEPT;
+ }
+
+ return ct_nat_execute(skb, ct, ctinfo, range, maniptype);
+#else
+ return NF_ACCEPT;
+#endif
+}
+
+static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct net *net = dev_net(skb->dev);
+ bool cached, commit, clear, force;
+ enum ip_conntrack_info ctinfo;
+ struct tcf_ct *c = to_ct(a);
+ struct nf_conn *tmpl = NULL;
+ struct nf_hook_state state;
+ int nh_ofs, err, retval;
+ struct tcf_ct_params *p;
+ struct nf_conn *ct;
+ u8 family;
+
+ p = rcu_dereference_bh(c->params);
+
+ retval = READ_ONCE(c->tcf_action);
+ commit = p->ct_action & TCA_CT_ACT_COMMIT;
+ clear = p->ct_action & TCA_CT_ACT_CLEAR;
+ force = p->ct_action & TCA_CT_ACT_FORCE;
+ tmpl = p->tmpl;
+
+ if (clear) {
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ nf_conntrack_put(&ct->ct_general);
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+ }
+
+ goto out;
+ }
+
+ family = tcf_ct_skb_nf_family(skb);
+ if (family == NFPROTO_UNSPEC)
+ goto drop;
+
+ /* The conntrack module expects to be working at L3.
+ * We also try to pull the IPv4/6 header to linear area
+ */
+ nh_ofs = skb_network_offset(skb);
+ skb_pull_rcsum(skb, nh_ofs);
+ err = tcf_ct_handle_fragments(net, skb, family, p->zone);
+ if (err == -EINPROGRESS) {
+ retval = TC_ACT_STOLEN;
+ goto out;
+ }
+ if (err)
+ goto drop;
+
+ err = tcf_ct_skb_network_trim(skb, family);
+ if (err)
+ goto drop;
+
+ /* If we are recirculating packets to match on ct fields and
+ * committing with a separate ct action, then we don't need to
+ * actually run the packet through conntrack twice unless it's for a
+ * different zone.
+ */
+ cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
+ if (!cached) {
+ /* Associate skb with specified zone. */
+ if (tmpl) {
+ ct = nf_ct_get(skb, &ctinfo);
+ if (skb_nfct(skb))
+ nf_conntrack_put(skb_nfct(skb));
+ nf_conntrack_get(&tmpl->ct_general);
+ nf_ct_set(skb, tmpl, IP_CT_NEW);
+ }
+
+ state.hook = NF_INET_PRE_ROUTING;
+ state.net = net;
+ state.pf = family;
+ err = nf_conntrack_in(skb, &state);
+ if (err != NF_ACCEPT)
+ goto out_push;
+ }
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ goto out_push;
+ nf_ct_deliver_cached_events(ct);
+
+ err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit);
+ if (err != NF_ACCEPT)
+ goto drop;
+
+ if (commit) {
+ tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
+ tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);
+
+ /* This will take care of sending queued events
+ * even if the connection is already confirmed.
+ */
+ nf_conntrack_confirm(skb);
+ }
+
+out_push:
+ skb_push_rcsum(skb, nh_ofs);
+
+out:
+ bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
+ return retval;
+
+drop:
+ qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
+ return TC_ACT_SHOT;
+}
+
+static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
+ [TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 },
+ [TCA_CT_ACTION] = { .type = NLA_U16 },
+ [TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) },
+ [TCA_CT_ZONE] = { .type = NLA_U16 },
+ [TCA_CT_MARK] = { .type = NLA_U32 },
+ [TCA_CT_MARK_MASK] = { .type = NLA_U32 },
+ [TCA_CT_LABELS] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_CT_LABELS_MASK] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
+ [TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
+ [TCA_CT_NAT_IPV6_MIN] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct in6_addr) },
+ [TCA_CT_NAT_IPV6_MAX] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct in6_addr) },
+ [TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
+ [TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
+};
+
+static int tcf_ct_fill_params_nat(struct tcf_ct_params *p,
+ struct tc_ct *parm,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct nf_nat_range2 *range;
+
+ if (!(p->ct_action & TCA_CT_ACT_NAT))
+ return 0;
+
+ if (!IS_ENABLED(CONFIG_NF_NAT)) {
+ NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel");
+ return -EOPNOTSUPP;
+ }
+
+ if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
+ return 0;
+
+ if ((p->ct_action & TCA_CT_ACT_NAT_SRC) &&
+ (p->ct_action & TCA_CT_ACT_NAT_DST)) {
+ NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time");
+ return -EOPNOTSUPP;
+ }
+
+ range = &p->range;
+ if (tb[TCA_CT_NAT_IPV4_MIN]) {
+ struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX];
+
+ p->ipv4_range = true;
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ range->min_addr.ip =
+ nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]);
+
+ range->max_addr.ip = max_attr ?
+ nla_get_in_addr(max_attr) :
+ range->min_addr.ip;
+ } else if (tb[TCA_CT_NAT_IPV6_MIN]) {
+ struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX];
+
+ p->ipv4_range = false;
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ range->min_addr.in6 =
+ nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]);
+
+ range->max_addr.in6 = max_attr ?
+ nla_get_in6_addr(max_attr) :
+ range->min_addr.in6;
+ }
+
+ if (tb[TCA_CT_NAT_PORT_MIN]) {
+ range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]);
+
+ range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ?
+ nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) :
+ range->min_proto.all;
+ }
+
+ return 0;
+}
+
+static void tcf_ct_set_key_val(struct nlattr **tb,
+ void *val, int val_type,
+ void *mask, int mask_type,
+ int len)
+{
+ if (!tb[val_type])
+ return;
+ nla_memcpy(val, tb[val_type], len);
+
+ if (!mask)
+ return;
+
+ if (mask_type == TCA_CT_UNSPEC || !tb[mask_type])
+ memset(mask, 0xff, len);
+ else
+ nla_memcpy(mask, tb[mask_type], len);
+}
+
+static int tcf_ct_fill_params(struct net *net,
+ struct tcf_ct_params *p,
+ struct tc_ct *parm,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
+ struct nf_conntrack_zone zone;
+ struct nf_conn *tmpl;
+ int err;
+
+ p->zone = NF_CT_DEFAULT_ZONE_ID;
+
+ tcf_ct_set_key_val(tb,
+ &p->ct_action, TCA_CT_ACTION,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->ct_action));
+
+ if (p->ct_action & TCA_CT_ACT_CLEAR)
+ return 0;
+
+ err = tcf_ct_fill_params_nat(p, parm, tb, extack);
+ if (err)
+ return err;
+
+ if (tb[TCA_CT_MARK]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
+ NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled.");
+ return -EOPNOTSUPP;
+ }
+ tcf_ct_set_key_val(tb,
+ &p->mark, TCA_CT_MARK,
+ &p->mark_mask, TCA_CT_MARK_MASK,
+ sizeof(p->mark));
+ }
+
+ if (tb[TCA_CT_LABELS]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
+ NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled.");
+ return -EOPNOTSUPP;
+ }
+
+ if (!tn->labels) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length");
+ return -EOPNOTSUPP;
+ }
+ tcf_ct_set_key_val(tb,
+ p->labels, TCA_CT_LABELS,
+ p->labels_mask, TCA_CT_LABELS_MASK,
+ sizeof(p->labels));
+ }
+
+ if (tb[TCA_CT_ZONE]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
+ NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled.");
+ return -EOPNOTSUPP;
+ }
+
+ tcf_ct_set_key_val(tb,
+ &p->zone, TCA_CT_ZONE,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->zone));
+ }
+
+ if (p->zone == NF_CT_DEFAULT_ZONE_ID)
+ return 0;
+
+ nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0);
+ tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
+ if (!tmpl) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
+ return -ENOMEM;
+ }
+ __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
+ nf_conntrack_get(&tmpl->ct_general);
+ p->tmpl = tmpl;
+
+ return 0;
+}
+
+static int tcf_ct_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int replace, int bind, bool rtnl_held,
+ struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ct_net_id);
+ struct tcf_ct_params *params = NULL;
+ struct nlattr *tb[TCA_CT_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
+ struct tc_ct *parm;
+ struct tcf_ct *c;
+ int err, res = 0;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_CT_PARMS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters");
+ return -EINVAL;
+ }
+ parm = nla_data(tb[TCA_CT_PARMS]);
+
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+
+ if (!err) {
+ err = tcf_idr_create(tn, parm->index, est, a,
+ &act_ct_ops, bind, true);
+ if (err) {
+ tcf_idr_cleanup(tn, parm->index);
+ return err;
+ }
+ res = ACT_P_CREATED;
+ } else {
+ if (bind)
+ return 0;
+
+ if (!replace) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ }
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto cleanup;
+
+ c = to_ct(*a);
+
+ params = kzalloc(sizeof(*params), GFP_KERNEL);
+ if (unlikely(!params)) {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+
+ err = tcf_ct_fill_params(net, params, parm, tb, extack);
+ if (err)
+ goto cleanup;
+
+ spin_lock_bh(&c->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ rcu_swap_protected(c->params, params, lockdep_is_held(&c->tcf_lock));
+ spin_unlock_bh(&c->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (params)
+ kfree_rcu(params, rcu);
+ if (res == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+
+ return res;
+
+cleanup:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ kfree(params);
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static void tcf_ct_cleanup(struct tc_action *a)
+{
+ struct tcf_ct_params *params;
+ struct tcf_ct *c = to_ct(a);
+
+ params = rcu_dereference_protected(c->params, 1);
+ if (params)
+ call_rcu(&params->rcu, tcf_ct_params_free);
+}
+
+static int tcf_ct_dump_key_val(struct sk_buff *skb,
+ void *val, int val_type,
+ void *mask, int mask_type,
+ int len)
+{
+ int err;
+
+ if (mask && !memchr_inv(mask, 0, len))
+ return 0;
+
+ err = nla_put(skb, val_type, len, val);
+ if (err)
+ return err;
+
+ if (mask_type != TCA_CT_UNSPEC) {
+ err = nla_put(skb, mask_type, len, mask);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
+{
+ struct nf_nat_range2 *range = &p->range;
+
+ if (!(p->ct_action & TCA_CT_ACT_NAT))
+ return 0;
+
+ if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
+ return 0;
+
+ if (range->flags & NF_NAT_RANGE_MAP_IPS) {
+ if (p->ipv4_range) {
+ if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN,
+ range->min_addr.ip))
+ return -1;
+ if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX,
+ range->max_addr.ip))
+ return -1;
+ } else {
+ if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN,
+ &range->min_addr.in6))
+ return -1;
+ if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX,
+ &range->max_addr.in6))
+ return -1;
+ }
+ }
+
+ if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
+ if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN,
+ range->min_proto.all))
+ return -1;
+ if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX,
+ range->max_proto.all))
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_ct *c = to_ct(a);
+ struct tcf_ct_params *p;
+
+ struct tc_ct opt = {
+ .index = c->tcf_index,
+ .refcnt = refcount_read(&c->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&c->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&c->tcf_lock);
+ p = rcu_dereference_protected(c->params,
+ lockdep_is_held(&c->tcf_lock));
+ opt.action = c->tcf_action;
+
+ if (tcf_ct_dump_key_val(skb,
+ &p->ct_action, TCA_CT_ACTION,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->ct_action)))
+ goto nla_put_failure;
+
+ if (p->ct_action & TCA_CT_ACT_CLEAR)
+ goto skip_dump;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+ tcf_ct_dump_key_val(skb,
+ &p->mark, TCA_CT_MARK,
+ &p->mark_mask, TCA_CT_MARK_MASK,
+ sizeof(p->mark)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+ tcf_ct_dump_key_val(skb,
+ p->labels, TCA_CT_LABELS,
+ p->labels_mask, TCA_CT_LABELS_MASK,
+ sizeof(p->labels)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+ tcf_ct_dump_key_val(skb,
+ &p->zone, TCA_CT_ZONE,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->zone)))
+ goto nla_put_failure;
+
+ if (tcf_ct_dump_nat(skb, p))
+ goto nla_put_failure;
+
+skip_dump:
+ if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &c->tcf_tm);
+ if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&c->tcf_lock);
+
+ return skb->len;
+nla_put_failure:
+ spin_unlock_bh(&c->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_ct_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ct_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_ct_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, ct_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse, bool hw)
+{
+ struct tcf_ct *c = to_ct(a);
+
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+
+ if (hw)
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+ bytes, packets);
+ c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
+}
+
+static struct tc_action_ops act_ct_ops = {
+ .kind = "ct",
+ .id = TCA_ID_CT,
+ .owner = THIS_MODULE,
+ .act = tcf_ct_act,
+ .dump = tcf_ct_dump,
+ .init = tcf_ct_init,
+ .cleanup = tcf_ct_cleanup,
+ .walk = tcf_ct_walker,
+ .lookup = tcf_ct_search,
+ .stats_update = tcf_stats_update,
+ .size = sizeof(struct tcf_ct),
+};
+
+static __net_init int ct_init_net(struct net *net)
+{
+ unsigned int n_bits = FIELD_SIZEOF(struct tcf_ct_params, labels) * 8;
+ struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
+
+ if (nf_connlabels_get(net, n_bits - 1)) {
+ tn->labels = false;
+ pr_err("act_ct: Failed to set connlabels length");
+ } else {
+ tn->labels = true;
+ }
+
+ return tc_action_net_init(&tn->tn, &act_ct_ops);
+}
+
+static void __net_exit ct_exit_net(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
+
+ if (tn->labels)
+ nf_connlabels_put(net);
+ }
+ rtnl_unlock();
+
+ tc_action_net_exit(net_list, ct_net_id);
+}
+
+static struct pernet_operations ct_net_ops = {
+ .init = ct_init_net,
+ .exit_batch = ct_exit_net,
+ .id = &ct_net_id,
+ .size = sizeof(struct tc_ct_action_net),
+};
+
+static int __init ct_init_module(void)
+{
+ return tcf_register_action(&act_ct_ops, &ct_net_ops);
+}
+
+static void __exit ct_cleanup_module(void)
+{
+ tcf_unregister_action(&act_ct_ops, &ct_net_ops);
+}
+
+module_init(ct_init_module);
+module_exit(ct_cleanup_module);
+MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>");
+MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>");
+MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>");
+MODULE_DESCRIPTION("Connection tracking action");
+MODULE_LICENSE("GPL v2");
+
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
new file mode 100644
index 000000000000..10eb2bb99861
--- /dev/null
+++ b/net/sched/act_ctinfo.c
@@ -0,0 +1,407 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* net/sched/act_ctinfo.c netfilter ctinfo connmark actions
+ *
+ * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <uapi/linux/tc_act/tc_ctinfo.h>
+#include <net/tc_act/tc_ctinfo.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static struct tc_action_ops act_ctinfo_ops;
+static unsigned int ctinfo_net_id;
+
+static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
+ struct tcf_ctinfo_params *cp,
+ struct sk_buff *skb, int wlen, int proto)
+{
+ u8 dscp, newdscp;
+
+ newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) &
+ ~INET_ECN_MASK;
+
+ switch (proto) {
+ case NFPROTO_IPV4:
+ dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK;
+ if (dscp != newdscp) {
+ if (likely(!skb_try_make_writable(skb, wlen))) {
+ ipv4_change_dsfield(ip_hdr(skb),
+ INET_ECN_MASK,
+ newdscp);
+ ca->stats_dscp_set++;
+ } else {
+ ca->stats_dscp_error++;
+ }
+ }
+ break;
+ case NFPROTO_IPV6:
+ dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK;
+ if (dscp != newdscp) {
+ if (likely(!skb_try_make_writable(skb, wlen))) {
+ ipv6_change_dsfield(ipv6_hdr(skb),
+ INET_ECN_MASK,
+ newdscp);
+ ca->stats_dscp_set++;
+ } else {
+ ca->stats_dscp_error++;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
+ struct tcf_ctinfo_params *cp,
+ struct sk_buff *skb)
+{
+ ca->stats_cpmark_set++;
+ skb->mark = ct->mark & cp->cpmarkmask;
+}
+
+static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ const struct nf_conntrack_tuple_hash *thash = NULL;
+ struct tcf_ctinfo *ca = to_ctinfo(a);
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_zone zone;
+ enum ip_conntrack_info ctinfo;
+ struct tcf_ctinfo_params *cp;
+ struct nf_conn *ct;
+ int proto, wlen;
+ int action;
+
+ cp = rcu_dereference_bh(ca->params);
+
+ tcf_lastuse_update(&ca->tcf_tm);
+ bstats_update(&ca->tcf_bstats, skb);
+ action = READ_ONCE(ca->tcf_action);
+
+ wlen = skb_network_offset(skb);
+ if (tc_skb_protocol(skb) == htons(ETH_P_IP)) {
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto out;
+
+ proto = NFPROTO_IPV4;
+ } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) {
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto out;
+
+ proto = NFPROTO_IPV6;
+ } else {
+ goto out;
+ }
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct) { /* look harder, usually ingress */
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ proto, cp->net, &tuple))
+ goto out;
+ zone.id = cp->zone;
+ zone.dir = NF_CT_DEFAULT_ZONE_DIR;
+
+ thash = nf_conntrack_find_get(cp->net, &zone, &tuple);
+ if (!thash)
+ goto out;
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+ }
+
+ if (cp->mode & CTINFO_MODE_DSCP)
+ if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask))
+ tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto);
+
+ if (cp->mode & CTINFO_MODE_CPMARK)
+ tcf_ctinfo_cpmark_set(ct, ca, cp, skb);
+
+ if (thash)
+ nf_ct_put(ct);
+out:
+ return action;
+}
+
+static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
+ [TCA_CTINFO_ACT] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct
+ tc_ctinfo) },
+ [TCA_CTINFO_ZONE] = { .type = NLA_U16 },
+ [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 },
+ [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 },
+ [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 },
+};
+
+static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+ struct nlattr *tb[TCA_CTINFO_MAX + 1];
+ struct tcf_ctinfo_params *cp_new;
+ struct tcf_chain *goto_ch = NULL;
+ u32 dscpmask = 0, dscpstatemask;
+ struct tc_ctinfo *actparm;
+ struct tcf_ctinfo *ci;
+ u8 dscpmaskshift;
+ int ret = 0, err;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "ctinfo requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_CTINFO_ACT]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing required TCA_CTINFO_ACT attribute");
+ return -EINVAL;
+ }
+ actparm = nla_data(tb[TCA_CTINFO_ACT]);
+
+ /* do some basic validation here before dynamically allocating things */
+ /* that we would otherwise have to clean up. */
+ if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) {
+ dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]);
+ /* need contiguous 6 bit mask */
+ dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0;
+ if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_CTINFO_PARMS_DSCP_MASK],
+ "dscp mask must be 6 contiguous bits");
+ return -EINVAL;
+ }
+ dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ?
+ nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0;
+ /* mask & statemask must not overlap */
+ if (dscpmask & dscpstatemask) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_CTINFO_PARMS_DSCP_STATEMASK],
+ "dscp statemask must not overlap dscp mask");
+ return -EINVAL;
+ }
+ }
+
+ /* done the validation:now to the actual action allocation */
+ err = tcf_idr_check_alloc(tn, &actparm->index, a, bind);
+ if (!err) {
+ ret = tcf_idr_create(tn, actparm->index, est, a,
+ &act_ctinfo_ops, bind, false);
+ if (ret) {
+ tcf_idr_cleanup(tn, actparm->index);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else if (err > 0) {
+ if (bind) /* don't override defaults */
+ return 0;
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ } else {
+ return err;
+ }
+
+ err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ ci = to_ctinfo(*a);
+
+ cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL);
+ if (unlikely(!cp_new)) {
+ err = -ENOMEM;
+ goto put_chain;
+ }
+
+ cp_new->net = net;
+ cp_new->zone = tb[TCA_CTINFO_ZONE] ?
+ nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0;
+ if (dscpmask) {
+ cp_new->dscpmask = dscpmask;
+ cp_new->dscpmaskshift = dscpmaskshift;
+ cp_new->dscpstatemask = dscpstatemask;
+ cp_new->mode |= CTINFO_MODE_DSCP;
+ }
+
+ if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) {
+ cp_new->cpmarkmask =
+ nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]);
+ cp_new->mode |= CTINFO_MODE_CPMARK;
+ }
+
+ spin_lock_bh(&ci->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
+ rcu_swap_protected(ci->params, cp_new,
+ lockdep_is_held(&ci->tcf_lock));
+ spin_unlock_bh(&ci->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (cp_new)
+ kfree_rcu(cp_new, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+
+ return ret;
+
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ struct tcf_ctinfo *ci = to_ctinfo(a);
+ struct tc_ctinfo opt = {
+ .index = ci->tcf_index,
+ .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
+ };
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_ctinfo_params *cp;
+ struct tcf_t t;
+
+ spin_lock_bh(&ci->tcf_lock);
+ cp = rcu_dereference_protected(ci->params,
+ lockdep_is_held(&ci->tcf_lock));
+
+ tcf_tm_dump(&t, &ci->tcf_tm);
+ if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ opt.action = ci->tcf_action;
+ if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone))
+ goto nla_put_failure;
+
+ if (cp->mode & CTINFO_MODE_DSCP) {
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK,
+ cp->dscpmask))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK,
+ cp->dscpstatemask))
+ goto nla_put_failure;
+ }
+
+ if (cp->mode & CTINFO_MODE_CPMARK) {
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK,
+ cp->cpmarkmask))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET,
+ ci->stats_dscp_set, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR,
+ ci->stats_dscp_error, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET,
+ ci->stats_cpmark_set, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ spin_unlock_bh(&ci->tcf_lock);
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&ci->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_ctinfo_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ctinfo_ops = {
+ .kind = "ctinfo",
+ .id = TCA_ID_CTINFO,
+ .owner = THIS_MODULE,
+ .act = tcf_ctinfo_act,
+ .dump = tcf_ctinfo_dump,
+ .init = tcf_ctinfo_init,
+ .walk = tcf_ctinfo_walker,
+ .lookup = tcf_ctinfo_search,
+ .size = sizeof(struct tcf_ctinfo),
+};
+
+static __net_init int ctinfo_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+
+ return tc_action_net_init(tn, &act_ctinfo_ops);
+}
+
+static void __net_exit ctinfo_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, ctinfo_net_id);
+}
+
+static struct pernet_operations ctinfo_net_ops = {
+ .init = ctinfo_init_net,
+ .exit_batch = ctinfo_exit_net,
+ .id = &ctinfo_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init ctinfo_init_module(void)
+{
+ return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops);
+}
+
+static void __exit ctinfo_cleanup_module(void)
+{
+ tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops);
+}
+
+module_init(ctinfo_init_module);
+module_exit(ctinfo_cleanup_module);
+MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>");
+MODULE_DESCRIPTION("Connection tracking mark actions");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 58e7573dded4..055faa298c8e 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -27,6 +27,9 @@
static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
+#define MIRRED_RECURSION_LIMIT 4
+static DEFINE_PER_CPU(unsigned int, mirred_rec_level);
+
static bool tcf_mirred_is_act_redirect(int action)
{
return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
@@ -210,6 +213,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
struct sk_buff *skb2 = skb;
bool m_mac_header_xmit;
struct net_device *dev;
+ unsigned int rec_level;
int retval, err = 0;
bool use_reinsert;
bool want_ingress;
@@ -217,6 +221,14 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
int m_eaction;
int mac_len;
+ rec_level = __this_cpu_inc_return(mirred_rec_level);
+ if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) {
+ net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
+ netdev_name(skb->dev));
+ __this_cpu_dec(mirred_rec_level);
+ return TC_ACT_SHOT;
+ }
+
tcf_lastuse_update(&m->tcf_tm);
bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
@@ -277,7 +289,9 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
if (use_reinsert) {
res->ingress = want_ingress;
res->qstats = this_cpu_ptr(m->common.cpu_qstats);
- return TC_ACT_REINSERT;
+ skb_tc_reinsert(skb, res);
+ __this_cpu_dec(mirred_rec_level);
+ return TC_ACT_CONSUMED;
}
}
@@ -292,6 +306,7 @@ out:
if (tcf_mirred_is_act_redirect(m_eaction))
retval = TC_ACT_SHOT;
}
+ __this_cpu_dec(mirred_rec_level);
return retval;
}
@@ -411,6 +426,11 @@ static void tcf_mirred_put_dev(struct net_device *dev)
dev_put(dev);
}
+static size_t tcf_mirred_get_fill_size(const struct tc_action *act)
+{
+ return nla_total_size(sizeof(struct tc_mirred));
+}
+
static struct tc_action_ops act_mirred_ops = {
.kind = "mirred",
.id = TCA_ID_MIRRED,
@@ -422,6 +442,7 @@ static struct tc_action_ops act_mirred_ops = {
.init = tcf_mirred_init,
.walk = tcf_mirred_walker,
.lookup = tcf_mirred_search,
+ .get_fill_size = tcf_mirred_get_fill_size,
.size = sizeof(struct tcf_mirred),
.get_dev = tcf_mirred_get_dev,
.put_dev = tcf_mirred_put_dev,
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
new file mode 100644
index 000000000000..ca2597ce4ac9
--- /dev/null
+++ b/net/sched/act_mpls.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mpls.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/tc_act/tc_mpls.h>
+#include <net/mpls.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_mpls.h>
+
+static unsigned int mpls_net_id;
+static struct tc_action_ops act_mpls_ops;
+
+#define ACT_MPLS_TTL_DEFAULT 255
+
+static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse,
+ struct tcf_mpls_params *p, bool set_bos)
+{
+ u32 new_lse = 0;
+
+ if (lse)
+ new_lse = be32_to_cpu(lse->label_stack_entry);
+
+ if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET) {
+ new_lse &= ~MPLS_LS_LABEL_MASK;
+ new_lse |= p->tcfm_label << MPLS_LS_LABEL_SHIFT;
+ }
+ if (p->tcfm_ttl) {
+ new_lse &= ~MPLS_LS_TTL_MASK;
+ new_lse |= p->tcfm_ttl << MPLS_LS_TTL_SHIFT;
+ }
+ if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET) {
+ new_lse &= ~MPLS_LS_TC_MASK;
+ new_lse |= p->tcfm_tc << MPLS_LS_TC_SHIFT;
+ }
+ if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET) {
+ new_lse &= ~MPLS_LS_S_MASK;
+ new_lse |= p->tcfm_bos << MPLS_LS_S_SHIFT;
+ } else if (set_bos) {
+ new_lse |= 1 << MPLS_LS_S_SHIFT;
+ }
+
+ return cpu_to_be32(new_lse);
+}
+
+static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_mpls *m = to_mpls(a);
+ struct tcf_mpls_params *p;
+ __be32 new_lse;
+ int ret;
+
+ tcf_lastuse_update(&m->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
+
+ /* Ensure 'data' points at mac_header prior calling mpls manipulating
+ * functions.
+ */
+ if (skb_at_tc_ingress(skb))
+ skb_push_rcsum(skb, skb->mac_len);
+
+ ret = READ_ONCE(m->tcf_action);
+
+ p = rcu_dereference_bh(m->mpls_p);
+
+ switch (p->tcfm_action) {
+ case TCA_MPLS_ACT_POP:
+ if (skb_mpls_pop(skb, p->tcfm_proto))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_PUSH:
+ new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol));
+ if (skb_mpls_push(skb, new_lse, p->tcfm_proto))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false);
+ if (skb_mpls_update_lse(skb, new_lse))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_DEC_TTL:
+ if (skb_mpls_dec_ttl(skb))
+ goto drop;
+ break;
+ }
+
+ if (skb_at_tc_ingress(skb))
+ skb_pull_rcsum(skb, skb->mac_len);
+
+ return ret;
+
+drop:
+ qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats));
+ return TC_ACT_SHOT;
+}
+
+static int valid_label(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u32 *label = nla_data(attr);
+
+ if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) {
+ NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
+ [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 },
+ [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)),
+ [TCA_MPLS_PROTO] = { .type = NLA_U16 },
+ [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label),
+ [TCA_MPLS_TC] = NLA_POLICY_RANGE(NLA_U8, 0, 7),
+ [TCA_MPLS_TTL] = NLA_POLICY_MIN(NLA_U8, 1),
+ [TCA_MPLS_BOS] = NLA_POLICY_RANGE(NLA_U8, 0, 1),
+};
+
+static int tcf_mpls_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, mpls_net_id);
+ struct nlattr *tb[TCA_MPLS_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
+ struct tcf_mpls_params *p;
+ struct tc_mpls *parm;
+ bool exists = false;
+ struct tcf_mpls *m;
+ int ret = 0, err;
+ u8 mpls_ttl = 0;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing netlink attributes");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_MPLS_MAX, nla, mpls_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_MPLS_PARMS]) {
+ NL_SET_ERR_MSG_MOD(extack, "No MPLS params");
+ return -EINVAL;
+ }
+ parm = nla_data(tb[TCA_MPLS_PARMS]);
+
+ /* Verify parameters against action type. */
+ switch (parm->m_action) {
+ case TCA_MPLS_ACT_POP:
+ if (!tb[TCA_MPLS_PROTO]) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol must be set for MPLS pop");
+ return -EINVAL;
+ }
+ if (!eth_proto_is_802_3(nla_get_be16(tb[TCA_MPLS_PROTO]))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid protocol type for MPLS pop");
+ return -EINVAL;
+ }
+ if (tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] ||
+ tb[TCA_MPLS_BOS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC or BOS cannot be used with MPLS pop");
+ return -EINVAL;
+ }
+ break;
+ case TCA_MPLS_ACT_DEC_TTL:
+ if (tb[TCA_MPLS_PROTO] || tb[TCA_MPLS_LABEL] ||
+ tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC, BOS or protocol cannot be used with MPLS dec_ttl");
+ return -EINVAL;
+ }
+ break;
+ case TCA_MPLS_ACT_PUSH:
+ if (!tb[TCA_MPLS_LABEL]) {
+ NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push");
+ return -EINVAL;
+ }
+ if (tb[TCA_MPLS_PROTO] &&
+ !eth_p_mpls(nla_get_be16(tb[TCA_MPLS_PROTO]))) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol must be an MPLS type for MPLS push");
+ return -EPROTONOSUPPORT;
+ }
+ /* Push needs a TTL - if not specified, set a default value. */
+ if (!tb[TCA_MPLS_TTL]) {
+#if IS_ENABLED(CONFIG_MPLS)
+ mpls_ttl = net->mpls.default_ttl ?
+ net->mpls.default_ttl : ACT_MPLS_TTL_DEFAULT;
+#else
+ mpls_ttl = ACT_MPLS_TTL_DEFAULT;
+#endif
+ }
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ if (tb[TCA_MPLS_PROTO]) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be used with MPLS modify");
+ return -EINVAL;
+ }
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unknown MPLS action");
+ return -EINVAL;
+ }
+
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return 0;
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, parm->index, est, a,
+ &act_mpls_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
+ return ret;
+ }
+
+ ret = ACT_P_CREATED;
+ } else if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ m = to_mpls(*a);
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p) {
+ err = -ENOMEM;
+ goto put_chain;
+ }
+
+ p->tcfm_action = parm->m_action;
+ p->tcfm_label = tb[TCA_MPLS_LABEL] ? nla_get_u32(tb[TCA_MPLS_LABEL]) :
+ ACT_MPLS_LABEL_NOT_SET;
+ p->tcfm_tc = tb[TCA_MPLS_TC] ? nla_get_u8(tb[TCA_MPLS_TC]) :
+ ACT_MPLS_TC_NOT_SET;
+ p->tcfm_ttl = tb[TCA_MPLS_TTL] ? nla_get_u8(tb[TCA_MPLS_TTL]) :
+ mpls_ttl;
+ p->tcfm_bos = tb[TCA_MPLS_BOS] ? nla_get_u8(tb[TCA_MPLS_BOS]) :
+ ACT_MPLS_BOS_NOT_SET;
+ p->tcfm_proto = tb[TCA_MPLS_PROTO] ? nla_get_be16(tb[TCA_MPLS_PROTO]) :
+ htons(ETH_P_MPLS_UC);
+
+ spin_lock_bh(&m->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
+ spin_unlock_bh(&m->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (p)
+ kfree_rcu(p, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+ return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static void tcf_mpls_cleanup(struct tc_action *a)
+{
+ struct tcf_mpls *m = to_mpls(a);
+ struct tcf_mpls_params *p;
+
+ p = rcu_dereference_protected(m->mpls_p, 1);
+ if (p)
+ kfree_rcu(p, rcu);
+}
+
+static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_mpls *m = to_mpls(a);
+ struct tcf_mpls_params *p;
+ struct tc_mpls opt = {
+ .index = m->tcf_index,
+ .refcnt = refcount_read(&m->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&m->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&m->tcf_lock);
+ opt.action = m->tcf_action;
+ p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock));
+ opt.m_action = p->tcfm_action;
+
+ if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET &&
+ nla_put_u32(skb, TCA_MPLS_LABEL, p->tcfm_label))
+ goto nla_put_failure;
+
+ if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET &&
+ nla_put_u8(skb, TCA_MPLS_TC, p->tcfm_tc))
+ goto nla_put_failure;
+
+ if (p->tcfm_ttl && nla_put_u8(skb, TCA_MPLS_TTL, p->tcfm_ttl))
+ goto nla_put_failure;
+
+ if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET &&
+ nla_put_u8(skb, TCA_MPLS_BOS, p->tcfm_bos))
+ goto nla_put_failure;
+
+ if (nla_put_be16(skb, TCA_MPLS_PROTO, p->tcfm_proto))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &m->tcf_tm);
+
+ if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD))
+ goto nla_put_failure;
+
+ spin_unlock_bh(&m->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&m->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -EMSGSIZE;
+}
+
+static int tcf_mpls_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_mpls_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_mpls_ops = {
+ .kind = "mpls",
+ .id = TCA_ID_MPLS,
+ .owner = THIS_MODULE,
+ .act = tcf_mpls_act,
+ .dump = tcf_mpls_dump,
+ .init = tcf_mpls_init,
+ .cleanup = tcf_mpls_cleanup,
+ .walk = tcf_mpls_walker,
+ .lookup = tcf_mpls_search,
+ .size = sizeof(struct tcf_mpls),
+};
+
+static __net_init int mpls_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+ return tc_action_net_init(tn, &act_mpls_ops);
+}
+
+static void __net_exit mpls_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, mpls_net_id);
+}
+
+static struct pernet_operations mpls_net_ops = {
+ .init = mpls_init_net,
+ .exit_batch = mpls_exit_net,
+ .id = &mpls_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init mpls_init_module(void)
+{
+ return tcf_register_action(&act_mpls_ops, &mpls_net_ops);
+}
+
+static void __exit mpls_cleanup_module(void)
+{
+ tcf_unregister_action(&act_mpls_ops, &mpls_net_ops);
+}
+
+module_init(mpls_init_module);
+module_exit(mpls_cleanup_module);
+
+MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPLS manipulation actions");
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ad36bbcc583e..638c1bc1ea1b 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -35,6 +35,7 @@
#include <net/tc_act/tc_police.h>
#include <net/tc_act/tc_sample.h>
#include <net/tc_act/tc_skbedit.h>
+#include <net/tc_act/tc_ct.h>
extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
@@ -672,21 +673,27 @@ static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb)
kfree(indr_block_cb);
}
+static int tcf_block_setup(struct tcf_block *block,
+ struct flow_block_offload *bo);
+
static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
struct tc_indr_block_cb *indr_block_cb,
- enum tc_block_command command)
+ enum flow_block_command command)
{
- struct tc_block_offload bo = {
+ struct flow_block_offload bo = {
.command = command,
- .binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
- .block = indr_dev->block,
+ .binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
+ .net = dev_net(indr_dev->dev),
+ .block_shared = tcf_block_shared(indr_dev->block),
};
+ INIT_LIST_HEAD(&bo.cb_list);
if (!indr_dev->block)
return;
indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
&bo);
+ tcf_block_setup(indr_dev->block, &bo);
}
int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
@@ -705,7 +712,7 @@ int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
if (err)
goto err_dev_put;
- tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_BIND);
+ tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_BIND);
return 0;
err_dev_put:
@@ -742,7 +749,7 @@ void __tc_indr_block_cb_unregister(struct net_device *dev,
return;
/* Send unbind message if required to free any block cbs. */
- tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_UNBIND);
+ tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_UNBIND);
tc_indr_block_cb_del(indr_block_cb);
tc_indr_block_dev_put(indr_dev);
}
@@ -759,27 +766,31 @@ EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister);
static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
struct tcf_block_ext_info *ei,
- enum tc_block_command command,
+ enum flow_block_command command,
struct netlink_ext_ack *extack)
{
struct tc_indr_block_cb *indr_block_cb;
struct tc_indr_block_dev *indr_dev;
- struct tc_block_offload bo = {
+ struct flow_block_offload bo = {
.command = command,
.binder_type = ei->binder_type,
- .block = block,
+ .net = dev_net(dev),
+ .block_shared = tcf_block_shared(block),
.extack = extack,
};
+ INIT_LIST_HEAD(&bo.cb_list);
indr_dev = tc_indr_block_dev_lookup(dev);
if (!indr_dev)
return;
- indr_dev->block = command == TC_BLOCK_BIND ? block : NULL;
+ indr_dev->block = command == FLOW_BLOCK_BIND ? block : NULL;
list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
&bo);
+
+ tcf_block_setup(block, &bo);
}
static bool tcf_block_offload_in_use(struct tcf_block *block)
@@ -790,16 +801,24 @@ static bool tcf_block_offload_in_use(struct tcf_block *block)
static int tcf_block_offload_cmd(struct tcf_block *block,
struct net_device *dev,
struct tcf_block_ext_info *ei,
- enum tc_block_command command,
+ enum flow_block_command command,
struct netlink_ext_ack *extack)
{
- struct tc_block_offload bo = {};
+ struct flow_block_offload bo = {};
+ int err;
+ bo.net = dev_net(dev);
bo.command = command;
bo.binder_type = ei->binder_type;
- bo.block = block;
+ bo.block_shared = tcf_block_shared(block);
bo.extack = extack;
- return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+ INIT_LIST_HEAD(&bo.cb_list);
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+ if (err < 0)
+ return err;
+
+ return tcf_block_setup(block, &bo);
}
static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
@@ -820,20 +839,20 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
return -EOPNOTSUPP;
}
- err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
+ err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_BIND, extack);
if (err == -EOPNOTSUPP)
goto no_offload_dev_inc;
if (err)
return err;
- tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
+ tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
return 0;
no_offload_dev_inc:
if (tcf_block_offload_in_use(block))
return -EOPNOTSUPP;
block->nooffloaddevcnt++;
- tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
+ tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
return 0;
}
@@ -843,11 +862,11 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
struct net_device *dev = q->dev_queue->dev;
int err;
- tc_indr_block_call(block, dev, ei, TC_BLOCK_UNBIND, NULL);
+ tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
if (!dev->netdev_ops->ndo_setup_tc)
goto no_offload_dev_dec;
- err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
+ err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
if (err == -EOPNOTSUPP)
goto no_offload_dev_dec;
return;
@@ -1340,17 +1359,17 @@ static void tcf_block_release(struct Qdisc *q, struct tcf_block *block,
struct tcf_block_owner_item {
struct list_head list;
struct Qdisc *q;
- enum tcf_block_binder_type binder_type;
+ enum flow_block_binder_type binder_type;
};
static void
tcf_block_owner_netif_keep_dst(struct tcf_block *block,
struct Qdisc *q,
- enum tcf_block_binder_type binder_type)
+ enum flow_block_binder_type binder_type)
{
if (block->keep_dst &&
- binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
- binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+ binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
+ binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
netif_keep_dst(qdisc_dev(q));
}
@@ -1367,7 +1386,7 @@ EXPORT_SYMBOL(tcf_block_netif_keep_dst);
static int tcf_block_owner_add(struct tcf_block *block,
struct Qdisc *q,
- enum tcf_block_binder_type binder_type)
+ enum flow_block_binder_type binder_type)
{
struct tcf_block_owner_item *item;
@@ -1382,7 +1401,7 @@ static int tcf_block_owner_add(struct tcf_block *block,
static void tcf_block_owner_del(struct tcf_block *block,
struct Qdisc *q,
- enum tcf_block_binder_type binder_type)
+ enum flow_block_binder_type binder_type)
{
struct tcf_block_owner_item *item;
@@ -1494,43 +1513,6 @@ void tcf_block_put(struct tcf_block *block)
EXPORT_SYMBOL(tcf_block_put);
-struct tcf_block_cb {
- struct list_head list;
- tc_setup_cb_t *cb;
- void *cb_ident;
- void *cb_priv;
- unsigned int refcnt;
-};
-
-void *tcf_block_cb_priv(struct tcf_block_cb *block_cb)
-{
- return block_cb->cb_priv;
-}
-EXPORT_SYMBOL(tcf_block_cb_priv);
-
-struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident)
-{ struct tcf_block_cb *block_cb;
-
- list_for_each_entry(block_cb, &block->cb_list, list)
- if (block_cb->cb == cb && block_cb->cb_ident == cb_ident)
- return block_cb;
- return NULL;
-}
-EXPORT_SYMBOL(tcf_block_cb_lookup);
-
-void tcf_block_cb_incref(struct tcf_block_cb *block_cb)
-{
- block_cb->refcnt++;
-}
-EXPORT_SYMBOL(tcf_block_cb_incref);
-
-unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb)
-{
- return --block_cb->refcnt;
-}
-EXPORT_SYMBOL(tcf_block_cb_decref);
-
static int
tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb,
void *cb_priv, bool add, bool offload_in_use,
@@ -1572,66 +1554,76 @@ err_playback_remove:
return err;
}
-struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident,
- void *cb_priv,
- struct netlink_ext_ack *extack)
+static int tcf_block_bind(struct tcf_block *block,
+ struct flow_block_offload *bo)
{
- struct tcf_block_cb *block_cb;
- int err;
+ struct flow_block_cb *block_cb, *next;
+ int err, i = 0;
- /* Replay any already present rules */
- err = tcf_block_playback_offloads(block, cb, cb_priv, true,
- tcf_block_offload_in_use(block),
- extack);
- if (err)
- return ERR_PTR(err);
+ list_for_each_entry(block_cb, &bo->cb_list, list) {
+ err = tcf_block_playback_offloads(block, block_cb->cb,
+ block_cb->cb_priv, true,
+ tcf_block_offload_in_use(block),
+ bo->extack);
+ if (err)
+ goto err_unroll;
- block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
- if (!block_cb)
- return ERR_PTR(-ENOMEM);
- block_cb->cb = cb;
- block_cb->cb_ident = cb_ident;
- block_cb->cb_priv = cb_priv;
- list_add(&block_cb->list, &block->cb_list);
- return block_cb;
-}
-EXPORT_SYMBOL(__tcf_block_cb_register);
+ i++;
+ }
+ list_splice(&bo->cb_list, &block->cb_list);
-int tcf_block_cb_register(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident,
- void *cb_priv, struct netlink_ext_ack *extack)
-{
- struct tcf_block_cb *block_cb;
+ return 0;
+
+err_unroll:
+ list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+ if (i-- > 0) {
+ list_del(&block_cb->list);
+ tcf_block_playback_offloads(block, block_cb->cb,
+ block_cb->cb_priv, false,
+ tcf_block_offload_in_use(block),
+ NULL);
+ }
+ flow_block_cb_free(block_cb);
+ }
- block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv,
- extack);
- return PTR_ERR_OR_ZERO(block_cb);
+ return err;
}
-EXPORT_SYMBOL(tcf_block_cb_register);
-void __tcf_block_cb_unregister(struct tcf_block *block,
- struct tcf_block_cb *block_cb)
+static void tcf_block_unbind(struct tcf_block *block,
+ struct flow_block_offload *bo)
{
- tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv,
- false, tcf_block_offload_in_use(block),
- NULL);
- list_del(&block_cb->list);
- kfree(block_cb);
+ struct flow_block_cb *block_cb, *next;
+
+ list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+ tcf_block_playback_offloads(block, block_cb->cb,
+ block_cb->cb_priv, false,
+ tcf_block_offload_in_use(block),
+ NULL);
+ list_del(&block_cb->list);
+ flow_block_cb_free(block_cb);
+ }
}
-EXPORT_SYMBOL(__tcf_block_cb_unregister);
-void tcf_block_cb_unregister(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident)
+static int tcf_block_setup(struct tcf_block *block,
+ struct flow_block_offload *bo)
{
- struct tcf_block_cb *block_cb;
+ int err;
- block_cb = tcf_block_cb_lookup(block, cb, cb_ident);
- if (!block_cb)
- return;
- __tcf_block_cb_unregister(block, block_cb);
+ switch (bo->command) {
+ case FLOW_BLOCK_BIND:
+ err = tcf_block_bind(block, bo);
+ break;
+ case FLOW_BLOCK_UNBIND:
+ err = 0;
+ tcf_block_unbind(block, bo);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EOPNOTSUPP;
+ }
+
+ return err;
}
-EXPORT_SYMBOL(tcf_block_cb_unregister);
/* Main classifier routine: scans classifier chain attached
* to this qdisc, (optionally) tests for protocol and asks
@@ -3155,7 +3147,7 @@ EXPORT_SYMBOL(tcf_exts_dump_stats);
int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
void *type_data, bool err_stop)
{
- struct tcf_block_cb *block_cb;
+ struct flow_block_cb *block_cb;
int ok_count = 0;
int err;
@@ -3266,6 +3258,10 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->police.burst = tcf_police_tcfp_burst(act);
entry->police.rate_bytes_ps =
tcf_police_rate_bytes_ps(act);
+ } else if (is_tcf_ct(act)) {
+ entry->id = FLOW_ACTION_CT;
+ entry->ct.action = tcf_ct_action(act);
+ entry->ct.zone = tcf_ct_zone(act);
} else {
goto err_out;
}
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index eedd5786c084..38d6e85693fc 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -26,8 +26,10 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <uapi/linux/netfilter/nf_conntrack_common.h>
+
struct fl_flow_key {
- int indev_ifindex;
+ struct flow_dissector_key_meta meta;
struct flow_dissector_key_control control;
struct flow_dissector_key_control enc_control;
struct flow_dissector_key_basic basic;
@@ -54,6 +56,7 @@ struct fl_flow_key {
struct flow_dissector_key_enc_opts enc_opts;
struct flow_dissector_key_ports tp_min;
struct flow_dissector_key_ports tp_max;
+ struct flow_dissector_key_ct ct;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -272,24 +275,40 @@ static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
return __fl_lookup(mask, mkey);
}
+static u16 fl_ct_info_to_flower_map[] = {
+ [IP_CT_ESTABLISHED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED,
+ [IP_CT_RELATED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_RELATED,
+ [IP_CT_ESTABLISHED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED,
+ [IP_CT_RELATED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_RELATED,
+ [IP_CT_NEW] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_NEW,
+};
+
static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_fl_head *head = rcu_dereference_bh(tp->root);
- struct cls_fl_filter *f;
- struct fl_flow_mask *mask;
- struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
+ struct fl_flow_key skb_key;
+ struct fl_flow_mask *mask;
+ struct cls_fl_filter *f;
list_for_each_entry_rcu(mask, &head->masks, list) {
fl_clear_masked_range(&skb_key, mask);
- skb_key.indev_ifindex = skb->skb_iif;
+ skb_flow_dissect_meta(skb, &mask->dissector, &skb_key);
/* skb_flow_dissect() does not set n_proto in case an unknown
* protocol, so do it rather here.
*/
skb_key.basic.n_proto = skb->protocol;
skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
+ skb_flow_dissect_ct(skb, &mask->dissector, &skb_key,
+ fl_ct_info_to_flower_map,
+ ARRAY_SIZE(fl_ct_info_to_flower_map));
skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
fl_set_masked_key(&skb_mkey, &skb_key, mask);
@@ -390,14 +409,14 @@ static void fl_destroy_filter_work(struct work_struct *work)
static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
bool rtnl_held, struct netlink_ext_ack *extack)
{
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
if (!rtnl_held)
rtnl_lock();
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
- cls_flower.command = TC_CLSFLOWER_DESTROY;
+ cls_flower.command = FLOW_CLS_DESTROY;
cls_flower.cookie = (unsigned long) f;
tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
@@ -415,8 +434,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
struct cls_fl_head *head = fl_head_dereference(tp);
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
bool skip_sw = tc_skip_sw(f->flags);
int err = 0;
@@ -430,7 +449,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
}
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
- cls_flower.command = TC_CLSFLOWER_REPLACE;
+ cls_flower.command = FLOW_CLS_REPLACE;
cls_flower.cookie = (unsigned long) f;
cls_flower.rule->match.dissector = &f->mask->dissector;
cls_flower.rule->match.mask = &f->mask->key;
@@ -479,14 +498,14 @@ errout:
static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
bool rtnl_held)
{
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
if (!rtnl_held)
rtnl_lock();
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
- cls_flower.command = TC_CLSFLOWER_STATS;
+ cls_flower.command = FLOW_CLS_STATS;
cls_flower.cookie = (unsigned long) f;
cls_flower.classid = f->res.classid;
@@ -524,24 +543,6 @@ static struct cls_fl_filter *__fl_get(struct cls_fl_head *head, u32 handle)
return f;
}
-static struct cls_fl_filter *fl_get_next_filter(struct tcf_proto *tp,
- unsigned long *handle)
-{
- struct cls_fl_head *head = fl_head_dereference(tp);
- struct cls_fl_filter *f;
-
- rcu_read_lock();
- while ((f = idr_get_next_ul(&head->handle_idr, handle))) {
- /* don't return filters that are being deleted */
- if (refcount_inc_not_zero(&f->refcnt))
- break;
- ++(*handle);
- }
- rcu_read_unlock();
-
- return f;
-}
-
static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
bool *last, bool rtnl_held,
struct netlink_ext_ack *extack)
@@ -704,6 +705,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED },
[TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_CT_STATE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_STATE_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_CT_MARK_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_CT_LABELS] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
};
static const struct nla_policy
@@ -725,11 +736,11 @@ static void fl_set_key_val(struct nlattr **tb,
{
if (!tb[val_type])
return;
- memcpy(val, nla_data(tb[val_type]), len);
+ nla_memcpy(val, tb[val_type], len);
if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type])
memset(mask, 0xff, len);
else
- memcpy(mask, nla_data(tb[mask_type]), len);
+ nla_memcpy(mask, tb[mask_type], len);
}
static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
@@ -1015,21 +1026,65 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
return 0;
}
+static int fl_set_key_ct(struct nlattr **tb,
+ struct flow_dissector_key_ct *key,
+ struct flow_dissector_key_ct *mask,
+ struct netlink_ext_ack *extack)
+{
+ if (tb[TCA_FLOWER_KEY_CT_STATE]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) {
+ NL_SET_ERR_MSG(extack, "Conntrack isn't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
+ &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
+ sizeof(key->ct_state));
+ }
+ if (tb[TCA_FLOWER_KEY_CT_ZONE]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
+ NL_SET_ERR_MSG(extack, "Conntrack zones isn't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE,
+ &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK,
+ sizeof(key->ct_zone));
+ }
+ if (tb[TCA_FLOWER_KEY_CT_MARK]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
+ NL_SET_ERR_MSG(extack, "Conntrack mark isn't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK,
+ &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK,
+ sizeof(key->ct_mark));
+ }
+ if (tb[TCA_FLOWER_KEY_CT_LABELS]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
+ NL_SET_ERR_MSG(extack, "Conntrack labels aren't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, key->ct_labels, TCA_FLOWER_KEY_CT_LABELS,
+ mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK,
+ sizeof(key->ct_labels));
+ }
+
+ return 0;
+}
+
static int fl_set_key(struct net *net, struct nlattr **tb,
struct fl_flow_key *key, struct fl_flow_key *mask,
struct netlink_ext_ack *extack)
{
__be16 ethertype;
int ret = 0;
-#ifdef CONFIG_NET_CLS_IND
+
if (tb[TCA_FLOWER_INDEV]) {
int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack);
if (err < 0)
return err;
- key->indev_ifindex = err;
- mask->indev_ifindex = 0xffffffff;
+ key->meta.ingress_ifindex = err;
+ mask->meta.ingress_ifindex = 0xffffffff;
}
-#endif
fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
@@ -1225,6 +1280,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
return ret;
}
+ ret = fl_set_key_ct(tb, &key->ct, &mask->ct, extack);
+ if (ret)
+ return ret;
+
if (tb[TCA_FLOWER_KEY_FLAGS])
ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
@@ -1282,6 +1341,8 @@ static void fl_init_dissector(struct flow_dissector *dissector,
struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
size_t cnt = 0;
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_META, meta);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1323,6 +1384,8 @@ static void fl_init_dissector(struct flow_dissector *dissector,
FLOW_DISSECTOR_KEY_ENC_IP, enc_ip);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_CT, ct);
skb_flow_dissector_init(dissector, keys, cnt);
}
@@ -1691,20 +1754,25 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
bool rtnl_held)
{
+ struct cls_fl_head *head = fl_head_dereference(tp);
+ unsigned long id = arg->cookie, tmp;
struct cls_fl_filter *f;
arg->count = arg->skip;
- while ((f = fl_get_next_filter(tp, &arg->cookie)) != NULL) {
+ idr_for_each_entry_continue_ul(&head->handle_idr, f, tmp, id) {
+ /* don't return filters that are being deleted */
+ if (!refcount_inc_not_zero(&f->refcnt))
+ continue;
if (arg->fn(tp, f, arg) < 0) {
__fl_put(f);
arg->stop = 1;
break;
}
__fl_put(f);
- arg->cookie++;
arg->count++;
}
+ arg->cookie = id;
}
static struct cls_fl_filter *
@@ -1735,8 +1803,8 @@ fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
void *cb_priv, struct netlink_ext_ack *extack)
{
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
struct cls_fl_filter *f = NULL;
int err;
@@ -1757,7 +1825,7 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags,
extack);
cls_flower.command = add ?
- TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY;
+ FLOW_CLS_REPLACE : FLOW_CLS_DESTROY;
cls_flower.cookie = (unsigned long)f;
cls_flower.rule->match.dissector = &f->mask->dissector;
cls_flower.rule->match.mask = &f->mask->key;
@@ -1801,7 +1869,7 @@ next_flow:
static int fl_hw_create_tmplt(struct tcf_chain *chain,
struct fl_flow_tmplt *tmplt)
{
- struct tc_cls_flower_offload cls_flower = {};
+ struct flow_cls_offload cls_flower = {};
struct tcf_block *block = chain->block;
cls_flower.rule = flow_rule_alloc(0);
@@ -1809,7 +1877,7 @@ static int fl_hw_create_tmplt(struct tcf_chain *chain,
return -ENOMEM;
cls_flower.common.chain_index = chain->index;
- cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE;
+ cls_flower.command = FLOW_CLS_TMPLT_CREATE;
cls_flower.cookie = (unsigned long) tmplt;
cls_flower.rule->match.dissector = &tmplt->dissector;
cls_flower.rule->match.mask = &tmplt->mask;
@@ -1827,11 +1895,11 @@ static int fl_hw_create_tmplt(struct tcf_chain *chain,
static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
struct fl_flow_tmplt *tmplt)
{
- struct tc_cls_flower_offload cls_flower = {};
+ struct flow_cls_offload cls_flower = {};
struct tcf_block *block = chain->block;
cls_flower.common.chain_index = chain->index;
- cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY;
+ cls_flower.command = FLOW_CLS_TMPLT_DESTROY;
cls_flower.cookie = (unsigned long) tmplt;
tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
@@ -2077,6 +2145,40 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int fl_dump_key_ct(struct sk_buff *skb,
+ struct flow_dissector_key_ct *key,
+ struct flow_dissector_key_ct *mask)
+{
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK) &&
+ fl_dump_key_val(skb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
+ &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
+ sizeof(key->ct_state)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+ fl_dump_key_val(skb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE,
+ &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK,
+ sizeof(key->ct_zone)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+ fl_dump_key_val(skb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK,
+ &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK,
+ sizeof(key->ct_mark)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+ fl_dump_key_val(skb, &key->ct_labels, TCA_FLOWER_KEY_CT_LABELS,
+ &mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK,
+ sizeof(key->ct_labels)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
struct flow_dissector_key_enc_opts *enc_opts)
{
@@ -2123,10 +2225,10 @@ static int fl_dump_key_enc_opt(struct sk_buff *skb,
static int fl_dump_key(struct sk_buff *skb, struct net *net,
struct fl_flow_key *key, struct fl_flow_key *mask)
{
- if (mask->indev_ifindex) {
+ if (mask->meta.ingress_ifindex) {
struct net_device *dev;
- dev = __dev_get_by_index(net, key->indev_ifindex);
+ dev = __dev_get_by_index(net, key->meta.ingress_ifindex);
if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name))
goto nla_put_failure;
}
@@ -2310,6 +2412,9 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts))
goto nla_put_failure;
+ if (fl_dump_key_ct(skb, &key->ct, &mask->ct))
+ goto nla_put_failure;
+
if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
goto nla_put_failure;
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 4dab833f66cb..c9496c920d6f 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -8,9 +8,6 @@
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
* Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
- *
- * JHS: We should remove the CONFIG_NET_CLS_IND from here
- * eventually when the meta match extension is made available
*/
#include <linux/module.h>
@@ -37,9 +34,7 @@ struct fw_filter {
struct fw_filter __rcu *next;
u32 id;
struct tcf_result res;
-#ifdef CONFIG_NET_CLS_IND
int ifindex;
-#endif /* CONFIG_NET_CLS_IND */
struct tcf_exts exts;
struct tcf_proto *tp;
struct rcu_work rwork;
@@ -67,10 +62,8 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
f = rcu_dereference_bh(f->next)) {
if (f->id == id) {
*res = f->res;
-#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, f->ifindex))
continue;
-#endif /* CONFIG_NET_CLS_IND */
r = tcf_exts_exec(skb, &f->exts, res);
if (r < 0)
continue;
@@ -222,7 +215,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
tcf_bind_filter(tp, &f->res, base);
}
-#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_FW_INDEV]) {
int ret;
ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack);
@@ -230,7 +222,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
return ret;
f->ifindex = ret;
}
-#endif /* CONFIG_NET_CLS_IND */
err = -EINVAL;
if (tb[TCA_FW_MASK]) {
@@ -276,9 +267,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
fnew->id = f->id;
fnew->res = f->res;
-#ifdef CONFIG_NET_CLS_IND
fnew->ifindex = f->ifindex;
-#endif /* CONFIG_NET_CLS_IND */
fnew->tp = f->tp;
err = tcf_exts_init(&fnew->exts, net, TCA_FW_ACT,
@@ -405,14 +394,12 @@ static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh,
if (f->res.classid &&
nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))
goto nla_put_failure;
-#ifdef CONFIG_NET_CLS_IND
if (f->ifindex) {
struct net_device *dev;
dev = __dev_get_by_index(net, f->ifindex);
if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name))
goto nla_put_failure;
}
-#endif /* CONFIG_NET_CLS_IND */
if (head->mask != 0xFFFFFFFF &&
nla_put_u32(skb, TCA_FW_MASK, head->mask))
goto nla_put_failure;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 38c0a9f0f296..a30d2f8feb32 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -21,6 +21,7 @@ struct cls_mall_head {
unsigned int in_hw_count;
struct tc_matchall_pcnt __percpu *pf;
struct rcu_work rwork;
+ bool deleting;
};
static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -258,7 +259,11 @@ err_exts_init:
static int mall_delete(struct tcf_proto *tp, void *arg, bool *last,
bool rtnl_held, struct netlink_ext_ack *extack)
{
- return -EOPNOTSUPP;
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+
+ head->deleting = true;
+ *last = true;
+ return 0;
}
static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg,
@@ -269,7 +274,7 @@ static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg,
if (arg->count < arg->skip)
goto skip;
- if (!head)
+ if (!head || head->deleting)
return;
if (arg->fn(tp, head, arg) < 0)
arg->stop = 1;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index c7727de5e073..be9e46c77e8b 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -20,9 +20,6 @@
* pure RSVP doesn't need such a general approach and can use
* much simpler (and faster) schemes, sort of cls_rsvp.c.
*
- * JHS: We should remove the CONFIG_NET_CLS_IND from here
- * eventually when the meta match extension is made available
- *
* nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
*/
@@ -48,9 +45,7 @@ struct tc_u_knode {
u32 handle;
struct tc_u_hnode __rcu *ht_up;
struct tcf_exts exts;
-#ifdef CONFIG_NET_CLS_IND
int ifindex;
-#endif
u8 fshift;
struct tcf_result res;
struct tc_u_hnode __rcu *ht_down;
@@ -176,12 +171,10 @@ check_terminal:
if (n->sel.flags & TC_U32_TERMINAL) {
*res = n->res;
-#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, n->ifindex)) {
n = rcu_dereference_bh(n->next);
goto next_knode;
}
-#endif
#ifdef CONFIG_CLS_U32_PERF
__this_cpu_inc(n->pf->rhit);
#endif
@@ -761,7 +754,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
tcf_bind_filter(tp, &n->res, base);
}
-#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_U32_INDEV]) {
int ret;
ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack);
@@ -769,7 +761,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
return -EINVAL;
n->ifindex = ret;
}
-#endif
return 0;
}
@@ -817,9 +808,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp,
new->handle = n->handle;
RCU_INIT_POINTER(new->ht_up, n->ht_up);
-#ifdef CONFIG_NET_CLS_IND
new->ifindex = n->ifindex;
-#endif
new->fshift = n->fshift;
new->res = n->res;
new->flags = n->flags;
@@ -1351,14 +1340,12 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
if (tcf_exts_dump(skb, &n->exts) < 0)
goto nla_put_failure;
-#ifdef CONFIG_NET_CLS_IND
if (n->ifindex) {
struct net_device *dev;
dev = __dev_get_by_index(net, n->ifindex);
if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
goto nla_put_failure;
}
-#endif
#ifdef CONFIG_CLS_U32_PERF
gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
n->sel.nkeys * sizeof(u64),
@@ -1422,9 +1409,7 @@ static int __init init_u32(void)
#ifdef CONFIG_CLS_U32_PERF
pr_info(" Performance counters on\n");
#endif
-#ifdef CONFIG_NET_CLS_IND
pr_info(" input device check on\n");
-#endif
#ifdef CONFIG_NET_CLS_ACT
pr_info(" Actions configured\n");
#endif
diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c
index 243fd22f2248..9fff6480acc6 100644
--- a/net/sched/em_ipt.c
+++ b/net/sched/em_ipt.c
@@ -21,6 +21,7 @@
struct em_ipt_match {
const struct xt_match *match;
u32 hook;
+ u8 nfproto;
u8 match_data[0] __aligned(8);
};
@@ -71,11 +72,25 @@ static int policy_validate_match_data(struct nlattr **tb, u8 mrev)
return 0;
}
+static int addrtype_validate_match_data(struct nlattr **tb, u8 mrev)
+{
+ if (mrev != 1) {
+ pr_err("only addrtype match revision 1 supported");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static const struct em_ipt_xt_match em_ipt_xt_matches[] = {
{
.match_name = "policy",
.validate_match_data = policy_validate_match_data
},
+ {
+ .match_name = "addrtype",
+ .validate_match_data = addrtype_validate_match_data
+ },
{}
};
@@ -115,6 +130,7 @@ static int em_ipt_change(struct net *net, void *data, int data_len,
struct em_ipt_match *im = NULL;
struct xt_match *match;
int mdata_len, ret;
+ u8 nfproto;
ret = nla_parse_deprecated(tb, TCA_EM_IPT_MAX, data, data_len,
em_ipt_policy, NULL);
@@ -125,6 +141,15 @@ static int em_ipt_change(struct net *net, void *data, int data_len,
!tb[TCA_EM_IPT_MATCH_DATA] || !tb[TCA_EM_IPT_NFPROTO])
return -EINVAL;
+ nfproto = nla_get_u8(tb[TCA_EM_IPT_NFPROTO]);
+ switch (nfproto) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ break;
+ default:
+ return -EINVAL;
+ }
+
match = get_xt_match(tb);
if (IS_ERR(match)) {
pr_err("unable to load match\n");
@@ -140,6 +165,7 @@ static int em_ipt_change(struct net *net, void *data, int data_len,
im->match = match;
im->hook = nla_get_u32(tb[TCA_EM_IPT_HOOK]);
+ im->nfproto = nfproto;
nla_memcpy(im->match_data, tb[TCA_EM_IPT_MATCH_DATA], mdata_len);
ret = check_match(net, im, mdata_len);
@@ -182,15 +208,33 @@ static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em,
const struct em_ipt_match *im = (const void *)em->data;
struct xt_action_param acpar = {};
struct net_device *indev = NULL;
+ u8 nfproto = im->match->family;
struct nf_hook_state state;
int ret;
+ switch (tc_skb_protocol(skb)) {
+ case htons(ETH_P_IP):
+ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+ if (nfproto == NFPROTO_UNSPEC)
+ nfproto = NFPROTO_IPV4;
+ break;
+ case htons(ETH_P_IPV6):
+ if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+ if (nfproto == NFPROTO_UNSPEC)
+ nfproto = NFPROTO_IPV6;
+ break;
+ default:
+ return 0;
+ }
+
rcu_read_lock();
if (skb->skb_iif)
indev = dev_get_by_index_rcu(em->net, skb->skb_iif);
- nf_hook_state_init(&state, im->hook, im->match->family,
+ nf_hook_state_init(&state, im->hook, nfproto,
indev ?: skb->dev, skb->dev, NULL, em->net, NULL);
acpar.match = im->match;
@@ -213,7 +257,7 @@ static int em_ipt_dump(struct sk_buff *skb, struct tcf_ematch *em)
return -EMSGSIZE;
if (nla_put_u8(skb, TCA_EM_IPT_MATCH_REVISION, im->match->revision) < 0)
return -EMSGSIZE;
- if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->match->family) < 0)
+ if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->nfproto) < 0)
return -EMSGSIZE;
if (nla_put(skb, TCA_EM_IPT_MATCH_DATA,
im->match->usersize ?: im->match->matchsize,
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index db0c2ba1d156..cebfb65d8556 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -22,10 +22,12 @@
#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON)
+#define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK)
struct etf_sched_data {
bool offload;
bool deadline_mode;
+ bool skip_sock_check;
int clockid;
int queue;
s32 delta; /* in ns */
@@ -77,6 +79,9 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
struct sock *sk = nskb->sk;
ktime_t now;
+ if (q->skip_sock_check)
+ goto skip;
+
if (!sk)
return false;
@@ -92,6 +97,7 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
if (sk->sk_txtime_deadline_mode != q->deadline_mode)
return false;
+skip:
now = q->get_time();
if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
return false;
@@ -385,6 +391,7 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt,
q->clockid = qopt->clockid;
q->offload = OFFLOAD_IS_ON(qopt);
q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
+ q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt);
switch (q->clockid) {
case CLOCK_REALTIME:
@@ -473,6 +480,9 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
if (q->deadline_mode)
opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+ if (q->skip_sock_check)
+ opt.flags |= TC_ETF_SKIP_SOCK_CHECK;
+
if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 0f65f617756b..bf56aa519797 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -83,7 +83,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress);
- q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
q->block_info.chain_head_change = clsact_chain_head_change;
q->block_info.chain_head_change_priv = &q->miniqp;
@@ -114,6 +114,7 @@ nla_put_failure:
}
static const struct Qdisc_class_ops ingress_class_ops = {
+ .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED,
.leaf = ingress_leaf,
.find = ingress_find,
.walk = ingress_walk,
@@ -216,7 +217,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress);
- q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
q->ingress_block_info.chain_head_change = clsact_chain_head_change;
q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress;
@@ -227,7 +228,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress);
- q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
+ q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
q->egress_block_info.chain_head_change = clsact_chain_head_change;
q->egress_block_info.chain_head_change_priv = &q->miniqp_egress;
@@ -246,6 +247,7 @@ static void clsact_destroy(struct Qdisc *sch)
}
static const struct Qdisc_class_ops clsact_class_ops = {
+ .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED,
.leaf = ingress_leaf,
.find = clsact_find,
.walk = ingress_walk,
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 9ecfb8f5902a..388750ddc57a 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -21,12 +21,17 @@
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/sch_generic.h>
+#include <net/sock.h>
+#include <net/tcp.h>
static LIST_HEAD(taprio_list);
static DEFINE_SPINLOCK(taprio_list_lock);
#define TAPRIO_ALL_GATES_OPEN -1
+#define FLAGS_VALID(flags) (!((flags) & ~TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST))
+#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
+
struct sched_entry {
struct list_head list;
@@ -35,6 +40,7 @@ struct sched_entry {
* packet leaves after this time.
*/
ktime_t close_time;
+ ktime_t next_txtime;
atomic_t budget;
int index;
u32 gate_mask;
@@ -55,6 +61,8 @@ struct sched_gate_list {
struct taprio_sched {
struct Qdisc **qdiscs;
struct Qdisc *root;
+ u32 flags;
+ enum tk_offsets tk_offset;
int clockid;
atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
* speeds it's sub-nanoseconds per byte
@@ -65,9 +73,9 @@ struct taprio_sched {
struct sched_entry __rcu *current_entry;
struct sched_gate_list __rcu *oper_sched;
struct sched_gate_list __rcu *admin_sched;
- ktime_t (*get_time)(void);
struct hrtimer advance_timer;
struct list_head taprio_list;
+ int txtime_delay;
};
static ktime_t sched_base_time(const struct sched_gate_list *sched)
@@ -78,6 +86,20 @@ static ktime_t sched_base_time(const struct sched_gate_list *sched)
return ns_to_ktime(sched->base_time);
}
+static ktime_t taprio_get_time(struct taprio_sched *q)
+{
+ ktime_t mono = ktime_get();
+
+ switch (q->tk_offset) {
+ case TK_OFFS_MAX:
+ return mono;
+ default:
+ return ktime_mono_to_any(mono, q->tk_offset);
+ }
+
+ return KTIME_MAX;
+}
+
static void taprio_free_sched_cb(struct rcu_head *head)
{
struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu);
@@ -108,20 +130,263 @@ static void switch_schedules(struct taprio_sched *q,
*admin = NULL;
}
-static ktime_t get_cycle_time(struct sched_gate_list *sched)
+/* Get how much time has been already elapsed in the current cycle. */
+static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time)
+{
+ ktime_t time_since_sched_start;
+ s32 time_elapsed;
+
+ time_since_sched_start = ktime_sub(time, sched->base_time);
+ div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed);
+
+ return time_elapsed;
+}
+
+static ktime_t get_interval_end_time(struct sched_gate_list *sched,
+ struct sched_gate_list *admin,
+ struct sched_entry *entry,
+ ktime_t intv_start)
+{
+ s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start);
+ ktime_t intv_end, cycle_ext_end, cycle_end;
+
+ cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed);
+ intv_end = ktime_add_ns(intv_start, entry->interval);
+ cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension);
+
+ if (ktime_before(intv_end, cycle_end))
+ return intv_end;
+ else if (admin && admin != sched &&
+ ktime_after(admin->base_time, cycle_end) &&
+ ktime_before(admin->base_time, cycle_ext_end))
+ return admin->base_time;
+ else
+ return cycle_end;
+}
+
+static int length_to_duration(struct taprio_sched *q, int len)
+{
+ return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
+}
+
+/* Returns the entry corresponding to next available interval. If
+ * validate_interval is set, it only validates whether the timestamp occurs
+ * when the gate corresponding to the skb's traffic class is open.
+ */
+static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb,
+ struct Qdisc *sch,
+ struct sched_gate_list *sched,
+ struct sched_gate_list *admin,
+ ktime_t time,
+ ktime_t *interval_start,
+ ktime_t *interval_end,
+ bool validate_interval)
+{
+ ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time;
+ ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time;
+ struct sched_entry *entry = NULL, *entry_found = NULL;
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ bool entry_available = false;
+ s32 cycle_elapsed;
+ int tc, n;
+
+ tc = netdev_get_prio_tc_map(dev, skb->priority);
+ packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb));
+
+ *interval_start = 0;
+ *interval_end = 0;
+
+ if (!sched)
+ return NULL;
+
+ cycle = sched->cycle_time;
+ cycle_elapsed = get_cycle_time_elapsed(sched, time);
+ curr_intv_end = ktime_sub_ns(time, cycle_elapsed);
+ cycle_end = ktime_add_ns(curr_intv_end, cycle);
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ curr_intv_start = curr_intv_end;
+ curr_intv_end = get_interval_end_time(sched, admin, entry,
+ curr_intv_start);
+
+ if (ktime_after(curr_intv_start, cycle_end))
+ break;
+
+ if (!(entry->gate_mask & BIT(tc)) ||
+ packet_transmit_time > entry->interval)
+ continue;
+
+ txtime = entry->next_txtime;
+
+ if (ktime_before(txtime, time) || validate_interval) {
+ transmit_end_time = ktime_add_ns(time, packet_transmit_time);
+ if ((ktime_before(curr_intv_start, time) &&
+ ktime_before(transmit_end_time, curr_intv_end)) ||
+ (ktime_after(curr_intv_start, time) && !validate_interval)) {
+ entry_found = entry;
+ *interval_start = curr_intv_start;
+ *interval_end = curr_intv_end;
+ break;
+ } else if (!entry_available && !validate_interval) {
+ /* Here, we are just trying to find out the
+ * first available interval in the next cycle.
+ */
+ entry_available = 1;
+ entry_found = entry;
+ *interval_start = ktime_add_ns(curr_intv_start, cycle);
+ *interval_end = ktime_add_ns(curr_intv_end, cycle);
+ }
+ } else if (ktime_before(txtime, earliest_txtime) &&
+ !entry_available) {
+ earliest_txtime = txtime;
+ entry_found = entry;
+ n = div_s64(ktime_sub(txtime, curr_intv_start), cycle);
+ *interval_start = ktime_add(curr_intv_start, n * cycle);
+ *interval_end = ktime_add(curr_intv_end, n * cycle);
+ }
+ }
+
+ return entry_found;
+}
+
+static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch)
{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct sched_gate_list *sched, *admin;
+ ktime_t interval_start, interval_end;
struct sched_entry *entry;
- ktime_t cycle = 0;
- if (sched->cycle_time != 0)
- return sched->cycle_time;
+ rcu_read_lock();
+ sched = rcu_dereference(q->oper_sched);
+ admin = rcu_dereference(q->admin_sched);
+
+ entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp,
+ &interval_start, &interval_end, true);
+ rcu_read_unlock();
- list_for_each_entry(entry, &sched->entries, list)
- cycle = ktime_add_ns(cycle, entry->interval);
+ return entry;
+}
- sched->cycle_time = cycle;
+/* This returns the tstamp value set by TCP in terms of the set clock. */
+static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb)
+{
+ unsigned int offset = skb_network_offset(skb);
+ const struct ipv6hdr *ipv6h;
+ const struct iphdr *iph;
+ struct ipv6hdr _ipv6h;
- return cycle;
+ ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
+ if (!ipv6h)
+ return 0;
+
+ if (ipv6h->version == 4) {
+ iph = (struct iphdr *)ipv6h;
+ offset += iph->ihl * 4;
+
+ /* special-case 6in4 tunnelling, as that is a common way to get
+ * v6 connectivity in the home
+ */
+ if (iph->protocol == IPPROTO_IPV6) {
+ ipv6h = skb_header_pointer(skb, offset,
+ sizeof(_ipv6h), &_ipv6h);
+
+ if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
+ return 0;
+ } else if (iph->protocol != IPPROTO_TCP) {
+ return 0;
+ }
+ } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) {
+ return 0;
+ }
+
+ return ktime_mono_to_any(skb->skb_mstamp_ns, q->tk_offset);
+}
+
+/* There are a few scenarios where we will have to modify the txtime from
+ * what is read from next_txtime in sched_entry. They are:
+ * 1. If txtime is in the past,
+ * a. The gate for the traffic class is currently open and packet can be
+ * transmitted before it closes, schedule the packet right away.
+ * b. If the gate corresponding to the traffic class is going to open later
+ * in the cycle, set the txtime of packet to the interval start.
+ * 2. If txtime is in the future, there are packets corresponding to the
+ * current traffic class waiting to be transmitted. So, the following
+ * possibilities exist:
+ * a. We can transmit the packet before the window containing the txtime
+ * closes.
+ * b. The window might close before the transmission can be completed
+ * successfully. So, schedule the packet in the next open window.
+ */
+static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch)
+{
+ ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp;
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct sched_gate_list *sched, *admin;
+ ktime_t minimum_time, now, txtime;
+ int len, packet_transmit_time;
+ struct sched_entry *entry;
+ bool sched_changed;
+
+ now = taprio_get_time(q);
+ minimum_time = ktime_add_ns(now, q->txtime_delay);
+
+ tcp_tstamp = get_tcp_tstamp(q, skb);
+ minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp);
+
+ rcu_read_lock();
+ admin = rcu_dereference(q->admin_sched);
+ sched = rcu_dereference(q->oper_sched);
+ if (admin && ktime_after(minimum_time, admin->base_time))
+ switch_schedules(q, &admin, &sched);
+
+ /* Until the schedule starts, all the queues are open */
+ if (!sched || ktime_before(minimum_time, sched->base_time)) {
+ txtime = minimum_time;
+ goto done;
+ }
+
+ len = qdisc_pkt_len(skb);
+ packet_transmit_time = length_to_duration(q, len);
+
+ do {
+ sched_changed = 0;
+
+ entry = find_entry_to_transmit(skb, sch, sched, admin,
+ minimum_time,
+ &interval_start, &interval_end,
+ false);
+ if (!entry) {
+ txtime = 0;
+ goto done;
+ }
+
+ txtime = entry->next_txtime;
+ txtime = max_t(ktime_t, txtime, minimum_time);
+ txtime = max_t(ktime_t, txtime, interval_start);
+
+ if (admin && admin != sched &&
+ ktime_after(txtime, admin->base_time)) {
+ sched = admin;
+ sched_changed = 1;
+ continue;
+ }
+
+ transmit_end_time = ktime_add(txtime, packet_transmit_time);
+ minimum_time = transmit_end_time;
+
+ /* Update the txtime of current entry to the next time it's
+ * interval starts.
+ */
+ if (ktime_after(transmit_end_time, interval_end))
+ entry->next_txtime = ktime_add(interval_start, sched->cycle_time);
+ } while (sched_changed || ktime_after(transmit_end_time, interval_end));
+
+ entry->next_txtime = transmit_end_time;
+
+done:
+ rcu_read_unlock();
+ return txtime;
}
static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -137,6 +402,15 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(!child))
return qdisc_drop(skb, sch, to_free);
+ if (skb->sk && sock_flag(skb->sk, SOCK_TXTIME)) {
+ if (!is_valid_interval(skb, sch))
+ return qdisc_drop(skb, sch, to_free);
+ } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+ skb->tstamp = get_packet_txtime(skb, sch);
+ if (!skb->tstamp)
+ return qdisc_drop(skb, sch, to_free);
+ }
+
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
@@ -172,6 +446,9 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
if (!skb)
continue;
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags))
+ return skb;
+
prio = skb->priority;
tc = netdev_get_prio_tc_map(dev, prio);
@@ -184,11 +461,6 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
return NULL;
}
-static inline int length_to_duration(struct taprio_sched *q, int len)
-{
- return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
-}
-
static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
{
atomic_set(&entry->budget,
@@ -232,6 +504,13 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
if (unlikely(!child))
continue;
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+ skb = child->ops->dequeue(child);
+ if (!skb)
+ continue;
+ goto skb_found;
+ }
+
skb = child->ops->peek(child);
if (!skb)
continue;
@@ -243,7 +522,7 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
continue;
len = qdisc_pkt_len(skb);
- guard = ktime_add_ns(q->get_time(),
+ guard = ktime_add_ns(taprio_get_time(q),
length_to_duration(q, len));
/* In the case that there's no gate entry, there's no
@@ -262,6 +541,7 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
if (unlikely(!skb))
goto done;
+skb_found:
qdisc_bstats_update(sch, skb);
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
@@ -524,12 +804,22 @@ static int parse_taprio_schedule(struct nlattr **tb,
if (err < 0)
return err;
+ if (!new->cycle_time) {
+ struct sched_entry *entry;
+ ktime_t cycle = 0;
+
+ list_for_each_entry(entry, &new->entries, list)
+ cycle = ktime_add_ns(cycle, entry->interval);
+ new->cycle_time = cycle;
+ }
+
return 0;
}
static int taprio_parse_mqprio_opt(struct net_device *dev,
struct tc_mqprio_qopt *qopt,
- struct netlink_ext_ack *extack)
+ struct netlink_ext_ack *extack,
+ u32 taprio_flags)
{
int i, j;
@@ -577,6 +867,9 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
return -EINVAL;
}
+ if (TXTIME_ASSIST_IS_ENABLED(taprio_flags))
+ continue;
+
/* Verify that the offset and counts do not overlap */
for (j = i + 1; j < qopt->num_tc; j++) {
if (last > qopt->offset[j]) {
@@ -598,14 +891,14 @@ static int taprio_get_start_time(struct Qdisc *sch,
s64 n;
base = sched_base_time(sched);
- now = q->get_time();
+ now = taprio_get_time(q);
if (ktime_after(base, now)) {
*start = base;
return 0;
}
- cycle = get_cycle_time(sched);
+ cycle = sched->cycle_time;
/* The qdisc is expected to have at least one sched_entry. Moreover,
* any entry must have 'interval' > 0. Thus if the cycle time is zero,
@@ -632,7 +925,7 @@ static void setup_first_close_time(struct taprio_sched *q,
first = list_first_entry(&sched->entries,
struct sched_entry, list);
- cycle = get_cycle_time(sched);
+ cycle = sched->cycle_time;
/* FIXME: find a better place to do this */
sched->cycle_close_time = ktime_add_ns(base, cycle);
@@ -707,6 +1000,18 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
return NOTIFY_DONE;
}
+static void setup_txtime(struct taprio_sched *q,
+ struct sched_gate_list *sched, ktime_t base)
+{
+ struct sched_entry *entry;
+ u32 interval = 0;
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ entry->next_txtime = ktime_add_ns(base, interval);
+ interval += entry->interval;
+ }
+}
+
static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -715,6 +1020,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
struct tc_mqprio_qopt *mqprio = NULL;
+ u32 taprio_flags = 0;
int i, err, clockid;
unsigned long flags;
ktime_t start;
@@ -727,7 +1033,21 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
- err = taprio_parse_mqprio_opt(dev, mqprio, extack);
+ if (tb[TCA_TAPRIO_ATTR_FLAGS]) {
+ taprio_flags = nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]);
+
+ if (q->flags != 0 && q->flags != taprio_flags) {
+ NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported");
+ return -EOPNOTSUPP;
+ } else if (!FLAGS_VALID(taprio_flags)) {
+ NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid");
+ return -EINVAL;
+ }
+
+ q->flags = taprio_flags;
+ }
+
+ err = taprio_parse_mqprio_opt(dev, mqprio, extack, taprio_flags);
if (err < 0)
return err;
@@ -786,7 +1106,18 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
/* Protects against enqueue()/dequeue() */
spin_lock_bh(qdisc_lock(sch));
- if (!hrtimer_active(&q->advance_timer)) {
+ if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) {
+ if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+ NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled");
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ q->txtime_delay = nla_get_s32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
+ }
+
+ if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) &&
+ !hrtimer_active(&q->advance_timer)) {
hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
q->advance_timer.function = advance_sched;
}
@@ -806,16 +1137,16 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
switch (q->clockid) {
case CLOCK_REALTIME:
- q->get_time = ktime_get_real;
+ q->tk_offset = TK_OFFS_REAL;
break;
case CLOCK_MONOTONIC:
- q->get_time = ktime_get;
+ q->tk_offset = TK_OFFS_MAX;
break;
case CLOCK_BOOTTIME:
- q->get_time = ktime_get_boottime;
+ q->tk_offset = TK_OFFS_BOOT;
break;
case CLOCK_TAI:
- q->get_time = ktime_get_clocktai;
+ q->tk_offset = TK_OFFS_TAI;
break;
default:
NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
@@ -829,20 +1160,35 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
goto unlock;
}
- setup_first_close_time(q, new_admin, start);
+ if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) {
+ setup_txtime(q, new_admin, start);
- /* Protects against advance_sched() */
- spin_lock_irqsave(&q->current_entry_lock, flags);
+ if (!oper) {
+ rcu_assign_pointer(q->oper_sched, new_admin);
+ err = 0;
+ new_admin = NULL;
+ goto unlock;
+ }
- taprio_start_sched(sch, start, new_admin);
+ rcu_assign_pointer(q->admin_sched, new_admin);
+ if (admin)
+ call_rcu(&admin->rcu, taprio_free_sched_cb);
+ } else {
+ setup_first_close_time(q, new_admin, start);
- rcu_assign_pointer(q->admin_sched, new_admin);
- if (admin)
- call_rcu(&admin->rcu, taprio_free_sched_cb);
- new_admin = NULL;
+ /* Protects against advance_sched() */
+ spin_lock_irqsave(&q->current_entry_lock, flags);
- spin_unlock_irqrestore(&q->current_entry_lock, flags);
+ taprio_start_sched(sch, start, new_admin);
+ rcu_assign_pointer(q->admin_sched, new_admin);
+ if (admin)
+ call_rcu(&admin->rcu, taprio_free_sched_cb);
+
+ spin_unlock_irqrestore(&q->current_entry_lock, flags);
+ }
+
+ new_admin = NULL;
err = 0;
unlock:
@@ -1080,6 +1426,13 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
goto options_error;
+ if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags))
+ goto options_error;
+
+ if (q->txtime_delay &&
+ nla_put_s32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
+ goto options_error;
+
if (oper && dump_schedule(skb, oper))
goto options_error;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 1999237ce481..5010cce52c93 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -261,8 +261,6 @@ static struct sctp_association *sctp_association_init(
goto stream_free;
asoc->active_key_id = ep->active_key_id;
- asoc->prsctp_enable = ep->prsctp_enable;
- asoc->reconf_enable = ep->reconf_enable;
asoc->strreset_enable = ep->strreset_enable;
/* Save the hmacs and chunks list into this association */
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index f54333cbbe0f..53bc61537f44 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -393,24 +393,19 @@ int sctp_bind_addr_state(const struct sctp_bind_addr *bp,
{
struct sctp_sockaddr_entry *laddr;
struct sctp_af *af;
- int state = -1;
af = sctp_get_af_specific(addr->sa.sa_family);
if (unlikely(!af))
- return state;
+ return -1;
- rcu_read_lock();
list_for_each_entry_rcu(laddr, &bp->address_list, list) {
if (!laddr->valid)
continue;
- if (af->cmp_addr(&laddr->a, addr)) {
- state = laddr->state;
- break;
- }
+ if (af->cmp_addr(&laddr->a, addr))
+ return laddr->state;
}
- rcu_read_unlock();
- return state;
+ return -1;
}
/* Find the first address in the bind address list that is not present in
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 64e0a594a651..e5f2fc726a98 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -253,7 +253,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
struct ip6_flowlabel *flowlabel;
flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
goto out;
fl6_sock_release(flowlabel);
}
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index 2cae7440349c..74847d613835 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -94,11 +94,6 @@ static const struct net_offload sctp6_offload = {
},
};
-static const struct skb_checksum_ops crc32c_csum_ops = {
- .update = sctp_csum_update,
- .combine = sctp_csum_combine,
-};
-
int __init sctp_offload_init(void)
{
int ret;
@@ -111,7 +106,7 @@ int __init sctp_offload_init(void)
if (ret)
goto ipv4;
- crc32c_csum_stub = &crc32c_csum_ops;
+ crc32c_csum_stub = &sctp_csum_ops;
return ret;
ipv4:
diff --git a/net/sctp/output.c b/net/sctp/output.c
index e0c27477788d..dbda7e7927fd 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -282,6 +282,9 @@ static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt,
sctp_chunk_free(sack);
goto out;
}
+ SCTP_INC_STATS(sock_net(asoc->base.sk),
+ SCTP_MIB_OUTCTRLCHUNKS);
+ asoc->stats.octrlchunks++;
asoc->peer.sack_needed = 0;
if (del_timer(timer))
sctp_association_put(asoc);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 23af232c0a25..2d47adcb4cbe 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -81,7 +81,7 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist,
return;
}
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
/* Add the address to the local list. */
addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
if (addr) {
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9b0e5b0d701a..ed39396b9bba 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -247,7 +247,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types));
chunksize += sizeof(ecap_param);
- if (asoc->prsctp_enable)
+ if (asoc->ep->prsctp_enable)
chunksize += sizeof(prsctp_param);
/* ADDIP: Section 4.2.7:
@@ -261,7 +261,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
num_ext += 2;
}
- if (asoc->reconf_enable) {
+ if (asoc->ep->reconf_enable) {
extensions[num_ext] = SCTP_CID_RECONF;
num_ext += 1;
}
@@ -269,7 +269,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
if (sp->adaptation_ind)
chunksize += sizeof(aiparam);
- if (sp->strm_interleave) {
+ if (asoc->ep->intl_enable) {
extensions[num_ext] = SCTP_CID_I_DATA;
num_ext += 1;
}
@@ -348,7 +348,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
sctp_addto_param(retval, num_ext, extensions);
}
- if (asoc->prsctp_enable)
+ if (asoc->ep->prsctp_enable)
sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
if (sp->adaptation_ind) {
@@ -438,7 +438,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
if (sp->adaptation_ind)
chunksize += sizeof(aiparam);
- if (asoc->intl_enable) {
+ if (asoc->peer.intl_capable) {
extensions[num_ext] = SCTP_CID_I_DATA;
num_ext += 1;
}
@@ -2007,12 +2007,11 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
for (i = 0; i < num_ext; i++) {
switch (param.ext->chunks[i]) {
case SCTP_CID_RECONF:
- if (asoc->reconf_enable &&
- !asoc->peer.reconf_capable)
+ if (asoc->ep->reconf_enable)
asoc->peer.reconf_capable = 1;
break;
case SCTP_CID_FWD_TSN:
- if (asoc->prsctp_enable && !asoc->peer.prsctp_capable)
+ if (asoc->ep->prsctp_enable)
asoc->peer.prsctp_capable = 1;
break;
case SCTP_CID_AUTH:
@@ -2028,8 +2027,8 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
asoc->peer.asconf_capable = 1;
break;
case SCTP_CID_I_DATA:
- if (sctp_sk(asoc->base.sk)->strm_interleave)
- asoc->intl_enable = 1;
+ if (asoc->ep->intl_enable)
+ asoc->peer.intl_capable = 1;
break;
default:
break;
@@ -2637,7 +2636,7 @@ do_addr_param:
break;
case SCTP_PARAM_FWD_TSN_SUPPORT:
- if (asoc->prsctp_enable) {
+ if (asoc->ep->prsctp_enable) {
asoc->peer.prsctp_capable = 1;
break;
}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 39ea0a37af09..aa80cda36581 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1913,7 +1913,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
if (err)
goto err;
- if (sp->strm_interleave) {
+ if (asoc->ep->intl_enable) {
timeo = sock_sndtimeo(sk, 0);
err = sctp_wait_for_connect(asoc, &timeo);
if (err) {
@@ -3581,7 +3581,7 @@ static int sctp_setsockopt_fragment_interleave(struct sock *sk,
sctp_sk(sk)->frag_interleave = !!val;
if (!sctp_sk(sk)->frag_interleave)
- sctp_sk(sk)->strm_interleave = 0;
+ sctp_sk(sk)->ep->intl_enable = 0;
return 0;
}
@@ -4226,10 +4226,7 @@ static int sctp_setsockopt_reconfig_supported(struct sock *sk,
sctp_style(sk, UDP))
goto out;
- if (asoc)
- asoc->reconf_enable = !!params.assoc_value;
- else
- sctp_sk(sk)->ep->reconf_enable = !!params.assoc_value;
+ sctp_sk(sk)->ep->reconf_enable = !!params.assoc_value;
retval = 0;
@@ -4487,7 +4484,7 @@ static int sctp_setsockopt_interleaving_supported(struct sock *sk,
goto out;
}
- sp->strm_interleave = !!params.assoc_value;
+ sp->ep->intl_enable = !!params.assoc_value;
retval = 0;
@@ -4816,35 +4813,17 @@ out_nounlock:
static int sctp_connect(struct sock *sk, struct sockaddr *addr,
int addr_len, int flags)
{
- struct inet_sock *inet = inet_sk(sk);
struct sctp_af *af;
- int err = 0;
+ int err = -EINVAL;
lock_sock(sk);
-
pr_debug("%s: sk:%p, sockaddr:%p, addr_len:%d\n", __func__, sk,
addr, addr_len);
- /* We may need to bind the socket. */
- if (!inet->inet_num) {
- if (sk->sk_prot->get_port(sk, 0)) {
- release_sock(sk);
- return -EAGAIN;
- }
- inet->inet_sport = htons(inet->inet_num);
- }
-
/* Validate addr_len before calling common connect/connectx routine. */
- af = addr_len < offsetofend(struct sockaddr, sa_family) ? NULL :
- sctp_get_af_specific(addr->sa_family);
- if (!af || addr_len < af->sockaddr_len) {
- err = -EINVAL;
- } else {
- /* Pass correct addr len to common routine (so it knows there
- * is only one address being passed.
- */
+ af = sctp_get_af_specific(addr->sa_family);
+ if (af && addr_len >= af->sockaddr_len)
err = __sctp_connect(sk, addr, af->sockaddr_len, flags, NULL);
- }
release_sock(sk);
return err;
@@ -7346,7 +7325,7 @@ static int sctp_getsockopt_pr_supported(struct sock *sk, int len,
goto out;
}
- params.assoc_value = asoc ? asoc->prsctp_enable
+ params.assoc_value = asoc ? asoc->peer.prsctp_capable
: sctp_sk(sk)->ep->prsctp_enable;
if (put_user(len, optlen))
@@ -7554,7 +7533,7 @@ static int sctp_getsockopt_reconfig_supported(struct sock *sk, int len,
goto out;
}
- params.assoc_value = asoc ? asoc->reconf_enable
+ params.assoc_value = asoc ? asoc->peer.reconf_capable
: sctp_sk(sk)->ep->reconf_enable;
if (put_user(len, optlen))
@@ -7713,8 +7692,8 @@ static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len,
goto out;
}
- params.assoc_value = asoc ? asoc->intl_enable
- : sctp_sk(sk)->strm_interleave;
+ params.assoc_value = asoc ? asoc->peer.intl_capable
+ : sctp_sk(sk)->ep->intl_enable;
if (put_user(len, optlen))
goto out;
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 93ed07877337..25946604af85 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -153,13 +153,20 @@ out:
int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid)
{
struct sctp_stream_out_ext *soute;
+ int ret;
soute = kzalloc(sizeof(*soute), GFP_KERNEL);
if (!soute)
return -ENOMEM;
SCTP_SO(stream, sid)->ext = soute;
- return sctp_sched_init_sid(stream, sid, GFP_KERNEL);
+ ret = sctp_sched_init_sid(stream, sid, GFP_KERNEL);
+ if (ret) {
+ kfree(SCTP_SO(stream, sid)->ext);
+ SCTP_SO(stream, sid)->ext = NULL;
+ }
+
+ return ret;
}
void sctp_stream_free(struct sctp_stream *stream)
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index afbf1223d91c..40c40be23fcb 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -1358,6 +1358,6 @@ void sctp_stream_interleave_init(struct sctp_stream *stream)
struct sctp_association *asoc;
asoc = container_of(stream, struct sctp_association, stream);
- stream->si = asoc->intl_enable ? &sctp_stream_interleave_1
- : &sctp_stream_interleave_0;
+ stream->si = asoc->peer.intl_capable ? &sctp_stream_interleave_1
+ : &sctp_stream_interleave_0;
}
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index b8fa7ab3e394..99e5f69fbb74 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -228,7 +228,7 @@ int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid,
void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch)
{
if (!list_is_last(&ch->frag_list, &ch->msg->chunks) &&
- !q->asoc->intl_enable) {
+ !q->asoc->peer.intl_capable) {
struct sctp_stream_out *sout;
__u16 sid;
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 7621ec2f539c..302e355f2ebc 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -123,30 +123,11 @@ struct proto smc_proto6 = {
};
EXPORT_SYMBOL_GPL(smc_proto6);
-static int smc_release(struct socket *sock)
+static int __smc_release(struct smc_sock *smc)
{
- struct sock *sk = sock->sk;
- struct smc_sock *smc;
+ struct sock *sk = &smc->sk;
int rc = 0;
- if (!sk)
- goto out;
-
- smc = smc_sk(sk);
-
- /* cleanup for a dangling non-blocking connect */
- if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
- tcp_abort(smc->clcsock->sk, ECONNABORTED);
- flush_work(&smc->connect_work);
-
- if (sk->sk_state == SMC_LISTEN)
- /* smc_close_non_accepted() is called and acquires
- * sock lock for child sockets again
- */
- lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
- else
- lock_sock(sk);
-
if (!smc->use_fallback) {
rc = smc_close_active(smc);
sock_set_flag(sk, SOCK_DEAD);
@@ -174,6 +155,35 @@ static int smc_release(struct socket *sock)
smc_conn_free(&smc->conn);
}
+ return rc;
+}
+
+static int smc_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = 0;
+
+ if (!sk)
+ goto out;
+
+ smc = smc_sk(sk);
+
+ /* cleanup for a dangling non-blocking connect */
+ if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
+ tcp_abort(smc->clcsock->sk, ECONNABORTED);
+ flush_work(&smc->connect_work);
+
+ if (sk->sk_state == SMC_LISTEN)
+ /* smc_close_non_accepted() is called and acquires
+ * sock lock for child sockets again
+ */
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ else
+ lock_sock(sk);
+
+ rc = __smc_release(smc);
+
/* detach socket */
sock_orphan(sk);
sock->sk = NULL;
@@ -964,26 +974,7 @@ void smc_close_non_accepted(struct sock *sk)
if (!sk->sk_lingertime)
/* wait for peer closing */
sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
- if (!smc->use_fallback) {
- smc_close_active(smc);
- sock_set_flag(sk, SOCK_DEAD);
- sk->sk_shutdown |= SHUTDOWN_MASK;
- }
- sk->sk_prot->unhash(sk);
- if (smc->clcsock) {
- struct socket *tcp;
-
- tcp = smc->clcsock;
- smc->clcsock = NULL;
- sock_release(tcp);
- }
- if (smc->use_fallback) {
- sock_put(sk); /* passive closing */
- sk->sk_state = SMC_CLOSED;
- } else {
- if (sk->sk_state == SMC_CLOSED)
- smc_conn_free(&smc->conn);
- }
+ __smc_release(smc);
release_sock(sk);
sock_put(sk); /* final sock_put */
}
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 745afd82f281..49bcebff6378 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -97,17 +97,19 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4,
struct smc_clc_msg_proposal_prefix *prop)
{
struct in_device *in_dev = __in_dev_get_rcu(dst->dev);
+ const struct in_ifaddr *ifa;
if (!in_dev)
return -ENODEV;
- for_ifa(in_dev) {
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (!inet_ifa_match(ipv4, ifa))
continue;
prop->prefix_len = inet_mask_len(ifa->ifa_mask);
prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask;
/* prop->ipv6_prefixes_cnt = 0; already done by memset before */
return 0;
- } endfor_ifa(in_dev);
+ }
return -ENOENT;
}
@@ -190,14 +192,15 @@ static int smc_clc_prfx_match4_rcu(struct net_device *dev,
struct smc_clc_msg_proposal_prefix *prop)
{
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ const struct in_ifaddr *ifa;
if (!in_dev)
return -ENODEV;
- for_ifa(in_dev) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) &&
inet_ifa_match(prop->outgoing_subnet, ifa))
return 0;
- } endfor_ifa(in_dev);
+ }
return -ENOENT;
}
diff --git a/net/socket.c b/net/socket.c
index 38eec1583f6d..16449d6daeca 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -103,13 +103,6 @@
#include <net/busy_poll.h>
#include <linux/errqueue.h>
-/* proto_ops for ipv4 and ipv6 use the same {recv,send}msg function */
-#if IS_ENABLED(CONFIG_INET)
-#define INDIRECT_CALL_INET4(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
-#else
-#define INDIRECT_CALL_INET4(f, f1, ...) f(__VA_ARGS__)
-#endif
-
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int sysctl_net_busy_read __read_mostly;
unsigned int sysctl_net_busy_poll __read_mostly;
@@ -241,20 +234,13 @@ static struct kmem_cache *sock_inode_cachep __ro_after_init;
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
- struct socket_wq *wq;
ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
- wq = kmalloc(sizeof(*wq), GFP_KERNEL);
- if (!wq) {
- kmem_cache_free(sock_inode_cachep, ei);
- return NULL;
- }
- init_waitqueue_head(&wq->wait);
- wq->fasync_list = NULL;
- wq->flags = 0;
- ei->socket.wq = wq;
+ init_waitqueue_head(&ei->socket.wq.wait);
+ ei->socket.wq.fasync_list = NULL;
+ ei->socket.wq.flags = 0;
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
@@ -265,12 +251,11 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
return &ei->vfs_inode;
}
-static void sock_destroy_inode(struct inode *inode)
+static void sock_free_inode(struct inode *inode)
{
struct socket_alloc *ei;
ei = container_of(inode, struct socket_alloc, vfs_inode);
- kfree_rcu(ei->socket.wq, rcu);
kmem_cache_free(sock_inode_cachep, ei);
}
@@ -295,7 +280,7 @@ static void init_inodecache(void)
static const struct super_operations sockfs_ops = {
.alloc_inode = sock_alloc_inode,
- .destroy_inode = sock_destroy_inode,
+ .free_inode = sock_free_inode,
.statfs = simple_statfs,
};
@@ -429,7 +414,7 @@ static int sock_map_fd(struct socket *sock, int flags)
}
newfile = sock_alloc_file(sock, flags, NULL);
- if (likely(!IS_ERR(newfile))) {
+ if (!IS_ERR(newfile)) {
fd_install(fd, newfile);
return fd;
}
@@ -606,7 +591,7 @@ static void __sock_release(struct socket *sock, struct inode *inode)
module_put(owner);
}
- if (sock->wq->fasync_list)
+ if (sock->wq.fasync_list)
pr_err("%s: fasync list not empty!\n", __func__);
if (!sock->file) {
@@ -641,10 +626,13 @@ EXPORT_SYMBOL(__sock_tx_timestamp);
INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *,
size_t));
+INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *,
+ size_t));
static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
{
- int ret = INDIRECT_CALL_INET4(sock->ops->sendmsg, inet_sendmsg, sock,
- msg, msg_data_left(msg));
+ int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg,
+ inet_sendmsg, sock, msg,
+ msg_data_left(msg));
BUG_ON(ret == -EIOCBQUEUED);
return ret;
}
@@ -870,12 +858,15 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *,
- size_t , int ));
+ size_t, int));
+INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *,
+ size_t, int));
static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
int flags)
{
- return INDIRECT_CALL_INET4(sock->ops->recvmsg, inet_recvmsg, sock, msg,
- msg_data_left(msg), flags);
+ return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
+ inet_recvmsg, sock, msg, msg_data_left(msg),
+ flags);
}
/**
@@ -1289,13 +1280,12 @@ static int sock_fasync(int fd, struct file *filp, int on)
{
struct socket *sock = filp->private_data;
struct sock *sk = sock->sk;
- struct socket_wq *wq;
+ struct socket_wq *wq = &sock->wq;
if (sk == NULL)
return -EINVAL;
lock_sock(sk);
- wq = sock->wq;
fasync_helper(fd, filp, on, &wq->fasync_list);
if (!wq->fasync_list)
@@ -2051,6 +2041,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
static int __sys_setsockopt(int fd, int level, int optname,
char __user *optval, int optlen)
{
+ mm_segment_t oldfs = get_fs();
+ char *kernel_optval = NULL;
int err, fput_needed;
struct socket *sock;
@@ -2063,6 +2055,22 @@ static int __sys_setsockopt(int fd, int level, int optname,
if (err)
goto out_put;
+ err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level,
+ &optname, optval, &optlen,
+ &kernel_optval);
+
+ if (err < 0) {
+ goto out_put;
+ } else if (err > 0) {
+ err = 0;
+ goto out_put;
+ }
+
+ if (kernel_optval) {
+ set_fs(KERNEL_DS);
+ optval = (char __user __force *)kernel_optval;
+ }
+
if (level == SOL_SOCKET)
err =
sock_setsockopt(sock, level, optname, optval,
@@ -2071,6 +2079,11 @@ static int __sys_setsockopt(int fd, int level, int optname,
err =
sock->ops->setsockopt(sock, level, optname, optval,
optlen);
+
+ if (kernel_optval) {
+ set_fs(oldfs);
+ kfree(kernel_optval);
+ }
out_put:
fput_light(sock->file, fput_needed);
}
@@ -2093,6 +2106,7 @@ static int __sys_getsockopt(int fd, int level, int optname,
{
int err, fput_needed;
struct socket *sock;
+ int max_optlen;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock != NULL) {
@@ -2100,6 +2114,8 @@ static int __sys_getsockopt(int fd, int level, int optname,
if (err)
goto out_put;
+ max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+
if (level == SOL_SOCKET)
err =
sock_getsockopt(sock, level, optname, optval,
@@ -2108,6 +2124,10 @@ static int __sys_getsockopt(int fd, int level, int optname,
err =
sock->ops->getsockopt(sock, level, optname, optval,
optlen);
+
+ err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
+ optval, optlen,
+ max_optlen, err);
out_put:
fput_light(sock->file, fput_needed);
}
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index f64f36a83063..b3815c1e8f2e 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -157,18 +157,14 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
return 0;
}
- skb = alloc_skb(0, GFP_ATOMIC);
+ skb = alloc_skb_for_msg(head);
if (!skb) {
STRP_STATS_INCR(strp->stats.mem_fail);
desc->error = -ENOMEM;
return 0;
}
- skb->len = head->len;
- skb->data_len = head->len;
- skb->truesize = head->truesize;
- *_strp_msg(skb) = *_strp_msg(head);
+
strp->skb_nextp = &head->next;
- skb_shinfo(skb)->frag_list = head;
strp->skb_head = skb;
head = skb;
} else {
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 6c997d4a6218..1336f3cdad38 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -323,7 +323,7 @@ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb,
hdr = buf_msg(skb);
if (msg_user(hdr) == MSG_FRAGMENTER)
- hdr = msg_get_wrapped(hdr);
+ hdr = msg_inner_hdr(hdr);
if (msg_type(hdr) != TIPC_MCAST_MSG)
return 0;
@@ -392,7 +392,7 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
skb = skb_peek(pkts);
hdr = buf_msg(skb);
if (msg_user(hdr) == MSG_FRAGMENTER)
- hdr = msg_get_wrapped(hdr);
+ hdr = msg_inner_hdr(hdr);
msg_set_is_rcast(hdr, method->rcast);
/* Switch method ? */
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 2bed6589f41e..a809c0ec8d15 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -62,7 +62,7 @@ static struct tipc_bearer *bearer_get(struct net *net, int bearer_id)
{
struct tipc_net *tn = tipc_net(net);
- return rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ return rcu_dereference(tn->bearer_list[bearer_id]);
}
static void bearer_disable(struct net *net, struct tipc_bearer *b);
@@ -210,7 +210,7 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest)
struct tipc_bearer *b;
rcu_read_lock();
- b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ b = rcu_dereference(tn->bearer_list[bearer_id]);
if (b)
tipc_disc_add_dest(b->disc);
rcu_read_unlock();
@@ -222,7 +222,7 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest)
struct tipc_bearer *b;
rcu_read_lock();
- b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ b = rcu_dereference(tn->bearer_list[bearer_id]);
if (b)
tipc_disc_remove_dest(b->disc);
rcu_read_unlock();
@@ -444,7 +444,7 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
struct net_device *dev;
int delta;
- dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr);
+ dev = (struct net_device *)rcu_dereference(b->media_ptr);
if (!dev)
return 0;
@@ -481,7 +481,7 @@ int tipc_bearer_mtu(struct net *net, u32 bearer_id)
struct tipc_bearer *b;
rcu_read_lock();
- b = rcu_dereference_rtnl(tipc_net(net)->bearer_list[bearer_id]);
+ b = rcu_dereference(tipc_net(net)->bearer_list[bearer_id]);
if (b)
mtu = b->mtu;
rcu_read_unlock();
@@ -574,8 +574,8 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
struct tipc_bearer *b;
rcu_read_lock();
- b = rcu_dereference_rtnl(dev->tipc_ptr) ?:
- rcu_dereference_rtnl(orig_dev->tipc_ptr);
+ b = rcu_dereference(dev->tipc_ptr) ?:
+ rcu_dereference(orig_dev->tipc_ptr);
if (likely(b && test_bit(0, &b->up) &&
(skb->pkt_type <= PACKET_MULTICAST))) {
skb_mark_not_on_list(skb);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 2050fd386642..66d3a07bc571 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -107,7 +107,6 @@ struct tipc_stats {
* @backlogq: queue for messages waiting to be sent
* @snt_nxt: next sequence number to use for outbound messages
* @prev_from: sequence number of most previous retransmission request
- * @stale_cnt: counter for number of identical retransmit attempts
* @stale_limit: time when repeated identical retransmits must force link reset
* @ackers: # of peers that needs to ack each packet before it can be released
* @acked: # last packet acked by a certain peer. Used for broadcast.
@@ -167,7 +166,6 @@ struct tipc_link {
u16 snd_nxt;
u16 prev_from;
u16 window;
- u16 stale_cnt;
unsigned long stale_limit;
/* Reception */
@@ -209,7 +207,7 @@ enum {
BC_NACK_SND_SUPPRESS,
};
-#define TIPC_BC_RETR_LIM msecs_to_jiffies(10) /* [ms] */
+#define TIPC_BC_RETR_LIM (jiffies + msecs_to_jiffies(10))
#define TIPC_UC_RETR_TIME (jiffies + msecs_to_jiffies(1))
/*
@@ -249,9 +247,9 @@ static void tipc_link_build_bc_init_msg(struct tipc_link *l,
struct sk_buff_head *xmitq);
static bool tipc_link_release_pkts(struct tipc_link *l, u16 to);
static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data);
-static void tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
- struct tipc_gap_ack_blks *ga,
- struct sk_buff_head *xmitq);
+static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
+ struct tipc_gap_ack_blks *ga,
+ struct sk_buff_head *xmitq);
/*
* Simple non-static link routines (i.e. referenced outside this file)
@@ -734,7 +732,7 @@ static void link_profile_stats(struct tipc_link *l)
if (msg_user(msg) == MSG_FRAGMENTER) {
if (msg_type(msg) != FIRST_FRAGMENT)
return;
- length = msg_size(msg_get_wrapped(msg));
+ length = msg_size(msg_inner_hdr(msg));
}
l->stats.msg_lengths_total += length;
l->stats.msg_length_counts++;
@@ -910,7 +908,6 @@ void tipc_link_reset(struct tipc_link *l)
l->acked = 0;
l->silent_intv_cnt = 0;
l->rst_cnt = 0;
- l->stale_cnt = 0;
l->bc_peer_is_up = false;
memset(&l->mon_state, 0, sizeof(l->mon_state));
tipc_link_reset_stats(l);
@@ -979,8 +976,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
__skb_queue_tail(transmq, skb);
/* next retransmit attempt */
if (link_is_bc_sndlink(l))
- TIPC_SKB_CB(skb)->nxt_retr =
- jiffies + TIPC_BC_RETR_LIM;
+ TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM;
__skb_queue_tail(xmitq, _skb);
TIPC_SKB_CB(skb)->ackers = l->ackers;
l->rcv_unacked = 0;
@@ -1030,7 +1026,7 @@ static void tipc_link_advance_backlog(struct tipc_link *l,
__skb_queue_tail(&l->transmq, skb);
/* next retransmit attempt */
if (link_is_bc_sndlink(l))
- TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM;
+ TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM;
__skb_queue_tail(xmitq, _skb);
TIPC_SKB_CB(skb)->ackers = l->ackers;
@@ -1044,32 +1040,68 @@ static void tipc_link_advance_backlog(struct tipc_link *l,
l->snd_nxt = seqno;
}
-static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb)
+/**
+ * link_retransmit_failure() - Detect repeated retransmit failures
+ * @l: tipc link sender
+ * @r: tipc link receiver (= l in case of unicast)
+ * @from: seqno of the 1st packet in retransmit request
+ * @rc: returned code
+ *
+ * Return: true if the repeated retransmit failures happens, otherwise
+ * false
+ */
+static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r,
+ u16 from, int *rc)
{
- struct tipc_msg *hdr = buf_msg(skb);
+ struct sk_buff *skb = skb_peek(&l->transmq);
+ struct tipc_msg *hdr;
+
+ if (!skb)
+ return false;
+ hdr = buf_msg(skb);
+
+ /* Detect repeated retransmit failures on same packet */
+ if (r->prev_from != from) {
+ r->prev_from = from;
+ r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance);
+ } else if (time_after(jiffies, r->stale_limit)) {
+ pr_warn("Retransmission failure on link <%s>\n", l->name);
+ link_print(l, "State of link ");
+ pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n",
+ msg_user(hdr), msg_type(hdr), msg_size(hdr),
+ msg_errcode(hdr));
+ pr_info("sqno %u, prev: %x, src: %x\n",
+ msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr));
+
+ trace_tipc_list_dump(&l->transmq, true, "retrans failure!");
+ trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!");
+ trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!");
+
+ if (link_is_bc_sndlink(l))
+ *rc = TIPC_LINK_DOWN_EVT;
+
+ *rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
+ return true;
+ }
- pr_warn("Retransmission failure on link <%s>\n", l->name);
- link_print(l, "State of link ");
- pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n",
- msg_user(hdr), msg_type(hdr), msg_size(hdr), msg_errcode(hdr));
- pr_info("sqno %u, prev: %x, src: %x\n",
- msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr));
+ return false;
}
-/* tipc_link_retrans() - retransmit one or more packets
+/* tipc_link_bc_retrans() - retransmit zero or more packets
* @l: the link to transmit on
* @r: the receiving link ordering the retransmit. Same as l if unicast
* @from: retransmit from (inclusive) this sequence number
* @to: retransmit to (inclusive) this sequence number
* xmitq: queue for accumulating the retransmitted packets
*/
-static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
- u16 from, u16 to, struct sk_buff_head *xmitq)
+static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r,
+ u16 from, u16 to, struct sk_buff_head *xmitq)
{
struct sk_buff *_skb, *skb = skb_peek(&l->transmq);
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
u16 ack = l->rcv_nxt - 1;
struct tipc_msg *hdr;
+ int rc = 0;
if (!skb)
return 0;
@@ -1077,20 +1109,9 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
return 0;
trace_tipc_link_retrans(r, from, to, &l->transmq);
- /* Detect repeated retransmit failures on same packet */
- if (r->prev_from != from) {
- r->prev_from = from;
- r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance);
- r->stale_cnt = 0;
- } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) {
- link_retransmit_failure(l, skb);
- trace_tipc_list_dump(&l->transmq, true, "retrans failure!");
- trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!");
- trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!");
- if (link_is_bc_sndlink(l))
- return TIPC_LINK_DOWN_EVT;
- return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
- }
+
+ if (link_retransmit_failure(l, r, from, &rc))
+ return rc;
skb_queue_walk(&l->transmq, skb) {
hdr = buf_msg(skb);
@@ -1101,9 +1122,9 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
if (link_is_bc_sndlink(l)) {
if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr))
continue;
- TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM;
+ TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM;
}
- _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC);
+ _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC);
if (!_skb)
return 0;
hdr = buf_msg(_skb);
@@ -1324,17 +1345,23 @@ exit:
* @gap: # of gap packets
* @ga: buffer pointer to Gap ACK blocks from peer
* @xmitq: queue for accumulating the retransmitted packets if any
+ *
+ * In case of a repeated retransmit failures, the call will return shortly
+ * with a returned code (e.g. TIPC_LINK_DOWN_EVT)
*/
-static void tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
- struct tipc_gap_ack_blks *ga,
- struct sk_buff_head *xmitq)
+static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
+ struct tipc_gap_ack_blks *ga,
+ struct sk_buff_head *xmitq)
{
struct sk_buff *skb, *_skb, *tmp;
struct tipc_msg *hdr;
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
u16 ack = l->rcv_nxt - 1;
- u16 seqno;
- u16 n = 0;
+ u16 seqno, n = 0;
+ int rc = 0;
+
+ if (gap && link_retransmit_failure(l, l, acked + 1, &rc))
+ return rc;
skb_queue_walk_safe(&l->transmq, skb, tmp) {
seqno = buf_seqno(skb);
@@ -1369,6 +1396,8 @@ next_gap_ack:
goto next_gap_ack;
}
}
+
+ return 0;
}
/* tipc_link_build_state_msg: prepare link state message for transmission
@@ -1481,7 +1510,6 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
/* Forward queues and wake up waiting users */
if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) {
- l->stale_cnt = 0;
tipc_link_advance_backlog(l, xmitq);
if (unlikely(!skb_queue_empty(&l->wakeupq)))
link_prepare_wakeup(l);
@@ -1918,7 +1946,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
tipc_link_build_proto_msg(l, STATE_MSG, 0, reply,
rcvgap, 0, 0, xmitq);
- tipc_link_advance_transmq(l, ack, gap, ga, xmitq);
+ rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq);
/* If NACK, retransmit will now start at right position */
if (gap)
@@ -2035,7 +2063,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
if (more(peers_snd_nxt, l->rcv_nxt + l->window))
return rc;
- rc = tipc_link_retrans(snd_l, l, from, to, xmitq);
+ rc = tipc_link_bc_retrans(snd_l, l, from, to, xmitq);
l->snd_nxt = peers_snd_nxt;
if (link_bc_rcv_gap(l))
@@ -2131,7 +2159,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
if (dnode == tipc_own_addr(l->net)) {
tipc_link_bc_ack_rcv(l, acked, xmitq);
- rc = tipc_link_retrans(l->bc_sndlink, l, from, to, xmitq);
+ rc = tipc_link_bc_retrans(l->bc_sndlink, l, from, to, xmitq);
l->stats.recv_nacks++;
return rc;
}
@@ -2550,7 +2578,7 @@ int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf)
i += scnprintf(buf + i, sz - i, " %u", l->silent_intv_cnt);
i += scnprintf(buf + i, sz - i, " %u", l->rst_cnt);
i += scnprintf(buf + i, sz - i, " %u", l->prev_from);
- i += scnprintf(buf + i, sz - i, " %u", l->stale_cnt);
+ i += scnprintf(buf + i, sz - i, " %u", 0);
i += scnprintf(buf + i, sz - i, " %u", l->acked);
list = &l->transmq;
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 8de02ad6e352..da509f0eb9ca 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -308,7 +308,7 @@ static inline unchar *msg_data(struct tipc_msg *m)
return ((unchar *)m) + msg_hdr_sz(m);
}
-static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m)
+static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m)
{
return (struct tipc_msg *)msg_data(m);
}
@@ -486,7 +486,7 @@ static inline void msg_set_prevnode(struct tipc_msg *m, u32 a)
static inline u32 msg_origport(struct tipc_msg *m)
{
if (msg_user(m) == MSG_FRAGMENTER)
- m = msg_get_wrapped(m);
+ m = msg_inner_hdr(m);
return msg_word(m, 4);
}
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 99bd166bccec..d6165ad384c0 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -261,7 +261,7 @@ struct genl_family tipc_genl_family __ro_after_init = {
.version = TIPC_GENL_V2_VERSION,
.hdrsize = 0,
.maxattr = TIPC_NLA_MAX,
- .policy = tipc_nl_policy,
+ .policy = tipc_nl_policy,
.netnsok = true,
.module = THIS_MODULE,
.ops = tipc_genl_v2_ops,
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index cf155061c472..d86030ef1232 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -691,7 +691,6 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb,
struct nlattr *prop;
struct nlattr *media;
struct tipc_link_config *lc;
- int len;
lc = (struct tipc_link_config *)TLV_DATA(msg->req);
@@ -699,10 +698,6 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb,
if (!media)
return -EMSGSIZE;
- len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_MEDIA_NAME);
- if (!string_is_valid(lc->name, len))
- return -EINVAL;
-
if (nla_put_string(skb, TIPC_NLA_MEDIA_NAME, lc->name))
return -EMSGSIZE;
@@ -723,7 +718,6 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb,
struct nlattr *prop;
struct nlattr *bearer;
struct tipc_link_config *lc;
- int len;
lc = (struct tipc_link_config *)TLV_DATA(msg->req);
@@ -731,10 +725,6 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb,
if (!bearer)
return -EMSGSIZE;
- len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_MEDIA_NAME);
- if (!string_is_valid(lc->name, len))
- return -EINVAL;
-
if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, lc->name))
return -EMSGSIZE;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 550581d47d51..324a1f91b394 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1649,7 +1649,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
int usr = msg_user(hdr);
int mtyp = msg_type(hdr);
u16 oseqno = msg_seqno(hdr);
- u16 iseqno = msg_seqno(msg_get_wrapped(hdr));
+ u16 iseqno = msg_seqno(msg_inner_hdr(hdr));
u16 exp_pkts = msg_msgcnt(hdr);
u16 rcv_nxt, syncpt, dlv_nxt, inputq_len;
int state = n->state;
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 1405ccc9101c..287df68721df 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -76,6 +76,7 @@ struct udp_media_addr {
/* struct udp_replicast - container for UDP remote addresses */
struct udp_replicast {
struct udp_media_addr addr;
+ struct dst_cache dst_cache;
struct rcu_head rcu;
struct list_head list;
};
@@ -158,22 +159,27 @@ static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a)
/* tipc_send_msg - enqueue a send request */
static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
struct udp_bearer *ub, struct udp_media_addr *src,
- struct udp_media_addr *dst)
+ struct udp_media_addr *dst, struct dst_cache *cache)
{
+ struct dst_entry *ndst = dst_cache_get(cache);
int ttl, err = 0;
- struct rtable *rt;
if (dst->proto == htons(ETH_P_IP)) {
- struct flowi4 fl = {
- .daddr = dst->ipv4.s_addr,
- .saddr = src->ipv4.s_addr,
- .flowi4_mark = skb->mark,
- .flowi4_proto = IPPROTO_UDP
- };
- rt = ip_route_output_key(net, &fl);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto tx_error;
+ struct rtable *rt = (struct rtable *)ndst;
+
+ if (!rt) {
+ struct flowi4 fl = {
+ .daddr = dst->ipv4.s_addr,
+ .saddr = src->ipv4.s_addr,
+ .flowi4_mark = skb->mark,
+ .flowi4_proto = IPPROTO_UDP
+ };
+ rt = ip_route_output_key(net, &fl);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto tx_error;
+ }
+ dst_cache_set_ip4(cache, &rt->dst, fl.saddr);
}
ttl = ip4_dst_hoplimit(&rt->dst);
@@ -182,17 +188,19 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
dst->port, false, true);
#if IS_ENABLED(CONFIG_IPV6)
} else {
- struct dst_entry *ndst;
- struct flowi6 fl6 = {
- .flowi6_oif = ub->ifindex,
- .daddr = dst->ipv6,
- .saddr = src->ipv6,
- .flowi6_proto = IPPROTO_UDP
- };
- err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, &ndst,
- &fl6);
- if (err)
- goto tx_error;
+ if (!ndst) {
+ struct flowi6 fl6 = {
+ .flowi6_oif = ub->ifindex,
+ .daddr = dst->ipv6,
+ .saddr = src->ipv6,
+ .flowi6_proto = IPPROTO_UDP
+ };
+ err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk,
+ &ndst, &fl6);
+ if (err)
+ goto tx_error;
+ dst_cache_set_ip6(cache, ndst, &fl6.saddr);
+ }
ttl = ip6_dst_hoplimit(ndst);
err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL,
&src->ipv6, &dst->ipv6, 0, ttl, 0,
@@ -223,14 +231,15 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
}
skb_set_inner_protocol(skb, htons(ETH_P_TIPC));
- ub = rcu_dereference_rtnl(b->media_ptr);
+ ub = rcu_dereference(b->media_ptr);
if (!ub) {
err = -ENODEV;
goto out;
}
if (addr->broadcast != TIPC_REPLICAST_SUPPORT)
- return tipc_udp_xmit(net, skb, ub, src, dst);
+ return tipc_udp_xmit(net, skb, ub, src, dst,
+ &ub->rcast.dst_cache);
/* Replicast, send an skb to each configured IP address */
list_for_each_entry_rcu(rcast, &ub->rcast.list, list) {
@@ -242,7 +251,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
goto out;
}
- err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr);
+ err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr,
+ &rcast->dst_cache);
if (err)
goto out;
}
@@ -286,6 +296,11 @@ static int tipc_udp_rcast_add(struct tipc_bearer *b,
if (!rcast)
return -ENOMEM;
+ if (dst_cache_init(&rcast->dst_cache, GFP_ATOMIC)) {
+ kfree(rcast);
+ return -ENOMEM;
+ }
+
memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr));
if (ntohs(addr->proto) == ETH_P_IP)
@@ -475,7 +490,7 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb)
}
}
- ub = rcu_dereference_rtnl(b->media_ptr);
+ ub = rtnl_dereference(b->media_ptr);
if (!ub) {
rtnl_unlock();
return -EINVAL;
@@ -517,7 +532,7 @@ int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b)
struct udp_bearer *ub;
struct nlattr *nest;
- ub = rcu_dereference_rtnl(b->media_ptr);
+ ub = rtnl_dereference(b->media_ptr);
if (!ub)
return -ENODEV;
@@ -742,6 +757,10 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
tuncfg.encap_destroy = NULL;
setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg);
+ err = dst_cache_init(&ub->rcast.dst_cache, GFP_ATOMIC);
+ if (err)
+ goto free;
+
/**
* The bcast media address port is used for all peers and the ip
* is used if it's a multicast address.
@@ -752,12 +771,14 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
else
err = tipc_udp_rcast_add(b, &remote);
if (err)
- goto err;
+ goto free;
return 0;
+
+free:
+ dst_cache_destroy(&ub->rcast.dst_cache);
+ udp_tunnel_sock_release(ub->ubsock);
err:
- if (ub->ubsock)
- udp_tunnel_sock_release(ub->ubsock);
kfree(ub);
return err;
}
@@ -769,12 +790,13 @@ static void cleanup_bearer(struct work_struct *work)
struct udp_replicast *rcast, *tmp;
list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
+ dst_cache_destroy(&rcast->dst_cache);
list_del_rcu(&rcast->list);
kfree_rcu(rcast, rcu);
}
- if (ub->ubsock)
- udp_tunnel_sock_release(ub->ubsock);
+ dst_cache_destroy(&ub->rcast.dst_cache);
+ udp_tunnel_sock_release(ub->ubsock);
synchronize_net();
kfree(ub);
}
@@ -784,13 +806,12 @@ static void tipc_udp_disable(struct tipc_bearer *b)
{
struct udp_bearer *ub;
- ub = rcu_dereference_rtnl(b->media_ptr);
+ ub = rtnl_dereference(b->media_ptr);
if (!ub) {
pr_err("UDP bearer instance not found\n");
return;
}
- if (ub->ubsock)
- sock_set_flag(ub->ubsock->sk, SOCK_DEAD);
+ sock_set_flag(ub->ubsock->sk, SOCK_DEAD);
RCU_INIT_POINTER(ub->bearer, NULL);
/* sock_release need to be done outside of rtnl lock */
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 1f9cf57d9754..7c0b2b778703 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -61,7 +61,7 @@ static void tls_device_free_ctx(struct tls_context *ctx)
if (ctx->rx_conf == TLS_HW)
kfree(tls_offload_ctx_rx(ctx));
- kfree(ctx);
+ tls_ctx_free(ctx);
}
static void tls_device_gc_task(struct work_struct *work)
@@ -209,6 +209,33 @@ void tls_device_free_resources_tx(struct sock *sk)
tls_free_partial_record(sk, tls_ctx);
}
+static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx,
+ u32 seq)
+{
+ struct net_device *netdev;
+ struct sk_buff *skb;
+ int err = 0;
+ u8 *rcd_sn;
+
+ skb = tcp_write_queue_tail(sk);
+ if (skb)
+ TCP_SKB_CB(skb)->eor = 1;
+
+ rcd_sn = tls_ctx->tx.rec_seq;
+
+ down_read(&device_offload_lock);
+ netdev = tls_ctx->netdev;
+ if (netdev)
+ err = netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq,
+ rcd_sn,
+ TLS_OFFLOAD_CTX_DIR_TX);
+ up_read(&device_offload_lock);
+ if (err)
+ return;
+
+ clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags);
+}
+
static void tls_append_frag(struct tls_record_info *record,
struct page_frag *pfrag,
int size)
@@ -252,7 +279,7 @@ static int tls_push_record(struct sock *sk,
skb_frag_address(frag),
record->len - prot->prepend_size,
record_type,
- ctx->crypto_send.info.version);
+ prot->version);
/* HW doesn't care about the data in the tag, because it fills it. */
dummy_tag_frag.page = skb_frag_page(frag);
@@ -264,7 +291,11 @@ static int tls_push_record(struct sock *sk,
list_add_tail(&record->list, &offload_ctx->records_list);
spin_unlock_irq(&offload_ctx->lock);
offload_ctx->open_record = NULL;
- tls_advance_record_sn(sk, &ctx->tx, ctx->crypto_send.info.version);
+
+ if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags))
+ tls_device_resync_tx(sk, ctx, tp->write_seq);
+
+ tls_advance_record_sn(sk, prot, &ctx->tx);
for (i = 0; i < record->num_frags; i++) {
frag = &record->frags[i];
@@ -551,7 +582,7 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
}
static void tls_device_resync_rx(struct tls_context *tls_ctx,
- struct sock *sk, u32 seq, u64 rcd_sn)
+ struct sock *sk, u32 seq, u8 *rcd_sn)
{
struct net_device *netdev;
@@ -559,14 +590,17 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx,
return;
netdev = READ_ONCE(tls_ctx->netdev);
if (netdev)
- netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, rcd_sn);
+ netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn,
+ TLS_OFFLOAD_CTX_DIR_RX);
clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags);
}
-void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
+void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_offload_context_rx *rx_ctx;
+ u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE];
+ struct tls_prot_info *prot;
u32 is_req_pending;
s64 resync_req;
u32 req_seq;
@@ -574,15 +608,83 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
if (tls_ctx->rx_conf != TLS_HW)
return;
+ prot = &tls_ctx->prot_info;
rx_ctx = tls_offload_ctx_rx(tls_ctx);
- resync_req = atomic64_read(&rx_ctx->resync_req);
- req_seq = (resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1);
- is_req_pending = resync_req;
+ memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size);
- if (unlikely(is_req_pending) && req_seq == seq &&
- atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) {
+ switch (rx_ctx->resync_type) {
+ case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ:
+ resync_req = atomic64_read(&rx_ctx->resync_req);
+ req_seq = resync_req >> 32;
seq += TLS_HEADER_SIZE - 1;
- tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn);
+ is_req_pending = resync_req;
+
+ if (likely(!is_req_pending) || req_seq != seq ||
+ !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
+ return;
+ break;
+ case TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT:
+ if (likely(!rx_ctx->resync_nh_do_now))
+ return;
+
+ /* head of next rec is already in, note that the sock_inq will
+ * include the currently parsed message when called from parser
+ */
+ if (tcp_inq(sk) > rcd_len)
+ return;
+
+ rx_ctx->resync_nh_do_now = 0;
+ seq += rcd_len;
+ tls_bigint_increment(rcd_sn, prot->rec_seq_size);
+ break;
+ }
+
+ tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn);
+}
+
+static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx,
+ struct tls_offload_context_rx *ctx,
+ struct sock *sk, struct sk_buff *skb)
+{
+ struct strp_msg *rxm;
+
+ /* device will request resyncs by itself based on stream scan */
+ if (ctx->resync_type != TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT)
+ return;
+ /* already scheduled */
+ if (ctx->resync_nh_do_now)
+ return;
+ /* seen decrypted fragments since last fully-failed record */
+ if (ctx->resync_nh_reset) {
+ ctx->resync_nh_reset = 0;
+ ctx->resync_nh.decrypted_failed = 1;
+ ctx->resync_nh.decrypted_tgt = TLS_DEVICE_RESYNC_NH_START_IVAL;
+ return;
+ }
+
+ if (++ctx->resync_nh.decrypted_failed <= ctx->resync_nh.decrypted_tgt)
+ return;
+
+ /* doing resync, bump the next target in case it fails */
+ if (ctx->resync_nh.decrypted_tgt < TLS_DEVICE_RESYNC_NH_MAX_IVAL)
+ ctx->resync_nh.decrypted_tgt *= 2;
+ else
+ ctx->resync_nh.decrypted_tgt += TLS_DEVICE_RESYNC_NH_MAX_IVAL;
+
+ rxm = strp_msg(skb);
+
+ /* head of next rec is already in, parser will sync for us */
+ if (tcp_inq(sk) > rxm->full_len) {
+ ctx->resync_nh_do_now = 1;
+ } else {
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+ u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE];
+
+ memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size);
+ tls_bigint_increment(rcd_sn, prot->rec_seq_size);
+
+ tls_device_resync_rx(tls_ctx, sk, tcp_sk(sk)->copied_seq,
+ rcd_sn);
}
}
@@ -610,8 +712,10 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
sg_set_buf(&sg[0], buf,
rxm->full_len + TLS_HEADER_SIZE +
TLS_CIPHER_AES_GCM_128_IV_SIZE);
- skb_copy_bits(skb, offset, buf,
- TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE);
+ err = skb_copy_bits(skb, offset, buf,
+ TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE);
+ if (err)
+ goto free_buf;
/* We are interested only in the decrypted data not the auth */
err = decrypt_skb(sk, skb, sg);
@@ -625,8 +729,11 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
if (skb_pagelen(skb) > offset) {
copy = min_t(int, skb_pagelen(skb) - offset, data_len);
- if (skb->decrypted)
- skb_store_bits(skb, offset, buf, copy);
+ if (skb->decrypted) {
+ err = skb_store_bits(skb, offset, buf, copy);
+ if (err)
+ goto free_buf;
+ }
offset += copy;
buf += copy;
@@ -649,8 +756,11 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
copy = min_t(int, skb_iter->len - frag_pos,
data_len + rxm->offset - offset);
- if (skb_iter->decrypted)
- skb_store_bits(skb_iter, frag_pos, buf, copy);
+ if (skb_iter->decrypted) {
+ err = skb_store_bits(skb_iter, frag_pos, buf, copy);
+ if (err)
+ goto free_buf;
+ }
offset += copy;
buf += copy;
@@ -671,10 +781,6 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
int is_encrypted = !is_decrypted;
struct sk_buff *skb_iter;
- /* Skip if it is already decrypted */
- if (ctx->sw.decrypted)
- return 0;
-
/* Check if all the data is decrypted already */
skb_walk_frags(skb, skb_iter) {
is_decrypted &= skb_iter->decrypted;
@@ -683,12 +789,21 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
ctx->sw.decrypted |= is_decrypted;
- /* Return immedeatly if the record is either entirely plaintext or
+ /* Return immediately if the record is either entirely plaintext or
* entirely ciphertext. Otherwise handle reencrypt partially decrypted
* record.
*/
- return (is_encrypted || is_decrypted) ? 0 :
- tls_device_reencrypt(sk, skb);
+ if (is_decrypted) {
+ ctx->resync_nh_reset = 1;
+ return 0;
+ }
+ if (is_encrypted) {
+ tls_device_core_ctrl_rx_resync(tls_ctx, ctx, sk, skb);
+ return 0;
+ }
+
+ ctx->resync_nh_reset = 1;
+ return tls_device_reencrypt(sk, skb);
}
static void tls_device_attach(struct tls_context *ctx, struct sock *sk,
@@ -742,6 +857,11 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
}
crypto_info = &ctx->crypto_send.info;
+ if (crypto_info->version != TLS_1_2_VERSION) {
+ rc = -EOPNOTSUPP;
+ goto free_offload_ctx;
+ }
+
switch (crypto_info->cipher_type) {
case TLS_CIPHER_AES_GCM_128:
nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
@@ -757,6 +877,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
goto free_offload_ctx;
}
+ /* Sanity-check the rec_seq_size for stack allocations */
+ if (rec_seq_size > TLS_MAX_REC_SEQ_SIZE) {
+ rc = -EINVAL;
+ goto free_offload_ctx;
+ }
+
+ prot->version = crypto_info->version;
+ prot->cipher_type = crypto_info->cipher_type;
prot->prepend_size = TLS_HEADER_SIZE + nonce_size;
prot->tag_size = tag_size;
prot->overhead_size = prot->prepend_size + prot->tag_size;
@@ -876,6 +1004,9 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
struct net_device *netdev;
int rc = 0;
+ if (ctx->crypto_recv.info.version != TLS_1_2_VERSION)
+ return -EOPNOTSUPP;
+
/* We support starting offload on multiple sockets
* concurrently, so we only need a read lock here.
* This lock must precede get_netdev_for_sock to prevent races between
@@ -908,6 +1039,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
rc = -ENOMEM;
goto release_netdev;
}
+ context->resync_nh_reset = 1;
ctx->priv_ctx_rx = context;
rc = tls_set_sw_offload(sk, ctx, 0);
@@ -1015,7 +1147,7 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event,
case NETDEV_REGISTER:
case NETDEV_FEAT_CHANGE:
if ((dev->features & NETIF_F_HW_TLS_RX) &&
- !dev->tlsdev_ops->tls_dev_resync_rx)
+ !dev->tlsdev_ops->tls_dev_resync)
return NOTIFY_BAD;
if (dev->tlsdev_ops &&
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index c3a5fe624b4e..9070d68a92a4 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -209,6 +209,10 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln)
update_chksum(nskb, headln);
+ /* sock_efree means skb must gone through skb_orphan_partial() */
+ if (nskb->destructor == sock_efree)
+ return;
+
delta = nskb->truesize - skb->truesize;
if (likely(delta < 0))
WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc));
@@ -240,7 +244,6 @@ static int fill_sg_in(struct scatterlist *sg_in,
record = tls_get_record(ctx, tcp_seq, rcd_sn);
if (!record) {
spin_unlock_irqrestore(&ctx->lock, flags);
- WARN(1, "Record not found for seq %u\n", tcp_seq);
return -EINVAL;
}
@@ -409,7 +412,10 @@ put_sg:
put_page(sg_page(&sg_in[--resync_sgs]));
kfree(sg_in);
free_orig:
- kfree_skb(skb);
+ if (nskb)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
return nskb;
}
@@ -424,6 +430,12 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
}
EXPORT_SYMBOL_GPL(tls_validate_xmit_skb);
+struct sk_buff *tls_encrypt_skb(struct sk_buff *skb)
+{
+ return tls_sw_fallback(skb->sk, skb);
+}
+EXPORT_SYMBOL_GPL(tls_encrypt_skb);
+
int tls_sw_fallback_init(struct sock *sk,
struct tls_offload_context_tx *offload_ctx,
struct tls_crypto_info *crypto_info)
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index e2b69e805d46..4674e57e66b0 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -251,7 +251,7 @@ static void tls_write_space(struct sock *sk)
ctx->sk_write_space(sk);
}
-static void tls_ctx_free(struct tls_context *ctx)
+void tls_ctx_free(struct tls_context *ctx)
{
if (!ctx)
return;
@@ -643,7 +643,7 @@ static void tls_hw_sk_destruct(struct sock *sk)
ctx->sk_destruct(sk);
/* Free ctx */
- kfree(ctx);
+ tls_ctx_free(ctx);
icsk->icsk_ulp_data = NULL;
}
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 455a782c7658..53b4ad94e74a 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -534,7 +534,7 @@ static int tls_do_encryption(struct sock *sk,
/* Unhook the record from context if encryption is not failure */
ctx->open_rec = NULL;
- tls_advance_record_sn(sk, &tls_ctx->tx, prot->version);
+ tls_advance_record_sn(sk, prot, &tls_ctx->tx);
return rc;
}
@@ -1485,15 +1485,16 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct tls_prot_info *prot = &tls_ctx->prot_info;
- int version = prot->version;
struct strp_msg *rxm = strp_msg(skb);
int pad, err = 0;
if (!ctx->decrypted) {
#ifdef CONFIG_TLS_DEVICE
- err = tls_device_decrypted(sk, skb);
- if (err < 0)
- return err;
+ if (tls_ctx->rx_conf == TLS_HW) {
+ err = tls_device_decrypted(sk, skb);
+ if (err < 0)
+ return err;
+ }
#endif
/* Still not decrypted after tls_device */
if (!ctx->decrypted) {
@@ -1501,8 +1502,8 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
async);
if (err < 0) {
if (err == -EINPROGRESS)
- tls_advance_record_sn(sk, &tls_ctx->rx,
- version);
+ tls_advance_record_sn(sk, prot,
+ &tls_ctx->rx);
return err;
}
@@ -1517,7 +1518,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
rxm->full_len -= pad;
rxm->offset += prot->prepend_size;
rxm->full_len -= prot->overhead_size;
- tls_advance_record_sn(sk, &tls_ctx->rx, version);
+ tls_advance_record_sn(sk, prot, &tls_ctx->rx);
ctx->decrypted = true;
ctx->saved_data_ready(sk);
} else {
@@ -1958,7 +1959,8 @@ bool tls_sw_stream_read(const struct sock *sk)
ingress_empty = list_empty(&psock->ingress_msg);
rcu_read_unlock();
- return !ingress_empty || ctx->recv_pkt;
+ return !ingress_empty || ctx->recv_pkt ||
+ !skb_queue_empty(&ctx->rx_list);
}
static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
@@ -2013,8 +2015,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
goto read_failure;
}
#ifdef CONFIG_TLS_DEVICE
- handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset,
- *(u64*)tls_ctx->rx.rec_seq);
+ tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE,
+ TCP_SKB_CB(skb)->seq + rxm->offset);
#endif
return data_len + TLS_HEADER_SIZE;
@@ -2281,8 +2283,9 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
goto free_priv;
}
- /* Sanity-check the IV size for stack allocations. */
- if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) {
+ /* Sanity-check the sizes for stack allocations. */
+ if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE ||
+ rec_seq_size > TLS_MAX_REC_SEQ_SIZE) {
rc = -EINVAL;
goto free_priv;
}
diff --git a/net/unix/diag.c b/net/unix/diag.c
index c51a707260fa..9ff64f9df1f3 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -5,9 +5,11 @@
#include <linux/unix_diag.h>
#include <linux/skbuff.h>
#include <linux/module.h>
+#include <linux/uidgid.h>
#include <net/netlink.h>
#include <net/af_unix.h>
#include <net/tcp_states.h>
+#include <net/sock.h>
static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
{
@@ -111,6 +113,12 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb)
return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql);
}
+static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb)
+{
+ uid_t uid = from_kuid_munged(sk_user_ns(nlskb->sk), sock_i_uid(sk));
+ return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid);
+}
+
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req,
u32 portid, u32 seq, u32 flags, int sk_ino)
{
@@ -157,6 +165,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown))
goto out_nlmsg_trim;
+ if ((req->udiag_show & UDIAG_SHOW_UID) &&
+ sk_diag_dump_uid(sk, skb))
+ goto out_nlmsg_trim;
+
nlmsg_end(skb, nlh);
return 0;
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 169112f8aa1e..ab47bf3ab66e 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -274,7 +274,8 @@ EXPORT_SYMBOL_GPL(vsock_insert_connected);
void vsock_remove_bound(struct vsock_sock *vsk)
{
spin_lock_bh(&vsock_table_lock);
- __vsock_remove_bound(vsk);
+ if (__vsock_in_bound_table(vsk))
+ __vsock_remove_bound(vsk);
spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_remove_bound);
@@ -282,7 +283,8 @@ EXPORT_SYMBOL_GPL(vsock_remove_bound);
void vsock_remove_connected(struct vsock_sock *vsk)
{
spin_lock_bh(&vsock_table_lock);
- __vsock_remove_connected(vsk);
+ if (__vsock_in_connected_table(vsk))
+ __vsock_remove_connected(vsk);
spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_remove_connected);
@@ -318,35 +320,10 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
}
EXPORT_SYMBOL_GPL(vsock_find_connected_socket);
-static bool vsock_in_bound_table(struct vsock_sock *vsk)
-{
- bool ret;
-
- spin_lock_bh(&vsock_table_lock);
- ret = __vsock_in_bound_table(vsk);
- spin_unlock_bh(&vsock_table_lock);
-
- return ret;
-}
-
-static bool vsock_in_connected_table(struct vsock_sock *vsk)
-{
- bool ret;
-
- spin_lock_bh(&vsock_table_lock);
- ret = __vsock_in_connected_table(vsk);
- spin_unlock_bh(&vsock_table_lock);
-
- return ret;
-}
-
void vsock_remove_sock(struct vsock_sock *vsk)
{
- if (vsock_in_bound_table(vsk))
- vsock_remove_bound(vsk);
-
- if (vsock_in_connected_table(vsk))
- vsock_remove_connected(vsk);
+ vsock_remove_bound(vsk);
+ vsock_remove_connected(vsk);
}
EXPORT_SYMBOL_GPL(vsock_remove_sock);
@@ -477,8 +454,7 @@ static void vsock_pending_work(struct work_struct *work)
* incoming packets can't find this socket, and to reduce the reference
* count.
*/
- if (vsock_in_connected_table(vsk))
- vsock_remove_connected(vsk);
+ vsock_remove_connected(vsk);
sk->sk_state = TCP_CLOSE;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 62dcdf082349..f2084e3f7aa4 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -14,14 +14,14 @@
#include <net/sock.h>
#include <net/af_vsock.h>
-/* The host side's design of the feature requires 6 exact 4KB pages for
- * recv/send rings respectively -- this is suboptimal considering memory
- * consumption, however unluckily we have to live with it, before the
- * host comes up with a better design in the future.
+/* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some
+ * stricter requirements on the hv_sock ring buffer size of six 4K pages. Newer
+ * hosts don't have this limitation; but, keep the defaults the same for compat.
*/
#define PAGE_SIZE_4K 4096
#define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6)
#define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6)
+#define RINGBUFFER_HVS_MAX_SIZE (PAGE_SIZE_4K * 64)
/* The MTU is 16KB per the host side's design */
#define HVS_MTU_SIZE (1024 * 16)
@@ -46,8 +46,9 @@ struct hvs_recv_buf {
};
/* We can send up to HVS_MTU_SIZE bytes of payload to the host, but let's use
- * a small size, i.e. HVS_SEND_BUF_SIZE, to minimize the dynamically-allocated
- * buffer, because tests show there is no significant performance difference.
+ * a smaller size, i.e. HVS_SEND_BUF_SIZE, to maximize concurrency between the
+ * guest and the host processing as one VMBUS packet is the smallest processing
+ * unit.
*
* Note: the buffer can be eliminated in the future when we add new VMBus
* ringbuffer APIs that allow us to directly copy data from userspace buffer
@@ -321,8 +322,11 @@ static void hvs_open_connection(struct vmbus_channel *chan)
struct sockaddr_vm addr;
struct sock *sk, *new = NULL;
struct vsock_sock *vnew = NULL;
- struct hvsock *hvs, *hvs_new = NULL;
+ struct hvsock *hvs = NULL;
+ struct hvsock *hvs_new = NULL;
+ int rcvbuf;
int ret;
+ int sndbuf;
if_type = &chan->offermsg.offer.if_type;
if_instance = &chan->offermsg.offer.if_instance;
@@ -364,9 +368,34 @@ static void hvs_open_connection(struct vmbus_channel *chan)
}
set_channel_read_mode(chan, HV_CALL_DIRECT);
- ret = vmbus_open(chan, RINGBUFFER_HVS_SND_SIZE,
- RINGBUFFER_HVS_RCV_SIZE, NULL, 0,
- hvs_channel_cb, conn_from_host ? new : sk);
+
+ /* Use the socket buffer sizes as hints for the VMBUS ring size. For
+ * server side sockets, 'sk' is the parent socket and thus, this will
+ * allow the child sockets to inherit the size from the parent. Keep
+ * the mins to the default value and align to page size as per VMBUS
+ * requirements.
+ * For the max, the socket core library will limit the socket buffer
+ * size that can be set by the user, but, since currently, the hv_sock
+ * VMBUS ring buffer is physically contiguous allocation, restrict it
+ * further.
+ * Older versions of hv_sock host side code cannot handle bigger VMBUS
+ * ring buffer size. Use the version number to limit the change to newer
+ * versions.
+ */
+ if (vmbus_proto_version < VERSION_WIN10_V5) {
+ sndbuf = RINGBUFFER_HVS_SND_SIZE;
+ rcvbuf = RINGBUFFER_HVS_RCV_SIZE;
+ } else {
+ sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE);
+ sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE);
+ sndbuf = ALIGN(sndbuf, PAGE_SIZE);
+ rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE);
+ rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE);
+ rcvbuf = ALIGN(rcvbuf, PAGE_SIZE);
+ }
+
+ ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb,
+ conn_from_host ? new : sk);
if (ret != 0) {
if (conn_from_host) {
hvs_new->chan = NULL;
@@ -424,6 +453,7 @@ static u32 hvs_get_local_cid(void)
static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk)
{
struct hvsock *hvs;
+ struct sock *sk = sk_vsock(vsk);
hvs = kzalloc(sizeof(*hvs), GFP_KERNEL);
if (!hvs)
@@ -431,7 +461,8 @@ static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk)
vsk->trans = hvs;
hvs->vsk = vsk;
-
+ sk->sk_sndbuf = RINGBUFFER_HVS_SND_SIZE;
+ sk->sk_rcvbuf = RINGBUFFER_HVS_RCV_SIZE;
return 0;
}
@@ -627,7 +658,9 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg,
struct hvsock *hvs = vsk->trans;
struct vmbus_channel *chan = hvs->chan;
struct hvs_send_buf *send_buf;
- ssize_t to_write, max_writable, ret;
+ ssize_t to_write, max_writable;
+ ssize_t ret = 0;
+ ssize_t bytes_written = 0;
BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K);
@@ -635,20 +668,34 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg,
if (!send_buf)
return -ENOMEM;
- max_writable = hvs_channel_writable_bytes(chan);
- to_write = min_t(ssize_t, len, max_writable);
- to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE);
-
- ret = memcpy_from_msg(send_buf->data, msg, to_write);
- if (ret < 0)
- goto out;
+ /* Reader(s) could be draining data from the channel as we write.
+ * Maximize bandwidth, by iterating until the channel is found to be
+ * full.
+ */
+ while (len) {
+ max_writable = hvs_channel_writable_bytes(chan);
+ if (!max_writable)
+ break;
+ to_write = min_t(ssize_t, len, max_writable);
+ to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE);
+ /* memcpy_from_msg is safe for loop as it advances the offsets
+ * within the message iterator.
+ */
+ ret = memcpy_from_msg(send_buf->data, msg, to_write);
+ if (ret < 0)
+ goto out;
- ret = hvs_send_data(hvs->chan, send_buf, to_write);
- if (ret < 0)
- goto out;
+ ret = hvs_send_data(hvs->chan, send_buf, to_write);
+ if (ret < 0)
+ goto out;
- ret = to_write;
+ bytes_written += to_write;
+ len -= to_write;
+ }
out:
+ /* If any data has been sent, return that */
+ if (bytes_written)
+ ret = bytes_written;
kfree(send_buf);
return ret;
}
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 9c287e3e393c..0815d1357861 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -38,6 +38,7 @@ struct virtio_vsock {
* must be accessed with tx_lock held.
*/
struct mutex tx_lock;
+ bool tx_run;
struct work_struct send_pkt_work;
spinlock_t send_pkt_list_lock;
@@ -53,6 +54,7 @@ struct virtio_vsock {
* must be accessed with rx_lock held.
*/
struct mutex rx_lock;
+ bool rx_run;
int rx_buf_nr;
int rx_buf_max_nr;
@@ -60,24 +62,28 @@ struct virtio_vsock {
* vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held.
*/
struct mutex event_lock;
+ bool event_run;
struct virtio_vsock_event event_list[8];
u32 guest_cid;
};
-static struct virtio_vsock *virtio_vsock_get(void)
-{
- return the_virtio_vsock;
-}
-
static u32 virtio_transport_get_local_cid(void)
{
- struct virtio_vsock *vsock = virtio_vsock_get();
+ struct virtio_vsock *vsock;
+ u32 ret;
- if (!vsock)
- return VMADDR_CID_ANY;
+ rcu_read_lock();
+ vsock = rcu_dereference(the_virtio_vsock);
+ if (!vsock) {
+ ret = VMADDR_CID_ANY;
+ goto out_rcu;
+ }
- return vsock->guest_cid;
+ ret = vsock->guest_cid;
+out_rcu:
+ rcu_read_unlock();
+ return ret;
}
static void virtio_transport_loopback_work(struct work_struct *work)
@@ -91,6 +97,10 @@ static void virtio_transport_loopback_work(struct work_struct *work)
spin_unlock_bh(&vsock->loopback_list_lock);
mutex_lock(&vsock->rx_lock);
+
+ if (!vsock->rx_run)
+ goto out;
+
while (!list_empty(&pkts)) {
struct virtio_vsock_pkt *pkt;
@@ -99,6 +109,7 @@ static void virtio_transport_loopback_work(struct work_struct *work)
virtio_transport_recv_pkt(pkt);
}
+out:
mutex_unlock(&vsock->rx_lock);
}
@@ -127,6 +138,9 @@ virtio_transport_send_pkt_work(struct work_struct *work)
mutex_lock(&vsock->tx_lock);
+ if (!vsock->tx_run)
+ goto out;
+
vq = vsock->vqs[VSOCK_VQ_TX];
for (;;) {
@@ -185,6 +199,7 @@ virtio_transport_send_pkt_work(struct work_struct *work)
if (added)
virtqueue_kick(vq);
+out:
mutex_unlock(&vsock->tx_lock);
if (restart_rx)
@@ -197,14 +212,18 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
struct virtio_vsock *vsock;
int len = pkt->len;
- vsock = virtio_vsock_get();
+ rcu_read_lock();
+ vsock = rcu_dereference(the_virtio_vsock);
if (!vsock) {
virtio_transport_free_pkt(pkt);
- return -ENODEV;
+ len = -ENODEV;
+ goto out_rcu;
}
- if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid)
- return virtio_transport_send_pkt_loopback(vsock, pkt);
+ if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) {
+ len = virtio_transport_send_pkt_loopback(vsock, pkt);
+ goto out_rcu;
+ }
if (pkt->reply)
atomic_inc(&vsock->queued_replies);
@@ -214,6 +233,9 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
spin_unlock_bh(&vsock->send_pkt_list_lock);
queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+
+out_rcu:
+ rcu_read_unlock();
return len;
}
@@ -222,12 +244,14 @@ virtio_transport_cancel_pkt(struct vsock_sock *vsk)
{
struct virtio_vsock *vsock;
struct virtio_vsock_pkt *pkt, *n;
- int cnt = 0;
+ int cnt = 0, ret;
LIST_HEAD(freeme);
- vsock = virtio_vsock_get();
+ rcu_read_lock();
+ vsock = rcu_dereference(the_virtio_vsock);
if (!vsock) {
- return -ENODEV;
+ ret = -ENODEV;
+ goto out_rcu;
}
spin_lock_bh(&vsock->send_pkt_list_lock);
@@ -255,7 +279,11 @@ virtio_transport_cancel_pkt(struct vsock_sock *vsk)
queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}
- return 0;
+ ret = 0;
+
+out_rcu:
+ rcu_read_unlock();
+ return ret;
}
static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
@@ -307,6 +335,10 @@ static void virtio_transport_tx_work(struct work_struct *work)
vq = vsock->vqs[VSOCK_VQ_TX];
mutex_lock(&vsock->tx_lock);
+
+ if (!vsock->tx_run)
+ goto out;
+
do {
struct virtio_vsock_pkt *pkt;
unsigned int len;
@@ -317,6 +349,8 @@ static void virtio_transport_tx_work(struct work_struct *work)
added = true;
}
} while (!virtqueue_enable_cb(vq));
+
+out:
mutex_unlock(&vsock->tx_lock);
if (added)
@@ -345,6 +379,9 @@ static void virtio_transport_rx_work(struct work_struct *work)
mutex_lock(&vsock->rx_lock);
+ if (!vsock->rx_run)
+ goto out;
+
do {
virtqueue_disable_cb(vq);
for (;;) {
@@ -454,6 +491,9 @@ static void virtio_transport_event_work(struct work_struct *work)
mutex_lock(&vsock->event_lock);
+ if (!vsock->event_run)
+ goto out;
+
do {
struct virtio_vsock_event *event;
unsigned int len;
@@ -468,7 +508,7 @@ static void virtio_transport_event_work(struct work_struct *work)
} while (!virtqueue_enable_cb(vq));
virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
-
+out:
mutex_unlock(&vsock->event_lock);
}
@@ -565,7 +605,8 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
return ret;
/* Only one virtio-vsock device per guest is supported */
- if (the_virtio_vsock) {
+ if (rcu_dereference_protected(the_virtio_vsock,
+ lockdep_is_held(&the_virtio_vsock_mutex))) {
ret = -EBUSY;
goto out;
}
@@ -590,8 +631,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
vsock->rx_buf_max_nr = 0;
atomic_set(&vsock->queued_replies, 0);
- vdev->priv = vsock;
- the_virtio_vsock = vsock;
mutex_init(&vsock->tx_lock);
mutex_init(&vsock->rx_lock);
mutex_init(&vsock->event_lock);
@@ -605,14 +644,23 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
INIT_WORK(&vsock->loopback_work, virtio_transport_loopback_work);
+ mutex_lock(&vsock->tx_lock);
+ vsock->tx_run = true;
+ mutex_unlock(&vsock->tx_lock);
+
mutex_lock(&vsock->rx_lock);
virtio_vsock_rx_fill(vsock);
+ vsock->rx_run = true;
mutex_unlock(&vsock->rx_lock);
mutex_lock(&vsock->event_lock);
virtio_vsock_event_fill(vsock);
+ vsock->event_run = true;
mutex_unlock(&vsock->event_lock);
+ vdev->priv = vsock;
+ rcu_assign_pointer(the_virtio_vsock, vsock);
+
mutex_unlock(&the_virtio_vsock_mutex);
return 0;
@@ -627,15 +675,33 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
struct virtio_vsock *vsock = vdev->priv;
struct virtio_vsock_pkt *pkt;
- flush_work(&vsock->loopback_work);
- flush_work(&vsock->rx_work);
- flush_work(&vsock->tx_work);
- flush_work(&vsock->event_work);
- flush_work(&vsock->send_pkt_work);
+ mutex_lock(&the_virtio_vsock_mutex);
+
+ vdev->priv = NULL;
+ rcu_assign_pointer(the_virtio_vsock, NULL);
+ synchronize_rcu();
/* Reset all connected sockets when the device disappear */
vsock_for_each_connected_socket(virtio_vsock_reset_sock);
+ /* Stop all work handlers to make sure no one is accessing the device,
+ * so we can safely call vdev->config->reset().
+ */
+ mutex_lock(&vsock->rx_lock);
+ vsock->rx_run = false;
+ mutex_unlock(&vsock->rx_lock);
+
+ mutex_lock(&vsock->tx_lock);
+ vsock->tx_run = false;
+ mutex_unlock(&vsock->tx_lock);
+
+ mutex_lock(&vsock->event_lock);
+ vsock->event_run = false;
+ mutex_unlock(&vsock->event_lock);
+
+ /* Flush all device writes and interrupts, device will not use any
+ * more buffers.
+ */
vdev->config->reset(vdev);
mutex_lock(&vsock->rx_lock);
@@ -666,12 +732,20 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
}
spin_unlock_bh(&vsock->loopback_list_lock);
- mutex_lock(&the_virtio_vsock_mutex);
- the_virtio_vsock = NULL;
- mutex_unlock(&the_virtio_vsock_mutex);
-
+ /* Delete virtqueues and flush outstanding callbacks if any */
vdev->config->del_vqs(vdev);
+ /* Other works can be queued before 'config->del_vqs()', so we flush
+ * all works before to free the vsock object to avoid use after free.
+ */
+ flush_work(&vsock->loopback_work);
+ flush_work(&vsock->rx_work);
+ flush_work(&vsock->tx_work);
+ flush_work(&vsock->event_work);
+ flush_work(&vsock->send_pkt_work);
+
+ mutex_unlock(&the_virtio_vsock_mutex);
+
kfree(vsock);
}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 53ad3dbb76fe..45d9afcff6d5 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -859,6 +859,19 @@ int wiphy_register(struct wiphy *wiphy)
return -EINVAL;
}
+ for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) {
+ /*
+ * Validate we have a policy (can be explicitly set to
+ * VENDOR_CMD_RAW_DATA which is non-NULL) and also that
+ * we have at least one of doit/dumpit.
+ */
+ if (WARN_ON(!rdev->wiphy.vendor_commands[i].policy))
+ return -EINVAL;
+ if (WARN_ON(!rdev->wiphy.vendor_commands[i].doit &&
+ !rdev->wiphy.vendor_commands[i].dumpit))
+ return -EINVAL;
+ }
+
#ifdef CONFIG_PM
if (WARN_ON(rdev->wiphy.wowlan && rdev->wiphy.wowlan->n_patterns &&
(!rdev->wiphy.wowlan->pattern_min_len ||
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 84d36ca7a7ab..ee8388fe4a92 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -531,6 +531,10 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev);
+struct cfg80211_internal_bss *
+cfg80211_bss_update(struct cfg80211_registered_device *rdev,
+ struct cfg80211_internal_bss *tmp,
+ bool signal_valid, unsigned long ts);
#ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS
#define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond)
#else
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 520d437aa8d1..fc83dd179c1a 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -571,6 +571,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_PEER_MEASUREMENTS] =
NLA_POLICY_NESTED(nl80211_pmsr_attr_policy),
[NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1),
+ [NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY,
+ .len = SAE_PASSWORD_MAX_LEN },
+ [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG },
};
/* policy for the key attributes */
@@ -4447,6 +4450,8 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
return true;
case NL80211_CMD_CONNECT:
if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
+ !wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_SAE_OFFLOAD) &&
auth_type == NL80211_AUTHTYPE_SAE)
return false;
@@ -4637,6 +4642,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
return PTR_ERR(params.acl);
}
+ params.twt_responder =
+ nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]);
+
nl80211_calculate_ap_params(&params);
if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])
@@ -8751,7 +8759,8 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
static bool nl80211_valid_wpa_versions(u32 wpa_versions)
{
return !(wpa_versions & ~(NL80211_WPA_VERSION_1 |
- NL80211_WPA_VERSION_2));
+ NL80211_WPA_VERSION_2 |
+ NL80211_WPA_VERSION_3));
}
static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
@@ -8987,6 +8996,16 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]);
}
+ if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) {
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_SAE_OFFLOAD))
+ return -EINVAL;
+ settings->sae_pwd =
+ nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
+ settings->sae_pwd_len =
+ nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
+ }
+
return 0;
}
@@ -12669,6 +12688,29 @@ static int nl80211_crit_protocol_stop(struct sk_buff *skb,
return 0;
}
+static int nl80211_vendor_check_policy(const struct wiphy_vendor_command *vcmd,
+ struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ if (vcmd->policy == VENDOR_CMD_RAW_DATA) {
+ if (attr->nla_type & NLA_F_NESTED) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "unexpected nested data");
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+
+ if (!(attr->nla_type & NLA_F_NESTED)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "expected nested data");
+ return -EINVAL;
+ }
+
+ return nl80211_validate_nested(attr, vcmd->maxattr, vcmd->policy,
+ extack);
+}
+
static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -12727,11 +12769,16 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_VENDOR_DATA]) {
data = nla_data(info->attrs[NL80211_ATTR_VENDOR_DATA]);
len = nla_len(info->attrs[NL80211_ATTR_VENDOR_DATA]);
+
+ err = nl80211_vendor_check_policy(vcmd,
+ info->attrs[NL80211_ATTR_VENDOR_DATA],
+ info->extack);
+ if (err)
+ return err;
}
rdev->cur_cmd_info = info;
- err = rdev->wiphy.vendor_commands[i].doit(&rdev->wiphy, wdev,
- data, len);
+ err = vcmd->doit(&rdev->wiphy, wdev, data, len);
rdev->cur_cmd_info = NULL;
return err;
}
@@ -12818,6 +12865,13 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
if (attrbuf[NL80211_ATTR_VENDOR_DATA]) {
data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]);
data_len = nla_len(attrbuf[NL80211_ATTR_VENDOR_DATA]);
+
+ err = nl80211_vendor_check_policy(
+ &(*rdev)->wiphy.vendor_commands[vcmd_idx],
+ attrbuf[NL80211_ATTR_VENDOR_DATA],
+ cb->extack);
+ if (err)
+ return err;
}
/* 0 is the first index - add 1 to parse only once */
@@ -15086,7 +15140,9 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
return;
}
- if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid))
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -15376,6 +15432,19 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie,
}
EXPORT_SYMBOL(cfg80211_remain_on_channel_expired);
+void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie,
+ struct ieee80211_channel *chan,
+ gfp_t gfp)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ trace_cfg80211_tx_mgmt_expired(wdev, cookie, chan);
+ nl80211_send_remain_on_chan_event(NL80211_CMD_FRAME_WAIT_CANCEL,
+ rdev, wdev, cookie, chan, 0, gfp);
+}
+EXPORT_SYMBOL(cfg80211_tx_mgmt_expired);
+
void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
struct station_info *sinfo, gfp_t gfp)
{
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index aa571d727903..d66e6d4b7555 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1092,17 +1092,17 @@ struct cfg80211_non_tx_bss {
};
/* Returned bss is reference counted and must be cleaned up appropriately. */
-static struct cfg80211_internal_bss *
+struct cfg80211_internal_bss *
cfg80211_bss_update(struct cfg80211_registered_device *rdev,
struct cfg80211_internal_bss *tmp,
- bool signal_valid)
+ bool signal_valid, unsigned long ts)
{
struct cfg80211_internal_bss *found = NULL;
if (WARN_ON(!tmp->pub.channel))
return NULL;
- tmp->ts = jiffies;
+ tmp->ts = ts;
spin_lock_bh(&rdev->bss_lock);
@@ -1425,7 +1425,8 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
wiphy->max_adj_channel_rssi_comp;
- res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid);
+ res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid,
+ jiffies);
if (!res)
return NULL;
@@ -1842,7 +1843,8 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
wiphy->max_adj_channel_rssi_comp;
- res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid);
+ res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid,
+ jiffies);
if (!res)
return NULL;
@@ -1972,6 +1974,27 @@ out:
}
EXPORT_SYMBOL(cfg80211_unlink_bss);
+void cfg80211_bss_iter(struct wiphy *wiphy,
+ struct cfg80211_chan_def *chandef,
+ void (*iter)(struct wiphy *wiphy,
+ struct cfg80211_bss *bss,
+ void *data),
+ void *iter_data)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct cfg80211_internal_bss *bss;
+
+ spin_lock_bh(&rdev->bss_lock);
+
+ list_for_each_entry(bss, &rdev->bss_list, list) {
+ if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel))
+ iter(wiphy, &bss->pub, iter_data);
+ }
+
+ spin_unlock_bh(&rdev->bss_lock);
+}
+EXPORT_SYMBOL(cfg80211_bss_iter);
+
#ifdef CONFIG_CFG80211_WEXT
static struct cfg80211_registered_device *
cfg80211_get_dev_from_ifindex(struct net *net, int ifindex)
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 7d34cb884840..7a6c38ddc65a 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -796,12 +796,36 @@ void cfg80211_connect_done(struct net_device *dev,
u8 *next;
if (params->bss) {
- /* Make sure the bss entry provided by the driver is valid. */
struct cfg80211_internal_bss *ibss = bss_from_pub(params->bss);
- if (WARN_ON(list_empty(&ibss->list))) {
- cfg80211_put_bss(wdev->wiphy, params->bss);
- return;
+ if (list_empty(&ibss->list)) {
+ struct cfg80211_bss *found = NULL, *tmp = params->bss;
+
+ found = cfg80211_get_bss(wdev->wiphy, NULL,
+ params->bss->bssid,
+ wdev->ssid, wdev->ssid_len,
+ wdev->conn_bss_type,
+ IEEE80211_PRIVACY_ANY);
+ if (found) {
+ /* The same BSS is already updated so use it
+ * instead, as it has latest info.
+ */
+ params->bss = found;
+ } else {
+ /* Update with BSS provided by driver, it will
+ * be freshly added and ref cnted, we can free
+ * the old one.
+ *
+ * signal_valid can be false, as we are not
+ * expecting the BSS to be found.
+ *
+ * keep the old timestamp to avoid confusion
+ */
+ cfg80211_bss_update(rdev, ibss, false,
+ ibss->ts);
+ }
+
+ cfg80211_put_bss(wdev->wiphy, tmp);
}
}
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 2abfff925aac..4fbb91a511ae 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2752,6 +2752,24 @@ TRACE_EVENT(cfg80211_ready_on_channel_expired,
WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG)
);
+TRACE_EVENT(cfg80211_tx_mgmt_expired,
+ TP_PROTO(struct wireless_dev *wdev, u64 cookie,
+ struct ieee80211_channel *chan),
+ TP_ARGS(wdev, cookie, chan),
+ TP_STRUCT__entry(
+ WDEV_ENTRY
+ __field(u64, cookie)
+ CHAN_ENTRY
+ ),
+ TP_fast_assign(
+ WDEV_ASSIGN;
+ __entry->cookie = cookie;
+ CHAN_ASSIGN(chan);
+ ),
+ TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT,
+ WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG)
+);
+
TRACE_EVENT(cfg80211_new_sta,
TP_PROTO(struct net_device *netdev, const u8 *mac_addr,
struct station_info *sinfo),
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 9c6de4f114f8..20c91f02d3d8 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -105,6 +105,9 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
umem->dev = dev;
umem->queue_id = queue_id;
+
+ dev_hold(dev);
+
if (force_copy)
/* For copy-mode, we are done. */
goto out_rtnl_unlock;
@@ -124,7 +127,6 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
goto err_unreg_umem;
rtnl_unlock();
- dev_hold(dev);
umem->zc = true;
return 0;
@@ -138,11 +140,13 @@ out_rtnl_unlock:
return err;
}
-static void xdp_umem_clear_dev(struct xdp_umem *umem)
+void xdp_umem_clear_dev(struct xdp_umem *umem)
{
struct netdev_bpf bpf;
int err;
+ ASSERT_RTNL();
+
if (!umem->dev)
return;
@@ -151,22 +155,17 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem)
bpf.xsk.umem = NULL;
bpf.xsk.queue_id = umem->queue_id;
- rtnl_lock();
err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
- rtnl_unlock();
if (err)
WARN(1, "failed to disable umem!\n");
}
- rtnl_lock();
xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
- rtnl_unlock();
- if (umem->zc) {
- dev_put(umem->dev);
- umem->zc = false;
- }
+ dev_put(umem->dev);
+ umem->dev = NULL;
+ umem->zc = false;
}
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
@@ -194,7 +193,9 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
static void xdp_umem_release(struct xdp_umem *umem)
{
+ rtnl_lock();
xdp_umem_clear_dev(umem);
+ rtnl_unlock();
ida_simple_remove(&umem_ida, umem->id);
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 27603227601b..a63a9fb251f5 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -10,6 +10,7 @@
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
u16 queue_id, u16 flags);
+void xdp_umem_clear_dev(struct xdp_umem *umem);
bool xdp_umem_validate_queues(struct xdp_umem *umem);
void xdp_get_umem(struct xdp_umem *umem);
void xdp_put_umem(struct xdp_umem *umem);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index a14e8864e4fa..d4d6f10aa936 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -37,6 +37,12 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
READ_ONCE(xs->umem->fq);
}
+bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
+{
+ return xskq_has_addrs(umem->fq, cnt);
+}
+EXPORT_SYMBOL(xsk_umem_has_addrs);
+
u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
return xskq_peek_addr(umem->fq, addr);
@@ -123,13 +129,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
u64 addr;
int err;
- if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
- return -EINVAL;
+ spin_lock_bh(&xs->rx_lock);
+
+ if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
if (!xskq_peek_addr(xs->umem->fq, &addr) ||
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
- xs->rx_dropped++;
- return -ENOSPC;
+ err = -ENOSPC;
+ goto out_drop;
}
addr += xs->umem->headroom;
@@ -138,13 +148,21 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
memcpy(buffer, xdp->data_meta, len + metalen);
addr += metalen;
err = xskq_produce_batch_desc(xs->rx, addr, len);
- if (!err) {
- xskq_discard_addr(xs->umem->fq);
- xsk_flush(xs);
- return 0;
- }
+ if (err)
+ goto out_drop;
+
+ xskq_discard_addr(xs->umem->fq);
+ xskq_produce_flush_desc(xs->rx);
+
+ spin_unlock_bh(&xs->rx_lock);
+ xs->sk.sk_data_ready(&xs->sk);
+ return 0;
+
+out_drop:
xs->rx_dropped++;
+out_unlock:
+ spin_unlock_bh(&xs->rx_lock);
return err;
}
@@ -166,22 +184,18 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);
-bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
+bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
- struct xdp_desc desc;
struct xdp_sock *xs;
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
- if (!xskq_peek_desc(xs->tx, &desc))
+ if (!xskq_peek_desc(xs->tx, desc))
continue;
- if (xskq_produce_addr_lazy(umem->cq, desc.addr))
+ if (xskq_produce_addr_lazy(umem->cq, desc->addr))
goto out;
- *dma = xdp_umem_get_dma(umem, desc.addr);
- *len = desc.len;
-
xskq_discard_desc(xs->tx);
rcu_read_unlock();
return true;
@@ -335,6 +349,22 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
return 0;
}
+static void xsk_unbind_dev(struct xdp_sock *xs)
+{
+ struct net_device *dev = xs->dev;
+
+ if (!dev || xs->state != XSK_BOUND)
+ return;
+
+ xs->state = XSK_UNBOUND;
+
+ /* Wait for driver to stop using the xdp socket. */
+ xdp_del_sk_umem(xs->umem, xs);
+ xs->dev = NULL;
+ synchronize_net();
+ dev_put(dev);
+}
+
static int xsk_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -354,15 +384,7 @@ static int xsk_release(struct socket *sock)
sock_prot_inuse_add(net, sk->sk_prot, -1);
local_bh_enable();
- if (xs->dev) {
- struct net_device *dev = xs->dev;
-
- /* Wait for driver to stop using the xdp socket. */
- xdp_del_sk_umem(xs->umem, xs);
- xs->dev = NULL;
- synchronize_net();
- dev_put(dev);
- }
+ xsk_unbind_dev(xs);
xskq_destroy(xs->rx);
xskq_destroy(xs->tx);
@@ -412,7 +434,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
return -EINVAL;
mutex_lock(&xs->mutex);
- if (xs->dev) {
+ if (xs->state != XSK_READY) {
err = -EBUSY;
goto out_release;
}
@@ -492,6 +514,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
out_unlock:
if (err)
dev_put(dev);
+ else
+ xs->state = XSK_BOUND;
out_release:
mutex_unlock(&xs->mutex);
return err;
@@ -520,6 +544,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -EFAULT;
mutex_lock(&xs->mutex);
+ if (xs->state != XSK_READY) {
+ mutex_unlock(&xs->mutex);
+ return -EBUSY;
+ }
q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
err = xsk_init_queue(entries, q, false);
mutex_unlock(&xs->mutex);
@@ -534,7 +562,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -EFAULT;
mutex_lock(&xs->mutex);
- if (xs->umem) {
+ if (xs->state != XSK_READY || xs->umem) {
mutex_unlock(&xs->mutex);
return -EBUSY;
}
@@ -561,6 +589,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -EFAULT;
mutex_lock(&xs->mutex);
+ if (xs->state != XSK_READY) {
+ mutex_unlock(&xs->mutex);
+ return -EBUSY;
+ }
if (!xs->umem) {
mutex_unlock(&xs->mutex);
return -EINVAL;
@@ -644,6 +676,26 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
return 0;
}
+ case XDP_OPTIONS:
+ {
+ struct xdp_options opts = {};
+
+ if (len < sizeof(opts))
+ return -EINVAL;
+
+ mutex_lock(&xs->mutex);
+ if (xs->zc)
+ opts.flags |= XDP_OPTIONS_ZEROCOPY;
+ mutex_unlock(&xs->mutex);
+
+ len = sizeof(opts);
+ if (copy_to_user(optval, &opts, len))
+ return -EFAULT;
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return 0;
+ }
default:
break;
}
@@ -662,6 +714,9 @@ static int xsk_mmap(struct file *file, struct socket *sock,
unsigned long pfn;
struct page *qpg;
+ if (xs->state != XSK_READY)
+ return -EBUSY;
+
if (offset == XDP_PGOFF_RX_RING) {
q = READ_ONCE(xs->rx);
} else if (offset == XDP_PGOFF_TX_RING) {
@@ -693,6 +748,38 @@ static int xsk_mmap(struct file *file, struct socket *sock,
size, vma->vm_page_prot);
}
+static int xsk_notifier(struct notifier_block *this,
+ unsigned long msg, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct sock *sk;
+
+ switch (msg) {
+ case NETDEV_UNREGISTER:
+ mutex_lock(&net->xdp.lock);
+ sk_for_each(sk, &net->xdp.list) {
+ struct xdp_sock *xs = xdp_sk(sk);
+
+ mutex_lock(&xs->mutex);
+ if (xs->dev == dev) {
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+
+ xsk_unbind_dev(xs);
+
+ /* Clear device references in umem. */
+ xdp_umem_clear_dev(xs->umem);
+ }
+ mutex_unlock(&xs->mutex);
+ }
+ mutex_unlock(&net->xdp.lock);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
static struct proto xsk_proto = {
.name = "XDP",
.owner = THIS_MODULE,
@@ -764,7 +851,9 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
sock_set_flag(sk, SOCK_RCU_FREE);
xs = xdp_sk(sk);
+ xs->state = XSK_READY;
mutex_init(&xs->mutex);
+ spin_lock_init(&xs->rx_lock);
spin_lock_init(&xs->tx_completion_lock);
mutex_lock(&net->xdp.lock);
@@ -784,6 +873,10 @@ static const struct net_proto_family xsk_family_ops = {
.owner = THIS_MODULE,
};
+static struct notifier_block xsk_netdev_notifier = {
+ .notifier_call = xsk_notifier,
+};
+
static int __net_init xsk_net_init(struct net *net)
{
mutex_init(&net->xdp.lock);
@@ -816,8 +909,15 @@ static int __init xsk_init(void)
err = register_pernet_subsys(&xsk_net_ops);
if (err)
goto out_sk;
+
+ err = register_netdevice_notifier(&xsk_netdev_notifier);
+ if (err)
+ goto out_pernet;
+
return 0;
+out_pernet:
+ unregister_pernet_subsys(&xsk_net_ops);
out_sk:
sock_unregister(PF_XDP);
out_proto:
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 88b9ae24658d..909c5168ed0f 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -117,6 +117,20 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
return q->nentries - (producer - q->cons_tail);
}
+static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt)
+{
+ u32 entries = q->prod_tail - q->cons_tail;
+
+ if (entries >= cnt)
+ return true;
+
+ /* Refresh the local pointer. */
+ q->prod_tail = READ_ONCE(q->ring->producer);
+ entries = q->prod_tail - q->cons_tail;
+
+ return entries >= cnt;
+}
+
/* UMEM queue */
static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
@@ -288,7 +302,7 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q)
/* Order producer and data */
smp_wmb(); /* B, matches C */
- q->prod_tail = q->prod_head,
+ q->prod_tail = q->prod_head;
WRITE_ONCE(q->ring->producer, q->prod_tail);
}
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index c967fc3c38c8..51bb6018f3bf 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -15,6 +15,8 @@ config XFRM_ALGO
tristate
select XFRM
select CRYPTO
+ select CRYPTO_HASH
+ select CRYPTO_BLKCIPHER
if INET
config XFRM_USER
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index ff654306d836..189ef15acbbc 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -271,9 +271,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
return false;
if ((!dev || (dev == xfrm_dst_path(dst)->dev)) &&
- (!xdst->child->xfrm && x->type->get_mtu)) {
- mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
-
+ (!xdst->child->xfrm)) {
+ mtu = xfrm_state_mtu(x, xdst->child_mtu_cached);
if (skb->len <= mtu)
goto ok;
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 314973aaa414..6088bc2dc11e 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -359,28 +359,29 @@ static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family);
if (likely(afinfo))
err = afinfo->extract_input(x, skb);
+ rcu_read_unlock();
- if (err) {
- rcu_read_unlock();
+ if (err)
return err;
- }
if (x->sel.family == AF_UNSPEC) {
inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
- if (!inner_mode) {
- rcu_read_unlock();
+ if (!inner_mode)
return -EAFNOSUPPORT;
- }
}
- afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
- if (unlikely(!afinfo)) {
- rcu_read_unlock();
- return -EAFNOSUPPORT;
+ switch (inner_mode->family) {
+ case AF_INET:
+ skb->protocol = htons(ETH_P_IP);
+ break;
+ case AF_INET6:
+ skb->protocol = htons(ETH_P_IPV6);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
- skb->protocol = afinfo->eth_proto;
- rcu_read_unlock();
return xfrm_inner_mode_encap_remove(x, inner_mode, skb);
}
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index ad3a2555c517..74868f9d81fb 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -133,7 +133,7 @@ static void xfrmi_dev_free(struct net_device *dev)
free_percpu(dev->tstats);
}
-static int xfrmi_create2(struct net_device *dev)
+static int xfrmi_create(struct net_device *dev)
{
struct xfrm_if *xi = netdev_priv(dev);
struct net *net = dev_net(dev);
@@ -156,54 +156,7 @@ out:
return err;
}
-static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p)
-{
- struct net_device *dev;
- struct xfrm_if *xi;
- char name[IFNAMSIZ];
- int err;
-
- if (p->name[0]) {
- strlcpy(name, p->name, IFNAMSIZ);
- } else {
- err = -EINVAL;
- goto failed;
- }
-
- dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, xfrmi_dev_setup);
- if (!dev) {
- err = -EAGAIN;
- goto failed;
- }
-
- dev_net_set(dev, net);
-
- xi = netdev_priv(dev);
- xi->p = *p;
- xi->net = net;
- xi->dev = dev;
- xi->phydev = dev_get_by_index(net, p->link);
- if (!xi->phydev) {
- err = -ENODEV;
- goto failed_free;
- }
-
- err = xfrmi_create2(dev);
- if (err < 0)
- goto failed_dev_put;
-
- return xi;
-
-failed_dev_put:
- dev_put(xi->phydev);
-failed_free:
- free_netdev(dev);
-failed:
- return ERR_PTR(err);
-}
-
-static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p,
- int create)
+static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p)
{
struct xfrm_if __rcu **xip;
struct xfrm_if *xi;
@@ -211,17 +164,11 @@ static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p,
for (xip = &xfrmn->xfrmi[0];
(xi = rtnl_dereference(*xip)) != NULL;
- xip = &xi->next) {
- if (xi->p.if_id == p->if_id) {
- if (create)
- return ERR_PTR(-EEXIST);
-
+ xip = &xi->next)
+ if (xi->p.if_id == p->if_id)
return xi;
- }
- }
- if (!create)
- return ERR_PTR(-ENODEV);
- return xfrmi_create(net, p);
+
+ return NULL;
}
static void xfrmi_dev_uninit(struct net_device *dev)
@@ -686,21 +633,33 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct net *net = dev_net(dev);
- struct xfrm_if_parms *p;
+ struct xfrm_if_parms p;
struct xfrm_if *xi;
+ int err;
- xi = netdev_priv(dev);
- p = &xi->p;
-
- xfrmi_netlink_parms(data, p);
+ xfrmi_netlink_parms(data, &p);
if (!tb[IFLA_IFNAME])
return -EINVAL;
- nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ);
+ nla_strlcpy(p.name, tb[IFLA_IFNAME], IFNAMSIZ);
+
+ xi = xfrmi_locate(net, &p);
+ if (xi)
+ return -EEXIST;
- xi = xfrmi_locate(net, p, 1);
- return PTR_ERR_OR_ZERO(xi);
+ xi = netdev_priv(dev);
+ xi->p = p;
+ xi->net = net;
+ xi->dev = dev;
+ xi->phydev = dev_get_by_index(net, p.link);
+ if (!xi->phydev)
+ return -ENODEV;
+
+ err = xfrmi_create(dev);
+ if (err < 0)
+ dev_put(xi->phydev);
+ return err;
}
static void xfrmi_dellink(struct net_device *dev, struct list_head *head)
@@ -717,9 +676,8 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[],
xfrmi_netlink_parms(data, &xi->p);
- xi = xfrmi_locate(net, &xi->p, 0);
-
- if (IS_ERR_OR_NULL(xi)) {
+ xi = xfrmi_locate(net, &xi->p);
+ if (!xi) {
xi = netdev_priv(dev);
} else {
if (xi->dev != dev)
@@ -793,11 +751,6 @@ static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn)
unregister_netdevice_many(&list);
}
-static int __net_init xfrmi_init_net(struct net *net)
-{
- return 0;
-}
-
static void __net_exit xfrmi_exit_net(struct net *net)
{
struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
@@ -808,7 +761,6 @@ static void __net_exit xfrmi_exit_net(struct net *net)
}
static struct pernet_operations xfrmi_net_ops = {
- .init = xfrmi_init_net,
.exit = xfrmi_exit_net,
.id = &xfrmi_net_id,
.size = sizeof(struct xfrmi_net),
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b1694d5d15d3..8ca637a72697 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -585,9 +585,6 @@ static void xfrm_bydst_resize(struct net *net, int dir)
odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
lockdep_is_held(&net->xfrm.xfrm_policy_lock));
- odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
- lockdep_is_held(&net->xfrm.xfrm_policy_lock));
-
for (i = hmask; i >= 0; i--)
xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);
@@ -1280,13 +1277,17 @@ static void xfrm_hash_rebuild(struct work_struct *work)
hlist_for_each_entry_safe(policy, n,
&net->xfrm.policy_inexact[dir],
- bydst_inexact_list)
+ bydst_inexact_list) {
+ hlist_del_rcu(&policy->bydst);
hlist_del_init(&policy->bydst_inexact_list);
+ }
hmask = net->xfrm.policy_bydst[dir].hmask;
odst = net->xfrm.policy_bydst[dir].table;
- for (i = hmask; i >= 0; i--)
- INIT_HLIST_HEAD(odst + i);
+ for (i = hmask; i >= 0; i--) {
+ hlist_for_each_entry_safe(policy, n, odst + i, bydst)
+ hlist_del_rcu(&policy->bydst);
+ }
if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
/* dir out => dst = remote, src = local */
net->xfrm.policy_bydst[dir].dbits4 = rbits4;
@@ -1315,8 +1316,6 @@ static void xfrm_hash_rebuild(struct work_struct *work)
chain = policy_hash_bysel(net, &policy->selector,
policy->family, dir);
- hlist_del_rcu(&policy->bydst);
-
if (!chain) {
void *p = xfrm_policy_inexact_insert(policy, dir, 0);
@@ -3628,7 +3627,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
}
xfrm_nr = ti;
if (npols > 1) {
- xfrm_tmpl_sort(stp, tpp, xfrm_nr, family, net);
+ xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
tpp = stp;
}
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 50621d982970..c6f3c4a1bd99 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -27,6 +27,8 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
+#include <crypto/aead.h>
+
#include "xfrm_hash.h"
#define xfrm_state_deref_prot(table, net) \
@@ -177,63 +179,132 @@ int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol)
static bool km_is_alive(const struct km_event *c);
void km_state_expired(struct xfrm_state *x, int hard, u32 portid);
-static DEFINE_SPINLOCK(xfrm_type_lock);
int xfrm_register_type(const struct xfrm_type *type, unsigned short family)
{
struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
- const struct xfrm_type **typemap;
int err = 0;
- if (unlikely(afinfo == NULL))
+ if (!afinfo)
return -EAFNOSUPPORT;
- typemap = afinfo->type_map;
- spin_lock_bh(&xfrm_type_lock);
- if (likely(typemap[type->proto] == NULL))
- typemap[type->proto] = type;
- else
- err = -EEXIST;
- spin_unlock_bh(&xfrm_type_lock);
+#define X(afi, T, name) do { \
+ WARN_ON((afi)->type_ ## name); \
+ (afi)->type_ ## name = (T); \
+ } while (0)
+
+ switch (type->proto) {
+ case IPPROTO_COMP:
+ X(afinfo, type, comp);
+ break;
+ case IPPROTO_AH:
+ X(afinfo, type, ah);
+ break;
+ case IPPROTO_ESP:
+ X(afinfo, type, esp);
+ break;
+ case IPPROTO_IPIP:
+ X(afinfo, type, ipip);
+ break;
+ case IPPROTO_DSTOPTS:
+ X(afinfo, type, dstopts);
+ break;
+ case IPPROTO_ROUTING:
+ X(afinfo, type, routing);
+ break;
+ case IPPROTO_IPV6:
+ X(afinfo, type, ipip6);
+ break;
+ default:
+ WARN_ON(1);
+ err = -EPROTONOSUPPORT;
+ break;
+ }
+#undef X
rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(xfrm_register_type);
-int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family)
+void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family)
{
struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
- const struct xfrm_type **typemap;
- int err = 0;
if (unlikely(afinfo == NULL))
- return -EAFNOSUPPORT;
- typemap = afinfo->type_map;
- spin_lock_bh(&xfrm_type_lock);
+ return;
- if (unlikely(typemap[type->proto] != type))
- err = -ENOENT;
- else
- typemap[type->proto] = NULL;
- spin_unlock_bh(&xfrm_type_lock);
+#define X(afi, T, name) do { \
+ WARN_ON((afi)->type_ ## name != (T)); \
+ (afi)->type_ ## name = NULL; \
+ } while (0)
+
+ switch (type->proto) {
+ case IPPROTO_COMP:
+ X(afinfo, type, comp);
+ break;
+ case IPPROTO_AH:
+ X(afinfo, type, ah);
+ break;
+ case IPPROTO_ESP:
+ X(afinfo, type, esp);
+ break;
+ case IPPROTO_IPIP:
+ X(afinfo, type, ipip);
+ break;
+ case IPPROTO_DSTOPTS:
+ X(afinfo, type, dstopts);
+ break;
+ case IPPROTO_ROUTING:
+ X(afinfo, type, routing);
+ break;
+ case IPPROTO_IPV6:
+ X(afinfo, type, ipip6);
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+#undef X
rcu_read_unlock();
- return err;
}
EXPORT_SYMBOL(xfrm_unregister_type);
static const struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
{
+ const struct xfrm_type *type = NULL;
struct xfrm_state_afinfo *afinfo;
- const struct xfrm_type **typemap;
- const struct xfrm_type *type;
int modload_attempted = 0;
retry:
afinfo = xfrm_state_get_afinfo(family);
if (unlikely(afinfo == NULL))
return NULL;
- typemap = afinfo->type_map;
- type = READ_ONCE(typemap[proto]);
+ switch (proto) {
+ case IPPROTO_COMP:
+ type = afinfo->type_comp;
+ break;
+ case IPPROTO_AH:
+ type = afinfo->type_ah;
+ break;
+ case IPPROTO_ESP:
+ type = afinfo->type_esp;
+ break;
+ case IPPROTO_IPIP:
+ type = afinfo->type_ipip;
+ break;
+ case IPPROTO_DSTOPTS:
+ type = afinfo->type_dstopts;
+ break;
+ case IPPROTO_ROUTING:
+ type = afinfo->type_routing;
+ break;
+ case IPPROTO_IPV6:
+ type = afinfo->type_ipip6;
+ break;
+ default:
+ break;
+ }
+
if (unlikely(type && !try_module_get(type->owner)))
type = NULL;
@@ -253,65 +324,71 @@ static void xfrm_put_type(const struct xfrm_type *type)
module_put(type->owner);
}
-static DEFINE_SPINLOCK(xfrm_type_offload_lock);
int xfrm_register_type_offload(const struct xfrm_type_offload *type,
unsigned short family)
{
struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
- const struct xfrm_type_offload **typemap;
int err = 0;
if (unlikely(afinfo == NULL))
return -EAFNOSUPPORT;
- typemap = afinfo->type_offload_map;
- spin_lock_bh(&xfrm_type_offload_lock);
- if (likely(typemap[type->proto] == NULL))
- typemap[type->proto] = type;
- else
- err = -EEXIST;
- spin_unlock_bh(&xfrm_type_offload_lock);
+ switch (type->proto) {
+ case IPPROTO_ESP:
+ WARN_ON(afinfo->type_offload_esp);
+ afinfo->type_offload_esp = type;
+ break;
+ default:
+ WARN_ON(1);
+ err = -EPROTONOSUPPORT;
+ break;
+ }
+
rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(xfrm_register_type_offload);
-int xfrm_unregister_type_offload(const struct xfrm_type_offload *type,
- unsigned short family)
+void xfrm_unregister_type_offload(const struct xfrm_type_offload *type,
+ unsigned short family)
{
struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
- const struct xfrm_type_offload **typemap;
- int err = 0;
if (unlikely(afinfo == NULL))
- return -EAFNOSUPPORT;
- typemap = afinfo->type_offload_map;
- spin_lock_bh(&xfrm_type_offload_lock);
+ return;
- if (unlikely(typemap[type->proto] != type))
- err = -ENOENT;
- else
- typemap[type->proto] = NULL;
- spin_unlock_bh(&xfrm_type_offload_lock);
+ switch (type->proto) {
+ case IPPROTO_ESP:
+ WARN_ON(afinfo->type_offload_esp != type);
+ afinfo->type_offload_esp = NULL;
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
rcu_read_unlock();
- return err;
}
EXPORT_SYMBOL(xfrm_unregister_type_offload);
static const struct xfrm_type_offload *
xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load)
{
+ const struct xfrm_type_offload *type = NULL;
struct xfrm_state_afinfo *afinfo;
- const struct xfrm_type_offload **typemap;
- const struct xfrm_type_offload *type;
retry:
afinfo = xfrm_state_get_afinfo(family);
if (unlikely(afinfo == NULL))
return NULL;
- typemap = afinfo->type_offload_map;
- type = typemap[proto];
+ switch (proto) {
+ case IPPROTO_ESP:
+ type = afinfo->type_offload_esp;
+ break;
+ default:
+ break;
+ }
+
if ((type && !try_module_get(type->owner)))
type = NULL;
@@ -770,24 +847,79 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
EXPORT_SYMBOL(xfrm_sad_getinfo);
static void
+__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
+{
+ const struct flowi4 *fl4 = &fl->u.ip4;
+
+ sel->daddr.a4 = fl4->daddr;
+ sel->saddr.a4 = fl4->saddr;
+ sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
+ sel->dport_mask = htons(0xffff);
+ sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
+ sel->sport_mask = htons(0xffff);
+ sel->family = AF_INET;
+ sel->prefixlen_d = 32;
+ sel->prefixlen_s = 32;
+ sel->proto = fl4->flowi4_proto;
+ sel->ifindex = fl4->flowi4_oif;
+}
+
+static void
+__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
+{
+ const struct flowi6 *fl6 = &fl->u.ip6;
+
+ /* Initialize temporary selector matching only to current session. */
+ *(struct in6_addr *)&sel->daddr = fl6->daddr;
+ *(struct in6_addr *)&sel->saddr = fl6->saddr;
+ sel->dport = xfrm_flowi_dport(fl, &fl6->uli);
+ sel->dport_mask = htons(0xffff);
+ sel->sport = xfrm_flowi_sport(fl, &fl6->uli);
+ sel->sport_mask = htons(0xffff);
+ sel->family = AF_INET6;
+ sel->prefixlen_d = 128;
+ sel->prefixlen_s = 128;
+ sel->proto = fl6->flowi6_proto;
+ sel->ifindex = fl6->flowi6_oif;
+}
+
+static void
xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
const struct xfrm_tmpl *tmpl,
const xfrm_address_t *daddr, const xfrm_address_t *saddr,
unsigned short family)
{
- struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family);
-
- if (!afinfo)
- return;
+ switch (family) {
+ case AF_INET:
+ __xfrm4_init_tempsel(&x->sel, fl);
+ break;
+ case AF_INET6:
+ __xfrm6_init_tempsel(&x->sel, fl);
+ break;
+ }
- afinfo->init_tempsel(&x->sel, fl);
+ x->id = tmpl->id;
- if (family != tmpl->encap_family) {
- afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family);
- if (!afinfo)
- return;
+ switch (tmpl->encap_family) {
+ case AF_INET:
+ if (x->id.daddr.a4 == 0)
+ x->id.daddr.a4 = daddr->a4;
+ x->props.saddr = tmpl->saddr;
+ if (x->props.saddr.a4 == 0)
+ x->props.saddr.a4 = saddr->a4;
+ break;
+ case AF_INET6:
+ if (ipv6_addr_any((struct in6_addr *)&x->id.daddr))
+ memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
+ memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
+ if (ipv6_addr_any((struct in6_addr *)&x->props.saddr))
+ memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
+ break;
}
- afinfo->init_temprop(x, tmpl, daddr, saddr);
+
+ x->props.mode = tmpl->mode;
+ x->props.reqid = tmpl->reqid;
+ x->props.family = tmpl->encap_family;
}
static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
@@ -1633,51 +1765,129 @@ xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid,
EXPORT_SYMBOL(xfrm_find_acq);
#ifdef CONFIG_XFRM_SUB_POLICY
-int
+#if IS_ENABLED(CONFIG_IPV6)
+/* distribution counting sort function for xfrm_state and xfrm_tmpl */
+static void
+__xfrm6_sort(void **dst, void **src, int n,
+ int (*cmp)(const void *p), int maxclass)
+{
+ int count[XFRM_MAX_DEPTH] = { };
+ int class[XFRM_MAX_DEPTH];
+ int i;
+
+ for (i = 0; i < n; i++) {
+ int c = cmp(src[i]);
+
+ class[i] = c;
+ count[c]++;
+ }
+
+ for (i = 2; i < maxclass; i++)
+ count[i] += count[i - 1];
+
+ for (i = 0; i < n; i++) {
+ dst[count[class[i] - 1]++] = src[i];
+ src[i] = NULL;
+ }
+}
+
+/* Rule for xfrm_state:
+ *
+ * rule 1: select IPsec transport except AH
+ * rule 2: select MIPv6 RO or inbound trigger
+ * rule 3: select IPsec transport AH
+ * rule 4: select IPsec tunnel
+ * rule 5: others
+ */
+static int __xfrm6_state_sort_cmp(const void *p)
+{
+ const struct xfrm_state *v = p;
+
+ switch (v->props.mode) {
+ case XFRM_MODE_TRANSPORT:
+ if (v->id.proto != IPPROTO_AH)
+ return 1;
+ else
+ return 3;
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case XFRM_MODE_ROUTEOPTIMIZATION:
+ case XFRM_MODE_IN_TRIGGER:
+ return 2;
+#endif
+ case XFRM_MODE_TUNNEL:
+ case XFRM_MODE_BEET:
+ return 4;
+ }
+ return 5;
+}
+
+/* Rule for xfrm_tmpl:
+ *
+ * rule 1: select IPsec transport
+ * rule 2: select MIPv6 RO or inbound trigger
+ * rule 3: select IPsec tunnel
+ * rule 4: others
+ */
+static int __xfrm6_tmpl_sort_cmp(const void *p)
+{
+ const struct xfrm_tmpl *v = p;
+
+ switch (v->mode) {
+ case XFRM_MODE_TRANSPORT:
+ return 1;
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case XFRM_MODE_ROUTEOPTIMIZATION:
+ case XFRM_MODE_IN_TRIGGER:
+ return 2;
+#endif
+ case XFRM_MODE_TUNNEL:
+ case XFRM_MODE_BEET:
+ return 3;
+ }
+ return 4;
+}
+#else
+static inline int __xfrm6_state_sort_cmp(const void *p) { return 5; }
+static inline int __xfrm6_tmpl_sort_cmp(const void *p) { return 4; }
+
+static inline void
+__xfrm6_sort(void **dst, void **src, int n,
+ int (*cmp)(const void *p), int maxclass)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ dst[i] = src[i];
+}
+#endif /* CONFIG_IPV6 */
+
+void
xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
- unsigned short family, struct net *net)
+ unsigned short family)
{
int i;
- int err = 0;
- struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
- if (!afinfo)
- return -EAFNOSUPPORT;
- spin_lock_bh(&net->xfrm.xfrm_state_lock); /*FIXME*/
- if (afinfo->tmpl_sort)
- err = afinfo->tmpl_sort(dst, src, n);
+ if (family == AF_INET6)
+ __xfrm6_sort((void **)dst, (void **)src, n,
+ __xfrm6_tmpl_sort_cmp, 5);
else
for (i = 0; i < n; i++)
dst[i] = src[i];
- spin_unlock_bh(&net->xfrm.xfrm_state_lock);
- rcu_read_unlock();
- return err;
}
-EXPORT_SYMBOL(xfrm_tmpl_sort);
-int
+void
xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
unsigned short family)
{
int i;
- int err = 0;
- struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
- struct net *net = xs_net(*src);
-
- if (!afinfo)
- return -EAFNOSUPPORT;
- spin_lock_bh(&net->xfrm.xfrm_state_lock);
- if (afinfo->state_sort)
- err = afinfo->state_sort(dst, src, n);
+ if (family == AF_INET6)
+ __xfrm6_sort((void **)dst, (void **)src, n,
+ __xfrm6_state_sort_cmp, 6);
else
for (i = 0; i < n; i++)
dst[i] = src[i];
- spin_unlock_bh(&net->xfrm.xfrm_state_lock);
- rcu_read_unlock();
- return err;
}
-EXPORT_SYMBOL(xfrm_state_sort);
#endif
/* Silly enough, but I'm lazy to build resolution list */
@@ -2195,38 +2405,49 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x)
}
EXPORT_SYMBOL(xfrm_state_delete_tunnel);
-int xfrm_state_mtu(struct xfrm_state *x, int mtu)
+u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
{
const struct xfrm_type *type = READ_ONCE(x->type);
+ struct crypto_aead *aead;
+ u32 blksize, net_adj = 0;
+
+ if (x->km.state != XFRM_STATE_VALID ||
+ !type || type->proto != IPPROTO_ESP)
+ return mtu - x->props.header_len;
+
+ aead = x->data;
+ blksize = ALIGN(crypto_aead_blocksize(aead), 4);
- if (x->km.state == XFRM_STATE_VALID &&
- type && type->get_mtu)
- return type->get_mtu(x, mtu);
+ switch (x->props.mode) {
+ case XFRM_MODE_TRANSPORT:
+ case XFRM_MODE_BEET:
+ if (x->props.family == AF_INET)
+ net_adj = sizeof(struct iphdr);
+ else if (x->props.family == AF_INET6)
+ net_adj = sizeof(struct ipv6hdr);
+ break;
+ case XFRM_MODE_TUNNEL:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
- return mtu - x->props.header_len;
+ return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
+ net_adj) & ~(blksize - 1)) + net_adj - 2;
}
+EXPORT_SYMBOL_GPL(xfrm_state_mtu);
int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
{
- const struct xfrm_state_afinfo *afinfo;
const struct xfrm_mode *inner_mode;
const struct xfrm_mode *outer_mode;
int family = x->props.family;
int err;
- err = -EAFNOSUPPORT;
- afinfo = xfrm_state_get_afinfo(family);
- if (!afinfo)
- goto error;
-
- err = 0;
- if (afinfo->init_flags)
- err = afinfo->init_flags(x);
-
- rcu_read_unlock();
-
- if (err)
- goto error;
+ if (family == AF_INET &&
+ xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)
+ x->props.flags |= XFRM_STATE_NOPMTUDISC;
err = -EPROTONOSUPPORT;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 173477211e40..b88ba45ff1ac 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -152,6 +152,25 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
err = -EINVAL;
switch (p->family) {
case AF_INET:
+ break;
+
+ case AF_INET6:
+#if IS_ENABLED(CONFIG_IPV6)
+ break;
+#else
+ err = -EAFNOSUPPORT;
+ goto out;
+#endif
+
+ default:
+ goto out;
+ }
+
+ switch (p->sel.family) {
+ case AF_UNSPEC:
+ break;
+
+ case AF_INET:
if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32)
goto out;