summaryrefslogtreecommitdiffstats
path: root/net/netfilter
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter')
-rw-r--r--net/netfilter/Kconfig73
-rw-r--r--net/netfilter/Makefile4
-rw-r--r--net/netfilter/core.c8
-rw-r--r--net/netfilter/ipvs/Kconfig25
-rw-r--r--net/netfilter/ipvs/Makefile10
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c49
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c297
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c839
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c402
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c91
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c292
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c147
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c169
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c9
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c99
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c82
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c102
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c107
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c47
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c46
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c727
-rw-r--r--net/netfilter/nf_conntrack_acct.c14
-rw-r--r--net/netfilter/nf_conntrack_core.c180
-rw-r--r--net/netfilter/nf_conntrack_ecache.c4
-rw-r--r--net/netfilter/nf_conntrack_expect.c68
-rw-r--r--net/netfilter/nf_conntrack_extend.c28
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c12
-rw-r--r--net/netfilter/nf_conntrack_netbios_ns.c2
-rw-r--r--net/netfilter/nf_conntrack_netlink.c123
-rw-r--r--net/netfilter/nf_conntrack_proto.c4
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c46
-rw-r--r--net/netfilter/nf_conntrack_sip.c44
-rw-r--r--net/netfilter/nf_conntrack_standalone.c28
-rw-r--r--net/netfilter/nf_log.c2
-rw-r--r--net/netfilter/nf_queue.c2
-rw-r--r--net/netfilter/nf_tproxy_core.c41
-rw-r--r--net/netfilter/nfnetlink_log.c73
-rw-r--r--net/netfilter/nfnetlink_queue.c39
-rw-r--r--net/netfilter/x_tables.c12
-rw-r--r--net/netfilter/xt_CHECKSUM.c70
-rw-r--r--net/netfilter/xt_CT.c5
-rw-r--r--net/netfilter/xt_IDLETIMER.c315
-rw-r--r--net/netfilter/xt_NOTRACK.c2
-rw-r--r--net/netfilter/xt_RATEEST.c12
-rw-r--r--net/netfilter/xt_SECMARK.c35
-rw-r--r--net/netfilter/xt_TCPMSS.c8
-rw-r--r--net/netfilter/xt_TEE.c8
-rw-r--r--net/netfilter/xt_TPROXY.c366
-rw-r--r--net/netfilter/xt_cluster.c2
-rw-r--r--net/netfilter/xt_connbytes.c10
-rw-r--r--net/netfilter/xt_conntrack.c11
-rw-r--r--net/netfilter/xt_cpu.c63
-rw-r--r--net/netfilter/xt_hashlimit.c15
-rw-r--r--net/netfilter/xt_ipvs.c188
-rw-r--r--net/netfilter/xt_quota.c12
-rw-r--r--net/netfilter/xt_recent.c1
-rw-r--r--net/netfilter/xt_sctp.c3
-rw-r--r--net/netfilter/xt_socket.c172
-rw-r--r--net/netfilter/xt_state.c14
-rw-r--r--net/netfilter/xt_statistic.c19
62 files changed, 4085 insertions, 1597 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 8593a77cfea9..1534f2b44caf 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -40,27 +40,6 @@ config NF_CONNTRACK
if NF_CONNTRACK
-config NF_CT_ACCT
- bool "Connection tracking flow accounting"
- depends on NETFILTER_ADVANCED
- help
- If this option is enabled, the connection tracking code will
- keep per-flow packet and byte counters.
-
- Those counters can be used for flow-based accounting or the
- `connbytes' match.
-
- Please note that currently this option only sets a default state.
- You may change it at boot time with nf_conntrack.acct=0/1 kernel
- parameter or by loading the nf_conntrack module with acct=0/1.
-
- You may also disable/enable it on a running system with:
- sysctl net.netfilter.nf_conntrack_acct=0/1
-
- This option will be removed in 2.6.29.
-
- If unsure, say `N'.
-
config NF_CONNTRACK_MARK
bool 'Connection mark tracking support'
depends on NETFILTER_ADVANCED
@@ -347,6 +326,22 @@ config NETFILTER_XT_CONNMARK
comment "Xtables targets"
+config NETFILTER_XT_TARGET_CHECKSUM
+ tristate "CHECKSUM target support"
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a `CHECKSUM' target, which can be used in the iptables mangle
+ table.
+
+ You can use this target to compute and fill in the checksum in
+ a packet that lacks a checksum. This is particularly useful,
+ if you need to work around old applications such as dhcp clients,
+ that do not work well with checksum offloads, but don't want to disable
+ checksum offload in your device.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_CLASSIFY
tristate '"CLASSIFY" target support'
depends on NETFILTER_ADVANCED
@@ -424,6 +419,18 @@ config NETFILTER_XT_TARGET_HL
since you can easily create immortal packets that loop
forever on the network.
+config NETFILTER_XT_TARGET_IDLETIMER
+ tristate "IDLETIMER target support"
+ depends on NETFILTER_ADVANCED
+ help
+
+ This option adds the `IDLETIMER' target. Each matching packet
+ resets the timer associated with label specified when the rule is
+ added. When the timer expires, it triggers a sysfs notification.
+ The remaining time for expiration can be read via sysfs.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_LED
tristate '"LED" target support'
depends on LEDS_CLASS && LEDS_TRIGGERS
@@ -503,7 +510,7 @@ config NETFILTER_XT_TARGET_RATEEST
To compile it as a module, choose M here. If unsure, say N.
config NETFILTER_XT_TARGET_TEE
- tristate '"TEE" - packet cloning to alternate destiantion'
+ tristate '"TEE" - packet cloning to alternate destination'
depends on NETFILTER_ADVANCED
depends on (IPV6 || IPV6=n)
depends on !NF_CONNTRACK || NF_CONNTRACK
@@ -518,6 +525,7 @@ config NETFILTER_XT_TARGET_TPROXY
depends on NETFILTER_XTABLES
depends on NETFILTER_ADVANCED
select NF_DEFRAG_IPV4
+ select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
help
This option adds a `TPROXY' target, which is somewhat similar to
REDIRECT. It can only be used in the mangle table and is useful
@@ -618,7 +626,6 @@ config NETFILTER_XT_MATCH_CONNBYTES
tristate '"connbytes" per-connection counter match support'
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
- select NF_CT_ACCT
help
This option adds a `connbytes' match, which allows you to match the
number of bytes and/or packets for each direction within a connection.
@@ -657,6 +664,15 @@ config NETFILTER_XT_MATCH_CONNTRACK
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_MATCH_CPU
+ tristate '"cpu" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ CPU matching allows you to match packets based on the CPU
+ currently handling the packet.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_MATCH_DCCP
tristate '"dccp" protocol match support'
depends on NETFILTER_ADVANCED
@@ -736,6 +752,16 @@ config NETFILTER_XT_MATCH_IPRANGE
If unsure, say M.
+config NETFILTER_XT_MATCH_IPVS
+ tristate '"ipvs" match support'
+ depends on IP_VS
+ depends on NETFILTER_ADVANCED
+ depends on NF_CONNTRACK
+ help
+ This option allows you to match against IPVS properties of a packet.
+
+ If unsure, say N.
+
config NETFILTER_XT_MATCH_LENGTH
tristate '"length" match support'
depends on NETFILTER_ADVANCED
@@ -902,6 +928,7 @@ config NETFILTER_XT_MATCH_SOCKET
depends on NETFILTER_ADVANCED
depends on !NF_CONNTRACK || NF_CONNTRACK
select NF_DEFRAG_IPV4
+ select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
help
This option adds a `socket' match, which can be used to match
packets for which a TCP or UDP socket lookup finds a valid socket.
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 14e3a8fd8180..441050f31111 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
# targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
@@ -61,6 +62,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o
obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o
obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o
# matches
obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o
@@ -68,6 +70,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o
obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o
obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o
obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o
@@ -75,6 +78,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o
obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o
obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o
obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o
obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o
obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o
obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 78b505d33bfb..85dabb86be6f 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -27,7 +27,7 @@
static DEFINE_MUTEX(afinfo_mutex);
-const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
+const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
EXPORT_SYMBOL(nf_afinfo);
int nf_register_afinfo(const struct nf_afinfo *afinfo)
@@ -105,10 +105,8 @@ EXPORT_SYMBOL(nf_register_hooks);
void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
{
- unsigned int i;
-
- for (i = 0; i < n; i++)
- nf_unregister_hook(&reg[i]);
+ while (n-- > 0)
+ nf_unregister_hook(&reg[n]);
}
EXPORT_SYMBOL(nf_unregister_hooks);
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 712ccad13344..a22dac227055 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -26,7 +26,7 @@ if IP_VS
config IP_VS_IPV6
bool "IPv6 support for IPVS"
- depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
+ depends on IPV6 = y || IP_VS = IPV6
---help---
Add IPv6 support to IPVS. This is incomplete and might be dangerous.
@@ -87,19 +87,16 @@ config IP_VS_PROTO_UDP
protocol. Say Y if unsure.
config IP_VS_PROTO_AH_ESP
- bool
- depends on UNDEFINED
+ def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH
config IP_VS_PROTO_ESP
bool "ESP load balancing support"
- select IP_VS_PROTO_AH_ESP
---help---
This option enables support for load balancing ESP (Encapsulation
Security Payload) transport protocol. Say Y if unsure.
config IP_VS_PROTO_AH
bool "AH load balancing support"
- select IP_VS_PROTO_AH_ESP
---help---
This option enables support for load balancing AH (Authentication
Header) transport protocol. Say Y if unsure.
@@ -238,7 +235,8 @@ comment 'IPVS application helper'
config IP_VS_FTP
tristate "FTP protocol helper"
- depends on IP_VS_PROTO_TCP
+ depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
+ select IP_VS_NFCT
---help---
FTP is a protocol that transfers IP address and/or port number in
the payload. In the virtual server via Network Address Translation,
@@ -250,4 +248,19 @@ config IP_VS_FTP
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+config IP_VS_NFCT
+ bool "Netfilter connection tracking"
+ depends on NF_CONNTRACK
+ ---help---
+ The Netfilter connection tracking support allows the IPVS
+ connection state to be exported to the Netfilter framework
+ for filtering purposes.
+
+config IP_VS_PE_SIP
+ tristate "SIP persistence engine"
+ depends on IP_VS_PROTO_UDP
+ depends on NF_CONNTRACK_SIP
+ ---help---
+ Allow persistence based on the SIP Call-ID
+
endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index e3baefd7066e..34ee602ddb66 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
- ip_vs_est.o ip_vs_proto.o \
- $(ip_vs_proto-objs-y)
+ ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \
+ $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
# IPVS core
@@ -32,3 +35,6 @@ obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
# IPVS application helpers
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 1cb0e834f8ff..a475edee0912 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -103,8 +103,8 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
goto out;
list_add(&inc->a_list, &app->incs_list);
- IP_VS_DBG(9, "%s application %s:%u registered\n",
- pp->name, inc->name, inc->port);
+ IP_VS_DBG(9, "%s App %s:%u registered\n",
+ pp->name, inc->name, ntohs(inc->port));
return 0;
@@ -130,7 +130,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
pp->unregister_app(inc);
IP_VS_DBG(9, "%s App %s:%u unregistered\n",
- pp->name, inc->name, inc->port);
+ pp->name, inc->name, ntohs(inc->port));
list_del(&inc->a_list);
@@ -569,49 +569,6 @@ static const struct file_operations ip_vs_app_fops = {
};
#endif
-
-/*
- * Replace a segment of data with a new segment
- */
-int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
- char *o_buf, int o_len, char *n_buf, int n_len)
-{
- int diff;
- int o_offset;
- int o_left;
-
- EnterFunction(9);
-
- diff = n_len - o_len;
- o_offset = o_buf - (char *)skb->data;
- /* The length of left data after o_buf+o_len in the skb data */
- o_left = skb->len - (o_offset + o_len);
-
- if (diff <= 0) {
- memmove(o_buf + n_len, o_buf + o_len, o_left);
- memcpy(o_buf, n_buf, n_len);
- skb_trim(skb, skb->len + diff);
- } else if (diff <= skb_tailroom(skb)) {
- skb_put(skb, diff);
- memmove(o_buf + n_len, o_buf + o_len, o_left);
- memcpy(o_buf, n_buf, n_len);
- } else {
- if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
- return -ENOMEM;
- skb_put(skb, diff);
- memmove(skb->data + o_offset + n_len,
- skb->data + o_offset + o_len, o_left);
- skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
- }
-
- /* must update the iph total length here */
- ip_hdr(skb)->tot_len = htons(skb->len);
-
- LeaveFunction(9);
- return 0;
-}
-
-
int __init ip_vs_app_init(void)
{
/* we will replace it with proc_net_ipvs_create() soon */
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index ff04e9edbed6..e9adecdc8ca4 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -148,6 +148,42 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
& ip_vs_conn_tab_mask;
}
+static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
+ bool inverse)
+{
+ const union nf_inet_addr *addr;
+ __be16 port;
+
+ if (p->pe_data && p->pe->hashkey_raw)
+ return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
+ ip_vs_conn_tab_mask;
+
+ if (likely(!inverse)) {
+ addr = p->caddr;
+ port = p->cport;
+ } else {
+ addr = p->vaddr;
+ port = p->vport;
+ }
+
+ return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
+}
+
+static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
+{
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
+ NULL, 0, &p);
+
+ if (cp->dest && cp->dest->svc->pe) {
+ p.pe = cp->dest->svc->pe;
+ p.pe_data = cp->pe_data;
+ p.pe_data_len = cp->pe_data_len;
+ }
+
+ return ip_vs_conn_hashkey_param(&p, false);
+}
/*
* Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
@@ -158,8 +194,11 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
unsigned hash;
int ret;
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return 0;
+
/* Hash by protocol, client address and port */
- hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+ hash = ip_vs_conn_hashkey_conn(cp);
ct_write_lock(hash);
spin_lock(&cp->lock);
@@ -192,7 +231,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
int ret;
/* unhash it and decrease its reference counter */
- hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+ hash = ip_vs_conn_hashkey_conn(cp);
ct_write_lock(hash);
spin_lock(&cp->lock);
@@ -215,27 +254,26 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
/*
* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
* Called for pkts coming from OUTside-to-INside.
- * s_addr, s_port: pkt source address (foreign host)
- * d_addr, d_port: pkt dest address (load balancer)
+ * p->caddr, p->cport: pkt source address (foreign host)
+ * p->vaddr, p->vport: pkt dest address (load balancer)
*/
-static inline struct ip_vs_conn *__ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+static inline struct ip_vs_conn *
+__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
unsigned hash;
struct ip_vs_conn *cp;
- hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+ hash = ip_vs_conn_hashkey_param(p, false);
ct_read_lock(hash);
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == af &&
- ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
- ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
- s_port == cp->cport && d_port == cp->vport &&
- ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
- protocol == cp->protocol) {
+ if (cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+ ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
+ p->cport == cp->cport && p->vport == cp->vport &&
+ ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
+ p->protocol == cp->protocol) {
/* HIT */
atomic_inc(&cp->refcnt);
ct_read_unlock(hash);
@@ -248,76 +286,111 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
return NULL;
}
-struct ip_vs_conn *ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
struct ip_vs_conn *cp;
- cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
- if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
- cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
- d_port);
+ cp = __ip_vs_conn_in_get(p);
+ if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
+ struct ip_vs_conn_param cport_zero_p = *p;
+ cport_zero_p.cport = 0;
+ cp = __ip_vs_conn_in_get(&cport_zero_p);
+ }
IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
- ip_vs_proto_name(protocol),
- IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
- IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
cp ? "hit" : "not hit");
return cp;
}
+static int
+ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ unsigned int proto_off, int inverse,
+ struct ip_vs_conn_param *p)
+{
+ __be16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+ if (pptr == NULL)
+ return 1;
+
+ if (likely(!inverse))
+ ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
+ &iph->daddr, pptr[1], p);
+ else
+ ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
+ &iph->saddr, pptr[0], p);
+ return 0;
+}
+
+struct ip_vs_conn *
+ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
+ struct ip_vs_protocol *pp,
+ const struct ip_vs_iphdr *iph,
+ unsigned int proto_off, int inverse)
+{
+ struct ip_vs_conn_param p;
+
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
+ return NULL;
+
+ return ip_vs_conn_in_get(&p);
+}
+EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
+
/* Get reference to connection template */
-struct ip_vs_conn *ip_vs_ct_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
{
unsigned hash;
struct ip_vs_conn *cp;
- hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+ hash = ip_vs_conn_hashkey_param(p, false);
ct_read_lock(hash);
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == af &&
- ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
+ if (p->pe_data && p->pe->ct_match) {
+ if (p->pe->ct_match(p, cp))
+ goto out;
+ continue;
+ }
+
+ if (cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
/* protocol should only be IPPROTO_IP if
- * d_addr is a fwmark */
- ip_vs_addr_equal(protocol == IPPROTO_IP ? AF_UNSPEC : af,
- d_addr, &cp->vaddr) &&
- s_port == cp->cport && d_port == cp->vport &&
+ * p->vaddr is a fwmark */
+ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
+ p->af, p->vaddr, &cp->vaddr) &&
+ p->cport == cp->cport && p->vport == cp->vport &&
cp->flags & IP_VS_CONN_F_TEMPLATE &&
- protocol == cp->protocol) {
- /* HIT */
- atomic_inc(&cp->refcnt);
+ p->protocol == cp->protocol)
goto out;
- }
}
cp = NULL;
out:
+ if (cp)
+ atomic_inc(&cp->refcnt);
ct_read_unlock(hash);
IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
- ip_vs_proto_name(protocol),
- IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
- IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
cp ? "hit" : "not hit");
return cp;
}
-/*
- * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
- * Called for pkts coming from inside-to-OUTside.
- * s_addr, s_port: pkt source address (inside host)
- * d_addr, d_port: pkt dest address (foreign host)
- */
-struct ip_vs_conn *ip_vs_conn_out_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from inside-to-OUTside.
+ * p->caddr, p->cport: pkt source address (inside host)
+ * p->vaddr, p->vport: pkt dest address (foreign host) */
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
unsigned hash;
struct ip_vs_conn *cp, *ret=NULL;
@@ -325,16 +398,16 @@ struct ip_vs_conn *ip_vs_conn_out_get
/*
* Check for "full" addressed entries
*/
- hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
+ hash = ip_vs_conn_hashkey_param(p, true);
ct_read_lock(hash);
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == af &&
- ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
- ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
- d_port == cp->cport && s_port == cp->dport &&
- protocol == cp->protocol) {
+ if (cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+ p->vport == cp->cport && p->cport == cp->dport &&
+ p->protocol == cp->protocol) {
/* HIT */
atomic_inc(&cp->refcnt);
ret = cp;
@@ -345,22 +418,37 @@ struct ip_vs_conn *ip_vs_conn_out_get
ct_read_unlock(hash);
IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
- ip_vs_proto_name(protocol),
- IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
- IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
ret ? "hit" : "not hit");
return ret;
}
+struct ip_vs_conn *
+ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
+ struct ip_vs_protocol *pp,
+ const struct ip_vs_iphdr *iph,
+ unsigned int proto_off, int inverse)
+{
+ struct ip_vs_conn_param p;
+
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
+ return NULL;
+
+ return ip_vs_conn_out_get(&p);
+}
+EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
/*
* Put back the conn and restart its timer with its timeout
*/
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
- /* reset it expire in its timeout */
- mod_timer(&cp->timer, jiffies+cp->timeout);
+ unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
+ 0 : cp->timeout;
+ mod_timer(&cp->timer, jiffies+t);
__ip_vs_conn_put(cp);
}
@@ -456,6 +544,8 @@ static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
+ unsigned int conn_flags;
+
/* if dest is NULL, then return directly */
if (!dest)
return;
@@ -463,16 +553,20 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
/* Increase the refcnt counter of the dest */
atomic_inc(&dest->refcnt);
+ conn_flags = atomic_read(&dest->conn_flags);
+ if (cp->protocol != IPPROTO_UDP)
+ conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
/* Bind with the destination and its corresponding transmitter */
- if ((cp->flags & IP_VS_CONN_F_SYNC) &&
- (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
+ if (cp->flags & IP_VS_CONN_F_SYNC) {
/* if the connection is not template and is created
* by sync, preserve the activity flag.
*/
- cp->flags |= atomic_read(&dest->conn_flags) &
- (~IP_VS_CONN_F_INACTIVE);
- else
- cp->flags |= atomic_read(&dest->conn_flags);
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+ /* connections inherit forwarding method from dest */
+ cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
+ }
+ cp->flags |= conn_flags;
cp->dest = dest;
IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
@@ -653,7 +747,7 @@ static void ip_vs_conn_expire(unsigned long data)
/*
* unhash it if it is hashed in the conn table
*/
- if (!ip_vs_conn_unhash(cp))
+ if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
goto expire_later;
/*
@@ -668,6 +762,10 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->control)
ip_vs_control_del(cp);
+ if (cp->flags & IP_VS_CONN_F_NFCT)
+ ip_vs_conn_drop_conntrack(cp);
+
+ kfree(cp->pe_data);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
@@ -702,13 +800,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
* Create a new connection entry and hash it into the ip_vs_conn_tab
*/
struct ip_vs_conn *
-ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
- const union nf_inet_addr *vaddr, __be16 vport,
+ip_vs_conn_new(const struct ip_vs_conn_param *p,
const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
struct ip_vs_dest *dest)
{
struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+ struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
@@ -718,17 +815,21 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
INIT_LIST_HEAD(&cp->c_list);
setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
- cp->af = af;
- cp->protocol = proto;
- ip_vs_addr_copy(af, &cp->caddr, caddr);
- cp->cport = cport;
- ip_vs_addr_copy(af, &cp->vaddr, vaddr);
- cp->vport = vport;
+ cp->af = p->af;
+ cp->protocol = p->protocol;
+ ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+ cp->cport = p->cport;
+ ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+ cp->vport = p->vport;
/* proto should only be IPPROTO_IP if d_addr is a fwmark */
- ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af,
+ ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
&cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
+ if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+ cp->pe_data = p->pe_data;
+ cp->pe_data_len = p->pe_data_len;
+ }
spin_lock_init(&cp->lock);
/*
@@ -754,7 +855,7 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
- if (af == AF_INET6)
+ if (p->af == AF_INET6)
ip_vs_bind_xmit_v6(cp);
else
#endif
@@ -763,13 +864,22 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
if (unlikely(pp && atomic_read(&pp->appcnt)))
ip_vs_bind_app(cp, pp);
+ /*
+ * Allow conntrack to be preserved. By default, conntrack
+ * is created and destroyed for every packet.
+ * Sometimes keeping conntrack can be useful for
+ * IP_VS_CONN_F_ONE_PACKET too.
+ */
+
+ if (ip_vs_conntrack_enabled())
+ cp->flags |= IP_VS_CONN_F_NFCT;
+
/* Hash it in the ip_vs_conn_tab finally */
ip_vs_conn_hash(cp);
return cp;
}
-
/*
* /proc/net/ip_vs_conn entries
*/
@@ -785,7 +895,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
if (pos-- == 0) {
seq->private = &ip_vs_conn_tab[idx];
- return cp;
+ return cp;
}
}
ct_read_unlock_bh(idx);
@@ -842,30 +952,45 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN)
seq_puts(seq,
- "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n");
+ "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
else {
const struct ip_vs_conn *cp = v;
+ char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
+ size_t len = 0;
+
+ if (cp->dest && cp->pe_data &&
+ cp->dest->svc->pe->show_pe_data) {
+ pe_data[0] = ' ';
+ len = strlen(cp->dest->svc->pe->name);
+ memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+ pe_data[len + 1] = ' ';
+ len += 2;
+ len += cp->dest->svc->pe->show_pe_data(cp,
+ pe_data + len);
+ }
+ pe_data[len] = '\0';
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
- seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n",
+ seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+ "%pI6 %04X %-11s %7lu%s\n",
ip_vs_proto_name(cp->protocol),
&cp->caddr.in6, ntohs(cp->cport),
&cp->vaddr.in6, ntohs(cp->vport),
&cp->daddr.in6, ntohs(cp->dport),
ip_vs_state_name(cp->protocol, cp->state),
- (cp->timer.expires-jiffies)/HZ);
+ (cp->timer.expires-jiffies)/HZ, pe_data);
else
#endif
seq_printf(seq,
"%-3s %08X %04X %08X %04X"
- " %08X %04X %-11s %7lu\n",
+ " %08X %04X %-11s %7lu%s\n",
ip_vs_proto_name(cp->protocol),
ntohl(cp->caddr.ip), ntohs(cp->cport),
ntohl(cp->vaddr.ip), ntohs(cp->vport),
ntohl(cp->daddr.ip), ntohs(cp->dport),
ip_vs_state_name(cp->protocol, cp->state),
- (cp->timer.expires-jiffies)/HZ);
+ (cp->timer.expires-jiffies)/HZ, pe_data);
}
return 0;
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 1cd6e3fd058b..b4e51e9c5a04 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -40,6 +40,7 @@
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
#include <net/route.h>
+#include <net/ip6_checksum.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
@@ -47,6 +48,7 @@
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <linux/netfilter_ipv6.h>
+#include <net/ip6_route.h>
#endif
#include <net/ip_vs.h>
@@ -54,7 +56,6 @@
EXPORT_SYMBOL(register_ip_vs_scheduler);
EXPORT_SYMBOL(unregister_ip_vs_scheduler);
-EXPORT_SYMBOL(ip_vs_skb_replace);
EXPORT_SYMBOL(ip_vs_proto_name);
EXPORT_SYMBOL(ip_vs_conn_new);
EXPORT_SYMBOL(ip_vs_conn_in_get);
@@ -176,6 +177,18 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
return pp->state_transition(cp, direction, skb, pp);
}
+static inline void
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+ struct sk_buff *skb, int protocol,
+ const union nf_inet_addr *caddr, __be16 cport,
+ const union nf_inet_addr *vaddr, __be16 vport,
+ struct ip_vs_conn_param *p)
+{
+ ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+ p->pe = svc->pe;
+ if (p->pe && p->pe->fill_param)
+ p->pe->fill_param(p, skb);
+}
/*
* IPVS persistent scheduling function
@@ -186,14 +199,16 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
*/
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc,
- const struct sk_buff *skb,
+ struct sk_buff *skb,
__be16 ports[2])
{
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_dest *dest;
struct ip_vs_conn *ct;
- __be16 dport; /* destination port to forward */
+ __be16 dport = 0; /* destination port to forward */
+ unsigned int flags;
+ struct ip_vs_conn_param param;
union nf_inet_addr snet; /* source network of the client,
after masking */
@@ -226,129 +241,85 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
* is created for other persistent services.
*/
- if (ports[1] == svc->port) {
- /* Check if a template already exists */
- if (svc->port != FTPPORT)
- ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
- &iph.daddr, ports[1]);
- else
- ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
- &iph.daddr, 0);
-
- if (!ct || !ip_vs_check_template(ct)) {
- /*
- * No template found or the dest of the connection
- * template is not available.
- */
- dest = svc->scheduler->schedule(svc, skb);
- if (dest == NULL) {
- IP_VS_DBG(1, "p-schedule: no dest found.\n");
- return NULL;
- }
-
- /*
- * Create a template like <protocol,caddr,0,
- * vaddr,vport,daddr,dport> for non-ftp service,
- * and <protocol,caddr,0,vaddr,0,daddr,0>
- * for ftp service.
+ {
+ int protocol = iph.protocol;
+ const union nf_inet_addr *vaddr = &iph.daddr;
+ const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+ __be16 vport = 0;
+
+ if (ports[1] == svc->port) {
+ /* non-FTP template:
+ * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+ * FTP template:
+ * <protocol, caddr, 0, vaddr, 0, daddr, 0>
*/
if (svc->port != FTPPORT)
- ct = ip_vs_conn_new(svc->af, iph.protocol,
- &snet, 0,
- &iph.daddr,
- ports[1],
- &dest->addr, dest->port,
- IP_VS_CONN_F_TEMPLATE,
- dest);
- else
- ct = ip_vs_conn_new(svc->af, iph.protocol,
- &snet, 0,
- &iph.daddr, 0,
- &dest->addr, 0,
- IP_VS_CONN_F_TEMPLATE,
- dest);
- if (ct == NULL)
- return NULL;
-
- ct->timeout = svc->timeout;
+ vport = ports[1];
} else {
- /* set destination with the found template */
- dest = ct->dest;
- }
- dport = dest->port;
- } else {
- /*
- * Note: persistent fwmark-based services and persistent
- * port zero service are handled here.
- * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
- * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
- */
- if (svc->fwmark) {
- union nf_inet_addr fwmark = {
- .ip = htonl(svc->fwmark)
- };
-
- ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
- &fwmark, 0);
- } else
- ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
- &iph.daddr, 0);
-
- if (!ct || !ip_vs_check_template(ct)) {
- /*
- * If it is not persistent port zero, return NULL,
- * otherwise create a connection template.
+ /* Note: persistent fwmark-based services and
+ * persistent port zero service are handled here.
+ * fwmark template:
+ * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+ * port zero template:
+ * <protocol,caddr,0,vaddr,0,daddr,0>
*/
- if (svc->port)
- return NULL;
-
- dest = svc->scheduler->schedule(svc, skb);
- if (dest == NULL) {
- IP_VS_DBG(1, "p-schedule: no dest found.\n");
- return NULL;
+ if (svc->fwmark) {
+ protocol = IPPROTO_IP;
+ vaddr = &fwmark;
}
+ }
+ ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+ vaddr, vport, &param);
+ }
- /*
- * Create a template according to the service
- */
- if (svc->fwmark) {
- union nf_inet_addr fwmark = {
- .ip = htonl(svc->fwmark)
- };
-
- ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
- &snet, 0,
- &fwmark, 0,
- &dest->addr, 0,
- IP_VS_CONN_F_TEMPLATE,
- dest);
- } else
- ct = ip_vs_conn_new(svc->af, iph.protocol,
- &snet, 0,
- &iph.daddr, 0,
- &dest->addr, 0,
- IP_VS_CONN_F_TEMPLATE,
- dest);
- if (ct == NULL)
- return NULL;
-
- ct->timeout = svc->timeout;
- } else {
- /* set destination with the found template */
- dest = ct->dest;
+ /* Check if a template already exists */
+ ct = ip_vs_ct_in_get(&param);
+ if (!ct || !ip_vs_check_template(ct)) {
+ /* No template found or the dest of the connection
+ * template is not available.
+ */
+ dest = svc->scheduler->schedule(svc, skb);
+ if (!dest) {
+ IP_VS_DBG(1, "p-schedule: no dest found.\n");
+ kfree(param.pe_data);
+ return NULL;
}
- dport = ports[1];
+
+ if (ports[1] == svc->port && svc->port != FTPPORT)
+ dport = dest->port;
+
+ /* Create a template
+ * This adds param.pe_data to the template,
+ * and thus param.pe_data will be destroyed
+ * when the template expires */
+ ct = ip_vs_conn_new(&param, &dest->addr, dport,
+ IP_VS_CONN_F_TEMPLATE, dest);
+ if (ct == NULL) {
+ kfree(param.pe_data);
+ return NULL;
+ }
+
+ ct->timeout = svc->timeout;
+ } else {
+ /* set destination with the found template */
+ dest = ct->dest;
+ kfree(param.pe_data);
}
+ dport = ports[1];
+ if (dport == svc->port && dest->port)
+ dport = dest->port;
+
+ flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+ && iph.protocol == IPPROTO_UDP)?
+ IP_VS_CONN_F_ONE_PACKET : 0;
+
/*
* Create a new connection according to the template
*/
- cp = ip_vs_conn_new(svc->af, iph.protocol,
- &iph.saddr, ports[0],
- &iph.daddr, ports[1],
- &dest->addr, dport,
- 0,
- dest);
+ ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
+ &iph.daddr, ports[1], &param);
+ cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
if (cp == NULL) {
ip_vs_conn_put(ct);
return NULL;
@@ -372,23 +343,53 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* Protocols supported: TCP, UDP
*/
struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+ struct ip_vs_protocol *pp, int *ignored)
{
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_dest *dest;
__be16 _ports[2], *pptr;
+ unsigned int flags;
+ *ignored = 1;
ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
if (pptr == NULL)
return NULL;
/*
+ * FTPDATA needs this check when using local real server.
+ * Never schedule Active FTPDATA connections from real server.
+ * For LVS-NAT they must be already created. For other methods
+ * with persistence the connection is created on SYN+ACK.
+ */
+ if (pptr[0] == FTPDATA) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+ "Not scheduling FTPDATA");
+ return NULL;
+ }
+
+ /*
+ * Do not schedule replies from local real server. It is risky
+ * for fwmark services but mostly for persistent services.
+ */
+ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
+ (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+ "Not scheduling reply for existing connection");
+ __ip_vs_conn_put(cp);
+ return NULL;
+ }
+
+ /*
* Persistent service
*/
- if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
+ *ignored = 0;
return ip_vs_sched_persist(svc, skb, pptr);
+ }
/*
* Non-persistent service
@@ -401,23 +402,31 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
return NULL;
}
+ *ignored = 0;
+
dest = svc->scheduler->schedule(svc, skb);
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
}
+ flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+ && iph.protocol == IPPROTO_UDP)?
+ IP_VS_CONN_F_ONE_PACKET : 0;
+
/*
* Create a connection entry.
*/
- cp = ip_vs_conn_new(svc->af, iph.protocol,
- &iph.saddr, pptr[0],
- &iph.daddr, pptr[1],
- &dest->addr, dest->port ? dest->port : pptr[1],
- 0,
- dest);
- if (cp == NULL)
- return NULL;
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
+ pptr[0], &iph.daddr, pptr[1], &p);
+ cp = ip_vs_conn_new(&p, &dest->addr,
+ dest->port ? dest->port : pptr[1],
+ flags, dest);
+ if (!cp)
+ return NULL;
+ }
IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
"d:%s:%u conn->flags:%X conn->refcnt:%d\n",
@@ -464,20 +473,26 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
int ret, cs;
struct ip_vs_conn *cp;
+ unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+ iph.protocol == IPPROTO_UDP)?
+ IP_VS_CONN_F_ONE_PACKET : 0;
union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
ip_vs_service_put(svc);
/* create a new connection entry */
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
- cp = ip_vs_conn_new(svc->af, iph.protocol,
- &iph.saddr, pptr[0],
- &iph.daddr, pptr[1],
- &daddr, 0,
- IP_VS_CONN_F_BYPASS,
- NULL);
- if (cp == NULL)
- return NF_DROP;
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(svc->af, iph.protocol,
+ &iph.saddr, pptr[0],
+ &iph.daddr, pptr[1], &p);
+ cp = ip_vs_conn_new(&p, &daddr, 0,
+ IP_VS_CONN_F_BYPASS | flags,
+ NULL);
+ if (!cp)
+ return NF_DROP;
+ }
/* statistics */
ip_vs_in_stats(cp, skb);
@@ -515,38 +530,32 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
*/
#ifdef CONFIG_IP_VS_IPV6
- if (svc->af == AF_INET6)
+ if (svc->af == AF_INET6) {
+ if (!skb->dev) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net->loopback_dev;
+ }
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
- else
+ } else
#endif
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
return NF_DROP;
}
-
-/*
- * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
- * chain, and is used for VS/NAT.
- * It detects packets for VS/NAT connections and sends the packets
- * immediately. This can avoid that iptable_nat mangles the packets
- * for VS/NAT.
- */
-static unsigned int ip_vs_post_routing(unsigned int hooknum,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
{
- if (!skb->ipvs_property)
- return NF_ACCEPT;
- /* The packet was sent from IPVS, exit this chain */
- return NF_STOP;
+ return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
}
-__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
+static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
{
- return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
+ if (NF_INET_LOCAL_IN == hooknum)
+ return IP_DEFRAG_VS_IN;
+ if (NF_INET_FORWARD == hooknum)
+ return IP_DEFRAG_VS_FWD;
+ return IP_DEFRAG_VS_OUT;
}
static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
@@ -609,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (inout)
- IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
"Forwarding altered outgoing ICMP");
else
- IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
"Forwarding altered incoming ICMP");
}
@@ -646,17 +655,21 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
}
/* And finally the ICMP checksum */
- icmph->icmp6_cksum = 0;
- /* TODO IPv6: is this correct for ICMPv6? */
- ip_vs_checksum_complete(skb, icmp_offset);
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
+ skb->len - icmp_offset,
+ IPPROTO_ICMPV6, 0);
+ skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
+ skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+ skb->ip_summed = CHECKSUM_PARTIAL;
if (inout)
- IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
- "Forwarding altered outgoing ICMPv6");
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+ (void *)ciph - (void *)iph,
+ "Forwarding altered outgoing ICMPv6");
else
- IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
- "Forwarding altered incoming ICMPv6");
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+ (void *)ciph - (void *)iph,
+ "Forwarding altered incoming ICMPv6");
}
#endif
@@ -697,10 +710,25 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
#endif
ip_vs_nat_icmp(skb, pp, cp, 1);
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+ goto out;
+ } else
+#endif
+ if ((sysctl_ip_vs_snat_reroute ||
+ skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+ ip_route_me_harder(skb, RTN_LOCAL) != 0)
+ goto out;
+
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
skb->ipvs_property = 1;
+ if (!(cp->flags & IP_VS_CONN_F_NFCT))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 0);
verdict = NF_ACCEPT;
out:
@@ -714,7 +742,8 @@ out:
* Find any that might be relevant, check against existing connections.
* Currently handles error types - unreachable, quench, ttl exceeded.
*/
-static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
+ unsigned int hooknum)
{
struct iphdr *iph;
struct icmphdr _icmph, *ic;
@@ -729,7 +758,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
/* reassemble IP fragments */
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
- if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+ if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -772,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
pp->dont_defrag))
return NF_ACCEPT;
- IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+ "Checking outgoing ICMP for");
offset += cih->ihl * 4;
@@ -788,7 +818,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
}
#ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
+ unsigned int hooknum)
{
struct ipv6hdr *iph;
struct icmp6hdr _icmph, *ic;
@@ -804,7 +835,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
/* reassemble IP fragments */
if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
- if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
+ if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -847,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
return NF_ACCEPT;
- IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+ "Checking outgoing ICMPv6 for");
offset += sizeof(struct ipv6hdr);
@@ -895,7 +927,7 @@ static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, int ihl)
{
- IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
+ IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
if (!skb_make_writable(skb, ihl))
goto drop;
@@ -914,6 +946,15 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
ip_send_check(ip_hdr(skb));
}
+ /*
+ * nf_iterate does not expect change in the skb->dst->dev.
+ * It looks like it is not fatal to enable this code for hooks
+ * where our handlers are at the end of the chain list and
+ * when all next handlers use skb->dst->dev and not outdev.
+ * It will definitely route properly the inout NAT traffic
+ * when multiple paths are used.
+ */
+
/* For policy routing, packets originating from this
* machine itself may be routed differently to packets
* passing through. We want this packet to be routed as
@@ -922,20 +963,25 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
*/
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
- if (ip6_route_me_harder(skb) != 0)
+ if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
goto drop;
} else
#endif
- if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+ if ((sysctl_ip_vs_snat_reroute ||
+ skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+ ip_route_me_harder(skb, RTN_LOCAL) != 0)
goto drop;
- IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
+ IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
- ip_vs_conn_put(cp);
-
skb->ipvs_property = 1;
+ if (!(cp->flags & IP_VS_CONN_F_NFCT))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 0);
+ ip_vs_conn_put(cp);
LeaveFunction(11);
return NF_ACCEPT;
@@ -943,35 +989,46 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
drop:
ip_vs_conn_put(cp);
kfree_skb(skb);
+ LeaveFunction(11);
return NF_STOLEN;
}
/*
- * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
* Check if outgoing packet belongs to the established ip_vs_conn.
*/
static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
{
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
- int af;
EnterFunction(11);
- af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
+ /* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
+ /* Bad... Do not break raw sockets */
+ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ af == AF_INET)) {
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ return NF_ACCEPT;
+ }
+
+ if (unlikely(!skb_dst(skb)))
+ return NF_ACCEPT;
+
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
- int related, verdict = ip_vs_out_icmp_v6(skb, &related);
+ int related;
+ int verdict = ip_vs_out_icmp_v6(skb, &related,
+ hooknum);
if (related)
return verdict;
@@ -980,7 +1037,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
} else
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
- int related, verdict = ip_vs_out_icmp(skb, &related);
+ int related;
+ int verdict = ip_vs_out_icmp(skb, &related, hooknum);
if (related)
return verdict;
@@ -994,19 +1052,19 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
- if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
- int related, verdict = ip_vs_out_icmp_v6(skb, &related);
-
- if (related)
- return verdict;
-
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+ if (ip_vs_gather_frags_v6(skb,
+ ip_vs_defrag_user(hooknum)))
+ return NF_STOLEN;
}
+
+ ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
} else
#endif
if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
!pp->dont_defrag)) {
- if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+ if (ip_vs_gather_frags(skb,
+ ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
@@ -1017,55 +1075,123 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
*/
cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
- if (unlikely(!cp)) {
- if (sysctl_ip_vs_nat_icmp_send &&
- (pp->protocol == IPPROTO_TCP ||
- pp->protocol == IPPROTO_UDP ||
- pp->protocol == IPPROTO_SCTP)) {
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, iph.len,
- sizeof(_ports), _ports);
- if (pptr == NULL)
- return NF_ACCEPT; /* Not for me */
- if (ip_vs_lookup_real_service(af, iph.protocol,
- &iph.saddr,
- pptr[0])) {
- /*
- * Notify the real server: there is no
- * existing entry if it is not RST
- * packet or not TCP packet.
- */
- if ((iph.protocol != IPPROTO_TCP &&
- iph.protocol != IPPROTO_SCTP)
- || ((iph.protocol == IPPROTO_TCP
- && !is_tcp_reset(skb, iph.len))
- || (iph.protocol == IPPROTO_SCTP
- && !is_sctp_abort(skb,
- iph.len)))) {
+ if (likely(cp))
+ return handle_response(af, skb, pp, cp, iph.len);
+ if (sysctl_ip_vs_nat_icmp_send &&
+ (pp->protocol == IPPROTO_TCP ||
+ pp->protocol == IPPROTO_UDP ||
+ pp->protocol == IPPROTO_SCTP)) {
+ __be16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, iph.len,
+ sizeof(_ports), _ports);
+ if (pptr == NULL)
+ return NF_ACCEPT; /* Not for me */
+ if (ip_vs_lookup_real_service(af, iph.protocol,
+ &iph.saddr,
+ pptr[0])) {
+ /*
+ * Notify the real server: there is no
+ * existing entry if it is not RST
+ * packet or not TCP packet.
+ */
+ if ((iph.protocol != IPPROTO_TCP &&
+ iph.protocol != IPPROTO_SCTP)
+ || ((iph.protocol == IPPROTO_TCP
+ && !is_tcp_reset(skb, iph.len))
+ || (iph.protocol == IPPROTO_SCTP
+ && !is_sctp_abort(skb,
+ iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
- if (af == AF_INET6)
- icmpv6_send(skb,
- ICMPV6_DEST_UNREACH,
- ICMPV6_PORT_UNREACH,
- 0);
- else
+ if (af == AF_INET6) {
+ struct net *net =
+ dev_net(skb_dst(skb)->dev);
+
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
+ icmpv6_send(skb,
+ ICMPV6_DEST_UNREACH,
+ ICMPV6_PORT_UNREACH,
+ 0);
+ } else
#endif
- icmp_send(skb,
- ICMP_DEST_UNREACH,
- ICMP_PORT_UNREACH, 0);
- return NF_DROP;
- }
+ icmp_send(skb,
+ ICMP_DEST_UNREACH,
+ ICMP_PORT_UNREACH, 0);
+ return NF_DROP;
}
}
- IP_VS_DBG_PKT(12, pp, skb, 0,
- "packet continues traversal as normal");
- return NF_ACCEPT;
}
+ IP_VS_DBG_PKT(12, af, pp, skb, 0,
+ "ip_vs_out: packet continues traversal as normal");
+ return NF_ACCEPT;
+}
- return handle_response(af, skb, pp, cp, iph.len);
+/*
+ * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ * used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return ip_vs_out(hooknum, skb, AF_INET);
+}
+
+/*
+ * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ unsigned int verdict;
+
+ /* Disable BH in LOCAL_OUT until all places are fixed */
+ local_bh_disable();
+ verdict = ip_vs_out(hooknum, skb, AF_INET);
+ local_bh_enable();
+ return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ * used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return ip_vs_out(hooknum, skb, AF_INET6);
+}
+
+/*
+ * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ unsigned int verdict;
+
+ /* Disable BH in LOCAL_OUT until all places are fixed */
+ local_bh_disable();
+ verdict = ip_vs_out(hooknum, skb, AF_INET6);
+ local_bh_enable();
+ return verdict;
}
+#endif
/*
* Handle ICMP messages in the outside-to-inside direction (incoming).
@@ -1089,8 +1215,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
/* reassemble IP fragments */
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
- if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
- IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
+ if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -1133,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
pp->dont_defrag))
return NF_ACCEPT;
- IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+ "Checking incoming ICMP for");
offset += cih->ihl * 4;
@@ -1167,7 +1293,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
offset += 2 * sizeof(__u16);
verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
- /* do not touch skb anymore */
+ /* LOCALNODE from FORWARD hook is not supported */
+ if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+ skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
+ IP_VS_DBG(1, "%s(): "
+ "local delivery to %pI4 but in FORWARD\n",
+ __func__, &skb_rtable(skb)->rt_dst);
+ verdict = NF_DROP;
+ }
out:
__ip_vs_conn_put(cp);
@@ -1188,14 +1321,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
struct ip_vs_protocol *pp;
unsigned int offset, verdict;
union nf_inet_addr snet;
+ struct rt6_info *rt;
*related = 1;
/* reassemble IP fragments */
if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
- if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
- IP_DEFRAG_VS_IN :
- IP_DEFRAG_VS_FWD))
+ if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -1238,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
return NF_ACCEPT;
- IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+ "Checking incoming ICMPv6 for");
offset += sizeof(struct ipv6hdr);
@@ -1266,7 +1399,15 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
IPPROTO_SCTP == cih->nexthdr)
offset += 2 * sizeof(__u16);
verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
- /* do not touch skb anymore */
+ /* LOCALNODE from FORWARD hook is not supported */
+ if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+ (rt = (struct rt6_info *) skb_dst(skb)) &&
+ rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
+ IP_VS_DBG(1, "%s(): "
+ "local delivery to %pI6 but in FORWARD\n",
+ __func__, &rt->rt6i_dst);
+ verdict = NF_DROP;
+ }
__ip_vs_conn_put(cp);
@@ -1280,35 +1421,49 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
* and send it on its way...
*/
static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
{
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
- int ret, restart, af, pkts;
+ int ret, restart, pkts;
- af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ /* Already marked as IPVS request or reply? */
+ if (skb->ipvs_property)
+ return NF_ACCEPT;
/*
- * Big tappo: only PACKET_HOST, including loopback for local client
- * Don't handle local packets on IPv6 for now
+ * Big tappo:
+ * - remote client: only PACKET_HOST
+ * - route: used for struct net when skb->dev is unset
*/
- if (unlikely(skb->pkt_type != PACKET_HOST)) {
- IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
- skb->pkt_type,
- iph.protocol,
- IP_VS_DBG_ADDR(af, &iph.daddr));
+ if (unlikely((skb->pkt_type != PACKET_HOST &&
+ hooknum != NF_INET_LOCAL_OUT) ||
+ !skb_dst(skb))) {
+ ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
+ " ignored in hook %u\n",
+ skb->pkt_type, iph.protocol,
+ IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
return NF_ACCEPT;
}
+ ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+ /* Bad... Do not break raw sockets */
+ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ af == AF_INET)) {
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ return NF_ACCEPT;
+ }
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
- int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
+ int related;
+ int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
if (related)
return verdict;
@@ -1317,7 +1472,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
} else
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
- int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
+ int related;
+ int verdict = ip_vs_in_icmp(skb, &related, hooknum);
if (related)
return verdict;
@@ -1337,23 +1493,18 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
if (unlikely(!cp)) {
int v;
- /* For local client packets, it could be a response */
- cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
- if (cp)
- return handle_response(af, skb, pp, cp, iph.len);
-
if (!pp->conn_schedule(af, skb, pp, &v, &cp))
return v;
}
if (unlikely(!cp)) {
/* sorry, all this trouble for a no-hit :) */
- IP_VS_DBG_PKT(12, pp, skb, 0,
- "packet continues traversal as normal");
+ IP_VS_DBG_PKT(12, af, pp, skb, 0,
+ "ip_vs_in: packet continues traversal as normal");
return NF_ACCEPT;
}
- IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
+ IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -1389,8 +1540,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
cp->protocol == IPPROTO_SCTP) {
if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
- (atomic_read(&cp->in_pkts) %
- sysctl_ip_vs_sync_threshold[1]
+ (pkts % sysctl_ip_vs_sync_threshold[1]
== sysctl_ip_vs_sync_threshold[0])) ||
(cp->old_state != cp->state &&
((cp->state == IP_VS_SCTP_S_CLOSED) ||
@@ -1401,7 +1551,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
}
}
- if (af == AF_INET &&
+ /* Keep this block last: TCP and others with pp->num_states <= 1 */
+ else if (af == AF_INET &&
(ip_vs_sync_state & IP_VS_STATE_MASTER) &&
(((cp->protocol != IPPROTO_TCP ||
cp->state == IP_VS_TCP_S_ESTABLISHED) &&
@@ -1420,6 +1571,72 @@ out:
return ret;
}
+/*
+ * AF_INET handler in NF_INET_LOCAL_IN chain
+ * Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return ip_vs_in(hooknum, skb, AF_INET);
+}
+
+/*
+ * AF_INET handler in NF_INET_LOCAL_OUT chain
+ * Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ unsigned int verdict;
+
+ /* Disable BH in LOCAL_OUT until all places are fixed */
+ local_bh_disable();
+ verdict = ip_vs_in(hooknum, skb, AF_INET);
+ local_bh_enable();
+ return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ * AF_INET6 handler in NF_INET_LOCAL_IN chain
+ * Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return ip_vs_in(hooknum, skb, AF_INET6);
+}
+
+/*
+ * AF_INET6 handler in NF_INET_LOCAL_OUT chain
+ * Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ unsigned int verdict;
+
+ /* Disable BH in LOCAL_OUT until all places are fixed */
+ local_bh_disable();
+ verdict = ip_vs_in(hooknum, skb, AF_INET6);
+ local_bh_enable();
+ return verdict;
+}
+
+#endif
+
/*
* It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
@@ -1460,23 +1677,39 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply4,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = 99,
+ },
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
- .hook = ip_vs_in,
+ .hook = ip_vs_remote_request4,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = 100,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = 101,
},
- /* After packet filtering, change source only for VS/NAT */
+ /* Before ip_vs_in, change source only for VS/NAT */
+ {
+ .hook = ip_vs_local_reply4,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = -99,
+ },
+ /* After mangle, schedule and forward local requests */
{
- .hook = ip_vs_out,
+ .hook = ip_vs_local_request4,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_INET_FORWARD,
- .priority = 100,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = -98,
},
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1484,35 +1717,51 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
.hook = ip_vs_forward_icmp,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_INET_FORWARD,
- .priority = 99,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 99,
},
- /* Before the netfilter connection tracking, exit from POST_ROUTING */
+ /* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_post_routing,
+ .hook = ip_vs_reply4,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_NAT_SRC-1,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 100,
},
#ifdef CONFIG_IP_VS_IPV6
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply6,
+ .owner = THIS_MODULE,
+ .pf = PF_INET6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = 99,
+ },
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
- .hook = ip_vs_in,
+ .hook = ip_vs_remote_request6,
.owner = THIS_MODULE,
.pf = PF_INET6,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = 100,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = 101,
},
- /* After packet filtering, change source only for VS/NAT */
+ /* Before ip_vs_in, change source only for VS/NAT */
+ {
+ .hook = ip_vs_local_reply6,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = -99,
+ },
+ /* After mangle, schedule and forward local requests */
{
- .hook = ip_vs_out,
+ .hook = ip_vs_local_request6,
.owner = THIS_MODULE,
.pf = PF_INET6,
- .hooknum = NF_INET_FORWARD,
- .priority = 100,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = -98,
},
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1520,16 +1769,16 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
.hook = ip_vs_forward_icmp_v6,
.owner = THIS_MODULE,
.pf = PF_INET6,
- .hooknum = NF_INET_FORWARD,
- .priority = 99,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 99,
},
- /* Before the netfilter connection tracking, exit from POST_ROUTING */
+ /* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_post_routing,
+ .hook = ip_vs_reply6,
.owner = THIS_MODULE,
.pf = PF_INET6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP6_PRI_NAT_SRC-1,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 100,
},
#endif
};
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 36dc1d88c2fa..5f5daa30b0af 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -61,7 +61,7 @@ static DEFINE_RWLOCK(__ip_vs_svc_lock);
static DEFINE_RWLOCK(__ip_vs_rs_lock);
/* lock for state and timeout tables */
-static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
+static DEFINE_SPINLOCK(ip_vs_securetcp_lock);
/* lock for drop entry handling */
static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
@@ -88,6 +88,10 @@ int sysctl_ip_vs_expire_nodest_conn = 0;
int sysctl_ip_vs_expire_quiescent_template = 0;
int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
int sysctl_ip_vs_nat_icmp_send = 0;
+#ifdef CONFIG_IP_VS_NFCT
+int sysctl_ip_vs_conntrack;
+#endif
+int sysctl_ip_vs_snat_reroute = 1;
#ifdef CONFIG_IP_VS_DEBUG
@@ -204,7 +208,7 @@ static void update_defense_level(void)
spin_unlock(&__ip_vs_droppacket_lock);
/* secure_tcp */
- write_lock(&__ip_vs_securetcp_lock);
+ spin_lock(&ip_vs_securetcp_lock);
switch (sysctl_ip_vs_secure_tcp) {
case 0:
if (old_secure_tcp >= 2)
@@ -238,7 +242,7 @@ static void update_defense_level(void)
old_secure_tcp = sysctl_ip_vs_secure_tcp;
if (to_change >= 0)
ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
- write_unlock(&__ip_vs_securetcp_lock);
+ spin_unlock(&ip_vs_securetcp_lock);
local_bh_enable();
}
@@ -401,7 +405,7 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
* Get service by {proto,addr,port} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
+__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
__be16 vport)
{
unsigned hash;
@@ -416,7 +420,6 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
&& (svc->port == vport)
&& (svc->protocol == protocol)) {
/* HIT */
- atomic_inc(&svc->usecnt);
return svc;
}
}
@@ -429,7 +432,7 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
* Get service by {fwmark} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_svc_fwm_get(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(int af, __u32 fwmark)
{
unsigned hash;
struct ip_vs_service *svc;
@@ -440,7 +443,6 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark)
list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
if (svc->fwmark == fwmark && svc->af == af) {
/* HIT */
- atomic_inc(&svc->usecnt);
return svc;
}
}
@@ -459,14 +461,14 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
/*
* Check the table hashed by fwmark first
*/
- if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
+ if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
goto out;
/*
* Check the table hashed by <protocol,addr,port>
* for "full" addressed entries
*/
- svc = __ip_vs_service_get(af, protocol, vaddr, vport);
+ svc = __ip_vs_service_find(af, protocol, vaddr, vport);
if (svc == NULL
&& protocol == IPPROTO_TCP
@@ -476,7 +478,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
*/
- svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
+ svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
}
if (svc == NULL
@@ -484,10 +486,12 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
/*
* Check if the catch-all port (port zero) exists
*/
- svc = __ip_vs_service_get(af, protocol, vaddr, 0);
+ svc = __ip_vs_service_find(af, protocol, vaddr, 0);
}
out:
+ if (svc)
+ atomic_inc(&svc->usecnt);
read_unlock(&__ip_vs_svc_lock);
IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
@@ -506,14 +510,19 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
dest->svc = svc;
}
-static inline void
+static void
__ip_vs_unbind_svc(struct ip_vs_dest *dest)
{
struct ip_vs_service *svc = dest->svc;
dest->svc = NULL;
- if (atomic_dec_and_test(&svc->refcnt))
+ if (atomic_dec_and_test(&svc->refcnt)) {
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+ svc->fwmark,
+ IP_VS_DBG_ADDR(svc->af, &svc->addr),
+ ntohs(svc->port), atomic_read(&svc->usecnt));
kfree(svc);
+ }
}
@@ -758,31 +767,18 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
* Update a destination in the given service
*/
static void
-__ip_vs_update_dest(struct ip_vs_service *svc,
- struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+ struct ip_vs_dest_user_kern *udest, int add)
{
int conn_flags;
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
- conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
-
- /* check if local node and update the flags */
-#ifdef CONFIG_IP_VS_IPV6
- if (svc->af == AF_INET6) {
- if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
- conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
- | IP_VS_CONN_F_LOCALNODE;
- }
- } else
-#endif
- if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
- conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
- | IP_VS_CONN_F_LOCALNODE;
- }
+ conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
+ conn_flags |= IP_VS_CONN_F_INACTIVE;
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
- if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
+ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/*
@@ -813,6 +809,29 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
dest->u_threshold = udest->u_threshold;
dest->l_threshold = udest->l_threshold;
+
+ spin_lock(&dest->dst_lock);
+ ip_vs_dst_reset(dest);
+ spin_unlock(&dest->dst_lock);
+
+ if (add)
+ ip_vs_new_estimator(&dest->stats);
+
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ /* Wait until all other svc users go away */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+
+ if (add) {
+ list_add(&dest->n_list, &svc->destinations);
+ svc->num_dests++;
+ }
+
+ /* call the update_service, because server weight may be changed */
+ if (svc->scheduler->update_service)
+ svc->scheduler->update_service(svc);
+
+ write_unlock_bh(&__ip_vs_svc_lock);
}
@@ -843,7 +862,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
return -EINVAL;
}
- dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
+ dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
if (dest == NULL) {
pr_err("%s(): no memory.\n", __func__);
return -ENOMEM;
@@ -860,13 +879,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atomic_set(&dest->activeconns, 0);
atomic_set(&dest->inactconns, 0);
atomic_set(&dest->persistconns, 0);
- atomic_set(&dest->refcnt, 0);
+ atomic_set(&dest->refcnt, 1);
INIT_LIST_HEAD(&dest->d_list);
spin_lock_init(&dest->dst_lock);
spin_lock_init(&dest->stats.lock);
- __ip_vs_update_dest(svc, dest, udest);
- ip_vs_new_estimator(&dest->stats);
+ __ip_vs_update_dest(svc, dest, udest, 1);
*dest_p = dest;
@@ -926,65 +944,22 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
- __ip_vs_update_dest(svc, dest, udest);
-
/*
* Get the destination from the trash
*/
list_del(&dest->n_list);
- ip_vs_new_estimator(&dest->stats);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
+ __ip_vs_update_dest(svc, dest, udest, 1);
+ ret = 0;
+ } else {
/*
- * Wait until all other svc users go away.
+ * Allocate and initialize the dest structure
*/
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- list_add(&dest->n_list, &svc->destinations);
- svc->num_dests++;
-
- /* call the update_service function of its scheduler */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
- return 0;
- }
-
- /*
- * Allocate and initialize the dest structure
- */
- ret = ip_vs_new_dest(svc, udest, &dest);
- if (ret) {
- return ret;
+ ret = ip_vs_new_dest(svc, udest, &dest);
}
-
- /*
- * Add the dest entry into the list
- */
- atomic_inc(&dest->refcnt);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- list_add(&dest->n_list, &svc->destinations);
- svc->num_dests++;
-
- /* call the update_service function of its scheduler */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
-
LeaveFunction(2);
- return 0;
+ return ret;
}
@@ -1023,19 +998,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return -ENOENT;
}
- __ip_vs_update_dest(svc, dest, udest);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /* Wait until all other svc users go away */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- /* call the update_service, because server weight may be changed */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
-
+ __ip_vs_update_dest(svc, dest, udest, 0);
LeaveFunction(2);
return 0;
@@ -1062,6 +1025,10 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
* the destination into the trash.
*/
if (atomic_dec_and_test(&dest->refcnt)) {
+ IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
+ dest->vfwmark,
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
ip_vs_dst_reset(dest);
/* simply decrease svc->refcnt here, let the caller check
and release the service if nobody refers to it.
@@ -1128,7 +1095,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Wait until all other svc users go away.
*/
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
/*
* Unlink dest from the service
@@ -1157,6 +1124,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
{
int ret = 0;
struct ip_vs_scheduler *sched = NULL;
+ struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
/* increase the module use count */
@@ -1167,7 +1135,17 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
if (sched == NULL) {
pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
ret = -ENOENT;
- goto out_mod_dec;
+ goto out_err;
+ }
+
+ if (u->pe_name && *u->pe_name) {
+ pe = ip_vs_pe_get(u->pe_name);
+ if (pe == NULL) {
+ pr_info("persistence engine module ip_vs_pe_%s "
+ "not found\n", u->pe_name);
+ ret = -ENOENT;
+ goto out_err;
+ }
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1177,7 +1155,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
}
#endif
- svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
+ svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
if (svc == NULL) {
IP_VS_DBG(1, "%s(): no memory\n", __func__);
ret = -ENOMEM;
@@ -1185,7 +1163,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
}
/* I'm the first user of the service */
- atomic_set(&svc->usecnt, 1);
+ atomic_set(&svc->usecnt, 0);
atomic_set(&svc->refcnt, 0);
svc->af = u->af;
@@ -1207,6 +1185,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
goto out_err;
sched = NULL;
+ /* Bind the ct retriever */
+ ip_vs_bind_pe(svc, pe);
+ pe = NULL;
+
/* Update the virtual service counters */
if (svc->port == FTPPORT)
atomic_inc(&ip_vs_ftpsvc_counter);
@@ -1227,10 +1209,9 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
*svc_p = svc;
return 0;
- out_err:
+ out_err:
if (svc != NULL) {
- if (svc->scheduler)
- ip_vs_unbind_scheduler(svc);
+ ip_vs_unbind_scheduler(svc);
if (svc->inc) {
local_bh_disable();
ip_vs_app_inc_put(svc->inc);
@@ -1239,8 +1220,8 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
kfree(svc);
}
ip_vs_scheduler_put(sched);
+ ip_vs_pe_put(pe);
- out_mod_dec:
/* decrease the module use count */
ip_vs_use_count_dec();
@@ -1255,6 +1236,7 @@ static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
{
struct ip_vs_scheduler *sched, *old_sched;
+ struct ip_vs_pe *pe = NULL, *old_pe = NULL;
int ret = 0;
/*
@@ -1267,6 +1249,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
}
old_sched = sched;
+ if (u->pe_name && *u->pe_name) {
+ pe = ip_vs_pe_get(u->pe_name);
+ if (pe == NULL) {
+ pr_info("persistence engine module ip_vs_pe_%s "
+ "not found\n", u->pe_name);
+ ret = -ENOENT;
+ goto out;
+ }
+ old_pe = pe;
+ }
+
#ifdef CONFIG_IP_VS_IPV6
if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
ret = -EINVAL;
@@ -1279,7 +1272,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
/*
* Wait until all other svc users go away.
*/
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
/*
* Set the flags and timeout value
@@ -1318,15 +1311,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
}
}
+ old_pe = svc->pe;
+ if (pe != old_pe) {
+ ip_vs_unbind_pe(svc);
+ ip_vs_bind_pe(svc, pe);
+ }
+
out_unlock:
write_unlock_bh(&__ip_vs_svc_lock);
-#ifdef CONFIG_IP_VS_IPV6
out:
-#endif
-
- if (old_sched)
- ip_vs_scheduler_put(old_sched);
-
+ ip_vs_scheduler_put(old_sched);
+ ip_vs_pe_put(old_pe);
return ret;
}
@@ -1340,6 +1335,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
{
struct ip_vs_dest *dest, *nxt;
struct ip_vs_scheduler *old_sched;
+ struct ip_vs_pe *old_pe;
+
+ pr_info("%s: enter\n", __func__);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
@@ -1350,8 +1348,12 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/* Unbind scheduler */
old_sched = svc->scheduler;
ip_vs_unbind_scheduler(svc);
- if (old_sched)
- ip_vs_scheduler_put(old_sched);
+ ip_vs_scheduler_put(old_sched);
+
+ /* Unbind persistence engine */
+ old_pe = svc->pe;
+ ip_vs_unbind_pe(svc);
+ ip_vs_pe_put(old_pe);
/* Unbind app inc */
if (svc->inc) {
@@ -1378,21 +1380,23 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/*
* Free the service if nobody refers to it
*/
- if (atomic_read(&svc->refcnt) == 0)
+ if (atomic_read(&svc->refcnt) == 0) {
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+ svc->fwmark,
+ IP_VS_DBG_ADDR(svc->af, &svc->addr),
+ ntohs(svc->port), atomic_read(&svc->usecnt));
kfree(svc);
+ }
/* decrease the module use count */
ip_vs_use_count_dec();
}
/*
- * Delete a service from the service list
+ * Unlink a service from list and try to delete it if its refcnt reached 0
*/
-static int ip_vs_del_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc)
{
- if (svc == NULL)
- return -EEXIST;
-
/*
* Unhash it from the service table
*/
@@ -1403,11 +1407,21 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/*
* Wait until all the svc users go away.
*/
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
__ip_vs_del_service(svc);
write_unlock_bh(&__ip_vs_svc_lock);
+}
+
+/*
+ * Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+ if (svc == NULL)
+ return -EEXIST;
+ ip_vs_unlink_service(svc);
return 0;
}
@@ -1426,14 +1440,7 @@ static int ip_vs_flush(void)
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
- write_lock_bh(&__ip_vs_svc_lock);
- ip_vs_svc_unhash(svc);
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
- __ip_vs_del_service(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
+ ip_vs_unlink_service(svc);
}
}
@@ -1443,14 +1450,7 @@ static int ip_vs_flush(void)
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry_safe(svc, nxt,
&ip_vs_svc_fwm_table[idx], f_list) {
- write_lock_bh(&__ip_vs_svc_lock);
- ip_vs_svc_unhash(svc);
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
- __ip_vs_del_service(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
+ ip_vs_unlink_service(svc);
}
}
@@ -1579,6 +1579,15 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
+#ifdef CONFIG_IP_VS_NFCT
+ {
+ .procname = "conntrack",
+ .data = &sysctl_ip_vs_conntrack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
{
.procname = "secure_tcp",
.data = &sysctl_ip_vs_secure_tcp,
@@ -1586,6 +1595,13 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
+ {
+ .procname = "snat_reroute",
+ .data = &sysctl_ip_vs_snat_reroute,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#if 0
{
.procname = "timeout_established",
@@ -1864,14 +1880,16 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
svc->scheduler->name);
else
#endif
- seq_printf(seq, "%s %08X:%04X %s ",
+ seq_printf(seq, "%s %08X:%04X %s %s ",
ip_vs_proto_name(svc->protocol),
ntohl(svc->addr.ip),
ntohs(svc->port),
- svc->scheduler->name);
+ svc->scheduler->name,
+ (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
} else {
- seq_printf(seq, "FWM %08X %s ",
- svc->fwmark, svc->scheduler->name);
+ seq_printf(seq, "FWM %08X %s %s",
+ svc->fwmark, svc->scheduler->name,
+ (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
}
if (svc->flags & IP_VS_SVC_F_PERSISTENT)
@@ -2039,6 +2057,8 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
struct ip_vs_service_user *usvc_compat)
{
+ memset(usvc, 0, sizeof(*usvc));
+
usvc->af = AF_INET;
usvc->protocol = usvc_compat->protocol;
usvc->addr.ip = usvc_compat->addr;
@@ -2056,6 +2076,8 @@ static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
struct ip_vs_dest_user *udest_compat)
{
+ memset(udest, 0, sizeof(*udest));
+
udest->addr.ip = udest_compat->addr;
udest->port = udest_compat->port;
udest->conn_flags = udest_compat->conn_flags;
@@ -2145,10 +2167,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* Lookup the exact service by <protocol, addr, port> or fwmark */
if (usvc.fwmark == 0)
- svc = __ip_vs_service_get(usvc.af, usvc.protocol,
- &usvc.addr, usvc.port);
+ svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+ &usvc.addr, usvc.port);
else
- svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+ svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
if (cmd != IP_VS_SO_SET_ADD
&& (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2187,9 +2209,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
ret = -EINVAL;
}
- if (svc)
- ip_vs_service_put(svc);
-
out_unlock:
mutex_unlock(&__ip_vs_mutex);
out_dec:
@@ -2282,10 +2301,10 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
int ret = 0;
if (get->fwmark)
- svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
+ svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
else
- svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
- get->port);
+ svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+ get->port);
if (svc) {
int count = 0;
@@ -2313,7 +2332,6 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
}
count++;
}
- ip_vs_service_put(svc);
} else
ret = -ESRCH;
return ret;
@@ -2434,15 +2452,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
entry = (struct ip_vs_service_entry *)arg;
addr.ip = entry->addr;
if (entry->fwmark)
- svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
+ svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
else
- svc = __ip_vs_service_get(AF_INET, entry->protocol,
- &addr, entry->port);
+ svc = __ip_vs_service_find(AF_INET, entry->protocol,
+ &addr, entry->port);
if (svc) {
ip_vs_copy_service(entry, svc);
if (copy_to_user(user, entry, sizeof(*entry)) != 0)
ret = -EFAULT;
- ip_vs_service_put(svc);
} else
ret = -ESRCH;
}
@@ -2557,6 +2574,8 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
[IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
[IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
.len = IP_VS_SCHEDNAME_MAXLEN },
+ [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING,
+ .len = IP_VS_PENAME_MAXLEN },
[IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
.len = sizeof(struct ip_vs_flags) },
[IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
@@ -2633,6 +2652,8 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
}
NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
+ if (svc->pe)
+ NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
@@ -2709,10 +2730,12 @@ nla_put_failure:
}
static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
- struct nlattr *nla, int full_entry)
+ struct nlattr *nla, int full_entry,
+ struct ip_vs_service **ret_svc)
{
struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+ struct ip_vs_service *svc;
/* Parse mandatory identifying service fields first */
if (nla == NULL ||
@@ -2748,14 +2771,21 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
usvc->fwmark = 0;
}
+ if (usvc->fwmark)
+ svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+ else
+ svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+ &usvc->addr, usvc->port);
+ *ret_svc = svc;
+
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
- struct nlattr *nla_sched, *nla_flags, *nla_timeout,
+ struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
*nla_netmask;
struct ip_vs_flags flags;
- struct ip_vs_service *svc;
nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+ nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
@@ -2766,21 +2796,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
nla_memcpy(&flags, nla_flags, sizeof(flags));
/* prefill flags from service if it already exists */
- if (usvc->fwmark)
- svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
- else
- svc = __ip_vs_service_get(usvc->af, usvc->protocol,
- &usvc->addr, usvc->port);
- if (svc) {
+ if (svc)
usvc->flags = svc->flags;
- ip_vs_service_put(svc);
- } else
- usvc->flags = 0;
/* set new flags from userland */
usvc->flags = (usvc->flags & ~flags.mask) |
(flags.flags & flags.mask);
usvc->sched_name = nla_data(nla_sched);
+ usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
usvc->timeout = nla_get_u32(nla_timeout);
usvc->netmask = nla_get_u32(nla_netmask);
}
@@ -2791,17 +2814,11 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
{
struct ip_vs_service_user_kern usvc;
+ struct ip_vs_service *svc;
int ret;
- ret = ip_vs_genl_parse_service(&usvc, nla, 0);
- if (ret)
- return ERR_PTR(ret);
-
- if (usvc.fwmark)
- return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
- else
- return __ip_vs_service_get(usvc.af, usvc.protocol,
- &usvc.addr, usvc.port);
+ ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+ return ret ? ERR_PTR(ret) : svc;
}
static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
@@ -2892,7 +2909,6 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
nla_put_failure:
cb->args[0] = idx;
- ip_vs_service_put(svc);
out_err:
mutex_unlock(&__ip_vs_mutex);
@@ -3105,17 +3121,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
ret = ip_vs_genl_parse_service(&usvc,
info->attrs[IPVS_CMD_ATTR_SERVICE],
- need_full_svc);
+ need_full_svc, &svc);
if (ret)
goto out;
- /* Lookup the exact service by <protocol, addr, port> or fwmark */
- if (usvc.fwmark == 0)
- svc = __ip_vs_service_get(usvc.af, usvc.protocol,
- &usvc.addr, usvc.port);
- else
- svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-
/* Unless we're adding a new service, the service must already exist */
if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
ret = -ESRCH;
@@ -3149,6 +3158,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
break;
case IPVS_CMD_DEL_SERVICE:
ret = ip_vs_del_service(svc);
+ /* do not use svc, it can be freed */
break;
case IPVS_CMD_NEW_DEST:
ret = ip_vs_add_dest(svc, &udest);
@@ -3167,8 +3177,6 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
}
out:
- if (svc)
- ip_vs_service_put(svc);
mutex_unlock(&__ip_vs_mutex);
return ret;
@@ -3214,7 +3222,6 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
goto out_err;
} else if (svc) {
ret = ip_vs_genl_fill_service(msg, svc);
- ip_vs_service_put(svc);
if (ret)
goto nla_put_failure;
} else {
@@ -3383,6 +3390,16 @@ int __init ip_vs_control_init(void)
EnterFunction(2);
+ /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
+ INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+ }
+ for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
+ INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+ }
+ smp_wmb();
+
ret = nf_register_sockopt(&ip_vs_sockopts);
if (ret) {
pr_err("cannot register sockopt.\n");
@@ -3401,15 +3418,6 @@ int __init ip_vs_control_init(void)
sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
- /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
- for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
- INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
- }
- for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_rtable[idx]);
- }
-
ip_vs_new_estimator(&ip_vs_stats);
/* Hook the defense timer */
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 2ae747a376a5..75455000ad1c 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -32,6 +32,10 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
#include <linux/gfp.h>
#include <net/protocol.h>
#include <net/tcp.h>
@@ -60,6 +64,8 @@ static int ip_vs_ftp_pasv;
static int
ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
{
+ /* We use connection tracking for the command connection */
+ cp->flags |= IP_VS_CONN_F_NFCT;
return 0;
}
@@ -123,7 +129,6 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
return 1;
}
-
/*
* Look at outgoing ftp packets to catch the response to a PASV command
* from the server (inside-to-outside).
@@ -149,7 +154,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
struct ip_vs_conn *n_cp;
char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
unsigned buf_len;
- int ret;
+ int ret = 0;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
#ifdef CONFIG_IP_VS_IPV6
/* This application helper doesn't work with IPv6 yet,
@@ -188,14 +195,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
/*
* Now update or create an connection entry for it
*/
- n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
- &cp->caddr, 0);
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(AF_INET, iph->protocol,
+ &from, port, &cp->caddr, 0, &p);
+ n_cp = ip_vs_conn_out_get(&p);
+ }
if (!n_cp) {
- n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
- &cp->caddr, 0,
- &cp->vaddr, port,
- &from, port,
- IP_VS_CONN_F_NO_CPORT,
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
+ 0, &cp->vaddr, port, &p);
+ n_cp = ip_vs_conn_new(&p, &from, port,
+ IP_VS_CONN_F_NO_CPORT |
+ IP_VS_CONN_F_NFCT,
cp->dest);
if (!n_cp)
return 0;
@@ -219,19 +231,31 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
buf_len = strlen(buf);
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) {
+ /* If mangling fails this function will return 0
+ * which will cause the packet to be dropped.
+ * Mangling can only fail under memory pressure,
+ * hopefully it will succeed on the retransmitted
+ * packet.
+ */
+ ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ start-data, end-start,
+ buf, buf_len);
+ if (ret) {
+ ip_vs_nfct_expect_related(skb, ct, n_cp,
+ IPPROTO_TCP, 0, 0);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ /* csum is updated */
+ ret = 1;
+ }
+ }
+
/*
- * Calculate required delta-offset to keep TCP happy
+ * Not setting 'diff' is intentional, otherwise the sequence
+ * would be adjusted twice.
*/
- *diff = buf_len - (end-start);
-
- if (*diff == 0) {
- /* simply replace it with new passive address */
- memcpy(start, buf, buf_len);
- ret = 1;
- } else {
- ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
- end-start, buf, buf_len);
- }
cp->app_data = NULL;
ip_vs_tcp_conn_listen(n_cp);
@@ -332,21 +356,22 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
ip_vs_proto_name(iph->protocol),
&to.ip, ntohs(port), &cp->vaddr.ip, 0);
- n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
- &to, port,
- &cp->vaddr, htons(ntohs(cp->vport)-1));
- if (!n_cp) {
- n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
- &to, port,
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
&cp->vaddr, htons(ntohs(cp->vport)-1),
- &cp->daddr, htons(ntohs(cp->dport)-1),
- 0,
- cp->dest);
- if (!n_cp)
- return 0;
+ &p);
+ n_cp = ip_vs_conn_in_get(&p);
+ if (!n_cp) {
+ n_cp = ip_vs_conn_new(&p, &cp->daddr,
+ htons(ntohs(cp->dport)-1),
+ IP_VS_CONN_F_NFCT, cp->dest);
+ if (!n_cp)
+ return 0;
- /* add its controller */
- ip_vs_control_add(n_cp, cp);
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
}
/*
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 94a45213faa6..9323f8944199 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -11,7 +11,7 @@
* Changes:
* Martin Hamilton : fixed the terrible locking bugs
* *lock(tbl->lock) ==> *lock(&tbl->lock)
- * Wensong Zhang : fixed the uninitilized tbl->lock bug
+ * Wensong Zhang : fixed the uninitialized tbl->lock bug
* Wensong Zhang : added doing full expiration check to
* collect stale entries of 24+ hours when
* no partial expire check in a half hour
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 535dc2b419d8..dbeed8ea421a 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -386,7 +386,7 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
ip_vs_addr_copy(dest->af, &en->addr, daddr);
en->lastuse = jiffies;
- /* initilize its dest set */
+ /* initialize its dest set */
atomic_set(&(en->set.size), 0);
INIT_LIST_HEAD(&en->set.list);
rwlock_init(&en->set.lock);
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
new file mode 100644
index 000000000000..4680647cd450
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -0,0 +1,292 @@
+/*
+ * ip_vs_nfct.c: Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2010
+ * Julian Anastasov
+ *
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections, by
+ * this way we can see their proper conntrack state in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections the proper
+ * NF conntrack support must be already installed, eg. ip_vs_ftp requires
+ * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ * NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded the Passive FTP will not work (the
+ * PASV response can not be NAT-ed) but Active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
+ &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+ (T)->dst.protonum
+
+#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
+ &((C)->vaddr.ip), ntohs((C)->vport), \
+ &((C)->daddr.ip), ntohs((C)->dport), \
+ (C)->protocol, (C)->state
+
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conntrack_tuple new_tuple;
+
+ if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+ nf_ct_is_dying(ct))
+ return;
+
+ /* Never alter conntrack for non-NAT conns */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return;
+
+ /* Alter reply only in original direction */
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ return;
+
+ /*
+ * The connection is not yet in the hashtable, so we update it.
+ * CIP->VIP will remain the same, so leave the tuple in
+ * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the
+ * real-server we will see RIP->DIP.
+ */
+ new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ /*
+ * This will also take care of UDP and other protocols.
+ */
+ if (outin) {
+ new_tuple.src.u3 = cp->daddr;
+ if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+ new_tuple.dst.protonum != IPPROTO_ICMPV6)
+ new_tuple.src.u.tcp.port = cp->dport;
+ } else {
+ new_tuple.dst.u3 = cp->vaddr;
+ if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+ new_tuple.dst.protonum != IPPROTO_ICMPV6)
+ new_tuple.dst.u.tcp.port = cp->vport;
+ }
+ IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+ "ctinfo=%d, old reply=" FMT_TUPLE
+ ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
+ __func__, ct, ct->status, ctinfo,
+ ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
+ ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+ nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
+int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+ return nf_conntrack_confirm(skb);
+}
+
+/*
+ * Called from init_conntrack() as expectfn handler.
+ */
+static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_conntrack_tuple *orig, new_reply;
+ struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
+
+ if (exp->tuple.src.l3num != PF_INET)
+ return;
+
+ /*
+ * We assume that no NF locks are held before this callback.
+ * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+ * expectations even if they use wildcard values, now we provide the
+ * actual values from the newly created original conntrack direction.
+ * The conntrack is confirmed when packet reaches IPVS hooks.
+ */
+
+ /* RS->CLIENT */
+ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
+ &orig->src.u3, orig->src.u.tcp.port,
+ &orig->dst.u3, orig->dst.u.tcp.port, &p);
+ cp = ip_vs_conn_out_get(&p);
+ if (cp) {
+ /* Change reply CLIENT->RS to CLIENT->VS */
+ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+ __func__, ct, ct->status,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ new_reply.dst.u3 = cp->vaddr;
+ new_reply.dst.u.tcp.port = cp->vport;
+ IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+ ", inout cp=" FMT_CONN "\n",
+ __func__, ct,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ goto alter;
+ }
+
+ /* CLIENT->VS */
+ cp = ip_vs_conn_in_get(&p);
+ if (cp) {
+ /* Change reply VS->CLIENT to RS->CLIENT */
+ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+ __func__, ct, ct->status,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ new_reply.src.u3 = cp->daddr;
+ new_reply.src.u.tcp.port = cp->dport;
+ IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+ __func__, ct,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ goto alter;
+ }
+
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+ " - unknown expect\n",
+ __func__, ct, ct->status, ARG_TUPLE(orig));
+ return;
+
+alter:
+ /* Never alter conntrack for non-NAT conns */
+ if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+ nf_conntrack_alter_reply(ct, &new_reply);
+ ip_vs_conn_put(cp);
+ return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port.
+ * Then the default callback function will alter the reply and will confirm
+ * the conntrack entry when the first packet comes.
+ * Use port 0 to expect connection from any port.
+ */
+void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+ struct ip_vs_conn *cp, u_int8_t proto,
+ const __be16 port, int from_rs)
+{
+ struct nf_conntrack_expect *exp;
+
+ if (ct == NULL || nf_ct_is_untracked(ct))
+ return;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (!exp)
+ return;
+
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ from_rs ? &cp->daddr : &cp->caddr,
+ from_rs ? &cp->caddr : &cp->vaddr,
+ proto, port ? &port : NULL,
+ from_rs ? &cp->cport : &cp->vport);
+
+ exp->expectfn = ip_vs_nfct_expect_callback;
+
+ IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&exp->tuple));
+ nf_ct_expect_related(exp);
+ nf_ct_expect_put(exp);
+}
+EXPORT_SYMBOL(ip_vs_nfct_expect_related);
+
+/*
+ * Our connection was terminated, try to drop the conntrack immediately
+ */
+void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct nf_conntrack_tuple tuple;
+
+ if (!cp->cport)
+ return;
+
+ tuple = (struct nf_conntrack_tuple) {
+ .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
+ tuple.src.u3 = cp->caddr;
+ tuple.src.u.all = cp->cport;
+ tuple.src.l3num = cp->af;
+ tuple.dst.u3 = cp->vaddr;
+ tuple.dst.u.all = cp->vport;
+
+ IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
+ " for conn " FMT_CONN "\n",
+ __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+
+ h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+ if (h) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ /* Show what happens instead of calling nf_ct_kill() */
+ if (del_timer(&ct->timeout)) {
+ IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
+ if (ct->timeout.function)
+ ct->timeout.function(ct->timeout.data);
+ } else {
+ IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
+ }
+ nf_ct_put(ct);
+ } else {
+ IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+ __func__, ARG_TUPLE(&tuple));
+ }
+}
+
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
new file mode 100644
index 000000000000..3414af70ee12
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -0,0 +1,147 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/* IPVS pe list */
+static LIST_HEAD(ip_vs_pe);
+
+/* lock for service table */
+static DEFINE_SPINLOCK(ip_vs_pe_lock);
+
+/* Bind a service with a pe */
+void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
+{
+ svc->pe = pe;
+}
+
+/* Unbind a service from its pe */
+void ip_vs_unbind_pe(struct ip_vs_service *svc)
+{
+ svc->pe = NULL;
+}
+
+/* Get pe in the pe list by name */
+static struct ip_vs_pe *
+ip_vs_pe_getbyname(const char *pe_name)
+{
+ struct ip_vs_pe *pe;
+
+ IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+ pe_name);
+
+ spin_lock_bh(&ip_vs_pe_lock);
+
+ list_for_each_entry(pe, &ip_vs_pe, n_list) {
+ /* Test and get the modules atomically */
+ if (pe->module &&
+ !try_module_get(pe->module)) {
+ /* This pe is just deleted */
+ continue;
+ }
+ if (strcmp(pe_name, pe->name)==0) {
+ /* HIT */
+ spin_unlock_bh(&ip_vs_pe_lock);
+ return pe;
+ }
+ if (pe->module)
+ module_put(pe->module);
+ }
+
+ spin_unlock_bh(&ip_vs_pe_lock);
+ return NULL;
+}
+
+/* Lookup pe and try to load it if it doesn't exist */
+struct ip_vs_pe *ip_vs_pe_get(const char *name)
+{
+ struct ip_vs_pe *pe;
+
+ /* Search for the pe by name */
+ pe = ip_vs_pe_getbyname(name);
+
+ /* If pe not found, load the module and search again */
+ if (!pe) {
+ request_module("ip_vs_pe_%s", name);
+ pe = ip_vs_pe_getbyname(name);
+ }
+
+ return pe;
+}
+
+void ip_vs_pe_put(struct ip_vs_pe *pe)
+{
+ if (pe && pe->module)
+ module_put(pe->module);
+}
+
+/* Register a pe in the pe list */
+int register_ip_vs_pe(struct ip_vs_pe *pe)
+{
+ struct ip_vs_pe *tmp;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ spin_lock_bh(&ip_vs_pe_lock);
+
+ if (!list_empty(&pe->n_list)) {
+ spin_unlock_bh(&ip_vs_pe_lock);
+ ip_vs_use_count_dec();
+ pr_err("%s(): [%s] pe already linked\n",
+ __func__, pe->name);
+ return -EINVAL;
+ }
+
+ /* Make sure that the pe with this name doesn't exist
+ * in the pe list.
+ */
+ list_for_each_entry(tmp, &ip_vs_pe, n_list) {
+ if (strcmp(tmp->name, pe->name) == 0) {
+ spin_unlock_bh(&ip_vs_pe_lock);
+ ip_vs_use_count_dec();
+ pr_err("%s(): [%s] pe already existed "
+ "in the system\n", __func__, pe->name);
+ return -EINVAL;
+ }
+ }
+ /* Add it into the d-linked pe list */
+ list_add(&pe->n_list, &ip_vs_pe);
+ spin_unlock_bh(&ip_vs_pe_lock);
+
+ pr_info("[%s] pe registered.\n", pe->name);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_ip_vs_pe);
+
+/* Unregister a pe from the pe list */
+int unregister_ip_vs_pe(struct ip_vs_pe *pe)
+{
+ spin_lock_bh(&ip_vs_pe_lock);
+ if (list_empty(&pe->n_list)) {
+ spin_unlock_bh(&ip_vs_pe_lock);
+ pr_err("%s(): [%s] pe is not in the list. failed\n",
+ __func__, pe->name);
+ return -EINVAL;
+ }
+
+ /* Remove it from the d-linked pe list */
+ list_del(&pe->n_list);
+ spin_unlock_bh(&ip_vs_pe_lock);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ pr_info("[%s] pe unregistered.\n", pe->name);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
new file mode 100644
index 000000000000..b8b4e9620f3e
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -0,0 +1,169 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+#ifdef CONFIG_IP_VS_DEBUG
+static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
+ const char *callid, size_t callid_len,
+ int *idx)
+{
+ size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1);
+ memcpy(buf + *idx, callid, len);
+ buf[*idx+len] = '\0';
+ *idx += len + 1;
+ return buf + *idx - len;
+}
+
+#define IP_VS_DEBUG_CALLID(callid, len) \
+ ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \
+ callid, len, &ip_vs_dbg_idx)
+#endif
+
+static int get_callid(const char *dptr, unsigned int dataoff,
+ unsigned int datalen,
+ unsigned int *matchoff, unsigned int *matchlen)
+{
+ /* Find callid */
+ while (1) {
+ int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
+ SIP_HDR_CALL_ID, matchoff,
+ matchlen);
+ if (ret > 0)
+ break;
+ if (!ret)
+ return 0;
+ dataoff += *matchoff;
+ }
+
+ /* Empty callid is useless */
+ if (!*matchlen)
+ return -EINVAL;
+
+ /* Too large is useless */
+ if (*matchlen > IP_VS_PEDATA_MAXLEN)
+ return -EINVAL;
+
+ /* SIP headers are always followed by a line terminator */
+ if (*matchoff + *matchlen == datalen)
+ return -EINVAL;
+
+ /* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
+ * RFC 3261 allows only CRLF, we support both. */
+ if (*(dptr + *matchoff + *matchlen) != '\r' &&
+ *(dptr + *matchoff + *matchlen) != '\n')
+ return -EINVAL;
+
+ IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
+ IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
+ *matchlen);
+ return 0;
+}
+
+static int
+ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
+{
+ struct ip_vs_iphdr iph;
+ unsigned int dataoff, datalen, matchoff, matchlen;
+ const char *dptr;
+
+ ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
+
+ /* Only useful with UDP */
+ if (iph.protocol != IPPROTO_UDP)
+ return -EINVAL;
+
+ /* No Data ? */
+ dataoff = iph.len + sizeof(struct udphdr);
+ if (dataoff >= skb->len)
+ return -EINVAL;
+
+ dptr = skb->data + dataoff;
+ datalen = skb->len - dataoff;
+
+ if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
+ return -EINVAL;
+
+ p->pe_data = kmalloc(matchlen, GFP_ATOMIC);
+ if (!p->pe_data)
+ return -ENOMEM;
+
+ /* N.B: pe_data is only set on success,
+ * this allows fallback to the default persistence logic on failure
+ */
+ memcpy(p->pe_data, dptr + matchoff, matchlen);
+ p->pe_data_len = matchlen;
+
+ return 0;
+}
+
+static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
+ struct ip_vs_conn *ct)
+
+{
+ bool ret = 0;
+
+ if (ct->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
+ /* protocol should only be IPPROTO_IP if
+ * d_addr is a fwmark */
+ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+ p->vaddr, &ct->vaddr) &&
+ ct->vport == p->vport &&
+ ct->flags & IP_VS_CONN_F_TEMPLATE &&
+ ct->protocol == p->protocol &&
+ ct->pe_data && ct->pe_data_len == p->pe_data_len &&
+ !memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
+ ret = 1;
+
+ IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ ret ? "hit" : "not hit");
+
+ return ret;
+}
+
+static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
+ u32 initval, bool inverse)
+{
+ return jhash(p->pe_data, p->pe_data_len, initval);
+}
+
+static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+{
+ memcpy(buf, cp->pe_data, cp->pe_data_len);
+ return cp->pe_data_len;
+}
+
+static struct ip_vs_pe ip_vs_sip_pe =
+{
+ .name = "sip",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list),
+ .fill_param = ip_vs_sip_fill_param,
+ .ct_match = ip_vs_sip_ct_match,
+ .hashkey_raw = ip_vs_sip_hashkey_raw,
+ .show_pe_data = ip_vs_sip_show_pe_data,
+};
+
+static int __init ip_vs_sip_init(void)
+{
+ return register_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+static void __exit ip_vs_sip_cleanup(void)
+{
+ unregister_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+module_init(ip_vs_sip_init);
+module_exit(ip_vs_sip_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 2d3d5e4b35f8..c53998390877 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -98,6 +98,7 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
return NULL;
}
+EXPORT_SYMBOL(ip_vs_proto_get);
/*
@@ -171,8 +172,8 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
else if (ih->frag_off & htons(IP_OFFSET))
sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
else {
- __be16 _ports[2], *pptr
-;
+ __be16 _ports[2], *pptr;
+
pptr = skb_header_pointer(skb, offset + ih->ihl*4,
sizeof(_ports), _ports);
if (pptr == NULL)
@@ -222,13 +223,13 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
const struct sk_buff *skb,
int offset,
const char *msg)
{
#ifdef CONFIG_IP_VS_IPV6
- if (skb->protocol == htons(ETH_P_IPV6))
+ if (af == AF_INET6)
ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
else
#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 1892dfc12fdd..3a0461117d3f 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -40,6 +40,19 @@ struct isakmp_hdr {
#define PORT_ISAKMP 500
+static void
+ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
+ int inverse, struct ip_vs_conn_param *p)
+{
+ if (likely(!inverse))
+ ip_vs_conn_fill_param(af, IPPROTO_UDP,
+ &iph->saddr, htons(PORT_ISAKMP),
+ &iph->daddr, htons(PORT_ISAKMP), p);
+ else
+ ip_vs_conn_fill_param(af, IPPROTO_UDP,
+ &iph->daddr, htons(PORT_ISAKMP),
+ &iph->saddr, htons(PORT_ISAKMP), p);
+}
static struct ip_vs_conn *
ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
@@ -47,21 +60,10 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
int inverse)
{
struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
- if (likely(!inverse)) {
- cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
- &iph->saddr,
- htons(PORT_ISAKMP),
- &iph->daddr,
- htons(PORT_ISAKMP));
- } else {
- cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
- &iph->daddr,
- htons(PORT_ISAKMP),
- &iph->saddr,
- htons(PORT_ISAKMP));
- }
-
+ ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+ cp = ip_vs_conn_in_get(&p);
if (!cp) {
/*
* We are not sure if the packet is from our
@@ -87,21 +89,10 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
int inverse)
{
struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
- if (likely(!inverse)) {
- cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
- &iph->saddr,
- htons(PORT_ISAKMP),
- &iph->daddr,
- htons(PORT_ISAKMP));
- } else {
- cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
- &iph->daddr,
- htons(PORT_ISAKMP),
- &iph->saddr,
- htons(PORT_ISAKMP));
- }
-
+ ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+ cp = ip_vs_conn_out_get(&p);
if (!cp) {
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
"%s%s %s->%s\n",
@@ -126,54 +117,6 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
}
-
-static void
-ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
- int offset, const char *msg)
-{
- char buf[256];
- struct iphdr _iph, *ih;
-
- ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
- if (ih == NULL)
- sprintf(buf, "TRUNCATED");
- else
- sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr);
-
- pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static void
-ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
- int offset, const char *msg)
-{
- char buf[256];
- struct ipv6hdr _iph, *ih;
-
- ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
- if (ih == NULL)
- sprintf(buf, "TRUNCATED");
- else
- sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr);
-
- pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-#endif
-
-static void
-ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
- int offset, const char *msg)
-{
-#ifdef CONFIG_IP_VS_IPV6
- if (skb->protocol == htons(ETH_P_IPV6))
- ah_esp_debug_packet_v6(pp, skb, offset, msg);
- else
-#endif
- ah_esp_debug_packet_v4(pp, skb, offset, msg);
-}
-
-
static void ah_esp_init(struct ip_vs_protocol *pp)
{
/* nothing to do now */
@@ -204,7 +147,7 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
.register_app = NULL,
.unregister_app = NULL,
.app_conn_bind = NULL,
- .debug_packet = ah_esp_debug_packet,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL, /* ISAKMP */
.set_state_timeout = NULL,
};
@@ -228,7 +171,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
.register_app = NULL,
.unregister_app = NULL,
.app_conn_bind = NULL,
- .debug_packet = ah_esp_debug_packet,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL, /* ISAKMP */
};
#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index c9a3f7a21d53..1ea96bcd342b 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -8,55 +8,6 @@
#include <net/sctp/checksum.h>
#include <net/ip_vs.h>
-
-static struct ip_vs_conn *
-sctp_conn_in_get(int af,
- const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
- const struct ip_vs_iphdr *iph,
- unsigned int proto_off,
- int inverse)
-{
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
- if (pptr == NULL)
- return NULL;
-
- if (likely(!inverse))
- return ip_vs_conn_in_get(af, iph->protocol,
- &iph->saddr, pptr[0],
- &iph->daddr, pptr[1]);
- else
- return ip_vs_conn_in_get(af, iph->protocol,
- &iph->daddr, pptr[1],
- &iph->saddr, pptr[0]);
-}
-
-static struct ip_vs_conn *
-sctp_conn_out_get(int af,
- const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
- const struct ip_vs_iphdr *iph,
- unsigned int proto_off,
- int inverse)
-{
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
- if (pptr == NULL)
- return NULL;
-
- if (likely(!inverse))
- return ip_vs_conn_out_get(af, iph->protocol,
- &iph->saddr, pptr[0],
- &iph->daddr, pptr[1]);
- else
- return ip_vs_conn_out_get(af, iph->protocol,
- &iph->daddr, pptr[1],
- &iph->saddr, pptr[0]);
-}
-
static int
sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp)
@@ -80,6 +31,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
if ((sch->type == SCTP_CID_INIT) &&
(svc = ip_vs_service_get(af, skb->mark, iph.protocol,
&iph.daddr, sh->dest))) {
+ int ignored;
+
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
@@ -93,8 +46,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb);
- if (!*cpp) {
+ *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+ if (!*cpp && !ignored) {
*verdict = ip_vs_leave(svc, skb, pp);
return 0;
}
@@ -110,6 +63,7 @@ sctp_snat_handler(struct sk_buff *skb,
{
sctp_sctphdr_t *sctph;
unsigned int sctphoff;
+ struct sk_buff *iter;
__be32 crc32;
#ifdef CONFIG_IP_VS_IPV6
@@ -138,8 +92,8 @@ sctp_snat_handler(struct sk_buff *skb,
/* Calculate the checksum */
crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
- for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next)
- crc32 = sctp_update_cksum((u8 *) skb->data, skb_headlen(skb),
+ skb_walk_frags(skb, iter)
+ crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
crc32);
crc32 = sctp_end_cksum(crc32);
sctph->checksum = crc32;
@@ -151,9 +105,9 @@ static int
sctp_dnat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
-
sctp_sctphdr_t *sctph;
unsigned int sctphoff;
+ struct sk_buff *iter;
__be32 crc32;
#ifdef CONFIG_IP_VS_IPV6
@@ -173,7 +127,7 @@ sctp_dnat_handler(struct sk_buff *skb,
return 0;
/* Call application helper if needed */
- if (!ip_vs_app_pkt_out(cp, skb))
+ if (!ip_vs_app_pkt_in(cp, skb))
return 0;
}
@@ -182,8 +136,8 @@ sctp_dnat_handler(struct sk_buff *skb,
/* Calculate the checksum */
crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
- for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next)
- crc32 = sctp_update_cksum((u8 *) skb->data, skb_headlen(skb),
+ skb_walk_frags(skb, iter)
+ crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
crc32);
crc32 = sctp_end_cksum(crc32);
sctph->checksum = crc32;
@@ -194,9 +148,9 @@ sctp_dnat_handler(struct sk_buff *skb,
static int
sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
{
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
unsigned int sctphoff;
struct sctphdr *sh, _sctph;
+ struct sk_buff *iter;
__le32 cmp;
__le32 val;
__u32 tmp;
@@ -215,15 +169,15 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
cmp = sh->checksum;
tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb));
- for (; list; list = list->next)
- tmp = sctp_update_cksum((__u8 *) list->data,
- skb_headlen(list), tmp);
+ skb_walk_frags(skb, iter)
+ tmp = sctp_update_cksum((__u8 *) iter->data,
+ skb_headlen(iter), tmp);
val = sctp_end_cksum(tmp);
if (val != cmp) {
/* CRC failure, dump it. */
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
"Failed checksum for");
return 0;
}
@@ -1169,8 +1123,8 @@ struct ip_vs_protocol ip_vs_protocol_sctp = {
.register_app = sctp_register_app,
.unregister_app = sctp_unregister_app,
.conn_schedule = sctp_conn_schedule,
- .conn_in_get = sctp_conn_in_get,
- .conn_out_get = sctp_conn_out_get,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
.snat_handler = sctp_snat_handler,
.dnat_handler = sctp_dnat_handler,
.csum_check = sctp_csum_check,
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 91d28e073742..f6c5200e2146 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -27,52 +27,6 @@
#include <net/ip_vs.h>
-
-static struct ip_vs_conn *
-tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
- const struct ip_vs_iphdr *iph, unsigned int proto_off,
- int inverse)
-{
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
- if (pptr == NULL)
- return NULL;
-
- if (likely(!inverse)) {
- return ip_vs_conn_in_get(af, iph->protocol,
- &iph->saddr, pptr[0],
- &iph->daddr, pptr[1]);
- } else {
- return ip_vs_conn_in_get(af, iph->protocol,
- &iph->daddr, pptr[1],
- &iph->saddr, pptr[0]);
- }
-}
-
-static struct ip_vs_conn *
-tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
- const struct ip_vs_iphdr *iph, unsigned int proto_off,
- int inverse)
-{
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
- if (pptr == NULL)
- return NULL;
-
- if (likely(!inverse)) {
- return ip_vs_conn_out_get(af, iph->protocol,
- &iph->saddr, pptr[0],
- &iph->daddr, pptr[1]);
- } else {
- return ip_vs_conn_out_get(af, iph->protocol,
- &iph->daddr, pptr[1],
- &iph->saddr, pptr[0]);
- }
-}
-
-
static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp)
@@ -89,9 +43,12 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
}
+ /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
if (th->syn &&
(svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
th->dest))) {
+ int ignored;
+
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
@@ -106,8 +63,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb);
- if (!*cpp) {
+ *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+ if (!*cpp && !ignored) {
*verdict = ip_vs_leave(svc, skb, pp);
return 0;
}
@@ -147,15 +104,15 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
tcph->check =
- csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+ ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
ip_vs_check_diff2(oldlen, newlen,
- ~csum_unfold(tcph->check))));
+ csum_unfold(tcph->check))));
else
#endif
tcph->check =
- csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+ ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
ip_vs_check_diff2(oldlen, newlen,
- ~csum_unfold(tcph->check))));
+ csum_unfold(tcph->check))));
}
@@ -166,6 +123,7 @@ tcp_snat_handler(struct sk_buff *skb,
struct tcphdr *tcph;
unsigned int tcphoff;
int oldlen;
+ int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -180,13 +138,20 @@ tcp_snat_handler(struct sk_buff *skb,
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
- if (!ip_vs_app_pkt_out(cp, skb))
+ if (!(ret = ip_vs_app_pkt_out(cp, skb)))
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - tcphoff;
+ else
+ payload_csum = 1;
}
tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -197,12 +162,13 @@ tcp_snat_handler(struct sk_buff *skb,
tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
htons(oldlen),
htons(skb->len - tcphoff));
- } else if (!cp->app) {
+ } else if (!payload_csum) {
/* Only port and addr are changed, do fast csum update */
tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_NONE;
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
} else {
/* full checksum calculation */
tcph->check = 0;
@@ -220,6 +186,7 @@ tcp_snat_handler(struct sk_buff *skb,
skb->len - tcphoff,
cp->protocol,
skb->csum);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
pp->name, tcph->check,
@@ -236,6 +203,7 @@ tcp_dnat_handler(struct sk_buff *skb,
struct tcphdr *tcph;
unsigned int tcphoff;
int oldlen;
+ int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -250,6 +218,8 @@ tcp_dnat_handler(struct sk_buff *skb,
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
@@ -258,8 +228,13 @@ tcp_dnat_handler(struct sk_buff *skb,
* Attempt ip_vs_app call.
* It will fix ip_vs_conn and iph ack_seq stuff
*/
- if (!ip_vs_app_pkt_in(cp, skb))
+ if (!(ret = ip_vs_app_pkt_in(cp, skb)))
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - tcphoff;
+ else
+ payload_csum = 1;
}
tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -269,15 +244,16 @@ tcp_dnat_handler(struct sk_buff *skb,
* Adjust TCP checksums
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+ tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
htons(oldlen),
htons(skb->len - tcphoff));
- } else if (!cp->app) {
+ } else if (!payload_csum) {
/* Only port and addr are changed, do fast csum update */
tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_NONE;
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
} else {
/* full checksum calculation */
tcph->check = 0;
@@ -324,7 +300,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
skb->len - tcphoff,
ipv6_hdr(skb)->nexthdr,
skb->csum)) {
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
"Failed checksum for");
return 0;
}
@@ -335,7 +311,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
skb->len - tcphoff,
ip_hdr(skb)->protocol,
skb->csum)) {
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
"Failed checksum for");
return 0;
}
@@ -721,8 +697,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
- .conn_in_get = tcp_conn_in_get,
- .conn_out_get = tcp_conn_out_get,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
.snat_handler = tcp_snat_handler,
.dnat_handler = tcp_dnat_handler,
.csum_check = tcp_csum_check,
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index e7a6885e0167..9d106a06bb0a 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -27,58 +27,6 @@
#include <net/ip.h>
#include <net/ip6_checksum.h>
-static struct ip_vs_conn *
-udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
- const struct ip_vs_iphdr *iph, unsigned int proto_off,
- int inverse)
-{
- struct ip_vs_conn *cp;
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
- if (pptr == NULL)
- return NULL;
-
- if (likely(!inverse)) {
- cp = ip_vs_conn_in_get(af, iph->protocol,
- &iph->saddr, pptr[0],
- &iph->daddr, pptr[1]);
- } else {
- cp = ip_vs_conn_in_get(af, iph->protocol,
- &iph->daddr, pptr[1],
- &iph->saddr, pptr[0]);
- }
-
- return cp;
-}
-
-
-static struct ip_vs_conn *
-udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
- const struct ip_vs_iphdr *iph, unsigned int proto_off,
- int inverse)
-{
- struct ip_vs_conn *cp;
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
- if (pptr == NULL)
- return NULL;
-
- if (likely(!inverse)) {
- cp = ip_vs_conn_out_get(af, iph->protocol,
- &iph->saddr, pptr[0],
- &iph->daddr, pptr[1]);
- } else {
- cp = ip_vs_conn_out_get(af, iph->protocol,
- &iph->daddr, pptr[1],
- &iph->saddr, pptr[0]);
- }
-
- return cp;
-}
-
-
static int
udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp)
@@ -98,6 +46,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
svc = ip_vs_service_get(af, skb->mark, iph.protocol,
&iph.daddr, uh->dest);
if (svc) {
+ int ignored;
+
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
@@ -112,8 +62,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb);
- if (!*cpp) {
+ *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+ if (!*cpp && !ignored) {
*verdict = ip_vs_leave(svc, skb, pp);
return 0;
}
@@ -154,15 +104,15 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
uhdr->check =
- csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+ ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
ip_vs_check_diff2(oldlen, newlen,
- ~csum_unfold(uhdr->check))));
+ csum_unfold(uhdr->check))));
else
#endif
uhdr->check =
- csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+ ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
ip_vs_check_diff2(oldlen, newlen,
- ~csum_unfold(uhdr->check))));
+ csum_unfold(uhdr->check))));
}
@@ -173,6 +123,7 @@ udp_snat_handler(struct sk_buff *skb,
struct udphdr *udph;
unsigned int udphoff;
int oldlen;
+ int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -187,6 +138,8 @@ udp_snat_handler(struct sk_buff *skb,
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
@@ -194,8 +147,13 @@ udp_snat_handler(struct sk_buff *skb,
/*
* Call application helper if needed
*/
- if (!ip_vs_app_pkt_out(cp, skb))
+ if (!(ret = ip_vs_app_pkt_out(cp, skb)))
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - udphoff;
+ else
+ payload_csum = 1;
}
udph = (void *)skb_network_header(skb) + udphoff;
@@ -208,12 +166,13 @@ udp_snat_handler(struct sk_buff *skb,
udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
htons(oldlen),
htons(skb->len - udphoff));
- } else if (!cp->app && (udph->check != 0)) {
+ } else if (!payload_csum && (udph->check != 0)) {
/* Only port and addr are changed, do fast csum update */
udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_NONE;
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
} else {
/* full checksum calculation */
udph->check = 0;
@@ -233,6 +192,7 @@ udp_snat_handler(struct sk_buff *skb,
skb->csum);
if (udph->check == 0)
udph->check = CSUM_MANGLED_0;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
pp->name, udph->check,
(char*)&(udph->check) - (char*)udph);
@@ -248,6 +208,7 @@ udp_dnat_handler(struct sk_buff *skb,
struct udphdr *udph;
unsigned int udphoff;
int oldlen;
+ int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -262,6 +223,8 @@ udp_dnat_handler(struct sk_buff *skb,
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
@@ -270,8 +233,13 @@ udp_dnat_handler(struct sk_buff *skb,
* Attempt ip_vs_app call.
* It will fix ip_vs_conn
*/
- if (!ip_vs_app_pkt_in(cp, skb))
+ if (!(ret = ip_vs_app_pkt_in(cp, skb)))
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - udphoff;
+ else
+ payload_csum = 1;
}
udph = (void *)skb_network_header(skb) + udphoff;
@@ -281,15 +249,16 @@ udp_dnat_handler(struct sk_buff *skb,
* Adjust UDP checksums
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+ udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
htons(oldlen),
htons(skb->len - udphoff));
- } else if (!cp->app && (udph->check != 0)) {
+ } else if (!payload_csum && (udph->check != 0)) {
/* Only port and addr are changed, do fast csum update */
udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_NONE;
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
} else {
/* full checksum calculation */
udph->check = 0;
@@ -345,7 +314,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
skb->len - udphoff,
ipv6_hdr(skb)->nexthdr,
skb->csum)) {
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
"Failed checksum for");
return 0;
}
@@ -356,7 +325,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
skb->len - udphoff,
ip_hdr(skb)->protocol,
skb->csum)) {
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
"Failed checksum for");
return 0;
}
@@ -520,8 +489,8 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
.init = udp_init,
.exit = udp_exit,
.conn_schedule = udp_conn_schedule,
- .conn_in_get = udp_conn_in_get,
- .conn_out_get = udp_conn_out_get,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
.snat_handler = udp_snat_handler,
.dnat_handler = udp_dnat_handler,
.csum_check = udp_csum_check,
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index bbc1ac795952..076ebe00435d 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -35,7 +35,7 @@
static LIST_HEAD(ip_vs_schedulers);
/* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_sched_lock);
+static DEFINE_SPINLOCK(ip_vs_sched_lock);
/*
@@ -46,15 +46,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
{
int ret;
- if (svc == NULL) {
- pr_err("%s(): svc arg NULL\n", __func__);
- return -EINVAL;
- }
- if (scheduler == NULL) {
- pr_err("%s(): scheduler arg NULL\n", __func__);
- return -EINVAL;
- }
-
svc->scheduler = scheduler;
if (scheduler->init_service) {
@@ -74,18 +65,10 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
*/
int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
{
- struct ip_vs_scheduler *sched;
+ struct ip_vs_scheduler *sched = svc->scheduler;
- if (svc == NULL) {
- pr_err("%s(): svc arg NULL\n", __func__);
- return -EINVAL;
- }
-
- sched = svc->scheduler;
- if (sched == NULL) {
- pr_err("%s(): svc isn't bound\n", __func__);
- return -EINVAL;
- }
+ if (!sched)
+ return 0;
if (sched->done_service) {
if (sched->done_service(svc) != 0) {
@@ -108,7 +91,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
- read_lock_bh(&__ip_vs_sched_lock);
+ spin_lock_bh(&ip_vs_sched_lock);
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
/*
@@ -122,14 +105,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
}
if (strcmp(sched_name, sched->name)==0) {
/* HIT */
- read_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
return sched;
}
if (sched->module)
module_put(sched->module);
}
- read_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
return NULL;
}
@@ -159,7 +142,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
{
- if (scheduler->module)
+ if (scheduler && scheduler->module)
module_put(scheduler->module);
}
@@ -184,10 +167,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
/* increase the module use count */
ip_vs_use_count_inc();
- write_lock_bh(&__ip_vs_sched_lock);
+ spin_lock_bh(&ip_vs_sched_lock);
if (!list_empty(&scheduler->n_list)) {
- write_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already linked\n",
__func__, scheduler->name);
@@ -200,7 +183,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
*/
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
if (strcmp(scheduler->name, sched->name) == 0) {
- write_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already existed "
"in the system\n", __func__, scheduler->name);
@@ -211,7 +194,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
* Add it into the d-linked scheduler list
*/
list_add(&scheduler->n_list, &ip_vs_schedulers);
- write_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
pr_info("[%s] scheduler registered.\n", scheduler->name);
@@ -229,9 +212,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
return -EINVAL;
}
- write_lock_bh(&__ip_vs_sched_lock);
+ spin_lock_bh(&ip_vs_sched_lock);
if (list_empty(&scheduler->n_list)) {
- write_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
pr_err("%s(): [%s] scheduler is not in the list. failed\n",
__func__, scheduler->name);
return -EINVAL;
@@ -241,7 +224,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
* Remove it from the d-linked scheduler list
*/
list_del(&scheduler->n_list);
- write_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
/* decrease the module use count */
ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 7ba06939829f..ab85aedea17e 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -288,6 +288,16 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
ip_vs_sync_conn(cp->control);
}
+static inline int
+ip_vs_conn_fill_param_sync(int af, int protocol,
+ const union nf_inet_addr *caddr, __be16 cport,
+ const union nf_inet_addr *vaddr, __be16 vport,
+ struct ip_vs_conn_param *p)
+{
+ /* XXX: Need to take into account persistence engine */
+ ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+ return 0;
+}
/*
* Process received multicast message and create the corresponding
@@ -301,6 +311,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
struct ip_vs_dest *dest;
+ struct ip_vs_conn_param param;
char *p;
int i;
@@ -370,18 +381,20 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
}
}
- if (!(flags & IP_VS_CONN_F_TEMPLATE))
- cp = ip_vs_conn_in_get(AF_INET, s->protocol,
- (union nf_inet_addr *)&s->caddr,
- s->cport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport);
- else
- cp = ip_vs_ct_in_get(AF_INET, s->protocol,
- (union nf_inet_addr *)&s->caddr,
- s->cport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport);
+ {
+ if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
+ (union nf_inet_addr *)&s->caddr,
+ s->cport,
+ (union nf_inet_addr *)&s->vaddr,
+ s->vport, &param)) {
+ pr_err("ip_vs_conn_fill_param_sync failed");
+ return;
+ }
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ cp = ip_vs_conn_in_get(&param);
+ else
+ cp = ip_vs_ct_in_get(&param);
+ }
if (!cp) {
/*
* Find the appropriate destination for the connection.
@@ -406,14 +419,9 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
else
flags &= ~IP_VS_CONN_F_INACTIVE;
}
- cp = ip_vs_conn_new(AF_INET, s->protocol,
- (union nf_inet_addr *)&s->caddr,
- s->cport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport,
+ cp = ip_vs_conn_new(&param,
(union nf_inet_addr *)&s->daddr,
- s->dport,
- flags, dest);
+ s->dport, flags, dest);
if (dest)
atomic_dec(&dest->refcnt);
if (!cp) {
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 93c15a107b2c..de04ea39cde8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -11,6 +11,16 @@
*
* Changes:
*
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
+ * - not all connections have destination server, for example,
+ * connections in backup server when fwmark is used
+ * - bypass connections use daddr from packet
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - the only place where we can see skb->sk != NULL
*/
#define KMSG_COMPONENT "IPVS"
@@ -26,6 +36,7 @@
#include <net/route.h> /* for ip_route_output */
#include <net/ipv6.h>
#include <net/ip6_route.h>
+#include <net/addrconf.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
@@ -37,26 +48,27 @@
* Destination cache to speed up outgoing route lookup
*/
static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
+ u32 dst_cookie)
{
struct dst_entry *old_dst;
old_dst = dest->dst_cache;
dest->dst_cache = dst;
dest->dst_rtos = rtos;
+ dest->dst_cookie = dst_cookie;
dst_release(old_dst);
}
static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
{
struct dst_entry *dst = dest->dst_cache;
if (!dst)
return NULL;
- if ((dst->obsolete
- || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
- dst->ops->check(dst, cookie) == NULL) {
+ if ((dst->obsolete || rtos != dest->dst_rtos) &&
+ dst->ops->check(dst, dest->dst_cookie) == NULL) {
dest->dst_cache = NULL;
dst_release(dst);
return NULL;
@@ -65,16 +77,24 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
return dst;
}
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ * &4=Allow redirect from remote daddr to local
+ */
static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
+ __be32 daddr, u32 rtos, int rt_mode)
{
+ struct net *net = dev_net(skb_dst(skb)->dev);
struct rtable *rt; /* Route to the other host */
- struct ip_vs_dest *dest = cp->dest;
+ struct rtable *ort; /* Original route */
+ int local;
if (dest) {
spin_lock(&dest->dst_lock);
if (!(rt = (struct rtable *)
- __ip_vs_dst_check(dest, rtos, 0))) {
+ __ip_vs_dst_check(dest, rtos))) {
struct flowi fl = {
.oif = 0,
.nl_u = {
@@ -84,16 +104,16 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
.tos = rtos, } },
};
- if (ip_route_output_key(&init_net, &rt, &fl)) {
+ if (ip_route_output_key(net, &rt, &fl)) {
spin_unlock(&dest->dst_lock);
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
&dest->addr.ip);
return NULL;
}
- __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
+ __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
&dest->addr.ip,
- atomic_read(&rt->u.dst.__refcnt), rtos);
+ atomic_read(&rt->dst.__refcnt), rtos);
}
spin_unlock(&dest->dst_lock);
} else {
@@ -101,78 +121,199 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
.oif = 0,
.nl_u = {
.ip4_u = {
- .daddr = cp->daddr.ip,
+ .daddr = daddr,
.saddr = 0,
.tos = rtos, } },
};
- if (ip_route_output_key(&init_net, &rt, &fl)) {
+ if (ip_route_output_key(net, &rt, &fl)) {
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
- &cp->daddr.ip);
+ &daddr);
return NULL;
}
}
+ local = rt->rt_flags & RTCF_LOCAL;
+ if (!((local ? 1 : 2) & rt_mode)) {
+ IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
+ (rt->rt_flags & RTCF_LOCAL) ?
+ "local":"non-local", &rt->rt_dst);
+ ip_rt_put(rt);
+ return NULL;
+ }
+ if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) &&
+ ort->rt_flags & RTCF_LOCAL)) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
+ "requires NAT method, dest: %pI4\n",
+ &ip_hdr(skb)->daddr, &rt->rt_dst);
+ ip_rt_put(rt);
+ return NULL;
+ }
+ if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
+ "to non-local address, dest: %pI4\n",
+ &ip_hdr(skb)->saddr, &rt->rt_dst);
+ ip_rt_put(rt);
+ return NULL;
+ }
+
return rt;
}
+/* Reroute packet to local IPv4 stack after DNAT */
+static int
+__ip_vs_reroute_locally(struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct net_device *dev = rt->dst.dev;
+ struct net *net = dev_net(dev);
+ struct iphdr *iph = ip_hdr(skb);
+
+ if (rt->fl.iif) {
+ unsigned long orefdst = skb->_skb_refdst;
+
+ if (ip_route_input(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev))
+ return 0;
+ refdst_drop(orefdst);
+ } else {
+ struct flowi fl = {
+ .oif = 0,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos),
+ }
+ },
+ .mark = skb->mark,
+ };
+ struct rtable *rt;
+
+ if (ip_route_output_key(net, &rt, &fl))
+ return 0;
+ if (!(rt->rt_flags & RTCF_LOCAL)) {
+ ip_rt_put(rt);
+ return 0;
+ }
+ /* Drop old route. */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ }
+ return 1;
+}
+
#ifdef CONFIG_IP_VS_IPV6
+
+static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
+{
+ return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK;
+}
+
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+ struct in6_addr *ret_saddr, int do_xfrm)
+{
+ struct dst_entry *dst;
+ struct flowi fl = {
+ .oif = 0,
+ .nl_u = {
+ .ip6_u = {
+ .daddr = *daddr,
+ },
+ },
+ };
+
+ dst = ip6_route_output(net, NULL, &fl);
+ if (dst->error)
+ goto out_err;
+ if (!ret_saddr)
+ return dst;
+ if (ipv6_addr_any(&fl.fl6_src) &&
+ ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+ &fl.fl6_dst, 0, &fl.fl6_src) < 0)
+ goto out_err;
+ if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+ goto out_err;
+ ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+ return dst;
+
+out_err:
+ dst_release(dst);
+ IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+ return NULL;
+}
+
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ * &4=Allow redirect from remote daddr to local
+ */
static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
+ struct in6_addr *daddr, struct in6_addr *ret_saddr,
+ int do_xfrm, int rt_mode)
{
+ struct net *net = dev_net(skb_dst(skb)->dev);
struct rt6_info *rt; /* Route to the other host */
- struct ip_vs_dest *dest = cp->dest;
+ struct rt6_info *ort; /* Original route */
+ struct dst_entry *dst;
+ int local;
if (dest) {
spin_lock(&dest->dst_lock);
- rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+ rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
if (!rt) {
- struct flowi fl = {
- .oif = 0,
- .nl_u = {
- .ip6_u = {
- .daddr = dest->addr.in6,
- .saddr = {
- .s6_addr32 =
- { 0, 0, 0, 0 },
- },
- },
- },
- };
+ u32 cookie;
- rt = (struct rt6_info *)ip6_route_output(&init_net,
- NULL, &fl);
- if (!rt) {
+ dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+ &dest->dst_saddr,
+ do_xfrm);
+ if (!dst) {
spin_unlock(&dest->dst_lock);
- IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
- &dest->addr.in6);
return NULL;
}
- __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst));
- IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
- &dest->addr.in6,
- atomic_read(&rt->u.dst.__refcnt));
+ rt = (struct rt6_info *) dst;
+ cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+ IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+ &dest->addr.in6, &dest->dst_saddr,
+ atomic_read(&rt->dst.__refcnt));
}
+ if (ret_saddr)
+ ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
spin_unlock(&dest->dst_lock);
} else {
- struct flowi fl = {
- .oif = 0,
- .nl_u = {
- .ip6_u = {
- .daddr = cp->daddr.in6,
- .saddr = {
- .s6_addr32 = { 0, 0, 0, 0 },
- },
- },
- },
- };
-
- rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
- if (!rt) {
- IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
- &cp->daddr.in6);
+ dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
+ if (!dst)
return NULL;
- }
+ rt = (struct rt6_info *) dst;
+ }
+
+ local = __ip_vs_is_local_route6(rt);
+ if (!((local ? 1 : 2) & rt_mode)) {
+ IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n",
+ local ? "local":"non-local", daddr);
+ dst_release(&rt->dst);
+ return NULL;
+ }
+ if (local && !(rt_mode & 4) &&
+ !((ort = (struct rt6_info *) skb_dst(skb)) &&
+ __ip_vs_is_local_route6(ort))) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI6 to local "
+ "requires NAT method, dest: %pI6\n",
+ &ipv6_hdr(skb)->daddr, daddr);
+ dst_release(&rt->dst);
+ return NULL;
+ }
+ if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+ IPV6_ADDR_LOOPBACK)) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 "
+ "to non-local address, dest: %pI6\n",
+ &ipv6_hdr(skb)->saddr, daddr);
+ dst_release(&rt->dst);
+ return NULL;
}
return rt;
@@ -193,12 +334,44 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
dst_release(old_dst);
}
-#define IP_VS_XMIT(pf, skb, rt) \
+#define IP_VS_XMIT_TUNNEL(skb, cp) \
+({ \
+ int __ret = NF_ACCEPT; \
+ \
+ (skb)->ipvs_property = 1; \
+ if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \
+ __ret = ip_vs_confirm_conntrack(skb, cp); \
+ if (__ret == NF_ACCEPT) { \
+ nf_reset(skb); \
+ skb_forward_csum(skb); \
+ } \
+ __ret; \
+})
+
+#define IP_VS_XMIT_NAT(pf, skb, cp, local) \
+do { \
+ (skb)->ipvs_property = 1; \
+ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
+ ip_vs_notrack(skb); \
+ else \
+ ip_vs_update_conntrack(skb, cp, 1); \
+ if (local) \
+ return NF_ACCEPT; \
+ skb_forward_csum(skb); \
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
+ skb_dst(skb)->dev, dst_output); \
+} while (0)
+
+#define IP_VS_XMIT(pf, skb, cp, local) \
do { \
(skb)->ipvs_property = 1; \
+ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
+ ip_vs_notrack(skb); \
+ if (local) \
+ return NF_ACCEPT; \
skb_forward_csum(skb); \
NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- (rt)->u.dst.dev, dst_output); \
+ skb_dst(skb)->dev, dst_output); \
} while (0)
@@ -210,7 +383,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
/* we do not touch skb and do not need pskb ptr */
- return NF_ACCEPT;
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
}
@@ -225,27 +398,16 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct rtable *rt; /* Route to the other host */
struct iphdr *iph = ip_hdr(skb);
- u8 tos = iph->tos;
int mtu;
- struct flowi fl = {
- .oif = 0,
- .nl_u = {
- .ip4_u = {
- .daddr = iph->daddr,
- .saddr = 0,
- .tos = RT_TOS(tos), } },
- };
EnterFunction(10);
- if (ip_route_output_key(&init_net, &rt, &fl)) {
- IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
- __func__, &iph->daddr);
+ if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
+ RT_TOS(iph->tos), 2)))
goto tx_error_icmp;
- }
/* MTU checking */
- mtu = dst_mtu(&rt->u.dst);
+ mtu = dst_mtu(&rt->dst);
if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
@@ -265,12 +427,12 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
LeaveFunction(10);
return NF_STOLEN;
@@ -291,28 +453,22 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rt6_info *rt; /* Route to the other host */
struct ipv6hdr *iph = ipv6_hdr(skb);
int mtu;
- struct flowi fl = {
- .oif = 0,
- .nl_u = {
- .ip6_u = {
- .daddr = iph->daddr,
- .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
- };
EnterFunction(10);
- rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
- if (!rt) {
- IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
- __func__, &iph->daddr);
+ if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2)))
goto tx_error_icmp;
- }
/* MTU checking */
- mtu = dst_mtu(&rt->u.dst);
+ mtu = dst_mtu(&rt->dst);
if (skb->len > mtu) {
- dst_release(&rt->u.dst);
+ if (!skb->dev) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net->loopback_dev;
+ }
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ dst_release(&rt->dst);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error;
}
@@ -323,18 +479,18 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
skb = skb_share_check(skb, GFP_ATOMIC);
if (unlikely(skb == NULL)) {
- dst_release(&rt->u.dst);
+ dst_release(&rt->dst);
return NF_STOLEN;
}
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
LeaveFunction(10);
return NF_STOLEN;
@@ -359,6 +515,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rtable *rt; /* Route to the other host */
int mtu;
struct iphdr *iph = ip_hdr(skb);
+ int local;
EnterFunction(10);
@@ -372,36 +529,73 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+ if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ RT_TOS(iph->tos), 1|2|4)))
goto tx_error_icmp;
+ local = rt->rt_flags & RTCF_LOCAL;
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+ "ip_vs_nat_xmit(): "
+ "stopping DNAT to local address");
+ goto tx_error_put;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+ IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
+ "stopping DNAT to loopback address");
+ goto tx_error_put;
+ }
/* MTU checking */
- mtu = dst_mtu(&rt->u.dst);
+ mtu = dst_mtu(&rt->dst);
if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
- ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
- goto tx_error;
+ IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
+ "ip_vs_nat_xmit(): frag needed for");
+ goto tx_error_put;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, sizeof(struct iphdr)))
goto tx_error_put;
- if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
goto tx_error_put;
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
-
/* mangle the packet */
if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
- goto tx_error;
+ goto tx_error_put;
ip_hdr(skb)->daddr = cp->daddr.ip;
ip_send_check(ip_hdr(skb));
- IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+ if (!local) {
+ /* drop old route */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ } else {
+ ip_rt_put(rt);
+ /*
+ * Some IPv4 replies get local address from routes,
+ * not from iph, so while we DNAT after routing
+ * we need this second input/output route.
+ */
+ if (!__ip_vs_reroute_locally(skb))
+ goto tx_error;
+ }
+
+ IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
@@ -410,7 +604,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+ IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
LeaveFunction(10);
return NF_STOLEN;
@@ -418,8 +612,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
- LeaveFunction(10);
kfree_skb(skb);
+ LeaveFunction(10);
return NF_STOLEN;
tx_error_put:
ip_rt_put(rt);
@@ -433,6 +627,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct rt6_info *rt; /* Route to the other host */
int mtu;
+ int local;
EnterFunction(10);
@@ -447,37 +642,73 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- rt = __ip_vs_get_out_rt_v6(cp);
- if (!rt)
+ if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ 0, 1|2|4)))
goto tx_error_icmp;
+ local = __ip_vs_is_local_route6(rt);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+ "ip_vs_nat_xmit_v6(): "
+ "stopping DNAT to local address");
+ goto tx_error_put;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+ "ip_vs_nat_xmit_v6(): "
+ "stopping DNAT to loopback address");
+ goto tx_error_put;
+ }
/* MTU checking */
- mtu = dst_mtu(&rt->u.dst);
+ mtu = dst_mtu(&rt->dst);
if (skb->len > mtu) {
- dst_release(&rt->u.dst);
+ if (!skb->dev) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net->loopback_dev;
+ }
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
"ip_vs_nat_xmit_v6(): frag needed for");
- goto tx_error;
+ goto tx_error_put;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
goto tx_error_put;
- if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
goto tx_error_put;
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
-
/* mangle the packet */
if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
goto tx_error;
- ipv6_hdr(skb)->daddr = cp->daddr.in6;
+ ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6);
- IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+ if (!local || !skb->dev) {
+ /* drop the old route when skb is not shared */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ } else {
+ /* destined to loopback, do we need to change route? */
+ dst_release(&rt->dst);
+ }
+
+ IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
@@ -486,7 +717,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+ IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
LeaveFunction(10);
return NF_STOLEN;
@@ -498,7 +729,7 @@ tx_error:
kfree_skb(skb);
return NF_STOLEN;
tx_error_put:
- dst_release(&rt->u.dst);
+ dst_release(&rt->dst);
goto tx_error;
}
#endif
@@ -532,30 +763,27 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct iphdr *old_iph = ip_hdr(skb);
u8 tos = old_iph->tos;
__be16 df = old_iph->frag_off;
- sk_buff_data_t old_transport_header = skb->transport_header;
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int mtu;
+ int ret;
EnterFunction(10);
- if (skb->protocol != htons(ETH_P_IP)) {
- IP_VS_DBG_RL("%s(): protocol error, "
- "ETH_P_IP: %d, skb protocol: %d\n",
- __func__, htons(ETH_P_IP), skb->protocol);
- goto tx_error;
- }
-
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+ if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ RT_TOS(tos), 1|2)))
goto tx_error_icmp;
+ if (rt->rt_flags & RTCF_LOCAL) {
+ ip_rt_put(rt);
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ }
- tdev = rt->u.dst.dev;
+ tdev = rt->dst.dev;
- mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
+ mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
if (mtu < 68) {
- ip_rt_put(rt);
IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
- goto tx_error;
+ goto tx_error_put;
}
if (skb_dst(skb))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
@@ -565,9 +793,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if ((old_iph->frag_off & htons(IP_DF))
&& mtu < ntohs(old_iph->tot_len)) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error;
+ goto tx_error_put;
}
/*
@@ -590,7 +817,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
old_iph = ip_hdr(skb);
}
- skb->transport_header = old_transport_header;
+ skb->transport_header = skb->network_header;
/* fix old IP header checksum */
ip_send_check(old_iph);
@@ -601,7 +828,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
/*
* Push down and install the IPIP header.
@@ -615,12 +842,16 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
iph->daddr = rt->rt_dst;
iph->saddr = rt->rt_src;
iph->ttl = old_iph->ttl;
- ip_select_ident(iph, &rt->u.dst, NULL);
+ ip_select_ident(iph, &rt->dst, NULL);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- ip_local_out(skb);
+ ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ if (ret == NF_ACCEPT)
+ ip_local_out(skb);
+ else if (ret == NF_DROP)
+ kfree_skb(skb);
LeaveFunction(10);
@@ -632,6 +863,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
kfree_skb(skb);
LeaveFunction(10);
return NF_STOLEN;
+tx_error_put:
+ ip_rt_put(rt);
+ goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
@@ -640,43 +874,44 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
struct rt6_info *rt; /* Route to the other host */
+ struct in6_addr saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
struct ipv6hdr *old_iph = ipv6_hdr(skb);
- sk_buff_data_t old_transport_header = skb->transport_header;
struct ipv6hdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int mtu;
+ int ret;
EnterFunction(10);
- if (skb->protocol != htons(ETH_P_IPV6)) {
- IP_VS_DBG_RL("%s(): protocol error, "
- "ETH_P_IPV6: %d, skb protocol: %d\n",
- __func__, htons(ETH_P_IPV6), skb->protocol);
- goto tx_error;
- }
-
- rt = __ip_vs_get_out_rt_v6(cp);
- if (!rt)
+ if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+ &saddr, 1, 1|2)))
goto tx_error_icmp;
+ if (__ip_vs_is_local_route6(rt)) {
+ dst_release(&rt->dst);
+ IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+ }
- tdev = rt->u.dst.dev;
+ tdev = rt->dst.dev;
- mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr);
- /* TODO IPv6: do we need this check in IPv6? */
- if (mtu < 1280) {
- dst_release(&rt->u.dst);
- IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
- goto tx_error;
+ mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+ if (mtu < IPV6_MIN_MTU) {
+ IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+ IPV6_MIN_MTU);
+ goto tx_error_put;
}
if (skb_dst(skb))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+ if (!skb->dev) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net->loopback_dev;
+ }
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->u.dst);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error;
+ goto tx_error_put;
}
/*
@@ -689,7 +924,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
- dst_release(&rt->u.dst);
+ dst_release(&rt->dst);
kfree_skb(skb);
IP_VS_ERR_RL("%s(): no memory\n", __func__);
return NF_STOLEN;
@@ -699,7 +934,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
old_iph = ipv6_hdr(skb);
}
- skb->transport_header = old_transport_header;
+ skb->transport_header = skb->network_header;
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
@@ -707,7 +942,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
/*
* Push down and install the IPIP header.
@@ -719,14 +954,18 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
iph->priority = old_iph->priority;
memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
- iph->daddr = rt->rt6i_dst.addr;
- iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */
+ ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
+ ipv6_addr_copy(&iph->saddr, &saddr);
iph->hop_limit = old_iph->hop_limit;
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- ip6_local_out(skb);
+ ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ if (ret == NF_ACCEPT)
+ ip6_local_out(skb);
+ else if (ret == NF_DROP)
+ kfree_skb(skb);
LeaveFunction(10);
@@ -738,6 +977,9 @@ tx_error:
kfree_skb(skb);
LeaveFunction(10);
return NF_STOLEN;
+tx_error_put:
+ dst_release(&rt->dst);
+ goto tx_error;
}
#endif
@@ -756,11 +998,16 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+ if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ RT_TOS(iph->tos), 1|2)))
goto tx_error_icmp;
+ if (rt->rt_flags & RTCF_LOCAL) {
+ ip_rt_put(rt);
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ }
/* MTU checking */
- mtu = dst_mtu(&rt->u.dst);
+ mtu = dst_mtu(&rt->dst);
if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
ip_rt_put(rt);
@@ -780,12 +1027,12 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
LeaveFunction(10);
return NF_STOLEN;
@@ -808,15 +1055,24 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
- rt = __ip_vs_get_out_rt_v6(cp);
- if (!rt)
+ if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ 0, 1|2)))
goto tx_error_icmp;
+ if (__ip_vs_is_local_route6(rt)) {
+ dst_release(&rt->dst);
+ IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+ }
/* MTU checking */
- mtu = dst_mtu(&rt->u.dst);
+ mtu = dst_mtu(&rt->dst);
if (skb->len > mtu) {
+ if (!skb->dev) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net->loopback_dev;
+ }
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->u.dst);
+ dst_release(&rt->dst);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error;
}
@@ -827,18 +1083,18 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
skb = skb_share_check(skb, GFP_ATOMIC);
if (unlikely(skb == NULL)) {
- dst_release(&rt->u.dst);
+ dst_release(&rt->dst);
return NF_STOLEN;
}
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
LeaveFunction(10);
return NF_STOLEN;
@@ -864,6 +1120,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rtable *rt; /* Route to the other host */
int mtu;
int rc;
+ int local;
EnterFunction(10);
@@ -884,35 +1141,73 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
* mangle and send the packet here (only for VS/NAT)
*/
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+ if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ RT_TOS(ip_hdr(skb)->tos), 1|2|4)))
goto tx_error_icmp;
+ local = rt->rt_flags & RTCF_LOCAL;
+
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG(10, "%s(): "
+ "stopping DNAT to local address %pI4\n",
+ __func__, &cp->daddr.ip);
+ goto tx_error_put;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+ IP_VS_DBG(1, "%s(): "
+ "stopping DNAT to loopback %pI4\n",
+ __func__, &cp->daddr.ip);
+ goto tx_error_put;
+ }
/* MTU checking */
- mtu = dst_mtu(&rt->u.dst);
+ mtu = dst_mtu(&rt->dst);
if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
- ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error;
+ goto tx_error_put;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, offset))
goto tx_error_put;
- if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
goto tx_error_put;
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
-
ip_vs_nat_icmp(skb, pp, cp, 0);
+ if (!local) {
+ /* drop the old route when skb is not shared */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ } else {
+ ip_rt_put(rt);
+ /*
+ * Some IPv4 replies get local address from routes,
+ * not from iph, so while we DNAT after routing
+ * we need this second input/output route.
+ */
+ if (!__ip_vs_reroute_locally(skb))
+ goto tx_error;
+ }
+
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+ IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
rc = NF_STOLEN;
goto out;
@@ -938,6 +1233,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rt6_info *rt; /* Route to the other host */
int mtu;
int rc;
+ int local;
EnterFunction(10);
@@ -958,36 +1254,73 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
* mangle and send the packet here (only for VS/NAT)
*/
- rt = __ip_vs_get_out_rt_v6(cp);
- if (!rt)
+ if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ 0, 1|2|4)))
goto tx_error_icmp;
+ local = __ip_vs_is_local_route6(rt);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG(10, "%s(): "
+ "stopping DNAT to local address %pI6\n",
+ __func__, &cp->daddr.in6);
+ goto tx_error_put;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ IP_VS_DBG(1, "%s(): "
+ "stopping DNAT to loopback %pI6\n",
+ __func__, &cp->daddr.in6);
+ goto tx_error_put;
+ }
+
/* MTU checking */
- mtu = dst_mtu(&rt->u.dst);
+ mtu = dst_mtu(&rt->dst);
if (skb->len > mtu) {
- dst_release(&rt->u.dst);
+ if (!skb->dev) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net->loopback_dev;
+ }
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error;
+ goto tx_error_put;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, offset))
goto tx_error_put;
- if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
goto tx_error_put;
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
-
ip_vs_nat_icmp_v6(skb, pp, cp, 0);
+ if (!local || !skb->dev) {
+ /* drop the old route when skb is not shared */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ } else {
+ /* destined to loopback, do we need to change route? */
+ dst_release(&rt->dst);
+ }
+
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+ IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
rc = NF_STOLEN;
goto out;
@@ -1001,7 +1334,7 @@ out:
LeaveFunction(10);
return rc;
tx_error_put:
- dst_release(&rt->u.dst);
+ dst_release(&rt->dst);
goto tx_error;
}
#endif
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index ab81b380eae6..5178c691ecbf 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -17,13 +17,7 @@
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
-#ifdef CONFIG_NF_CT_ACCT
-#define NF_CT_ACCT_DEFAULT 1
-#else
-#define NF_CT_ACCT_DEFAULT 0
-#endif
-
-static int nf_ct_acct __read_mostly = NF_CT_ACCT_DEFAULT;
+static int nf_ct_acct __read_mostly;
module_param_named(acct, nf_ct_acct, bool, 0644);
MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
@@ -114,12 +108,6 @@ int nf_conntrack_acct_init(struct net *net)
net->ct.sysctl_acct = nf_ct_acct;
if (net_eq(net, &init_net)) {
-#ifdef CONFIG_NF_CT_ACCT
- printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Please use\n");
- printk(KERN_WARNING "nf_conntrack.acct=1 kernel parameter, acct=1 nf_conntrack module option or\n");
- printk(KERN_WARNING "sysctl net.netfilter.nf_conntrack_acct=1 to enable it.\n");
-#endif
-
ret = nf_ct_extend_register(&acct_extend);
if (ret < 0) {
printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n");
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index eeeb8bc73982..1eacf8d9966a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -62,35 +62,45 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
-struct nf_conn nf_conntrack_untracked __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
+DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
+EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
-static int nf_conntrack_hash_rnd_initted;
-static unsigned int nf_conntrack_hash_rnd;
+static unsigned int nf_conntrack_hash_rnd __read_mostly;
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
- u16 zone, unsigned int size, unsigned int rnd)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
{
unsigned int n;
- u_int32_t h;
/* The direction must be ignored, so we hash everything up to the
* destination ports (which is a multiple of 4) and treat the last
* three bytes manually.
*/
n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- h = jhash2((u32 *)tuple, n,
- zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
- tuple->dst.protonum));
+ return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+ (((__force __u16)tuple->dst.u.all << 16) |
+ tuple->dst.protonum));
+}
+
+static u32 __hash_bucket(u32 hash, unsigned int size)
+{
+ return ((u64)hash * size) >> 32;
+}
+
+static u32 hash_bucket(u32 hash, const struct net *net)
+{
+ return __hash_bucket(hash, net->ct.htable_size);
+}
- return ((u64)h * size) >> 32;
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+ u16 zone, unsigned int size)
+{
+ return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
}
static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
const struct nf_conntrack_tuple *tuple)
{
- return __hash_conntrack(tuple, zone, net->ct.htable_size,
- nf_conntrack_hash_rnd);
+ return __hash_conntrack(tuple, zone, net->ct.htable_size);
}
bool
@@ -292,20 +302,20 @@ static void death_by_timeout(unsigned long ul_conntrack)
* OR
* - Caller must lock nf_conntrack_lock before calling this function
*/
-struct nf_conntrack_tuple_hash *
-__nf_conntrack_find(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+____nf_conntrack_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- unsigned int hash = hash_conntrack(net, zone, tuple);
+ unsigned int bucket = hash_bucket(hash, net);
/* Disable BHs the entire time since we normally need to disable them
* at least once for the stats anyway.
*/
local_bh_disable();
begin:
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+ hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
if (nf_ct_tuple_equal(tuple, &h->tuple) &&
nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
NF_CT_STAT_INC(net, found);
@@ -319,7 +329,7 @@ begin:
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
- if (get_nulls_value(n) != hash) {
+ if (get_nulls_value(n) != bucket) {
NF_CT_STAT_INC(net, search_restart);
goto begin;
}
@@ -327,19 +337,27 @@ begin:
return NULL;
}
+
+struct nf_conntrack_tuple_hash *
+__nf_conntrack_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return ____nf_conntrack_find(net, zone, tuple,
+ hash_conntrack_raw(tuple, zone));
+}
EXPORT_SYMBOL_GPL(__nf_conntrack_find);
/* Find a connection corresponding to a tuple. */
-struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
rcu_read_lock();
begin:
- h = __nf_conntrack_find(net, zone, tuple);
+ h = ____nf_conntrack_find(net, zone, tuple, hash);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (unlikely(nf_ct_is_dying(ct) ||
@@ -357,6 +375,14 @@ begin:
return h;
}
+
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return __nf_conntrack_find_get(net, zone, tuple,
+ hash_conntrack_raw(tuple, zone));
+}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
static void __nf_conntrack_hash_insert(struct nf_conn *ct,
@@ -409,8 +435,11 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_ACCEPT;
zone = nf_ct_zone(ct);
- hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ /* reuse the hash saved before */
+ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+ hash = hash_bucket(hash, net);
+ repl_hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
/* We're not in hash table, and we refuse to set up related
connections for unconfirmed conns. But packet copies and
@@ -567,17 +596,29 @@ static noinline int early_drop(struct net *net, unsigned int hash)
return dropped;
}
-struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *orig,
- const struct nf_conntrack_tuple *repl,
- gfp_t gfp)
+static struct nf_conn *
+__nf_conntrack_alloc(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl,
+ gfp_t gfp, u32 hash)
{
struct nf_conn *ct;
- if (unlikely(!nf_conntrack_hash_rnd_initted)) {
- get_random_bytes(&nf_conntrack_hash_rnd,
- sizeof(nf_conntrack_hash_rnd));
- nf_conntrack_hash_rnd_initted = 1;
+ if (unlikely(!nf_conntrack_hash_rnd)) {
+ unsigned int rand;
+
+ /*
+ * Why not initialize nf_conntrack_rnd in a "init()" function ?
+ * Because there isn't enough entropy when system initializing,
+ * and we initialize it as late as possible.
+ */
+ do {
+ get_random_bytes(&rand, sizeof(rand));
+ } while (!rand);
+ cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+
+ /* recompute the hash as nf_conntrack_hash_rnd is initialized */
+ hash = hash_conntrack_raw(orig, zone);
}
/* We don't want any race condition at early drop stage */
@@ -585,8 +626,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
if (nf_conntrack_max &&
unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
- unsigned int hash = hash_conntrack(net, zone, orig);
- if (!early_drop(net, hash)) {
+ if (!early_drop(net, hash_bucket(hash, net))) {
atomic_dec(&net->ct.count);
if (net_ratelimit())
printk(KERN_WARNING
@@ -616,12 +656,11 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
- ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
+ /* save hash for reusing when confirming */
+ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
/* Don't set timer yet: wait for confirmation */
setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
-#ifdef CONFIG_NET_NS
- ct->ct_net = net;
-#endif
+ write_pnet(&ct->ct_net, net);
#ifdef CONFIG_NF_CONNTRACK_ZONES
if (zone) {
struct nf_conntrack_zone *nf_ct_zone;
@@ -645,6 +684,14 @@ out_free:
return ERR_PTR(-ENOMEM);
#endif
}
+
+struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl,
+ gfp_t gfp)
+{
+ return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
+}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
void nf_conntrack_free(struct nf_conn *ct)
@@ -666,7 +713,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
struct nf_conntrack_l3proto *l3proto,
struct nf_conntrack_l4proto *l4proto,
struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, u32 hash)
{
struct nf_conn *ct;
struct nf_conn_help *help;
@@ -680,7 +727,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
return NULL;
}
- ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC);
+ ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
+ hash);
if (IS_ERR(ct)) {
pr_debug("Can't allocate conntrack.\n");
return (struct nf_conntrack_tuple_hash *)ct;
@@ -757,6 +805,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ u32 hash;
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, l3num, protonum, &tuple, l3proto,
@@ -766,10 +815,11 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
}
/* look for tuple match */
- h = nf_conntrack_find_get(net, zone, &tuple);
+ hash = hash_conntrack_raw(&tuple, zone);
+ h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
- skb, dataoff);
+ skb, dataoff, hash);
if (!h)
return NULL;
if (IS_ERR(h))
@@ -968,8 +1018,7 @@ acct:
if (acct) {
spin_lock_bh(&ct->lock);
acct[CTINFO2DIR(ctinfo)].packets++;
- acct[CTINFO2DIR(ctinfo)].bytes +=
- skb->len - skb_network_offset(skb);
+ acct[CTINFO2DIR(ctinfo)].bytes += skb->len;
spin_unlock_bh(&ct->lock);
}
}
@@ -1183,10 +1232,21 @@ static void nf_ct_release_dying_list(struct net *net)
spin_unlock_bh(&nf_conntrack_lock);
}
+static int untrack_refs(void)
+{
+ int cnt = 0, cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
+
+ cnt += atomic_read(&ct->ct_general.use) - 1;
+ }
+ return cnt;
+}
+
static void nf_conntrack_cleanup_init_net(void)
{
- /* wait until all references to nf_conntrack_untracked are dropped */
- while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
+ while (untrack_refs() > 0)
schedule();
nf_conntrack_helper_fini();
@@ -1299,8 +1359,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
- hashsize,
- nf_conntrack_hash_rnd);
+ hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
@@ -1321,10 +1380,19 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
&nf_conntrack_htable_size, 0600);
+void nf_ct_untracked_status_or(unsigned long bits)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(nf_conntrack_untracked, cpu).status |= bits;
+}
+EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
+
static int nf_conntrack_init_init_net(void)
{
int max_factor = 8;
- int ret;
+ int ret, cpu;
/* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
* machine has 512 buckets. >= 1GB machines have 16384 buckets. */
@@ -1363,13 +1431,13 @@ static int nf_conntrack_init_init_net(void)
goto err_extend;
#endif
/* Set up fake conntrack: to never be deleted, not in any hashes */
-#ifdef CONFIG_NET_NS
- nf_conntrack_untracked.ct_net = &init_net;
-#endif
- atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
+ for_each_possible_cpu(cpu) {
+ struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
+ write_pnet(&ct->ct_net, &init_net);
+ atomic_set(&ct->ct_general.use, 1);
+ }
/* - and look it like as a confirmed connection */
- set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
-
+ nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
return 0;
#ifdef CONFIG_NF_CONNTRACK_ZONES
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index cdcc7649476b..5702de35e2bb 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -26,10 +26,10 @@
static DEFINE_MUTEX(nf_ct_ecache_mutex);
-struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly;
+struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
-struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly;
+struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly;
EXPORT_SYMBOL_GPL(nf_expect_event_cb);
/* deliver cached events and clear cache entry - must be called with locally
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index acb29ccaa41f..46e8966912b1 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -38,25 +38,30 @@ static int nf_ct_expect_hash_rnd_initted __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
+static HLIST_HEAD(nf_ct_userspace_expect_list);
+
/* nf_conntrack_expect helper functions */
-void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
+void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+ u32 pid, int report)
{
struct nf_conn_help *master_help = nfct_help(exp->master);
struct net *net = nf_ct_exp_net(exp);
- NF_CT_ASSERT(master_help);
NF_CT_ASSERT(!timer_pending(&exp->timeout));
hlist_del_rcu(&exp->hnode);
net->ct.expect_count--;
hlist_del(&exp->lnode);
- master_help->expecting[exp->class]--;
+ if (!(exp->flags & NF_CT_EXPECT_USERSPACE))
+ master_help->expecting[exp->class]--;
+
+ nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
nf_ct_expect_put(exp);
NF_CT_STAT_INC(net, expect_delete);
}
-EXPORT_SYMBOL_GPL(nf_ct_unlink_expect);
+EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
static void nf_ct_expectation_timed_out(unsigned long ul_expect)
{
@@ -320,16 +325,21 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
atomic_inc(&exp->use);
- hlist_add_head(&exp->lnode, &master_help->expectations);
- master_help->expecting[exp->class]++;
+ if (master_help) {
+ hlist_add_head(&exp->lnode, &master_help->expectations);
+ master_help->expecting[exp->class]++;
+ } else if (exp->flags & NF_CT_EXPECT_USERSPACE)
+ hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list);
hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
net->ct.expect_count++;
setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
(unsigned long)exp);
- p = &master_help->helper->expect_policy[exp->class];
- exp->timeout.expires = jiffies + p->timeout * HZ;
+ if (master_help) {
+ p = &master_help->helper->expect_policy[exp->class];
+ exp->timeout.expires = jiffies + p->timeout * HZ;
+ }
add_timer(&exp->timeout);
atomic_inc(&exp->use);
@@ -380,7 +390,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
unsigned int h;
int ret = 1;
- if (!master_help->helper) {
+ /* Don't allow expectations created from kernel-space with no helper */
+ if (!(expect->flags & NF_CT_EXPECT_USERSPACE) &&
+ (!master_help || (master_help && !master_help->helper))) {
ret = -ESHUTDOWN;
goto out;
}
@@ -398,13 +410,16 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
}
}
/* Will be over limit? */
- p = &master_help->helper->expect_policy[expect->class];
- if (p->max_expected &&
- master_help->expecting[expect->class] >= p->max_expected) {
- evict_oldest_expect(master, expect);
- if (master_help->expecting[expect->class] >= p->max_expected) {
- ret = -EMFILE;
- goto out;
+ if (master_help) {
+ p = &master_help->helper->expect_policy[expect->class];
+ if (p->max_expected &&
+ master_help->expecting[expect->class] >= p->max_expected) {
+ evict_oldest_expect(master, expect);
+ if (master_help->expecting[expect->class]
+ >= p->max_expected) {
+ ret = -EMFILE;
+ goto out;
+ }
}
}
@@ -439,6 +454,21 @@ out:
}
EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
+void nf_ct_remove_userspace_expectations(void)
+{
+ struct nf_conntrack_expect *exp;
+ struct hlist_node *n, *next;
+
+ hlist_for_each_entry_safe(exp, n, next,
+ &nf_ct_userspace_expect_list, lnode) {
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ nf_ct_expect_put(exp);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations);
+
#ifdef CONFIG_PROC_FS
struct ct_expect_iter_state {
struct seq_net_private p;
@@ -529,8 +559,12 @@ static int exp_seq_show(struct seq_file *s, void *v)
seq_printf(s, "PERMANENT");
delim = ",";
}
- if (expect->flags & NF_CT_EXPECT_INACTIVE)
+ if (expect->flags & NF_CT_EXPECT_INACTIVE) {
seq_printf(s, "%sINACTIVE", delim);
+ delim = ",";
+ }
+ if (expect->flags & NF_CT_EXPECT_USERSPACE)
+ seq_printf(s, "%sUSERSPACE", delim);
helper = rcu_dereference(nfct_help(expect->master)->helper);
if (helper) {
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index fdc8fb4ae10f..bd82450c193f 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -16,16 +16,17 @@
#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack_extend.h>
-static struct nf_ct_ext_type *nf_ct_ext_types[NF_CT_EXT_NUM];
+static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
static DEFINE_MUTEX(nf_ct_ext_type_mutex);
void __nf_ct_ext_destroy(struct nf_conn *ct)
{
unsigned int i;
struct nf_ct_ext_type *t;
+ struct nf_ct_ext *ext = ct->ext;
for (i = 0; i < NF_CT_EXT_NUM; i++) {
- if (!nf_ct_ext_exist(ct, i))
+ if (!__nf_ct_ext_exist(ext, i))
continue;
rcu_read_lock();
@@ -47,15 +48,17 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)
{
unsigned int off, len;
struct nf_ct_ext_type *t;
+ size_t alloc_size;
rcu_read_lock();
t = rcu_dereference(nf_ct_ext_types[id]);
BUG_ON(t == NULL);
off = ALIGN(sizeof(struct nf_ct_ext), t->align);
len = off + t->len;
+ alloc_size = t->alloc_size;
rcu_read_unlock();
- *ext = kzalloc(t->alloc_size, gfp);
+ *ext = kzalloc(alloc_size, gfp);
if (!*ext)
return NULL;
@@ -73,44 +76,45 @@ static void __nf_ct_ext_free_rcu(struct rcu_head *head)
void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
{
- struct nf_ct_ext *new;
+ struct nf_ct_ext *old, *new;
int i, newlen, newoff;
struct nf_ct_ext_type *t;
/* Conntrack must not be confirmed to avoid races on reallocation. */
NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
- if (!ct->ext)
+ old = ct->ext;
+ if (!old)
return nf_ct_ext_create(&ct->ext, id, gfp);
- if (nf_ct_ext_exist(ct, id))
+ if (__nf_ct_ext_exist(old, id))
return NULL;
rcu_read_lock();
t = rcu_dereference(nf_ct_ext_types[id]);
BUG_ON(t == NULL);
- newoff = ALIGN(ct->ext->len, t->align);
+ newoff = ALIGN(old->len, t->align);
newlen = newoff + t->len;
rcu_read_unlock();
- new = __krealloc(ct->ext, newlen, gfp);
+ new = __krealloc(old, newlen, gfp);
if (!new)
return NULL;
- if (new != ct->ext) {
+ if (new != old) {
for (i = 0; i < NF_CT_EXT_NUM; i++) {
- if (!nf_ct_ext_exist(ct, i))
+ if (!__nf_ct_ext_exist(old, i))
continue;
rcu_read_lock();
t = rcu_dereference(nf_ct_ext_types[i]);
if (t && t->move)
t->move((void *)new + new->offset[i],
- (void *)ct->ext + ct->ext->offset[i]);
+ (void *)old + old->offset[i]);
rcu_read_unlock();
}
- call_rcu(&ct->ext->rcu, __nf_ct_ext_free_rcu);
+ call_rcu(&old->rcu, __nf_ct_ext_free_rcu);
ct->ext = new;
}
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 6eaee7c8a337..b969025cf82f 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -734,11 +734,11 @@ static int callforward_do_filter(const union nf_inet_addr *src,
if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) {
if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) {
if (rt1->rt_gateway == rt2->rt_gateway &&
- rt1->u.dst.dev == rt2->u.dst.dev)
+ rt1->dst.dev == rt2->dst.dev)
ret = 1;
- dst_release(&rt2->u.dst);
+ dst_release(&rt2->dst);
}
- dst_release(&rt1->u.dst);
+ dst_release(&rt1->dst);
}
break;
}
@@ -753,11 +753,11 @@ static int callforward_do_filter(const union nf_inet_addr *src,
if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) {
if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway,
sizeof(rt1->rt6i_gateway)) &&
- rt1->u.dst.dev == rt2->u.dst.dev)
+ rt1->dst.dev == rt2->dst.dev)
ret = 1;
- dst_release(&rt2->u.dst);
+ dst_release(&rt2->dst);
}
- dst_release(&rt1->u.dst);
+ dst_release(&rt1->dst);
}
break;
}
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index 497b2224536f..aadde018a072 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -61,7 +61,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
goto out;
rcu_read_lock();
- in_dev = __in_dev_get_rcu(rt->u.dst.dev);
+ in_dev = __in_dev_get_rcu(rt->dst.dev);
if (in_dev != NULL) {
for_primary_ifa(in_dev) {
if (ifa->ifa_broadcast == iph->daddr) {
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index c42ff6aa441d..b729ace1dcc1 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -22,6 +22,7 @@
#include <linux/rculist_nulls.h>
#include <linux/types.h>
#include <linux/timer.h>
+#include <linux/security.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/netlink.h>
@@ -245,16 +246,31 @@ nla_put_failure:
#ifdef CONFIG_NF_CONNTRACK_SECMARK
static inline int
-ctnetlink_dump_secmark(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
{
- NLA_PUT_BE32(skb, CTA_SECMARK, htonl(ct->secmark));
- return 0;
+ struct nlattr *nest_secctx;
+ int len, ret;
+ char *secctx;
+
+ ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+ if (ret)
+ return ret;
+
+ ret = -1;
+ nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
+ if (!nest_secctx)
+ goto nla_put_failure;
+ NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx);
+ nla_nest_end(skb, nest_secctx);
+
+ ret = 0;
nla_put_failure:
- return -1;
+ security_release_secctx(secctx, len);
+ return ret;
}
#else
-#define ctnetlink_dump_secmark(a, b) (0)
+#define ctnetlink_dump_secctx(a, b) (0)
#endif
#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -391,7 +407,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
ctnetlink_dump_protoinfo(skb, ct) < 0 ||
ctnetlink_dump_helpinfo(skb, ct) < 0 ||
ctnetlink_dump_mark(skb, ct) < 0 ||
- ctnetlink_dump_secmark(skb, ct) < 0 ||
+ ctnetlink_dump_secctx(skb, ct) < 0 ||
ctnetlink_dump_id(skb, ct) < 0 ||
ctnetlink_dump_use(skb, ct) < 0 ||
ctnetlink_dump_master(skb, ct) < 0 ||
@@ -437,6 +453,17 @@ ctnetlink_counters_size(const struct nf_conn *ct)
;
}
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct)
+{
+ int len;
+
+ security_secid_to_secctx(ct->secmark, NULL, &len);
+
+ return sizeof(char) * len;
+}
+#endif
+
static inline size_t
ctnetlink_nlmsg_size(const struct nf_conn *ct)
{
@@ -453,7 +480,8 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
+ nla_total_size(0) /* CTA_HELP */
+ nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
#ifdef CONFIG_NF_CONNTRACK_SECMARK
- + nla_total_size(sizeof(u_int32_t)) /* CTA_SECMARK */
+ + nla_total_size(0) /* CTA_SECCTX */
+ + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* CTA_SECCTX_NAME */
#endif
#ifdef CONFIG_NF_NAT_NEEDED
+ 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
@@ -480,7 +508,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
int err;
/* ignore our fake conntrack entry */
- if (ct == &nf_conntrack_untracked)
+ if (nf_ct_is_untracked(ct))
return 0;
if (events & (1 << IPCT_DESTROY)) {
@@ -556,7 +584,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
#ifdef CONFIG_NF_CONNTRACK_SECMARK
if ((events & (1 << IPCT_SECMARK) || ct->secmark)
- && ctnetlink_dump_secmark(skb, ct) < 0)
+ && ctnetlink_dump_secctx(skb, ct) < 0)
goto nla_put_failure;
#endif
@@ -1560,8 +1588,8 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
const struct nf_conntrack_expect *exp)
{
struct nf_conn *master = exp->master;
- struct nf_conntrack_helper *helper;
long timeout = (exp->timeout.expires - jiffies) / HZ;
+ struct nf_conn_help *help;
if (timeout < 0)
timeout = 0;
@@ -1577,9 +1605,15 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout));
NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp));
- helper = rcu_dereference(nfct_help(master)->helper);
- if (helper)
- NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+ NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags));
+ help = nfct_help(master);
+ if (help) {
+ struct nf_conntrack_helper *helper;
+
+ helper = rcu_dereference(help->helper);
+ if (helper)
+ NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+ }
return 0;
@@ -1626,17 +1660,20 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
struct sk_buff *skb;
- unsigned int type;
+ unsigned int type, group;
int flags = 0;
- if (events & (1 << IPEXP_NEW)) {
+ if (events & (1 << IPEXP_DESTROY)) {
+ type = IPCTNL_MSG_EXP_DELETE;
+ group = NFNLGRP_CONNTRACK_EXP_DESTROY;
+ } else if (events & (1 << IPEXP_NEW)) {
type = IPCTNL_MSG_EXP_NEW;
flags = NLM_F_CREATE|NLM_F_EXCL;
+ group = NFNLGRP_CONNTRACK_EXP_NEW;
} else
return 0;
- if (!item->report &&
- !nfnetlink_has_listeners(net, NFNLGRP_CONNTRACK_EXP_NEW))
+ if (!item->report && !nfnetlink_has_listeners(net, group))
return 0;
skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
@@ -1659,8 +1696,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
rcu_read_unlock();
nlmsg_end(skb, nlh);
- nfnetlink_send(skb, net, item->pid, NFNLGRP_CONNTRACK_EXP_NEW,
- item->report, GFP_ATOMIC);
+ nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC);
return 0;
nla_put_failure:
@@ -1733,6 +1769,8 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
[CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 },
[CTA_EXPECT_ID] = { .type = NLA_U32 },
[CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING },
+ [CTA_EXPECT_ZONE] = { .type = NLA_U16 },
+ [CTA_EXPECT_FLAGS] = { .type = NLA_U32 },
};
static int
@@ -1841,7 +1879,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
}
/* after list removal, usage count == 1 */
- nf_ct_unexpect_related(exp);
+ spin_lock_bh(&nf_conntrack_lock);
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid,
+ nlmsg_report(nlh));
+ nf_ct_expect_put(exp);
+ }
+ spin_unlock_bh(&nf_conntrack_lock);
/* have to put what we 'get' above.
* after this line usage count == 0 */
nf_ct_expect_put(exp);
@@ -1858,7 +1902,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
m_help = nfct_help(exp->master);
if (!strcmp(m_help->helper->name, name) &&
del_timer(&exp->timeout)) {
- nf_ct_unlink_expect(exp);
+ nf_ct_unlink_expect_report(exp,
+ NETLINK_CB(skb).pid,
+ nlmsg_report(nlh));
nf_ct_expect_put(exp);
}
}
@@ -1872,7 +1918,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
&net->ct.expect_hash[i],
hnode) {
if (del_timer(&exp->timeout)) {
- nf_ct_unlink_expect(exp);
+ nf_ct_unlink_expect_report(exp,
+ NETLINK_CB(skb).pid,
+ nlmsg_report(nlh));
nf_ct_expect_put(exp);
}
}
@@ -1918,23 +1966,35 @@ ctnetlink_create_expect(struct net *net, u16 zone,
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
- help = nfct_help(ct);
-
- if (!help || !help->helper) {
- /* such conntrack hasn't got any helper, abort */
- err = -EOPNOTSUPP;
- goto out;
- }
-
exp = nf_ct_expect_alloc(ct);
if (!exp) {
err = -ENOMEM;
goto out;
}
+ help = nfct_help(ct);
+ if (!help) {
+ if (!cda[CTA_EXPECT_TIMEOUT]) {
+ err = -EINVAL;
+ goto out;
+ }
+ exp->timeout.expires =
+ jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
+
+ exp->flags = NF_CT_EXPECT_USERSPACE;
+ if (cda[CTA_EXPECT_FLAGS]) {
+ exp->flags |=
+ ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+ }
+ } else {
+ if (cda[CTA_EXPECT_FLAGS]) {
+ exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+ exp->flags &= ~NF_CT_EXPECT_USERSPACE;
+ } else
+ exp->flags = 0;
+ }
exp->class = 0;
exp->expectfn = NULL;
- exp->flags = 0;
exp->master = ct;
exp->helper = NULL;
memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
@@ -2102,6 +2162,7 @@ static void __exit ctnetlink_exit(void)
{
pr_info("ctnetlink: unregistering from nfnetlink.\n");
+ nf_ct_remove_userspace_expectations();
#ifdef CONFIG_NF_CONNTRACK_EVENTS
nf_ct_expect_unregister_notifier(&ctnl_notifier_exp);
nf_conntrack_unregister_notifier(&ctnl_notifier);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 5886ba1d52a0..ed6d92958023 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -28,8 +28,8 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
-static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly;
-struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly;
+static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
+struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_l3protos);
static DEFINE_MUTEX(nf_ct_proto_mutex);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 9dd8cd4fb6e6..3fb2b73b24dc 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -329,8 +329,8 @@ static unsigned int get_conntrack_index(const struct tcphdr *tcph)
/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
in IP Filter' by Guido van Rooij.
- http://www.nluug.nl/events/sane2000/papers.html
- http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
+ http://www.sane.nl/events/sane2000/papers.html
+ http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
The boundaries and the conditions are changed according to RFC793:
the packet must intersect the window (i.e. segments may be
@@ -585,8 +585,16 @@ static bool tcp_in_window(const struct nf_conn *ct,
* Let's try to use the data from the packet.
*/
sender->td_end = end;
+ win <<= sender->td_scale;
sender->td_maxwin = (win == 0 ? 1 : win);
sender->td_maxend = end + sender->td_maxwin;
+ /*
+ * We haven't seen traffic in the other direction yet
+ * but we have to tweak window tracking to pass III
+ * and IV until that happens.
+ */
+ if (receiver->td_maxwin == 0)
+ receiver->td_end = receiver->td_maxend = sack;
}
} else if (((state->state == TCP_CONNTRACK_SYN_SENT
&& dir == IP_CT_DIR_ORIGINAL)
@@ -680,7 +688,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
/*
* Update receiver data.
*/
- if (after(end, sender->td_maxend))
+ if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
receiver->td_maxwin += end - sender->td_maxend;
if (after(sack + win, receiver->td_maxend - 1)) {
receiver->td_maxend = sack + win;
@@ -736,27 +744,19 @@ static bool tcp_in_window(const struct nf_conn *ct,
return res;
}
-#define TH_FIN 0x01
-#define TH_SYN 0x02
-#define TH_RST 0x04
-#define TH_PUSH 0x08
-#define TH_ACK 0x10
-#define TH_URG 0x20
-#define TH_ECE 0x40
-#define TH_CWR 0x80
-
/* table of valid flag combinations - PUSH, ECE and CWR are always valid */
-static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] =
+static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
+ TCPHDR_URG) + 1] =
{
- [TH_SYN] = 1,
- [TH_SYN|TH_URG] = 1,
- [TH_SYN|TH_ACK] = 1,
- [TH_RST] = 1,
- [TH_RST|TH_ACK] = 1,
- [TH_FIN|TH_ACK] = 1,
- [TH_FIN|TH_ACK|TH_URG] = 1,
- [TH_ACK] = 1,
- [TH_ACK|TH_URG] = 1,
+ [TCPHDR_SYN] = 1,
+ [TCPHDR_SYN|TCPHDR_URG] = 1,
+ [TCPHDR_SYN|TCPHDR_ACK] = 1,
+ [TCPHDR_RST] = 1,
+ [TCPHDR_RST|TCPHDR_ACK] = 1,
+ [TCPHDR_FIN|TCPHDR_ACK] = 1,
+ [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG] = 1,
+ [TCPHDR_ACK] = 1,
+ [TCPHDR_ACK|TCPHDR_URG] = 1,
};
/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */
@@ -803,7 +803,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
}
/* Check TCP flags. */
- tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR|TH_PUSH));
+ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
if (!tcp_valid_flags[tcpflags]) {
if (LOG_INVALID(net, IPPROTO_TCP))
nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 53d892210a04..bcf47eb518ef 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -130,6 +130,44 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
return len;
}
+static int iswordc(const char c)
+{
+ if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
+ (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+ c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
+ c == '{' || c == '}' || c == '~')
+ return 1;
+ return 0;
+}
+
+static int word_len(const char *dptr, const char *limit)
+{
+ int len = 0;
+ while (dptr < limit && iswordc(*dptr)) {
+ dptr++;
+ len++;
+ }
+ return len;
+}
+
+static int callid_len(const struct nf_conn *ct, const char *dptr,
+ const char *limit, int *shift)
+{
+ int len, domain_len;
+
+ len = word_len(dptr, limit);
+ dptr += len;
+ if (!len || dptr == limit || *dptr != '@')
+ return len;
+ dptr++;
+ len++;
+
+ domain_len = word_len(dptr, limit);
+ if (!domain_len)
+ return 0;
+ return len + domain_len;
+}
+
/* get media type + port length */
static int media_len(const struct nf_conn *ct, const char *dptr,
const char *limit, int *shift)
@@ -152,6 +190,9 @@ static int parse_addr(const struct nf_conn *ct, const char *cp,
const char *end;
int ret = 0;
+ if (!ct)
+ return 0;
+
memset(addr, 0, sizeof(*addr));
switch (nf_ct_l3num(ct)) {
case AF_INET:
@@ -296,6 +337,7 @@ static const struct sip_header ct_sip_hdrs[] = {
[SIP_HDR_VIA_TCP] = SIP_HDR("Via", "v", "TCP ", epaddr_len),
[SIP_HDR_EXPIRES] = SIP_HDR("Expires", NULL, NULL, digits_len),
[SIP_HDR_CONTENT_LENGTH] = SIP_HDR("Content-Length", "l", NULL, digits_len),
+ [SIP_HDR_CALL_ID] = SIP_HDR("Call-Id", "i", NULL, callid_len),
};
static const char *sip_follow_continuation(const char *dptr, const char *limit)
@@ -1376,7 +1418,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
unsigned int msglen, origlen;
const char *dptr, *end;
s16 diff, tdiff = 0;
- int ret;
+ int ret = NF_ACCEPT;
typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust;
if (ctinfo != IP_CT_ESTABLISHED &&
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index eb973fcd67ab..0fb65705b44b 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/percpu.h>
#include <linux/netdevice.h>
+#include <linux/security.h>
#include <net/net_namespace.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -108,6 +109,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
rcu_read_unlock();
}
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+ int ret;
+ u32 len;
+ char *secctx;
+
+ ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+ if (ret)
+ return ret;
+
+ ret = seq_printf(s, "secctx=%s ", secctx);
+
+ security_release_secctx(secctx, len);
+ return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+ return 0;
+}
+#endif
+
/* return 0 on success, 1 in case of error */
static int ct_seq_show(struct seq_file *s, void *v)
{
@@ -168,10 +192,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
goto release;
#endif
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
- if (seq_printf(s, "secmark=%u ", ct->secmark))
+ if (ct_show_secctx(s, ct))
goto release;
-#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
if (seq_printf(s, "zone=%u ", nf_ct_zone(ct)))
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 7df37fd786bc..b07393eab88e 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,7 @@
#define NF_LOG_PREFIXLEN 128
#define NFLOGGER_NAME_LEN 64
-static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
static DEFINE_MUTEX(nf_log_mutex);
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 78b3cf9c519c..74aebed5bd28 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -18,7 +18,7 @@
* long term mutex. The handler must provide an an outfn() to accept packets
* for queueing and must reinject all packets it receives, no matter what.
*/
-static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
static DEFINE_MUTEX(queue_handler_mutex);
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index 5490fc37c92d..4d87befb04c0 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -18,41 +18,6 @@
#include <net/udp.h>
#include <net/netfilter/nf_tproxy_core.h>
-struct sock *
-nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
- const __be32 saddr, const __be32 daddr,
- const __be16 sport, const __be16 dport,
- const struct net_device *in, bool listening_only)
-{
- struct sock *sk;
-
- /* look up socket */
- switch (protocol) {
- case IPPROTO_TCP:
- if (listening_only)
- sk = __inet_lookup_listener(net, &tcp_hashinfo,
- daddr, ntohs(dport),
- in->ifindex);
- else
- sk = __inet_lookup(net, &tcp_hashinfo,
- saddr, sport, daddr, dport,
- in->ifindex);
- break;
- case IPPROTO_UDP:
- sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
- in->ifindex);
- break;
- default:
- WARN_ON(1);
- sk = NULL;
- }
-
- pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, listener only: %d, sock %p\n",
- protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), listening_only, sk);
-
- return sk;
-}
-EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
static void
nf_tproxy_destructor(struct sk_buff *skb)
@@ -70,7 +35,11 @@ nf_tproxy_destructor(struct sk_buff *skb)
int
nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
{
- if (inet_sk(sk)->transparent) {
+ bool transparent = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_transparent :
+ inet_sk(sk)->transparent;
+
+ if (transparent) {
skb_orphan(skb);
skb->sk = sk;
skb->destructor = nf_tproxy_destructor;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index fc9a211e629e..6a1572b0ab41 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -66,9 +66,10 @@ struct nfulnl_instance {
u_int16_t group_num; /* number of this queue */
u_int16_t flags;
u_int8_t copy_mode;
+ struct rcu_head rcu;
};
-static DEFINE_RWLOCK(instances_lock);
+static DEFINE_SPINLOCK(instances_lock);
static atomic_t global_seq;
#define INSTANCE_BUCKETS 16
@@ -88,7 +89,7 @@ __instance_lookup(u_int16_t group_num)
struct nfulnl_instance *inst;
head = &instance_table[instance_hashfn(group_num)];
- hlist_for_each_entry(inst, pos, head, hlist) {
+ hlist_for_each_entry_rcu(inst, pos, head, hlist) {
if (inst->group_num == group_num)
return inst;
}
@@ -106,22 +107,26 @@ instance_lookup_get(u_int16_t group_num)
{
struct nfulnl_instance *inst;
- read_lock_bh(&instances_lock);
+ rcu_read_lock_bh();
inst = __instance_lookup(group_num);
- if (inst)
- instance_get(inst);
- read_unlock_bh(&instances_lock);
+ if (inst && !atomic_inc_not_zero(&inst->use))
+ inst = NULL;
+ rcu_read_unlock_bh();
return inst;
}
+static void nfulnl_instance_free_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct nfulnl_instance, rcu));
+ module_put(THIS_MODULE);
+}
+
static void
instance_put(struct nfulnl_instance *inst)
{
- if (inst && atomic_dec_and_test(&inst->use)) {
- kfree(inst);
- module_put(THIS_MODULE);
- }
+ if (inst && atomic_dec_and_test(&inst->use))
+ call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu);
}
static void nfulnl_timer(unsigned long data);
@@ -132,7 +137,7 @@ instance_create(u_int16_t group_num, int pid)
struct nfulnl_instance *inst;
int err;
- write_lock_bh(&instances_lock);
+ spin_lock_bh(&instances_lock);
if (__instance_lookup(group_num)) {
err = -EEXIST;
goto out_unlock;
@@ -166,32 +171,37 @@ instance_create(u_int16_t group_num, int pid)
inst->copy_mode = NFULNL_COPY_PACKET;
inst->copy_range = NFULNL_COPY_RANGE_MAX;
- hlist_add_head(&inst->hlist,
+ hlist_add_head_rcu(&inst->hlist,
&instance_table[instance_hashfn(group_num)]);
- write_unlock_bh(&instances_lock);
+ spin_unlock_bh(&instances_lock);
return inst;
out_unlock:
- write_unlock_bh(&instances_lock);
+ spin_unlock_bh(&instances_lock);
return ERR_PTR(err);
}
static void __nfulnl_flush(struct nfulnl_instance *inst);
+/* called with BH disabled */
static void
__instance_destroy(struct nfulnl_instance *inst)
{
/* first pull it out of the global list */
- hlist_del(&inst->hlist);
+ hlist_del_rcu(&inst->hlist);
/* then flush all pending packets from skb */
- spin_lock_bh(&inst->lock);
+ spin_lock(&inst->lock);
+
+ /* lockless readers wont be able to use us */
+ inst->copy_mode = NFULNL_COPY_DISABLED;
+
if (inst->skb)
__nfulnl_flush(inst);
- spin_unlock_bh(&inst->lock);
+ spin_unlock(&inst->lock);
/* and finally put the refcount */
instance_put(inst);
@@ -200,9 +210,9 @@ __instance_destroy(struct nfulnl_instance *inst)
static inline void
instance_destroy(struct nfulnl_instance *inst)
{
- write_lock_bh(&instances_lock);
+ spin_lock_bh(&instances_lock);
__instance_destroy(inst);
- write_unlock_bh(&instances_lock);
+ spin_unlock_bh(&instances_lock);
}
static int
@@ -403,8 +413,9 @@ __build_packet_message(struct nfulnl_instance *inst,
NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
htonl(indev->ifindex));
/* this is the bridge group "brX" */
+ /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV,
- htonl(indev->br_port->br->dev->ifindex));
+ htonl(br_port_get_rcu(indev)->br->dev->ifindex));
} else {
/* Case 2: indev is bridge group, we need to look for
* physical device (when called from ipv4) */
@@ -430,8 +441,9 @@ __build_packet_message(struct nfulnl_instance *inst,
NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
htonl(outdev->ifindex));
/* this is the bridge group "brX" */
+ /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV,
- htonl(outdev->br_port->br->dev->ifindex));
+ htonl(br_port_get_rcu(outdev)->br->dev->ifindex));
} else {
/* Case 2: indev is a bridge group, we need to look
* for physical device (when called from ipv4) */
@@ -619,6 +631,7 @@ nfulnl_log_packet(u_int8_t pf,
size += nla_total_size(data_len);
break;
+ case NFULNL_COPY_DISABLED:
default:
goto unlock_and_release;
}
@@ -672,7 +685,7 @@ nfulnl_rcv_nl_event(struct notifier_block *this,
int i;
/* destroy all instances for this pid */
- write_lock_bh(&instances_lock);
+ spin_lock_bh(&instances_lock);
for (i = 0; i < INSTANCE_BUCKETS; i++) {
struct hlist_node *tmp, *t2;
struct nfulnl_instance *inst;
@@ -684,7 +697,7 @@ nfulnl_rcv_nl_event(struct notifier_block *this,
__instance_destroy(inst);
}
}
- write_unlock_bh(&instances_lock);
+ spin_unlock_bh(&instances_lock);
}
return NOTIFY_DONE;
}
@@ -861,19 +874,19 @@ static struct hlist_node *get_first(struct iter_state *st)
for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
if (!hlist_empty(&instance_table[st->bucket]))
- return instance_table[st->bucket].first;
+ return rcu_dereference_bh(instance_table[st->bucket].first);
}
return NULL;
}
static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h)
{
- h = h->next;
+ h = rcu_dereference_bh(h->next);
while (!h) {
if (++st->bucket >= INSTANCE_BUCKETS)
return NULL;
- h = instance_table[st->bucket].first;
+ h = rcu_dereference_bh(instance_table[st->bucket].first);
}
return h;
}
@@ -890,9 +903,9 @@ static struct hlist_node *get_idx(struct iter_state *st, loff_t pos)
}
static void *seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(instances_lock)
+ __acquires(rcu_bh)
{
- read_lock_bh(&instances_lock);
+ rcu_read_lock_bh();
return get_idx(seq->private, *pos);
}
@@ -903,9 +916,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
}
static void seq_stop(struct seq_file *s, void *v)
- __releases(instances_lock)
+ __releases(rcu_bh)
{
- read_unlock_bh(&instances_lock);
+ rcu_read_unlock_bh();
}
static int seq_show(struct seq_file *s, void *v)
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 12e1ab37fcd8..68e67d19724d 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -46,17 +46,19 @@ struct nfqnl_instance {
int peer_pid;
unsigned int queue_maxlen;
unsigned int copy_range;
- unsigned int queue_total;
unsigned int queue_dropped;
unsigned int queue_user_dropped;
- unsigned int id_sequence; /* 'sequence' of pkt ids */
u_int16_t queue_num; /* number of this queue */
u_int8_t copy_mode;
-
- spinlock_t lock;
-
+/*
+ * Following fields are dirtied for each queued packet,
+ * keep them in same cache line if possible.
+ */
+ spinlock_t lock;
+ unsigned int queue_total;
+ atomic_t id_sequence; /* 'sequence' of pkt ids */
struct list_head queue_list; /* packets in queue */
};
@@ -238,32 +240,24 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
outdev = entry->outdev;
- spin_lock_bh(&queue->lock);
-
- switch ((enum nfqnl_config_mode)queue->copy_mode) {
+ switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) {
case NFQNL_COPY_META:
case NFQNL_COPY_NONE:
break;
case NFQNL_COPY_PACKET:
if (entskb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(entskb)) {
- spin_unlock_bh(&queue->lock);
+ skb_checksum_help(entskb))
return NULL;
- }
- if (queue->copy_range == 0
- || queue->copy_range > entskb->len)
+
+ data_len = ACCESS_ONCE(queue->copy_range);
+ if (data_len == 0 || data_len > entskb->len)
data_len = entskb->len;
- else
- data_len = queue->copy_range;
size += nla_total_size(data_len);
break;
}
- entry->id = queue->id_sequence++;
-
- spin_unlock_bh(&queue->lock);
skb = alloc_skb(size, GFP_ATOMIC);
if (!skb)
@@ -278,6 +272,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = htons(queue->queue_num);
+ entry->id = atomic_inc_return(&queue->id_sequence);
pmsg.packet_id = htonl(entry->id);
pmsg.hw_protocol = entskb->protocol;
pmsg.hook = entry->hook;
@@ -296,8 +291,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV,
htonl(indev->ifindex));
/* this is the bridge group "brX" */
+ /* rcu_read_lock()ed by __nf_queue */
NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV,
- htonl(indev->br_port->br->dev->ifindex));
+ htonl(br_port_get_rcu(indev)->br->dev->ifindex));
} else {
/* Case 2: indev is bridge group, we need to look for
* physical device (when called from ipv4) */
@@ -321,8 +317,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV,
htonl(outdev->ifindex));
/* this is the bridge group "brX" */
+ /* rcu_read_lock()ed by __nf_queue */
NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV,
- htonl(outdev->br_port->br->dev->ifindex));
+ htonl(br_port_get_rcu(outdev)->br->dev->ifindex));
} else {
/* Case 2: outdev is bridge group, we need to look for
* physical output device (when called from ipv4) */
@@ -866,7 +863,7 @@ static int seq_show(struct seq_file *s, void *v)
inst->peer_pid, inst->queue_total,
inst->copy_mode, inst->copy_range,
inst->queue_dropped, inst->queue_user_dropped,
- inst->id_sequence, 1);
+ atomic_read(&inst->id_sequence), 1);
}
static const struct seq_operations nfqnl_seq_ops = {
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index e34622fa0003..80463507420e 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -116,10 +116,8 @@ EXPORT_SYMBOL(xt_register_targets);
void
xt_unregister_targets(struct xt_target *target, unsigned int n)
{
- unsigned int i;
-
- for (i = 0; i < n; i++)
- xt_unregister_target(&target[i]);
+ while (n-- > 0)
+ xt_unregister_target(&target[n]);
}
EXPORT_SYMBOL(xt_unregister_targets);
@@ -174,10 +172,8 @@ EXPORT_SYMBOL(xt_register_matches);
void
xt_unregister_matches(struct xt_match *match, unsigned int n)
{
- unsigned int i;
-
- for (i = 0; i < n; i++)
- xt_unregister_match(&match[i]);
+ while (n-- > 0)
+ xt_unregister_match(&match[n]);
}
EXPORT_SYMBOL(xt_unregister_matches);
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
new file mode 100644
index 000000000000..0f642ef8cd26
--- /dev/null
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -0,0 +1,70 @@
+/* iptables module for the packet checksum mangling
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * (C) 2010 Red Hat, Inc.
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>");
+MODULE_DESCRIPTION("Xtables: checksum modification");
+MODULE_ALIAS("ipt_CHECKSUM");
+MODULE_ALIAS("ip6t_CHECKSUM");
+
+static unsigned int
+checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ skb_checksum_help(skb);
+
+ return XT_CONTINUE;
+}
+
+static int checksum_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_CHECKSUM_info *einfo = par->targinfo;
+
+ if (einfo->operation & ~XT_CHECKSUM_OP_FILL) {
+ pr_info("unsupported CHECKSUM operation %x\n", einfo->operation);
+ return -EINVAL;
+ }
+ if (!einfo->operation) {
+ pr_info("no CHECKSUM operation enabled\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_target checksum_tg_reg __read_mostly = {
+ .name = "CHECKSUM",
+ .family = NFPROTO_UNSPEC,
+ .target = checksum_tg,
+ .targetsize = sizeof(struct xt_CHECKSUM_info),
+ .table = "mangle",
+ .checkentry = checksum_tg_check,
+ .me = THIS_MODULE,
+};
+
+static int __init checksum_tg_init(void)
+{
+ return xt_register_target(&checksum_tg_reg);
+}
+
+static void __exit checksum_tg_exit(void)
+{
+ xt_unregister_target(&checksum_tg_reg);
+}
+
+module_init(checksum_tg_init);
+module_exit(checksum_tg_exit);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 562bf3266e04..782e51986a6f 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -9,7 +9,6 @@
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/skbuff.h>
-#include <linux/selinux.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter/x_tables.h>
@@ -67,7 +66,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
if (info->flags & XT_CT_NOTRACK) {
- ct = &nf_conntrack_untracked;
+ ct = nf_ct_untracked_get();
atomic_inc(&ct->ct_general.use);
goto out;
}
@@ -132,7 +131,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par)
struct nf_conn *ct = info->ct;
struct nf_conn_help *help;
- if (ct != &nf_conntrack_untracked) {
+ if (!nf_ct_is_untracked(ct)) {
help = nfct_help(ct);
if (help)
module_put(help->helper->me);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
new file mode 100644
index 000000000000..be1f22e13545
--- /dev/null
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -0,0 +1,315 @@
+/*
+ * linux/net/netfilter/xt_IDLETIMER.c
+ *
+ * Netfilter module to trigger a timer when packet matches.
+ * After timer expires a kevent will be sent.
+ *
+ * Copyright (C) 2004, 2010 Nokia Corporation
+ * Written by Timo Teras <ext-timo.teras@nokia.com>
+ *
+ * Converted to x_tables and reworked for upstream inclusion
+ * by Luciano Coelho <luciano.coelho@nokia.com>
+ *
+ * Contact: Luciano Coelho <luciano.coelho@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_IDLETIMER.h>
+#include <linux/kdev_t.h>
+#include <linux/kobject.h>
+#include <linux/workqueue.h>
+#include <linux/sysfs.h>
+
+struct idletimer_tg_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj,
+ struct attribute *attr, char *buf);
+};
+
+struct idletimer_tg {
+ struct list_head entry;
+ struct timer_list timer;
+ struct work_struct work;
+
+ struct kobject *kobj;
+ struct idletimer_tg_attr attr;
+
+ unsigned int refcnt;
+};
+
+static LIST_HEAD(idletimer_tg_list);
+static DEFINE_MUTEX(list_mutex);
+
+static struct kobject *idletimer_tg_kobj;
+
+static
+struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
+{
+ struct idletimer_tg *entry;
+
+ BUG_ON(!label);
+
+ list_for_each_entry(entry, &idletimer_tg_list, entry) {
+ if (!strcmp(label, entry->attr.attr.name))
+ return entry;
+ }
+
+ return NULL;
+}
+
+static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct idletimer_tg *timer;
+ unsigned long expires = 0;
+
+ mutex_lock(&list_mutex);
+
+ timer = __idletimer_tg_find_by_label(attr->name);
+ if (timer)
+ expires = timer->timer.expires;
+
+ mutex_unlock(&list_mutex);
+
+ if (time_after(expires, jiffies))
+ return sprintf(buf, "%u\n",
+ jiffies_to_msecs(expires - jiffies) / 1000);
+
+ return sprintf(buf, "0\n");
+}
+
+static void idletimer_tg_work(struct work_struct *work)
+{
+ struct idletimer_tg *timer = container_of(work, struct idletimer_tg,
+ work);
+
+ sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name);
+}
+
+static void idletimer_tg_expired(unsigned long data)
+{
+ struct idletimer_tg *timer = (struct idletimer_tg *) data;
+
+ pr_debug("timer %s expired\n", timer->attr.attr.name);
+
+ schedule_work(&timer->work);
+}
+
+static int idletimer_tg_create(struct idletimer_tg_info *info)
+{
+ int ret;
+
+ info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+ if (!info->timer) {
+ pr_debug("couldn't alloc timer\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
+ if (!info->timer->attr.attr.name) {
+ pr_debug("couldn't alloc attribute name\n");
+ ret = -ENOMEM;
+ goto out_free_timer;
+ }
+ info->timer->attr.attr.mode = S_IRUGO;
+ info->timer->attr.show = idletimer_tg_show;
+
+ ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ if (ret < 0) {
+ pr_debug("couldn't add file to sysfs");
+ goto out_free_attr;
+ }
+
+ list_add(&info->timer->entry, &idletimer_tg_list);
+
+ setup_timer(&info->timer->timer, idletimer_tg_expired,
+ (unsigned long) info->timer);
+ info->timer->refcnt = 1;
+
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+ INIT_WORK(&info->timer->work, idletimer_tg_work);
+
+ return 0;
+
+out_free_attr:
+ kfree(info->timer->attr.attr.name);
+out_free_timer:
+ kfree(info->timer);
+out:
+ return ret;
+}
+
+/*
+ * The actual xt_tables plugin.
+ */
+static unsigned int idletimer_tg_target(struct sk_buff *skb,
+ const struct xt_action_param *par)
+{
+ const struct idletimer_tg_info *info = par->targinfo;
+
+ pr_debug("resetting timer %s, timeout period %u\n",
+ info->label, info->timeout);
+
+ BUG_ON(!info->timer);
+
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+ return XT_CONTINUE;
+}
+
+static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+{
+ struct idletimer_tg_info *info = par->targinfo;
+ int ret;
+
+ pr_debug("checkentry targinfo%s\n", info->label);
+
+ if (info->timeout == 0) {
+ pr_debug("timeout value is zero\n");
+ return -EINVAL;
+ }
+
+ if (info->label[0] == '\0' ||
+ strnlen(info->label,
+ MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) {
+ pr_debug("label is empty or not nul-terminated\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&list_mutex);
+
+ info->timer = __idletimer_tg_find_by_label(info->label);
+ if (info->timer) {
+ info->timer->refcnt++;
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+ pr_debug("increased refcnt of timer %s to %u\n",
+ info->label, info->timer->refcnt);
+ } else {
+ ret = idletimer_tg_create(info);
+ if (ret < 0) {
+ pr_debug("failed to create timer\n");
+ mutex_unlock(&list_mutex);
+ return ret;
+ }
+ }
+
+ mutex_unlock(&list_mutex);
+ return 0;
+}
+
+static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct idletimer_tg_info *info = par->targinfo;
+
+ pr_debug("destroy targinfo %s\n", info->label);
+
+ mutex_lock(&list_mutex);
+
+ if (--info->timer->refcnt == 0) {
+ pr_debug("deleting timer %s\n", info->label);
+
+ list_del(&info->timer->entry);
+ del_timer_sync(&info->timer->timer);
+ sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ kfree(info->timer->attr.attr.name);
+ kfree(info->timer);
+ } else {
+ pr_debug("decreased refcnt of timer %s to %u\n",
+ info->label, info->timer->refcnt);
+ }
+
+ mutex_unlock(&list_mutex);
+}
+
+static struct xt_target idletimer_tg __read_mostly = {
+ .name = "IDLETIMER",
+ .family = NFPROTO_UNSPEC,
+ .target = idletimer_tg_target,
+ .targetsize = sizeof(struct idletimer_tg_info),
+ .checkentry = idletimer_tg_checkentry,
+ .destroy = idletimer_tg_destroy,
+ .me = THIS_MODULE,
+};
+
+static struct class *idletimer_tg_class;
+
+static struct device *idletimer_tg_device;
+
+static int __init idletimer_tg_init(void)
+{
+ int err;
+
+ idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer");
+ err = PTR_ERR(idletimer_tg_class);
+ if (IS_ERR(idletimer_tg_class)) {
+ pr_debug("couldn't register device class\n");
+ goto out;
+ }
+
+ idletimer_tg_device = device_create(idletimer_tg_class, NULL,
+ MKDEV(0, 0), NULL, "timers");
+ err = PTR_ERR(idletimer_tg_device);
+ if (IS_ERR(idletimer_tg_device)) {
+ pr_debug("couldn't register system device\n");
+ goto out_class;
+ }
+
+ idletimer_tg_kobj = &idletimer_tg_device->kobj;
+
+ err = xt_register_target(&idletimer_tg);
+ if (err < 0) {
+ pr_debug("couldn't register xt target\n");
+ goto out_dev;
+ }
+
+ return 0;
+out_dev:
+ device_destroy(idletimer_tg_class, MKDEV(0, 0));
+out_class:
+ class_destroy(idletimer_tg_class);
+out:
+ return err;
+}
+
+static void __exit idletimer_tg_exit(void)
+{
+ xt_unregister_target(&idletimer_tg);
+
+ device_destroy(idletimer_tg_class, MKDEV(0, 0));
+ class_destroy(idletimer_tg_class);
+}
+
+module_init(idletimer_tg_init);
+module_exit(idletimer_tg_exit);
+
+MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>");
+MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>");
+MODULE_DESCRIPTION("Xtables: idle time monitor");
+MODULE_LICENSE("GPL v2");
diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c
index 512b9123252f..9d782181b6c8 100644
--- a/net/netfilter/xt_NOTRACK.c
+++ b/net/netfilter/xt_NOTRACK.c
@@ -23,7 +23,7 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
If there is a real ct entry correspondig to this packet,
it'll hang aroun till timing out. We don't deal with it
for performance reasons. JK */
- skb->nfct = &nf_conntrack_untracked.ct_general;
+ skb->nfct = &nf_ct_untracked_get()->ct_general;
skb->nfctinfo = IP_CT_NEW;
nf_conntrack_get(skb->nfct);
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 69c01e10f8af..de079abd5bc8 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -60,13 +60,22 @@ struct xt_rateest *xt_rateest_lookup(const char *name)
}
EXPORT_SYMBOL_GPL(xt_rateest_lookup);
+static void xt_rateest_free_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct xt_rateest, rcu));
+}
+
void xt_rateest_put(struct xt_rateest *est)
{
mutex_lock(&xt_rateest_mutex);
if (--est->refcnt == 0) {
hlist_del(&est->list);
gen_kill_estimator(&est->bstats, &est->rstats);
- kfree(est);
+ /*
+ * gen_estimator est_timer() might access est->lock or bstats,
+ * wait a RCU grace period before freeing 'est'
+ */
+ call_rcu(&est->rcu, xt_rateest_free_rcu);
}
mutex_unlock(&xt_rateest_mutex);
}
@@ -179,6 +188,7 @@ static int __init xt_rateest_tg_init(void)
static void __exit xt_rateest_tg_fini(void)
{
xt_unregister_target(&xt_rateest_tg_reg);
+ rcu_barrier(); /* Wait for completion of call_rcu()'s (xt_rateest_free_rcu) */
}
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 23b2d6c486b5..9faf5e050b79 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -14,8 +14,8 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
+#include <linux/security.h>
#include <linux/skbuff.h>
-#include <linux/selinux.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_SECMARK.h>
@@ -39,9 +39,8 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
switch (mode) {
case SECMARK_MODE_SEL:
- secmark = info->u.sel.selsid;
+ secmark = info->secid;
break;
-
default:
BUG();
}
@@ -50,33 +49,33 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static int checkentry_selinux(struct xt_secmark_target_info *info)
+static int checkentry_lsm(struct xt_secmark_target_info *info)
{
int err;
- struct xt_secmark_target_selinux_info *sel = &info->u.sel;
- sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0';
+ info->secctx[SECMARK_SECCTX_MAX - 1] = '\0';
+ info->secid = 0;
- err = selinux_string_to_sid(sel->selctx, &sel->selsid);
+ err = security_secctx_to_secid(info->secctx, strlen(info->secctx),
+ &info->secid);
if (err) {
if (err == -EINVAL)
- pr_info("invalid SELinux context \'%s\'\n",
- sel->selctx);
+ pr_info("invalid security context \'%s\'\n", info->secctx);
return err;
}
- if (!sel->selsid) {
- pr_info("unable to map SELinux context \'%s\'\n", sel->selctx);
+ if (!info->secid) {
+ pr_info("unable to map security context \'%s\'\n", info->secctx);
return -ENOENT;
}
- err = selinux_secmark_relabel_packet_permission(sel->selsid);
+ err = security_secmark_relabel_packet(info->secid);
if (err) {
pr_info("unable to obtain relabeling permission\n");
return err;
}
- selinux_secmark_refcount_inc();
+ security_secmark_refcount_inc();
return 0;
}
@@ -100,16 +99,16 @@ static int secmark_tg_check(const struct xt_tgchk_param *par)
switch (info->mode) {
case SECMARK_MODE_SEL:
- err = checkentry_selinux(info);
- if (err <= 0)
- return err;
break;
-
default:
pr_info("invalid mode: %hu\n", info->mode);
return -EINVAL;
}
+ err = checkentry_lsm(info);
+ if (err)
+ return err;
+
if (!mode)
mode = info->mode;
return 0;
@@ -119,7 +118,7 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
{
switch (mode) {
case SECMARK_MODE_SEL:
- selinux_secmark_refcount_dec();
+ security_secmark_refcount_dec();
}
}
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 62ec021fbd50..eb81c380da1b 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -165,8 +165,8 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb,
rcu_read_unlock();
if (rt != NULL) {
- mtu = dst_mtu(&rt->u.dst);
- dst_release(&rt->u.dst);
+ mtu = dst_mtu(&rt->dst);
+ dst_release(&rt->dst);
}
return mtu;
}
@@ -220,15 +220,13 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
}
#endif
-#define TH_SYN 0x02
-
/* Must specify -p tcp --syn */
static inline bool find_syn_match(const struct xt_entry_match *m)
{
const struct xt_tcp *tcpinfo = (const struct xt_tcp *)m->data;
if (strcmp(m->u.kernel.match->name, "tcp") == 0 &&
- tcpinfo->flg_cmp & TH_SYN &&
+ tcpinfo->flg_cmp & TCPHDR_SYN &&
!(tcpinfo->invflags & XT_TCP_INV_FLAGS))
return true;
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 859d9fd429c8..22a2d421e7eb 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -77,8 +77,8 @@ tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
return false;
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
- skb->dev = rt->u.dst.dev;
+ skb_dst_set(skb, &rt->dst);
+ skb->dev = rt->dst.dev;
skb->protocol = htons(ETH_P_IP);
return true;
}
@@ -104,7 +104,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
#ifdef WITH_CONNTRACK
/* Avoid counting cloned packets towards the original connection. */
nf_conntrack_put(skb->nfct);
- skb->nfct = &nf_conntrack_untracked.ct_general;
+ skb->nfct = &nf_ct_untracked_get()->ct_general;
skb->nfctinfo = IP_CT_NEW;
nf_conntrack_get(skb->nfct);
#endif
@@ -177,7 +177,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
#ifdef WITH_CONNTRACK
nf_conntrack_put(skb->nfct);
- skb->nfct = &nf_conntrack_untracked.ct_general;
+ skb->nfct = &nf_ct_untracked_get()->ct_general;
skb->nfctinfo = IP_CT_NEW;
nf_conntrack_get(skb->nfct);
#endif
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index e1a0dedac258..640678f47a2a 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -1,7 +1,7 @@
/*
* Transparent proxy support for Linux/iptables
*
- * Copyright (c) 2006-2007 BalaBit IT Ltd.
+ * Copyright (c) 2006-2010 BalaBit IT Ltd.
* Author: Balazs Scheidler, Krisztian Kovacs
*
* This program is free software; you can redistribute it and/or modify
@@ -16,19 +16,98 @@
#include <net/checksum.h>
#include <net/udp.h>
#include <net/inet_sock.h>
-
+#include <linux/inetdevice.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter/xt_TPROXY.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#define XT_TPROXY_HAVE_IPV6 1
+#include <net/if_inet6.h>
+#include <net/addrconf.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
#include <net/netfilter/nf_tproxy_core.h>
+#include <linux/netfilter/xt_TPROXY.h>
+
+static inline __be32
+tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+ struct in_device *indev;
+ __be32 laddr;
+
+ if (user_laddr)
+ return user_laddr;
+
+ laddr = 0;
+ rcu_read_lock();
+ indev = __in_dev_get_rcu(skb->dev);
+ for_primary_ifa(indev) {
+ laddr = ifa->ifa_local;
+ break;
+ } endfor_ifa(indev);
+ rcu_read_unlock();
+
+ return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
+ * @skb: The skb being processed.
+ * @laddr: IPv4 address to redirect to or zero.
+ * @lport: TCP port to redirect to or zero.
+ * @sk: The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait4() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+ struct sock *sk)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ inet_twsk_put(inet_twsk(sk));
+ return NULL;
+ }
+
+ if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+ /* SYN to a TIME_WAIT socket, we'd rather redirect it
+ * to a listener socket if there's one */
+ struct sock *sk2;
+
+ sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ iph->saddr, laddr ? laddr : iph->daddr,
+ hp->source, lport ? lport : hp->dest,
+ skb->dev, NFT_LOOKUP_LISTENER);
+ if (sk2) {
+ inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+ inet_twsk_put(inet_twsk(sk));
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
static unsigned int
-tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
+tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+ u_int32_t mark_mask, u_int32_t mark_value)
{
const struct iphdr *iph = ip_hdr(skb);
- const struct xt_tproxy_target_info *tgi = par->targinfo;
struct udphdr _hdr, *hp;
struct sock *sk;
@@ -36,10 +115,195 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (hp == NULL)
return NF_DROP;
+ /* check if there's an ongoing connection on the packet
+ * addresses, this happens if the redirect already happened
+ * and the current packet belongs to an already established
+ * connection */
sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
- iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
- hp->source, tgi->lport ? tgi->lport : hp->dest,
- par->in, true);
+ iph->saddr, iph->daddr,
+ hp->source, hp->dest,
+ skb->dev, NFT_LOOKUP_ESTABLISHED);
+
+ laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+ if (!lport)
+ lport = hp->dest;
+
+ /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+ if (sk && sk->sk_state == TCP_TIME_WAIT)
+ /* reopening a TIME_WAIT connection needs special handling */
+ sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
+ else if (!sk)
+ /* no, there's no established connection, check if
+ * there's a listener on the redirected addr/port */
+ sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ iph->saddr, laddr,
+ hp->source, lport,
+ skb->dev, NFT_LOOKUP_LISTENER);
+
+ /* NOTE: assign_sock consumes our sk reference */
+ if (sk && nf_tproxy_assign_sock(skb, sk)) {
+ /* This should be in a separate target, but we don't do multiple
+ targets on the same rule yet */
+ skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
+
+ pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+ iph->protocol, &iph->daddr, ntohs(hp->dest),
+ &laddr, ntohs(lport), skb->mark);
+ return NF_ACCEPT;
+ }
+
+ pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+ iph->protocol, &iph->saddr, ntohs(hp->source),
+ &iph->daddr, ntohs(hp->dest), skb->mark);
+ return NF_DROP;
+}
+
+static unsigned int
+tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tproxy_target_info *tgi = par->targinfo;
+
+ return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+static unsigned int
+tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+ return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+#ifdef XT_TPROXY_HAVE_IPV6
+
+static inline const struct in6_addr *
+tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+ const struct in6_addr *daddr)
+{
+ struct inet6_dev *indev;
+ struct inet6_ifaddr *ifa;
+ struct in6_addr *laddr;
+
+ if (!ipv6_addr_any(user_laddr))
+ return user_laddr;
+ laddr = NULL;
+
+ rcu_read_lock();
+ indev = __in6_dev_get(skb->dev);
+ if (indev)
+ list_for_each_entry(ifa, &indev->addr_list, if_list) {
+ if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
+ continue;
+
+ laddr = &ifa->addr;
+ break;
+ }
+ rcu_read_unlock();
+
+ return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb: The skb being processed.
+ * @tproto: Transport protocol.
+ * @thoff: Transport protocol header offset.
+ * @par: Iptables target parameters.
+ * @sk: The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+ const struct xt_action_param *par,
+ struct sock *sk)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct tcphdr _hdr, *hp;
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ inet_twsk_put(inet_twsk(sk));
+ return NULL;
+ }
+
+ if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+ /* SYN to a TIME_WAIT socket, we'd rather redirect it
+ * to a listener socket if there's one */
+ struct sock *sk2;
+
+ sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr,
+ tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
+ hp->source,
+ tgi->lport ? tgi->lport : hp->dest,
+ skb->dev, NFT_LOOKUP_LISTENER);
+ if (sk2) {
+ inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+ inet_twsk_put(inet_twsk(sk));
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+
+static unsigned int
+tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+ struct udphdr _hdr, *hp;
+ struct sock *sk;
+ const struct in6_addr *laddr;
+ __be16 lport;
+ int thoff;
+ int tproto;
+
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+ if (tproto < 0) {
+ pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ return NF_DROP;
+ }
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+ return NF_DROP;
+ }
+
+ /* check if there's an ongoing connection on the packet
+ * addresses, this happens if the redirect already happened
+ * and the current packet belongs to an already established
+ * connection */
+ sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr, &iph->daddr,
+ hp->source, hp->dest,
+ par->in, NFT_LOOKUP_ESTABLISHED);
+
+ laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+ lport = tgi->lport ? tgi->lport : hp->dest;
+
+ /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+ if (sk && sk->sk_state == TCP_TIME_WAIT)
+ /* reopening a TIME_WAIT connection needs special handling */
+ sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+ else if (!sk)
+ /* no there's no established connection, check if
+ * there's a listener on the redirected addr/port */
+ sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr, laddr,
+ hp->source, lport,
+ par->in, NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
if (sk && nf_tproxy_assign_sock(skb, sk)) {
@@ -47,19 +311,34 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
targets on the same rule yet */
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
- pr_debug("redirecting: proto %u %08x:%u -> %08x:%u, mark: %x\n",
- iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
- ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+ pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+ tproto, &iph->saddr, ntohs(hp->source),
+ laddr, ntohs(lport), skb->mark);
return NF_ACCEPT;
}
- pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n",
- iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
- ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+ pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+ tproto, &iph->saddr, ntohs(hp->source),
+ &iph->daddr, ntohs(hp->dest), skb->mark);
+
return NF_DROP;
}
-static int tproxy_tg_check(const struct xt_tgchk_param *par)
+static int tproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+ const struct ip6t_ip6 *i = par->entryinfo;
+
+ if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
+ && !(i->flags & IP6T_INV_PROTO))
+ return 0;
+
+ pr_info("Can be used only in combination with "
+ "either -p tcp or -p udp\n");
+ return -EINVAL;
+}
+#endif
+
+static int tproxy_tg4_check(const struct xt_tgchk_param *par)
{
const struct ipt_ip *i = par->entryinfo;
@@ -72,31 +351,64 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
-static struct xt_target tproxy_tg_reg __read_mostly = {
- .name = "TPROXY",
- .family = AF_INET,
- .table = "mangle",
- .target = tproxy_tg,
- .targetsize = sizeof(struct xt_tproxy_target_info),
- .checkentry = tproxy_tg_check,
- .hooks = 1 << NF_INET_PRE_ROUTING,
- .me = THIS_MODULE,
+static struct xt_target tproxy_tg_reg[] __read_mostly = {
+ {
+ .name = "TPROXY",
+ .family = NFPROTO_IPV4,
+ .table = "mangle",
+ .target = tproxy_tg4_v0,
+ .revision = 0,
+ .targetsize = sizeof(struct xt_tproxy_target_info),
+ .checkentry = tproxy_tg4_check,
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "TPROXY",
+ .family = NFPROTO_IPV4,
+ .table = "mangle",
+ .target = tproxy_tg4_v1,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_tproxy_target_info_v1),
+ .checkentry = tproxy_tg4_check,
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .me = THIS_MODULE,
+ },
+#ifdef XT_TPROXY_HAVE_IPV6
+ {
+ .name = "TPROXY",
+ .family = NFPROTO_IPV6,
+ .table = "mangle",
+ .target = tproxy_tg6_v1,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_tproxy_target_info_v1),
+ .checkentry = tproxy_tg6_check,
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .me = THIS_MODULE,
+ },
+#endif
+
};
static int __init tproxy_tg_init(void)
{
nf_defrag_ipv4_enable();
- return xt_register_target(&tproxy_tg_reg);
+#ifdef XT_TPROXY_HAVE_IPV6
+ nf_defrag_ipv6_enable();
+#endif
+
+ return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
}
static void __exit tproxy_tg_exit(void)
{
- xt_unregister_target(&tproxy_tg_reg);
+ xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
}
module_init(tproxy_tg_init);
module_exit(tproxy_tg_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Krisztian Kovacs");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module.");
MODULE_ALIAS("ipt_TPROXY");
+MODULE_ALIAS("ip6t_TPROXY");
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index 30b95a1c1c89..f4af1bfafb1c 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -120,7 +120,7 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (ct == NULL)
return false;
- if (ct == &nf_conntrack_untracked)
+ if (nf_ct_is_untracked(ct))
return false;
if (ct->master)
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 73517835303d..5b138506690e 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -112,6 +112,16 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
+
+ /*
+ * This filter cannot function correctly unless connection tracking
+ * accounting is enabled, so complain in the hope that someone notices.
+ */
+ if (!nf_ct_acct_enabled(par->net)) {
+ pr_warning("Forcing CT accounting to be enabled\n");
+ nf_ct_set_acct(par->net, true);
+ }
+
return ret;
}
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 39681f10291c..e536710ad916 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -123,11 +123,12 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
ct = nf_ct_get(skb, &ctinfo);
- if (ct == &nf_conntrack_untracked)
- statebit = XT_CONNTRACK_STATE_UNTRACKED;
- else if (ct != NULL)
- statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
- else
+ if (ct) {
+ if (nf_ct_is_untracked(ct))
+ statebit = XT_CONNTRACK_STATE_UNTRACKED;
+ else
+ statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
+ } else
statebit = XT_CONNTRACK_STATE_INVALID;
if (info->match_flags & XT_CONNTRACK_STATE) {
diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c
new file mode 100644
index 000000000000..b39db8a5cbae
--- /dev/null
+++ b/net/netfilter/xt_cpu.c
@@ -0,0 +1,63 @@
+/* Kernel module to match running CPU */
+
+/*
+ * Might be used to distribute connections on several daemons, if
+ * RPS (Remote Packet Steering) is enabled or NIC is multiqueue capable,
+ * each RX queue IRQ affined to one CPU (1:1 mapping)
+ *
+ */
+
+/* (C) 2010 Eric Dumazet
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/xt_cpu.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>");
+MODULE_DESCRIPTION("Xtables: CPU match");
+
+static int cpu_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_cpu_info *info = par->matchinfo;
+
+ if (info->invert & ~1)
+ return -EINVAL;
+ return 0;
+}
+
+static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_cpu_info *info = par->matchinfo;
+
+ return (info->cpu == smp_processor_id()) ^ info->invert;
+}
+
+static struct xt_match cpu_mt_reg __read_mostly = {
+ .name = "cpu",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = cpu_mt_check,
+ .match = cpu_mt,
+ .matchsize = sizeof(struct xt_cpu_info),
+ .me = THIS_MODULE,
+};
+
+static int __init cpu_mt_init(void)
+{
+ return xt_register_match(&cpu_mt_reg);
+}
+
+static void __exit cpu_mt_exit(void)
+{
+ xt_unregister_match(&cpu_mt_reg);
+}
+
+module_init(cpu_mt_init);
+module_exit(cpu_mt_exit);
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index b46a8390896d..9228ee0dc11a 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -448,6 +448,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
{
__be16 _ports[2], *ports;
u8 nexthdr;
+ int poff;
memset(dst, 0, sizeof(*dst));
@@ -492,19 +493,13 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
return 0;
}
- switch (nexthdr) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- case IPPROTO_SCTP:
- case IPPROTO_DCCP:
- ports = skb_header_pointer(skb, protoff, sizeof(_ports),
+ poff = proto_ports_offset(nexthdr);
+ if (poff >= 0) {
+ ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports),
&_ports);
- break;
- default:
+ } else {
_ports[0] = _ports[1] = 0;
ports = _ports;
- break;
}
if (!ports)
return -1;
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
new file mode 100644
index 000000000000..9127a3d8aa35
--- /dev/null
+++ b/net/netfilter/xt_ipvs.c
@@ -0,0 +1,188 @@
+/*
+ * xt_ipvs - kernel module to match IPVS connection properties
+ *
+ * Author: Hannes Eder <heder@google.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#endif
+#include <linux/ip_vs.h>
+#include <linux/types.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_ipvs.h>
+#include <net/netfilter/nf_conntrack.h>
+
+#include <net/ip_vs.h>
+
+MODULE_AUTHOR("Hannes Eder <heder@google.com>");
+MODULE_DESCRIPTION("Xtables: match IPVS connection properties");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_ipvs");
+MODULE_ALIAS("ip6t_ipvs");
+
+/* borrowed from xt_conntrack */
+static bool ipvs_mt_addrcmp(const union nf_inet_addr *kaddr,
+ const union nf_inet_addr *uaddr,
+ const union nf_inet_addr *umask,
+ unsigned int l3proto)
+{
+ if (l3proto == NFPROTO_IPV4)
+ return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0;
+#ifdef CONFIG_IP_VS_IPV6
+ else if (l3proto == NFPROTO_IPV6)
+ return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6,
+ &uaddr->in6) == 0;
+#endif
+ else
+ return false;
+}
+
+static bool
+ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_ipvs_mtinfo *data = par->matchinfo;
+ /* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */
+ const u_int8_t family = par->family;
+ struct ip_vs_iphdr iph;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn *cp;
+ bool match = true;
+
+ if (data->bitmask == XT_IPVS_IPVS_PROPERTY) {
+ match = skb->ipvs_property ^
+ !!(data->invert & XT_IPVS_IPVS_PROPERTY);
+ goto out;
+ }
+
+ /* other flags than XT_IPVS_IPVS_PROPERTY are set */
+ if (!skb->ipvs_property) {
+ match = false;
+ goto out;
+ }
+
+ ip_vs_fill_iphdr(family, skb_network_header(skb), &iph);
+
+ if (data->bitmask & XT_IPVS_PROTO)
+ if ((iph.protocol == data->l4proto) ^
+ !(data->invert & XT_IPVS_PROTO)) {
+ match = false;
+ goto out;
+ }
+
+ pp = ip_vs_proto_get(iph.protocol);
+ if (unlikely(!pp)) {
+ match = false;
+ goto out;
+ }
+
+ /*
+ * Check if the packet belongs to an existing entry
+ */
+ cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */);
+ if (unlikely(cp == NULL)) {
+ match = false;
+ goto out;
+ }
+
+ /*
+ * We found a connection, i.e. ct != 0, make sure to call
+ * __ip_vs_conn_put before returning. In our case jump to out_put_con.
+ */
+
+ if (data->bitmask & XT_IPVS_VPORT)
+ if ((cp->vport == data->vport) ^
+ !(data->invert & XT_IPVS_VPORT)) {
+ match = false;
+ goto out_put_cp;
+ }
+
+ if (data->bitmask & XT_IPVS_VPORTCTL)
+ if ((cp->control != NULL &&
+ cp->control->vport == data->vportctl) ^
+ !(data->invert & XT_IPVS_VPORTCTL)) {
+ match = false;
+ goto out_put_cp;
+ }
+
+ if (data->bitmask & XT_IPVS_DIR) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct == NULL || nf_ct_is_untracked(ct)) {
+ match = false;
+ goto out_put_cp;
+ }
+
+ if ((ctinfo >= IP_CT_IS_REPLY) ^
+ !!(data->invert & XT_IPVS_DIR)) {
+ match = false;
+ goto out_put_cp;
+ }
+ }
+
+ if (data->bitmask & XT_IPVS_METHOD)
+ if (((cp->flags & IP_VS_CONN_F_FWD_MASK) == data->fwd_method) ^
+ !(data->invert & XT_IPVS_METHOD)) {
+ match = false;
+ goto out_put_cp;
+ }
+
+ if (data->bitmask & XT_IPVS_VADDR) {
+ if (ipvs_mt_addrcmp(&cp->vaddr, &data->vaddr,
+ &data->vmask, family) ^
+ !(data->invert & XT_IPVS_VADDR)) {
+ match = false;
+ goto out_put_cp;
+ }
+ }
+
+out_put_cp:
+ __ip_vs_conn_put(cp);
+out:
+ pr_debug("match=%d\n", match);
+ return match;
+}
+
+static int ipvs_mt_check(const struct xt_mtchk_param *par)
+{
+ if (par->family != NFPROTO_IPV4
+#ifdef CONFIG_IP_VS_IPV6
+ && par->family != NFPROTO_IPV6
+#endif
+ ) {
+ pr_info("protocol family %u not supported\n", par->family);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match xt_ipvs_mt_reg __read_mostly = {
+ .name = "ipvs",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .match = ipvs_mt,
+ .checkentry = ipvs_mt_check,
+ .matchsize = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+ .me = THIS_MODULE,
+};
+
+static int __init ipvs_mt_init(void)
+{
+ return xt_register_match(&xt_ipvs_mt_reg);
+}
+
+static void __exit ipvs_mt_exit(void)
+{
+ xt_unregister_match(&xt_ipvs_mt_reg);
+}
+
+module_init(ipvs_mt_init);
+module_exit(ipvs_mt_exit);
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index b4f7dfea5980..70eb2b4984dd 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -11,7 +11,8 @@
#include <linux/netfilter/xt_quota.h>
struct xt_quota_priv {
- uint64_t quota;
+ spinlock_t lock;
+ uint64_t quota;
};
MODULE_LICENSE("GPL");
@@ -20,8 +21,6 @@ MODULE_DESCRIPTION("Xtables: countdown quota match");
MODULE_ALIAS("ipt_quota");
MODULE_ALIAS("ip6t_quota");
-static DEFINE_SPINLOCK(quota_lock);
-
static bool
quota_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
@@ -29,7 +28,7 @@ quota_mt(const struct sk_buff *skb, struct xt_action_param *par)
struct xt_quota_priv *priv = q->master;
bool ret = q->flags & XT_QUOTA_INVERT;
- spin_lock_bh(&quota_lock);
+ spin_lock_bh(&priv->lock);
if (priv->quota >= skb->len) {
priv->quota -= skb->len;
ret = !ret;
@@ -37,9 +36,7 @@ quota_mt(const struct sk_buff *skb, struct xt_action_param *par)
/* we do not allow even small packets from now on */
priv->quota = 0;
}
- /* Copy quota back to matchinfo so that iptables can display it */
- q->quota = priv->quota;
- spin_unlock_bh(&quota_lock);
+ spin_unlock_bh(&priv->lock);
return ret;
}
@@ -55,6 +52,7 @@ static int quota_mt_check(const struct xt_mtchk_param *par)
if (q->master == NULL)
return -ENOMEM;
+ spin_lock_init(&q->master->lock);
q->master->quota = q->quota;
return 0;
}
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 76aec6a44762..d2ff15a2412b 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -567,6 +567,7 @@ static const struct file_operations recent_mt_fops = {
.write = recent_mt_proc_write,
.release = seq_release_private,
.owner = THIS_MODULE,
+ .llseek = seq_lseek,
};
static int __net_init recent_proc_net_init(struct net *net)
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index c04fcf385c59..ef36a56a02c6 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -3,6 +3,7 @@
#include <linux/skbuff.h>
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/sctp/sctp.h>
#include <linux/sctp.h>
#include <linux/netfilter/x_tables.h>
@@ -67,7 +68,7 @@ match_packet(const struct sk_buff *skb,
++i, offset, sch->type, htons(sch->length),
sch->flags);
#endif
- offset += (ntohs(sch->length) + 3) & ~3;
+ offset += WORD_ROUND(ntohs(sch->length));
pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset);
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 3d54c236a1ba..00d6ae838303 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -22,6 +22,12 @@
#include <net/netfilter/nf_tproxy_core.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#define XT_SOCKET_HAVE_IPV6 1
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
#include <linux/netfilter/xt_socket.h>
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
@@ -30,7 +36,7 @@
#endif
static int
-extract_icmp_fields(const struct sk_buff *skb,
+extract_icmp4_fields(const struct sk_buff *skb,
u8 *protocol,
__be32 *raddr,
__be32 *laddr,
@@ -86,7 +92,6 @@ extract_icmp_fields(const struct sk_buff *skb,
return 0;
}
-
static bool
socket_match(const struct sk_buff *skb, struct xt_action_param *par,
const struct xt_socket_mtinfo1 *info)
@@ -115,7 +120,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
dport = hp->dest;
} else if (iph->protocol == IPPROTO_ICMP) {
- if (extract_icmp_fields(skb, &protocol, &saddr, &daddr,
+ if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
&sport, &dport))
return false;
} else {
@@ -127,7 +132,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
* reply packet of an established SNAT-ted connection. */
ct = nf_ct_get(skb, &ctinfo);
- if (ct && (ct != &nf_conntrack_untracked) &&
+ if (ct && !nf_ct_is_untracked(ct) &&
((iph->protocol != IPPROTO_ICMP &&
ctinfo == IP_CT_IS_REPLY + IP_CT_ESTABLISHED) ||
(iph->protocol == IPPROTO_ICMP &&
@@ -142,7 +147,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
#endif
sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
- saddr, daddr, sport, dport, par->in, false);
+ saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
if (sk != NULL) {
bool wildcard;
bool transparent = true;
@@ -165,32 +170,156 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
sk = NULL;
}
- pr_debug("proto %u %08x:%u -> %08x:%u (orig %08x:%u) sock %p\n",
- protocol, ntohl(saddr), ntohs(sport),
- ntohl(daddr), ntohs(dport),
- ntohl(iph->daddr), hp ? ntohs(hp->dest) : 0, sk);
+ pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n",
+ protocol, &saddr, ntohs(sport),
+ &daddr, ntohs(dport),
+ &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
return (sk != NULL);
}
static bool
-socket_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
return socket_match(skb, par, NULL);
}
static bool
-socket_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par)
{
return socket_match(skb, par, par->matchinfo);
}
+#ifdef XT_SOCKET_HAVE_IPV6
+
+static int
+extract_icmp6_fields(const struct sk_buff *skb,
+ unsigned int outside_hdrlen,
+ int *protocol,
+ struct in6_addr **raddr,
+ struct in6_addr **laddr,
+ __be16 *rport,
+ __be16 *lport)
+{
+ struct ipv6hdr *inside_iph, _inside_iph;
+ struct icmp6hdr *icmph, _icmph;
+ __be16 *ports, _ports[2];
+ u8 inside_nexthdr;
+ int inside_hdrlen;
+
+ icmph = skb_header_pointer(skb, outside_hdrlen,
+ sizeof(_icmph), &_icmph);
+ if (icmph == NULL)
+ return 1;
+
+ if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
+ return 1;
+
+ inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph);
+ if (inside_iph == NULL)
+ return 1;
+ inside_nexthdr = inside_iph->nexthdr;
+
+ inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr);
+ if (inside_hdrlen < 0)
+ return 1; /* hjm: Packet has no/incomplete transport layer headers. */
+
+ if (inside_nexthdr != IPPROTO_TCP &&
+ inside_nexthdr != IPPROTO_UDP)
+ return 1;
+
+ ports = skb_header_pointer(skb, inside_hdrlen,
+ sizeof(_ports), &_ports);
+ if (ports == NULL)
+ return 1;
+
+ /* the inside IP packet is the one quoted from our side, thus
+ * its saddr is the local address */
+ *protocol = inside_nexthdr;
+ *laddr = &inside_iph->saddr;
+ *lport = ports[0];
+ *raddr = &inside_iph->daddr;
+ *rport = ports[1];
+
+ return 0;
+}
+
+static bool
+socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct udphdr _hdr, *hp = NULL;
+ struct sock *sk;
+ struct in6_addr *daddr, *saddr;
+ __be16 dport, sport;
+ int thoff, tproto;
+ const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+ if (tproto < 0) {
+ pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ return NF_DROP;
+ }
+
+ if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
+ hp = skb_header_pointer(skb, thoff,
+ sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return false;
+
+ saddr = &iph->saddr;
+ sport = hp->source;
+ daddr = &iph->daddr;
+ dport = hp->dest;
+
+ } else if (tproto == IPPROTO_ICMPV6) {
+ if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
+ &sport, &dport))
+ return false;
+ } else {
+ return false;
+ }
+
+ sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
+ if (sk != NULL) {
+ bool wildcard;
+ bool transparent = true;
+
+ /* Ignore sockets listening on INADDR_ANY */
+ wildcard = (sk->sk_state != TCP_TIME_WAIT &&
+ ipv6_addr_any(&inet6_sk(sk)->rcv_saddr));
+
+ /* Ignore non-transparent sockets,
+ if XT_SOCKET_TRANSPARENT is used */
+ if (info && info->flags & XT_SOCKET_TRANSPARENT)
+ transparent = ((sk->sk_state != TCP_TIME_WAIT &&
+ inet_sk(sk)->transparent) ||
+ (sk->sk_state == TCP_TIME_WAIT &&
+ inet_twsk(sk)->tw_transparent));
+
+ nf_tproxy_put_sock(sk);
+
+ if (wildcard || !transparent)
+ sk = NULL;
+ }
+
+ pr_debug("proto %hhd %pI6:%hu -> %pI6:%hu "
+ "(orig %pI6:%hu) sock %p\n",
+ tproto, saddr, ntohs(sport),
+ daddr, ntohs(dport),
+ &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
+
+ return (sk != NULL);
+}
+#endif
+
static struct xt_match socket_mt_reg[] __read_mostly = {
{
.name = "socket",
.revision = 0,
.family = NFPROTO_IPV4,
- .match = socket_mt_v0,
+ .match = socket_mt4_v0,
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
@@ -199,17 +328,33 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 1,
.family = NFPROTO_IPV4,
- .match = socket_mt_v1,
+ .match = socket_mt4_v1,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
},
+#ifdef XT_SOCKET_HAVE_IPV6
+ {
+ .name = "socket",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .match = socket_mt6_v1,
+ .matchsize = sizeof(struct xt_socket_mtinfo1),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init socket_mt_init(void)
{
nf_defrag_ipv4_enable();
+#ifdef XT_SOCKET_HAVE_IPV6
+ nf_defrag_ipv6_enable();
+#endif
+
return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
}
@@ -225,3 +370,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
MODULE_DESCRIPTION("x_tables socket match module");
MODULE_ALIAS("ipt_socket");
+MODULE_ALIAS("ip6t_socket");
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index e12e053d3782..a507922d80cd 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -26,14 +26,16 @@ state_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_state_info *sinfo = par->matchinfo;
enum ip_conntrack_info ctinfo;
unsigned int statebit;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- if (nf_ct_is_untracked(skb))
- statebit = XT_STATE_UNTRACKED;
- else if (!nf_ct_get(skb, &ctinfo))
+ if (!ct)
statebit = XT_STATE_INVALID;
- else
- statebit = XT_STATE_BIT(ctinfo);
-
+ else {
+ if (nf_ct_is_untracked(ct))
+ statebit = XT_STATE_UNTRACKED;
+ else
+ statebit = XT_STATE_BIT(ctinfo);
+ }
return (sinfo->statemask & statebit);
}
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 96e62b8fd6b1..42ecb71d445f 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -18,8 +18,8 @@
#include <linux/netfilter/x_tables.h>
struct xt_statistic_priv {
- uint32_t count;
-};
+ atomic_t count;
+} ____cacheline_aligned_in_smp;
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
@@ -27,13 +27,12 @@ MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)");
MODULE_ALIAS("ipt_statistic");
MODULE_ALIAS("ip6t_statistic");
-static DEFINE_SPINLOCK(nth_lock);
-
static bool
statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_statistic_info *info = par->matchinfo;
bool ret = info->flags & XT_STATISTIC_INVERT;
+ int nval, oval;
switch (info->mode) {
case XT_STATISTIC_MODE_RANDOM:
@@ -41,12 +40,12 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
ret = !ret;
break;
case XT_STATISTIC_MODE_NTH:
- spin_lock_bh(&nth_lock);
- if (info->master->count++ == info->u.nth.every) {
- info->master->count = 0;
+ do {
+ oval = atomic_read(&info->master->count);
+ nval = (oval == info->u.nth.every) ? 0 : oval + 1;
+ } while (atomic_cmpxchg(&info->master->count, oval, nval) != oval);
+ if (nval == 0)
ret = !ret;
- }
- spin_unlock_bh(&nth_lock);
break;
}
@@ -64,7 +63,7 @@ static int statistic_mt_check(const struct xt_mtchk_param *par)
info->master = kzalloc(sizeof(*info->master), GFP_KERNEL);
if (info->master == NULL)
return -ENOMEM;
- info->master->count = info->u.nth.count;
+ atomic_set(&info->master->count, info->u.nth.count);
return 0;
}