From 5bc9068e9d962ca6b8bec3f0eb6f60ab4dee1d04 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:14:31 +0300 Subject: ipvs: fix CHECKSUM_PARTIAL for TCP, UDP Fix CHECKSUM_PARTIAL handling. Tested for IPv4 TCP; UDP was not tested because it needs a network card with HW CSUM support. This may fix the problem where IPVS cannot be used in virtual boxes. The problem appears with DNAT to a local address when the local stack sends the reply in CHECKSUM_PARTIAL mode. Fix tcp_dnat_handler and udp_dnat_handler to provide vaddr and daddr in the right order (old and new IP) when calling tcp_partial_csum_update/udp_partial_csum_update (CHECKSUM_PARTIAL). Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_proto_tcp.c | 10 +++++----- net/netfilter/ipvs/ip_vs_proto_udp.c | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 282d24de8592..318d011036db 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -101,15 +101,15 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph, #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(tcph->check)))); + csum_unfold(tcph->check)))); else #endif tcph->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(tcph->check)))); + csum_unfold(tcph->check)))); } @@ -223,7 +223,7 @@ tcp_dnat_handler(struct sk_buff *skb, * Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { - tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - tcphoff)); } else if (!cp->app) { diff --git 
a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 8553231b5d41..f9290893bd93 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -102,15 +102,15 @@ udp_partial_csum_update(int af, struct udphdr *uhdr, #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) uhdr->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(uhdr->check)))); + csum_unfold(uhdr->check)))); else #endif uhdr->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(uhdr->check)))); + csum_unfold(uhdr->check)))); } @@ -229,7 +229,7 @@ udp_dnat_handler(struct sk_buff *skb, * Adjust UDP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { - udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - udphoff)); } else if (!cp->app && (udph->check != 0)) { -- cgit v1.2.3-55-g7522 From 8b27b10f5863a5b63e46304a71aa01463d1efac4 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:17:20 +0300 Subject: ipvs: optimize checksums for apps Avoid full checksum calculation for apps that can provide info whether csum was broken after payload mangling. For now only ip_vs_ftp mangles payload and it updates the csum, so the full recalculation is avoided for all packets. Add CHECKSUM_UNNECESSARY for snat_handler (TCP and UDP). It is needed to support SNAT from local address for the case when csum is fully recalculated. 
Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 12 ++++++++++-- net/netfilter/ipvs/ip_vs_ftp.c | 7 ++++++- net/netfilter/ipvs/ip_vs_proto_tcp.c | 31 +++++++++++++++++++++++++------ net/netfilter/ipvs/ip_vs_proto_udp.c | 31 +++++++++++++++++++++++++------ 4 files changed, 66 insertions(+), 15 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 6e8a6192e574..adcdba9dd183 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -597,11 +597,19 @@ struct ip_vs_app { __be16 port; /* port number in net order */ atomic_t usecnt; /* usage counter */ - /* output hook: return false if can't linearize. diff set for TCP. */ + /* + * output hook: Process packet in inout direction, diff set for TCP. + * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, + * 2=Mangled but checksum was not updated + */ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff); - /* input hook: return false if can't linearize. diff set for TCP. */ + /* + * input hook: Process packet in outin direction, diff set for TCP. 
+ * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, + * 2=Mangled but checksum was not updated + */ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 090889a3b3af..75455000ad1c 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -242,9 +242,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, start-data, end-start, buf, buf_len); - if (ret) + if (ret) { ip_vs_nfct_expect_related(skb, ct, n_cp, IPPROTO_TCP, 0, 0); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* csum is updated */ + ret = 1; + } } /* diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 318d011036db..64dc2954cf78 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -120,6 +120,7 @@ tcp_snat_handler(struct sk_buff *skb, struct tcphdr *tcph; unsigned int tcphoff; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -134,13 +135,20 @@ tcp_snat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + if (!(ret = ip_vs_app_pkt_out(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; } tcph = (void *)skb_network_header(skb) + tcphoff; @@ -151,12 +159,13 @@ tcp_snat_handler(struct sk_buff *skb, tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - tcphoff)); - } else if (!cp->app) { + } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ 
tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; @@ -174,6 +183,7 @@ tcp_snat_handler(struct sk_buff *skb, skb->len - tcphoff, cp->protocol, skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, tcph->check, @@ -190,6 +200,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct tcphdr *tcph; unsigned int tcphoff; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -204,6 +215,8 @@ tcp_dnat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; @@ -212,8 +225,13 @@ tcp_dnat_handler(struct sk_buff *skb, * Attempt ip_vs_app call. * It will fix ip_vs_conn and iph ack_seq stuff */ - if (!ip_vs_app_pkt_in(cp, skb)) + if (!(ret = ip_vs_app_pkt_in(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; } tcph = (void *)skb_network_header(skb) + tcphoff; @@ -226,12 +244,13 @@ tcp_dnat_handler(struct sk_buff *skb, tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - tcphoff)); - } else if (!cp->app) { + } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index f9290893bd93..9c558c40bfbb 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -121,6 +121,7 @@ udp_snat_handler(struct sk_buff *skb, struct udphdr *udph; unsigned int udphoff; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -135,6 +136,8 @@ udp_snat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; @@ -142,8 +145,13 @@ udp_snat_handler(struct sk_buff *skb, /* * Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + if (!(ret = ip_vs_app_pkt_out(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; } udph = (void *)skb_network_header(skb) + udphoff; @@ -156,12 +164,13 @@ udp_snat_handler(struct sk_buff *skb, udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { + } else if (!payload_csum && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ udph->check = 0; @@ -181,6 +190,7 @@ udp_snat_handler(struct sk_buff *skb, skb->csum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, udph->check, (char*)&(udph->check) - (char*)udph); @@ -196,6 +206,7 @@ udp_dnat_handler(struct sk_buff *skb, struct udphdr *udph; unsigned int udphoff; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -210,6 +221,8 @@ udp_dnat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; @@ -218,8 +231,13 @@ udp_dnat_handler(struct sk_buff *skb, * Attempt ip_vs_app call. * It will fix ip_vs_conn */ - if (!ip_vs_app_pkt_in(cp, skb)) + if (!(ret = ip_vs_app_pkt_in(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; } udph = (void *)skb_network_header(skb) + udphoff; @@ -232,12 +250,13 @@ udp_dnat_handler(struct sk_buff *skb, udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { + } else if (!payload_csum && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ udph->check = 0; -- cgit v1.2.3-55-g7522 From cf356d69db0afef692cd640917bc70f708c27f14 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:21:07 +0300 Subject: ipvs: switch to notrack mode Change skb->ipvs_property semantic. This is preparation to support ip_vs_out processing in LOCAL_OUT. ipvs_property=1 will be used to avoid expensive lookups for traffic sent by transmitters. Now when conntrack support is not used we call ip_vs_notrack method to avoid problems in OUTPUT and POST_ROUTING hooks instead of exiting POST_ROUTING as before. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 20 +++++++++++++++++++- net/netfilter/ipvs/ip_vs_core.c | 39 ++++----------------------------------- net/netfilter/ipvs/ip_vs_xmit.c | 7 +++++-- 3 files changed, 28 insertions(+), 38 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index adcdba9dd183..0e4618470cee 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -25,7 +25,7 @@ #include #include /* for struct ipv6hdr */ #include /* for ipv6_addr_copy */ -#ifdef CONFIG_IP_VS_NFCT +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include #endif @@ -1021,6 +1021,24 @@ static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum) return csum_partial(diff, sizeof(diff), oldsum); } +/* + * Forget current conntrack (unconfirmed) and attach notrack entry + */ +static inline void ip_vs_notrack(struct sk_buff *skb) +{ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (!ct || !nf_ct_is_untracked(ct)) { + nf_reset(skb); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); + } +#endif +} + #ifdef CONFIG_IP_VS_NFCT /* * Netfilter connection tracking diff --git 
a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index e5fef7aef0d4..222453029b9e 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -507,23 +507,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, return NF_DROP; } -/* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING - * chain and is used to avoid double NAT and confirmation when we do - * not want to keep the conntrack structure - */ -static unsigned int ip_vs_post_routing(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - if (!skb->ipvs_property) - return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - return NF_STOP; -} - __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); @@ -682,8 +665,9 @@ static int handle_response_icmp(int af, struct sk_buff *skb, /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); + skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) - skb->ipvs_property = 1; + ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); verdict = NF_ACCEPT; @@ -929,8 +913,9 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) - skb->ipvs_property = 1; + ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); ip_vs_conn_put(cp); @@ -1496,14 +1481,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, - }, #ifdef CONFIG_IP_VS_IPV6 /* After packet filtering, forward packet 
through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1532,14 +1509,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_NAT_SRC-1, - }, #endif }; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index b0bd8afbf368..94b53b441028 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -217,6 +217,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) ({ \ int __ret = NF_ACCEPT; \ \ + (skb)->ipvs_property = 1; \ if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ __ret = ip_vs_confirm_conntrack(skb, cp); \ if (__ret == NF_ACCEPT) { \ @@ -228,8 +229,9 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) #define IP_VS_XMIT_NAT(pf, skb, cp) \ do { \ + (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - (skb)->ipvs_property = 1; \ + ip_vs_notrack(skb); \ else \ ip_vs_update_conntrack(skb, cp, 1); \ skb_forward_csum(skb); \ @@ -239,8 +241,9 @@ do { \ #define IP_VS_XMIT(pf, skb, cp) \ do { \ + (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - (skb)->ipvs_property = 1; \ + ip_vs_notrack(skb); \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ skb_dst(skb)->dev, dst_output); \ -- cgit v1.2.3-55-g7522 From 190ecd27cd7294105e3b26ca71663c7d940acbbb Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:24:37 +0300 Subject: ipvs: do not schedule conns from real servers This patch is needed to avoid scheduling of packets from local real server when we add ip_vs_in in LOCAL_OUT hook to support local client. Currently, when ip_vs_in can not find existing connection it tries to create new one by calling ip_vs_schedule. 
The default indication from ip_vs_schedule was whether the connection was scheduled to a real server. If no real server is available we try to use the bypass forwarding method or to send an ICMP error. But in some cases we do not want to use the bypass feature. So, add the flag 'ignored' to indicate whether the scheduler ignores this packet. Make sure we do not create new connections from replies. We can hit this problem for persistent services and a local real server when ip_vs_in is added to the LOCAL_OUT hook to handle local clients. Also, make sure ip_vs_schedule ignores SYN packets for Active FTP DATA from a local real server. The FTP DATA connection should be created on SYN+ACK from the client so that the correct connection daddr is assigned. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 3 ++- net/netfilter/ipvs/ip_vs_core.c | 34 ++++++++++++++++++++++++++++++++-- net/netfilter/ipvs/ip_vs_proto_sctp.c | 6 ++++-- net/netfilter/ipvs/ip_vs_proto_tcp.c | 7 +++++-- net/netfilter/ipvs/ip_vs_proto_udp.c | 6 ++++-- 5 files changed, 47 insertions(+), 9 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 0e4618470cee..9d5c1b965304 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -849,7 +849,8 @@ extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc); extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); extern struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb); +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp, int *ignored); extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_protocol *pp); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 222453029b9e..0090d6d25e95 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -342,7 +342,8 @@ ip_vs_sched_persist(struct ip_vs_service
*svc, * Protocols supported: TCP, UDP */ struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp, int *ignored) { struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; @@ -350,16 +351,43 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) __be16 _ports[2], *pptr; unsigned int flags; + *ignored = 1; ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) return NULL; + /* + * FTPDATA needs this check when using local real server. + * Never schedule Active FTPDATA connections from real server. + * For LVS-NAT they must be already created. For other methods + * with persistence the connection is created on SYN+ACK. + */ + if (pptr[0] == FTPDATA) { + IP_VS_DBG_PKT(12, pp, skb, 0, "Not scheduling FTPDATA"); + return NULL; + } + + /* + * Do not schedule replies from local real server. It is risky + * for fwmark services but mostly for persistent services. 
+ */ + if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) && + (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) { + IP_VS_DBG_PKT(12, pp, skb, 0, + "Not scheduling reply for existing connection"); + __ip_vs_conn_put(cp); + return NULL; + } + /* * Persistent service */ - if (svc->flags & IP_VS_SVC_F_PERSISTENT) + if (svc->flags & IP_VS_SVC_F_PERSISTENT) { + *ignored = 0; return ip_vs_sched_persist(svc, skb, pptr); + } /* * Non-persistent service @@ -372,6 +400,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) return NULL; } + *ignored = 0; + dest = svc->scheduler->schedule(svc, skb); if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 4c0855cb006e..9ab5232ce019 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -31,6 +31,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, if ((sch->type == SCTP_CID_INIT) && (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, sh->dest))) { + int ignored; + if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -44,8 +46,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. 
*/ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { + *cpp = ip_vs_schedule(svc, skb, pp, &ignored); + if (!*cpp && !ignored) { *verdict = ip_vs_leave(svc, skb, pp); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 64dc2954cf78..85d80a66b492 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -43,9 +43,12 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; } + /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ if (th->syn && (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, th->dest))) { + int ignored; + if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -60,8 +63,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { + *cpp = ip_vs_schedule(svc, skb, pp, &ignored); + if (!*cpp && !ignored) { *verdict = ip_vs_leave(svc, skb, pp); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 9c558c40bfbb..5d21f08155ed 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -46,6 +46,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, uh->dest); if (svc) { + int ignored; + if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -60,8 +62,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. 
*/ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { + *cpp = ip_vs_schedule(svc, skb, pp, &ignored); + if (!*cpp && !ignored) { *verdict = ip_vs_leave(svc, skb, pp); return 0; } -- cgit v1.2.3-55-g7522 From 489fdedaed5ddb437dd2840eb93df37a6dd8c7de Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:27:31 +0300 Subject: ipvs: stop ICMP from FORWARD to local Delivering locally ICMP from FORWARD hook is not supported. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 0090d6d25e95..27ecb258ea70 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -48,6 +48,7 @@ #ifdef CONFIG_IP_VS_IPV6 #include #include +#include #endif #include @@ -1191,7 +1192,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) offset += 2 * sizeof(__u16); verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); - /* do not touch skb anymore */ + /* LOCALNODE from FORWARD hook is not supported */ + if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD && + skb_rtable(skb)->rt_flags & RTCF_LOCAL) { + IP_VS_DBG(1, "%s(): " + "local delivery to %pI4 but in FORWARD\n", + __func__, &skb_rtable(skb)->rt_dst); + verdict = NF_DROP; + } out: __ip_vs_conn_put(cp); @@ -1212,6 +1220,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_protocol *pp; unsigned int offset, verdict; union nf_inet_addr snet; + struct rt6_info *rt; *related = 1; @@ -1290,7 +1299,15 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) IPPROTO_SCTP == cih->nexthdr) offset += 2 * sizeof(__u16); verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); - /* do not touch skb anymore */ + /* LOCALNODE from FORWARD hook is not supported */ + if 
(verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD && + (rt = (struct rt6_info *) skb_dst(skb)) && + rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) { + IP_VS_DBG(1, "%s(): " + "local delivery to %pI6 but in FORWARD\n", + __func__, &rt->rt6i_dst); + verdict = NF_DROP; + } __ip_vs_conn_put(cp); -- cgit v1.2.3-55-g7522 From 4256f1aaa662697c1faa0984b7a698c2c8c57735 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:29:40 +0300 Subject: ipvs: fix CHECKSUM_PARTIAL for TUN method The recent change in IP_VS_XMIT_TUNNEL to set CHECKSUM_NONE is not correct. After adding IPIP header skb->csum becomes invalid but the CHECKSUM_PARTIAL case must be supported. So, use skb_forward_csum() which is most suitable for us to allow local clients to send IPIP to remote real server. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 94b53b441028..63cc0feaaef6 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -222,7 +222,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) __ret = ip_vs_confirm_conntrack(skb, cp); \ if (__ret == NF_ACCEPT) { \ nf_reset(skb); \ - (skb)->ip_summed = CHECKSUM_NONE; \ + skb_forward_csum(skb); \ } \ __ret; \ }) -- cgit v1.2.3-55-g7522 From 1ca5bb5450aa2401fa272efeb741ebb260d0fbb0 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:32:29 +0300 Subject: ipvs: create ip_vs_defrag_user Create new function ip_vs_defrag_user to return correct IP_DEFRAG_xxx user depending on the hooknum. It will be needed when we add handlers in LOCAL_OUT. 
Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 55 +++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 27ecb258ea70..f7f52831c4a6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -543,6 +543,15 @@ __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); } +static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) +{ + if (NF_INET_LOCAL_IN == hooknum) + return IP_DEFRAG_VS_IN; + if (NF_INET_FORWARD == hooknum) + return IP_DEFRAG_VS_FWD; + return IP_DEFRAG_VS_OUT; +} + static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) { int err = ip_defrag(skb, user); @@ -714,7 +723,8 @@ out: * Find any that might be relevant, check against existing connections. * Currently handles error types - unreachable, quench, ttl exceeded. 
*/ -static int ip_vs_out_icmp(struct sk_buff *skb, int *related) +static int ip_vs_out_icmp(struct sk_buff *skb, int *related, + unsigned int hooknum) { struct iphdr *iph; struct icmphdr _icmph, *ic; @@ -729,7 +739,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) /* reassemble IP fragments */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -788,7 +798,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) } #ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) +static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, + unsigned int hooknum) { struct ipv6hdr *iph; struct icmp6hdr _icmph, *ic; @@ -804,7 +815,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) /* reassemble IP fragments */ if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) + if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -986,7 +997,9 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); + int related; + int verdict = ip_vs_out_icmp_v6(skb, &related, + hooknum); if (related) { if (sysctl_ip_vs_snat_reroute && @@ -1000,7 +1013,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, } else #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_out_icmp(skb, &related); + int related; + int verdict = ip_vs_out_icmp(skb, &related, hooknum); if (related) { if (sysctl_ip_vs_snat_reroute && @@ -1019,19 +1033,19 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, /* reassemble IP fragments */ #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = 
ip_vs_out_icmp_v6(skb, &related); - - if (related) - return verdict; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, + ip_vs_defrag_user(hooknum))) + return NF_STOLEN; } + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } else #endif if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + if (ip_vs_gather_frags(skb, + ip_vs_defrag_user(hooknum))) return NF_STOLEN; ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); @@ -1114,8 +1128,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) /* reassemble IP fragments */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -1226,9 +1239,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) /* reassemble IP fragments */ if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? 
- IP_DEFRAG_VS_IN : - IP_DEFRAG_VS_FWD)) + if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -1349,7 +1360,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); + int related; + int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); if (related) return verdict; @@ -1358,7 +1370,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, } else #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); + int related; + int verdict = ip_vs_in_icmp(skb, &related, hooknum); if (related) return verdict; -- cgit v1.2.3-55-g7522 From f5a41847acc535e2e2018e397b1876ba7577d9d9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:35:46 +0300 Subject: ipvs: move ip_route_me_harder for ICMP Currently, ip_route_me_harder after ip_vs_out_icmp is called even if packet is not related to IPVS connection. Move it into handle_response_icmp. Also, force rerouting if sending to local client because IPv4 stack uses addresses from the route. 
Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index f7f52831c4a6..c4f091d5a628 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -702,6 +702,17 @@ static int handle_response_icmp(int af, struct sk_buff *skb, #endif ip_vs_nat_icmp(skb, pp, cp, 1); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) + goto out; + } else +#endif + if ((sysctl_ip_vs_snat_reroute || + skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto out; + /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); @@ -940,16 +951,16 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * if it came from this machine itself. So re-compute * the routing information. 
*/ - if (sysctl_ip_vs_snat_reroute) { #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ip6_route_me_harder(skb) != 0) - goto drop; - } else + if (af == AF_INET6) { + if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) + goto drop; + } else #endif - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; - } + if ((sysctl_ip_vs_snat_reroute || + skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); @@ -1001,13 +1012,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int verdict = ip_vs_out_icmp_v6(skb, &related, hooknum); - if (related) { - if (sysctl_ip_vs_snat_reroute && - NF_ACCEPT == verdict && - ip6_route_me_harder(skb)) - verdict = NF_DROP; + if (related) return verdict; - } ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } } else @@ -1016,13 +1022,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int related; int verdict = ip_vs_out_icmp(skb, &related, hooknum); - if (related) { - if (sysctl_ip_vs_snat_reroute && - NF_ACCEPT == verdict && - ip_route_me_harder(skb, RTN_LOCAL)) - verdict = NF_DROP; + if (related) return verdict; - } ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } -- cgit v1.2.3-55-g7522 From fc604767613b6d2036cdc35b660bc39451040a47 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:38:15 +0300 Subject: ipvs: changes for local real server This patch deals with local real servers: - Add support for DNAT to local address (different real server port). It needs ip_vs_out hook in LOCAL_OUT for both families because skb->protocol is not set for locally generated packets and can not be used to set 'af'. - Skip packets in ip_vs_in marked with skb->ipvs_property because ip_vs_out processing can be executed in LOCAL_OUT but we still have the conn_out_get check in ip_vs_in. 
- Ignore packets with inet->nodefrag from the local stack - Require skb_dst(skb) != NULL because we use it to get struct net - Add support for changing the route to the local IPv4 stack after DNAT depending on the source address type. A local client sets an output route and a remote client sets an input route. It looks like IPv6 does not need such rerouting because the replies use addresses from the initial incoming header, not from the skb route. - All transmitters now have strict checks for the destination address type: a redirect from a non-local address to a local real server requires the NAT method, and a local address cannot be used as the source address when talking to a remote real server. - LOCALNODE is no longer set explicitly as the forwarding method in the real server, so that connections can provide the correct forwarding method to the backup server. Not sure if this breaks tools that expect to see the 'Local' real server type. If needed, this can be supported with a new flag IP_VS_DEST_F_LOCAL. It should now be possible for connections in the backup that lost their fwmark information during sync to be forwarded properly to their daddr, even if it is a local address in the backup server. This way the backup could be used as a real server for DR or TUN; for NAT there are some restrictions because tuple collisions in conntracks can create problems for the traffic. - Call ip_vs_dst_reset when a destination is updated, in case some real server IP type is changed between local and remote.
[ horms@verge.net.au: removed trailing whitespace ] Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_core.c | 123 ++++++++++-- net/netfilter/ipvs/ip_vs_ctl.c | 18 +- net/netfilter/ipvs/ip_vs_xmit.c | 433 ++++++++++++++++++++++++++++++++-------- 4 files changed, 458 insertions(+), 117 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 9d5c1b965304..2f88d5942332 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -409,6 +409,7 @@ struct ip_vs_conn { /* packet transmitter for different forwarding methods. If it mangles the packet, it must return NF_DROP or better NF_STOLEN, otherwise this must be changed to a sk_buff **. + NF_ACCEPT can be returned when destination is local. */ int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c4f091d5a628..a6c8aff1b47e 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -984,26 +984,34 @@ drop: } /* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) { struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; - int af; EnterFunction(11); - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - + /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) return NF_ACCEPT; + /* Bad... 
Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } + + if (unlikely(!skb_dst(skb))) + return NF_ACCEPT; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1106,6 +1114,69 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, return handle_response(af, skb, pp, cp, iph.len); } +/* + * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(hooknum, skb, AF_INET); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_out(hooknum, skb, AF_INET); + local_bh_enable(); + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(hooknum, skb, AF_INET6); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. 
+ */ +static unsigned int +ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_out(hooknum, skb, AF_INET6); + local_bh_enable(); + return verdict; +} + +#endif /* * Handle ICMP messages in the outside-to-inside direction (incoming). @@ -1342,6 +1413,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, struct ip_vs_conn *cp; int ret, restart, af, pkts; + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); @@ -1525,13 +1600,13 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_LOCAL_IN, .priority = 100, }, - /* After packet filtering, change source only for VS/NAT */ + /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_out, + .hook = ip_vs_local_reply4, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 100, + .hooknum = NF_INET_LOCAL_OUT, + .priority = -99, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ @@ -1542,6 +1617,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, #ifdef CONFIG_IP_VS_IPV6 /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1553,13 +1636,13 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_LOCAL_IN, .priority = 100, }, - /* After packet filtering, 
change source only for VS/NAT */ + /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_out, + .hook = ip_vs_local_reply6, .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 100, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = -99, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ @@ -1570,6 +1653,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, #endif }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 0b884d3e192f..5f5daa30b0af 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -777,20 +777,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; - /* check if local node and update the flags */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { - if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - } else -#endif - if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; @@ -824,6 +810,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; + spin_lock(&dest->dst_lock); + ip_vs_dst_reset(dest); + spin_unlock(&dest->dst_lock); + if (add) 
ip_vs_new_estimator(&dest->stats); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 63cc0feaaef6..8608882f89e3 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -67,12 +67,19 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) return dst; } +/* + * Get route to destination or remote server + * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest, + * &4=Allow redirect from remote daddr to local + */ static struct rtable * -__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos) +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, + __be32 daddr, u32 rtos, int rt_mode) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net(skb_dst(skb)->dev); struct rtable *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; + struct rtable *ort; /* Original route */ + int local; if (dest) { spin_lock(&dest->dst_lock); @@ -104,23 +111,95 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos) .oif = 0, .nl_u = { .ip4_u = { - .daddr = cp->daddr.ip, + .daddr = daddr, .saddr = 0, .tos = rtos, } }, }; if (ip_route_output_key(net, &rt, &fl)) { IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", - &cp->daddr.ip); + &daddr); return NULL; } } + local = rt->rt_flags & RTCF_LOCAL; + if (!((local ? 1 : 2) & rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", + (rt->rt_flags & RTCF_LOCAL) ? 
+ "local":"non-local", &rt->rt_dst); + ip_rt_put(rt); + return NULL; + } + if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) && + ort->rt_flags & RTCF_LOCAL)) { + IP_VS_DBG_RL("Redirect from non-local address %pI4 to local " + "requires NAT method, dest: %pI4\n", + &ip_hdr(skb)->daddr, &rt->rt_dst); + ip_rt_put(rt); + return NULL; + } + if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) { + IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 " + "to non-local address, dest: %pI4\n", + &ip_hdr(skb)->saddr, &rt->rt_dst); + ip_rt_put(rt); + return NULL; + } + return rt; } +/* Reroute packet to local IPv4 stack after DNAT */ +static int +__ip_vs_reroute_locally(struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + struct net *net = dev_net(dev); + struct iphdr *iph = ip_hdr(skb); + + if (rt->fl.iif) { + unsigned long orefdst = skb->_skb_refdst; + + if (ip_route_input(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) + return 0; + refdst_drop(orefdst); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos), + } + }, + .mark = skb->mark, + }; + struct rtable *rt; + + if (ip_route_output_key(net, &rt, &fl)) + return 0; + if (!(rt->rt_flags & RTCF_LOCAL)) { + ip_rt_put(rt); + return 0; + } + /* Drop old route. 
*/ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } + return 1; +} + #ifdef CONFIG_IP_VS_IPV6 +static inline int __ip_vs_is_local_route6(struct rt6_info *rt) +{ + return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK; +} + static struct dst_entry * __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, struct in6_addr *ret_saddr, int do_xfrm) @@ -155,14 +234,21 @@ out_err: return NULL; } +/* + * Get route to destination or remote server + * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest, + * &4=Allow redirect from remote daddr to local + */ static struct rt6_info * -__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct in6_addr *ret_saddr, int do_xfrm) +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, + struct in6_addr *daddr, struct in6_addr *ret_saddr, + int do_xfrm, int rt_mode) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net(skb_dst(skb)->dev); struct rt6_info *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; + struct rt6_info *ort; /* Original route */ struct dst_entry *dst; + int local; if (dest) { spin_lock(&dest->dst_lock); @@ -188,13 +274,38 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ipv6_addr_copy(ret_saddr, &dest->dst_saddr); spin_unlock(&dest->dst_lock); } else { - dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr, - do_xfrm); + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); if (!dst) return NULL; rt = (struct rt6_info *) dst; } + local = __ip_vs_is_local_route6(rt); + if (!((local ? 1 : 2) & rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n", + local ? 
"local":"non-local", daddr); + dst_release(&rt->dst); + return NULL; + } + if (local && !(rt_mode & 4) && + !((ort = (struct rt6_info *) skb_dst(skb)) && + __ip_vs_is_local_route6(ort))) { + IP_VS_DBG_RL("Redirect from non-local address %pI6 to local " + "requires NAT method, dest: %pI6\n", + &ipv6_hdr(skb)->daddr, daddr); + dst_release(&rt->dst); + return NULL; + } + if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&ipv6_hdr(skb)->saddr) & + IPV6_ADDR_LOOPBACK)) { + IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 " + "to non-local address, dest: %pI6\n", + &ipv6_hdr(skb)->saddr, daddr); + dst_release(&rt->dst); + return NULL; + } + return rt; } #endif @@ -227,23 +338,27 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) __ret; \ }) -#define IP_VS_XMIT_NAT(pf, skb, cp) \ +#define IP_VS_XMIT_NAT(pf, skb, cp, local) \ do { \ (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ ip_vs_notrack(skb); \ else \ ip_vs_update_conntrack(skb, cp, 1); \ + if (local) \ + return NF_ACCEPT; \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ skb_dst(skb)->dev, dst_output); \ } while (0) -#define IP_VS_XMIT(pf, skb, cp) \ +#define IP_VS_XMIT(pf, skb, cp, local) \ do { \ (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ ip_vs_notrack(skb); \ + if (local) \ + return NF_ACCEPT; \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ skb_dst(skb)->dev, dst_output); \ @@ -258,7 +373,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { /* we do not touch skb and do not need pskb ptr */ - return NF_ACCEPT; + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); } @@ -271,27 +386,15 @@ int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { - struct net *net = dev_net(skb->dev); struct rtable *rt; /* Route to the other host */ struct iphdr *iph = ip_hdr(skb); - u8 tos = iph->tos; int mtu; - 
struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = iph->daddr, - .saddr = 0, - .tos = RT_TOS(tos), } }, - }; EnterFunction(10); - if (ip_route_output_key(net, &rt, &fl)) { - IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n", - __func__, &iph->daddr); + if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, + RT_TOS(iph->tos), 2))) goto tx_error_icmp; - } /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -319,7 +422,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; @@ -337,18 +440,14 @@ int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { - struct net *net = dev_net(skb->dev); - struct dst_entry *dst; struct rt6_info *rt; /* Route to the other host */ struct ipv6hdr *iph = ipv6_hdr(skb); int mtu; EnterFunction(10); - dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0); - if (!dst) + if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2))) goto tx_error_icmp; - rt = (struct rt6_info *) dst; /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -376,7 +475,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; @@ -401,6 +500,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct rtable *rt; /* Route to the other host */ int mtu; struct iphdr *iph = ip_hdr(skb); + int local; EnterFunction(10); @@ -414,16 +514,40 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(iph->tos), 1|2|4))) goto 
tx_error_icmp; + local = rt->rt_flags & RTCF_LOCAL; + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG_RL_PKT(10, pp, skb, 0, "ip_vs_nat_xmit(): " + "stopping DNAT to local address"); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) { + IP_VS_DBG_RL_PKT(1, pp, skb, 0, "ip_vs_nat_xmit(): " + "stopping DNAT to loopback address"); + goto tx_error_put; + } /* MTU checking */ mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); - goto tx_error; + goto tx_error_put; } /* copy-on-write the packet before mangling it */ @@ -433,16 +557,27 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error; + goto tx_error_put; ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); + if (!local) { + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + ip_rt_put(rt); + /* + * Some IPv4 replies get local address from routes, + * not from iph, so while we DNAT after routing + * we need this second input/output route. 
+ */ + if (!__ip_vs_reroute_locally(skb)) + goto tx_error; + } + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length @@ -452,7 +587,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp); + IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); LeaveFunction(10); return NF_STOLEN; @@ -475,6 +610,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, { struct rt6_info *rt; /* Route to the other host */ int mtu; + int local; EnterFunction(10); @@ -489,18 +625,44 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); - if (!rt) + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, 1|2|4))) goto tx_error_icmp; + local = __ip_vs_is_local_route6(rt); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG_RL_PKT(10, pp, skb, 0, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to local address"); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? 
*/ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG_RL_PKT(1, pp, skb, 0, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to loopback address"); + goto tx_error_put; + } /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->dst); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit_v6(): frag needed for"); - goto tx_error; + goto tx_error_put; } /* copy-on-write the packet before mangling it */ @@ -510,14 +672,19 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) goto tx_error; - ipv6_hdr(skb)->daddr = cp->daddr.in6; + ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6); + + if (!local || !skb->dev) { + /* drop the old route when skb is not shared */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + /* destined to loopback, do we need to change route? 
*/ + dst_release(&rt->dst); + } IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); @@ -528,7 +695,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp); + IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); LeaveFunction(10); return NF_STOLEN; @@ -588,16 +755,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(tos), 1|2))) goto tx_error_icmp; + if (rt->rt_flags & RTCF_LOCAL) { + ip_rt_put(rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + } tdev = rt->dst.dev; mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { - ip_rt_put(rt); IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); - goto tx_error; + goto tx_error_put; } if (skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); @@ -607,9 +778,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if ((old_iph->frag_off & htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error; + goto tx_error_put; } /* @@ -678,6 +848,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, kfree_skb(skb); LeaveFunction(10); return NF_STOLEN; +tx_error_put: + ip_rt_put(rt); + goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 @@ -703,27 +876,29 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1); - if (!rt) + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + &saddr, 1, 1|2))) goto tx_error_icmp; + if (__ip_vs_is_local_route6(rt)) { + dst_release(&rt->dst); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + } tdev = rt->dst.dev; mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); if (mtu < IPV6_MIN_MTU) { - 
dst_release(&rt->dst); IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); - goto tx_error; + goto tx_error_put; } if (skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->dst); IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error; + goto tx_error_put; } /* @@ -789,6 +964,9 @@ tx_error: kfree_skb(skb); LeaveFunction(10); return NF_STOLEN; +tx_error_put: + dst_release(&rt->dst); + goto tx_error; } #endif @@ -807,8 +985,13 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(iph->tos), 1|2))) goto tx_error_icmp; + if (rt->rt_flags & RTCF_LOCAL) { + ip_rt_put(rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + } /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -836,7 +1019,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; @@ -859,9 +1042,13 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); - if (!rt) + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, 1|2))) goto tx_error_icmp; + if (__ip_vs_is_local_route6(rt)) { + dst_release(&rt->dst); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + } /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -889,7 +1076,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; @@ -915,6 +1102,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn 
*cp, struct rtable *rt; /* Route to the other host */ int mtu; int rc; + int local; EnterFunction(10); @@ -935,16 +1123,43 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(ip_hdr(skb)->tos), 1|2|4))) goto tx_error_icmp; + local = rt->rt_flags & RTCF_LOCAL; + + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? 
*/ + if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error_put; + } /* MTU checking */ mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { - ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error; + goto tx_error_put; } /* copy-on-write the packet before mangling it */ @@ -954,16 +1169,27 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - ip_vs_nat_icmp(skb, pp, cp, 0); + if (!local) { + /* drop the old route when skb is not shared */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + ip_rt_put(rt); + /* + * Some IPv4 replies get local address from routes, + * not from iph, so while we DNAT after routing + * we need this second input/output route. 
+ */ + if (!__ip_vs_reroute_locally(skb)) + goto tx_error; + } + /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp); + IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); rc = NF_STOLEN; goto out; @@ -989,6 +1215,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct rt6_info *rt; /* Route to the other host */ int mtu; int rc; + int local; EnterFunction(10); @@ -1009,17 +1236,44 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); - if (!rt) + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, 1|2|4))) goto tx_error_icmp; + local = __ip_vs_is_local_route6(rt); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI6\n", + __func__, &cp->daddr.in6); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? 
*/ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI6\n", + __func__, &cp->daddr.in6); + goto tx_error_put; + } + /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->dst); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error; + goto tx_error_put; } /* copy-on-write the packet before mangling it */ @@ -1029,16 +1283,21 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - ip_vs_nat_icmp_v6(skb, pp, cp, 0); + if (!local || !skb->dev) { + /* drop the old route when skb is not shared */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + /* destined to loopback, do we need to change route? */ + dst_release(&rt->dst); + } + /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp); + IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); rc = NF_STOLEN; goto out; -- cgit v1.2.3-55-g7522 From cb59155f21d4c0507d2034c2953f6a3f7806913d Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:40:51 +0300 Subject: ipvs: changes for local client This patch deals with local client processing. Prefer LOCAL_OUT hook for scheduling connections from local clients. LOCAL_IN is still supported if the packets are not marked as processed in LOCAL_OUT. The idea to process requests in LOCAL_OUT is to alter conntrack reply before it is confirmed at POST_ROUTING. If the local requests are processed in LOCAL_IN the conntrack can not be updated and matching by state is impossible. Add the following handlers: - ip_vs_reply[46] at LOCAL_IN:99 to process replies from remote real servers to local clients. 
Now that replies from both remote real servers (ip_vs_reply*) and local real servers (ip_vs_local_reply*) are handled, it is safe to remove the conn_out_get call from ip_vs_in because it does not support related ICMP packets. - ip_vs_local_request[46] at LOCAL_OUT:-98 to process requests from local clients Handling in LOCAL_OUT causes some changes: - as skb->dev, skb->protocol and skb->pkt_type are not defined in LOCAL_OUT, make sure we set skb->dev before calling icmpv6_send, prefer skb_dst(skb) for struct net and remove the skb->protocol checks from TUN transmitters. [ horms@verge.net.au: removed trailing whitespace ] Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 266 ++++++++++++++++++++++++++++------------ net/netfilter/ipvs/ip_vs_xmit.c | 51 +++++--- 2 files changed, 225 insertions(+), 92 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index a6c8aff1b47e..5fbcf67af8ec 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -529,9 +529,14 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP.
--WZ */ #ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) + if (svc->af == AF_INET6) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); - else + } else #endif icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); @@ -1065,57 +1070,61 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) */ cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - if (unlikely(!cp)) { - if (sysctl_ip_vs_nat_icmp_send && - (pp->protocol == IPPROTO_TCP || - pp->protocol == IPPROTO_UDP || - pp->protocol == IPPROTO_SCTP)) { - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, iph.len, - sizeof(_ports), _ports); - if (pptr == NULL) - return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(af, iph.protocol, - &iph.saddr, - pptr[0])) { - /* - * Notify the real server: there is no - * existing entry if it is not RST - * packet or not TCP packet. - */ - if ((iph.protocol != IPPROTO_TCP && - iph.protocol != IPPROTO_SCTP) - || ((iph.protocol == IPPROTO_TCP - && !is_tcp_reset(skb, iph.len)) - || (iph.protocol == IPPROTO_SCTP - && !is_sctp_abort(skb, - iph.len)))) { + if (likely(cp)) + return handle_response(af, skb, pp, cp, iph.len); + if (sysctl_ip_vs_nat_icmp_send && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP || + pp->protocol == IPPROTO_SCTP)) { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, iph.len, + sizeof(_ports), _ports); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_lookup_real_service(af, iph.protocol, + &iph.saddr, + pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. 
+ */ + if ((iph.protocol != IPPROTO_TCP && + iph.protocol != IPPROTO_SCTP) + || ((iph.protocol == IPPROTO_TCP + && !is_tcp_reset(skb, iph.len)) + || (iph.protocol == IPPROTO_SCTP + && !is_sctp_abort(skb, + iph.len)))) { #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - icmpv6_send(skb, - ICMPV6_DEST_UNREACH, - ICMPV6_PORT_UNREACH, - 0); - else + if (af == AF_INET6) { + struct net *net = + dev_net(skb_dst(skb)->dev); + + if (!skb->dev) + skb->dev = net->loopback_dev; + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0); + } else #endif - icmp_send(skb, - ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); - return NF_DROP; - } + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; } } - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; } - - return handle_response(af, skb, pp, cp, iph.len); + IP_VS_DBG_PKT(12, pp, skb, 0, + "ip_vs_out: packet continues traversal as normal"); + return NF_ACCEPT; } /* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. * Check if packet is reply for established ip_vs_conn. */ static unsigned int @@ -1147,7 +1156,8 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 /* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. * Check if packet is reply for established ip_vs_conn. */ static unsigned int @@ -1404,34 +1414,43 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) * and send it on its way... 
*/ static unsigned int -ip_vs_in(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) { struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; - int ret, restart, af, pkts; + int ret, restart, pkts; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) return NF_ACCEPT; - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - /* - * Big tappo: only PACKET_HOST, including loopback for local client - * Don't handle local packets on IPv6 for now + * Big tappo: + * - remote client: only PACKET_HOST + * - route: used for struct net when skb->dev is unset */ - if (unlikely(skb->pkt_type != PACKET_HOST)) { - IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", - skb->pkt_type, - iph.protocol, - IP_VS_DBG_ADDR(af, &iph.daddr)); + if (unlikely((skb->pkt_type != PACKET_HOST && + hooknum != NF_INET_LOCAL_OUT) || + !skb_dst(skb))) { + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" + " ignored in hook %u\n", + skb->pkt_type, iph.protocol, + IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); return NF_ACCEPT; } + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + /* Bad... 
Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1467,11 +1486,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, if (unlikely(!cp)) { int v; - /* For local client packets, it could be a response */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - if (cp) - return handle_response(af, skb, pp, cp, iph.len); - if (!pp->conn_schedule(af, skb, pp, &v, &cp)) return v; } @@ -1479,7 +1493,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, if (unlikely(!cp)) { /* sorry, all this trouble for a no-hit :) */ IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); + "ip_vs_in: packet continues traversal as normal"); return NF_ACCEPT; } @@ -1550,6 +1564,72 @@ out: return ret; } +/* + * AF_INET handler in NF_INET_LOCAL_IN chain + * Schedule and forward packets from remote clients + */ +static unsigned int +ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(hooknum, skb, AF_INET); +} + +/* + * AF_INET handler in NF_INET_LOCAL_OUT chain + * Schedule and forward packets from local clients + */ +static unsigned int +ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_in(hooknum, skb, AF_INET); + local_bh_enable(); + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * AF_INET6 handler in NF_INET_LOCAL_IN chain + * Schedule and forward packets from remote clients + */ +static unsigned int 
+ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(hooknum, skb, AF_INET6); +} + +/* + * AF_INET6 handler in NF_INET_LOCAL_OUT chain + * Schedule and forward packets from local clients + */ +static unsigned int +ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_in(hooknum, skb, AF_INET6); + local_bh_enable(); + return verdict; +} + +#endif + /* * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP @@ -1590,15 +1670,23 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, static struct nf_hook_ops ip_vs_ops[] __read_mostly = { + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = 99, + }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. 
*/ { - .hook = ip_vs_in, + .hook = ip_vs_remote_request4, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, + .hooknum = NF_INET_LOCAL_IN, + .priority = 101, }, /* Before ip_vs_in, change source only for VS/NAT */ { @@ -1608,14 +1696,22 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_LOCAL_OUT, .priority = -99, }, + /* After mangle, schedule and forward local requests */ + { + .hook = ip_vs_local_request4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = -98, + }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 99, + .hooknum = NF_INET_FORWARD, + .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { @@ -1626,15 +1722,23 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .priority = 100, }, #ifdef CONFIG_IP_VS_IPV6 + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_IN, + .priority = 99, + }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. 
*/ { - .hook = ip_vs_in, + .hook = ip_vs_remote_request6, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, + .hooknum = NF_INET_LOCAL_IN, + .priority = 101, }, /* Before ip_vs_in, change source only for VS/NAT */ { @@ -1644,14 +1748,22 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_LOCAL_OUT, .priority = -99, }, + /* After mangle, schedule and forward local requests */ + { + .hook = ip_vs_local_request6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = -98, + }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp_v6, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 99, + .hooknum = NF_INET_FORWARD, + .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 8608882f89e3..97b5361c036e 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -11,6 +11,16 @@ * * Changes: * + * Description of forwarding methods: + * - all transmitters are called from LOCAL_IN (remote clients) and + * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD + * - not all connections have destination server, for example, + * connections in backup server when fwmark is used + * - bypass connections use daddr from packet + * LOCAL_OUT rules: + * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) + * - skb->pkt_type is not set yet + * - the only place where we can see skb->sk != NULL */ #define KMSG_COMPONENT "IPVS" @@ -452,8 +462,13 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->dst); + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = 
net->loopback_dev; + } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + dst_release(&rt->dst); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error; } @@ -659,6 +674,11 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit_v6(): frag needed for"); @@ -748,13 +768,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (skb->protocol != htons(ETH_P_IP)) { - IP_VS_DBG_RL("%s(): protocol error, " - "ETH_P_IP: %d, skb protocol: %d\n", - __func__, htons(ETH_P_IP), skb->protocol); - goto tx_error; - } - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, RT_TOS(tos), 1|2))) goto tx_error_icmp; @@ -869,13 +882,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (skb->protocol != htons(ETH_P_IPV6)) { - IP_VS_DBG_RL("%s(): protocol error, " - "ETH_P_IPV6: %d, skb protocol: %d\n", - __func__, htons(ETH_P_IPV6), skb->protocol); - goto tx_error; - } - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, &saddr, 1, 1|2))) goto tx_error_icmp; @@ -896,6 +902,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error_put; @@ -1053,6 +1064,11 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } 
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); dst_release(&rt->dst); IP_VS_DBG_RL("%s(): frag needed\n", __func__); @@ -1271,6 +1287,11 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error_put; -- cgit v1.2.3-55-g7522 From 3233759be7eeca9998c514b8f49e8cf2b85e64d3 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:43:36 +0300 Subject: ipvs: inherit forwarding method in backup Connections in backup server should inherit the forwarding method from real server. It is a way to fix a problem where the forwarding method in backup connection is damaged by logical OR operation with the real server's connection flags. And the change is needed for setups where the backup server uses different forwarding method for the same real servers. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_conn.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 1d1a529dbe24..e9adecdc8ca4 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -563,6 +563,8 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) */ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) conn_flags &= ~IP_VS_CONN_F_INACTIVE; + /* connections inherit forwarding method from dest */ + cp->flags &= ~IP_VS_CONN_F_FWD_MASK; } cp->flags |= conn_flags; cp->dest = dest; -- cgit v1.2.3-55-g7522 From 0d79641a96d612aaa6d57a4d4f521d7ed9c9ccdd Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:46:17 +0300 Subject: ipvs: provide address family for debugging As skb->protocol is not valid in LOCAL_OUT add parameter for address family in packet debugging functions. 
Although AH and ESP carry no ports, switch them to ip_vs_tcpudp_debug_packet so that at least the valid addresses are still shown, as before. This patch removes the last user of skb->protocol in IPVS. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 17 ++++++----- net/netfilter/ipvs/ip_vs_core.c | 41 +++++++++++++++----------- net/netfilter/ipvs/ip_vs_proto.c | 8 ++--- net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 52 ++------------------------------- net/netfilter/ipvs/ip_vs_proto_sctp.c | 2 +- net/netfilter/ipvs/ip_vs_proto_tcp.c | 4 +-- net/netfilter/ipvs/ip_vs_proto_udp.c | 4 +-- net/netfilter/ipvs/ip_vs_xmit.c | 18 +++++++----- 8 files changed, 54 insertions(+), 92 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 2f88d5942332..b7bbd6c28cfa 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -136,24 +136,24 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, if (net_ratelimit()) \ printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \ } while (0) -#define IP_VS_DBG_PKT(level, pp, skb, ofs, msg) \ +#define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level()) \ - pp->debug_packet(pp, skb, ofs, msg); \ + pp->debug_packet(af, pp, skb, ofs, msg); \ } while (0) -#define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg) \ +#define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level() && \ net_ratelimit()) \ - pp->debug_packet(pp, skb, ofs, msg); \ + pp->debug_packet(af, pp, skb, ofs, msg); \ } while (0) #else /* NO DEBUGGING at ALL */ #define IP_VS_DBG_BUF(level, msg...) do {} while (0) #define IP_VS_ERR_BUF(msg...) do {} while (0) #define IP_VS_DBG(level, msg...) do {} while (0) #define IP_VS_DBG_RL(msg...)
do {} while (0) -#define IP_VS_DBG_PKT(level, pp, skb, ofs, msg) do {} while (0) -#define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg) do {} while (0) +#define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) do {} while (0) +#define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) do {} while (0) #endif #define IP_VS_BUG() BUG() @@ -345,7 +345,7 @@ struct ip_vs_protocol { int (*app_conn_bind)(struct ip_vs_conn *cp); - void (*debug_packet)(struct ip_vs_protocol *pp, + void (*debug_packet)(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg); @@ -828,7 +828,8 @@ extern int ip_vs_set_state_timeout(int *table, int num, const char *const *names, const char *name, int to); extern void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, +ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, + const struct sk_buff *skb, int offset, const char *msg); extern struct ip_vs_protocol ip_vs_protocol_tcp; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5fbcf67af8ec..b4e51e9c5a04 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -365,7 +365,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, * with persistence the connection is created on SYN+ACK. 
*/ if (pptr[0] == FTPDATA) { - IP_VS_DBG_PKT(12, pp, skb, 0, "Not scheduling FTPDATA"); + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling FTPDATA"); return NULL; } @@ -376,7 +377,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) && (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) { - IP_VS_DBG_PKT(12, pp, skb, 0, + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, "Not scheduling reply for existing connection"); __ip_vs_conn_put(cp); return NULL; @@ -617,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, skb->ip_summed = CHECKSUM_UNNECESSARY; if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered outgoing ICMP"); else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered incoming ICMP"); } @@ -662,11 +663,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, skb->ip_summed = CHECKSUM_PARTIAL; if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMPv6"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMPv6"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); } #endif @@ -798,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking outgoing ICMP for"); offset += cih->ihl * 4; @@ -874,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, if 
(unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + "Checking outgoing ICMPv6 for"); offset += sizeof(struct ipv6hdr); @@ -922,7 +927,7 @@ static unsigned int handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int ihl) { - IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); if (!skb_make_writable(skb, ihl)) goto drop; @@ -967,7 +972,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_route_me_harder(skb, RTN_LOCAL) != 0) goto drop; - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); + IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); @@ -1117,7 +1122,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) } } } - IP_VS_DBG_PKT(12, pp, skb, 0, + IP_VS_DBG_PKT(12, af, pp, skb, 0, "ip_vs_out: packet continues traversal as normal"); return NF_ACCEPT; } @@ -1253,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking incoming ICMP for"); offset += cih->ihl * 4; @@ -1364,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + "Checking incoming ICMPv6 for"); offset += sizeof(struct ipv6hdr); @@ -1492,12 +1499,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (unlikely(!cp)) { /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, pp, skb, 0, + IP_VS_DBG_PKT(12, af, pp, skb, 0, 
"ip_vs_in: packet continues traversal as normal"); return NF_ACCEPT; } - IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 027f654799fe..c53998390877 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -172,8 +172,8 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, else if (ih->frag_off & htons(IP_OFFSET)) sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); else { - __be16 _ports[2], *pptr -; + __be16 _ports[2], *pptr; + pptr = skb_header_pointer(skb, offset + ih->ihl*4, sizeof(_ports), _ports); if (pptr == NULL) @@ -223,13 +223,13 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, +ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { #ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) + if (af == AF_INET6) ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); else #endif diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 8956ef33ea6c..3a0461117d3f 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -117,54 +117,6 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; } - -static void -ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "TRUNCATED"); - else - sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr); - - pr_debug("%s: %s %s\n", msg, pp->name, buf); -} - -#ifdef CONFIG_IP_VS_IPV6 -static void 
-ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct ipv6hdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "TRUNCATED"); - else - sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr); - - pr_debug("%s: %s %s\n", msg, pp->name, buf); -} -#endif - -static void -ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) - ah_esp_debug_packet_v6(pp, skb, offset, msg); - else -#endif - ah_esp_debug_packet_v4(pp, skb, offset, msg); -} - - static void ah_esp_init(struct ip_vs_protocol *pp) { /* nothing to do now */ @@ -195,7 +147,7 @@ struct ip_vs_protocol ip_vs_protocol_ah = { .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, + .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ .set_state_timeout = NULL, }; @@ -219,7 +171,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = { .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, + .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ }; #endif diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 9ab5232ce019..d254345bfda7 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -176,7 +176,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) if (val != cmp) { /* CRC failure, dump it. 
*/ - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 85d80a66b492..f6c5200e2146 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -300,7 +300,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - tcphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } @@ -311,7 +311,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - tcphoff, ip_hdr(skb)->protocol, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 5d21f08155ed..9d106a06bb0a 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -314,7 +314,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - udphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } @@ -325,7 +325,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - udphoff, ip_hdr(skb)->protocol, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 97b5361c036e..de04ea39cde8 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -543,7 +543,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { - IP_VS_DBG_RL_PKT(10, pp, skb, 0, "ip_vs_nat_xmit(): " + 
IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, + "ip_vs_nat_xmit(): " "stopping DNAT to local address"); goto tx_error_put; } @@ -552,7 +553,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) { - IP_VS_DBG_RL_PKT(1, pp, skb, 0, "ip_vs_nat_xmit(): " + IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to loopback address"); goto tx_error_put; } @@ -561,7 +562,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); + IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, + "ip_vs_nat_xmit(): frag needed for"); goto tx_error_put; } @@ -593,7 +595,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still @@ -654,7 +656,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { - IP_VS_DBG_RL_PKT(10, pp, skb, 0, + IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to local address"); goto tx_error_put; @@ -665,7 +667,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? 
*/ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { - IP_VS_DBG_RL_PKT(1, pp, skb, 0, + IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); goto tx_error_put; @@ -680,7 +682,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->dev = net->loopback_dev; } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): frag needed for"); goto tx_error_put; } @@ -706,7 +708,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, dst_release(&rt->dst); } - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still -- cgit v1.2.3-55-g7522 From b0aeef30433ea6854e985c2e9842fa19f51b95cc Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 11 Oct 2010 11:23:07 +0300 Subject: nf_nat: restrict ICMP translation for embedded header Skip ICMP translation of embedded protocol header if NAT bits are not set. Needed for IPVS to see the original embedded addresses because for IPVS traffic the IPS_SRC_NAT_BIT and IPS_DST_NAT_BIT bits are not set. It happens when IPVS performs DNAT for client packets after using nf_conntrack_alter_reply to expect replies from real server. 
Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/ipv4/netfilter/nf_nat_core.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index e2e00c4da883..0047923c1f22 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -462,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, return 0; } + if (manip == IP_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply dir. */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + if (!(ct->status & statusbit)) + return 1; + pr_debug("icmp_reply_translation: translating error %p manip %u " "dir %s\n", skb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); @@ -496,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, /* Change outer to look the reply to an incoming packet * (proto 0 means don't invert per-proto part). */ - if (manip == IP_NAT_MANIP_SRC) - statusbit = IPS_SRC_NAT; - else - statusbit = IPS_DST_NAT; - - /* Invert if this is reply dir. */ - if (dir == IP_CT_DIR_REPLY) - statusbit ^= IPS_NAT_MASK; - - if (ct->status & statusbit) { - nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - if (!manip_pkt(0, skb, 0, &target, manip)) - return 0; - } + nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + if (!manip_pkt(0, skb, 0, &target, manip)) + return 0; return 1; } -- cgit v1.2.3-55-g7522