summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r-- net/Makefile.objs 3
-rw-r--r-- net/colo-compare.c 781
-rw-r--r-- net/colo.c 211
-rw-r--r-- net/colo.h 88
-rw-r--r-- net/filter-rewriter.c 263
-rw-r--r-- net/net.c 9
-rw-r--r-- net/tap.c 4
7 files changed, 1356 insertions, 3 deletions
diff --git a/net/Makefile.objs b/net/Makefile.objs
index b7c22fddbf..2a80df5fa7 100644
--- a/net/Makefile.objs
+++ b/net/Makefile.objs
@@ -16,3 +16,6 @@ common-obj-$(CONFIG_NETMAP) += netmap.o
common-obj-y += filter.o
common-obj-y += filter-buffer.o
common-obj-y += filter-mirror.o
+common-obj-y += colo-compare.o
+common-obj-y += colo.o
+common-obj-y += filter-rewriter.o
diff --git a/net/colo-compare.c b/net/colo-compare.c
new file mode 100644
index 0000000000..22b1da19f5
--- /dev/null
+++ b/net/colo-compare.c
@@ -0,0 +1,781 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+#include "qemu-common.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/error.h"
+#include "net/net.h"
+#include "net/eth.h"
+#include "qom/object_interfaces.h"
+#include "qemu/iov.h"
+#include "qom/object.h"
+#include "qemu/typedefs.h"
+#include "net/queue.h"
+#include "sysemu/char.h"
+#include "qemu/sockets.h"
+#include "qapi-visit.h"
+#include "net/colo.h"
+
+#define TYPE_COLO_COMPARE "colo-compare"
+#define COLO_COMPARE(obj) \
+ OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE)
+
+#define COMPARE_READ_LEN_MAX NET_BUFSIZE
+#define MAX_QUEUE_SIZE 1024
+
+/* TODO: Should be configurable */
+#define REGULAR_PACKET_CHECK_MS 3000
+
+/*
+  + CompareState ++
+  |               |
+  +---------------+   +---------------+         +---------------+
+  |conn list      +--->conn           +--------->conn           |
+  +---------------+   +---------------+         +---------------+
+  |               |     |           |             |          |
+  +---------------+ +---v----+  +---v----+    +---v----+ +---v----+
+                    |primary |  |secondary    |primary | |secondary
+                    |packet  |  |packet  +    |packet  | |packet  +
+                    +--------+  +--------+    +--------+ +--------+
+                        |           |             |          |
+                    +---v----+  +---v----+    +---v----+ +---v----+
+                    |primary |  |secondary    |primary | |secondary
+                    |packet  |  |packet  +    |packet  | |packet  +
+                    +--------+  +--------+    +--------+ +--------+
+                        |           |             |          |
+                    +---v----+  +---v----+    +---v----+ +---v----+
+                    |primary |  |secondary    |primary | |secondary
+                    |packet  |  |packet  +    |packet  | |packet  +
+                    +--------+  +--------+    +--------+ +--------+
+*/
+/* Per-instance state of a "colo-compare" object. */
+typedef struct CompareState {
+ Object parent;
+
+ /* chardev ids set via the "primary_in", "secondary_in", "outdev" properties */
+ char *pri_indev;
+ char *sec_indev;
+ char *outdev;
+ /* chardevs resolved from the ids above in colo_compare_complete() */
+ CharDriverState *chr_pri_in;
+ CharDriverState *chr_sec_in;
+ CharDriverState *chr_out;
+ /* read state fed through net_fill_rstate() for each input chardev */
+ SocketReadState pri_rs;
+ SocketReadState sec_rs;
+
+ /* connection list: the connections that belong to this NIC can be found
+ * in this list.
+ * element type: Connection
+ */
+ GQueue conn_list;
+ /* hashtable to save connection */
+ GHashTable *connection_track_table;
+ /* compare thread, a thread for each NIC */
+ QemuThread thread;
+ /* Timer used on the primary to find packets that are never matched */
+ QEMUTimer *timer;
+ /*
+ * Held around primary_list updates in colo_compare_connection() and
+ * around the old-packet scan in check_old_packet_regular().
+ */
+ QemuMutex timer_check_lock;
+} CompareState;
+
+/* Class structure for colo-compare; adds nothing beyond ObjectClass. */
+typedef struct CompareClass {
+ ObjectClass parent_class;
+} CompareClass;
+
+/* Result of scanning a chardev's options in compare_chardev_opts(). */
+typedef struct CompareChardevProps {
+ /* set when the option "backend" has the value "socket" */
+ bool is_socket;
+} CompareChardevProps;
+
+/* Selects which input (read state and per-connection queue) a packet uses. */
+enum {
+ PRIMARY_IN = 0,
+ SECONDARY_IN,
+};
+
+/* Forward declaration; defined after colo_compare_connection(). */
+static int compare_chr_send(CharDriverState *out,
+ const uint8_t *buf,
+ uint32_t size);
+
+/*
+ * Copy the packet currently held in the primary or secondary read state
+ * (selected by @mode: PRIMARY_IN or SECONDARY_IN) and append it to the
+ * matching queue of the connection it belongs to.
+ *
+ * Return 0 when the packet was consumed here (queued, or dropped because
+ * the connection queue is full). Return -1 when parse_packet_early()
+ * rejects the packet; the copy is freed and the caller decides what to do
+ * with the raw buffer.
+ */
+static int packet_enqueue(CompareState *s, int mode)
+{
+    ConnectionKey key;
+    Packet *pkt = NULL;
+    Connection *conn;
+
+    if (mode == PRIMARY_IN) {
+        pkt = packet_new(s->pri_rs.buf, s->pri_rs.packet_len);
+    } else {
+        pkt = packet_new(s->sec_rs.buf, s->sec_rs.packet_len);
+    }
+
+    if (parse_packet_early(pkt)) {
+        packet_destroy(pkt, NULL);
+        pkt = NULL;
+        return -1;
+    }
+    fill_connection_key(pkt, &key);
+
+    conn = connection_get(s->connection_track_table,
+                          &key,
+                          &s->conn_list);
+
+    /* Put the connection on the compare list the first time it is seen */
+    if (!conn->processing) {
+        g_queue_push_tail(&s->conn_list, conn);
+        conn->processing = true;
+    }
+
+    if (mode == PRIMARY_IN) {
+        if (g_queue_get_length(&conn->primary_list) <=
+                               MAX_QUEUE_SIZE) {
+            g_queue_push_tail(&conn->primary_list, pkt);
+        } else {
+            error_report("colo compare primary queue size too big,"
+                         "drop packet");
+            /* No queue owns the dropped packet, so release it here */
+            packet_destroy(pkt, NULL);
+        }
+    } else {
+        if (g_queue_get_length(&conn->secondary_list) <=
+                               MAX_QUEUE_SIZE) {
+            g_queue_push_tail(&conn->secondary_list, pkt);
+        } else {
+            error_report("colo compare secondary queue size too big,"
+                         "drop packet");
+            /* No queue owns the dropped packet, so release it here */
+            packet_destroy(pkt, NULL);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * The IP packets sent by primary and secondary
+ * will be compared in here
+ * TODO support ip fragment, Out-Of-Order
+ * return:    0  means packet same
+ *            > 0 || < 0 means packet different
+ */
+static int colo_packet_compare(Packet *ppkt, Packet *spkt)
+{
+    /*
+     * inet_ntoa() formats into a single static buffer, so every result
+     * must be copied out before the next call; passing the four return
+     * values directly would trace the same address four times.
+     */
+    char pri_src[sizeof("255.255.255.255")];
+    char pri_dst[sizeof("255.255.255.255")];
+    char sec_src[sizeof("255.255.255.255")];
+    char sec_dst[sizeof("255.255.255.255")];
+
+    snprintf(pri_src, sizeof(pri_src), "%s", inet_ntoa(ppkt->ip->ip_src));
+    snprintf(pri_dst, sizeof(pri_dst), "%s", inet_ntoa(ppkt->ip->ip_dst));
+    snprintf(sec_src, sizeof(sec_src), "%s", inet_ntoa(spkt->ip->ip_src));
+    snprintf(sec_dst, sizeof(sec_dst), "%s", inet_ntoa(spkt->ip->ip_dst));
+
+    trace_colo_compare_ip_info(ppkt->size, pri_src, pri_dst,
+                               spkt->size, sec_src, sec_dst);
+
+    /* Packets of different length can never be identical */
+    if (ppkt->size == spkt->size) {
+        return memcmp(ppkt->data, spkt->data, spkt->size);
+    } else {
+        return -1;
+    }
+}
+
+/*
+ * Called from the compare thread on the primary
+ * for compare tcp packet
+ * compare_tcp copied from Dr. David Alan Gilbert's branch
+ *
+ * Used as a GCompareFunc: returns 0 when the two packets are considered
+ * equal from the IP header onwards, non-zero otherwise.
+ * Note that the secondary packet's ip_id/ip_sum may be overwritten.
+ */
+static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
+{
+    struct tcphdr *ptcp, *stcp;
+    int res;
+    char *sdebug, *ddebug;
+
+    trace_colo_compare_main("compare tcp");
+    if (ppkt->size != spkt->size) {
+        if (trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
+            trace_colo_compare_main("pkt size not same");
+        }
+        return -1;
+    }
+
+    ptcp = (struct tcphdr *)ppkt->transport_header;
+    stcp = (struct tcphdr *)spkt->transport_header;
+
+    /*
+     * The 'identification' field in the IP header is *very* random
+     * it almost never matches.  Fudge this by ignoring differences in
+     * unfragmented packets; they'll normally sort themselves out if different
+     * anyway, and it should recover at the TCP level.
+     * An alternative would be to get both the primary and secondary to rewrite
+     * somehow; but that would need some sync traffic to sync the state
+     */
+    if (ntohs(ppkt->ip->ip_off) & IP_DF) {
+        spkt->ip->ip_id = ppkt->ip->ip_id;
+        /* and the sum will be different if the IDs were different */
+        spkt->ip->ip_sum = ppkt->ip->ip_sum;
+    }
+
+    /* The Ethernet header is skipped; only IP header + payload count */
+    res = memcmp(ppkt->data + ETH_HLEN, spkt->data + ETH_HLEN,
+                (spkt->size - ETH_HLEN));
+
+    if (res != 0 && trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
+        /*
+         * g_strdup() so that the copies are released by the matching
+         * allocator in g_free() below; the copies are also needed because
+         * inet_ntoa() reuses one static buffer.
+         */
+        sdebug = g_strdup(inet_ntoa(ppkt->ip->ip_src));
+        ddebug = g_strdup(inet_ntoa(ppkt->ip->ip_dst));
+        fprintf(stderr, "%s: src/dst: %s/%s p: seq/ack=%u/%u"
+                " s: seq/ack=%u/%u res=%d flags=%x/%x\n",
+                __func__, sdebug, ddebug,
+                (unsigned int)ntohl(ptcp->th_seq),
+                (unsigned int)ntohl(ptcp->th_ack),
+                (unsigned int)ntohl(stcp->th_seq),
+                (unsigned int)ntohl(stcp->th_ack),
+                res, ptcp->th_flags, stcp->th_flags);
+
+        fprintf(stderr, "Primary len = %d\n", ppkt->size);
+        qemu_hexdump((char *)ppkt->data, stderr, "colo-compare", ppkt->size);
+        fprintf(stderr, "Secondary len = %d\n", spkt->size);
+        qemu_hexdump((char *)spkt->data, stderr, "colo-compare", spkt->size);
+
+        g_free(sdebug);
+        g_free(ddebug);
+    }
+
+    return res;
+}
+
+/*
+ * UDP comparison callback (GCompareFunc shape: secondary packet first,
+ * primary packet second). Delegates to colo_packet_compare() and dumps
+ * both packets to stderr when they differ.
+ * Returns 0 when the packets match, non-zero otherwise.
+ */
+static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
+{
+    int diff;
+
+    trace_colo_compare_main("compare udp");
+
+    diff = colo_packet_compare(ppkt, spkt);
+    if (diff == 0) {
+        return 0;
+    }
+
+    /* Mismatch: record sizes and dump both payloads for debugging */
+    trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
+    qemu_hexdump((char *)ppkt->data, stderr, "colo-compare", ppkt->size);
+    trace_colo_compare_udp_miscompare("Secondary pkt size", spkt->size);
+    qemu_hexdump((char *)spkt->data, stderr, "colo-compare", spkt->size);
+
+    return diff;
+}
+
+/*
+ * ICMP comparison callback (GCompareFunc shape: secondary packet first,
+ * primary packet second).
+ * Returns 0 when both packets have the same size, are long enough to hold
+ * the Ethernet and IP headers, and compare equal; -1 otherwise.
+ */
+static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt)
+{
+    int ip_hdr_len;
+
+    trace_colo_compare_main("compare icmp");
+
+    ip_hdr_len = ppkt->ip->ip_hl * 4;
+    if (ppkt->size != spkt->size) {
+        return -1;
+    }
+    if (ppkt->size < ip_hdr_len + ETH_HLEN) {
+        return -1;
+    }
+
+    if (!colo_packet_compare(ppkt, spkt)) {
+        return 0;
+    }
+
+    /* Mismatch: record sizes and dump both payloads for debugging */
+    trace_colo_compare_icmp_miscompare("primary pkt size",
+                                       ppkt->size);
+    qemu_hexdump((char *)ppkt->data, stderr, "colo-compare",
+                 ppkt->size);
+    trace_colo_compare_icmp_miscompare("Secondary pkt size",
+                                       spkt->size);
+    qemu_hexdump((char *)spkt->data, stderr, "colo-compare",
+                 spkt->size);
+    return -1;
+}
+
+/*
+ * Called from the compare thread on the primary
+ * for compare other packet
+ *
+ * Used as a GCompareFunc: returns 0 when the packets match, non-zero
+ * otherwise.
+ */
+static int colo_packet_compare_other(Packet *spkt, Packet *ppkt)
+{
+    /*
+     * inet_ntoa() formats into a single static buffer, so every result
+     * must be copied out before the next call; passing the four return
+     * values directly would trace the same address four times.
+     */
+    char pri_src[sizeof("255.255.255.255")];
+    char pri_dst[sizeof("255.255.255.255")];
+    char sec_src[sizeof("255.255.255.255")];
+    char sec_dst[sizeof("255.255.255.255")];
+
+    trace_colo_compare_main("compare other");
+
+    snprintf(pri_src, sizeof(pri_src), "%s", inet_ntoa(ppkt->ip->ip_src));
+    snprintf(pri_dst, sizeof(pri_dst), "%s", inet_ntoa(ppkt->ip->ip_dst));
+    snprintf(sec_src, sizeof(sec_src), "%s", inet_ntoa(spkt->ip->ip_src));
+    snprintf(sec_dst, sizeof(sec_dst), "%s", inet_ntoa(spkt->ip->ip_dst));
+
+    trace_colo_compare_ip_info(ppkt->size, pri_src, pri_dst,
+                               spkt->size, sec_src, sec_dst);
+    return colo_packet_compare(ppkt, spkt);
+}
+
+/*
+ * GCompareFunc-style predicate: returns 0 (a "match" for
+ * g_queue_find_custom) when @pkt was created more than *@check_time
+ * milliseconds ago on the host clock, 1 otherwise.
+ */
+static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)
+{
+    int64_t age_ms = qemu_clock_get_ms(QEMU_CLOCK_HOST) - pkt->creation_ms;
+
+    if (age_ms <= *check_time) {
+        return 1;
+    }
+
+    trace_colo_old_packet_check_found(pkt->creation_ms);
+    return 0;
+}
+
+/*
+ * GFunc for g_queue_foreach over the connection list: looks in the
+ * connection's primary queue for a packet older than
+ * REGULAR_PACKET_CHECK_MS. Nothing is done with the result yet.
+ */
+static void colo_old_packet_check_one_conn(void *opaque,
+                                           void *user_data)
+{
+    Connection *conn = opaque;
+    int64_t max_age_ms = REGULAR_PACKET_CHECK_MS;
+    GList *stale = g_queue_find_custom(&conn->primary_list,
+                                       &max_age_ms,
+                                       (GCompareFunc)colo_old_packet_check_one);
+
+    if (stale) {
+        /* do checkpoint will flush old packet */
+        /* TODO: colo_notify_checkpoint();*/
+    }
+}
+
+/*
+ * Walk every tracked connection and look for primary packets that have
+ * waited too long for a matching secondary packet; such packets are meant
+ * to trigger a checkpoint so the secondary catches up.
+ */
+static void colo_old_packet_check(void *opaque)
+{
+    CompareState *state = opaque;
+    GQueue *connections = &state->conn_list;
+
+    g_queue_foreach(connections, colo_old_packet_check_one_conn, NULL);
+}
+
+/*
+ * Called from the compare thread on the primary
+ * for compare connection
+ *
+ * GFunc for g_queue_foreach: @opaque is the Connection, @user_data the
+ * CompareState. While both queues hold packets, take the primary packet
+ * at the tail and search the secondary queue for a match using the
+ * comparison routine for the connection's IP protocol. On a match the
+ * primary packet is sent to the output chardev and both packets are
+ * released; otherwise the primary packet is queued again and the loop
+ * stops.
+ */
+static void colo_compare_connection(void *opaque, void *user_data)
+{
+    CompareState *s = user_data;
+    Connection *conn = opaque;
+    Packet *pkt = NULL;
+    Packet *spkt = NULL;
+    GList *result = NULL;
+    GCompareFunc compare;
+    int ret;
+
+    /* The protocol is fixed per connection, so pick the comparator once */
+    switch (conn->ip_proto) {
+    case IPPROTO_TCP:
+        compare = (GCompareFunc)colo_packet_compare_tcp;
+        break;
+    case IPPROTO_UDP:
+        compare = (GCompareFunc)colo_packet_compare_udp;
+        break;
+    case IPPROTO_ICMP:
+        compare = (GCompareFunc)colo_packet_compare_icmp;
+        break;
+    default:
+        compare = (GCompareFunc)colo_packet_compare_other;
+        break;
+    }
+
+    while (!g_queue_is_empty(&conn->primary_list) &&
+           !g_queue_is_empty(&conn->secondary_list)) {
+        qemu_mutex_lock(&s->timer_check_lock);
+        pkt = g_queue_pop_tail(&conn->primary_list);
+        qemu_mutex_unlock(&s->timer_check_lock);
+
+        result = g_queue_find_custom(&conn->secondary_list, pkt, compare);
+
+        if (result) {
+            ret = compare_chr_send(s->chr_out, pkt->data, pkt->size);
+            if (ret < 0) {
+                error_report("colo_send_primary_packet failed");
+            }
+            trace_colo_compare_main("packet same and release packet");
+            /*
+             * Unlinking alone would leak the secondary packet: nothing
+             * else references it once it has left the queue.
+             */
+            spkt = result->data;
+            g_queue_delete_link(&conn->secondary_list, result);
+            packet_destroy(spkt, NULL);
+            packet_destroy(pkt, NULL);
+        } else {
+            /*
+             * If one packet arrive late, the secondary_list or
+             * primary_list will be empty, so we can't compare it
+             * until next comparison.
+             */
+            trace_colo_compare_main("packet different");
+            qemu_mutex_lock(&s->timer_check_lock);
+            g_queue_push_tail(&conn->primary_list, pkt);
+            qemu_mutex_unlock(&s->timer_check_lock);
+            /* TODO: colo_notify_checkpoint();*/
+            break;
+        }
+    }
+}
+
+/*
+ * Write one packet to @out as a 32-bit big-endian length followed by
+ * @size bytes from @buf. A zero @size writes nothing.
+ *
+ * Returns 0 on success (or when @size is 0); on a failed or short write
+ * returns the negative value reported by the write, or -EIO.
+ */
+static int compare_chr_send(CharDriverState *out,
+                            const uint8_t *buf,
+                            uint32_t size)
+{
+    uint32_t len = htonl(size);
+    int ret;
+
+    if (size == 0) {
+        return 0;
+    }
+
+    ret = qemu_chr_fe_write_all(out, (uint8_t *)&len, sizeof(len));
+    if (ret == sizeof(len)) {
+        ret = qemu_chr_fe_write_all(out, (uint8_t *)buf, size);
+        if (ret == size) {
+            return 0;
+        }
+    }
+
+    return ret < 0 ? ret : -EIO;
+}
+
+/*
+ * can-read callback registered for both input chardevs: always reports
+ * COMPARE_READ_LEN_MAX bytes; @opaque is unused.
+ */
+static int compare_chr_can_read(void *opaque)
+{
+ return COMPARE_READ_LEN_MAX;
+}
+
+/*
+ * Read callback for the primary input chardev (registered on the worker
+ * GMainContext in colo_compare_thread()). Feeds the received bytes into
+ * the primary read state; if net_fill_rstate() reports -1 the chardev's
+ * handlers are removed and an error is reported.
+ */
+static void compare_pri_chr_in(void *opaque, const uint8_t *buf, int size)
+{
+    CompareState *s = COLO_COMPARE(opaque);
+
+    if (net_fill_rstate(&s->pri_rs, buf, size) != -1) {
+        return;
+    }
+
+    qemu_chr_add_handlers(s->chr_pri_in, NULL, NULL, NULL, NULL);
+    error_report("colo-compare primary_in error");
+}
+
+/*
+ * Read callback for the secondary input chardev (registered on the worker
+ * GMainContext in colo_compare_thread()). Feeds the received bytes into
+ * the secondary read state; if net_fill_rstate() reports -1 the chardev's
+ * handlers are removed and an error is reported.
+ */
+static void compare_sec_chr_in(void *opaque, const uint8_t *buf, int size)
+{
+    CompareState *s = COLO_COMPARE(opaque);
+
+    if (net_fill_rstate(&s->sec_rs, buf, size) != -1) {
+        return;
+    }
+
+    qemu_chr_add_handlers(s->chr_sec_in, NULL, NULL, NULL, NULL);
+    error_report("colo-compare secondary_in error");
+}
+
+/*
+ * Thread body: creates a private GMainContext, attaches the read handlers
+ * of both input chardevs to it and runs a GMainLoop on that context.
+ * The loop and context are released once g_main_loop_run() returns.
+ * Always returns NULL.
+ */
+static void *colo_compare_thread(void *opaque)
+{
+    CompareState *s = opaque;
+    GMainContext *ctx = g_main_context_new();
+    GMainLoop *loop;
+
+    qemu_chr_add_handlers_full(s->chr_pri_in, compare_chr_can_read,
+                               compare_pri_chr_in, NULL, s, ctx);
+    qemu_chr_add_handlers_full(s->chr_sec_in, compare_chr_can_read,
+                               compare_sec_chr_in, NULL, s, ctx);
+
+    loop = g_main_loop_new(ctx, FALSE);
+    g_main_loop_run(loop);
+
+    g_main_loop_unref(loop);
+    g_main_context_unref(ctx);
+    return NULL;
+}
+
+/* "primary_in" getter: returns a newly allocated copy of the chardev id. */
+static char *compare_get_pri_indev(Object *obj, Error **errp)
+{
+    return g_strdup(COLO_COMPARE(obj)->pri_indev);
+}
+
+/* "primary_in" setter: frees the old id and stores a copy of @value. */
+static void compare_set_pri_indev(Object *obj, const char *value, Error **errp)
+{
+    char **slot = &COLO_COMPARE(obj)->pri_indev;
+
+    g_free(*slot);
+    *slot = g_strdup(value);
+}
+
+/* "secondary_in" getter: returns a newly allocated copy of the chardev id. */
+static char *compare_get_sec_indev(Object *obj, Error **errp)
+{
+    return g_strdup(COLO_COMPARE(obj)->sec_indev);
+}
+
+/* "secondary_in" setter: frees the old id and stores a copy of @value. */
+static void compare_set_sec_indev(Object *obj, const char *value, Error **errp)
+{
+    char **slot = &COLO_COMPARE(obj)->sec_indev;
+
+    g_free(*slot);
+    *slot = g_strdup(value);
+}
+
+/* "outdev" getter: returns a newly allocated copy of the chardev id. */
+static char *compare_get_outdev(Object *obj, Error **errp)
+{
+    return g_strdup(COLO_COMPARE(obj)->outdev);
+}
+
+/* "outdev" setter: frees the old id and stores a copy of @value. */
+static void compare_set_outdev(Object *obj, const char *value, Error **errp)
+{
+    char **slot = &COLO_COMPARE(obj)->outdev;
+
+    g_free(*slot);
+    *slot = g_strdup(value);
+}
+
+/*
+ * Finalize callback of the primary read state (installed with
+ * net_socket_rs_init()). Queues the packet and runs the comparison over
+ * all connections; a packet rejected by packet_enqueue() is forwarded to
+ * the output chardev unchanged.
+ */
+static void compare_pri_rs_finalize(SocketReadState *pri_rs)
+{
+    CompareState *s = container_of(pri_rs, CompareState, pri_rs);
+
+    if (packet_enqueue(s, PRIMARY_IN) == 0) {
+        /* compare connection */
+        g_queue_foreach(&s->conn_list, colo_compare_connection, s);
+        return;
+    }
+
+    trace_colo_compare_main("primary: unsupported packet in");
+    compare_chr_send(s->chr_out, pri_rs->buf, pri_rs->packet_len);
+}
+
+/*
+ * Finalize callback of the secondary read state (installed with
+ * net_socket_rs_init()). Queues the packet and runs the comparison over
+ * all connections; a packet rejected by packet_enqueue() is only traced.
+ */
+static void compare_sec_rs_finalize(SocketReadState *sec_rs)
+{
+    CompareState *s = container_of(sec_rs, CompareState, sec_rs);
+
+    if (packet_enqueue(s, SECONDARY_IN) == 0) {
+        /* compare connection */
+        g_queue_foreach(&s->conn_list, colo_compare_connection, s);
+        return;
+    }
+
+    trace_colo_compare_main("secondary: unsupported packet in");
+}
+
+/*
+ * qemu_opt_foreach() callback that validates one chardev option.
+ * "backend=socket" marks the chardev as a socket; "host", "port",
+ * "server", "wait" and "path" are accepted with any value. Every other
+ * option sets @errp and returns -1. Returns 0 for accepted options.
+ */
+static int compare_chardev_opts(void *opaque,
+                                const char *name, const char *value,
+                                Error **errp)
+{
+    static const char *const accepted[] = {
+        "host", "port", "server", "wait", "path",
+    };
+    CompareChardevProps *props = opaque;
+    size_t i;
+
+    if (!strcmp(name, "backend") && !strcmp(value, "socket")) {
+        props->is_socket = true;
+        return 0;
+    }
+
+    for (i = 0; i < sizeof(accepted) / sizeof(accepted[0]); i++) {
+        if (!strcmp(name, accepted[i])) {
+            return 0;
+        }
+    }
+
+    error_setg(errp,
+               "COLO-compare does not support a chardev with option %s=%s",
+               name, value);
+    return -1;
+}
+
+/*
+ * Look up the chardev named @chr_name, store it in *@chr and check that
+ * its options describe a socket backend with only supported options.
+ *
+ * Returns 0 on success, 1 on failure (with @errp set).
+ */
+static int find_and_check_chardev(CharDriverState **chr,
+                                  char *chr_name,
+                                  Error **errp)
+{
+    CompareChardevProps props = { .is_socket = false };
+    CharDriverState *found = qemu_chr_find(chr_name);
+
+    *chr = found;
+    if (!found) {
+        error_setg(errp, "Device '%s' not found",
+                   chr_name);
+        return 1;
+    }
+
+    if (qemu_opt_foreach(found->opts, compare_chardev_opts, &props, errp)) {
+        return 1;
+    }
+
+    if (props.is_socket) {
+        return 0;
+    }
+
+    error_setg(errp, "chardev \"%s\" is not a tcp socket",
+               chr_name);
+    return 1;
+}
+
+/*
+ * Timer callback: re-arms itself REGULAR_PACKET_CHECK_MS ahead on the
+ * virtual clock, then scans all connections for primary packets the
+ * secondary has not produced an equivalent of.
+ */
+static void check_old_packet_regular(void *opaque)
+{
+    CompareState *s = opaque;
+    int64_t next_run = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+                       REGULAR_PACKET_CHECK_MS;
+
+    timer_mod(s->timer, next_run);
+
+    /* if have old packet we will notify checkpoint */
+    /*
+     * TODO: Make timer handler run in compare thread
+     * like qemu_chr_add_handlers_full.
+     */
+    qemu_mutex_lock(&s->timer_check_lock);
+    colo_old_packet_check(s);
+    qemu_mutex_unlock(&s->timer_check_lock);
+}
+
+/*
+ * Called from the main thread on the primary
+ * to setup colo-compare.
+ *
+ * Validates the three chardev properties, resolves and claims the
+ * chardevs, initialises the read states, connection list, lock and
+ * connection table, starts the compare thread and arms the periodic
+ * old-packet timer. On a validation failure @errp is set and nothing is
+ * started.
+ */
+static void colo_compare_complete(UserCreatable *uc, Error **errp)
+{
+    CompareState *s = COLO_COMPARE(uc);
+    char thread_name[64];
+    static int compare_id;
+
+    if (!s->pri_indev || !s->sec_indev || !s->outdev) {
+        error_setg(errp, "colo compare needs 'primary_in' ,"
+                   "'secondary_in','outdev' property set");
+        return;
+    } else if (!strcmp(s->pri_indev, s->outdev) ||
+               !strcmp(s->sec_indev, s->outdev) ||
+               !strcmp(s->pri_indev, s->sec_indev)) {
+        error_setg(errp, "'indev' and 'outdev' could not be same "
+                   "for compare module");
+        return;
+    }
+
+    /* Stops at the first chardev that cannot be found or is unsuitable */
+    if (find_and_check_chardev(&s->chr_pri_in, s->pri_indev, errp) ||
+        find_and_check_chardev(&s->chr_sec_in, s->sec_indev, errp) ||
+        find_and_check_chardev(&s->chr_out, s->outdev, errp)) {
+        return;
+    }
+
+    qemu_chr_fe_claim_no_fail(s->chr_pri_in);
+    qemu_chr_fe_claim_no_fail(s->chr_sec_in);
+    qemu_chr_fe_claim_no_fail(s->chr_out);
+
+    net_socket_rs_init(&s->pri_rs, compare_pri_rs_finalize);
+    net_socket_rs_init(&s->sec_rs, compare_sec_rs_finalize);
+
+    g_queue_init(&s->conn_list);
+    qemu_mutex_init(&s->timer_check_lock);
+
+    s->connection_track_table = g_hash_table_new_full(connection_key_hash,
+                                                      connection_key_equal,
+                                                      g_free,
+                                                      connection_destroy);
+
+    /* Bounded formatting: never write past thread_name */
+    snprintf(thread_name, sizeof(thread_name), "colo-compare %d", compare_id);
+    qemu_thread_create(&s->thread, thread_name,
+                       colo_compare_thread, s,
+                       QEMU_THREAD_JOINABLE);
+    compare_id++;
+
+    /* A regular timer to kick any packets that the secondary doesn't match */
+    s->timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, /* Only when guest runs */
+                            check_old_packet_regular, s);
+    timer_mod(s->timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+                        REGULAR_PACKET_CHECK_MS);
+}
+
+/* Class initialiser: installs colo_compare_complete() as the complete hook. */
+static void colo_compare_class_init(ObjectClass *oc, void *data)
+{
+ UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
+
+ ucc->complete = colo_compare_complete;
+}
+
+/*
+ * Instance initialiser: registers the string properties "primary_in",
+ * "secondary_in" and "outdev" with their getters and setters.
+ */
+static void colo_compare_init(Object *obj)
+{
+ object_property_add_str(obj, "primary_in",
+ compare_get_pri_indev, compare_set_pri_indev,
+ NULL);
+ object_property_add_str(obj, "secondary_in",
+ compare_get_sec_indev, compare_set_sec_indev,
+ NULL);
+ object_property_add_str(obj, "outdev",
+ compare_get_outdev, compare_set_outdev,
+ NULL);
+}
+
+/*
+ * Instance finaliser: detaches and releases the chardevs, stops the
+ * periodic timer, drops the connection list and connection table, and
+ * frees the property strings.
+ */
+static void colo_compare_finalize(Object *obj)
+{
+    CompareState *s = COLO_COMPARE(obj);
+
+    if (s->chr_pri_in) {
+        qemu_chr_add_handlers(s->chr_pri_in, NULL, NULL, NULL, NULL);
+        qemu_chr_fe_release(s->chr_pri_in);
+    }
+    if (s->chr_sec_in) {
+        qemu_chr_add_handlers(s->chr_sec_in, NULL, NULL, NULL, NULL);
+        qemu_chr_fe_release(s->chr_sec_in);
+    }
+    if (s->chr_out) {
+        qemu_chr_fe_release(s->chr_out);
+    }
+
+    /* Stop the periodic scan before the lists it walks are torn down */
+    if (s->timer) {
+        timer_del(s->timer);
+        timer_free(s->timer);
+        s->timer = NULL;
+    }
+
+    /*
+     * The final flush has to run while conn_list is still intact, i.e.
+     * before the list is cleared below.
+     * NOTE(review): joining a thread from itself cannot succeed, and the
+     * compare thread's main loop has no quit path visible here; the
+     * condition is kept as-is and needs a proper shutdown mechanism.
+     */
+    if (qemu_thread_is_self(&s->thread)) {
+        /* compare connection */
+        g_queue_foreach(&s->conn_list, colo_compare_connection, s);
+        qemu_thread_join(&s->thread);
+    }
+
+    /*
+     * conn_list is embedded in CompareState, so only its links may be
+     * released (g_queue_clear); g_queue_free() would try to free the
+     * embedded GQueue itself. The Connection objects are owned by the
+     * hash table, whose value destructor (connection_destroy) frees them.
+     */
+    g_queue_clear(&s->conn_list);
+    if (s->connection_track_table) {
+        g_hash_table_destroy(s->connection_track_table);
+        s->connection_track_table = NULL;
+    }
+
+    qemu_mutex_destroy(&s->timer_check_lock);
+
+    g_free(s->pri_indev);
+    g_free(s->sec_indev);
+    g_free(s->outdev);
+}
+
+/* QOM type description: a user-creatable TYPE_OBJECT subtype. */
+static const TypeInfo colo_compare_info = {
+ .name = TYPE_COLO_COMPARE,
+ .parent = TYPE_OBJECT,
+ .instance_size = sizeof(CompareState),
+ .instance_init = colo_compare_init,
+ .instance_finalize = colo_compare_finalize,
+ .class_size = sizeof(CompareClass),
+ .class_init = colo_compare_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_USER_CREATABLE },
+ { }
+ }
+};
+
+/* Registers the colo-compare type; hooked up through type_init(). */
+static void register_types(void)
+{
+ type_register_static(&colo_compare_info);
+}
+
+type_init(register_types);
diff --git a/net/colo.c b/net/colo.c
new file mode 100644
index 0000000000..6a6eacd2dc
--- /dev/null
+++ b/net/colo.c
@@ -0,0 +1,211 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "trace.h"
+#include "net/colo.h"
+
+/*
+ * Hash function for the connection table: mixes source and destination
+ * address, both ports and the IP protocol of a ConnectionKey with the
+ * Jenkins hash primitives.
+ */
+uint32_t connection_key_hash(const void *opaque)
+{
+    const ConnectionKey *key = opaque;
+    uint32_t a, b, c;
+
+    /* Jenkins hash */
+    a = b = c = JHASH_INITVAL + sizeof(*key);
+    a += key->src.s_addr;
+    b += key->dst.s_addr;
+    /*
+     * Widen to uint32_t before shifting: a uint16_t is promoted to a
+     * signed int, and shifting a port >= 0x8000 left by 16 would overflow.
+     */
+    c += (uint32_t)key->src_port | ((uint32_t)key->dst_port << 16);
+    __jhash_mix(a, b, c);
+
+    a += key->ip_proto;
+    __jhash_final(a, b, c);
+
+    return c;
+}
+
+/*
+ * Equality function for the connection table: non-zero when the two
+ * ConnectionKey objects are byte-wise identical.
+ */
+int connection_key_equal(const void *key1, const void *key2)
+{
+    return !memcmp(key1, key2, sizeof(ConnectionKey));
+}
+
+/*
+ * Locate the network and transport headers of an Ethernet frame.
+ *
+ * Returns 0 when @pkt is an untagged IPv4 frame long enough to contain
+ * its IP header; pkt->network_header and pkt->transport_header are then
+ * set. Returns 1 for frames that are too short, VLAN tagged, or not IPv4.
+ */
+int parse_packet_early(Packet *pkt)
+{
+    int network_length;
+    static const uint8_t vlan[] = {0x81, 0x00};
+    uint8_t *data = pkt->data;
+    uint16_t l3_proto;
+    ssize_t l2hdr_len;
+
+    if (pkt->size < ETH_HLEN) {
+        trace_colo_proxy_main("pkt->size < ETH_HLEN");
+        return 1;
+    }
+
+    /*
+     * TODO: support vlan.
+     */
+    if (!memcmp(&data[12], vlan, sizeof(vlan))) {
+        trace_colo_proxy_main("COLO-proxy don't support vlan");
+        return 1;
+    }
+
+    /*
+     * Only inspect the frame once a full Ethernet header is known to be
+     * present (presumably the helper reads header bytes of @data).
+     */
+    l2hdr_len = eth_get_l2_hdr_length(data);
+    pkt->network_header = data + l2hdr_len;
+
+    const struct iovec l2vec = {
+        .iov_base = (void *) data,
+        .iov_len = l2hdr_len
+    };
+    l3_proto = eth_get_l3_proto(&l2vec, 1, l2hdr_len);
+
+    if (l3_proto != ETH_P_IP) {
+        return 1;
+    }
+
+    /* The fixed IP header must be present before ip_hl can be read */
+    if (pkt->size < l2hdr_len + (ssize_t)sizeof(struct ip)) {
+        trace_colo_proxy_main("pkt->size < network_header + network_length");
+        return 1;
+    }
+
+    network_length = pkt->ip->ip_hl * 4;
+    if (pkt->size < l2hdr_len + network_length) {
+        trace_colo_proxy_main("pkt->size < network_header + network_length");
+        return 1;
+    }
+    pkt->transport_header = pkt->network_header + network_length;
+
+    return 0;
+}
+
+/*
+ * Build the connection lookup key for @pkt.
+ *
+ * @key is zeroed and the IP protocol recorded. For protocols whose
+ * transport header carries a port pair (read at offset 0, or offset 4
+ * for AH) the addresses and the two 16-bit values are stored as well;
+ * for all other protocols only ip_proto is set.
+ *
+ * Expects pkt->ip and pkt->transport_header to have been set, e.g. by
+ * parse_packet_early().
+ */
+void fill_connection_key(Packet *pkt, ConnectionKey *key)
+{
+    uint16_t ports[2] = { 0, 0 };
+    size_t port_offset;
+    ptrdiff_t avail;
+
+    memset(key, 0, sizeof(*key));
+    key->ip_proto = pkt->ip->ip_p;
+
+    switch (key->ip_proto) {
+    case IPPROTO_TCP:
+    case IPPROTO_UDP:
+    case IPPROTO_DCCP:
+    case IPPROTO_ESP:
+    case IPPROTO_SCTP:
+    case IPPROTO_UDPLITE:
+        port_offset = 0;
+        break;
+    case IPPROTO_AH:
+        port_offset = 4;
+        break;
+    default:
+        return;
+    }
+
+    key->src = pkt->ip->ip_src;
+    key->dst = pkt->ip->ip_dst;
+
+    /*
+     * Copy the two 16-bit fields byte-wise: the transport header has no
+     * alignment guarantee, and splitting a host-order uint32_t would pick
+     * the halves in opposite order on big-endian hosts.
+     * A truncated packet leaves both ports at 0 instead of reading past
+     * the end of the buffer (assumes transport_header points into
+     * pkt->data).
+     */
+    avail = ((uint8_t *)pkt->data + pkt->size) - pkt->transport_header;
+    if (avail >= (ptrdiff_t)(port_offset + sizeof(ports))) {
+        memcpy(ports, pkt->transport_header + port_offset, sizeof(ports));
+    }
+    key->src_port = ntohs(ports[0]);
+    key->dst_port = ntohs(ports[1]);
+}
+
+/* Swap source and destination (addresses and ports) of @key in place. */
+void reverse_connection_key(ConnectionKey *key)
+{
+    uint16_t old_src_port = key->src_port;
+    struct in_addr old_src = key->src;
+
+    key->src_port = key->dst_port;
+    key->dst_port = old_src_port;
+
+    key->src = key->dst;
+    key->dst = old_src;
+}
+
+/*
+ * Allocate a Connection for @key: protocol copied from the key, both
+ * packet queues empty, and processing/offset/syn_flag cleared.
+ */
+Connection *connection_new(ConnectionKey *key)
+{
+    Connection *c = g_slice_new(Connection);
+
+    g_queue_init(&c->primary_list);
+    g_queue_init(&c->secondary_list);
+    c->processing = false;
+    c->ip_proto = key->ip_proto;
+    c->offset = 0;
+    c->syn_flag = 0;
+
+    return c;
+}
+
+/*
+ * Free a Connection together with every packet still queued on it.
+ * Has the GDestroyNotify shape so it can serve as a hash table value
+ * destructor.
+ */
+void connection_destroy(void *opaque)
+{
+    Connection *conn = opaque;
+
+    /*
+     * Both queues are embedded in the Connection, so only their links are
+     * released with g_queue_clear(); g_queue_free() would try to free the
+     * embedded GQueue structures themselves.
+     */
+    g_queue_foreach(&conn->primary_list, packet_destroy, NULL);
+    g_queue_clear(&conn->primary_list);
+    g_queue_foreach(&conn->secondary_list, packet_destroy, NULL);
+    g_queue_clear(&conn->secondary_list);
+    g_slice_free(Connection, conn);
+}
+
+/*
+ * Allocate a Packet holding a private copy of @size bytes from @data,
+ * stamped with the current host-clock time in milliseconds.
+ * Release with packet_destroy().
+ */
+Packet *packet_new(const void *data, int size)
+{
+    Packet *p = g_slice_new(Packet);
+
+    p->creation_ms = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    p->size = size;
+    p->data = g_memdup(data, size);
+
+    return p;
+}
+
+/*
+ * Free a Packet and its data copy. Has the GFunc shape so it can be used
+ * with g_queue_foreach(); @user_data is ignored.
+ */
+void packet_destroy(void *opaque, void *user_data)
+{
+    Packet *p = opaque;
+    void *payload = p->data;
+
+    g_free(payload);
+    g_slice_free(Packet, p);
+}
+
+/*
+ * Remove every entry from the connection table, which stops the hash
+ * table from growing really huge.
+ */
+void connection_hashtable_reset(GHashTable *connection_track_table)
+{
+ g_hash_table_remove_all(connection_track_table);
+}
+
+/*
+ * Return the Connection stored under @key in @connection_track_table.
+ * If not found, create a new connection and add it to the hash table.
+ *
+ * When the table already holds more than HASHTABLE_MAX_SIZE entries it is
+ * emptied first, and @conn_list (may be NULL) is emptied with it.
+ */
+Connection *connection_get(GHashTable *connection_track_table,
+                           ConnectionKey *key,
+                           GQueue *conn_list)
+{
+    Connection *conn = g_hash_table_lookup(connection_track_table, key);
+    ConnectionKey *new_key;
+
+    if (conn != NULL) {
+        return conn;
+    }
+
+    if (g_hash_table_size(connection_track_table) > HASHTABLE_MAX_SIZE) {
+        trace_colo_proxy_main("colo proxy connection hashtable full,"
+                              " clear it");
+        /*
+         * Only drop the list links here: the list entries are the same
+         * Connection objects the table holds, and the table reset frees
+         * them through its value destructor. Destroying them through the
+         * list as well would free each connection twice.
+         * NOTE(review): assumes every table passed in was created with
+         * connection_destroy as value destructor - confirm for callers
+         * outside colo-compare.
+         */
+        if (conn_list) {
+            g_queue_clear(conn_list);
+        }
+        connection_hashtable_reset(connection_track_table);
+    }
+
+    new_key = g_memdup(key, sizeof(*key));
+    conn = connection_new(key);
+    g_hash_table_insert(connection_track_table, new_key, conn);
+
+    return conn;
+}
diff --git a/net/colo.h b/net/colo.h
new file mode 100644
index 0000000000..7c524f3a1c
--- /dev/null
+++ b/net/colo.h
@@ -0,0 +1,88 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_COLO_PROXY_H
+#define QEMU_COLO_PROXY_H
+
+#include "slirp/slirp.h"
+#include "qemu/jhash.h"
+#include "qemu/timer.h"
+
+#define HASHTABLE_MAX_SIZE 16384
+
+#ifndef IPPROTO_DCCP
+#define IPPROTO_DCCP 33
+#endif
+
+#ifndef IPPROTO_SCTP
+#define IPPROTO_SCTP 132
+#endif
+
+#ifndef IPPROTO_UDPLITE
+#define IPPROTO_UDPLITE 136
+#endif
+
+/* One captured packet plus the header pointers found while parsing it. */
+typedef struct Packet {
+ /* private copy of the packet bytes (see packet_new()) */
+ void *data;
+ /* two views of the same pointer, set by parse_packet_early() */
+ union {
+ uint8_t *network_header;
+ struct ip *ip;
+ };
+ /* set by parse_packet_early() to network_header + IP header length */
+ uint8_t *transport_header;
+ /* number of bytes in data */
+ int size;
+ /* Time of packet creation, in wall clock ms */
+ int64_t creation_ms;
+} Packet;
+
+/*
+ * Lookup key of the connection table. Keys are compared with memcmp()
+ * in connection_key_equal().
+ */
+typedef struct ConnectionKey {
+ /* (src, dst) must be grouped, in the same way as in the IP header */
+ struct in_addr src;
+ struct in_addr dst;
+ uint16_t src_port;
+ uint16_t dst_port;
+ uint8_t ip_proto;
+} QEMU_PACKED ConnectionKey;
+
+/* State kept for one tracked connection. */
+typedef struct Connection {
+ /* connection primary send queue: element type: Packet */
+ GQueue primary_list;
+ /* connection secondary send queue: element type: Packet */
+ GQueue secondary_list;
+ /* flag to enqueue unprocessed_connections */
+ bool processing;
+ /* IP protocol number copied from the ConnectionKey */
+ uint8_t ip_proto;
+ /* offset = secondary_seq - primary_seq */
+ tcp_seq offset;
+ /*
+ * This flag makes the offset update function run only once
+ * per independent tcp connection.
+ */
+ int syn_flag;
+} Connection;
+
+/* Hash and equality callbacks for a GHashTable keyed by ConnectionKey */
+uint32_t connection_key_hash(const void *opaque);
+int connection_key_equal(const void *opaque1, const void *opaque2);
+/* Returns 0 when the packet headers were located, 1 otherwise */
+int parse_packet_early(Packet *pkt);
+void fill_connection_key(Packet *pkt, ConnectionKey *key);
+void reverse_connection_key(ConnectionKey *key);
+Connection *connection_new(ConnectionKey *key);
+void connection_destroy(void *opaque);
+/* Looks up the connection for key, creating and inserting it if missing */
+Connection *connection_get(GHashTable *connection_track_table,
+ ConnectionKey *key,
+ GQueue *conn_list);
+void connection_hashtable_reset(GHashTable *connection_track_table);
+/* packet_new() copies the data; packet_destroy() frees packet and copy */
+Packet *packet_new(const void *data, int size);
+void packet_destroy(void *opaque, void *user_data);
+
+#endif /* QEMU_COLO_PROXY_H */
diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c
new file mode 100644
index 0000000000..89abe72d4e
--- /dev/null
+++ b/net/filter-rewriter.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "trace.h"
+#include "net/colo.h"
+#include "net/filter.h"
+#include "net/net.h"
+#include "qemu-common.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi-visit.h"
+#include "qom/object.h"
+#include "qemu/main-loop.h"
+#include "qemu/iov.h"
+#include "net/checksum.h"
+
+/* QOM type name of the rewriter filter. */
+#define TYPE_FILTER_REWRITER "filter-rewriter"
+
+/* Checked cast from an object pointer to RewriterState. */
+#define FILTER_COLO_REWRITER(obj) \
+    OBJECT_CHECK(RewriterState, (obj), TYPE_FILTER_REWRITER)
+
+typedef struct RewriterState { /* instance state of the "filter-rewriter" netfilter */
+ NetFilterState parent_obj;
+ NetQueue *incoming_queue; /* created in colo_rewriter_setup(); rewritten packets are sent through it */
+ /* hash table tracking connections; looked up with a ConnectionKey via connection_get() */
+ GHashTable *connection_track_table;
+} RewriterState;
+
+/*
+ * Try to deliver everything waiting in the rewriter's queue; whatever
+ * cannot be delivered is dropped.
+ */
+static void filter_rewriter_flush(NetFilterState *nf)
+{
+    RewriterState *rs = FILTER_COLO_REWRITER(nf);
+
+    if (qemu_net_queue_flush(rs->incoming_queue)) {
+        /* Queue fully drained, nothing left to discard */
+        return;
+    }
+
+    /* Unable to empty the queue, purge remaining packets */
+    qemu_net_queue_purge(rs->incoming_queue, nf->netdev);
+}
+
+/*
+ * Return 1 when parse_packet_early() accepts the packet (returns 0) and
+ * the IP protocol field is TCP; return 0 otherwise.
+ */
+static int is_tcp_packet(Packet *pkt)
+{
+    /* pkt->ip is only inspected after early parsing succeeded */
+    return !parse_packet_early(pkt) && pkt->ip->ip_p == IPPROTO_TCP;
+}
+
+/*
+ * Handle a TCP packet sent by the primary guest.
+ *
+ * A SYN without ACK arms conn->syn_flag.  On the first following packet
+ * that has ACK set and SYN clear, the offset computation is completed
+ * (offset = secondary_seq - primary_seq) and the flag is cleared.  Every
+ * packet with ACK set and SYN clear gets th_ack shifted by conn->offset
+ * and is passed to net_checksum_calculate() afterwards.
+ *
+ * @nf is unused.  Always returns 0.
+ */
+static int handle_primary_tcp_pkt(NetFilterState *nf,
+                                  Connection *conn,
+                                  Packet *pkt)
+{
+    struct tcphdr *tcp_pkt;
+
+    tcp_pkt = (struct tcphdr *)pkt->transport_header;
+    if (trace_event_get_state(TRACE_COLO_FILTER_REWRITER_DEBUG)) {
+        char *sdebug, *ddebug;
+
+        /*
+         * inet_ntoa() returns a static buffer, so each address is copied
+         * before the next call.  The copies are released with g_free()
+         * and therefore must come from g_strdup(), not strdup().
+         */
+        sdebug = g_strdup(inet_ntoa(pkt->ip->ip_src));
+        ddebug = g_strdup(inet_ntoa(pkt->ip->ip_dst));
+        trace_colo_filter_rewriter_pkt_info(__func__, sdebug, ddebug,
+                    ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
+                    tcp_pkt->th_flags);
+        trace_colo_filter_rewriter_conn_offset(conn->offset);
+        g_free(sdebug);
+        g_free(ddebug);
+    }
+
+    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
+        /*
+         * Remember that a new connection is starting so that the offset
+         * update below runs once per independent tcp connection.
+         */
+        conn->syn_flag = 1;
+    }
+
+    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK)) {
+        if (conn->syn_flag) {
+            /*
+             * offset = secondary_seq - primary seq
+             * ack packet sent by guest from primary node,
+             * so we use th_ack - 1 get primary_seq
+             */
+            conn->offset -= (ntohl(tcp_pkt->th_ack) - 1);
+            conn->syn_flag = 0;
+        }
+        /* handle packets to the secondary from the primary */
+        tcp_pkt->th_ack = htonl(ntohl(tcp_pkt->th_ack) + conn->offset);
+
+        /* the header changed, so the checksum has to be redone */
+        net_checksum_calculate((uint8_t *)pkt->data, pkt->size);
+    }
+
+    return 0;
+}
+
+/*
+ * Handle a TCP packet sent by the secondary guest.
+ *
+ * A SYN+ACK stores the secondary's sequence number in conn->offset;
+ * handle_primary_tcp_pkt() later turns that into
+ * secondary_seq - primary_seq.  Every packet with ACK set and SYN clear
+ * gets th_seq shifted back by conn->offset and is passed to
+ * net_checksum_calculate() afterwards.
+ *
+ * @nf is unused.  Always returns 0.
+ */
+static int handle_secondary_tcp_pkt(NetFilterState *nf,
+                                    Connection *conn,
+                                    Packet *pkt)
+{
+    struct tcphdr *tcp_pkt;
+
+    tcp_pkt = (struct tcphdr *)pkt->transport_header;
+
+    if (trace_event_get_state(TRACE_COLO_FILTER_REWRITER_DEBUG)) {
+        char *sdebug, *ddebug;
+
+        /*
+         * inet_ntoa() returns a static buffer, so each address is copied
+         * before the next call.  The copies are released with g_free()
+         * and therefore must come from g_strdup(), not strdup().
+         */
+        sdebug = g_strdup(inet_ntoa(pkt->ip->ip_src));
+        ddebug = g_strdup(inet_ntoa(pkt->ip->ip_dst));
+        trace_colo_filter_rewriter_pkt_info(__func__, sdebug, ddebug,
+                    ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
+                    tcp_pkt->th_flags);
+        trace_colo_filter_rewriter_conn_offset(conn->offset);
+        g_free(sdebug);
+        g_free(ddebug);
+    }
+
+    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
+        /*
+         * save offset = secondary_seq and then
+         * in handle_primary_tcp_pkt make offset
+         * = secondary_seq - primary_seq
+         */
+        conn->offset = ntohl(tcp_pkt->th_seq);
+    }
+
+    if ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK) {
+        /* handle packets to the primary from the secondary */
+        tcp_pkt->th_seq = htonl(ntohl(tcp_pkt->th_seq) - conn->offset);
+
+        /* the header changed, so the checksum has to be redone */
+        net_checksum_calculate((uint8_t *)pkt->data, pkt->size);
+    }
+
+    return 0;
+}
+
+/*
+ * receive_iov hook of the rewriter filter.
+ *
+ * The iovec is linearised into a Packet.  TCP packets are looked up in
+ * the connection table (the key is reversed for packets coming from
+ * nf->netdev so that both directions share one Connection), rewritten by
+ * the primary or secondary handler, and re-sent through incoming_queue.
+ *
+ * @flags and @sent_cb are unused.
+ *
+ * Returns 1 when the packet was rewritten and queued here, 0 when it was
+ * not handled by this filter.
+ */
+static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
+                                         NetClientState *sender,
+                                         unsigned flags,
+                                         const struct iovec *iov,
+                                         int iovcnt,
+                                         NetPacketSent *sent_cb)
+{
+    RewriterState *s = FILTER_COLO_REWRITER(nf);
+    /* NET_FILTER_DIRECTION_TX when true, NET_FILTER_DIRECTION_RX otherwise */
+    bool from_netdev = (sender == nf->netdev);
+    Connection *conn;
+    ConnectionKey key;
+    Packet *pkt;
+    ssize_t size = iov_size(iov, iovcnt);
+    char *buf = g_malloc0(size);
+    int ret;
+
+    iov_to_buf(iov, iovcnt, 0, buf, size);
+    pkt = packet_new(buf, size);
+    /*
+     * packet_new() takes the buffer as const data and keeps its own copy
+     * (see net/colo.c), so the linearisation buffer must be released here;
+     * it was previously leaked on every packet.
+     */
+    g_free(buf);
+    buf = NULL;
+
+    if (!pkt) {
+        return 0;
+    }
+
+    /*
+     * if we get tcp packet
+     * we will rewrite it to make secondary guest's
+     * connection established successfully
+     */
+    if (is_tcp_packet(pkt)) {
+        fill_connection_key(pkt, &key);
+
+        if (from_netdev) {
+            /*
+             * We need make tcp TX and RX packet
+             * into one connection.
+             */
+            reverse_connection_key(&key);
+        }
+        conn = connection_get(s->connection_track_table,
+                              &key,
+                              NULL);
+
+        ret = from_netdev ? handle_primary_tcp_pkt(nf, conn, pkt)
+                          : handle_secondary_tcp_pkt(nf, conn, pkt);
+        if (!ret) {
+            qemu_net_queue_send(s->incoming_queue, sender, 0,
+                                (const uint8_t *)pkt->data, pkt->size, NULL);
+            packet_destroy(pkt, NULL);
+            pkt = NULL;
+            /*
+             * We block the original packet here; the rewritten copy
+             * has been handed to the queue above.
+             */
+            return 1;
+        }
+    }
+
+    packet_destroy(pkt, NULL);
+    pkt = NULL;
+    return 0;
+}
+
+/*
+ * Cleanup hook of the rewriter filter: flush (or purge) queued packets,
+ * free the queue, and destroy the connection tracking table allocated in
+ * colo_rewriter_setup().
+ */
+static void colo_rewriter_cleanup(NetFilterState *nf)
+{
+    RewriterState *s = FILTER_COLO_REWRITER(nf);
+
+    /* flush packets */
+    if (s->incoming_queue) {
+        filter_rewriter_flush(nf);
+        g_free(s->incoming_queue);
+        s->incoming_queue = NULL;
+    }
+
+    /*
+     * Guarded like incoming_queue above.  Destroying the table runs the
+     * g_free/connection_destroy callbacks registered at creation, which
+     * releases every tracked key and Connection.
+     */
+    if (s->connection_track_table) {
+        g_hash_table_destroy(s->connection_track_table);
+        s->connection_track_table = NULL;
+    }
+}
+
+/*
+ * Setup hook of the rewriter filter: create the connection tracking
+ * table (keys freed with g_free, values with connection_destroy) and the
+ * queue used to pass rewritten packets on.  @errp is not written.
+ */
+static void colo_rewriter_setup(NetFilterState *nf, Error **errp)
+{
+    RewriterState *rs = FILTER_COLO_REWRITER(nf);
+    GHashTable *table;
+
+    table = g_hash_table_new_full(connection_key_hash,
+                                  connection_key_equal,
+                                  g_free,
+                                  connection_destroy);
+    rs->connection_track_table = table;
+
+    rs->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
+}
+
+/* Install the rewriter's hooks into the netfilter class. @data is unused. */
+static void colo_rewriter_class_init(ObjectClass *oc, void *data)
+{
+    NetFilterClass *filter_class = NETFILTER_CLASS(oc);
+
+    filter_class->receive_iov = colo_rewriter_receive_iov;
+    filter_class->setup = colo_rewriter_setup;
+    filter_class->cleanup = colo_rewriter_cleanup;
+}
+
+/* Type description for "filter-rewriter", derived from TYPE_NETFILTER. */
+static const TypeInfo colo_rewriter_info = {
+    .parent = TYPE_NETFILTER,
+    .name = TYPE_FILTER_REWRITER,
+    .instance_size = sizeof(RewriterState),
+    .class_init = colo_rewriter_class_init,
+};
+
+static void register_types(void) /* registers colo_rewriter_info as a static type */
+{
+ type_register_static(&colo_rewriter_info);
+}
+
+type_init(register_types); /* hands register_types() to the type_init() macro */
diff --git a/net/net.c b/net/net.c
index d51cb29882..ec984bf782 100644
--- a/net/net.c
+++ b/net/net.c
@@ -690,9 +690,13 @@ static ssize_t nc_sendv_compat(NetClientState *nc, const struct iovec *iov,
buffer = iov[0].iov_base;
offset = iov[0].iov_len;
} else {
- buf = g_new(uint8_t, NET_BUFSIZE);
+ offset = iov_size(iov, iovcnt);
+ if (offset > NET_BUFSIZE) {
+ return -1;
+ }
+ buf = g_malloc(offset);
buffer = buf;
- offset = iov_to_buf(iov, iovcnt, 0, buf, NET_BUFSIZE);
+ offset = iov_to_buf(iov, iovcnt, 0, buf, offset);
}
if (flags & QEMU_NET_PACKET_FLAG_RAW && nc->info->receive_raw) {
@@ -1179,6 +1183,7 @@ void hmp_host_net_remove(Monitor *mon, const QDict *qdict)
qemu_del_net_client(nc->peer);
qemu_del_net_client(nc);
+ qemu_opts_del(qemu_opts_find(qemu_find_opts("net"), device));
}
void netdev_add(QemuOpts *opts, Error **errp)
diff --git a/net/tap.c b/net/tap.c
index 6abb962efd..b6896a7b7c 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -857,7 +857,9 @@ free_fail:
return -1;
}
- fd = net_bridge_run_helper(tap->helper, DEFAULT_BRIDGE_INTERFACE,
+ fd = net_bridge_run_helper(tap->helper,
+ tap->has_br ?
+ tap->br : DEFAULT_BRIDGE_INTERFACE,
errp);
if (fd == -1) {
return -1;