2016-09-27 10:22:27 +08:00
|
|
|
/*
 * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
 * (a.k.a. Fault Tolerance or Continuous Replication)
 *
 * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
 * Copyright (c) 2016 FUJITSU LIMITED
 * Copyright (c) 2016 Intel Corporation
 *
 * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 */
|
|
|
|
|
|
|
|
#ifndef QEMU_COLO_PROXY_H
|
|
|
|
#define QEMU_COLO_PROXY_H
|
|
|
|
|
|
|
|
#include "slirp/slirp.h"
|
2016-09-27 10:22:28 +08:00
|
|
|
#include "qemu/jhash.h"
|
2016-09-27 10:22:30 +08:00
|
|
|
#include "qemu/timer.h"
|
2016-09-27 10:22:27 +08:00
|
|
|
|
|
|
|
/*
 * NOTE(review): presumably the upper bound on entries kept in the
 * connection tracking hash table - confirm at the use site.
 */
#define HASHTABLE_MAX_SIZE 16384

/*
 * Fallback IP protocol numbers, defined only when the system headers
 * have not already provided them.
 */
#ifndef IPPROTO_DCCP
#define IPPROTO_DCCP 33
#endif

#ifndef IPPROTO_SCTP
#define IPPROTO_SCTP 132
#endif

#ifndef IPPROTO_UDPLITE
#define IPPROTO_UDPLITE 136
#endif
|
|
|
|
|
2016-09-27 10:22:27 +08:00
|
|
|
/*
 * A network packet together with the metadata recorded for it.
 *
 * TCP packets are compared by sequence number rather than by whole-packet
 * size, because the primary and the secondary may split the same payload
 * into differently sized packets.  When two packets start at the same
 * sequence number, the shorter payload length is used as the comparison
 * length; the length already compared is then recorded in 'offset' of the
 * longer packet so that the next comparison resumes from that position.
 */
typedef struct Packet {
    /* NOTE(review): presumably the raw packet buffer of 'size' bytes - confirm */
    void *data;
    /* Two views of the same pointer: raw bytes or an IP header */
    union {
        uint8_t *network_header;
        struct ip *ip;
    };
    uint8_t *transport_header;
    int size;
    /* Time of packet creation, in wall clock ms */
    int64_t creation_ms;
    /* vnet header length, obtained from the filter */
    uint32_t vnet_hdr_len;
    uint32_t tcp_seq; /* sequence number */
    uint32_t tcp_ack; /* acknowledgement number */
    /* the sequence number of the last byte of the packet */
    uint32_t seq_end;
    uint8_t header_size; /* the header length */
    uint16_t payload_size; /* the payload length */
    /* payload offset: the length of payload that has already been compared */
    uint16_t offset;
    uint8_t flags; /* Flags (aka Control bits) */
} Packet;
|
|
|
|
|
2016-09-27 10:22:29 +08:00
|
|
|
/*
 * Identifies one connection: addresses, ports and IP protocol.
 *
 * NOTE(review): QEMU_PACKED presumably removes padding so the key has a
 * fixed byte layout - confirm against the macro definition.
 */
typedef struct ConnectionKey {
    /* (src, dst) must be grouped, in the same way as in the IP header */
    struct in_addr src;
    struct in_addr dst;
    uint16_t src_port;
    uint16_t dst_port;
    uint8_t ip_proto;
} QEMU_PACKED ConnectionKey;
|
|
|
|
|
|
|
|
typedef struct Connection {
|
|
|
|
/* connection primary send queue: element type: Packet */
|
|
|
|
GQueue primary_list;
|
|
|
|
/* connection secondary send queue: element type: Packet */
|
|
|
|
GQueue secondary_list;
|
|
|
|
/* flag to enqueue unprocessed_connections */
|
|
|
|
bool processing;
|
|
|
|
uint8_t ip_proto;
|
colo: compare the packet based on the tcp sequence number
Packet size some time different or when network is busy.
Based on same payload size, but TCP protocol can not
guarantee send the same one packet in the same way,
like that:
We send this payload:
------------------------------
| header |1|2|3|4|5|6|7|8|9|0|
------------------------------
primary:
ppkt1:
----------------
| header |1|2|3|
----------------
ppkt2:
------------------------
| header |4|5|6|7|8|9|0|
------------------------
secondary:
spkt1:
------------------------------
| header |1|2|3|4|5|6|7|8|9|0|
------------------------------
In the original method, ppkt1 and ppkt2 are different in size and
spkt1, so they can't compare and trigger the checkpoint.
I have tested FTP get 200M and 1G file many times, I found that
the performance was less than 1% of the native.
Now I reconstructed the comparison of TCP packets based on the
TCP sequence number. first of all, ppkt1 and spkt1 have the same
starting sequence number, so they can compare, even though their
length is different. And then ppkt1 with a smaller payload length
is used as the comparison length, if the payload is same, send
out the ppkt1 and record the offset(the length of ppkt1 payload)
in spkt1. The next comparison, ppkt2 and spkt1 can be compared
from the recorded position of spkt1.
like that:
----------------
| header |1|2|3| ppkt1
---------|-----|
| |
---------v-----v--------------
| header |1|2|3|4|5|6|7|8|9|0| spkt1
---------------|\------------|
| \offset |
---------v-------------v
| header |4|5|6|7|8|9|0| ppkt2
------------------------
In this way, the performance can reach native 20% in my multiple
tests.
Cc: Zhang Chen <zhangckid@gmail.com>
Cc: Li Zhijian <lizhijian@cn.fujitsu.com>
Cc: Jason Wang <jasowang@redhat.com>
Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
Reviewed-by: Zhang Chen <zhangckid@gmail.com>
Tested-by: Zhang Chen <zhangckid@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-12-25 10:54:12 +08:00
|
|
|
/* record the sequence number that has been compared */
|
|
|
|
uint32_t compare_seq;
|
|
|
|
/* the maximum of acknowledgement number in primary_list queue */
|
|
|
|
uint32_t pack;
|
|
|
|
/* the maximum of acknowledgement number in secondary_list queue */
|
|
|
|
uint32_t sack;
|
filter-rewriter: rewrite tcp packet to keep secondary connection
We will rewrite tcp packet secondary received and sent.
When colo guest is a tcp server.
Firstly, client start a tcp handshake. the packet's seq=client_seq,
ack=0,flag=SYN. COLO primary guest get this pkt and mirror(filter-mirror)
to secondary guest, secondary get it use filter-redirector.
Then,primary guest response pkt
(seq=primary_seq,ack=client_seq+1,flag=ACK|SYN).
secondary guest response pkt
(seq=secondary_seq,ack=client_seq+1,flag=ACK|SYN).
In here,we use filter-rewriter save the secondary_seq to it's tcp connection.
Finally handshake,client send pkt
(seq=client_seq+1,ack=primary_seq+1,flag=ACK).
Here,filter-rewriter can get primary_seq, and rewrite ack from primary_seq+1
to secondary_seq+1, recalculate checksum. So the secondary tcp connection
kept good.
When we send/recv packet.
client send pkt(seq=client_seq+1+data_len,ack=primary_seq+1,flag=ACK|PSH).
filter-rewriter rewrite ack and send to secondary guest.
primary guest response pkt
(seq=primary_seq+1,ack=client_seq+1+data_len,flag=ACK)
secondary guest response pkt
(seq=secondary_seq+1,ack=client_seq+1+data_len,flag=ACK)
we rewrite secondary guest seq from secondary_seq+1 to primary_seq+1.
So tcp connection kept good.
In code We use offset( = secondary_seq - primary_seq )
to rewrite seq or ack.
handle_primary_tcp_pkt: tcp_pkt->th_ack += offset;
handle_secondary_tcp_pkt: tcp_pkt->th_seq -= offset;
Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-09-27 10:22:34 +08:00
|
|
|
/* offset = secondary_seq - primary_seq */
|
|
|
|
tcp_seq offset;
|
|
|
|
/*
|
|
|
|
* we use this flag update offset func
|
|
|
|
* run once in independent tcp connection
|
|
|
|
*/
|
|
|
|
int syn_flag;
|
2016-09-27 10:22:29 +08:00
|
|
|
} Connection;
|
|
|
|
|
|
|
|
uint32_t connection_key_hash(const void *opaque);
|
|
|
|
int connection_key_equal(const void *opaque1, const void *opaque2);
|
2016-09-27 10:22:27 +08:00
|
|
|
int parse_packet_early(Packet *pkt);
|
2017-10-13 14:32:09 +08:00
|
|
|
void extract_ip_and_port(uint32_t tmp_ports, ConnectionKey *key, Packet *pkt);
|
2016-09-27 10:22:29 +08:00
|
|
|
void fill_connection_key(Packet *pkt, ConnectionKey *key);
|
2016-09-27 10:22:33 +08:00
|
|
|
void reverse_connection_key(ConnectionKey *key);
|
2016-09-27 10:22:29 +08:00
|
|
|
Connection *connection_new(ConnectionKey *key);
|
|
|
|
void connection_destroy(void *opaque);
|
|
|
|
Connection *connection_get(GHashTable *connection_track_table,
|
|
|
|
ConnectionKey *key,
|
|
|
|
GQueue *conn_list);
|
2016-09-27 10:22:27 +08:00
|
|
|
void connection_hashtable_reset(GHashTable *connection_track_table);
|
2017-07-04 14:53:50 +08:00
|
|
|
Packet *packet_new(const void *data, int size, int vnet_hdr_len);
|
2016-09-27 10:22:27 +08:00
|
|
|
void packet_destroy(void *opaque, void *user_data);
|
|
|
|
|
|
|
|
#endif /* QEMU_COLO_PROXY_H */
|