From 3a5d5b86163503f8939bfb022ec4d17c4a41cd4a Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Tue, 4 Oct 2022 17:12:48 +0000 Subject: [PATCH 01/33] Initial commit that includes changes in the libbpf installation --- build/build.sh | 2 -- build/machine-init.sh | 5 ++++- src/CMakeLists.txt | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) mode change 100644 => 100755 build/build.sh mode change 100644 => 100755 build/machine-init.sh diff --git a/build/build.sh b/build/build.sh old mode 100644 new mode 100755 index 98a277d..71bd36d --- a/build/build.sh +++ b/build/build.sh @@ -23,5 +23,3 @@ echo "--- prepare dependencies ---" echo "--- building arion-agent ---" cmake . && make - -fi diff --git a/build/machine-init.sh b/build/machine-init.sh old mode 100644 new mode 100755 index 379a778..4776882 --- a/build/machine-init.sh +++ b/build/machine-init.sh @@ -119,5 +119,8 @@ echo "5--- installing ebpf dependencies ---" && \ cd /var/local/git && \ git clone https://github.com/futurewei-cloud/zeta && \ cd zeta && \ - ./build.sh && \ + git submodule update --init --recursive && \ + cd src/extern/libbpf/src && \ + mkdir build root && \ + BUILD_STATIC_ONLY=y OBJDIR=build DESTDIR=root make install && \ cd ~ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 448f7e7..5395e01 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,7 +9,7 @@ set(SOURCES link_libraries(/usr/lib/x86_64-linux-gnu/libevent_pthreads.so) link_libraries(/usr/lib/x86_64-linux-gnu/libpthread.so) link_libraries(/var/local/git/marl/marl/build/libmarl.a) #this was built by machine-init.sh -link_libraries(/var/local/git/zeta/src/extern/libbpf/src/libbpf.a) #this was built by machine-init.sh +link_libraries(/var/local/git/zeta/src/extern/libbpf/src/build/libbpf.a) #this was built by machine-init.sh link_libraries(/usr/lib/x86_64-linux-gnu/libelf.a) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/proto3) From 
a304675b5a468aa282efe18fd89299c0735983e5 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 7 Oct 2022 18:55:15 +0000 Subject: [PATCH 02/33] Initial commit for af_xdp module --- include/af_xdp_user.h | 29 +++ src/CMakeLists.txt | 3 +- src/comm/af_xdp_user.cpp | 495 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 526 insertions(+), 1 deletion(-) create mode 100644 include/af_xdp_user.h create mode 100644 src/comm/af_xdp_user.cpp diff --git a/include/af_xdp_user.h b/include/af_xdp_user.h new file mode 100644 index 0000000..cd523f5 --- /dev/null +++ b/include/af_xdp_user.h @@ -0,0 +1,29 @@ +// +// Created by ubuntu on 10/4/22. +// + +#ifndef ARIONAGENT_AF_XDP_USER_H +#define ARIONAGENT_AF_XDP_USER_H + +#include "logger.h" +#include +#include +#include + +#include "common_params.h" +#include "common_user_bpf_xdp.h" +#include "common_libbpf.h" + +static const char *__doc__ = "AF_XDP kernel bypass example\n"; + +class af_xdp_user { +public: + af_xdp_user() { + printf("%s", "Start of af_xdp userspace program."); + } + void run_af_xdp(int argc, char *argv[]); +private: + +}; + +#endif //ARIONAGENT_AF_XDP_USER_H diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5395e01..d4d083b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,7 +2,7 @@ set(EXECUTABLE_OUTPUT_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../build/bin) set(SOURCES ./comm/grpc_client.cpp - ) + comm/af_xdp_user.cpp ../include/af_xdp_user.h) #FIND_LIBRARY(LIBUUID_LIBRARIES uuid) #link_libraries(/usr/lib/x86_64-linux-gnu/libuuid.so) @@ -16,6 +16,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/proto3) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/grpc) include_directories(/var/local/git/marl/marl/include) include_directories(/var/local/git/zeta/src/extern/libbpf/src) #libbpf.h +include_directories(/var/local/git/xdp-tutorial/common) # Find Protobuf installation # Looks for protobuf-config.cmake file installed by Protobuf's cmake installation. 
diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp new file mode 100644 index 0000000..4aaba0a --- /dev/null +++ b/src/comm/af_xdp_user.cpp @@ -0,0 +1,495 @@ +// +// Created by ubuntu on 10/4/22. +// +#include +#include +#include +#include +#include +#include +#include +#include "af_xdp_user.h" +#include +#include +#include +#include + + +#define NUM_FRAMES 4096 +#define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE +#define RX_BATCH_SIZE 64 +#define INVALID_UMEM_FRAME UINT64_MAX +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct xsk_umem *umem; + void *buffer; +}; + +struct stats_record { + uint64_t timestamp; + uint64_t rx_packets; + uint64_t rx_bytes; + uint64_t tx_packets; + uint64_t tx_bytes; +}; + +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_ring_prod tx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; + + uint64_t umem_frame_addr[NUM_FRAMES]; + uint32_t umem_frame_free; + + uint32_t outstanding_tx; + + struct stats_record stats; + struct stats_record prev_stats; +}; + +static uint64_t xsk_alloc_umem_frame(struct xsk_socket_info *xsk) +{ + uint64_t frame; + if (xsk->umem_frame_free == 0) + return INVALID_UMEM_FRAME; + + frame = xsk->umem_frame_addr[--xsk->umem_frame_free]; + xsk->umem_frame_addr[xsk->umem_frame_free] = INVALID_UMEM_FRAME; + return frame; +} + +static struct xsk_socket_info *xsk_configure_socket(struct config *cfg, + struct xsk_umem_info *umem) +{ + struct xsk_socket_config xsk_cfg; + struct xsk_socket_info *xsk_info; + uint32_t idx; + uint32_t prog_id = 0; + int i; + int ret; + + xsk_info = static_cast(calloc(1, sizeof(*xsk_info))); + if (!xsk_info) + return NULL; + + xsk_info->umem = umem; + xsk_cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + xsk_cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + xsk_cfg.libbpf_flags = 0; + xsk_cfg.xdp_flags = cfg->xdp_flags; + xsk_cfg.bind_flags = cfg->xsk_bind_flags; + ret = xsk_socket__create(&xsk_info->xsk, cfg->ifname, + 
cfg->xsk_if_queue, umem->umem, &xsk_info->rx, + &xsk_info->tx, &xsk_cfg); + + if (ret) + goto error_exit; + + ret = bpf_get_link_xdp_id(cfg->ifindex, &prog_id, cfg->xdp_flags); + if (ret) + goto error_exit; + + /* Initialize umem frame allocation */ + + for (i = 0; i < NUM_FRAMES; i++) + xsk_info->umem_frame_addr[i] = i * FRAME_SIZE; + + xsk_info->umem_frame_free = NUM_FRAMES; + + /* Stuff the receive path with buffers, we assume we have enough */ + ret = xsk_ring_prod__reserve(&xsk_info->umem->fq, + XSK_RING_PROD__DEFAULT_NUM_DESCS, + &idx); + + if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) + goto error_exit; + + for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i ++) + *xsk_ring_prod__fill_addr(&xsk_info->umem->fq, idx++) = + xsk_alloc_umem_frame(xsk_info); + + xsk_ring_prod__submit(&xsk_info->umem->fq, + XSK_RING_PROD__DEFAULT_NUM_DESCS); + + return xsk_info; + +error_exit: + errno = -ret; + return NULL; +} + +static const struct option_wrapper long_options[] = { + + {{"help", no_argument, NULL, 'h' }, + "Show help", "",false}, + + {{"dev", required_argument, NULL, 'd' }, + "Operate on device ", "", true}, + + {{"skb-mode", no_argument, NULL, 'S' }, + "Install XDP program in SKB (AKA generic) mode"}, + + {{"native-mode", no_argument, NULL, 'N' }, + "Install XDP program in native mode"}, + + {{"auto-mode", no_argument, NULL, 'A' }, + "Auto-detect SKB or native mode"}, + + {{"force", no_argument, NULL, 'F' }, + "Force install, replacing existing program on interface"}, + + {{"copy", no_argument, NULL, 'c' }, + "Force copy mode"}, + + {{"zero-copy", no_argument, NULL, 'z' }, + "Force zero-copy mode"}, + + {{"queue", required_argument, NULL, 'Q' }, + "Configure interface receive queue for AF_XDP, default=0"}, + + {{"poll-mode", no_argument, NULL, 'p' }, + "Use the poll() API waiting for packets to arrive"}, + + {{"unload", no_argument, NULL, 'U' }, + "Unload XDP program instead of loading"}, + + {{"quiet", no_argument, NULL, 'q' }, + "Quiet mode (no output)"}, + + 
{{"filename", required_argument, NULL, 1 }, + "Load program from ", ""}, + + {{"progsec", required_argument, NULL, 2 }, + "Load program in
of the ELF file", "
"}, + + {{0, 0, NULL, 0 }, NULL, "",false} +}; + +static bool global_exit; + +static uint64_t xsk_umem_free_frames(struct xsk_socket_info *xsk) +{ + return xsk->umem_frame_free; +} + +static void xsk_free_umem_frame(struct xsk_socket_info *xsk, uint64_t frame) +{ + assert(xsk->umem_frame_free < NUM_FRAMES); + + xsk->umem_frame_addr[xsk->umem_frame_free++] = frame; +} + +static void complete_tx(struct xsk_socket_info *xsk) +{ + unsigned int completed; + uint32_t idx_cq; + + if (!xsk->outstanding_tx) + return; + + sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + + + /* Collect/free completed TX buffers */ + completed = xsk_ring_cons__peek(&xsk->umem->cq, + XSK_RING_CONS__DEFAULT_NUM_DESCS, + &idx_cq); + + if (completed > 0) { + for (int i = 0; i < completed; i++) + xsk_free_umem_frame(xsk, + *xsk_ring_cons__comp_addr(&xsk->umem->cq, + idx_cq++)); + + xsk_ring_cons__release(&xsk->umem->cq, completed); + xsk->outstanding_tx -= completed < xsk->outstanding_tx ? + completed : xsk->outstanding_tx; + } +} + +static inline __sum16 csum16_add(__sum16 csum, __be16 addend) +{ + uint16_t res = (uint16_t)csum; + + res += (__u16)addend; + return (__sum16)(res + (res < (__u16)addend)); +} + +static inline __sum16 csum16_sub(__sum16 csum, __be16 addend) +{ + return csum16_add(csum, ~addend); +} + +static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 present) +{ + *sum = ~csum16_add(csum16_sub(~(*sum), old), present); +} + +static bool process_packet(struct xsk_socket_info *xsk, + uint64_t addr, uint32_t len) +{ + uint8_t *pkt = static_cast(xsk_umem__get_data(xsk->umem->buffer, addr)); + + /* Lesson#3: Write an IPv6 ICMP ECHO parser to send responses + * + * Some assumptions to make it easier: + * - No VLAN handling + * - Only if nexthdr is ICMP + * - Just return all data with MAC/IP swapped, and type set to + * ICMPV6_ECHO_REPLY + * - Recalculate the icmp checksum */ + + if (true) { + /* + * TODO: Parse packet here, get VNI, IP, MAC, lookup locally 
in DB, and replace neigbor host IP if found; + * if NOT found, drop packet and remotely GET from Arion Master. + * */ + int ret; + uint32_t tx_idx = 0; + uint8_t tmp_mac[ETH_ALEN]; + struct in6_addr tmp_ip; + struct ethhdr *eth = (struct ethhdr *) pkt; + struct ipv6hdr *ipv6 = (struct ipv6hdr *) (eth + 1); + struct icmp6hdr *icmp = (struct icmp6hdr *) (ipv6 + 1); + + if (ntohs(eth->h_proto) != ETH_P_IPV6 || + len < (sizeof(*eth) + sizeof(*ipv6) + sizeof(*icmp)) || + ipv6->nexthdr != IPPROTO_ICMPV6 || + icmp->icmp6_type != ICMPV6_ECHO_REQUEST) + return false; + + memcpy(tmp_mac, eth->h_dest, ETH_ALEN); + memcpy(eth->h_dest, eth->h_source, ETH_ALEN); + memcpy(eth->h_source, tmp_mac, ETH_ALEN); + + memcpy(&tmp_ip, &ipv6->saddr, sizeof(tmp_ip)); + memcpy(&ipv6->saddr, &ipv6->daddr, sizeof(tmp_ip)); + memcpy(&ipv6->daddr, &tmp_ip, sizeof(tmp_ip)); + + icmp->icmp6_type = ICMPV6_ECHO_REPLY; + + csum_replace2(&icmp->icmp6_cksum, + htons(ICMPV6_ECHO_REQUEST << 8), + htons(ICMPV6_ECHO_REPLY << 8)); + + /* Here we sent the packet out of the receive port. Note that + * we allocate one entry and schedule it. 
Your design would be + * faster if you do batch processing/transmission */ + + ret = xsk_ring_prod__reserve(&xsk->tx, 1, &tx_idx); + if (ret != 1) { + /* No more transmit slots, drop the packet */ + return false; + } + + xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = addr; + xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = len; + xsk_ring_prod__submit(&xsk->tx, 1); + xsk->outstanding_tx++; + + xsk->stats.tx_bytes += len; + xsk->stats.tx_packets++; + return true; + } + + return false; +} + + +static void handle_receive_packets(struct xsk_socket_info *xsk) +{ + unsigned int rcvd, stock_frames, i; + uint32_t idx_rx = 0, idx_fq = 0; + int ret; + + rcvd = xsk_ring_cons__peek(&xsk->rx, RX_BATCH_SIZE, &idx_rx); + if (!rcvd) + return; + + /* Stuff the ring with as much frames as possible */ + stock_frames = xsk_prod_nb_free(&xsk->umem->fq, + xsk_umem_free_frames(xsk)); + + if (stock_frames > 0) { + + ret = xsk_ring_prod__reserve(&xsk->umem->fq, stock_frames, + &idx_fq); + + /* This should not happen, but just in case */ + while (ret != stock_frames) + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, + &idx_fq); + + for (i = 0; i < stock_frames; i++) + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = + xsk_alloc_umem_frame(xsk); + + xsk_ring_prod__submit(&xsk->umem->fq, stock_frames); + } + + /* Process received packets */ + for (i = 0; i < rcvd; i++) { + uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; + uint32_t len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; + + if (!process_packet(xsk, addr, len)) + xsk_free_umem_frame(xsk, addr); + + xsk->stats.rx_bytes += len; + } + + xsk_ring_cons__release(&xsk->rx, rcvd); + xsk->stats.rx_packets += rcvd; + + /* Do we need to wake up the kernel for transmission */ + complete_tx(xsk); +} + +static void rx_and_process(struct config *cfg, + struct xsk_socket_info *xsk_socket) +{ + struct pollfd fds[2]; + int ret, nfds = 1; + + memset(fds, 0, sizeof(fds)); + fds[0].fd = xsk_socket__fd(xsk_socket->xsk); + 
fds[0].events = POLLIN; + + while(!global_exit) { + if (cfg->xsk_poll_mode) { + ret = poll(fds, nfds, -1); + if (ret <= 0 || ret > 1) + continue; + } + handle_receive_packets(xsk_socket); + } +} + +static void exit_application(int signal) +{ + signal = signal; + global_exit = true; +} + +static struct xsk_umem_info *configure_xsk_umem(void *buffer, uint64_t size) +{ + struct xsk_umem_info *umem; + int ret; + + umem = static_cast(calloc(1, sizeof(*umem))); + if (!umem) + return NULL; + + ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, + NULL); + if (ret) { + errno = -ret; + return NULL; + } + + umem->buffer = buffer; + return umem; +} + +void af_xdp_user::run_af_xdp(int argc, char *argv[]) +{ + printf("%s", "af_xdp started"); + int ret; + int xsks_map_fd; + void *packet_buffer; + uint64_t packet_buffer_size; + struct rlimit rlim = {RLIM_INFINITY, RLIM_INFINITY}; + struct config cfg; + + cfg.ifindex = -1; + cfg.do_unload = false; + // TODO: fill in the file name and progsec in CPP style + struct xsk_umem_info *umem; + struct xsk_socket_info *xsk_socket; + struct bpf_object *bpf_obj = NULL; + + /* Global shutdown handler*/ + signal(SIGINT, exit_application); + + /* Command line options can change progsec*/ + parse_cmdline_args(argc, argv, long_options, &cfg, __doc__); + + /* Required option */ + if (cfg.ifindex == -1) { + printf("%s", "ERROR: Required option --dev missing\n\n"); + usage(argv[0], __doc__, long_options, (argc == 1)); + exit(EXIT_FAIL_OPTION); + } + + /* Unload XDP program if requested */ + if (cfg.do_unload) { + int rc = xdp_link_detach(cfg.ifindex, cfg.xdp_flags, 0); + exit(rc); + } + + /* Load custom program if configured */ + if (cfg.filename[0] != 0) { + struct bpf_map *map; + + bpf_obj = load_bpf_and_xdp_attach(&cfg); + if (!bpf_obj) { + /* Error handling done in load_bpf_and_xdp_attach() */ + exit(EXIT_FAILURE); + } + + /* We also need to load the xsks_map */ + map = bpf_object__find_map_by_name(bpf_obj, "xsks_map"); + 
xsks_map_fd = bpf_map__fd(map); + if (xsks_map_fd < 0) { + fprintf(stderr, "ERROR: no xsks map found: %s\n", + strerror(xsks_map_fd)); + exit(EXIT_FAILURE); + } + } + + /* Allow unlimited locking of memory, so all memory needed for packet + * buffers can be locked. + */ + if (setrlimit(RLIMIT_MEMLOCK, &rlim)) { + printf("%s", "ERROR: setrlimit(RLIMIT_MEMLOCK) \n"); + exit(EXIT_FAILURE); + } + + /* Allocate memory for NUM_FRAMES of the default XDP frame size */ + packet_buffer_size = NUM_FRAMES * FRAME_SIZE; + if (posix_memalign(&packet_buffer, + getpagesize(), /* PAGE_SIZE aligned */ + packet_buffer_size)) { + fprintf(stderr, "ERROR: Can't allocate buffer memory \"%s\"\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* Initialize shared packet_buffer for umem usage */ + umem = configure_xsk_umem(packet_buffer, packet_buffer_size); + if (umem == NULL) { + fprintf(stderr, "ERROR: Can't create umem \"%s\"\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* Open and configure the AF_XDP (xsk) socket */ + xsk_socket = xsk_configure_socket(&cfg, umem); + if (xsk_socket == NULL) { + fprintf(stderr, "ERROR: Can't setup AF_XDP socket \"%s\"\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* Receive and count packets than drop them */ + rx_and_process(&cfg, xsk_socket); + + /* Cleanup */ + xsk_socket__delete(xsk_socket->xsk); + xsk_umem__delete(umem->umem); + xdp_link_detach(cfg.ifindex, cfg.xdp_flags, 0); + + return /*EXIT_OK*/; +} From 2761449c8594ed70e6ce1eb7e302467e64e8b48e Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 7 Oct 2022 21:27:54 +0000 Subject: [PATCH 03/33] Updated build script to include af_xdp sample code --- build/machine-init.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/build/machine-init.sh b/build/machine-init.sh index 4776882..7d13642 100755 --- a/build/machine-init.sh +++ b/build/machine-init.sh @@ -124,3 +124,13 @@ echo "5--- installing ebpf dependencies ---" && \ mkdir build root && \ BUILD_STATIC_ONLY=y 
OBJDIR=build DESTDIR=root make install && \ cd ~ + +echo "6--- downloading xdp-project files" + cd /var/local/git && \ + git clone https://github.com/xdp-project/xdp-tutorial&& \ +# cd xdp-tutorial && \ +# git submodule update --init --recursive && \ +# cd src/extern/libbpf/src && \ +# mkdir build root && \ +# BUILD_STATIC_ONLY=y OBJDIR=build DESTDIR=root make install && \ + cd ~ \ No newline at end of file From ec99f8fc1ff3ce72c6c84b6b5effc3b5bcf2079b Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Tue, 11 Oct 2022 17:37:42 +0000 Subject: [PATCH 04/33] Included glog install in the machine-init.sh --- build/machine-init.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/build/machine-init.sh b/build/machine-init.sh index 4a9351c..1c2228e 100755 --- a/build/machine-init.sh +++ b/build/machine-init.sh @@ -152,7 +152,16 @@ echo "7--- installing sqlite orm lib dependencies ---" && \ sudo cmake --build build --target install && \ cd ~ -echo "8--- installing double conversion for folly ---" && \ +echo "8--- installing glog for folly ---" && \ + cd /var/local/git && \ + git clone https://github.com/google/glog.git && \ + cd glog && \ + sudo cmake -S . 
-B build -G "Unix Makefiles" && \ + sudo cmake --build build && \ + sudo cmake --build build --target install && \ + cd ~ + +echo "9--- installing double conversion for folly ---" && \ cd /var/local/git && \ git clone https://github.com/google/double-conversion.git && \ cd double-conversion && \ @@ -161,7 +170,7 @@ echo "8--- installing double conversion for folly ---" && \ sudo make install && \ cd ~ -echo "9--- installing folly lib for concurrent hashmap ---" && \ +echo "10--- installing folly lib for concurrent hashmap ---" && \ cd /var/local/git && \ git clone https://github.com/facebook/folly.git && \ cd folly && \ @@ -172,7 +181,7 @@ echo "9--- installing folly lib for concurrent hashmap ---" && \ sudo make install && \ cd ~ -echo "6--- downloading xdp-project files" +echo "11--- downloading xdp-project files" cd /var/local/git && \ git clone https://github.com/xdp-project/xdp-tutorial&& \ # cd xdp-tutorial && \ From 4be77b7ea8709907d5d4ae1d274612f4e88b9f40 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Wed, 12 Oct 2022 23:51:02 +0000 Subject: [PATCH 05/33] Added code to parse packet headers all the way to inner IP --- src/CMakeLists.txt | 36 +++++++++++++++++++ src/comm/af_xdp_user.cpp | 75 ++++++++++++++++++++++++++++------------ 2 files changed, 88 insertions(+), 23 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c691137..c07adba 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,6 +4,7 @@ set(SOURCES ./util/dispatch_queue.cpp ./comm/grpc_client.cpp comm/af_xdp_user.cpp ../include/af_xdp_user.h) +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -idirafter /usr/src/linux-headers-5.10.4/include/") #FIND_LIBRARY(LIBUUID_LIBRARIES uuid) #link_libraries(/usr/lib/x86_64-linux-gnu/libuuid.so) @@ -21,6 +22,41 @@ include_directories(/usr/local/include/folly) include_directories(/var/local/git/zeta/src/extern/libbpf/src) #libbpf.h include_directories(/var/local/git/xdp-tutorial/common) include_directories(/usr/local/include/sqlite_orm) 
#sqlite_orm.h +include_directories(/usr/include/glog) +#include_directories(SYSTEM $ENV{SDKTARGETSYSROOT} /usr/src/linux-headers-5.10.4/include/) +#include_directories(SYSTEM $ENV{SDKTARGETSYSROOT} /usr/src/linux-headers-5.10.4/arch/x86/include) +#include_directories(SYSTEM $ENV{SDKTARGETSYSROOT} /usr/src/linux-headers-5.10.4/include/linux) # try to include vxlan.h in order to parse vxlan header + + +# Try to find the installed headers - Start +# Find the kernel release +execute_process( + COMMAND uname -r + OUTPUT_VARIABLE KERNEL_RELEASE + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Find the headers +find_path(KERNELHEADERS_DIR + include/linux/user.h + PATHS /usr/src/linux-headers-${KERNEL_RELEASE} + ) + +message(STATUS "Kernel release: ${KERNEL_RELEASE}") +message(STATUS "Kernel headers: ${KERNELHEADERS_DIR}") + +if (KERNELHEADERS_DIR) + set(KERNELHEADERS_INCLUDE_DIRS + ${KERNELHEADERS_DIR}/include + ${KERNELHEADERS_DIR}/arch/x86/include + CACHE PATH "Kernel headers include dirs" + ) +# set(KERNELHEADERS_FOUND 1 CACHE STRING "Set to 1 if kernel headers were found") +#else (KERNELHEADERS_DIR) +# set(KERNELHEADERS_FOUND 0 CACHE STRING "Set to 1 if kernel headers were found") +endif (KERNELHEADERS_DIR) +# Try to find the installed headers - End + # Find Protobuf installation # Looks for protobuf-config.cmake file installed by Protobuf's cmake installation. 
diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp index 4aaba0a..7e22d46 100644 --- a/src/comm/af_xdp_user.cpp +++ b/src/comm/af_xdp_user.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include #include "af_xdp_user.h" @@ -13,12 +15,32 @@ #include #include #include - +#include +#include +#include +#include +#include #define NUM_FRAMES 4096 #define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE #define RX_BATCH_SIZE 64 #define INVALID_UMEM_FRAME UINT64_MAX +#define MSG_DONTWAIT = 0x40 + +/* VXLAN protocol (RFC 7348) header: + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |R|R|R|R|I|R|R|R| Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | VXLAN Network Identifier (VNI) | Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * I = VXLAN Network Identifier (VNI) present. + */ +struct vxlanhdr { + __be32 vx_flags; + __be32 vx_vni; +}; + struct xsk_umem_info { struct xsk_ring_prod fq; struct xsk_ring_cons cq; @@ -51,6 +73,7 @@ struct xsk_socket_info { static uint64_t xsk_alloc_umem_frame(struct xsk_socket_info *xsk) { + uint64_t frame; if (xsk->umem_frame_free == 0) return INVALID_UMEM_FRAME; @@ -72,7 +95,7 @@ static struct xsk_socket_info *xsk_configure_socket(struct config *cfg, xsk_info = static_cast(calloc(1, sizeof(*xsk_info))); if (!xsk_info) - return NULL; + return static_cast(nullptr); xsk_info->umem = umem; xsk_cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; @@ -117,54 +140,54 @@ static struct xsk_socket_info *xsk_configure_socket(struct config *cfg, error_exit: errno = -ret; - return NULL; + return static_cast(nullptr); } static const struct option_wrapper long_options[] = { - {{"help", no_argument, NULL, 'h' }, + {{"help", no_argument, nullptr, 'h' }, "Show help", "",false}, - {{"dev", required_argument, NULL, 'd' }, + {{"dev", required_argument, nullptr, 'd' }, "Operate on device ", "", true}, - {{"skb-mode", no_argument, NULL, 
'S' }, + {{"skb-mode", no_argument, nullptr, 'S' }, "Install XDP program in SKB (AKA generic) mode"}, - {{"native-mode", no_argument, NULL, 'N' }, + {{"native-mode", no_argument, nullptr, 'N' }, "Install XDP program in native mode"}, - {{"auto-mode", no_argument, NULL, 'A' }, + {{"auto-mode", no_argument, nullptr, 'A' }, "Auto-detect SKB or native mode"}, - {{"force", no_argument, NULL, 'F' }, + {{"force", no_argument, nullptr, 'F' }, "Force install, replacing existing program on interface"}, - {{"copy", no_argument, NULL, 'c' }, + {{"copy", no_argument, nullptr, 'c' }, "Force copy mode"}, - {{"zero-copy", no_argument, NULL, 'z' }, + {{"zero-copy", no_argument, nullptr, 'z' }, "Force zero-copy mode"}, - {{"queue", required_argument, NULL, 'Q' }, + {{"queue", required_argument, nullptr, 'Q' }, "Configure interface receive queue for AF_XDP, default=0"}, - {{"poll-mode", no_argument, NULL, 'p' }, + {{"poll-mode", no_argument, nullptr, 'p' }, "Use the poll() API waiting for packets to arrive"}, - {{"unload", no_argument, NULL, 'U' }, + {{"unload", no_argument, nullptr, 'U' }, "Unload XDP program instead of loading"}, - {{"quiet", no_argument, NULL, 'q' }, + {{"quiet", no_argument, nullptr, 'q' }, "Quiet mode (no output)"}, - {{"filename", required_argument, NULL, 1 }, + {{"filename", required_argument, nullptr, 1 }, "Load program from ", ""}, - {{"progsec", required_argument, NULL, 2 }, + {{"progsec", required_argument, nullptr, 2 }, "Load program in
of the ELF file", "
"}, - {{0, 0, NULL, 0 }, NULL, "",false} + {{0, 0, nullptr, 0 }, nullptr, "",false} }; static bool global_exit; @@ -189,7 +212,7 @@ static void complete_tx(struct xsk_socket_info *xsk) if (!xsk->outstanding_tx) return; - sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + sendto(xsk_socket__fd(xsk->xsk), NULL, 0, 0X40/*MSG_DONTWAIT*/, NULL, 0); /* Collect/free completed TX buffers */ @@ -251,6 +274,12 @@ static bool process_packet(struct xsk_socket_info *xsk, uint8_t tmp_mac[ETH_ALEN]; struct in6_addr tmp_ip; struct ethhdr *eth = (struct ethhdr *) pkt; + struct iphdr *ip = (struct iphdr *) (eth + sizeof(*eth)); + struct udphdr *udp = (struct udphdr *) (ip + sizeof(*ip)); + // TODO: find a way to get vxlan header + struct vxlanhdr* vxlan = (struct vxlanhdr *)(udp + sizeof(*udp)); + struct iphdr *inner_ip = (struct iphdr *)(vxlan + sizeof(*vxlan)); + printf("VNI: %ld, Inner src IP: %d, dest IP: %d", vxlan->vx_vni, inner_ip->saddr, inner_ip->daddr); struct ipv6hdr *ipv6 = (struct ipv6hdr *) (eth + 1); struct icmp6hdr *icmp = (struct icmp6hdr *) (ipv6 + 1); @@ -380,13 +409,13 @@ static struct xsk_umem_info *configure_xsk_umem(void *buffer, uint64_t size) umem = static_cast(calloc(1, sizeof(*umem))); if (!umem) - return NULL; + return nullptr; ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, - NULL); + nullptr); if (ret) { errno = -ret; - return NULL; + return nullptr; } umem->buffer = buffer; @@ -408,7 +437,7 @@ void af_xdp_user::run_af_xdp(int argc, char *argv[]) // TODO: fill in the file name and progsec in CPP style struct xsk_umem_info *umem; struct xsk_socket_info *xsk_socket; - struct bpf_object *bpf_obj = NULL; + struct bpf_object *bpf_obj = nullptr; /* Global shutdown handler*/ signal(SIGINT, exit_application); From cc1babb16b8b764c02b7276c670bf00f5c8f9f6c Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Wed, 26 Oct 2022 17:46:11 -0700 Subject: [PATCH 06/33] Check in code, right now the af_xdp_user is able to get packets from 
socket, and able to parse packet until the ARP level --- include/af_xdp_user.h | 7 +- src/CMakeLists.txt | 5 +- src/comm/af_xdp_user.cpp | 430 +++++++++++++++++++++++++++++++++++++-- src/comm/grpc_client.cpp | 1 + src/main.cpp | 5 + 5 files changed, 429 insertions(+), 19 deletions(-) diff --git a/include/af_xdp_user.h b/include/af_xdp_user.h index cd523f5..bb19a71 100644 --- a/include/af_xdp_user.h +++ b/include/af_xdp_user.h @@ -9,11 +9,14 @@ #include #include #include - +#ifdef __cplusplus +extern "C" +{ #include "common_params.h" #include "common_user_bpf_xdp.h" #include "common_libbpf.h" - +} +#endif static const char *__doc__ = "AF_XDP kernel bypass example\n"; class af_xdp_user { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c07adba..87702dd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,7 +3,8 @@ set(EXECUTABLE_OUTPUT_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../build/bin) set(SOURCES ./util/dispatch_queue.cpp ./comm/grpc_client.cpp - comm/af_xdp_user.cpp ../include/af_xdp_user.h) + comm/af_xdp_user.cpp + ) #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -idirafter /usr/src/linux-headers-5.10.4/include/") #FIND_LIBRARY(LIBUUID_LIBRARIES uuid) @@ -82,6 +83,8 @@ find_package(GLog REQUIRED) find_package(fmt REQUIRED) add_library(ArionAgentLib STATIC ${SOURCES}) +target_include_directories(ArionAgentLib PUBLIC /var/local/git/xdp-tutorial/common) +target_link_directories(ArionAgentLib PUBLIC /var/local/git/xdp-tutorial/common) #target_link_libraries(ArionAgentLib event) target_link_libraries(ArionAgentLib ssl) target_link_libraries(ArionAgentLib crypto) diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp index 7e22d46..1ca1177 100644 --- a/src/comm/af_xdp_user.cpp +++ b/src/comm/af_xdp_user.cpp @@ -8,6 +8,7 @@ #include #include #include +//#include #include #include #include "af_xdp_user.h" @@ -20,12 +21,19 @@ #include #include #include +#include +#include #define NUM_FRAMES 4096 #define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE #define 
RX_BATCH_SIZE 64 #define INVALID_UMEM_FRAME UINT64_MAX #define MSG_DONTWAIT = 0x40 +#define VXL_DSTPORT 0xb512 // UDP dport 4789(0x12b5) for VxLAN overlay + +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif /* VXLAN protocol (RFC 7348) header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ @@ -41,6 +49,52 @@ struct vxlanhdr { __be32 vx_vni; }; +struct vxlanhdr_internal { + /* Big endian! */ + __u8 rsvd1 : 3; + __u8 i_flag : 1; + __u8 rsvd2 : 4; + __u8 rsvd3[3]; + __u8 vni[3]; + __u8 rsvd4; +}; + + +/* + * This structure defines an ethernet arp header. + */ + +struct arphdr { + __be16 ar_hrd; /* format of hardware address */ + __be16 ar_pro; /* format of protocol address */ + unsigned char ar_hln; /* length of hardware address */ + unsigned char ar_pln; /* length of protocol address */ + __be16 ar_op; /* ARP opcode (command) */ + +#if 0 + /* + * Ethernet looks like this : This bit is variable sized however... + */ + unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ + unsigned char ar_sip[4]; /* sender IP address */ + unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ + unsigned char ar_tip[4]; /* target IP address */ +#endif + +}; + +struct arp_message { + uint16_t hrd; + uint16_t pro; + uint8_t hln; + uint8_t pln; + uint16_t op; + uint8_t sha[6]; + uint32_t spa; + uint8_t tha[6]; + uint32_t tpa; +} __attribute__((__packed__)); + struct xsk_umem_info { struct xsk_ring_prod fq; struct xsk_ring_cons cq; @@ -89,6 +143,9 @@ static struct xsk_socket_info *xsk_configure_socket(struct config *cfg, struct xsk_socket_config xsk_cfg; struct xsk_socket_info *xsk_info; uint32_t idx; + /* TODO: Fill in the prog_id of the 'transit' xdp program + otherwise, the xsk_socket__create will create a map with the name 'xsk_map' + */ uint32_t prog_id = 0; int i; int ret; @@ -250,6 +307,12 @@ static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 present) *sum = ~csum16_add(csum16_sub(~(*sum), old), present); } +static __be32 
trn_get_vni(const __u8 *vni) +{ + /* Big endian! */ + return (vni[0] << 16) | (vni[1] << 8) | vni[2]; +} + static bool process_packet(struct xsk_socket_info *xsk, uint64_t addr, uint32_t len) { @@ -274,20 +337,81 @@ static bool process_packet(struct xsk_socket_info *xsk, uint8_t tmp_mac[ETH_ALEN]; struct in6_addr tmp_ip; struct ethhdr *eth = (struct ethhdr *) pkt; - struct iphdr *ip = (struct iphdr *) (eth + sizeof(*eth)); - struct udphdr *udp = (struct udphdr *) (ip + sizeof(*ip)); - // TODO: find a way to get vxlan header - struct vxlanhdr* vxlan = (struct vxlanhdr *)(udp + sizeof(*udp)); - struct iphdr *inner_ip = (struct iphdr *)(vxlan + sizeof(*vxlan)); - printf("VNI: %ld, Inner src IP: %d, dest IP: %d", vxlan->vx_vni, inner_ip->saddr, inner_ip->daddr); + + struct ipv6hdr *ipv6 = (struct ipv6hdr *) (eth + 1); struct icmp6hdr *icmp = (struct icmp6hdr *) (ipv6 + 1); - if (ntohs(eth->h_proto) != ETH_P_IPV6 || - len < (sizeof(*eth) + sizeof(*ipv6) + sizeof(*icmp)) || - ipv6->nexthdr != IPPROTO_ICMPV6 || - icmp->icmp6_type != ICMPV6_ECHO_REQUEST) + if (ntohs(eth->h_proto) != ETH_P_IP +// || +// len < (sizeof(*eth) + sizeof(*ipv6) + sizeof(*icmp)) || +// ipv6->nexthdr != IPPROTO_ICMPV6 || +// icmp->icmp6_type != ICMPV6_ECHO_REQUEST + ) { + printf("%s\n", "returning false for this packet as it is NOT IP"); return false; + }else { + printf("Packet length: %ld\n", len); + printf("Outter eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" + "eth size: %d\n", + eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], + eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], + bpf_ntohs(eth->h_proto), + sizeof(*eth)); + struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); + struct in_addr outter_ip_src; + outter_ip_src.s_addr = ip->saddr; + struct in_addr outter_ip_dest; + outter_ip_dest.s_addr = ip->daddr; + printf("Outter ip src: %s, ip dest: %s\n" + "Outter 
ip ihl: %d, version: %d\n", + inet_ntoa(outter_ip_src),inet_ntoa(outter_ip_dest), + ip->ihl, ip->version); + struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); + printf("UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", + udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); + // TODO: find a way to get vxlan header + struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); + printf("VNI: %ld, \n",trn_get_vni(vxlan->vni)); + struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); + printf("inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", + inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], + inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], + inner_eth->h_proto); + struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); + struct in_addr inner_ip_src, inner_ip_dest; + inner_ip_src.s_addr = inner_ip->saddr; + inner_ip_dest.s_addr = inner_ip->daddr; +// printf("Inner src IP: %d, dest IP: %d\n", +// inet_ntoa(inner_ip_src), inet_ntoa(inner_ip_dest)); + struct arphdr *inner_arp = (struct arphdr *)(inner_eth + 1/*sizeof(*inner_eth)*/); + unsigned char *sha; + unsigned char *tha = NULL; + __u32 *sip, *tip; + sha = (unsigned char*)(inner_arp + 1); + sip = (__u32 *)(sha + ETH_ALEN); + tha = (unsigned char *)sip + sizeof(__u32); + tip = (__u32 *)(tha + ETH_ALEN); + struct in_addr inner_arp_src_ip, inner_arp_dest_ip; + inner_arp_src_ip.s_addr = (__be32)*sip; +// inner_arp_dest_ip.s_addr = (__be32)*tip; + + arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); + struct in_addr arp_src_ip; + arp_src_ip.s_addr = arp_msg->spa; + struct in_addr arp_dest_ip; + arp_dest_ip.s_addr = arp_msg->tpa; + printf("arp op: %d\n", + bpf_htons(inner_arp->ar_op)); + printf("arp source ip: %s, 
\n", + inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) + ); + printf("arp dest ip: %s, \n", + inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) + ); + + return false; + } memcpy(tmp_mac, eth->h_dest, ETH_ALEN); memcpy(eth->h_dest, eth->h_source, ETH_ALEN); @@ -359,6 +483,7 @@ static void handle_receive_packets(struct xsk_socket_info *xsk) } /* Process received packets */ + printf("Received %d packets\n", rcvd); for (i = 0; i < rcvd; i++) { uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; uint32_t len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; @@ -422,8 +547,250 @@ static struct xsk_umem_info *configure_xsk_umem(void *buffer, uint64_t size) return umem; } +static struct bpf_object *open_bpf_object(const char *file, int ifindex) +{ + int err; + struct bpf_object *obj; + struct bpf_map *map; + struct bpf_program *prog, *first_prog = NULL; + + struct bpf_object_open_attr open_attr = { + .file = file, + .prog_type = BPF_PROG_TYPE_XDP, + }; + + obj = bpf_object__open_xattr(&open_attr); + + bpf_object__for_each_program(prog, obj) { + bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); + bpf_program__set_ifindex(prog, ifindex); + if (!first_prog) + first_prog = prog; + } + + bpf_object__for_each_map(map, obj) { + if (!bpf_map__is_offload_neutral(map)) + bpf_map__set_ifindex(map, ifindex); + } + + if (!first_prog) { + fprintf(stderr, "ERR: file %s contains no programs\n", file); + return NULL; + } + + return obj; +} + +static int reuse_maps(struct bpf_object *obj, const char *path) +{ + struct bpf_map *map; + + if (!obj) + return -ENOENT; + + if (!path) + return -EINVAL; + + bpf_object__for_each_map(map, obj) { + if (bpf_map__name(map) == "xsks_map"){ + printf("Try to reuse map: %s\n", bpf_map__name(map)); + int len, err; + int pinned_map_fd; + char buf[PATH_MAX]; + + len = snprintf(buf, PATH_MAX, "%s/%s", path, bpf_map__name(map)); + if (len < 0) { + return -EINVAL; + } else if (len >= PATH_MAX) { + return -ENAMETOOLONG; + } + + pinned_map_fd = bpf_obj_get(buf); + 
if (pinned_map_fd < 0) { + printf("failed at bpf_obj_get for map: %s, buf: %s\n", bpf_map__name(map), buf); + return pinned_map_fd; + } + + err = bpf_map__reuse_fd(map, pinned_map_fd); + if (err) { + printf("failed at bpf_map__reuse_fd for map: %s\n", bpf_map__name(map)); + return err; + } + }else { + printf("Skipping map: %s\n", bpf_map__name(map)); + } + } + + return 0; +} + +struct bpf_object *load_bpf_object_file_reuse_maps(const char *file, + int ifindex, + const char *pin_dir) +{ + int err; + struct bpf_object *obj; + + obj = open_bpf_object(file, ifindex); + if (!obj) { + fprintf(stderr, "ERR: failed to open object %s\n", file); + return NULL; + } + + err = reuse_maps(obj, pin_dir); + if (err) { + fprintf(stderr, "ERR: failed to reuse maps for object %s, pin_dir=%s, err=%d\n", + file, pin_dir, err); + return NULL; + } + + err = bpf_object__load(obj); + if (err) { + fprintf(stderr, "ERR: loading BPF-OBJ file(%s) (%d): %s\n", + file, err, strerror(-err)); + return NULL; + } + + return obj; +} + +struct bpf_object *load_bpf_object_file(const char *filename, int ifindex) +{ + int first_prog_fd = -1; + struct bpf_object *obj; + int err; + + /* This struct allow us to set ifindex, this features is used for + * hardware offloading XDP programs (note this sets libbpf + * bpf_program->prog_ifindex and foreach bpf_map->map_ifindex). 
+ */ + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + .ifindex = ifindex, + }; + prog_load_attr.file = filename; + + /* Use libbpf for extracting BPF byte-code from BPF-ELF object, and + * loading this into the kernel via bpf-syscall + */ + err = bpf_prog_load_xattr(&prog_load_attr, &obj, &first_prog_fd); + if (err) { + fprintf(stderr, "ERR: loading BPF-OBJ file(%s) (%d): %s\n", + filename, err, strerror(-err)); + return NULL; + } + + /* Notice how a pointer to a libbpf bpf_object is returned */ + return obj; +} + +int xdp_link_attach(int ifindex, __u32 xdp_flags, int prog_fd) +{ + int err; + + /* libbpf provide the XDP net_device link-level hook attach helper */ + err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags); + if (err == -EEXIST && !(xdp_flags & XDP_FLAGS_UPDATE_IF_NOEXIST)) { + /* Force mode didn't work, probably because a program of the + * opposite type is loaded. Let's unload that and try loading + * again. + */ + + __u32 old_flags = xdp_flags; + + xdp_flags &= ~XDP_FLAGS_MODES; + xdp_flags |= (old_flags & XDP_FLAGS_SKB_MODE) ? 
XDP_FLAGS_DRV_MODE : XDP_FLAGS_SKB_MODE; + err = bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + if (!err) + err = bpf_set_link_xdp_fd(ifindex, prog_fd, old_flags); + } + if (err < 0) { + fprintf(stderr, "ERR: " + "ifindex(%d) link set xdp fd failed (%d): %s\n", + ifindex, -err, strerror(-err)); + + switch (-err) { + case EBUSY: + case EEXIST: + fprintf(stderr, "Hint: XDP already loaded on device" + " use --force to swap/replace\n"); + break; + case EOPNOTSUPP: + fprintf(stderr, "Hint: Native-XDP not supported" + " use --skb-mode or --auto-mode\n"); + break; + default: + break; + } + return EXIT_FAIL_XDP; + } + + return EXIT_OK; +} + +struct bpf_object *load_bpf_and_xdp_attach(struct config *cfg) +{ + struct bpf_program *bpf_prog; + struct bpf_object *bpf_obj; + int offload_ifindex = 0; + int prog_fd = -1; + int err; + + /* If flags indicate hardware offload, supply ifindex */ + if (cfg->xdp_flags & XDP_FLAGS_HW_MODE) + offload_ifindex = cfg->ifindex; + + /* Load the BPF-ELF object file and get back libbpf bpf_object */ + if (cfg->reuse_maps) + bpf_obj = load_bpf_object_file_reuse_maps(cfg->filename, + offload_ifindex, + cfg->pin_dir); + else + bpf_obj = load_bpf_object_file(cfg->filename, offload_ifindex); + if (!bpf_obj) { + fprintf(stderr, "ERR: loading file: %s\n", cfg->filename); + exit(EXIT_FAIL_BPF); + } + /* At this point: All XDP/BPF programs from the cfg->filename have been + * loaded into the kernel, and evaluated by the verifier. Only one of + * these gets attached to XDP hook, the others will get freed once this + * process exit. 
+ */ + + if (cfg->progsec[0]) + /* Find a matching BPF prog section name */ + bpf_prog = bpf_object__find_program_by_title(bpf_obj, cfg->progsec); + else + /* Find the first program */ + bpf_prog = bpf_program__next(NULL, bpf_obj); + + if (!bpf_prog) { + fprintf(stderr, "ERR: couldn't find a program in ELF section '%s'\n", cfg->progsec); + exit(EXIT_FAIL_BPF); + } + + strncpy(cfg->progsec, bpf_program__title(bpf_prog, false), sizeof(cfg->progsec)); + + prog_fd = bpf_program__fd(bpf_prog); + if (prog_fd <= 0) { + fprintf(stderr, "ERR: bpf_program__fd failed\n"); + exit(EXIT_FAIL_BPF); + } + + /* At this point: BPF-progs are (only) loaded by the kernel, and prog_fd + * is our select file-descriptor handle. Next step is attaching this FD + * to a kernel hook point, in this case XDP net_device link-level hook. + */ + err = xdp_link_attach(cfg->ifindex, cfg->xdp_flags, prog_fd); + if (err) + exit(err); + + return bpf_obj; +} + void af_xdp_user::run_af_xdp(int argc, char *argv[]) { + printf("%s", "af_xdp started"); int ret; int xsks_map_fd; @@ -443,19 +810,49 @@ void af_xdp_user::run_af_xdp(int argc, char *argv[]) signal(SIGINT, exit_application); /* Command line options can change progsec*/ - parse_cmdline_args(argc, argv, long_options, &cfg, __doc__); +// parse_cmdline_args(argc, argv, long_options, &cfg, __doc__); + // TODO: Get rid of getting the config from argc/argv, hardcode it for the time being. 
+ // interface name + cfg.ifname = "enp4s0f1"; + cfg.ifindex = if_nametoindex(cfg.ifname); + // skb mode + cfg.xdp_flags &= ~XDP_FLAGS_MODES; + cfg.xdp_flags |= XDP_FLAGS_SKB_MODE; + cfg.xsk_bind_flags &= XDP_ZEROCOPY; + cfg.xsk_bind_flags |= XDP_COPY; + + // queue_id, default = 0 + cfg.xsk_if_queue = 0; + // NOT using poll + cfg.xsk_poll_mode = false; + // not doing unload this time + cfg.do_unload = false; + // progsec of the xdp program + std::string progsec_string = "transit"; + strncpy(cfg.progsec, progsec_string.c_str(), sizeof(cfg.progsec)); +// progsec_string.copy(cfg.progsec, progsec_string.size()); + + // absolute path for the xdp.o file + std::string file_name = "/trn_xdp/trn_transit_xdp_ebpf.o"; +// strncpy(cfg.filename, file_name.c_str(), sizeof(cfg.filename)); +// file_name.copy(cfg.filename, file_name.size()); + // reuse maps, try NOT to create a new map. + cfg.reuse_maps = true; + std::string pin_dir = "/sys/fs/bpf"; + strncpy(cfg.pin_dir, pin_dir.c_str(), sizeof(cfg.pin_dir)); +// pin_dir.copy(cfg.pin_dir, pin_dir.size()); /* Required option */ if (cfg.ifindex == -1) { printf("%s", "ERROR: Required option --dev missing\n\n"); - usage(argv[0], __doc__, long_options, (argc == 1)); +// usage(argv[0], __doc__, long_options, (argc == 1)); exit(EXIT_FAIL_OPTION); } /* Unload XDP program if requested */ if (cfg.do_unload) { - int rc = xdp_link_detach(cfg.ifindex, cfg.xdp_flags, 0); - exit(rc); +// int rc = xdp_link_detach(cfg.ifindex, cfg.xdp_flags, 0); + exit(-1); } /* Load custom program if configured */ @@ -476,6 +873,8 @@ void af_xdp_user::run_af_xdp(int argc, char *argv[]) strerror(xsks_map_fd)); exit(EXIT_FAILURE); } + } else { + printf("%s\n", "Empty config filename, not loading/attaching"); } /* Allow unlimited locking of memory, so all memory needed for packet @@ -511,14 +910,13 @@ void af_xdp_user::run_af_xdp(int argc, char *argv[]) strerror(errno)); exit(EXIT_FAILURE); } - /* Receive and count packets than drop them */ rx_and_process(&cfg, 
xsk_socket); /* Cleanup */ xsk_socket__delete(xsk_socket->xsk); xsk_umem__delete(umem->umem); - xdp_link_detach(cfg.ifindex, cfg.xdp_flags, 0); +// xdp_link_detach(cfg.ifindex, cfg.xdp_flags, 0); return /*EXIT_OK*/; } diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 9d72b13..52a7ad9 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -161,6 +161,7 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, &ep.hmac[0], &ep.hmac[1], &ep.hmac[2], &ep.hmac[3], &ep.hmac[4], &ep.hmac[5]); + //disabling the element udpate, so that all packets will be sent to user space program. int ebpf_rc = bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); // step #3 - async call to write/update to local db table 1 diff --git a/src/main.cpp b/src/main.cpp index a0cd18a..247e50c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -25,6 +25,7 @@ #include "marl/event.h" #include "marl/scheduler.h" #include "marl/waitgroup.h" +#include "af_xdp_user.h" using namespace std; using std::string; @@ -142,6 +143,10 @@ int main(int argc, char *argv[]) { g_arion_neighbor_table); }); + marl::schedule([=] { + auto af = af_xdp_user(); + af.run_af_xdp(argc, argv); + }); pause(); cleanup(); From 344ee7a50ce24c8304cf329600eb53315a2c40df Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Wed, 9 Nov 2022 16:29:39 -0800 Subject: [PATCH 07/33] With this commit, the ArionAgent is able to receive, parse, modify and tx packets; however, more investigation is needed in order to use the db functions --- include/af_xdp_user.h | 3 +- include/db_client.h | 41 +++- include/grpc_client.h | 3 +- src/CMakeLists.txt | 5 +- src/comm/af_xdp_user.cpp | 461 +++++++++++++++++++++++++++------------ src/comm/grpc_client.cpp | 3 +- src/main.cpp | 44 ++-- 7 files changed, 396 insertions(+), 164 deletions(-) diff --git a/include/af_xdp_user.h b/include/af_xdp_user.h index bb19a71..7d14621 100644 --- a/include/af_xdp_user.h +++ b/include/af_xdp_user.h @@ -9,6 +9,7 @@ #include #include 
#include +#include #ifdef __cplusplus extern "C" { @@ -24,7 +25,7 @@ class af_xdp_user { af_xdp_user() { printf("%s", "Start of af_xdp userspace program."); } - void run_af_xdp(int argc, char *argv[]); + void run_af_xdp(std::string table_name_neighbor_ebpf_map); private: }; diff --git a/include/db_client.h b/include/db_client.h index b797d4b..59b445e 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -16,6 +16,7 @@ #include #include #include "dispatch_queue.h" +#include "xdp/trn_datamodel.h" using namespace sqlite_orm; @@ -32,10 +33,10 @@ struct ProgrammingState { int version; }; // local db table 2 - neighbor ebpf programmed version -std::string g_local_db_path = "/var/local/arion/arion_wing.db"; +static std::string g_local_db_path = "/var/local/arion/arion_wing.db"; // Schema definition (create DB if not exists) or retrieved handle (get DB if exists already) of local db -auto local_db = make_storage(g_local_db_path, +static auto local_db = make_storage(g_local_db_path, make_table("neighbor", make_column("vni", &Neighbor::vni), make_column("vpc_ip", &Neighbor::vpc_ip), @@ -52,7 +53,7 @@ auto local_db = make_storage(g_local_db_path, ); // Create local db writer single thread execution queue -dispatch_queue local_db_writer_queue("Local db background write queue", 1); +static dispatch_queue local_db_writer_queue("Local db background write queue", 1); static int FindLKGVersion() { int lkg_ver = 0; @@ -89,3 +90,37 @@ static int FindLKGVersion() { return lkg_ver + 1; } + +/* SELECT host_ip, vpc_mac, host_mac + * FROM neighbor + * WHERE vni=%{vni} AND vpc_ip=%{vpc_ip} + * */ +static auto query_neighbor_statement = + local_db.prepare(select(columns(&Neighbor::host_ip, &Neighbor::vpc_mac, &Neighbor::host_mac), + where(is_equal((&Neighbor::vni), 0) and is_equal((&Neighbor::vpc_ip), "127.0.0.1")))); + +static endpoint_t GetNeighbor(int vni, std::string vpc_ip) { + endpoint_t found_neighbor; + printf("GetNeighbor with VNI: [%d], vpc_ip: [%s]\n", vni, 
vpc_ip.c_str()); + get<0>(query_neighbor_statement) = vni; + get<1>(query_neighbor_statement) = vpc_ip.c_str(); + printf("Statement: %s\n", query_neighbor_statement.sql().c_str()); + auto rows = local_db.execute(query_neighbor_statement); + printf("Found %ld rows\n", rows.size()); + for (auto& row : rows) { + struct sockaddr_in ep_hip; + inet_pton(AF_INET, get<0>(row).c_str(), &(ep_hip.sin_addr)); + found_neighbor.hip = ep_hip.sin_addr.s_addr; + + std::sscanf(get<1>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", + &found_neighbor.mac[0], &found_neighbor.mac[1], &found_neighbor.mac[2], + &found_neighbor.mac[3], &found_neighbor.mac[4], &found_neighbor.mac[5]); + + std::sscanf(get<2>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", + &found_neighbor.hmac[0], &found_neighbor.hmac[1], &found_neighbor.hmac[2], + &found_neighbor.hmac[3], &found_neighbor.hmac[4], &found_neighbor.hmac[5]); + + printf("host_ip: %s, vpc_mac: %s, host_mac: %s\n", get<0>(row).c_str(), get<1>(row).c_str(), get<2>(row).c_str()); + } + return found_neighbor; +} \ No newline at end of file diff --git a/include/grpc_client.h b/include/grpc_client.h index 7c9969a..90ad6e7 100644 --- a/include/grpc_client.h +++ b/include/grpc_client.h @@ -45,6 +45,8 @@ class ArionMasterWatcherImpl final : public Watch::Service { bool a = chan_ == nullptr; + int fd_neighbor_ebpf_map = -1; + private: std::string server_address; @@ -54,7 +56,6 @@ class ArionMasterWatcherImpl final : public Watch::Service { std::string table_name_neighbor_ebpf_map; - int fd_neighbor_ebpf_map = -1; // key std::string is '-', value is inserted version of this neighbor folly::ConcurrentHashMap neighbor_task_map; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 87702dd..fc779cb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,8 +3,9 @@ set(EXECUTABLE_OUTPUT_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../build/bin) set(SOURCES ./util/dispatch_queue.cpp ./comm/grpc_client.cpp - comm/af_xdp_user.cpp - ) + comm/af_xdp_user.cpp +# 
db/db_client.cpp + ) #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -idirafter /usr/src/linux-headers-5.10.4/include/") #FIND_LIBRARY(LIBUUID_LIBRARIES uuid) diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp index 1ca1177..35206a5 100644 --- a/src/comm/af_xdp_user.cpp +++ b/src/comm/af_xdp_user.cpp @@ -11,6 +11,8 @@ //#include #include #include +//#include +//#include #include "af_xdp_user.h" #include #include @@ -23,6 +25,8 @@ #include #include #include +#include "xdp/trn_datamodel.h" +//#include "xdp/trn_kern.h" #define NUM_FRAMES 4096 #define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE @@ -30,25 +34,14 @@ #define INVALID_UMEM_FRAME UINT64_MAX #define MSG_DONTWAIT = 0x40 #define VXL_DSTPORT 0xb512 // UDP dport 4789(0x12b5) for VxLAN overlay - +/* ARP protocol opcodes. */ +#define ARPOP_REQUEST 1 /* ARP request */ +#define ARPOP_REPLY 2 /* ARP reply */ #ifndef PATH_MAX + #define PATH_MAX 4096 #endif -/* VXLAN protocol (RFC 7348) header: - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |R|R|R|R|I|R|R|R| Reserved | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | VXLAN Network Identifier (VNI) | Reserved | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * I = VXLAN Network Identifier (VNI) present. - */ -struct vxlanhdr { - __be32 vx_flags; - __be32 vx_vni; -}; - struct vxlanhdr_internal { /* Big endian! */ __u8 rsvd1 : 3; @@ -60,28 +53,28 @@ struct vxlanhdr_internal { }; -/* - * This structure defines an ethernet arp header. - */ - -struct arphdr { - __be16 ar_hrd; /* format of hardware address */ - __be16 ar_pro; /* format of protocol address */ - unsigned char ar_hln; /* length of hardware address */ - unsigned char ar_pln; /* length of protocol address */ - __be16 ar_op; /* ARP opcode (command) */ - -#if 0 - /* - * Ethernet looks like this : This bit is variable sized however... 
- */ - unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ - unsigned char ar_sip[4]; /* sender IP address */ - unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ - unsigned char ar_tip[4]; /* target IP address */ -#endif - -}; +///* +// * This structure defines an ethernet arp header. +// */ +// +//struct arphdr { +// __be16 ar_hrd; /* format of hardware address */ +// __be16 ar_pro; /* format of protocol address */ +// unsigned char ar_hln; /* length of hardware address */ +// unsigned char ar_pln; /* length of protocol address */ +// __be16 ar_op; /* ARP opcode (command) */ +// +//#if 0 +// /* +// * Ethernet looks like this : This bit is variable sized however... +// */ +// unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ +// unsigned char ar_sip[4]; /* sender IP address */ +// unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ +// unsigned char ar_tip[4]; /* target IP address */ +//#endif +// +//}; struct arp_message { uint16_t hrd; @@ -160,16 +153,32 @@ static struct xsk_socket_info *xsk_configure_socket(struct config *cfg, xsk_cfg.libbpf_flags = 0; xsk_cfg.xdp_flags = cfg->xdp_flags; xsk_cfg.bind_flags = cfg->xsk_bind_flags; + if (!umem->umem) { + printf("%s\n", "umem is empty!"); + } + if (!(&xsk_info->xsk)) { + printf("%s\n", "xsk_ptr is empty!"); + } + if (!(&xsk_info->tx)) { + printf("%s\n", "tx is empty!"); + } + if (!(&xsk_info->rx)) { + printf("%s\n", "rx is empty!"); + } ret = xsk_socket__create(&xsk_info->xsk, cfg->ifname, cfg->xsk_if_queue, umem->umem, &xsk_info->rx, &xsk_info->tx, &xsk_cfg); - if (ret) + if (ret) { + printf("xsk_socket__create failed with ret: [%ld]\n", ret); goto error_exit; + } ret = bpf_get_link_xdp_id(cfg->ifindex, &prog_id, cfg->xdp_flags); - if (ret) + if (ret) { + printf("bpf_get_link_xdp_id failed\n"); goto error_exit; + } /* Initialize umem frame allocation */ @@ -183,8 +192,10 @@ static struct xsk_socket_info *xsk_configure_socket(struct config *cfg, 
XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx); - if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) + if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) { + printf("xsk_ring_prod__reserve failed\n"); goto error_exit; + } for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i ++) *xsk_ring_prod__fill_addr(&xsk_info->umem->fq, idx++) = @@ -307,25 +318,119 @@ static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 present) *sum = ~csum16_add(csum16_sub(~(*sum), old), present); } +static inline void trn_set_mac(void *dst, unsigned char *mac) +{ + unsigned short *d = static_cast(dst); + unsigned short *s = (unsigned short *)mac; + + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; +} + +static inline void trn_set_dst_mac(void *data, unsigned char *dst_mac) +{ + trn_set_mac(data, dst_mac); +} + +static inline void trn_set_src_mac(void *data, unsigned char *src_mac) +{ + uint8_t *tmp = static_cast(data); + trn_set_mac((void*)(tmp + 6), src_mac); +} + static __be32 trn_get_vni(const __u8 *vni) { /* Big endian! */ return (vni[0] << 16) | (vni[1] << 8) | vni[2]; } +static inline void trn_set_src_ip(void *data, void *data_end, __u32 saddr) +{ + int off = offsetof(struct iphdr, saddr); + uint8_t *tmp = static_cast(data); + + __u32 *addr = (__u32*)(tmp + off); + if ((void *)addr > data_end) + return; + + *addr = saddr; +} + +static inline void trn_set_dst_ip(void *data, void *data_end, __u32 daddr) +{ + int off = offsetof(struct iphdr, daddr); + uint8_t *tmp = static_cast(data); + + __u32 *addr = (__u32 *)(tmp + off); + if ((void *)addr > data_end) + return; + + *addr = daddr; +} + +static inline __u16 trn_csum_fold_helper(__u64 csum) +{ + int i; +#pragma unroll + for (i = 0; i < 4; i++) { + if (csum >> 16) + csum = (csum & 0xffff) + (csum >> 16); + } + return ~csum; +} + +static inline void trn_ipv4_csum_inline(void *iph, __u64 *csum) +{ + __u16 *next_iph_u16 = (__u16 *)iph; +#pragma clang loop unroll(full) + for (int i = 0; i> 1; i++) { + *csum += *next_iph_u16++; + } + *csum = 
trn_csum_fold_helper(*csum); +} + +static inline void trn_set_src_dst_ip_csum(struct iphdr *ip, + __u32 saddr, __u32 daddr, void *data_end) +{ + /* Since the packet destination is being rewritten we also + decrement the TTL */ + ip->ttl--; + + __u64 csum = 0; + trn_set_src_ip(ip, data_end, saddr); + trn_set_dst_ip(ip, data_end, daddr); + csum = 0; + ip->check = 0; + trn_ipv4_csum_inline(ip, &csum); + ip->check = csum; + + printf("Modified IP Address, src: 0x%x, dst: 0x%x, csum: 0x%x\n", + ip->saddr, ip->daddr, ip->check); +} + +static inline void trn_swap_src_dst_mac(void *data) +{ + unsigned short *p = static_cast(data); + unsigned short tmp[3]; + + tmp[0] = p[0]; + tmp[1] = p[1]; + tmp[2] = p[2]; + p[0] = p[3]; + p[1] = p[4]; + p[2] = p[5]; + p[3] = tmp[0]; + p[4] = tmp[1]; + p[5] = tmp[2]; +} + static bool process_packet(struct xsk_socket_info *xsk, - uint64_t addr, uint32_t len) + uint64_t addr, uint32_t len, int* fd) { + printf(">>>>>>>>>> Begin processing packet >>>>>>>>>>\n"); uint8_t *pkt = static_cast(xsk_umem__get_data(xsk->umem->buffer, addr)); - /* Lesson#3: Write an IPv6 ICMP ECHO parser to send responses - * - * Some assumptions to make it easier: - * - No VLAN handling - * - Only if nexthdr is ICMP - * - Just return all data with MAC/IP swapped, and type set to - * ICMPV6_ECHO_REPLY - * - Recalculate the icmp checksum */ if (true) { /* @@ -335,123 +440,201 @@ static bool process_packet(struct xsk_socket_info *xsk, int ret; uint32_t tx_idx = 0; uint8_t tmp_mac[ETH_ALEN]; - struct in6_addr tmp_ip; + // parse outer eth header struct ethhdr *eth = (struct ethhdr *) pkt; - - struct ipv6hdr *ipv6 = (struct ipv6hdr *) (eth + 1); - struct icmp6hdr *icmp = (struct icmp6hdr *) (ipv6 + 1); - - if (ntohs(eth->h_proto) != ETH_P_IP -// || -// len < (sizeof(*eth) + sizeof(*ipv6) + sizeof(*icmp)) || -// ipv6->nexthdr != IPPROTO_ICMPV6 || -// icmp->icmp6_type != ICMPV6_ECHO_REQUEST - ) { + if (ntohs(eth->h_proto) != ETH_P_IP) { printf("%s\n", "returning false for 
this packet as it is NOT IP"); return false; - }else { - printf("Packet length: %ld\n", len); - printf("Outter eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" + } + printf("Packet length: %ld\n", len); + printf("Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" + "eth size: %d\n", + eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], + eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], + bpf_ntohs(eth->h_proto), + sizeof(*eth)); + + // parse outer IP header + struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); + struct in_addr outer_ip_src; + outer_ip_src.s_addr = ip->saddr; + struct in_addr outer_ip_dest; + outer_ip_dest.s_addr = ip->daddr; + printf("Outer ip src: %s, ip dest: %s\n" + "Outer ip ihl: %d, version: %d\n", + inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), + ip->ihl, ip->version); + + // parse UDP header + struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); + printf("UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", + udp->dest, udp->source, (udp->dest==VXL_DSTPORT? 
"true" : "false")); + + // parse VXLAN header + struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); + printf("VNI: %ld, \n",trn_get_vni(vxlan->vni)); + + // parse inner eth header + struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); + printf("inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", + inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], + inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], + inner_eth->h_proto); + + // TODO: Add inner IP support, refer to trn_process_inner_ip + // parse inner IP header +// struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); +// struct in_addr inner_ip_src, inner_ip_dest; +// inner_ip_src.s_addr = inner_ip->saddr; +// inner_ip_dest.s_addr = inner_ip->daddr; + + // parse inner arp header + arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); + struct in_addr arp_src_ip; + arp_src_ip.s_addr = arp_msg->spa; + struct in_addr arp_dest_ip; + arp_dest_ip.s_addr = arp_msg->tpa; + printf("arp op: %d\n", + bpf_htons(arp_msg->op)); + printf("arp source ip: %s, \n", + inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) + ); + printf("arp dest ip: %s, \n", + inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) + ); + endpoint_key_t epkey; + endpoint_t ep_value; +// ep_value = GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(arp_src_ip)); + if (ep_value.hip > 0) { + epkey.vni = trn_get_vni(vxlan->vni); + struct sockaddr_in ep_ip; + inet_pton(AF_INET, inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); + epkey.ip = ep_ip.sin_addr.s_addr; + // we now have key and value, can modify the packet and update the map now. 
+ int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); + printf("Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", + inet_ntoa(arp_src_ip), trn_get_vni(vxlan->vni), ebpf_rc); +// // TODO: step #3 - async call to write/update to local db table 1 +// local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { +// get<0>(add_or_update_neighbor_db_stmt) = { vni, vpc_ip, host_ip, vpc_mac, host_mac, ver }; +// local_db.execute(add_or_update_neighbor_db_stmt); +// }); + + /* Modify pkt for inner ARP response */ + arp_msg->op = bpf_htons(ARPOP_REPLY); + trn_set_mac(arp_msg->tha, arp_msg->sha); + trn_set_mac(arp_msg->sha, ep_value.mac); + + __u32 tmp_ip = arp_msg->spa;//*sip; + arp_msg->spa = arp_msg->tpa;//*tip; + arp_msg->tpa = tmp_ip; + + /* Modify inner EitherHdr, pretend it's from target */ + trn_set_dst_mac(inner_eth, inner_eth->h_source); + trn_set_src_mac(inner_eth, ep_value.mac); + + /* Keep overlay header, swap outer IP header */ + trn_set_src_dst_ip_csum(ip, ip->daddr, ip->saddr, (eth + len)); + trn_swap_src_dst_mac(pkt); + + /* + * Packet modification finished, read packet content again, in order to verify the mod + * */ + + struct ethhdr *eth = (struct ethhdr *) pkt; + + if (ntohs(eth->h_proto) != ETH_P_IP) { + printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); + return false; + } + printf("AFTER MOD: Packet length: %ld\n", len); + printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" "eth size: %d\n", eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], bpf_ntohs(eth->h_proto), sizeof(*eth)); + + // parse outer IP header struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); - struct in_addr outter_ip_src; - outter_ip_src.s_addr = ip->saddr; - struct in_addr 
outter_ip_dest; - outter_ip_dest.s_addr = ip->daddr; - printf("Outter ip src: %s, ip dest: %s\n" - "Outter ip ihl: %d, version: %d\n", - inet_ntoa(outter_ip_src),inet_ntoa(outter_ip_dest), - ip->ihl, ip->version); + struct in_addr outer_ip_src; + outer_ip_src.s_addr = ip->saddr; + struct in_addr outer_ip_dest; + outer_ip_dest.s_addr = ip->daddr; + printf("AFTER MOD: Outer ip src: %s, ip dest: %s\n" + "AFTER MOD: Outer ip ihl: %d, version: %d\n", + inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), + ip->ihl, ip->version); + + // parse UDP header struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); - printf("UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", + printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); - // TODO: find a way to get vxlan header + + // parse VXLAN header struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); - printf("VNI: %ld, \n",trn_get_vni(vxlan->vni)); + printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); + + // parse inner eth header struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); - printf("inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", + printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], inner_eth->h_proto); - struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); - struct in_addr inner_ip_src, inner_ip_dest; - inner_ip_src.s_addr = inner_ip->saddr; - inner_ip_dest.s_addr = inner_ip->daddr; -// printf("Inner src IP: %d, dest IP: %d\n", -// inet_ntoa(inner_ip_src), inet_ntoa(inner_ip_dest)); - struct arphdr 
*inner_arp = (struct arphdr *)(inner_eth + 1/*sizeof(*inner_eth)*/); - unsigned char *sha; - unsigned char *tha = NULL; - __u32 *sip, *tip; - sha = (unsigned char*)(inner_arp + 1); - sip = (__u32 *)(sha + ETH_ALEN); - tha = (unsigned char *)sip + sizeof(__u32); - tip = (__u32 *)(tha + ETH_ALEN); - struct in_addr inner_arp_src_ip, inner_arp_dest_ip; - inner_arp_src_ip.s_addr = (__be32)*sip; -// inner_arp_dest_ip.s_addr = (__be32)*tip; + // TODO: Add inner IP support, refer to trn_process_inner_ip + // parse inner IP header + // struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); + // struct in_addr inner_ip_src, inner_ip_dest; + // inner_ip_src.s_addr = inner_ip->saddr; + // inner_ip_dest.s_addr = inner_ip->daddr; + + // parse inner arp header arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); struct in_addr arp_src_ip; arp_src_ip.s_addr = arp_msg->spa; struct in_addr arp_dest_ip; arp_dest_ip.s_addr = arp_msg->tpa; - printf("arp op: %d\n", - bpf_htons(inner_arp->ar_op)); - printf("arp source ip: %s, \n", + printf("AFTER MOD: arp op: %d\n", + bpf_htons(arp_msg->op)); + printf("AFTER MOD: arp source ip: %s, \n", inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) ); - printf("arp dest ip: %s, \n", + printf("AFTER MOD: arp dest ip: %s, \n", inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) ); - - return false; - } - - memcpy(tmp_mac, eth->h_dest, ETH_ALEN); - memcpy(eth->h_dest, eth->h_source, ETH_ALEN); - memcpy(eth->h_source, tmp_mac, ETH_ALEN); - - memcpy(&tmp_ip, &ipv6->saddr, sizeof(tmp_ip)); - memcpy(&ipv6->saddr, &ipv6->daddr, sizeof(tmp_ip)); - memcpy(&ipv6->daddr, &tmp_ip, sizeof(tmp_ip)); - - icmp->icmp6_type = ICMPV6_ECHO_REPLY; - - csum_replace2(&icmp->icmp6_cksum, - htons(ICMPV6_ECHO_REQUEST << 8), - htons(ICMPV6_ECHO_REPLY << 8)); - - /* Here we sent the packet out of the receive port. Note that + /* Here we sent the packet out of the receive port. Note that * we allocate one entry and schedule it. 
Your design would be * faster if you do batch processing/transmission */ - ret = xsk_ring_prod__reserve(&xsk->tx, 1, &tx_idx); - if (ret != 1) { - /* No more transmit slots, drop the packet */ - return false; - } + ret = xsk_ring_prod__reserve(&xsk->tx, 1, &tx_idx); + if (ret != 1) { + /* No more transmit slots, drop the packet */ + return false; + } + + xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = addr; + xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = len; + xsk_ring_prod__submit(&xsk->tx, 1); + xsk->outstanding_tx++; - xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = addr; - xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = len; - xsk_ring_prod__submit(&xsk->tx, 1); - xsk->outstanding_tx++; + xsk->stats.tx_bytes += len; + xsk->stats.tx_packets++; + printf("Packet sent via tx queue\n"); + printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); - xsk->stats.tx_bytes += len; - xsk->stats.tx_packets++; - return true; + return true; + } + printf("Endpoing hip == 0, returning false.\n"); + return false; } return false; } -static void handle_receive_packets(struct xsk_socket_info *xsk) +static void handle_receive_packets(struct xsk_socket_info *xsk, int* fd) { unsigned int rcvd, stock_frames, i; uint32_t idx_rx = 0, idx_fq = 0; @@ -488,7 +671,7 @@ static void handle_receive_packets(struct xsk_socket_info *xsk) uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; uint32_t len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; - if (!process_packet(xsk, addr, len)) + if (!process_packet(xsk, addr, len, fd)) xsk_free_umem_frame(xsk, addr); xsk->stats.rx_bytes += len; @@ -499,10 +682,11 @@ static void handle_receive_packets(struct xsk_socket_info *xsk) /* Do we need to wake up the kernel for transmission */ complete_tx(xsk); + printf("tx completed\n"); } static void rx_and_process(struct config *cfg, - struct xsk_socket_info *xsk_socket) + struct xsk_socket_info *xsk_socket, int* fd) { struct pollfd fds[2]; int ret, nfds = 1; @@ -510,14 +694,14 @@ static 
void rx_and_process(struct config *cfg, memset(fds, 0, sizeof(fds)); fds[0].fd = xsk_socket__fd(xsk_socket->xsk); fds[0].events = POLLIN; - + printf("%s\n", "Entering while loop to process packets."); while(!global_exit) { if (cfg->xsk_poll_mode) { ret = poll(fds, nfds, -1); if (ret <= 0 || ret > 1) continue; } - handle_receive_packets(xsk_socket); + handle_receive_packets(xsk_socket, fd); } } @@ -788,10 +972,17 @@ struct bpf_object *load_bpf_and_xdp_attach(struct config *cfg) return bpf_obj; } -void af_xdp_user::run_af_xdp(int argc, char *argv[]) +void af_xdp_user::run_af_xdp(std::string table_name_neighbor_ebpf_map) { + printf("%s", "af_xdp started\n"); + int fd_neighbor_ebpf_map = bpf_obj_get(table_name_neighbor_ebpf_map.c_str()); + if (fd_neighbor_ebpf_map < 0) { + printf("Failed to get xdp neighbor endpoint map fd, exiting\n"); + return; + } else { + printf("Got xdp neighbor endpoint map fd %d\n", fd_neighbor_ebpf_map); + } - printf("%s", "af_xdp started"); int ret; int xsks_map_fd; void *packet_buffer; @@ -911,7 +1102,7 @@ void af_xdp_user::run_af_xdp(int argc, char *argv[]) exit(EXIT_FAILURE); } /* Receive and count packets than drop them */ - rx_and_process(&cfg, xsk_socket); + rx_and_process(&cfg, xsk_socket, &fd_neighbor_ebpf_map); /* Cleanup */ xsk_socket__delete(xsk_socket->xsk); diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 52a7ad9..c309d8f 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -162,8 +162,9 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, &ep.hmac[3], &ep.hmac[4], &ep.hmac[5]); //disabling the element udpate, so that all packets will be sent to user space program. 
- int ebpf_rc = bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); + int ebpf_rc = 0;//bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); + printf("Inserted this neighbor into map: vip: %s, vni: %s\n", vpc_ip.c_str(), vni); // step #3 - async call to write/update to local db table 1 local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { get<0>(add_or_update_neighbor_db_stmt) = { vni, vpc_ip, host_ip, vpc_mac, host_mac, ver }; diff --git a/src/main.cpp b/src/main.cpp index 247e50c..14026b4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -12,7 +12,7 @@ // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include "grpc_client.h" +//#include "grpc_client.h" #include #include @@ -36,7 +36,7 @@ static char EMPTY_STRING[] = ""; // Global variables std::thread *g_grpc_client_thread = NULL; -ArionMasterWatcherImpl *g_grpc_client = NULL; +//ArionMasterWatcherImpl *g_grpc_client = NULL; string g_arion_master_address = EMPTY_STRING; string g_arion_master_port = "9090"; @@ -62,16 +62,16 @@ static void cleanup() { printf("%s", "Program exiting, cleaning up...\n"); // optional: delete all global objects allocated by libprotobuf. 
- google::protobuf::ShutdownProtobufLibrary(); +// google::protobuf::ShutdownProtobufLibrary(); // stop the grpc client - if (g_grpc_client != NULL) { - delete g_grpc_client; - g_grpc_client = NULL; - printf("%s", "Cleaned up grpc client.\n"); - } else { - printf("%s", "Unable to delete grpc client pointer since it is null.\n"); - } +// if (g_grpc_client != NULL) { +// delete g_grpc_client; +// g_grpc_client = NULL; +// printf("%s", "Cleaned up grpc client.\n"); +// } else { +// printf("%s", "Unable to delete grpc client pointer since it is null.\n"); +// } if (g_grpc_client_thread != NULL) { delete g_grpc_client_thread; @@ -98,10 +98,10 @@ int main(int argc, char *argv[]) { printf("%s", "Arion Agent started...\n"); // Register input key signal handlers - signal(SIGINT, signal_handler); - signal(SIGTERM, signal_handler); +// signal(SIGINT, signal_handler); +// signal(SIGTERM, signal_handler); - while ((option = getopt(argc, argv, "a:p:g:d:")) != -1) { + while ((option = getopt(argc, argv, "a:p:g:d")) != -1) { switch (option) { case 'a': g_arion_master_address = optarg; @@ -126,6 +126,8 @@ int main(int argc, char *argv[]) { } } + printf("Read arion master IP: [%s], arion master port: [%s], arion group name: [%s]\n", + g_arion_master_address.c_str(), g_arion_master_port.c_str(), g_arion_group.c_str()); // Create marl scheduler using all the logical processors available to the process. 
// Bind this scheduler to the main thread so we can call marl::schedule() marl::Scheduler::Config cfg_bind_hw_cores; @@ -135,17 +137,17 @@ int main(int argc, char *argv[]) { defer(task_scheduler.unbind()); // Create a separate thread to run the grpc client of List & Watch Arion Master (first sync from a known revision, and then watch for future updates) - g_grpc_client = new ArionMasterWatcherImpl(); - marl::schedule([=] { - g_grpc_client->RunClient(g_arion_master_address, - g_arion_master_port, - g_arion_group, - g_arion_neighbor_table); - }); +// g_grpc_client = new ArionMasterWatcherImpl(); +// marl::schedule([=] { +// g_grpc_client->RunClient(g_arion_master_address, +// g_arion_master_port, +// g_arion_group, +// g_arion_neighbor_table); +// }); marl::schedule([=] { auto af = af_xdp_user(); - af.run_af_xdp(argc, argv); + af.run_af_xdp(g_arion_neighbor_table); }); pause(); cleanup(); From b8943abf325a12531294831d74e4726e869da7f6 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Wed, 9 Nov 2022 18:04:22 -0800 Subject: [PATCH 08/33] Rewrote db_client.h so that it doesn't crash; next is to do the same for grpc_client --- include/db_client.h | 247 +++++++++++++++++++++++++++------------ src/comm/af_xdp_user.cpp | 4 +- src/comm/grpc_client.cpp | 22 ++-- src/main.cpp | 1 + 4 files changed, 187 insertions(+), 87 deletions(-) diff --git a/include/db_client.h b/include/db_client.h index 59b445e..551e0ae 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -20,6 +20,7 @@ using namespace sqlite_orm; + struct Neighbor { int vni; std::string vpc_ip; @@ -36,29 +37,134 @@ struct ProgrammingState { static std::string g_local_db_path = "/var/local/arion/arion_wing.db"; // Schema definition (create DB if not exists) or retrieved handle (get DB if exists already) of local db -static auto local_db = make_storage(g_local_db_path, - make_table("neighbor", - make_column("vni", &Neighbor::vni), - make_column("vpc_ip", &Neighbor::vpc_ip), - make_column("host_ip", 
&Neighbor::host_ip), - make_column("vpc_mac", &Neighbor::vpc_mac), - make_column("host_mac", &Neighbor::host_mac), - make_column("version", &Neighbor::version), - primary_key(&Neighbor::vni, &Neighbor::vpc_ip) - ), - make_table("journal", - make_column("version", &ProgrammingState::version), - primary_key(&ProgrammingState::version) - ) -); +//static auto local_db = make_storage(g_local_db_path, +// make_table("neighbor", +// make_column("vni", &Neighbor::vni), +// make_column("vpc_ip", &Neighbor::vpc_ip), +// make_column("host_ip", &Neighbor::host_ip), +// make_column("vpc_mac", &Neighbor::vpc_mac), +// make_column("host_mac", &Neighbor::host_mac), +// make_column("version", &Neighbor::version), +// primary_key(&Neighbor::vni, &Neighbor::vpc_ip) +// ), +// make_table("journal", +// make_column("version", &ProgrammingState::version), +// primary_key(&ProgrammingState::version) +// ) +//); // Create local db writer single thread execution queue -static dispatch_queue local_db_writer_queue("Local db background write queue", 1); - -static int FindLKGVersion() { - int lkg_ver = 0; +//static dispatch_queue local_db_writer_queue("Local db background write queue", 1); +// +//static int FindLKGVersion() { +// int lkg_ver = 0; +// +// /* original sql is +// SELECT MIN(mo.version) + 1 +// FROM journal AS mo +// WHERE NOT EXISTS +// ( +// SELECT 0 - mi.version +// FROM journal AS mi +// WHERE mo.version + 1 = mi.version +// ); +// */ +// +// using als_mo = alias_a; +// using als_mi = alias_b; +// auto ver_gaps = local_db.select(alias_column(&ProgrammingState::version), +// from(), +// where(not exists( +// select(0 - c(alias_column(&ProgrammingState::version)), +// from(), +// where(is_equal(c(alias_column(&ProgrammingState::version)) + 1, alias_column(&ProgrammingState::version))) +// )))); +// +// // lkg version: +// // case 1 - if no ver gap, the query above will return the max version (since this version is already programmed, so return max + 1) +// // case 2 - if there's 
ver gap, then always locate the min ver gap (as above, return minVerGap + 1) +// // case 3 - if the table is empty like new launched instance, then always sync/watch from server with version 1 +// // (since server syncs including the version agent provides, so sync/watch from version 1 means sync everything +// if (ver_gaps.size() > 0) { +// lkg_ver = *std::min_element(ver_gaps.begin(), ver_gaps.end()); +// } +// +// return lkg_ver + 1; +//} - /* original sql is +/* SELECT host_ip, vpc_mac, host_mac + * FROM neighbor + * WHERE vni=%{vni} AND vpc_ip=%{vpc_ip} + * */ +//static auto query_neighbor_statement = +// local_db.prepare(select(columns(&Neighbor::host_ip, &Neighbor::vpc_mac, &Neighbor::host_mac), +// where(is_equal((&Neighbor::vni), 0) and is_equal((&Neighbor::vpc_ip), "127.0.0.1")))); + +//static endpoint_t GetNeighbor(int vni, std::string vpc_ip) { +// endpoint_t found_neighbor; +// printf("GetNeighbor with VNI: [%d], vpc_ip: [%s]\n", vni, vpc_ip.c_str()); +// get<0>(query_neighbor_statement) = vni; +// get<1>(query_neighbor_statement) = vpc_ip.c_str(); +// printf("Statement: %s\n", query_neighbor_statement.sql().c_str()); +// auto rows = local_db.execute(query_neighbor_statement); +// printf("Found %ld rows\n", rows.size()); +// for (auto& row : rows) { +// struct sockaddr_in ep_hip; +// inet_pton(AF_INET, get<0>(row).c_str(), &(ep_hip.sin_addr)); +// found_neighbor.hip = ep_hip.sin_addr.s_addr; +// +// std::sscanf(get<1>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", +// &found_neighbor.mac[0], &found_neighbor.mac[1], &found_neighbor.mac[2], +// &found_neighbor.mac[3], &found_neighbor.mac[4], &found_neighbor.mac[5]); +// +// std::sscanf(get<2>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", +// &found_neighbor.hmac[0], &found_neighbor.hmac[1], &found_neighbor.hmac[2], +// &found_neighbor.hmac[3], &found_neighbor.hmac[4], &found_neighbor.hmac[5]); +// +// printf("host_ip: %s, vpc_mac: %s, host_mac: %s\n", get<0>(row).c_str(), get<1>(row).c_str(), 
get<2>(row).c_str()); +// } +// return found_neighbor; +//} +inline auto make_storage_query () { + return make_storage(g_local_db_path, + make_table("neighbor", + make_column("vni", &Neighbor::vni), + make_column("vpc_ip", &Neighbor::vpc_ip), + make_column("host_ip", &Neighbor::host_ip), + make_column("vpc_mac", &Neighbor::vpc_mac), + make_column("host_mac", &Neighbor::host_mac), + make_column("version", &Neighbor::version), + primary_key(&Neighbor::vni, &Neighbor::vpc_ip) + ), + make_table("journal", + make_column("version", &ProgrammingState::version), + primary_key(&ProgrammingState::version) + ) + ); +}; + +using Storage = decltype(make_storage_query()); + +class db_client { + +public: + static db_client &get_instance() { + static db_client instance; + return instance; + }; + + Storage local_db = make_storage_query(); + using NeighborPrepareStatement = decltype(local_db.prepare(select(columns(&Neighbor::host_ip, &Neighbor::vpc_mac, &Neighbor::host_mac), + where(is_equal((&Neighbor::vni), 0) and is_equal((&Neighbor::vpc_ip), "127.0.0.1"))))); + NeighborPrepareStatement query_neighbor_statement = local_db.prepare(select(columns(&Neighbor::host_ip, &Neighbor::vpc_mac, &Neighbor::host_mac), + where(is_equal((&Neighbor::vni), 0) and is_equal((&Neighbor::vpc_ip), "127.0.0.1")))); + // Create local db writer single thread execution queue + dispatch_queue local_db_writer_queue = dispatch_queue("Local db background write queue", 1); + + int FindLKGVersion() { + int lkg_ver = 0; + + /* original sql is SELECT MIN(mo.version) + 1 FROM journal AS mo WHERE NOT EXISTS @@ -69,58 +175,51 @@ static int FindLKGVersion() { ); */ - using als_mo = alias_a; - using als_mi = alias_b; - auto ver_gaps = local_db.select(alias_column(&ProgrammingState::version), - from(), - where(not exists( - select(0 - c(alias_column(&ProgrammingState::version)), - from(), - where(is_equal(c(alias_column(&ProgrammingState::version)) + 1, alias_column(&ProgrammingState::version))) - )))); - - // lkg 
version: - // case 1 - if no ver gap, the query above will return the max version (since this version is already programmed, so return max + 1) - // case 2 - if there's ver gap, then always locate the min ver gap (as above, return minVerGap + 1) - // case 3 - if the table is empty like new launched instance, then always sync/watch from server with version 1 - // (since server syncs including the version agent provides, so sync/watch from version 1 means sync everything - if (ver_gaps.size() > 0) { - lkg_ver = *std::min_element(ver_gaps.begin(), ver_gaps.end()); - } - - return lkg_ver + 1; -} - -/* SELECT host_ip, vpc_mac, host_mac - * FROM neighbor - * WHERE vni=%{vni} AND vpc_ip=%{vpc_ip} - * */ -static auto query_neighbor_statement = - local_db.prepare(select(columns(&Neighbor::host_ip, &Neighbor::vpc_mac, &Neighbor::host_mac), - where(is_equal((&Neighbor::vni), 0) and is_equal((&Neighbor::vpc_ip), "127.0.0.1")))); - -static endpoint_t GetNeighbor(int vni, std::string vpc_ip) { - endpoint_t found_neighbor; - printf("GetNeighbor with VNI: [%d], vpc_ip: [%s]\n", vni, vpc_ip.c_str()); - get<0>(query_neighbor_statement) = vni; - get<1>(query_neighbor_statement) = vpc_ip.c_str(); - printf("Statement: %s\n", query_neighbor_statement.sql().c_str()); - auto rows = local_db.execute(query_neighbor_statement); - printf("Found %ld rows\n", rows.size()); - for (auto& row : rows) { - struct sockaddr_in ep_hip; - inet_pton(AF_INET, get<0>(row).c_str(), &(ep_hip.sin_addr)); - found_neighbor.hip = ep_hip.sin_addr.s_addr; - - std::sscanf(get<1>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", - &found_neighbor.mac[0], &found_neighbor.mac[1], &found_neighbor.mac[2], - &found_neighbor.mac[3], &found_neighbor.mac[4], &found_neighbor.mac[5]); - - std::sscanf(get<2>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", - &found_neighbor.hmac[0], &found_neighbor.hmac[1], &found_neighbor.hmac[2], - &found_neighbor.hmac[3], &found_neighbor.hmac[4], &found_neighbor.hmac[5]); - - printf("host_ip: 
%s, vpc_mac: %s, host_mac: %s\n", get<0>(row).c_str(), get<1>(row).c_str(), get<2>(row).c_str()); + using als_mo = alias_a; + using als_mi = alias_b; + auto ver_gaps = local_db.select(alias_column(&ProgrammingState::version), + from(), + where(not exists( + select(0 - c(alias_column(&ProgrammingState::version)), + from(), + where(is_equal(c(alias_column(&ProgrammingState::version)) + 1, alias_column(&ProgrammingState::version))) + )))); + + // lkg version: + // case 1 - if no ver gap, the query above will return the max version (since this version is already programmed, so return max + 1) + // case 2 - if there's ver gap, then always locate the min ver gap (as above, return minVerGap + 1) + // case 3 - if the table is empty like new launched instance, then always sync/watch from server with version 1 + // (since server syncs including the version agent provides, so sync/watch from version 1 means sync everything + if (ver_gaps.size() > 0) { + lkg_ver = *std::min_element(ver_gaps.begin(), ver_gaps.end()); + } + + return lkg_ver + 1; + }; + + endpoint_t GetNeighbor(int vni, std::string vpc_ip) { + endpoint_t found_neighbor; + printf("GetNeighbor with VNI: [%d], vpc_ip: [%s]\n", vni, vpc_ip.c_str()); + get<0>(query_neighbor_statement) = vni; + get<1>(query_neighbor_statement) = vpc_ip.c_str(); + printf("Statement: %s\n", query_neighbor_statement.sql().c_str()); + auto rows = local_db.execute(query_neighbor_statement); + printf("Found %ld rows\n", rows.size()); + for (auto& row : rows) { + struct sockaddr_in ep_hip; + inet_pton(AF_INET, get<0>(row).c_str(), &(ep_hip.sin_addr)); + found_neighbor.hip = ep_hip.sin_addr.s_addr; + + std::sscanf(get<1>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", + &found_neighbor.mac[0], &found_neighbor.mac[1], &found_neighbor.mac[2], + &found_neighbor.mac[3], &found_neighbor.mac[4], &found_neighbor.mac[5]); + + std::sscanf(get<2>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", + &found_neighbor.hmac[0], &found_neighbor.hmac[1], 
&found_neighbor.hmac[2], + &found_neighbor.hmac[3], &found_neighbor.hmac[4], &found_neighbor.hmac[5]); + + printf("host_ip: %s, vpc_mac: %s, host_mac: %s\n", get<0>(row).c_str(), get<1>(row).c_str(), get<2>(row).c_str()); + } + return found_neighbor; } - return found_neighbor; -} \ No newline at end of file +}; \ No newline at end of file diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp index 35206a5..c4b562d 100644 --- a/src/comm/af_xdp_user.cpp +++ b/src/comm/af_xdp_user.cpp @@ -11,7 +11,7 @@ //#include #include #include -//#include +#include //#include #include "af_xdp_user.h" #include @@ -505,7 +505,7 @@ static bool process_packet(struct xsk_socket_info *xsk, ); endpoint_key_t epkey; endpoint_t ep_value; -// ep_value = GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(arp_src_ip)); + ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(arp_src_ip)); if (ep_value.hip > 0) { epkey.vni = trn_get_vni(vxlan->vni); struct sockaddr_in ep_ip; diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index c309d8f..2e0fc34 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -37,7 +37,7 @@ #include "arionmaster.grpc.pb.h" #include "db_client.h" #include "grpc_client.h" -#include "xdp/trn_datamodel.h" +//#include "xdp/trn_datamodel.h" using namespace arion::schema; @@ -47,8 +47,8 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, arion::schema::NeighborRule reply; // prepared statements for better performance of db writing in completion queue - auto add_or_update_neighbor_db_stmt = local_db.prepare(replace(Neighbor{ 0, "", "", "", "", 0 })); - auto add_programmed_version_db_stmt = local_db.prepare(insert(ProgrammingState{ 0 })); + auto add_or_update_neighbor_db_stmt = db_client::get_instance().local_db.prepare(replace(Neighbor{ 0, "", "", "", "", 0 })); + auto add_programmed_version_db_stmt = db_client::get_instance().local_db.prepare(insert(ProgrammingState{ 0 })); // 
check current grpc channel state, try to connect if needed grpc_connectivity_state current_state = chan_->GetState(true); @@ -166,23 +166,23 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, int ebpf_rc = 0;//bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); printf("Inserted this neighbor into map: vip: %s, vni: %s\n", vpc_ip.c_str(), vni); // step #3 - async call to write/update to local db table 1 - local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { + db_client::get_instance().local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { get<0>(add_or_update_neighbor_db_stmt) = { vni, vpc_ip, host_ip, vpc_mac, host_mac, ver }; - local_db.execute(add_or_update_neighbor_db_stmt); + db_client::get_instance().local_db.execute(add_or_update_neighbor_db_stmt); }); // step #4 (case 1) - when ebpf programming not ignored, write to table 2 (programming journal) when programming succeeded if (0 == ebpf_rc) { - local_db_writer_queue.dispatch([ver, &add_programmed_version_db_stmt] { + db_client::get_instance().local_db_writer_queue.dispatch([ver, &add_programmed_version_db_stmt] { get<0>(add_programmed_version_db_stmt) = { ver }; - local_db.execute(add_programmed_version_db_stmt); + db_client::get_instance().local_db.execute(add_programmed_version_db_stmt); }); } } else { // step #4 (case 2) - always write to local db table 2 (programming journal) when version intended ignored (no need to program older version) - local_db_writer_queue.dispatch([ver, &add_programmed_version_db_stmt] { + db_client::get_instance().local_db_writer_queue.dispatch([ver, &add_programmed_version_db_stmt] { get<0>(add_programmed_version_db_stmt) = { ver }; - local_db.execute(add_programmed_version_db_stmt); + db_client::get_instance().local_db.execute(add_programmed_version_db_stmt); }); } } else { @@ -231,10 +231,10 @@ void 
ArionMasterWatcherImpl::RunClient(std::string ip, std::string port, std::st } // Create (if db not exists) or connect (if db exists already) to local db - local_db.sync_schema(); + db_client::get_instance().local_db.sync_schema(); // Find lkg version to reconcile/sync from server - int rev_lkg = FindLKGVersion(); + int rev_lkg = db_client::get_instance().FindLKGVersion(); printf("Found last known good version: %d from local db to sync from server\n", rev_lkg); this->ConnectToArionMaster(); diff --git a/src/main.cpp b/src/main.cpp index 14026b4..deee426 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -26,6 +26,7 @@ #include "marl/scheduler.h" #include "marl/waitgroup.h" #include "af_xdp_user.h" +//#include "grpc_client.h" using namespace std; using std::string; From 910465642a477681d3afc53a64752d4409a3fb6e Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Thu, 10 Nov 2022 15:07:31 -0800 Subject: [PATCH 09/33] Check in updates, right now program crashes near the db insert in grpc client --- include/af_xdp_user.h | 2 +- src/comm/af_xdp_user.cpp | 17 +++++++------ src/comm/grpc_client.cpp | 55 +++++++++++++++++++++++++--------------- src/main.cpp | 52 ++++++++++++++++++------------------- 4 files changed, 71 insertions(+), 55 deletions(-) diff --git a/include/af_xdp_user.h b/include/af_xdp_user.h index 7d14621..5c94266 100644 --- a/include/af_xdp_user.h +++ b/include/af_xdp_user.h @@ -25,7 +25,7 @@ class af_xdp_user { af_xdp_user() { printf("%s", "Start of af_xdp userspace program."); } - void run_af_xdp(std::string table_name_neighbor_ebpf_map); + void run_af_xdp(/*std::string table_name_neighbor_ebpf_map*/); private: }; diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp index c4b562d..eaed17b 100644 --- a/src/comm/af_xdp_user.cpp +++ b/src/comm/af_xdp_user.cpp @@ -513,7 +513,7 @@ static bool process_packet(struct xsk_socket_info *xsk, epkey.ip = ep_ip.sin_addr.s_addr; // we now have key and value, can modify the packet and update the map now. 
int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); - printf("Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", + printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", inet_ntoa(arp_src_ip), trn_get_vni(vxlan->vni), ebpf_rc); // // TODO: step #3 - async call to write/update to local db table 1 // local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { @@ -972,16 +972,17 @@ struct bpf_object *load_bpf_and_xdp_attach(struct config *cfg) return bpf_obj; } -void af_xdp_user::run_af_xdp(std::string table_name_neighbor_ebpf_map) +void af_xdp_user::run_af_xdp() { printf("%s", "af_xdp started\n"); + std::string table_name_neighbor_ebpf_map = "/sys/fs/bpf/endpoints_map"; int fd_neighbor_ebpf_map = bpf_obj_get(table_name_neighbor_ebpf_map.c_str()); - if (fd_neighbor_ebpf_map < 0) { - printf("Failed to get xdp neighbor endpoint map fd, exiting\n"); - return; - } else { - printf("Got xdp neighbor endpoint map fd %d\n", fd_neighbor_ebpf_map); - } +// if (fd_neighbor_ebpf_map < 0) { +// printf("Failed to get xdp neighbor endpoint map fd, exiting\n"); +// return; +// } else { +// printf("Got xdp neighbor endpoint map fd %d\n", fd_neighbor_ebpf_map); +// } int ret; int xsks_map_fd; diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 2e0fc34..56a8cd9 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -74,6 +74,7 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, std::atomic i(tag_watch + 1); bool write_done = false; while (cq->Next(&got_tag, &ok)) { + printf("Read one from grpc stream\n"); if (ok) { if (!write_done) { printf("Completion queue: initial task response received\n"); @@ -98,12 +99,14 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, &add_or_update_neighbor_db_stmt, &add_programmed_version_db_stmt] { // step #1 - check and store as in concurrent hash map 
std::string neighbor_key = std::to_string(vni) + "-" + vpc_ip; - + printf("vpc_ip is NOT empty: [%s]\n", vpc_ip.c_str()); bool ebpf_ignored = false; bool map_updated = false; int update_ct = 0, max_update_ct = 5; while (!map_updated && (update_ct < max_update_ct)) { + printf("Inside while loop, map_updated = [%b], update_ct = [%ld], max_update_ct = [%ld]\n", + map_updated, update_ct, max_update_ct); auto neighbor_pos = neighbor_task_map.find(neighbor_key); if (neighbor_pos == neighbor_task_map.end()) { // key not found, try insert. The function returns successful only when key not exists when inserting @@ -112,8 +115,10 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, if (res_insert.second) { // means successfully inserted, done with update map_updated = true; + printf("Found neighbor key in neighbor_task_map\n"); } // 'else' means another thread already inserted before me, then it's not an insert case and next time in the loop will go to case of update } else { + printf("Didn't find neighbor key in neighbor_task_map\n"); // key found, means multi neighbor versions might update at the same time int cur_ver = neighbor_pos->second; @@ -141,44 +146,50 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, if (map_updated) { if (!ebpf_ignored) { + printf("ebpf_ignored = false\n"); // step #2 - sync syscall ebpf map programming with return code endpoint_key_t epkey; epkey.vni = vni; struct sockaddr_in ep_ip; inet_pton(AF_INET, vpc_ip.c_str(), &(ep_ip.sin_addr)); epkey.ip = ep_ip.sin_addr.s_addr; - + printf("Filled in ep.ip\n"); endpoint_t ep; struct sockaddr_in ep_hip; inet_pton(AF_INET, host_ip.c_str(), &(ep_hip.sin_addr)); ep.hip = ep_hip.sin_addr.s_addr; + printf("Filled in ep.hip\n"); std::sscanf(vpc_mac.c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", &ep.mac[0], &ep.mac[1], &ep.mac[2], &ep.mac[3], &ep.mac[4], &ep.mac[5]); + printf("Filled in ep.mac\n"); std::sscanf(host_mac.c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", 
&ep.hmac[0], &ep.hmac[1], &ep.hmac[2], &ep.hmac[3], &ep.hmac[4], &ep.hmac[5]); + printf("Filled in ep.hmac\n"); //disabling the element udpate, so that all packets will be sent to user space program. int ebpf_rc = 0;//bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); - printf("Inserted this neighbor into map: vip: %s, vni: %s\n", vpc_ip.c_str(), vni); + printf("GPPC: Inserted this neighbor into map: vip: %s, vni: %s\n", vpc_ip.c_str(), vni); // step #3 - async call to write/update to local db table 1 - db_client::get_instance().local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { - get<0>(add_or_update_neighbor_db_stmt) = { vni, vpc_ip, host_ip, vpc_mac, host_mac, ver }; - db_client::get_instance().local_db.execute(add_or_update_neighbor_db_stmt); - }); - +// db_client::get_instance().local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { +// get<0>(add_or_update_neighbor_db_stmt) = { vni, vpc_ip, host_ip, vpc_mac, host_mac, ver }; +// db_client::get_instance().local_db.execute(add_or_update_neighbor_db_stmt); +// }); + printf("Dispatched local db neighbor insert\n"); // step #4 (case 1) - when ebpf programming not ignored, write to table 2 (programming journal) when programming succeeded - if (0 == ebpf_rc) { - db_client::get_instance().local_db_writer_queue.dispatch([ver, &add_programmed_version_db_stmt] { - get<0>(add_programmed_version_db_stmt) = { ver }; - db_client::get_instance().local_db.execute(add_programmed_version_db_stmt); - }); - } +// if (0 == ebpf_rc) { +// db_client::get_instance().local_db_writer_queue.dispatch([ver, &add_programmed_version_db_stmt] { +// get<0>(add_programmed_version_db_stmt) = { ver }; +// db_client::get_instance().local_db.execute(add_programmed_version_db_stmt); +// }); +// } + printf("Dispatched local db journal insert\n"); } else { + printf("ebpf_ignored = true\n"); // step #4 (case 2) - always write to local db 
table 2 (programming journal) when version intended ignored (no need to program older version) db_client::get_instance().local_db_writer_queue.dispatch([ver, &add_programmed_version_db_stmt] { get<0>(add_programmed_version_db_stmt) = { ver }; @@ -191,8 +202,12 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, i++; }); + } else { + printf("vpc_ip is empty\n"); } } + } else { + printf("NOT okay\n"); } } } @@ -223,12 +238,12 @@ void ArionMasterWatcherImpl::RunClient(std::string ip, std::string port, std::st // Retrieve neighbor's ebpf map fd (handle) fd_neighbor_ebpf_map = bpf_obj_get(table_name_neighbor_ebpf_map.c_str()); - if (fd_neighbor_ebpf_map < 0) { - printf("Failed to get xdp neighbor endpoint map fd, exiting\n"); - return; - } else { - printf("Got xdp neighbor endpoint map fd %d\n", fd_neighbor_ebpf_map); - } +// if (fd_neighbor_ebpf_map < 0) { +// printf("Failed to get xdp neighbor endpoint map fd, exiting\n"); +// return; +// } else { +// printf("Got xdp neighbor endpoint map fd %d\n", fd_neighbor_ebpf_map); +// } // Create (if db not exists) or connect (if db exists already) to local db db_client::get_instance().local_db.sync_schema(); diff --git a/src/main.cpp b/src/main.cpp index deee426..59e1cbd 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -26,7 +26,7 @@ #include "marl/scheduler.h" #include "marl/waitgroup.h" #include "af_xdp_user.h" -//#include "grpc_client.h" +#include "grpc_client.h" using namespace std; using std::string; @@ -37,7 +37,7 @@ static char EMPTY_STRING[] = ""; // Global variables std::thread *g_grpc_client_thread = NULL; -//ArionMasterWatcherImpl *g_grpc_client = NULL; +ArionMasterWatcherImpl *g_grpc_client = NULL; string g_arion_master_address = EMPTY_STRING; string g_arion_master_port = "9090"; @@ -63,24 +63,24 @@ static void cleanup() { printf("%s", "Program exiting, cleaning up...\n"); // optional: delete all global objects allocated by libprotobuf. 
-// google::protobuf::ShutdownProtobufLibrary(); + google::protobuf::ShutdownProtobufLibrary(); // stop the grpc client -// if (g_grpc_client != NULL) { -// delete g_grpc_client; -// g_grpc_client = NULL; -// printf("%s", "Cleaned up grpc client.\n"); -// } else { -// printf("%s", "Unable to delete grpc client pointer since it is null.\n"); -// } - - if (g_grpc_client_thread != NULL) { - delete g_grpc_client_thread; - g_grpc_client_thread = NULL; - printf("%s", "Cleaned up grpc client thread.\n"); + if (g_grpc_client != NULL) { + delete g_grpc_client; + g_grpc_client = NULL; + printf("%s", "Cleaned up grpc client.\n"); } else { - printf("%s", "Unable to call delete grpc client thread pointer since it is null.\n"); + printf("%s", "Unable to delete grpc client pointer since it is null.\n"); } + +// if (g_grpc_client_thread != NULL) { +// delete g_grpc_client_thread; +// g_grpc_client_thread = NULL; +// printf("%s", "Cleaned up grpc client thread.\n"); +// } else { +// printf("%s", "Unable to call delete grpc client thread pointer since it is null.\n"); +// } } // function to handle ctrl-c and kill process @@ -99,8 +99,8 @@ int main(int argc, char *argv[]) { printf("%s", "Arion Agent started...\n"); // Register input key signal handlers -// signal(SIGINT, signal_handler); -// signal(SIGTERM, signal_handler); + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); while ((option = getopt(argc, argv, "a:p:g:d")) != -1) { switch (option) { @@ -138,17 +138,17 @@ int main(int argc, char *argv[]) { defer(task_scheduler.unbind()); // Create a separate thread to run the grpc client of List & Watch Arion Master (first sync from a known revision, and then watch for future updates) -// g_grpc_client = new ArionMasterWatcherImpl(); -// marl::schedule([=] { -// g_grpc_client->RunClient(g_arion_master_address, -// g_arion_master_port, -// g_arion_group, -// g_arion_neighbor_table); -// }); + g_grpc_client = new ArionMasterWatcherImpl(); + marl::schedule([=] { + 
g_grpc_client->RunClient(g_arion_master_address, + g_arion_master_port, + g_arion_group, + g_arion_neighbor_table); + }); marl::schedule([=] { auto af = af_xdp_user(); - af.run_af_xdp(g_arion_neighbor_table); + af.run_af_xdp(/*g_arion_neighbor_table*/); }); pause(); cleanup(); From acde1bc39667a529cd5148a836cc13c055d0bb48 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Mon, 14 Nov 2022 13:33:44 -0800 Subject: [PATCH 10/33] This commit enables the happy path for pinging; ARP request/reply is successful --- src/comm/af_xdp_user.cpp | 5 +++-- src/comm/grpc_client.cpp | 22 +++++++++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp index eaed17b..a250076 100644 --- a/src/comm/af_xdp_user.cpp +++ b/src/comm/af_xdp_user.cpp @@ -505,11 +505,12 @@ static bool process_packet(struct xsk_socket_info *xsk, ); endpoint_key_t epkey; endpoint_t ep_value; - ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(arp_src_ip)); + ep_value.hip = 0; + ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(arp_dest_ip)); if (ep_value.hip > 0) { epkey.vni = trn_get_vni(vxlan->vni); struct sockaddr_in ep_ip; - inet_pton(AF_INET, inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); + inet_pton(AF_INET, inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); epkey.ip = ep_ip.sin_addr.s_addr; // we now have key and value, can modify the packet and update the map now. int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 56a8cd9..89ba399 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -173,20 +173,20 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, //disabling the element udpate, so that all packets will be sent to user space program. 
int ebpf_rc = 0;//bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); - printf("GPPC: Inserted this neighbor into map: vip: %s, vni: %s\n", vpc_ip.c_str(), vni); + printf("GPPC: Inserted this neighbor into map: vip: %s, vni: %d\n", vpc_ip.c_str(), vni); // step #3 - async call to write/update to local db table 1 -// db_client::get_instance().local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { -// get<0>(add_or_update_neighbor_db_stmt) = { vni, vpc_ip, host_ip, vpc_mac, host_mac, ver }; -// db_client::get_instance().local_db.execute(add_or_update_neighbor_db_stmt); -// }); + db_client::get_instance().local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { + get<0>(add_or_update_neighbor_db_stmt) = { vni, vpc_ip, host_ip, vpc_mac, host_mac, ver }; + db_client::get_instance().local_db.execute(add_or_update_neighbor_db_stmt); + }); printf("Dispatched local db neighbor insert\n"); // step #4 (case 1) - when ebpf programming not ignored, write to table 2 (programming journal) when programming succeeded -// if (0 == ebpf_rc) { -// db_client::get_instance().local_db_writer_queue.dispatch([ver, &add_programmed_version_db_stmt] { -// get<0>(add_programmed_version_db_stmt) = { ver }; -// db_client::get_instance().local_db.execute(add_programmed_version_db_stmt); -// }); -// } + if (0 == ebpf_rc) { + db_client::get_instance().local_db_writer_queue.dispatch([ver, &add_programmed_version_db_stmt] { + get<0>(add_programmed_version_db_stmt) = { ver }; + db_client::get_instance().local_db.execute(add_programmed_version_db_stmt); + }); + } printf("Dispatched local db journal insert\n"); } else { printf("ebpf_ignored = true\n"); From e336e905fea406a12f5589ef08a627f0e2727a30 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 18 Nov 2022 13:15:16 -0800 Subject: [PATCH 11/33] AF_XDP working for ARP and IP --- include/db_client.h | 1 + src/comm/af_xdp_user.cpp | 339 
++++++++++++++++++++++++--------------- 2 files changed, 215 insertions(+), 125 deletions(-) diff --git a/include/db_client.h b/include/db_client.h index 551e0ae..c57d80c 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -199,6 +199,7 @@ class db_client { endpoint_t GetNeighbor(int vni, std::string vpc_ip) { endpoint_t found_neighbor; + found_neighbor.hip = 0; printf("GetNeighbor with VNI: [%d], vpc_ip: [%s]\n", vni, vpc_ip.c_str()); get<0>(query_neighbor_statement) = vni; get<1>(query_neighbor_statement) = vpc_ip.c_str(); diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp index a250076..92c84f5 100644 --- a/src/comm/af_xdp_user.cpp +++ b/src/comm/af_xdp_user.cpp @@ -482,151 +482,240 @@ static bool process_packet(struct xsk_socket_info *xsk, inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], inner_eth->h_proto); - // TODO: Add inner IP support, refer to trn_process_inner_ip - // parse inner IP header -// struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); -// struct in_addr inner_ip_src, inner_ip_dest; -// inner_ip_src.s_addr = inner_ip->saddr; -// inner_ip_dest.s_addr = inner_ip->daddr; - - // parse inner arp header - arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); - struct in_addr arp_src_ip; - arp_src_ip.s_addr = arp_msg->spa; - struct in_addr arp_dest_ip; - arp_dest_ip.s_addr = arp_msg->tpa; - printf("arp op: %d\n", - bpf_htons(arp_msg->op)); - printf("arp source ip: %s, \n", - inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) - ); - printf("arp dest ip: %s, \n", - inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) - ); - endpoint_key_t epkey; - endpoint_t ep_value; - ep_value.hip = 0; - ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(arp_dest_ip)); - if (ep_value.hip > 0) { - epkey.vni = trn_get_vni(vxlan->vni); - struct sockaddr_in ep_ip; - inet_pton(AF_INET, 
inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); - epkey.ip = ep_ip.sin_addr.s_addr; - // we now have key and value, can modify the packet and update the map now. - int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); - printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", - inet_ntoa(arp_src_ip), trn_get_vni(vxlan->vni), ebpf_rc); -// // TODO: step #3 - async call to write/update to local db table 1 -// local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { -// get<0>(add_or_update_neighbor_db_stmt) = { vni, vpc_ip, host_ip, vpc_mac, host_mac, ver }; -// local_db.execute(add_or_update_neighbor_db_stmt); -// }); - - /* Modify pkt for inner ARP response */ - arp_msg->op = bpf_htons(ARPOP_REPLY); - trn_set_mac(arp_msg->tha, arp_msg->sha); - trn_set_mac(arp_msg->sha, ep_value.mac); - - __u32 tmp_ip = arp_msg->spa;//*sip; - arp_msg->spa = arp_msg->tpa;//*tip; - arp_msg->tpa = tmp_ip; - - /* Modify inner EitherHdr, pretend it's from target */ - trn_set_dst_mac(inner_eth, inner_eth->h_source); - trn_set_src_mac(inner_eth, ep_value.mac); - - /* Keep overlay header, swap outer IP header */ - trn_set_src_dst_ip_csum(ip, ip->daddr, ip->saddr, (eth + len)); - trn_swap_src_dst_mac(pkt); - - /* - * Packet modification finished, read packet content again, in order to verify the mod - * */ - - struct ethhdr *eth = (struct ethhdr *) pkt; - - if (ntohs(eth->h_proto) != ETH_P_IP) { - printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); - return false; - } - printf("AFTER MOD: Packet length: %ld\n", len); - printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" - "eth size: %d\n", - eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], - eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], - 
bpf_ntohs(eth->h_proto), - sizeof(*eth)); - - // parse outer IP header - struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); - struct in_addr outer_ip_src; - outer_ip_src.s_addr = ip->saddr; - struct in_addr outer_ip_dest; - outer_ip_dest.s_addr = ip->daddr; - printf("AFTER MOD: Outer ip src: %s, ip dest: %s\n" - "AFTER MOD: Outer ip ihl: %d, version: %d\n", - inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), - ip->ihl, ip->version); - - // parse UDP header - struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); - printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", - udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); - - // parse VXLAN header - struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); - printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); - - // parse inner eth header - struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); - printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", - inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], - inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], - inner_eth->h_proto); - - // TODO: Add inner IP support, refer to trn_process_inner_ip - // parse inner IP header - // struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); - // struct in_addr inner_ip_src, inner_ip_dest; - // inner_ip_src.s_addr = inner_ip->saddr; - // inner_ip_dest.s_addr = inner_ip->daddr; - + if (ntohs(inner_eth->h_proto) == ETH_P_ARP) { // parse inner arp header arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); struct in_addr arp_src_ip; arp_src_ip.s_addr = arp_msg->spa; struct in_addr arp_dest_ip; arp_dest_ip.s_addr = arp_msg->tpa; - printf("AFTER MOD: arp op: %d\n", + printf("arp op: %d\n", 
bpf_htons(arp_msg->op)); - printf("AFTER MOD: arp source ip: %s, \n", + printf("arp source ip: %s, \n", inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) ); - printf("AFTER MOD: arp dest ip: %s, \n", + printf("arp dest ip: %s, \n", inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) ); - /* Here we sent the packet out of the receive port. Note that + endpoint_key_t epkey; + endpoint_t ep_value; + ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(arp_dest_ip)); + if (ep_value.hip > 0) { + epkey.vni = trn_get_vni(vxlan->vni); + struct sockaddr_in ep_ip; + inet_pton(AF_INET, inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); + epkey.ip = ep_ip.sin_addr.s_addr; + // we now have key and value, can modify the packet and update the map now. +// int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); + printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", + inet_ntoa(arp_src_ip), trn_get_vni(vxlan->vni), 0); + + /* Modify pkt for inner ARP response */ + arp_msg->op = bpf_htons(ARPOP_REPLY); + trn_set_mac(arp_msg->tha, arp_msg->sha); + trn_set_mac(arp_msg->sha, ep_value.mac); + + __u32 tmp_ip = arp_msg->spa;//*sip; + arp_msg->spa = arp_msg->tpa;//*tip; + arp_msg->tpa = tmp_ip; + + /* Modify inner EitherHdr, pretend it's from target */ + trn_set_dst_mac(inner_eth, inner_eth->h_source); + trn_set_src_mac(inner_eth, ep_value.mac); + + /* Keep overlay header, swap outer IP header */ + trn_set_src_dst_ip_csum(ip, ip->daddr, ip->saddr, (eth + len)); + trn_swap_src_dst_mac(pkt); + + /* + * Packet modification finished, read packet content again, in order to verify the mod + * */ + + struct ethhdr *eth = (struct ethhdr *) pkt; + + if (ntohs(eth->h_proto) != ETH_P_IP) { + printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); + return false; + } + printf("AFTER MOD: Packet length: %ld\n", len); + printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 
0x%x\n" + "eth size: %d\n", + eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], + eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], + bpf_ntohs(eth->h_proto), + sizeof(*eth)); + + // parse outer IP header + struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); + struct in_addr outer_ip_src; + outer_ip_src.s_addr = ip->saddr; + struct in_addr outer_ip_dest; + outer_ip_dest.s_addr = ip->daddr; + printf("AFTER MOD: Outer ip src: %s, ip dest: %s\n" + "AFTER MOD: Outer ip ihl: %d, version: %d\n", + inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), + ip->ihl, ip->version); + + // parse UDP header + struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); + printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", + udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); + + // parse VXLAN header + struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); + printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); + + // parse inner eth header + struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); + printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", + inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], + inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], + inner_eth->h_proto); + + // parse inner arp header + arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); + struct in_addr arp_src_ip; + arp_src_ip.s_addr = arp_msg->spa; + struct in_addr arp_dest_ip; + arp_dest_ip.s_addr = arp_msg->tpa; + printf("AFTER MOD: arp op: %d\n", + bpf_htons(arp_msg->op)); + printf("AFTER MOD: arp source ip: %s, \n", + inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) + ); + printf("AFTER MOD: arp 
dest ip: %s, \n", + inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) + ); + /* Here we sent the packet out of the receive port. Note that * we allocate one entry and schedule it. Your design would be * faster if you do batch processing/transmission */ - ret = xsk_ring_prod__reserve(&xsk->tx, 1, &tx_idx); - if (ret != 1) { - /* No more transmit slots, drop the packet */ + ret = xsk_ring_prod__reserve(&xsk->tx, 1, &tx_idx); + if (ret != 1) { + /* No more transmit slots, drop the packet */ + return false; + } + + xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = addr; + xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = len; + xsk_ring_prod__submit(&xsk->tx, 1); + xsk->outstanding_tx++; + + xsk->stats.tx_bytes += len; + xsk->stats.tx_packets++; + printf("Packet sent via tx queue\n"); + printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); + + return true; + } else { return false; } + }else if (ntohs(inner_eth->h_proto) == ETH_P_IP) { + // TODO: Add inner IP support, refer to trn_process_inner_ip + // parse inner IP header + struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); + struct in_addr inner_ip_src, inner_ip_dest; + inner_ip_src.s_addr = inner_ip->saddr; + inner_ip_dest.s_addr = inner_ip->daddr; + printf("Inner IP src: %s\n", inet_ntoa(inner_ip_src)); + printf("Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); + endpoint_key_t epkey; + endpoint_t ep_value; + ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(inner_ip_dest)); + if (ep_value.hip > 0) { + epkey.vni = trn_get_vni(vxlan->vni); + struct sockaddr_in ep_ip; + inet_pton(AF_INET, inet_ntoa(inner_ip_dest/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); + epkey.ip = ep_ip.sin_addr.s_addr; + // we now have key and value, can modify the packet and update the map now. 
+// int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); + printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", + inet_ntoa(inner_ip_dest), trn_get_vni(vxlan->vni), 0); + + /* Modify inner EitherHdr, pretend it's from target */ + trn_set_dst_mac(inner_eth, ep_value.mac); + + /* Keep overlay header, update outer header destinations */ + trn_set_src_dst_ip_csum(ip, ip->daddr, ep_value.hip, (eth + len)); + trn_set_src_mac(eth, eth->h_dest); + trn_set_dst_mac(eth, ep_value.hmac); + + /* + * Packet modification finished, read packet content again, in order to verify the mod + * */ - xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = addr; - xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = len; - xsk_ring_prod__submit(&xsk->tx, 1); - xsk->outstanding_tx++; + struct ethhdr *eth = (struct ethhdr *) pkt; + + if (ntohs(eth->h_proto) != ETH_P_IP) { + printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); + return false; + } + printf("AFTER MOD: Packet length: %ld\n", len); + printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" + "eth size: %d\n", + eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], + eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], + bpf_ntohs(eth->h_proto), + sizeof(*eth)); + + // parse outer IP header + struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); + struct in_addr outer_ip_src; + outer_ip_src.s_addr = ip->saddr; + struct in_addr outer_ip_dest; + outer_ip_dest.s_addr = ip->daddr; + printf("AFTER MOD: Outer ip src: %s, ip dest: %s\n" + "AFTER MOD: Outer ip ihl: %d, version: %d\n", + inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), + ip->ihl, ip->version); + + // parse UDP header + struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); + printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? 
%s\n", + udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); + + // parse VXLAN header + struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); + printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); + + // parse inner eth header + struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); + printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", + inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], + inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], + inner_eth->h_proto); + + // parse inner IP header + struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); + struct in_addr inner_ip_src, inner_ip_dest; + inner_ip_src.s_addr = inner_ip->saddr; + inner_ip_dest.s_addr = inner_ip->daddr; + printf("AFTER MOD: Inner IP src: %s\n", inet_ntoa(inner_ip_src)); + printf("AFTER MOD: Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); + /* Here we sent the packet out of the receive port. Note that + * we allocate one entry and schedule it. 
Your design would be + * faster if you do batch processing/transmission */ + + ret = xsk_ring_prod__reserve(&xsk->tx, 1, &tx_idx); + if (ret != 1) { + /* No more transmit slots, drop the packet */ + return false; + } - xsk->stats.tx_bytes += len; - xsk->stats.tx_packets++; - printf("Packet sent via tx queue\n"); - printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); + xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = addr; + xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = len; + xsk_ring_prod__submit(&xsk->tx, 1); + xsk->outstanding_tx++; - return true; + xsk->stats.tx_bytes += len; + xsk->stats.tx_packets++; + printf("Packet sent via tx queue\n"); + printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); + + return true; + } } + printf("Endpoing hip == 0, returning false.\n"); return false; } From 3e0fc7d97071813927d1e71d8cd4a5b31f2b1f0b Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Tue, 29 Nov 2022 09:53:12 -0800 Subject: [PATCH 12/33] Refactored db_client to make local_db accessible by other files; implemented endpoint cache layer for better lookup performance; refactored functions in af_xdp_user to utils --- include/db_client.h | 154 ++++++-------- include/util.h | 127 +++++++++++ src/comm/af_xdp_user.cpp | 448 +++++++++++++++------------------------ src/comm/grpc_client.cpp | 4 +- 4 files changed, 359 insertions(+), 374 deletions(-) diff --git a/include/db_client.h b/include/db_client.h index c57d80c..48634e1 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -15,8 +15,10 @@ #include #include #include +#include #include "dispatch_queue.h" #include "xdp/trn_datamodel.h" +#include "util.h" using namespace sqlite_orm; @@ -34,97 +36,21 @@ struct ProgrammingState { int version; }; // local db table 2 - neighbor ebpf programmed version +// copied from arp_hash in ACA +struct EndpointHash { + size_t operator()(const endpoint_key_t &e) const{ + return std::hash<__u32>()(e.vni) ^ (std::hash<__u32>()(e.ip) << 1); + } +}; + +struct EndpointEqual { 
+ bool operator() (const endpoint_key_t &e, const endpoint_key_t &f) const { + return (e.vni == f.vni) && (e.ip == f.ip); + } +}; + static std::string g_local_db_path = "/var/local/arion/arion_wing.db"; -// Schema definition (create DB if not exists) or retrieved handle (get DB if exists already) of local db -//static auto local_db = make_storage(g_local_db_path, -// make_table("neighbor", -// make_column("vni", &Neighbor::vni), -// make_column("vpc_ip", &Neighbor::vpc_ip), -// make_column("host_ip", &Neighbor::host_ip), -// make_column("vpc_mac", &Neighbor::vpc_mac), -// make_column("host_mac", &Neighbor::host_mac), -// make_column("version", &Neighbor::version), -// primary_key(&Neighbor::vni, &Neighbor::vpc_ip) -// ), -// make_table("journal", -// make_column("version", &ProgrammingState::version), -// primary_key(&ProgrammingState::version) -// ) -//); - -// Create local db writer single thread execution queue -//static dispatch_queue local_db_writer_queue("Local db background write queue", 1); -// -//static int FindLKGVersion() { -// int lkg_ver = 0; -// -// /* original sql is -// SELECT MIN(mo.version) + 1 -// FROM journal AS mo -// WHERE NOT EXISTS -// ( -// SELECT 0 - mi.version -// FROM journal AS mi -// WHERE mo.version + 1 = mi.version -// ); -// */ -// -// using als_mo = alias_a; -// using als_mi = alias_b; -// auto ver_gaps = local_db.select(alias_column(&ProgrammingState::version), -// from(), -// where(not exists( -// select(0 - c(alias_column(&ProgrammingState::version)), -// from(), -// where(is_equal(c(alias_column(&ProgrammingState::version)) + 1, alias_column(&ProgrammingState::version))) -// )))); -// -// // lkg version: -// // case 1 - if no ver gap, the query above will return the max version (since this version is already programmed, so return max + 1) -// // case 2 - if there's ver gap, then always locate the min ver gap (as above, return minVerGap + 1) -// // case 3 - if the table is empty like new launched instance, then always sync/watch 
from server with version 1 -// // (since server syncs including the version agent provides, so sync/watch from version 1 means sync everything -// if (ver_gaps.size() > 0) { -// lkg_ver = *std::min_element(ver_gaps.begin(), ver_gaps.end()); -// } -// -// return lkg_ver + 1; -//} - -/* SELECT host_ip, vpc_mac, host_mac - * FROM neighbor - * WHERE vni=%{vni} AND vpc_ip=%{vpc_ip} - * */ -//static auto query_neighbor_statement = -// local_db.prepare(select(columns(&Neighbor::host_ip, &Neighbor::vpc_mac, &Neighbor::host_mac), -// where(is_equal((&Neighbor::vni), 0) and is_equal((&Neighbor::vpc_ip), "127.0.0.1")))); - -//static endpoint_t GetNeighbor(int vni, std::string vpc_ip) { -// endpoint_t found_neighbor; -// printf("GetNeighbor with VNI: [%d], vpc_ip: [%s]\n", vni, vpc_ip.c_str()); -// get<0>(query_neighbor_statement) = vni; -// get<1>(query_neighbor_statement) = vpc_ip.c_str(); -// printf("Statement: %s\n", query_neighbor_statement.sql().c_str()); -// auto rows = local_db.execute(query_neighbor_statement); -// printf("Found %ld rows\n", rows.size()); -// for (auto& row : rows) { -// struct sockaddr_in ep_hip; -// inet_pton(AF_INET, get<0>(row).c_str(), &(ep_hip.sin_addr)); -// found_neighbor.hip = ep_hip.sin_addr.s_addr; -// -// std::sscanf(get<1>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", -// &found_neighbor.mac[0], &found_neighbor.mac[1], &found_neighbor.mac[2], -// &found_neighbor.mac[3], &found_neighbor.mac[4], &found_neighbor.mac[5]); -// -// std::sscanf(get<2>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", -// &found_neighbor.hmac[0], &found_neighbor.hmac[1], &found_neighbor.hmac[2], -// &found_neighbor.hmac[3], &found_neighbor.hmac[4], &found_neighbor.hmac[5]); -// -// printf("host_ip: %s, vpc_mac: %s, host_mac: %s\n", get<0>(row).c_str(), get<1>(row).c_str(), get<2>(row).c_str()); -// } -// return found_neighbor; -//} inline auto make_storage_query () { return make_storage(g_local_db_path, make_table("neighbor", @@ -161,6 +87,39 @@ class db_client { 
// Create local db writer single thread execution queue dispatch_queue local_db_writer_queue = dispatch_queue("Local db background write queue", 1); + folly::ConcurrentHashMap endpoint_cache; + + + void FillEndpointCacheFromDB() { + // Get all neighbors from SQLite Database + auto get_all_neighbors_statement = local_db.prepare( + select( + columns(&Neighbor::vni, &Neighbor::vpc_ip, &Neighbor::host_mac, &Neighbor::vpc_mac, &Neighbor::host_ip) + ) + ); + auto rows = local_db.execute(get_all_neighbors_statement); + printf("Retrieved %ld neighbors from local DB\n", rows.size()); + for (auto & row : rows) { + endpoint_key_t key; + key.vni = (get<0>(row)); + struct sockaddr_in ep_ip; + inet_pton(AF_INET, get<1>(row).c_str(), &(ep_ip.sin_addr)); + key.ip = ep_ip.sin_addr.s_addr; + endpoint_t value; + std::sscanf(get<3>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", + &value.mac[0], &value.mac[1], &value.mac[2], + &value.mac[3], &value.mac[4], &value.mac[5]); + + std::sscanf(get<2>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", + &value.hmac[0], &value.hmac[1], &value.hmac[2], + &value.hmac[3], &value.hmac[4], &value.hmac[5]); + struct sockaddr_in ep_hip; + inet_pton(AF_INET, get<4>(row).c_str(), &(ep_hip.sin_addr)); + value.hip = ep_hip.sin_addr.s_addr; + endpoint_cache.insert(key, value); + } + printf("Finished retrieving from local DB, not endpoint cache has %ld endpoints\n", endpoint_cache.size()); + } int FindLKGVersion() { int lkg_ver = 0; @@ -200,12 +159,12 @@ class db_client { endpoint_t GetNeighbor(int vni, std::string vpc_ip) { endpoint_t found_neighbor; found_neighbor.hip = 0; - printf("GetNeighbor with VNI: [%d], vpc_ip: [%s]\n", vni, vpc_ip.c_str()); +// printf("GetNeighbor with VNI: [%d], vpc_ip: [%s]\n", vni, vpc_ip.c_str()); get<0>(query_neighbor_statement) = vni; get<1>(query_neighbor_statement) = vpc_ip.c_str(); - printf("Statement: %s\n", query_neighbor_statement.sql().c_str()); +// printf("Statement: %s\n", query_neighbor_statement.sql().c_str()); auto 
rows = local_db.execute(query_neighbor_statement); - printf("Found %ld rows\n", rows.size()); +// printf("Found %ld rows\n", rows.size()); for (auto& row : rows) { struct sockaddr_in ep_hip; inet_pton(AF_INET, get<0>(row).c_str(), &(ep_hip.sin_addr)); @@ -219,8 +178,17 @@ class db_client { &found_neighbor.hmac[0], &found_neighbor.hmac[1], &found_neighbor.hmac[2], &found_neighbor.hmac[3], &found_neighbor.hmac[4], &found_neighbor.hmac[5]); - printf("host_ip: %s, vpc_mac: %s, host_mac: %s\n", get<0>(row).c_str(), get<1>(row).c_str(), get<2>(row).c_str()); +// printf("host_ip: %s, vpc_mac: %s, host_mac: %s\n", get<0>(row).c_str(), get<1>(row).c_str(), get<2>(row).c_str()); } return found_neighbor; } + + endpoint_t* GetNeighborInMemory(endpoint_key_t * key) { + auto iterator = endpoint_cache.find(*key); + if (iterator == endpoint_cache.end()) { + return nullptr; + } + auto endpoint_value = iterator->second;//endpoint_cache[*key]; + return std::move(&endpoint_value); + } }; \ No newline at end of file diff --git a/include/util.h b/include/util.h index f68053f..ebed5a5 100644 --- a/include/util.h +++ b/include/util.h @@ -17,6 +17,8 @@ #include #include +#include +using namespace std; // the number of characters needed to store the HEX form of IP address #define HEX_IP_BUFFER_SIZE 12 @@ -49,4 +51,129 @@ static inline std::uint8_t getNum(char hexChar) { return (hexChar - 'A' + 10); } + +static inline __sum16 csum16_add(__sum16 csum, __be16 addend) +{ + uint16_t res = (uint16_t)csum; + + res += (__u16)addend; + return (__sum16)(res + (res < (__u16)addend)); +} + +static inline __sum16 csum16_sub(__sum16 csum, __be16 addend) +{ + return csum16_add(csum, ~addend); +} + +static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 present) +{ + *sum = ~csum16_add(csum16_sub(~(*sum), old), present); +} + +static inline void trn_set_mac(void *dst, unsigned char *mac) +{ + unsigned short *d = static_cast(dst); + unsigned short *s = (unsigned short *)mac; + + d[0] = s[0]; + 
d[1] = s[1]; + d[2] = s[2]; +} + +static inline void trn_set_dst_mac(void *data, unsigned char *dst_mac) +{ + trn_set_mac(data, dst_mac); +} + +static inline void trn_set_src_mac(void *data, unsigned char *src_mac) +{ + uint8_t *tmp = static_cast(data); + trn_set_mac((void*)(tmp + 6), src_mac); +} + +static __be32 trn_get_vni(const __u8 *vni) +{ + /* Big endian! */ + return (vni[0] << 16) | (vni[1] << 8) | vni[2]; +} + +static inline void trn_set_src_ip(void *data, void *data_end, __u32 saddr) +{ + int off = offsetof(struct iphdr, saddr); + uint8_t *tmp = static_cast(data); + + __u32 *addr = (__u32*)(tmp + off); + if ((void *)addr > data_end) + return; + + *addr = saddr; +} + +static inline void trn_set_dst_ip(void *data, void *data_end, __u32 daddr) +{ + int off = offsetof(struct iphdr, daddr); + uint8_t *tmp = static_cast(data); + + __u32 *addr = (__u32 *)(tmp + off); + if ((void *)addr > data_end) + return; + + *addr = daddr; +} + +static inline __u16 trn_csum_fold_helper(__u64 csum) +{ + int i; +#pragma unroll + for (i = 0; i < 4; i++) { + if (csum >> 16) + csum = (csum & 0xffff) + (csum >> 16); + } + return ~csum; +} + +static inline void trn_ipv4_csum_inline(void *iph, __u64 *csum) +{ + __u16 *next_iph_u16 = (__u16 *)iph; +#pragma clang loop unroll(full) + for (int i = 0; i> 1; i++) { + *csum += *next_iph_u16++; + } + *csum = trn_csum_fold_helper(*csum); +} + +static inline void trn_set_src_dst_ip_csum(struct iphdr *ip, + __u32 saddr, __u32 daddr, void *data_end) +{ + /* Since the packet destination is being rewritten we also + decrement the TTL */ + ip->ttl--; + + __u64 csum = 0; + trn_set_src_ip(ip, data_end, saddr); + trn_set_dst_ip(ip, data_end, daddr); + csum = 0; + ip->check = 0; + trn_ipv4_csum_inline(ip, &csum); + ip->check = csum; + + // printf("Modified IP Address, src: 0x%x, dst: 0x%x, csum: 0x%x\n", + // ip->saddr, ip->daddr, ip->check); +} + +static inline void trn_swap_src_dst_mac(void *data) +{ + unsigned short *p = static_cast(data); + 
unsigned short tmp[3]; + + tmp[0] = p[0]; + tmp[1] = p[1]; + tmp[2] = p[2]; + p[0] = p[3]; + p[1] = p[4]; + p[2] = p[5]; + p[3] = tmp[0]; + p[4] = tmp[1]; + p[5] = tmp[2]; +} #endif diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp index 92c84f5..f797e92 100644 --- a/src/comm/af_xdp_user.cpp +++ b/src/comm/af_xdp_user.cpp @@ -26,6 +26,7 @@ #include #include #include "xdp/trn_datamodel.h" +#include "util.h" //#include "xdp/trn_kern.h" #define NUM_FRAMES 4096 @@ -300,135 +301,12 @@ static void complete_tx(struct xsk_socket_info *xsk) } } -static inline __sum16 csum16_add(__sum16 csum, __be16 addend) -{ - uint16_t res = (uint16_t)csum; - - res += (__u16)addend; - return (__sum16)(res + (res < (__u16)addend)); -} - -static inline __sum16 csum16_sub(__sum16 csum, __be16 addend) -{ - return csum16_add(csum, ~addend); -} - -static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 present) -{ - *sum = ~csum16_add(csum16_sub(~(*sum), old), present); -} - -static inline void trn_set_mac(void *dst, unsigned char *mac) -{ - unsigned short *d = static_cast(dst); - unsigned short *s = (unsigned short *)mac; - - d[0] = s[0]; - d[1] = s[1]; - d[2] = s[2]; -} -static inline void trn_set_dst_mac(void *data, unsigned char *dst_mac) -{ - trn_set_mac(data, dst_mac); -} - -static inline void trn_set_src_mac(void *data, unsigned char *src_mac) -{ - uint8_t *tmp = static_cast(data); - trn_set_mac((void*)(tmp + 6), src_mac); -} - -static __be32 trn_get_vni(const __u8 *vni) -{ - /* Big endian! 
*/ - return (vni[0] << 16) | (vni[1] << 8) | vni[2]; -} - -static inline void trn_set_src_ip(void *data, void *data_end, __u32 saddr) -{ - int off = offsetof(struct iphdr, saddr); - uint8_t *tmp = static_cast(data); - - __u32 *addr = (__u32*)(tmp + off); - if ((void *)addr > data_end) - return; - - *addr = saddr; -} - -static inline void trn_set_dst_ip(void *data, void *data_end, __u32 daddr) -{ - int off = offsetof(struct iphdr, daddr); - uint8_t *tmp = static_cast(data); - - __u32 *addr = (__u32 *)(tmp + off); - if ((void *)addr > data_end) - return; - - *addr = daddr; -} - -static inline __u16 trn_csum_fold_helper(__u64 csum) -{ - int i; -#pragma unroll - for (i = 0; i < 4; i++) { - if (csum >> 16) - csum = (csum & 0xffff) + (csum >> 16); - } - return ~csum; -} - -static inline void trn_ipv4_csum_inline(void *iph, __u64 *csum) -{ - __u16 *next_iph_u16 = (__u16 *)iph; -#pragma clang loop unroll(full) - for (int i = 0; i> 1; i++) { - *csum += *next_iph_u16++; - } - *csum = trn_csum_fold_helper(*csum); -} - -static inline void trn_set_src_dst_ip_csum(struct iphdr *ip, - __u32 saddr, __u32 daddr, void *data_end) -{ - /* Since the packet destination is being rewritten we also - decrement the TTL */ - ip->ttl--; - - __u64 csum = 0; - trn_set_src_ip(ip, data_end, saddr); - trn_set_dst_ip(ip, data_end, daddr); - csum = 0; - ip->check = 0; - trn_ipv4_csum_inline(ip, &csum); - ip->check = csum; - - printf("Modified IP Address, src: 0x%x, dst: 0x%x, csum: 0x%x\n", - ip->saddr, ip->daddr, ip->check); -} - -static inline void trn_swap_src_dst_mac(void *data) -{ - unsigned short *p = static_cast(data); - unsigned short tmp[3]; - - tmp[0] = p[0]; - tmp[1] = p[1]; - tmp[2] = p[2]; - p[0] = p[3]; - p[1] = p[4]; - p[2] = p[5]; - p[3] = tmp[0]; - p[4] = tmp[1]; - p[5] = tmp[2]; -} static bool process_packet(struct xsk_socket_info *xsk, uint64_t addr, uint32_t len, int* fd) { - printf(">>>>>>>>>> Begin processing packet >>>>>>>>>>\n"); +// printf(">>>>>>>>>> Begin processing packet 
>>>>>>>>>>\n"); uint8_t *pkt = static_cast(xsk_umem__get_data(xsk->umem->buffer, addr)); @@ -444,16 +322,16 @@ static bool process_packet(struct xsk_socket_info *xsk, struct ethhdr *eth = (struct ethhdr *) pkt; if (ntohs(eth->h_proto) != ETH_P_IP) { - printf("%s\n", "returning false for this packet as it is NOT IP"); +// printf("%s\n", "returning false for this packet as it is NOT IP"); return false; } - printf("Packet length: %ld\n", len); - printf("Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" - "eth size: %d\n", - eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], - eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], - bpf_ntohs(eth->h_proto), - sizeof(*eth)); +// printf("Packet length: %ld\n", len); +// printf("Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" +// "eth size: %d\n", +// eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], +// eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], +// bpf_ntohs(eth->h_proto), +// sizeof(*eth)); // parse outer IP header struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); @@ -461,26 +339,26 @@ static bool process_packet(struct xsk_socket_info *xsk, outer_ip_src.s_addr = ip->saddr; struct in_addr outer_ip_dest; outer_ip_dest.s_addr = ip->daddr; - printf("Outer ip src: %s, ip dest: %s\n" - "Outer ip ihl: %d, version: %d\n", - inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), - ip->ihl, ip->version); +// printf("Outer ip src: %s, ip dest: %s\n" +// "Outer ip ihl: %d, version: %d\n", +// inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), +// ip->ihl, ip->version); // parse UDP header struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); - printf("UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", - udp->dest, udp->source, (udp->dest==VXL_DSTPORT? 
"true" : "false")); +// printf("UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", +// udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); // parse VXLAN header struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); - printf("VNI: %ld, \n",trn_get_vni(vxlan->vni)); +// printf("VNI: %ld, \n",trn_get_vni(vxlan->vni)); // parse inner eth header struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); - printf("inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", - inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], - inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], - inner_eth->h_proto); +// printf("inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", +// inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], +// inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], +// inner_eth->h_proto); if (ntohs(inner_eth->h_proto) == ETH_P_ARP) { // parse inner arp header @@ -489,31 +367,32 @@ static bool process_packet(struct xsk_socket_info *xsk, arp_src_ip.s_addr = arp_msg->spa; struct in_addr arp_dest_ip; arp_dest_ip.s_addr = arp_msg->tpa; - printf("arp op: %d\n", - bpf_htons(arp_msg->op)); - printf("arp source ip: %s, \n", - inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) - ); - printf("arp dest ip: %s, \n", - inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) - ); +// printf("arp op: %d\n", +// bpf_htons(arp_msg->op)); +// printf("arp source ip: %s, \n", +// inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) +// ); +// printf("arp dest ip: %s, \n", +// inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) +// ); endpoint_key_t epkey; - endpoint_t ep_value; - ep_value 
= db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(arp_dest_ip)); - if (ep_value.hip > 0) { - epkey.vni = trn_get_vni(vxlan->vni); - struct sockaddr_in ep_ip; - inet_pton(AF_INET, inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); - epkey.ip = ep_ip.sin_addr.s_addr; + epkey.vni = trn_get_vni(vxlan->vni); + struct sockaddr_in ep_ip; + inet_pton(AF_INET, inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); + epkey.ip = ep_ip.sin_addr.s_addr; + auto ep_value = db_client::get_instance().GetNeighborInMemory(&epkey); +// endpoint_t ep_value; +// ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(arp_dest_ip)); + if (ep_value != nullptr) { // we now have key and value, can modify the packet and update the map now. // int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); - printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", - inet_ntoa(arp_src_ip), trn_get_vni(vxlan->vni), 0); +// printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", +// inet_ntoa(arp_src_ip), trn_get_vni(vxlan->vni), 0); /* Modify pkt for inner ARP response */ arp_msg->op = bpf_htons(ARPOP_REPLY); trn_set_mac(arp_msg->tha, arp_msg->sha); - trn_set_mac(arp_msg->sha, ep_value.mac); + trn_set_mac(arp_msg->sha, ep_value->mac); __u32 tmp_ip = arp_msg->spa;//*sip; arp_msg->spa = arp_msg->tpa;//*tip; @@ -521,7 +400,7 @@ static bool process_packet(struct xsk_socket_info *xsk, /* Modify inner EitherHdr, pretend it's from target */ trn_set_dst_mac(inner_eth, inner_eth->h_source); - trn_set_src_mac(inner_eth, ep_value.mac); + trn_set_src_mac(inner_eth, ep_value->mac); /* Keep overlay header, swap outer IP header */ trn_set_src_dst_ip_csum(ip, ip->daddr, ip->saddr, (eth + len)); @@ -534,58 +413,58 @@ static bool process_packet(struct xsk_socket_info *xsk, struct ethhdr *eth = (struct ethhdr *) pkt; if (ntohs(eth->h_proto) != ETH_P_IP) { - printf("%s\n", "AFTER 
MOD: returning false for this packet as it is NOT IP"); +// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); return false; } - printf("AFTER MOD: Packet length: %ld\n", len); - printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" - "eth size: %d\n", - eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], - eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], - bpf_ntohs(eth->h_proto), - sizeof(*eth)); - - // parse outer IP header - struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); - struct in_addr outer_ip_src; - outer_ip_src.s_addr = ip->saddr; - struct in_addr outer_ip_dest; - outer_ip_dest.s_addr = ip->daddr; - printf("AFTER MOD: Outer ip src: %s, ip dest: %s\n" - "AFTER MOD: Outer ip ihl: %d, version: %d\n", - inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), - ip->ihl, ip->version); - - // parse UDP header - struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); - printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", - udp->dest, udp->source, (udp->dest==VXL_DSTPORT? 
"true" : "false")); - - // parse VXLAN header - struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); - printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); - - // parse inner eth header - struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); - printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", - inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], - inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], - inner_eth->h_proto); - - // parse inner arp header - arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); - struct in_addr arp_src_ip; - arp_src_ip.s_addr = arp_msg->spa; - struct in_addr arp_dest_ip; - arp_dest_ip.s_addr = arp_msg->tpa; - printf("AFTER MOD: arp op: %d\n", - bpf_htons(arp_msg->op)); - printf("AFTER MOD: arp source ip: %s, \n", - inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) - ); - printf("AFTER MOD: arp dest ip: %s, \n", - inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) - ); +// printf("AFTER MOD: Packet length: %ld\n", len); +// printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" +// "eth size: %d\n", +// eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], +// eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], +// bpf_ntohs(eth->h_proto), +// sizeof(*eth)); +// +// // parse outer IP header +// struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); +// struct in_addr outer_ip_src; +// outer_ip_src.s_addr = ip->saddr; +// struct in_addr outer_ip_dest; +// outer_ip_dest.s_addr = ip->daddr; +// printf("AFTER MOD: Outer ip src: %s, ip dest: %s\n" +// "AFTER MOD: Outer ip ihl: %d, version: %d\n", +// 
inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), +// ip->ihl, ip->version); +// +// // parse UDP header +// struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); +// printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", +// udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); +// +// // parse VXLAN header +// struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); +// printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); +// +// // parse inner eth header +// struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); +// printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", +// inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], +// inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], +// inner_eth->h_proto); +// +// // parse inner arp header +// arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); +// struct in_addr arp_src_ip; +// arp_src_ip.s_addr = arp_msg->spa; +// struct in_addr arp_dest_ip; +// arp_dest_ip.s_addr = arp_msg->tpa; +// printf("AFTER MOD: arp op: %d\n", +// bpf_htons(arp_msg->op)); +// printf("AFTER MOD: arp source ip: %s, \n", +// inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) +// ); +// printf("AFTER MOD: arp dest ip: %s, \n", +// inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) +// ); /* Here we sent the packet out of the receive port. Note that * we allocate one entry and schedule it. 
Your design would be * faster if you do batch processing/transmission */ @@ -603,11 +482,12 @@ static bool process_packet(struct xsk_socket_info *xsk, xsk->stats.tx_bytes += len; xsk->stats.tx_packets++; - printf("Packet sent via tx queue\n"); - printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); +// printf("Packet sent via tx queue\n"); +// printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); return true; } else { + printf("Can't find endpoint!\n"); return false; } }else if (ntohs(inner_eth->h_proto) == ETH_P_IP) { @@ -617,28 +497,33 @@ static bool process_packet(struct xsk_socket_info *xsk, struct in_addr inner_ip_src, inner_ip_dest; inner_ip_src.s_addr = inner_ip->saddr; inner_ip_dest.s_addr = inner_ip->daddr; - printf("Inner IP src: %s\n", inet_ntoa(inner_ip_src)); - printf("Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); +// printf("Inner IP src: %s\n", inet_ntoa(inner_ip_src)); +// printf("Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); endpoint_key_t epkey; - endpoint_t ep_value; - ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(inner_ip_dest)); - if (ep_value.hip > 0) { - epkey.vni = trn_get_vni(vxlan->vni); - struct sockaddr_in ep_ip; - inet_pton(AF_INET, inet_ntoa(inner_ip_dest/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); - epkey.ip = ep_ip.sin_addr.s_addr; + epkey.vni = trn_get_vni(vxlan->vni); + struct sockaddr_in ep_ip; + inet_pton(AF_INET, inet_ntoa(inner_ip_dest/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); + epkey.ip = ep_ip.sin_addr.s_addr; + auto ep_value = db_client::get_instance().GetNeighborInMemory(&epkey); +// endpoint_t ep_value; +// ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(inner_ip_dest)); + if (ep_value != nullptr) { +// epkey.vni = trn_get_vni(vxlan->vni); +// struct sockaddr_in ep_ip; +// inet_pton(AF_INET, inet_ntoa(inner_ip_dest/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); +// epkey.ip = ep_ip.sin_addr.s_addr; // we now have key and value, can 
modify the packet and update the map now. // int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); - printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", - inet_ntoa(inner_ip_dest), trn_get_vni(vxlan->vni), 0); +// printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", +// inet_ntoa(inner_ip_dest), trn_get_vni(vxlan->vni), 0); /* Modify inner EitherHdr, pretend it's from target */ - trn_set_dst_mac(inner_eth, ep_value.mac); + trn_set_dst_mac(inner_eth, ep_value->mac); /* Keep overlay header, update outer header destinations */ - trn_set_src_dst_ip_csum(ip, ip->daddr, ep_value.hip, (eth + len)); + trn_set_src_dst_ip_csum(ip, ip->daddr, ep_value->hip, (eth + len)); trn_set_src_mac(eth, eth->h_dest); - trn_set_dst_mac(eth, ep_value.hmac); + trn_set_dst_mac(eth, ep_value->hmac); /* * Packet modification finished, read packet content again, in order to verify the mod @@ -647,51 +532,51 @@ static bool process_packet(struct xsk_socket_info *xsk, struct ethhdr *eth = (struct ethhdr *) pkt; if (ntohs(eth->h_proto) != ETH_P_IP) { - printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); +// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); return false; } - printf("AFTER MOD: Packet length: %ld\n", len); - printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" - "eth size: %d\n", - eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], - eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], - bpf_ntohs(eth->h_proto), - sizeof(*eth)); - - // parse outer IP header - struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); - struct in_addr outer_ip_src; - outer_ip_src.s_addr = ip->saddr; - struct in_addr outer_ip_dest; - outer_ip_dest.s_addr = ip->daddr; - printf("AFTER MOD: Outer ip src: %s, ip dest: %s\n" - "AFTER MOD: 
Outer ip ihl: %d, version: %d\n", - inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), - ip->ihl, ip->version); - - // parse UDP header - struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); - printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", - udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); - - // parse VXLAN header - struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); - printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); - - // parse inner eth header - struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); - printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", - inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], - inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], - inner_eth->h_proto); - - // parse inner IP header - struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); - struct in_addr inner_ip_src, inner_ip_dest; - inner_ip_src.s_addr = inner_ip->saddr; - inner_ip_dest.s_addr = inner_ip->daddr; - printf("AFTER MOD: Inner IP src: %s\n", inet_ntoa(inner_ip_src)); - printf("AFTER MOD: Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); +// printf("AFTER MOD: Packet length: %ld\n", len); +// printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" +// "eth size: %d\n", +// eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], +// eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], +// bpf_ntohs(eth->h_proto), +// sizeof(*eth)); +// +// // parse outer IP header +// struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); +// struct in_addr outer_ip_src; +// outer_ip_src.s_addr = ip->saddr; 
+// struct in_addr outer_ip_dest; +// outer_ip_dest.s_addr = ip->daddr; +// printf("AFTER MOD: Outer ip src: %s, ip dest: %s\n" +// "AFTER MOD: Outer ip ihl: %d, version: %d\n", +// inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), +// ip->ihl, ip->version); +// +// // parse UDP header +// struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); +// printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", +// udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); +// +// // parse VXLAN header +// struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); +// printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); +// +// // parse inner eth header +// struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); +// printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", +// inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], +// inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], +// inner_eth->h_proto); +// +// // parse inner IP header +// struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); +// struct in_addr inner_ip_src, inner_ip_dest; +// inner_ip_src.s_addr = inner_ip->saddr; +// inner_ip_dest.s_addr = inner_ip->daddr; +// printf("AFTER MOD: Inner IP src: %s\n", inet_ntoa(inner_ip_src)); +// printf("AFTER MOD: Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); /* Here we sent the packet out of the receive port. Note that * we allocate one entry and schedule it. 
Your design would be * faster if you do batch processing/transmission */ @@ -709,14 +594,17 @@ static bool process_packet(struct xsk_socket_info *xsk, xsk->stats.tx_bytes += len; xsk->stats.tx_packets++; - printf("Packet sent via tx queue\n"); - printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); +// printf("Packet sent via tx queue\n"); +// printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); return true; + } else { + printf("Can't find endpoint!\n"); + return false; } } - printf("Endpoing hip == 0, returning false.\n"); +// printf("Endpoing hip == 0, returning false.\n"); return false; } @@ -756,7 +644,7 @@ static void handle_receive_packets(struct xsk_socket_info *xsk, int* fd) } /* Process received packets */ - printf("Received %d packets\n", rcvd); +// printf("Received %d packets\n", rcvd); for (i = 0; i < rcvd; i++) { uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; uint32_t len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; @@ -772,7 +660,7 @@ static void handle_receive_packets(struct xsk_socket_info *xsk, int* fd) /* Do we need to wake up the kernel for transmission */ complete_tx(xsk); - printf("tx completed\n"); +// printf("tx completed\n"); } static void rx_and_process(struct config *cfg, @@ -1099,14 +987,14 @@ void af_xdp_user::run_af_xdp() cfg.ifindex = if_nametoindex(cfg.ifname); // skb mode cfg.xdp_flags &= ~XDP_FLAGS_MODES; - cfg.xdp_flags |= XDP_FLAGS_SKB_MODE; - cfg.xsk_bind_flags &= XDP_ZEROCOPY; - cfg.xsk_bind_flags |= XDP_COPY; + cfg.xdp_flags |= XDP_FLAGS_DRV_MODE; + cfg.xsk_bind_flags &= XDP_COPY;//XDP_ZEROCOPY; + cfg.xsk_bind_flags |= XDP_ZEROCOPY;//XDP_COPY; // queue_id, default = 0 cfg.xsk_if_queue = 0; // NOT using poll - cfg.xsk_poll_mode = false; + cfg.xsk_poll_mode = true; // not doing unload this time cfg.do_unload = false; // progsec of the xdp program diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 89ba399..f125945 100644 --- a/src/comm/grpc_client.cpp +++ 
b/src/comm/grpc_client.cpp @@ -173,6 +173,8 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, //disabling the element udpate, so that all packets will be sent to user space program. int ebpf_rc = 0;//bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); + // also put in local in memory cache + db_client::get_instance().endpoint_cache.insert(epkey, ep); printf("GPPC: Inserted this neighbor into map: vip: %s, vni: %d\n", vpc_ip.c_str(), vni); // step #3 - async call to write/update to local db table 1 db_client::get_instance().local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { @@ -251,7 +253,7 @@ void ArionMasterWatcherImpl::RunClient(std::string ip, std::string port, std::st // Find lkg version to reconcile/sync from server int rev_lkg = db_client::get_instance().FindLKGVersion(); printf("Found last known good version: %d from local db to sync from server\n", rev_lkg); - + db_client::get_instance().FillEndpointCacheFromDB(); this->ConnectToArionMaster(); grpc::CompletionQueue cq; ArionWingRequest watch_req; From c7813b6d557f5128c2a51b4ffaabf7d86244d15e Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Wed, 30 Nov 2022 13:41:16 -0800 Subject: [PATCH 13/33] added counter to count processed packets; changed return type of GetNeighborInMemory from pointer to value, in order to make the packet correct after modification --- include/db_client.h | 19 ++++++-- src/comm/af_xdp_user.cpp | 93 +++++++++++++++++++++++++++++----------- 2 files changed, 82 insertions(+), 30 deletions(-) diff --git a/include/db_client.h b/include/db_client.h index 48634e1..d5fc36e 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -100,6 +100,9 @@ class db_client { auto rows = local_db.execute(get_all_neighbors_statement); printf("Retrieved %ld neighbors from local DB\n", rows.size()); for (auto & row : rows) { + printf("Retrieved this endpoint from local DB: VNI: %ld, vpc_ip: %s, host_mac: %s, vpc_mac: %s, 
host_ip: %s\n", + get<0>(row), get<1>(row).c_str(), get<2>(row).c_str(), get<3>(row).c_str(), get<4>(row).c_str() + ); endpoint_key_t key; key.vni = (get<0>(row)); struct sockaddr_in ep_ip; @@ -117,6 +120,12 @@ class db_client { inet_pton(AF_INET, get<4>(row).c_str(), &(ep_hip.sin_addr)); value.hip = ep_hip.sin_addr.s_addr; endpoint_cache.insert(key, value); + printf("Inserted this endpoint into cache: VNI: %ld, vpc_ip: %s, host_mac: %x:%x:%x:%x:%x:%x, vpc_mac: %x:%x:%x:%x:%x:%x, host_ip: %s\n", + key.vni, inet_ntoa(ep_ip.sin_addr), + value.hmac[0],value.hmac[1],value.hmac[2],value.hmac[3],value.hmac[4],value.hmac[5], + value.mac[0],value.mac[1],value.mac[2],value.mac[3],value.mac[4],value.mac[5], + inet_ntoa(ep_hip.sin_addr) + ); } printf("Finished retrieving from local DB, not endpoint cache has %ld endpoints\n", endpoint_cache.size()); } @@ -183,12 +192,14 @@ class db_client { return found_neighbor; } - endpoint_t* GetNeighborInMemory(endpoint_key_t * key) { - auto iterator = endpoint_cache.find(*key); + endpoint_t GetNeighborInMemory(endpoint_key_t key) { + auto iterator = endpoint_cache.find(key); if (iterator == endpoint_cache.end()) { - return nullptr; + return { + .hip = 0, + }; } auto endpoint_value = iterator->second;//endpoint_cache[*key]; - return std::move(&endpoint_value); + return endpoint_value; } }; \ No newline at end of file diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp index f797e92..3fc8d65 100644 --- a/src/comm/af_xdp_user.cpp +++ b/src/comm/af_xdp_user.cpp @@ -380,19 +380,27 @@ static bool process_packet(struct xsk_socket_info *xsk, struct sockaddr_in ep_ip; inet_pton(AF_INET, inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); epkey.ip = ep_ip.sin_addr.s_addr; - auto ep_value = db_client::get_instance().GetNeighborInMemory(&epkey); + auto ep_value = db_client::get_instance().GetNeighborInMemory(epkey); // endpoint_t ep_value; // ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), 
inet_ntoa(arp_dest_ip)); - if (ep_value != nullptr) { + if (ep_value.hip != 0) { // we now have key and value, can modify the packet and update the map now. // int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); // printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", // inet_ntoa(arp_src_ip), trn_get_vni(vxlan->vni), 0); /* Modify pkt for inner ARP response */ +// struct in_addr ep_ip_addr, ep_host_ip_addr; +// ep_ip_addr.s_addr = epkey.ip; +// ep_host_ip_addr.s_addr = ep_value.hip; +// printf("Retrived this endpoint: HIP: %s, IP: %s, host_mac: %x:%x:%x:%x:%x:%x, mac: %x:%x:%x:%x:%x:%x\n", +// inet_ntoa(ep_host_ip_addr), inet_ntoa(ep_ip_addr), +// ep_value.hmac[0],ep_value.hmac[1],ep_value.hmac[2],ep_value.hmac[3],ep_value.hmac[4],ep_value.hmac[5], +// ep_value.mac[0],ep_value.mac[1],ep_value.mac[2],ep_value.mac[3],ep_value.mac[4],ep_value.mac[5] +// ); arp_msg->op = bpf_htons(ARPOP_REPLY); trn_set_mac(arp_msg->tha, arp_msg->sha); - trn_set_mac(arp_msg->sha, ep_value->mac); + trn_set_mac(arp_msg->sha, ep_value.mac); __u32 tmp_ip = arp_msg->spa;//*sip; arp_msg->spa = arp_msg->tpa;//*tip; @@ -400,7 +408,7 @@ static bool process_packet(struct xsk_socket_info *xsk, /* Modify inner EitherHdr, pretend it's from target */ trn_set_dst_mac(inner_eth, inner_eth->h_source); - trn_set_src_mac(inner_eth, ep_value->mac); + trn_set_src_mac(inner_eth, ep_value.mac); /* Keep overlay header, swap outer IP header */ trn_set_src_dst_ip_csum(ip, ip->daddr, ip->saddr, (eth + len)); @@ -410,12 +418,12 @@ static bool process_packet(struct xsk_socket_info *xsk, * Packet modification finished, read packet content again, in order to verify the mod * */ - struct ethhdr *eth = (struct ethhdr *) pkt; - - if (ntohs(eth->h_proto) != ETH_P_IP) { -// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); - return false; - } +// struct ethhdr *eth = (struct ethhdr *) pkt; +// +// if (ntohs(eth->h_proto) != ETH_P_IP) { +//// 
printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); +// return false; +// } // printf("AFTER MOD: Packet length: %ld\n", len); // printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" // "eth size: %d\n", @@ -504,10 +512,11 @@ static bool process_packet(struct xsk_socket_info *xsk, struct sockaddr_in ep_ip; inet_pton(AF_INET, inet_ntoa(inner_ip_dest/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); epkey.ip = ep_ip.sin_addr.s_addr; - auto ep_value = db_client::get_instance().GetNeighborInMemory(&epkey); + auto ep_value = db_client::get_instance().GetNeighborInMemory(epkey); // endpoint_t ep_value; // ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(inner_ip_dest)); - if (ep_value != nullptr) { + + if (ep_value.hip != 0) { // epkey.vni = trn_get_vni(vxlan->vni); // struct sockaddr_in ep_ip; // inet_pton(AF_INET, inet_ntoa(inner_ip_dest/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); @@ -518,23 +527,31 @@ static bool process_packet(struct xsk_socket_info *xsk, // inet_ntoa(inner_ip_dest), trn_get_vni(vxlan->vni), 0); /* Modify inner EitherHdr, pretend it's from target */ - trn_set_dst_mac(inner_eth, ep_value->mac); +// struct in_addr ep_ip_addr, ep_host_ip_addr; +// ep_ip_addr.s_addr = epkey.ip; +// ep_host_ip_addr.s_addr = ep_value.hip; +// printf("Retrived this endpoint: HIP: %s, IP: %s, host_mac: %x:%x:%x:%x:%x:%x, mac: %x:%x:%x:%x:%x:%x\n", +// inet_ntoa(ep_host_ip_addr), inet_ntoa(ep_ip_addr), +// ep_value.hmac[0],ep_value.hmac[1],ep_value.hmac[2],ep_value.hmac[3],ep_value.hmac[4],ep_value.hmac[5], +// ep_value.mac[0],ep_value.mac[1],ep_value.mac[2],ep_value.mac[3],ep_value.mac[4],ep_value.mac[5] +// ); + trn_set_dst_mac(inner_eth, ep_value.mac); /* Keep overlay header, update outer header destinations */ - trn_set_src_dst_ip_csum(ip, ip->daddr, ep_value->hip, (eth + len)); + trn_set_src_dst_ip_csum(ip, ip->daddr, ep_value.hip, (eth + len)); trn_set_src_mac(eth, 
eth->h_dest); - trn_set_dst_mac(eth, ep_value->hmac); + trn_set_dst_mac(eth, ep_value.hmac); /* * Packet modification finished, read packet content again, in order to verify the mod * */ - struct ethhdr *eth = (struct ethhdr *) pkt; - - if (ntohs(eth->h_proto) != ETH_P_IP) { -// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); - return false; - } +// struct ethhdr *eth = (struct ethhdr *) pkt; +// +// if (ntohs(eth->h_proto) != ETH_P_IP) { +//// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); +// return false; +// } // printf("AFTER MOD: Packet length: %ld\n", len); // printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" // "eth size: %d\n", @@ -612,12 +629,22 @@ static bool process_packet(struct xsk_socket_info *xsk, } -static void handle_receive_packets(struct xsk_socket_info *xsk, int* fd) +static void handle_receive_packets(struct xsk_socket_info *xsk, int* fd, atomic* processed_packet_count) { unsigned int rcvd, stock_frames, i; uint32_t idx_rx = 0, idx_fq = 0; int ret; +// marl::schedule([&processed_packet_count] { +// int ten_seconds = (10 * 1000 * 1000); +// while (true){ +// usleep(ten_seconds); +// auto current_count = processed_packet_count.load(); +// printf("Ten seconds passed, processed packet count: %ld\n", +// current_count); +// } +// }); + rcvd = xsk_ring_cons__peek(&xsk->rx, RX_BATCH_SIZE, &idx_rx); if (!rcvd) return; @@ -653,6 +680,7 @@ static void handle_receive_packets(struct xsk_socket_info *xsk, int* fd) xsk_free_umem_frame(xsk, addr); xsk->stats.rx_bytes += len; + processed_packet_count->fetch_add(1); } xsk_ring_cons__release(&xsk->rx, rcvd); @@ -673,13 +701,26 @@ static void rx_and_process(struct config *cfg, fds[0].fd = xsk_socket__fd(xsk_socket->xsk); fds[0].events = POLLIN; printf("%s\n", "Entering while loop to process packets."); + atomic processed_packet_count(0); + std::thread t( + [&] { + int ten_seconds = (10 * 1000 * 1000); + 
while (true){ + usleep(ten_seconds); + auto current_count = processed_packet_count.load(); + printf("Ten seconds passed, processed packet count: %ld\n", + current_count); + } + } + ); + t.detach(); while(!global_exit) { if (cfg->xsk_poll_mode) { ret = poll(fds, nfds, -1); if (ret <= 0 || ret > 1) continue; } - handle_receive_packets(xsk_socket, fd); + handle_receive_packets(xsk_socket, fd, &processed_packet_count); } } @@ -987,9 +1028,9 @@ void af_xdp_user::run_af_xdp() cfg.ifindex = if_nametoindex(cfg.ifname); // skb mode cfg.xdp_flags &= ~XDP_FLAGS_MODES; - cfg.xdp_flags |= XDP_FLAGS_DRV_MODE; - cfg.xsk_bind_flags &= XDP_COPY;//XDP_ZEROCOPY; - cfg.xsk_bind_flags |= XDP_ZEROCOPY;//XDP_COPY; + cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;//XDP_FLAGS_DRV_MODE; + cfg.xsk_bind_flags &= XDP_ZEROCOPY;//XDP_COPY; + cfg.xsk_bind_flags |= XDP_COPY;//XDP_ZEROCOPY; // queue_id, default = 0 cfg.xsk_if_queue = 0; From 491bc41dc736edcc74cb66e2479fb97f8a65e801 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Tue, 6 Dec 2022 14:22:47 -0800 Subject: [PATCH 14/33] Changed folly::ConcurrentHashMap to std::unordered_map for better performance --- include/db_client.h | 5 +++-- src/comm/af_xdp_user.cpp | 35 ++++++++++++++++------------------- src/comm/grpc_client.cpp | 2 +- src/main.cpp | 14 +++++++------- 4 files changed, 27 insertions(+), 29 deletions(-) diff --git a/include/db_client.h b/include/db_client.h index d5fc36e..4f33ec0 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -87,7 +87,7 @@ class db_client { // Create local db writer single thread execution queue dispatch_queue local_db_writer_queue = dispatch_queue("Local db background write queue", 1); - folly::ConcurrentHashMap endpoint_cache; + std::unordered_map endpoint_cache; void FillEndpointCacheFromDB() { @@ -119,7 +119,8 @@ class db_client { struct sockaddr_in ep_hip; inet_pton(AF_INET, get<4>(row).c_str(), &(ep_hip.sin_addr)); value.hip = ep_hip.sin_addr.s_addr; - endpoint_cache.insert(key, value); + 
endpoint_cache[key] = value; +// endpoint_cache.insert(key, value); printf("Inserted this endpoint into cache: VNI: %ld, vpc_ip: %s, host_mac: %x:%x:%x:%x:%x:%x, vpc_mac: %x:%x:%x:%x:%x:%x, host_ip: %s\n", key.vni, inet_ntoa(ep_ip.sin_addr), value.hmac[0],value.hmac[1],value.hmac[2],value.hmac[3],value.hmac[4],value.hmac[5], diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp index 3fc8d65..2334071 100644 --- a/src/comm/af_xdp_user.cpp +++ b/src/comm/af_xdp_user.cpp @@ -28,8 +28,12 @@ #include "xdp/trn_datamodel.h" #include "util.h" //#include "xdp/trn_kern.h" +#include "marl/defer.h" +#include "marl/event.h" +#include "marl/scheduler.h" +#include "marl/waitgroup.h" -#define NUM_FRAMES 4096 +#define NUM_FRAMES 40960//4096 #define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE #define RX_BATCH_SIZE 64 #define INVALID_UMEM_FRAME UINT64_MAX @@ -502,8 +506,9 @@ static bool process_packet(struct xsk_socket_info *xsk, // TODO: Add inner IP support, refer to trn_process_inner_ip // parse inner IP header struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); - struct in_addr inner_ip_src, inner_ip_dest; - inner_ip_src.s_addr = inner_ip->saddr; +// struct in_addr inner_ip_src; +// inner_ip_src.s_addr = inner_ip->saddr; + struct in_addr inner_ip_dest; inner_ip_dest.s_addr = inner_ip->daddr; // printf("Inner IP src: %s\n", inet_ntoa(inner_ip_src)); // printf("Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); @@ -526,7 +531,6 @@ static bool process_packet(struct xsk_socket_info *xsk, // printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", // inet_ntoa(inner_ip_dest), trn_get_vni(vxlan->vni), 0); - /* Modify inner EitherHdr, pretend it's from target */ // struct in_addr ep_ip_addr, ep_host_ip_addr; // ep_ip_addr.s_addr = epkey.ip; // ep_host_ip_addr.s_addr = ep_value.hip; @@ -535,6 +539,8 @@ static bool process_packet(struct xsk_socket_info *xsk, // 
ep_value.hmac[0],ep_value.hmac[1],ep_value.hmac[2],ep_value.hmac[3],ep_value.hmac[4],ep_value.hmac[5], // ep_value.mac[0],ep_value.mac[1],ep_value.mac[2],ep_value.mac[3],ep_value.mac[4],ep_value.mac[5] // ); + + /* Modify inner EitherHdr, pretend it's from target */ trn_set_dst_mac(inner_eth, ep_value.mac); /* Keep overlay header, update outer header destinations */ @@ -635,16 +641,6 @@ static void handle_receive_packets(struct xsk_socket_info *xsk, int* fd, atomic< uint32_t idx_rx = 0, idx_fq = 0; int ret; -// marl::schedule([&processed_packet_count] { -// int ten_seconds = (10 * 1000 * 1000); -// while (true){ -// usleep(ten_seconds); -// auto current_count = processed_packet_count.load(); -// printf("Ten seconds passed, processed packet count: %ld\n", -// current_count); -// } -// }); - rcvd = xsk_ring_cons__peek(&xsk->rx, RX_BATCH_SIZE, &idx_rx); if (!rcvd) return; @@ -688,7 +684,7 @@ static void handle_receive_packets(struct xsk_socket_info *xsk, int* fd, atomic< /* Do we need to wake up the kernel for transmission */ complete_tx(xsk); -// printf("tx completed\n"); + // printf("tx completed\n"); } static void rx_and_process(struct config *cfg, @@ -710,6 +706,7 @@ static void rx_and_process(struct config *cfg, auto current_count = processed_packet_count.load(); printf("Ten seconds passed, processed packet count: %ld\n", current_count); + } } ); @@ -1028,14 +1025,14 @@ void af_xdp_user::run_af_xdp() cfg.ifindex = if_nametoindex(cfg.ifname); // skb mode cfg.xdp_flags &= ~XDP_FLAGS_MODES; - cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;//XDP_FLAGS_DRV_MODE; - cfg.xsk_bind_flags &= XDP_ZEROCOPY;//XDP_COPY; - cfg.xsk_bind_flags |= XDP_COPY;//XDP_ZEROCOPY; + cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;//XDP_FLAGS_DRV_MODE; + cfg.xsk_bind_flags &= XDP_COPY;//XDP_COPY; + cfg.xsk_bind_flags |= XDP_ZEROCOPY;//XDP_ZEROCOPY; // queue_id, default = 0 cfg.xsk_if_queue = 0; // NOT using poll - cfg.xsk_poll_mode = true; + cfg.xsk_poll_mode = false; // not doing unload this time cfg.do_unload = 
false; // progsec of the xdp program diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index f125945..8bb1da2 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -174,7 +174,7 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, int ebpf_rc = 0;//bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); // also put in local in memory cache - db_client::get_instance().endpoint_cache.insert(epkey, ep); + db_client::get_instance().endpoint_cache[epkey] = ep;//.insert(epkey, ep); printf("GPPC: Inserted this neighbor into map: vip: %s, vni: %d\n", vpc_ip.c_str(), vni); // step #3 - async call to write/update to local db table 1 db_client::get_instance().local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { diff --git a/src/main.cpp b/src/main.cpp index 59e1cbd..13e256a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -74,13 +74,13 @@ static void cleanup() { printf("%s", "Unable to delete grpc client pointer since it is null.\n"); } -// if (g_grpc_client_thread != NULL) { -// delete g_grpc_client_thread; -// g_grpc_client_thread = NULL; -// printf("%s", "Cleaned up grpc client thread.\n"); -// } else { -// printf("%s", "Unable to call delete grpc client thread pointer since it is null.\n"); -// } + if (g_grpc_client_thread != NULL) { + delete g_grpc_client_thread; + g_grpc_client_thread = NULL; + printf("%s", "Cleaned up grpc client thread.\n"); + } else { + printf("%s", "Unable to call delete grpc client thread pointer since it is null.\n"); + } } // function to handle ctrl-c and kill process From c29681160a45703448ca0c1d0d28479409252f48 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Wed, 21 Dec 2022 15:26:33 -0800 Subject: [PATCH 15/33] With this commit, AF_XDP multi threading works for ping with 8 threads; need to investigate how to add use round robin in kernel, how to make iperf work and how to user more threads --- include/af_xdp_user_multi_thread.h | 33 + 
include/db_client.h | 33 +- include/util.h | 2 +- src/CMakeLists.txt | 2 +- src/comm/af_xdp_user.cpp | 4 +- src/comm/af_xdp_user_multi_thread.cpp | 1473 +++++++++++++++++++++++++ src/main.cpp | 12 +- 7 files changed, 1538 insertions(+), 21 deletions(-) create mode 100644 include/af_xdp_user_multi_thread.h create mode 100644 src/comm/af_xdp_user_multi_thread.cpp diff --git a/include/af_xdp_user_multi_thread.h b/include/af_xdp_user_multi_thread.h new file mode 100644 index 0000000..c9116c1 --- /dev/null +++ b/include/af_xdp_user_multi_thread.h @@ -0,0 +1,33 @@ +// +// Created by ubuntu on 10/4/22. +// + +#ifndef ARIONAGENT_AF_XDP_USER_MULTI_THREADED_H +#define ARIONAGENT_AF_XDP_USER_MULTI_THREADED_H + +#include "logger.h" +#include +#include +#include +#include +#ifdef __cplusplus +extern "C" +{ +#include "common_params.h" +#include "common_user_bpf_xdp.h" +#include "common_libbpf.h" +} +#endif +static const char *__d__ = "AF_XDP kernel bypass example multi threaded\n"; + +class af_xdp_user_multi_thread { +public: + af_xdp_user_multi_thread() { + printf("%s", "Start of multithread af_xdp userspace program."); + } + static void* run_af_xdp_multi_threaded(void* args/*std::string table_name_neighbor_ebpf_map*/); +private: + +}; + +#endif //ARIONAGENT_AF_XDP_USER_H diff --git a/include/db_client.h b/include/db_client.h index 4f33ec0..65c55a8 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -100,33 +100,40 @@ class db_client { auto rows = local_db.execute(get_all_neighbors_statement); printf("Retrieved %ld neighbors from local DB\n", rows.size()); for (auto & row : rows) { + int vni = get<0>(row); + auto vpc_ip = get<1>(row).c_str(); + auto host_ip = get<4>(row).c_str(); + auto vpc_mac = get<3>(row).c_str(); + auto host_mac = get<2>(row).c_str(); printf("Retrieved this endpoint from local DB: VNI: %ld, vpc_ip: %s, host_mac: %s, vpc_mac: %s, host_ip: %s\n", - get<0>(row), get<1>(row).c_str(), get<2>(row).c_str(), get<3>(row).c_str(), get<4>(row).c_str() +// 
get<0>(row), get<1>(row).c_str(), get<2>(row).c_str(), get<3>(row).c_str(), get<4>(row).c_str() + vni, vpc_ip, host_mac, vpc_mac, host_ip ); endpoint_key_t key; - key.vni = (get<0>(row)); - struct sockaddr_in ep_ip; - inet_pton(AF_INET, get<1>(row).c_str(), &(ep_ip.sin_addr)); - key.ip = ep_ip.sin_addr.s_addr; + key.vni = vni; //(get<0>(row)); + struct sockaddr_in endpoint_vpc_ip_socket; + inet_pton(AF_INET, vpc_ip, &(endpoint_vpc_ip_socket.sin_addr)); + key.ip = endpoint_vpc_ip_socket.sin_addr.s_addr; endpoint_t value; - std::sscanf(get<3>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", + std::sscanf(vpc_mac, "%02x:%02x:%02x:%02x:%02x:%02x", &value.mac[0], &value.mac[1], &value.mac[2], &value.mac[3], &value.mac[4], &value.mac[5]); - std::sscanf(get<2>(row).c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", + std::sscanf(host_mac, "%02x:%02x:%02x:%02x:%02x:%02x", &value.hmac[0], &value.hmac[1], &value.hmac[2], &value.hmac[3], &value.hmac[4], &value.hmac[5]); - struct sockaddr_in ep_hip; - inet_pton(AF_INET, get<4>(row).c_str(), &(ep_hip.sin_addr)); - value.hip = ep_hip.sin_addr.s_addr; + struct sockaddr_in endpoint_host_ip_socket; + inet_pton(AF_INET, host_ip, &(endpoint_host_ip_socket.sin_addr)); + value.hip = endpoint_host_ip_socket.sin_addr.s_addr; endpoint_cache[key] = value; // endpoint_cache.insert(key, value); - printf("Inserted this endpoint into cache: VNI: %ld, vpc_ip: %s, host_mac: %x:%x:%x:%x:%x:%x, vpc_mac: %x:%x:%x:%x:%x:%x, host_ip: %s\n", - key.vni, inet_ntoa(ep_ip.sin_addr), + printf("Inserted this endpoint into cache: VNI: %ld, vpc_ip: %s, ", key.vni, inet_ntoa(endpoint_vpc_ip_socket.sin_addr)); + printf("host_mac: %x:%x:%x:%x:%x:%x, vpc_mac: %x:%x:%x:%x:%x:%x, host_ip: %s\n", value.hmac[0],value.hmac[1],value.hmac[2],value.hmac[3],value.hmac[4],value.hmac[5], value.mac[0],value.mac[1],value.mac[2],value.mac[3],value.mac[4],value.mac[5], - inet_ntoa(ep_hip.sin_addr) + inet_ntoa(endpoint_host_ip_socket.sin_addr) ); + printf("Finished one endpoint\n"); } 
printf("Finished retrieving from local DB, not endpoint cache has %ld endpoints\n", endpoint_cache.size()); } diff --git a/include/util.h b/include/util.h index ebed5a5..9c72c90 100644 --- a/include/util.h +++ b/include/util.h @@ -39,7 +39,7 @@ using namespace std; static inline long ip4tol(const string ip) { struct sockaddr_in sa; if (inet_pton(AF_INET, ip.c_str(), &(sa.sin_addr)) != 1) { - throw std::invalid_argument("Virtual ipv4 address is not in the expected format"); +// throw std::invalid_argument("Virtual ipv4 address is not in the expected format"); } return sa.sin_addr.s_addr; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fc779cb..9a42505 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,7 +5,7 @@ set(SOURCES ./comm/grpc_client.cpp comm/af_xdp_user.cpp # db/db_client.cpp - ) + comm/af_xdp_user_multi_thread.cpp ) #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -idirafter /usr/src/linux-headers-5.10.4/include/") #FIND_LIBRARY(LIBUUID_LIBRARIES uuid) diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp index 2334071..534191d 100644 --- a/src/comm/af_xdp_user.cpp +++ b/src/comm/af_xdp_user.cpp @@ -350,8 +350,8 @@ static bool process_packet(struct xsk_socket_info *xsk, // parse UDP header struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); -// printf("UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", -// udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); + printf("UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", + udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); // parse VXLAN header struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); diff --git a/src/comm/af_xdp_user_multi_thread.cpp b/src/comm/af_xdp_user_multi_thread.cpp new file mode 100644 index 0000000..7ab0d54 --- /dev/null +++ b/src/comm/af_xdp_user_multi_thread.cpp @@ -0,0 +1,1473 @@ +// +// Created by user on 12/16/22. 
+// + +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2020 - 2022 Intel Corporation. */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include +#include + +//#include +#include +//#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" +#include "xdp/trn_datamodel.h" +#include +#include + + + +#define VXL_DSTPORT 0xb512 // UDP dport 4789(0x12b5) for VxLAN overlay +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +typedef __u64 u64; +typedef __u32 u32; +typedef __u16 u16; +typedef __u8 u8; + +struct arp_message { + uint16_t hrd; + uint16_t pro; + uint8_t hln; + uint8_t pln; + uint16_t op; + uint8_t sha[6]; + uint32_t spa; + uint8_t tha[6]; + uint32_t tpa; +} __attribute__((__packed__)); + +struct vxlanhdr_internal { + /* Big endian! */ + __u8 rsvd1 : 3; + __u8 i_flag : 1; + __u8 rsvd2 : 4; + __u8 rsvd3[3]; + __u8 vni[3]; + __u8 rsvd4; +}; + +/* This program illustrates the packet forwarding between multiple AF_XDP + * sockets in multi-threaded environment. All threads are sharing a common + * buffer pool, with each socket having its own private buffer cache. + * + * Example 1: Single thread handling two sockets. The packets received by socket + * A (interface IFA, queue QA) are forwarded to socket B (interface IFB, queue + * QB), while the packets received by socket B are forwarded to socket A. The + * thread is running on CPU core X: + * + * ./xsk_fwd -i IFA -q QA -i IFB -q QB -c X + * + * Example 2: Two threads, each handling two sockets. The thread running on CPU + * core X forwards all the packets received by socket A to socket B, and all the + * packets received by socket B to socket A. 
The thread running on CPU core Y is + * performing the same packet forwarding between sockets C and D: + * + * ./xsk_fwd -i IFA -q QA -i IFB -q QB -i IFC -q QC -i IFD -q QD + * -c CX -c CY + */ + +/* + * Buffer pool and buffer cache + * + * For packet forwarding, the packet buffers are typically allocated from the + * pool for packet reception and freed back to the pool for further reuse once + * the packet transmission is completed. + * + * The buffer pool is shared between multiple threads. In order to minimize the + * access latency to the shared buffer pool, each thread creates one (or + * several) buffer caches, which, unlike the buffer pool, are private to the + * thread that creates them and therefore cannot be shared with other threads. + * The access to the shared pool is only needed either (A) when the cache gets + * empty due to repeated buffer allocations and it needs to be replenished from + * the pool, or (B) when the cache gets full due to repeated buffer free and it + * needs to be flushed back to the pool. + * + * In a packet forwarding system, a packet received on any input port can + * potentially be transmitted on any output port, depending on the forwarding + * configuration. For AF_XDP sockets, for this to work with zero-copy of the + * packet buffers, it is required that the buffer pool memory fits into the + * UMEM area shared by all the sockets. + */ + +struct bpool_params { + u32 n_buffers; + u32 buffer_size; + int mmap_flags; + + u32 n_users_max; + u32 n_buffers_per_slab; +}; + +/* This buffer pool implementation organizes the buffers into equally sized + * slabs of *n_buffers_per_slab*. Initially, there are *n_slabs* slabs in the + * pool that are completely filled with buffer pointers (full slabs). + * + * Each buffer cache has a slab for buffer allocation and a slab for buffer + * free, with both of these slabs initially empty.
When the cache's allocation + * slab goes empty, it is swapped with one of the available full slabs from the + * pool, if any is available. When the cache's free slab goes full, it is + * swapped for one of the empty slabs from the pool, which is guaranteed to + * succeed. + * + * Partially filled slabs never get traded between the cache and the pool + * (except when the cache itself is destroyed), which enables fast operation + * through pointer swapping. + */ +struct bpool { + struct bpool_params params; + pthread_mutex_t lock; + void *addr; + + u64 **slabs; + u64 **slabs_reserved; + u64 *buffers; + u64 *buffers_reserved; + + u64 n_slabs; + u64 n_slabs_reserved; + u64 n_buffers; + + u64 n_slabs_available; + u64 n_slabs_reserved_available; + + struct xsk_umem_config umem_cfg; + struct xsk_ring_prod umem_fq; + struct xsk_ring_cons umem_cq; + struct xsk_umem *umem; +}; + +static struct bpool * +bpool_init(struct bpool_params *params, + struct xsk_umem_config *umem_cfg) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + u64 n_slabs, n_slabs_reserved, n_buffers, n_buffers_reserved; + u64 slabs_size, slabs_reserved_size; + u64 buffers_size, buffers_reserved_size; + u64 total_size, i; + struct bpool *bp; + u8 *p; + int status; + + /* mmap prep. */ + if (setrlimit(RLIMIT_MEMLOCK, &r)) + return NULL; + + /* bpool internals dimensioning. */ + n_slabs = (params->n_buffers + params->n_buffers_per_slab - 1) / + params->n_buffers_per_slab; + n_slabs_reserved = params->n_users_max * 2; + n_buffers = n_slabs * params->n_buffers_per_slab; + n_buffers_reserved = n_slabs_reserved * params->n_buffers_per_slab; + + slabs_size = n_slabs * sizeof(u64 *); + slabs_reserved_size = n_slabs_reserved * sizeof(u64 *); + buffers_size = n_buffers * sizeof(u64); + buffers_reserved_size = n_buffers_reserved * sizeof(u64); + + total_size = sizeof(struct bpool) + + slabs_size + slabs_reserved_size + + buffers_size + buffers_reserved_size; + + /* bpool memory allocation. 
*/ + p = static_cast(calloc(total_size, sizeof(u8))); + if (!p) + return NULL; + + /* bpool memory initialization. */ + bp = (struct bpool *)p; + memcpy(&bp->params, params, sizeof(*params)); + bp->params.n_buffers = n_buffers; + + bp->slabs = (u64 **)&p[sizeof(struct bpool)]; + bp->slabs_reserved = (u64 **)&p[sizeof(struct bpool) + + slabs_size]; + bp->buffers = (u64 *)&p[sizeof(struct bpool) + + slabs_size + slabs_reserved_size]; + bp->buffers_reserved = (u64 *)&p[sizeof(struct bpool) + + slabs_size + slabs_reserved_size + buffers_size]; + + bp->n_slabs = n_slabs; + bp->n_slabs_reserved = n_slabs_reserved; + bp->n_buffers = n_buffers; + + for (i = 0; i < n_slabs; i++) + bp->slabs[i] = &bp->buffers[i * params->n_buffers_per_slab]; + bp->n_slabs_available = n_slabs; + + for (i = 0; i < n_slabs_reserved; i++) + bp->slabs_reserved[i] = &bp->buffers_reserved[i * + params->n_buffers_per_slab]; + bp->n_slabs_reserved_available = n_slabs_reserved; + + for (i = 0; i < n_buffers; i++) + bp->buffers[i] = i * params->buffer_size; + + /* lock. */ + status = pthread_mutex_init(&bp->lock, NULL); + if (status) { + free(p); + return NULL; + } + + /* mmap. */ + bp->addr = mmap(NULL, + n_buffers * params->buffer_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | params->mmap_flags, + -1, + 0); + if (bp->addr == MAP_FAILED) { + pthread_mutex_destroy(&bp->lock); + free(p); + return NULL; + } + + /* umem. 
*/ + status = xsk_umem__create(&bp->umem, + bp->addr, + bp->params.n_buffers * bp->params.buffer_size, + &bp->umem_fq, + &bp->umem_cq, + umem_cfg); + if (status) { + munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size); + pthread_mutex_destroy(&bp->lock); + free(p); + return NULL; + } + memcpy(&bp->umem_cfg, umem_cfg, sizeof(*umem_cfg)); + + return bp; +} + +static void +bpool_free(struct bpool *bp) +{ + if (!bp) + return; + + xsk_umem__delete(bp->umem); + munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size); + pthread_mutex_destroy(&bp->lock); + free(bp); +} + +struct bcache { + struct bpool *bp; + + u64 *slab_cons; + u64 *slab_prod; + + u64 n_buffers_cons; + u64 n_buffers_prod; +}; + +static u32 +bcache_slab_size(struct bcache *bc) +{ + struct bpool *bp = bc->bp; + + return bp->params.n_buffers_per_slab; +} + +static struct bcache * +bcache_init(struct bpool *bp) +{ + struct bcache *bc; + + bc = static_cast(calloc(1, sizeof(struct bcache))); + if (!bc) + return NULL; + + bc->bp = bp; + bc->n_buffers_cons = 0; + bc->n_buffers_prod = 0; + + pthread_mutex_lock(&bp->lock); + if (bp->n_slabs_reserved_available == 0) { + pthread_mutex_unlock(&bp->lock); + free(bc); + return NULL; + } + + bc->slab_cons = bp->slabs_reserved[bp->n_slabs_reserved_available - 1]; + bc->slab_prod = bp->slabs_reserved[bp->n_slabs_reserved_available - 2]; + bp->n_slabs_reserved_available -= 2; + pthread_mutex_unlock(&bp->lock); + + return bc; +} + +static void +bcache_free(struct bcache *bc) +{ + struct bpool *bp; + + if (!bc) + return; + + /* In order to keep this example simple, the case of freeing any + * existing buffers from the cache back to the pool is ignored. 
+ */ + + bp = bc->bp; + pthread_mutex_lock(&bp->lock); + bp->slabs_reserved[bp->n_slabs_reserved_available] = bc->slab_prod; + bp->slabs_reserved[bp->n_slabs_reserved_available + 1] = bc->slab_cons; + bp->n_slabs_reserved_available += 2; + pthread_mutex_unlock(&bp->lock); + + free(bc); +} + +/* To work correctly, the implementation requires that the *n_buffers* input + * argument is never greater than the buffer pool's *n_buffers_per_slab*. This + * is typically the case, with one exception taking place when a large number of + * buffers are allocated at init time (e.g. for the UMEM fill queue setup). + */ +static inline u32 +bcache_cons_check(struct bcache *bc, u32 n_buffers) +{ + struct bpool *bp = bc->bp; +// printf("bp->params.n_buffers_per_slab: %ld\n", bp->params.n_buffers_per_slab); + u64 n_buffers_per_slab = bp->params.n_buffers_per_slab; + u64 n_buffers_cons = bc->n_buffers_cons; + u64 n_slabs_available; + u64 *slab_full; + + /* + * Consumer slab is not empty: Use what's available locally. Do not + * look for more buffers from the pool when the ask can only be + * partially satisfied. + */ + if (n_buffers_cons) + return (n_buffers_cons < n_buffers) ? + n_buffers_cons : + n_buffers; + + /* + * Consumer slab is empty: look to trade the current consumer slab + * (empty) for a full slab from the pool, if any is available.
+ */ + pthread_mutex_lock(&bp->lock); + n_slabs_available = bp->n_slabs_available; + if (!n_slabs_available) { + pthread_mutex_unlock(&bp->lock); + return 0; + } + + n_slabs_available--; + slab_full = bp->slabs[n_slabs_available]; + bp->slabs[n_slabs_available] = bc->slab_cons; + bp->n_slabs_available = n_slabs_available; + pthread_mutex_unlock(&bp->lock); + + bc->slab_cons = slab_full; + bc->n_buffers_cons = n_buffers_per_slab; + return n_buffers; +} + +static inline u64 +bcache_cons(struct bcache *bc) +{ + u64 n_buffers_cons = bc->n_buffers_cons - 1; + u64 buffer; + + buffer = bc->slab_cons[n_buffers_cons]; + bc->n_buffers_cons = n_buffers_cons; + return buffer; +} + +static inline void +bcache_prod(struct bcache *bc, u64 buffer) +{ + struct bpool *bp = bc->bp; + u64 n_buffers_per_slab = bp->params.n_buffers_per_slab; + u64 n_buffers_prod = bc->n_buffers_prod; + u64 n_slabs_available; + u64 *slab_empty; + + /* + * Producer slab is not yet full: store the current buffer to it. + */ + if (n_buffers_prod < n_buffers_per_slab) { + bc->slab_prod[n_buffers_prod] = buffer; + bc->n_buffers_prod = n_buffers_prod + 1; + return; + } + + /* + * Producer slab is full: trade the cache's current producer slab + * (full) for an empty slab from the pool, then store the current + * buffer to the new producer slab. As one full slab exists in the + * cache, it is guaranteed that there is at least one empty slab + * available in the pool. + */ + pthread_mutex_lock(&bp->lock); + n_slabs_available = bp->n_slabs_available; + slab_empty = bp->slabs[n_slabs_available]; + bp->slabs[n_slabs_available] = bc->slab_prod; + bp->n_slabs_available = n_slabs_available + 1; + pthread_mutex_unlock(&bp->lock); + + slab_empty[0] = buffer; + bc->slab_prod = slab_empty; + bc->n_buffers_prod = 1; +} + +/* + * Port + * + * Each of the forwarding ports sits on top of an AF_XDP socket. 
In order for + * packet forwarding to happen with no packet buffer copy, all the sockets need + * to share the same UMEM area, which is used as the buffer pool memory. + */ +#ifndef MAX_BURST_RX +#define MAX_BURST_RX 64 +#endif + +#ifndef MAX_BURST_TX +#define MAX_BURST_TX 64 +#endif + +struct burst_rx { + u64 addr[MAX_BURST_RX]; + u32 len[MAX_BURST_RX]; +}; + +struct burst_tx { + u64 addr[MAX_BURST_TX]; + u32 len[MAX_BURST_TX]; + u32 n_pkts; +}; + +struct port_params { + struct xsk_socket_config xsk_cfg; + struct bpool *bp; + const char *iface; + u32 iface_queue; +}; + +struct port { + struct port_params params; + + struct bcache *bc; + + struct xsk_ring_cons rxq; + struct xsk_ring_prod txq; + struct xsk_ring_prod umem_fq; + struct xsk_ring_cons umem_cq; + struct xsk_socket *xsk; + int umem_fq_initialized; + + u64 n_pkts_rx; + u64 n_pkts_tx; +}; + +static void +port_free(struct port *p) +{ + if (!p) + return; + + /* To keep this example simple, the code to free the buffers from the + * socket's receive and transmit queues, as well as from the UMEM fill + * and completion queues, is not included. + */ + + if (p->xsk) + xsk_socket__delete(p->xsk); + + bcache_free(p->bc); + + free(p); +} + +static struct port * +port_init(struct port_params *params) +{ + struct port *p; + u32 umem_fq_size, pos = 0; + int status, i; + + /* Memory allocation and initialization. */ + p = static_cast<struct port *>(calloc(sizeof(struct port), 1)); + if (!p) { + printf("port_init failed because memory allocation failed.\n"); + return NULL; + } + + memcpy(&p->params, params, sizeof(p->params)); + umem_fq_size = params->bp->umem_cfg.fill_size; + + /* bcache.
*/ + p->bc = bcache_init(params->bp); + if (!p->bc || + (bcache_slab_size(p->bc) < umem_fq_size) || + (bcache_cons_check(p->bc, umem_fq_size) < umem_fq_size)) { + port_free(p); + printf("port_init failed because bcache failed.\n(bcache_slab_size(p->bc) < umem_fq_size) : %s\n" + "(bcache_cons_check(p->bc, umem_fq_size) < umem_fq_size) : %s", + ((bcache_slab_size(p->bc) < umem_fq_size) ? "true" : "false"), + ((bcache_cons_check(p->bc, umem_fq_size) < umem_fq_size) ? "true" : "false") + ); + return NULL; + } + + /* xsk socket. */ + status = xsk_socket__create_shared(&p->xsk, + params->iface, + params->iface_queue, + params->bp->umem, + &p->rxq, + &p->txq, + &p->umem_fq, + &p->umem_cq, + &params->xsk_cfg); + if (status) { + port_free(p); + printf("port_init failed because xsk_socket__create_shared failed.\n"); + return NULL; + } + + /* umem fq. */ + xsk_ring_prod__reserve(&p->umem_fq, umem_fq_size, &pos); + + for (i = 0; i < umem_fq_size; i++) + *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) = + bcache_cons(p->bc); + + xsk_ring_prod__submit(&p->umem_fq, umem_fq_size); + p->umem_fq_initialized = 1; + + return p; +} + +static inline u32 +port_rx_burst(struct port *p, struct burst_rx *b) +{ + u32 n_pkts, pos, i; + + /* Free buffers for FQ replenish. */ + n_pkts = ARRAY_SIZE(b->addr); + + n_pkts = bcache_cons_check(p->bc, n_pkts); + if (!n_pkts) + return 0; + + /* RXQ. */ + n_pkts = xsk_ring_cons__peek(&p->rxq, n_pkts, &pos); + if (!n_pkts) { + if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) { + struct pollfd pollfd = { + .fd = xsk_socket__fd(p->xsk), + .events = POLLIN, + }; + + poll(&pollfd, 1, 0); + } + return 0; + } + + for (i = 0; i < n_pkts; i++) { + b->addr[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->addr; + b->len[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->len; + } + + xsk_ring_cons__release(&p->rxq, n_pkts); + p->n_pkts_rx += n_pkts; + + /* UMEM FQ.
*/ + for ( ; ; ) { + int status; + + status = xsk_ring_prod__reserve(&p->umem_fq, n_pkts, &pos); + if (status == n_pkts) + break; + + if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) { + struct pollfd pollfd = { + .fd = xsk_socket__fd(p->xsk), + .events = POLLIN, + }; + + poll(&pollfd, 1, 0); + } + } + + for (i = 0; i < n_pkts; i++) + *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) = + bcache_cons(p->bc); + + xsk_ring_prod__submit(&p->umem_fq, n_pkts); + printf("Queue: %ld rx burst got %ld packets\n", p->params.iface_queue,n_pkts); + return n_pkts; +} + +static inline void +port_tx_burst(struct port *p, struct burst_tx *b) +{ + u32 n_pkts, pos, i; + int status; + + /* UMEM CQ. */ + n_pkts = p->params.bp->umem_cfg.comp_size; + + n_pkts = xsk_ring_cons__peek(&p->umem_cq, n_pkts, &pos); + + for (i = 0; i < n_pkts; i++) { + u64 addr = *xsk_ring_cons__comp_addr(&p->umem_cq, pos + i); + + bcache_prod(p->bc, addr); + } + + xsk_ring_cons__release(&p->umem_cq, n_pkts); + + /* TXQ. */ + n_pkts = b->n_pkts; + + for ( ; ; ) { + status = xsk_ring_prod__reserve(&p->txq, n_pkts, &pos); + if (status == n_pkts) + break; + + if (xsk_ring_prod__needs_wakeup(&p->txq)) + sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, + NULL, 0); + } + + for (i = 0; i < n_pkts; i++) { + xsk_ring_prod__tx_desc(&p->txq, pos + i)->addr = b->addr[i]; + xsk_ring_prod__tx_desc(&p->txq, pos + i)->len = b->len[i]; + } + + xsk_ring_prod__submit(&p->txq, n_pkts); + if (xsk_ring_prod__needs_wakeup(&p->txq)) + sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + p->n_pkts_tx += n_pkts; + printf("tx burst sent %ld packets\n", n_pkts); +} + +/* + * Thread + * + * Packet forwarding threads. 
+ */ +#ifndef MAX_PORTS_PER_THREAD +#define MAX_PORTS_PER_THREAD 16 +#endif + +struct thread_data { + struct port *ports_rx[MAX_PORTS_PER_THREAD]; + struct port *ports_tx[MAX_PORTS_PER_THREAD]; + u32 n_ports_rx; + struct burst_rx burst_rx; + struct burst_tx burst_tx[MAX_PORTS_PER_THREAD]; + u32 cpu_core_id; + int quit; +}; + +static void swap_mac_addresses(void *data) +{ + struct ether_header *eth = (struct ether_header *)data; + struct ether_addr *src_addr = (struct ether_addr *)&eth->ether_shost; + struct ether_addr *dst_addr = (struct ether_addr *)&eth->ether_dhost; + struct ether_addr tmp; + + tmp = *src_addr; + *src_addr = *dst_addr; + *dst_addr = tmp; +} + +static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk,*/ + /*uint64_t addr, , int* fd*/ + ) +{ + printf(">>>>>>>>>> Begin processing packet >>>>>>>>>>\n"); + + if (true) { + /* + * TODO: Parse packet here, get VNI, IP, MAC, lookup locally in DB, and replace neigbor host IP if found; + * if NOT found, drop packet and remotely GET from Arion Master.
+ * */ + int ret; + uint32_t tx_idx = 0; + uint8_t tmp_mac[ETH_ALEN]; + // parse outer eth header + struct ethhdr *eth = (struct ethhdr *) pkt; + + if (ntohs(eth->h_proto) != ETH_P_IP) { + // printf("%s\n", "returning false for this packet as it is NOT IP"); + return false; + } + printf("Packet length: %ld\n", len); + printf("Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" + "eth size: %d\n", + eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], + eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], + bpf_ntohs(eth->h_proto), + sizeof(*eth)); + + // parse outer IP header + struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); + struct in_addr outer_ip_src; + outer_ip_src.s_addr = ip->saddr; + struct in_addr outer_ip_dest; + outer_ip_dest.s_addr = ip->daddr; + printf("Outer ip src: %s,",inet_ntoa(outer_ip_src)); + printf("ip dest: %s\n" + "Outer ip ihl: %d, version: %d\n", + inet_ntoa(outer_ip_dest), + ip->ihl, ip->version); + + // parse UDP header + struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); + printf("UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", + udp->dest, udp->source, (udp->dest==VXL_DSTPORT? 
"true" : "false")); + + // parse VXLAN header + struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); + printf("VNI: %ld, \n",trn_get_vni(vxlan->vni)); + + // parse inner eth header + struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); + printf("inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", + inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], + inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], + inner_eth->h_proto); + + if (ntohs(inner_eth->h_proto) == ETH_P_ARP) { + // parse inner arp header + arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); + struct in_addr arp_src_ip; + arp_src_ip.s_addr = arp_msg->spa; + struct in_addr arp_dest_ip; + arp_dest_ip.s_addr = arp_msg->tpa; + printf("arp op: %d\n", + bpf_htons(arp_msg->op)); + printf("arp source ip: %s, \n", + inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) + ); + printf("arp dest ip: %s, \n", + inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) + ); + endpoint_key_t epkey; + epkey.vni = trn_get_vni(vxlan->vni); + struct sockaddr_in ep_ip; + inet_pton(AF_INET, inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); + epkey.ip = ep_ip.sin_addr.s_addr; + auto ep_value = db_client::get_instance().GetNeighborInMemory(epkey); + // endpoint_t ep_value; + // ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(arp_dest_ip)); + if (ep_value.hip != 0) { + // we now have key and value, can modify the packet and update the map now. 
+ // int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); + // printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", + // inet_ntoa(arp_src_ip), trn_get_vni(vxlan->vni), 0); + + /* Modify pkt for inner ARP response */ + struct in_addr ep_ip_addr, ep_host_ip_addr; + ep_ip_addr.s_addr = epkey.ip; + ep_host_ip_addr.s_addr = ep_value.hip; + printf("Retrived this endpoint: HIP: %s ", inet_ntoa(ep_host_ip_addr)); + printf("IP: %s, host_mac: %x:%x:%x:%x:%x:%x, mac: %x:%x:%x:%x:%x:%x\n", + inet_ntoa(ep_ip_addr), + ep_value.hmac[0],ep_value.hmac[1],ep_value.hmac[2],ep_value.hmac[3],ep_value.hmac[4],ep_value.hmac[5], + ep_value.mac[0],ep_value.mac[1],ep_value.mac[2],ep_value.mac[3],ep_value.mac[4],ep_value.mac[5] + ); + arp_msg->op = bpf_htons(ARPOP_REPLY); + trn_set_mac(arp_msg->tha, arp_msg->sha); + trn_set_mac(arp_msg->sha, ep_value.mac); + + __u32 tmp_ip = arp_msg->spa;//*sip; + arp_msg->spa = arp_msg->tpa;//*tip; + arp_msg->tpa = tmp_ip; + + /* Modify inner EitherHdr, pretend it's from target */ + trn_set_dst_mac(inner_eth, inner_eth->h_source); + trn_set_src_mac(inner_eth, ep_value.mac); + + /* Keep overlay header, swap outer IP header */ + trn_set_src_dst_ip_csum(ip, ip->daddr, ip->saddr, (eth + len)); + trn_swap_src_dst_mac(pkt); + + /* + * Packet modification finished, read packet content again, in order to verify the mod + * */ + + struct ethhdr *eth = (struct ethhdr *) pkt; + + if (ntohs(eth->h_proto) != ETH_P_IP) { +// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); + return false; + } + printf("AFTER MOD: Packet length: %ld\n", len); + printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" + "eth size: %d\n", + eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], + eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], + bpf_ntohs(eth->h_proto), + 
sizeof(*eth)); + + // parse outer IP header + struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); + struct in_addr outer_ip_src; + outer_ip_src.s_addr = ip->saddr; + struct in_addr outer_ip_dest; + outer_ip_dest.s_addr = ip->daddr; + printf("AFTER MOD: Outer ip src: %s,", inet_ntoa(outer_ip_src)); + printf("ip dest: %s\n" + "AFTER MOD: Outer ip ihl: %d, version: %d\n", + inet_ntoa(outer_ip_dest), + ip->ihl, ip->version); + + // parse UDP header + struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); + printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", + udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); + + // parse VXLAN header + struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); + printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); + + // parse inner eth header + struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); + printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", + inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], + inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], + inner_eth->h_proto); + + // parse inner arp header + arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); + struct in_addr arp_src_ip; + arp_src_ip.s_addr = arp_msg->spa; + struct in_addr arp_dest_ip; + arp_dest_ip.s_addr = arp_msg->tpa; + printf("AFTER MOD: arp op: %d\n", + bpf_htons(arp_msg->op)); + printf("AFTER MOD: arp source ip: %s, \n", + inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) + ); + printf("AFTER MOD: arp dest ip: %s, \n", + inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) + ); + /* Here we sent the packet out of the receive port. Note that + * we allocate one entry and schedule it. 
Your design would be + * faster if you do batch processing/transmission */ + + printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); + + return true; + } else { + printf("Can't find endpoint!\n"); + return false; + } + }else if (ntohs(inner_eth->h_proto) == ETH_P_IP) { + // TODO: Add inner IP support, refer to trn_process_inner_ip + // parse inner IP header + struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); + struct in_addr inner_ip_src; + inner_ip_src.s_addr = inner_ip->saddr; + struct in_addr inner_ip_dest; + inner_ip_dest.s_addr = inner_ip->daddr; + printf("Inner IP src: %s\n", inet_ntoa(inner_ip_src)); + printf("Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); + endpoint_key_t epkey; + epkey.vni = trn_get_vni(vxlan->vni); + struct sockaddr_in ep_ip; + inet_pton(AF_INET, inet_ntoa(inner_ip_dest/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); + epkey.ip = ep_ip.sin_addr.s_addr; + auto ep_value = db_client::get_instance().GetNeighborInMemory(epkey); + // endpoint_t ep_value; + // ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(inner_ip_dest)); + + if (ep_value.hip != 0) { + // epkey.vni = trn_get_vni(vxlan->vni); + // struct sockaddr_in ep_ip; + // inet_pton(AF_INET, inet_ntoa(inner_ip_dest/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); + // epkey.ip = ep_ip.sin_addr.s_addr; + // we now have key and value, can modify the packet and update the map now. 
+ // int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); + // printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", + // inet_ntoa(inner_ip_dest), trn_get_vni(vxlan->vni), 0); + + struct in_addr ep_ip_addr, ep_host_ip_addr; + ep_ip_addr.s_addr = epkey.ip; + ep_host_ip_addr.s_addr = ep_value.hip; + printf("Retrived this endpoint: HIP: %s,", inet_ntoa(ep_host_ip_addr)); + printf("IP: %s, host_mac: %x:%x:%x:%x:%x:%x, mac: %x:%x:%x:%x:%x:%x\n", + inet_ntoa(ep_ip_addr), + ep_value.hmac[0],ep_value.hmac[1],ep_value.hmac[2],ep_value.hmac[3],ep_value.hmac[4],ep_value.hmac[5], + ep_value.mac[0],ep_value.mac[1],ep_value.mac[2],ep_value.mac[3],ep_value.mac[4],ep_value.mac[5] + ); + + /* Modify inner EitherHdr, pretend it's from target */ + trn_set_dst_mac(inner_eth, ep_value.mac); + + /* Keep overlay header, update outer header destinations */ + trn_set_src_dst_ip_csum(ip, ip->daddr, ep_value.hip, (eth + len)); + trn_set_src_mac(eth, eth->h_dest); + trn_set_dst_mac(eth, ep_value.hmac); + + /* + * Packet modification finished, read packet content again, in order to verify the mod + * */ + + struct ethhdr *eth = (struct ethhdr *) pkt; + + if (ntohs(eth->h_proto) != ETH_P_IP) { +// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); + return false; + } + printf("AFTER MOD: Packet length: %ld\n", len); + printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" + "eth size: %d\n", + eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], + eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], + bpf_ntohs(eth->h_proto), + sizeof(*eth)); + + // parse outer IP header + struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); + struct in_addr outer_ip_src; + outer_ip_src.s_addr = ip->saddr; + struct in_addr outer_ip_dest; + outer_ip_dest.s_addr = ip->daddr; + printf("AFTER MOD: Outer 
ip src: %s", inet_ntoa(outer_ip_src)); + printf("ip dest: %s\n" + "AFTER MOD: Outer ip ihl: %d, version: %d\n", + inet_ntoa(outer_ip_dest), + ip->ihl, ip->version); + + // parse UDP header + struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); + printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", + udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); + + // parse VXLAN header + struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); + printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); + + // parse inner eth header + struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); + printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", + inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], + inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], + inner_eth->h_proto); + + // parse inner IP header + struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); + struct in_addr inner_ip_src, inner_ip_dest; + inner_ip_src.s_addr = inner_ip->saddr; + inner_ip_dest.s_addr = inner_ip->daddr; + printf("AFTER MOD: Inner IP src: %s\n", inet_ntoa(inner_ip_src)); + printf("AFTER MOD: Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); + /* Here we sent the packet out of the receive port. Note that + * we allocate one entry and schedule it. 
Your design would be + * faster if you do batch processing/transmission */ + + printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); + + return true; + } else { + printf("Can't find endpoint!\n"); + return false; + } + } + + printf("Endpoing hip == 0, returning false.\n"); + return false; + } + + return false; +} + + + +static void * +thread_func(void *arg) +{ + struct thread_data *t = static_cast<struct thread_data *>(arg); + cpu_set_t cpu_cores; + u32 i; + + CPU_ZERO(&cpu_cores); + CPU_SET(t->cpu_core_id, &cpu_cores); + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_cores); + + for (i = 0; !t->quit; i = (i + 1) & (t->n_ports_rx - 1)) { + struct port *port_rx = t->ports_rx[i]; + struct port *port_tx = t->ports_tx[i]; + struct burst_rx *brx = &t->burst_rx; + struct burst_tx *btx = &t->burst_tx[i]; + u32 n_pkts, j; + + /* RX. */ + n_pkts = port_rx_burst(port_rx, brx); + if (!n_pkts) { +// printf("thead %ld got no packets in rx_burst, continue\n", t->cpu_core_id ); + continue; + } + + /* Process & TX.
*/ + for (j = 0; j < n_pkts; j++) { + u64 addr = xsk_umem__add_offset_to_addr(brx->addr[j]); + u8 *pkt = static_cast<u8 *>(xsk_umem__get_data(port_rx->params.bp->addr, addr)); + + process_packet(pkt, brx->len[j]); +// swap_mac_addresses(pkt); + + btx->addr[btx->n_pkts] = brx->addr[j]; + btx->len[btx->n_pkts] = brx->len[j]; + btx->n_pkts++; + + if (btx->n_pkts > 0/*== MAX_BURST_TX*/) { + port_tx_burst(port_tx, btx); + btx->n_pkts = 0; + } + } + } + + return NULL; +} + +/* + * Process + */ +static const struct bpool_params bpool_params_default = { + .n_buffers = 64/*96*/ * 1024, + .buffer_size = XSK_UMEM__DEFAULT_FRAME_SIZE, + .mmap_flags = 0, + + .n_users_max = 16/*24*/, + .n_buffers_per_slab = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, +}; + +static const struct xsk_umem_config umem_cfg_default = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, + .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, + .flags = 0, +}; + +static const struct port_params port_params_default = { + .xsk_cfg = { + .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .libbpf_flags = 0, //.libxdp_flags + .xdp_flags = XDP_FLAGS_DRV_MODE, + .bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY, + }, + + .bp = NULL, + .iface = NULL, + .iface_queue = 0, +}; + +#ifndef MAX_PORTS +#define MAX_PORTS 64 +#endif + +#ifndef MAX_THREADS +#define MAX_THREADS 64 +#endif + +static struct bpool_params bpool_params; +static struct xsk_umem_config umem_cfg; +static struct bpool *bp; + +static struct port_params port_params[MAX_PORTS]; +static struct port *ports[MAX_PORTS]; +static u64 n_pkts_rx[MAX_PORTS]; +static u64 n_pkts_tx[MAX_PORTS]; +static int n_ports; + +static pthread_t threads[MAX_THREADS]; +static struct thread_data thread_data[MAX_THREADS]; +static int n_threads; + +static void +print_usage(char *prog_name) +{ + const char *usage = + "Usage:\n" + "\t%s [ -b SIZE ] -c CORE
-i INTERFACE [ -q QUEUE ]\n" + "\n" + "-c CORE CPU core to run a packet forwarding thread\n" + " on. May be invoked multiple times.\n" + "\n" + "-b SIZE Number of buffers in the buffer pool shared\n" + " by all the forwarding threads. Default: %u.\n" + "\n" + "-i INTERFACE Network interface. Each (INTERFACE, QUEUE)\n" + " pair specifies one forwarding port. May be\n" + " invoked multiple times.\n" + "\n" + "-q QUEUE Network interface queue for RX and TX. Each\n" + " (INTERFACE, QUEUE) pair specified one\n" + " forwarding port. Default: %u. May be invoked\n" + " multiple times.\n" + "\n"; + printf(usage, + prog_name, + bpool_params_default.n_buffers, + port_params_default.iface_queue); +} + +static int +parse_args(int argc, char **argv) +{ + struct option lgopts[] = { + { NULL, 0, 0, 0 } + }; + int opt, option_index; + + /* Parse the input arguments. */ + for ( ; ;) { + opt = getopt_long(argc, argv, "c:i:q:", lgopts, &option_index); + if (opt == EOF) + break; + + switch (opt) { + case 'b': + bpool_params.n_buffers = atoi(optarg); + break; + + case 'c': + if (n_threads == MAX_THREADS) { + printf("Max number of threads (%d) reached.\n", + MAX_THREADS); + return -1; + } + + thread_data[n_threads].cpu_core_id = atoi(optarg); + n_threads++; + break; + + case 'i': + if (n_ports == MAX_PORTS) { + printf("Max number of ports (%d) reached.\n", + MAX_PORTS); + return -1; + } + + port_params[n_ports].iface = optarg; + port_params[n_ports].iface_queue = 0; + n_ports++; + break; + + case 'q': + if (n_ports == 0) { + printf("No port specified for queue.\n"); + return -1; + } + port_params[n_ports - 1].iface_queue = atoi(optarg); + break; + + default: + printf("Illegal argument.\n"); + return -1; + } + } + + optind = 1; /* reset getopt lib */ + + /* Check the input arguments. 
*/ + if (!n_ports) { + printf("No ports specified.\n"); + return -1; + } + + if (!n_threads) { + printf("No threads specified.\n"); + return -1; + } + + if (n_ports % n_threads) { + printf("Ports cannot be evenly distributed to threads.\n"); + return -1; + } + + return 0; +} + +static void +print_port(u32 port_id) +{ + struct port *port = ports[port_id]; + + printf("Port %u: interface = %s, queue = %u\n", + port_id, port->params.iface, port->params.iface_queue); +} + +static void +print_thread(u32 thread_id) +{ + struct thread_data *t = &thread_data[thread_id]; + u32 i; + + printf("Thread %u (CPU core %u): ", + thread_id, t->cpu_core_id); + + for (i = 0; i < t->n_ports_rx; i++) { + struct port *port_rx = t->ports_rx[i]; + struct port *port_tx = t->ports_tx[i]; + + printf("(%s, %u) -> (%s, %u), ", + port_rx->params.iface, + port_rx->params.iface_queue, + port_tx->params.iface, + port_tx->params.iface_queue); + } + + printf("\n"); +} + +static void +print_port_stats_separator(void) +{ + printf("+-%4s-+-%12s-+-%13s-+-%12s-+-%13s-+\n", + "----", + "------------", + "-------------", + "------------", + "-------------"); +} + +static void +print_port_stats_header(void) +{ + print_port_stats_separator(); + printf("| %4s | %12s | %13s | %12s | %13s |\n", + "Port", + "RX packets", + "RX rate (pps)", + "TX packets", + "TX_rate (pps)"); + print_port_stats_separator(); +} + +static void +print_port_stats_trailer(void) +{ + print_port_stats_separator(); + printf("\n"); +} + +static void +print_port_stats(int port_id, u64 ns_diff) +{ + struct port *p = ports[port_id]; + double rx_pps, tx_pps; + + rx_pps = (p->n_pkts_rx - n_pkts_rx[port_id]) * 1000000000. / ns_diff; + tx_pps = (p->n_pkts_tx - n_pkts_tx[port_id]) * 1000000000. 
/ ns_diff; + + printf("| %4d | %12llu | %13.0f | %12llu | %13.0f |\n", + port_id, + p->n_pkts_rx, + rx_pps, + p->n_pkts_tx, + tx_pps); + + n_pkts_rx[port_id] = p->n_pkts_rx; + n_pkts_tx[port_id] = p->n_pkts_tx; +} + +static void +print_port_stats_all(u64 ns_diff) +{ + int i; + + print_port_stats_header(); + for (i = 0; i < n_ports; i++) + print_port_stats(i, ns_diff); + print_port_stats_trailer(); +} + +static int quit; + +static void +signal_handler(int sig) +{ + quit = 1; +} + +//static void remove_xdp_program(void) +//{ +// struct xdp_multiprog *mp; +// int i, err; +// +// for (i = 0 ; i < n_ports; i++) { +// mp = xdp_multiprog__get_from_ifindex(if_nametoindex(port_params[i].iface)); +// if (IS_ERR_OR_NULL(mp)) { +// printf("No XDP program loaded on %s\n", port_params[i].iface); +// continue; +// } +// +// err = xdp_multiprog__detach(mp); +// if (err) +// printf("Unable to detach XDP program: %s\n", strerror(-err)); +// } +//} + +void* af_xdp_user_multi_thread::run_af_xdp_multi_threaded(void* args/*int argc, char **argv*/) +{ + struct timespec time; + u64 ns0; + int i; + + /* Parse args. */ + memcpy(&bpool_params, &bpool_params_default, + sizeof(struct bpool_params)); + memcpy(&umem_cfg, &umem_cfg_default, + sizeof(struct xsk_umem_config)); + for (i = 0; i < MAX_PORTS; i++) + memcpy(&port_params[i], &port_params_default, + sizeof(struct port_params)); + +// if (parse_args(argc, argv)) { +// print_usage(argv[0]); +// return -1; +// } + auto number_of_cores = 8;//std::thread::hardware_concurrency(); + printf("This machine has %ld cores\n", number_of_cores); + // 2 ports(interfaces), same name, different q number. + n_ports = number_of_cores; + + // using 1 thread per iface + iface_queue + n_threads = number_of_cores; // get number of cores of this machine. 
+ + for ( int i = 0 ; i < number_of_cores ; i ++) { + port_params[i].iface = "enp4s0f1"; + port_params[i].iface_queue = i; + thread_data[i].cpu_core_id = i; + } +// port_params[0].iface = "enp4s0f1"; +// port_params[0].iface_queue = 0; +// thread_data[0].cpu_core_id = 0; +// +// port_params[1].iface = "enp4s0f1"; +// port_params[1].iface_queue = 1; +// thread_data[1].cpu_core_id = 1; + + + /* Buffer pool initialization. */ + bp = bpool_init(&bpool_params, &umem_cfg); + if (!bp) { + printf("Buffer pool initialization failed.\n"); + return args; + } + printf("Buffer pool created successfully.\n"); + + /* Ports initialization. */ + for (i = 0; i < MAX_PORTS; i++) + port_params[i].bp = bp; + + for (i = 0; i < n_ports; i++) { + ports[i] = port_init(&port_params[i]); + if (!ports[i]) { + printf("Port %d initialization failed.\n", i); + return args; + } + print_port(i); + } + printf("All ports created successfully.\n"); + + /* Threads. */ + for (i = 0; i < n_threads; i++) { + struct thread_data *t = &thread_data[i]; + u32 n_ports_per_thread = n_ports / n_threads, j; + + for (j = 0; j < n_ports_per_thread; j++) { + t->ports_rx[j] = ports[i * n_ports_per_thread + j]; + t->ports_tx[j] = ports[i * n_ports_per_thread + + (j + 1) % n_ports_per_thread]; + } + + t->n_ports_rx = n_ports_per_thread; + + print_thread(i); + } + + for (i = 0; i < n_threads; i++) { + int status; + + status = pthread_create(&threads[i], + NULL, + thread_func, + &thread_data[i]); + if (status) { + printf("Thread %d creation failed.\n", i); + return args; + } + } + printf("All threads created successfully.\n"); + + /* Print statistics. 
*/ + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); + signal(SIGABRT, signal_handler); + + clock_gettime(CLOCK_MONOTONIC, &time); + ns0 = time.tv_sec * 1000000000UL + time.tv_nsec; + for ( ; !quit; ) { + u64 ns1, ns_diff; + + sleep(10); + clock_gettime(CLOCK_MONOTONIC, &time); + ns1 = time.tv_sec * 1000000000UL + time.tv_nsec; + ns_diff = ns1 - ns0; + ns0 = ns1; + + print_port_stats_all(ns_diff); + } + + /* Threads completion. */ + printf("Quit.\n"); + for (i = 0; i < n_threads; i++) + thread_data[i].quit = 1; + + for (i = 0; i < n_threads; i++) + pthread_join(threads[i], NULL); + + for (i = 0; i < n_ports; i++) + port_free(ports[i]); + + bpool_free(bp); + +// remove_xdp_program(); + + return args; +} diff --git a/src/main.cpp b/src/main.cpp index 13e256a..e3f54e7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -26,6 +26,7 @@ #include "marl/scheduler.h" #include "marl/waitgroup.h" #include "af_xdp_user.h" +#include "af_xdp_user_multi_thread.h" #include "grpc_client.h" using namespace std; @@ -146,10 +147,13 @@ int main(int argc, char *argv[]) { g_arion_neighbor_table); }); - marl::schedule([=] { - auto af = af_xdp_user(); - af.run_af_xdp(/*g_arion_neighbor_table*/); - }); +// marl::schedule([=] { +// auto af = af_xdp_user(); +// af.run_af_xdp(/*g_arion_neighbor_table*/); +// }); + auto afm = af_xdp_user_multi_thread(); + pthread_t t; + pthread_create(&t, NULL, &af_xdp_user_multi_thread::run_af_xdp_multi_threaded, &afm); pause(); cleanup(); From 1f3b4e9c904eba0ec76e8dd5fdab9254407f9094 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Wed, 28 Dec 2022 14:13:54 -0800 Subject: [PATCH 16/33] bcache_cons_check has a bug where the first n_buffer_cons is zero, even after trading from bpool. Made a hack to make iperf work. 
--- src/comm/af_xdp_user_multi_thread.cpp | 60 ++++++++++++++++++++------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/src/comm/af_xdp_user_multi_thread.cpp b/src/comm/af_xdp_user_multi_thread.cpp index 7ab0d54..1046bc3 100644 --- a/src/comm/af_xdp_user_multi_thread.cpp +++ b/src/comm/af_xdp_user_multi_thread.cpp @@ -185,6 +185,7 @@ bpool_init(struct bpool_params *params, /* bpool internals dimensioning. */ n_slabs = (params->n_buffers + params->n_buffers_per_slab - 1) / params->n_buffers_per_slab; + printf("bpool_init: n_slabs = %ld\n", n_slabs); n_slabs_reserved = params->n_users_max * 2; n_buffers = n_slabs * params->n_buffers_per_slab; n_buffers_reserved = n_slabs_reserved * params->n_buffers_per_slab; @@ -359,7 +360,7 @@ static inline u32 bcache_cons_check(struct bcache *bc, u32 n_buffers) { struct bpool *bp = bc->bp; -// printf("bp->params.n_buffers_per_slab: %ld\n", bp->params.n_buffers_per_slab); +// printf("n_buffers: %ld\nbp->params.n_buffers_per_slab: %ld\n", n_buffers, bp->params.n_buffers_per_slab); u64 n_buffers_per_slab = bp->params.n_buffers_per_slab; u64 n_buffers_cons = bc->n_buffers_cons; u64 n_slabs_available; @@ -380,8 +381,10 @@ bcache_cons_check(struct bcache *bc, u32 n_buffers) * (full) for a full slab from the pool, if any is available. 
*/ pthread_mutex_lock(&bp->lock); + printf("Locking bp\n"); n_slabs_available = bp->n_slabs_available; if (!n_slabs_available) { + printf("Unlocking bp because !n_slabs_available)\n"); pthread_mutex_unlock(&bp->lock); return 0; } @@ -390,10 +393,12 @@ bcache_cons_check(struct bcache *bc, u32 n_buffers) slab_full = bp->slabs[n_slabs_available]; bp->slabs[n_slabs_available] = bc->slab_cons; bp->n_slabs_available = n_slabs_available; + printf("Unlocking bp because traded a slab from bpool\n"); pthread_mutex_unlock(&bp->lock); bc->slab_cons = slab_full; bc->n_buffers_cons = n_buffers_per_slab; + printf("bc->n_buffers_cons = %ld\n", bc->n_buffers_cons); return n_buffers; } @@ -422,6 +427,7 @@ bcache_prod(struct bcache *bc, u64 buffer) */ if (n_buffers_prod < n_buffers_per_slab) { bc->slab_prod[n_buffers_prod] = buffer; + printf("bcache_prod: n_buffers_prod: %ld\nn_buffers_per_slab: %ld\n", n_buffers_prod, n_buffers_per_slab); bc->n_buffers_prod = n_buffers_prod + 1; return; } @@ -437,6 +443,7 @@ bcache_prod(struct bcache *bc, u64 buffer) n_slabs_available = bp->n_slabs_available; slab_empty = bp->slabs[n_slabs_available]; bp->slabs[n_slabs_available] = bc->slab_prod; + printf("bcache_prod: bp->n_slabs_available = n_slabs_available + 1;"); bp->n_slabs_available = n_slabs_available + 1; pthread_mutex_unlock(&bp->lock); @@ -529,6 +536,7 @@ port_init(struct port_params *params) memcpy(&p->params, params, sizeof(p->params)); umem_fq_size = params->bp->umem_cfg.fill_size; + printf("port_init: umem_fq_size: %ld\n", umem_fq_size); /* bcache. */ p->bc = bcache_init(params->bp); @@ -569,7 +577,8 @@ port_init(struct port_params *params) xsk_ring_prod__submit(&p->umem_fq, umem_fq_size); p->umem_fq_initialized = 1; - + printf("port init: queue: %d, n_buffers_cons: %ld, n_buffers_prod: %ld\n", + p->params.iface_queue, p->bc->n_buffers_cons, p->bc->n_buffers_prod); return p; } @@ -580,13 +589,19 @@ port_rx_burst(struct port *p, struct burst_rx *b) /* Free buffers for FQ replenish. 
*/ n_pkts = ARRAY_SIZE(b->addr); - + if (p->bc->n_buffers_cons == 0) { + printf("port_rx_burst: p->bc->n_buffers_cons == 0, need to trade slab from pool\n"); + } n_pkts = bcache_cons_check(p->bc, n_pkts); +// printf("Queue: %ld ons_check got %ld packets\n", p->params.iface_queue,n_pkts); + if (!n_pkts) return 0; /* RXQ. */ n_pkts = xsk_ring_cons__peek(&p->rxq, n_pkts, &pos); +// printf("Queue: %ld RXQ got %ld packets\n", p->params.iface_queue,n_pkts); + if (!n_pkts) { if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) { struct pollfd pollfd = { @@ -612,8 +627,10 @@ port_rx_burst(struct port *p, struct burst_rx *b) int status; status = xsk_ring_prod__reserve(&p->umem_fq, n_pkts, &pos); - if (status == n_pkts) + if (status == n_pkts) { +// printf("Queue: %ld Fill Queue got %ld packets, breaking\n", p->params.iface_queue,n_pkts); break; + } if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) { struct pollfd pollfd = { @@ -622,6 +639,7 @@ port_rx_burst(struct port *p, struct burst_rx *b) }; poll(&pollfd, 1, 0); +// printf("Queue: %ld Fill Queue pooling for %ld packets\n", p->params.iface_queue,n_pkts); } } @@ -633,7 +651,7 @@ port_rx_burst(struct port *p, struct burst_rx *b) printf("Queue: %ld rx burst got %ld packets\n", p->params.iface_queue,n_pkts); return n_pkts; } - +qq static inline void port_tx_burst(struct port *p, struct burst_tx *b) { @@ -658,8 +676,10 @@ port_tx_burst(struct port *p, struct burst_tx *b) for ( ; ; ) { status = xsk_ring_prod__reserve(&p->txq, n_pkts, &pos); - if (status == n_pkts) + if (status == n_pkts) { +// printf("Queue: %ld TX Queue got %ld packets, breaking\n", p->params.iface_queue,n_pkts); break; + } if (xsk_ring_prod__needs_wakeup(&p->txq)) sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, @@ -675,7 +695,7 @@ port_tx_burst(struct port *p, struct burst_tx *b) if (xsk_ring_prod__needs_wakeup(&p->txq)) sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); p->n_pkts_tx += n_pkts; - printf("tx burst sent %ld packets\n", n_pkts); +// 
printf("Queue: %ld tx burst sent %ld packets\n", p->params.iface_queue, n_pkts); } /* @@ -716,6 +736,7 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk printf(">>>>>>>>>> Begin processing packet >>>>>>>>>>\n"); if (true) { + printf("Process packets: inside if (true)\n"); /* * TODO: Parse packet here, get VNI, IP, MAC, lookup locally in DB, and replace neigbor host IP if found; * if NOT found, drop packet and remotely GET from Arion Master. @@ -727,7 +748,7 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk struct ethhdr *eth = (struct ethhdr *) pkt; if (ntohs(eth->h_proto) != ETH_P_IP) { - // printf("%s\n", "returning false for this packet as it is NOT IP"); + printf("Process packets: returning false for this packet as it is NOT IP %u\n", ntohs(eth->h_proto)); return false; } printf("Packet length: %ld\n", len); @@ -757,7 +778,7 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk // parse VXLAN header struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); - printf("VNI: %ld, \n",trn_get_vni(vxlan->vni)); + printf("VNI: %ld, \n",trn_get_vni(vxlan->vni)); // parse inner eth header struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); @@ -892,8 +913,8 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk printf("Can't find endpoint!\n"); return false; } - }else if (ntohs(inner_eth->h_proto) == ETH_P_IP) { - // TODO: Add inner IP support, refer to trn_process_inner_ip + } + else if (ntohs(inner_eth->h_proto) == ETH_P_IP) { // parse inner IP header struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); struct in_addr inner_ip_src; @@ -957,7 +978,7 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk bpf_ntohs(eth->h_proto), sizeof(*eth)); - // parse outer IP header +// parse outer IP header struct iphdr *ip = (struct iphdr *) (eth + 
1/*sizeof(*eth)*/); struct in_addr outer_ip_src; outer_ip_src.s_addr = ip->saddr; @@ -1005,10 +1026,10 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk } } - printf("Endpoing hip == 0, returning false.\n"); + printf("Neither ARP or IP, returning false.\n"); return false; } - + printf("process packet: how is this false?\n"); return false; } @@ -1024,7 +1045,13 @@ thread_func(void *arg) CPU_ZERO(&cpu_cores); CPU_SET(t->cpu_core_id, &cpu_cores); pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_cores); - + for (int i = 0 ; i < t->n_ports_rx ; i ++) { + struct port * port_rx = t->ports_rx[i]; + if (port_rx->bc->n_buffers_cons == 0) { + port_rx->bc->n_buffers_cons = 4096; + printf("Manually setting port %d n_buffer_cons to 4096\n", port_rx->params.iface_queue); + } + } for (i = 0; !t->quit; i = (i + 1) & (t->n_ports_rx - 1)) { struct port *port_rx = t->ports_rx[i]; struct port *port_tx = t->ports_tx[i]; @@ -1041,8 +1068,10 @@ thread_func(void *arg) /* Process & TX. 
*/ for (j = 0; j < n_pkts; j++) { + printf("Queue %ld getting the %ld th packet\n", port_rx->params.iface_queue, j); u64 addr = xsk_umem__add_offset_to_addr(brx->addr[j]); u8 *pkt = static_cast(xsk_umem__get_data(port_rx->params.bp->addr, addr)); + printf("Queue %ld processing the %ld th packet\n", port_rx->params.iface_queue, j); process_packet(pkt, brx->len[j]); // swap_mac_addresses(pkt); @@ -1050,7 +1079,6 @@ thread_func(void *arg) btx->addr[btx->n_pkts] = brx->addr[j]; btx->len[btx->n_pkts] = brx->len[j]; btx->n_pkts++; - if (btx->n_pkts > 0/*== MAX_BURST_TX*/) { port_tx_burst(port_tx, btx); btx->n_pkts = 0; From d15fa06475ea587a667e3a16ff4620b1a5778d14 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 30 Dec 2022 15:44:51 -0800 Subject: [PATCH 17/33] With this commit, AF_XDP with 8 sockets works for multipe iperfs, but it still stops receiving traffic from time to time --- src/comm/af_xdp_user_multi_thread.cpp | 398 ++++++++++++++------------ 1 file changed, 214 insertions(+), 184 deletions(-) diff --git a/src/comm/af_xdp_user_multi_thread.cpp b/src/comm/af_xdp_user_multi_thread.cpp index 1046bc3..0d70b7d 100644 --- a/src/comm/af_xdp_user_multi_thread.cpp +++ b/src/comm/af_xdp_user_multi_thread.cpp @@ -165,6 +165,13 @@ struct bpool { struct xsk_umem *umem; }; +static bool xsk_page_aligned(void *buffer) +{ + unsigned long addr = (unsigned long)buffer; + + return !(addr & (getpagesize() - 1)); +} + static struct bpool * bpool_init(struct bpool_params *params, struct xsk_umem_config *umem_cfg) @@ -253,6 +260,8 @@ bpool_init(struct bpool_params *params, return NULL; } + printf("xsk_umem__create: size: %ld, xsk_page_aligned: %b\n", + bp->params.n_buffers * bp->params.buffer_size, xsk_page_aligned(bp->addr)); /* umem. 
*/ status = xsk_umem__create(&bp->umem, bp->addr, @@ -261,6 +270,7 @@ bpool_init(struct bpool_params *params, &bp->umem_cq, umem_cfg); if (status) { + printf("xsk_umem__create failed with status: %d\n", status); munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size); pthread_mutex_destroy(&bp->lock); free(p); @@ -381,7 +391,7 @@ bcache_cons_check(struct bcache *bc, u32 n_buffers) * (full) for a full slab from the pool, if any is available. */ pthread_mutex_lock(&bp->lock); - printf("Locking bp\n"); +// printf("Locking bp\n"); n_slabs_available = bp->n_slabs_available; if (!n_slabs_available) { printf("Unlocking bp because !n_slabs_available)\n"); @@ -393,12 +403,12 @@ bcache_cons_check(struct bcache *bc, u32 n_buffers) slab_full = bp->slabs[n_slabs_available]; bp->slabs[n_slabs_available] = bc->slab_cons; bp->n_slabs_available = n_slabs_available; - printf("Unlocking bp because traded a slab from bpool\n"); +// printf("Unlocking bp because traded a slab from bpool\n"); pthread_mutex_unlock(&bp->lock); bc->slab_cons = slab_full; bc->n_buffers_cons = n_buffers_per_slab; - printf("bc->n_buffers_cons = %ld\n", bc->n_buffers_cons); +// printf("bc->n_buffers_cons = %ld\n", bc->n_buffers_cons); return n_buffers; } @@ -427,7 +437,7 @@ bcache_prod(struct bcache *bc, u64 buffer) */ if (n_buffers_prod < n_buffers_per_slab) { bc->slab_prod[n_buffers_prod] = buffer; - printf("bcache_prod: n_buffers_prod: %ld\nn_buffers_per_slab: %ld\n", n_buffers_prod, n_buffers_per_slab); +// printf("bcache_prod: n_buffers_prod: %ld\nn_buffers_per_slab: %ld\n", n_buffers_prod, n_buffers_per_slab); bc->n_buffers_prod = n_buffers_prod + 1; return; } @@ -443,7 +453,7 @@ bcache_prod(struct bcache *bc, u64 buffer) n_slabs_available = bp->n_slabs_available; slab_empty = bp->slabs[n_slabs_available]; bp->slabs[n_slabs_available] = bc->slab_prod; - printf("bcache_prod: bp->n_slabs_available = n_slabs_available + 1;"); +// printf("bcache_prod: bp->n_slabs_available = n_slabs_available + 
1;\n"); bp->n_slabs_available = n_slabs_available + 1; pthread_mutex_unlock(&bp->lock); @@ -589,9 +599,9 @@ port_rx_burst(struct port *p, struct burst_rx *b) /* Free buffers for FQ replenish. */ n_pkts = ARRAY_SIZE(b->addr); - if (p->bc->n_buffers_cons == 0) { - printf("port_rx_burst: p->bc->n_buffers_cons == 0, need to trade slab from pool\n"); - } +// if (p->bc->n_buffers_cons == 0) { +// printf("port_rx_burst: p->bc->n_buffers_cons == 0, need to trade slab from pool\n"); +// } n_pkts = bcache_cons_check(p->bc, n_pkts); // printf("Queue: %ld ons_check got %ld packets\n", p->params.iface_queue,n_pkts); @@ -623,12 +633,14 @@ port_rx_burst(struct port *p, struct burst_rx *b) p->n_pkts_rx += n_pkts; /* UMEM FQ. */ +// u64 counter = 0; for ( ; ; ) { +// counter ++; int status; status = xsk_ring_prod__reserve(&p->umem_fq, n_pkts, &pos); if (status == n_pkts) { -// printf("Queue: %ld Fill Queue got %ld packets, breaking\n", p->params.iface_queue,n_pkts); +// printf("Queue: %ld Fill Queue got %ld packets, counter = %ld, breaking\n", counter, p->params.iface_queue,n_pkts); break; } @@ -639,8 +651,9 @@ port_rx_burst(struct port *p, struct burst_rx *b) }; poll(&pollfd, 1, 0); -// printf("Queue: %ld Fill Queue pooling for %ld packets\n", p->params.iface_queue,n_pkts); +// printf("Queue: %ld Fill Queue poll for %ld packets, counter = %ld\n", p->params.iface_queue,n_pkts, counter); } +// printf("Queue: %ld Fill Queue busy spinning, counter = %ld\n", p->params.iface_queue,n_pkts, counter); } for (i = 0; i < n_pkts; i++) @@ -648,12 +661,12 @@ port_rx_burst(struct port *p, struct burst_rx *b) bcache_cons(p->bc); xsk_ring_prod__submit(&p->umem_fq, n_pkts); - printf("Queue: %ld rx burst got %ld packets\n", p->params.iface_queue,n_pkts); +// printf("Queue: %ld rx burst got %ld packets\n", p->params.iface_queue,n_pkts); return n_pkts; } -qq + static inline void -port_tx_burst(struct port *p, struct burst_tx *b) +port_tx_burst(struct port *p, struct burst_tx *b, struct port * p2) { 
u32 n_pkts, pos, i; int status; @@ -674,16 +687,21 @@ port_tx_burst(struct port *p, struct burst_tx *b) /* TXQ. */ n_pkts = b->n_pkts; +// u64 counter = 0; for ( ; ; ) { +// counter ++; status = xsk_ring_prod__reserve(&p->txq, n_pkts, &pos); if (status == n_pkts) { -// printf("Queue: %ld TX Queue got %ld packets, breaking\n", p->params.iface_queue,n_pkts); +// printf("Queue: %ld TX Queue got %ld packets, counter = %ld, breaking\n", counter, p->params.iface_queue,n_pkts); break; } - if (xsk_ring_prod__needs_wakeup(&p->txq)) + if (xsk_ring_prod__needs_wakeup(&p->txq)) { sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); +// printf("Queue: %ld TX sendto %ld packets, counter = %ld\n", counter, p->params.iface_queue,n_pkts); + } +// printf("Queue: %ld TX busy spinning, counter = %ld\n", counter, p->params.iface_queue,n_pkts); } for (i = 0; i < n_pkts; i++) { @@ -695,7 +713,9 @@ port_tx_burst(struct port *p, struct burst_tx *b) if (xsk_ring_prod__needs_wakeup(&p->txq)) sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); p->n_pkts_tx += n_pkts; -// printf("Queue: %ld tx burst sent %ld packets\n", p->params.iface_queue, n_pkts); + if (p2->params.iface_queue != p->params.iface_queue) { + printf("TX Queue: %ld, RX Queue: %ld tx burst sent %ld packets\n", p2->params.iface_queue, p->params.iface_queue, n_pkts); + } } /* @@ -733,10 +753,10 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk /*uint64_t addr, , int* fd*/ ) { - printf(">>>>>>>>>> Begin processing packet >>>>>>>>>>\n"); +// printf(">>>>>>>>>> Begin processing packet >>>>>>>>>>\n"); if (true) { - printf("Process packets: inside if (true)\n"); +// printf("Process packets: inside if (true)\n"); /* * TODO: Parse packet here, get VNI, IP, MAC, lookup locally in DB, and replace neigbor host IP if found; * if NOT found, drop packet and remotely GET from Arion Master. 
@@ -748,16 +768,16 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk struct ethhdr *eth = (struct ethhdr *) pkt; if (ntohs(eth->h_proto) != ETH_P_IP) { - printf("Process packets: returning false for this packet as it is NOT IP %u\n", ntohs(eth->h_proto)); +// printf("Process packets: returning false for this packet as it is NOT IP %u\n", ntohs(eth->h_proto)); return false; } - printf("Packet length: %ld\n", len); - printf("Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" - "eth size: %d\n", - eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], - eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], - bpf_ntohs(eth->h_proto), - sizeof(*eth)); +// printf("Packet length: %ld\n", len); +// printf("Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" +// "eth size: %d\n", +// eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], +// eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], +// bpf_ntohs(eth->h_proto), +// sizeof(*eth)); // parse outer IP header struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); @@ -765,43 +785,43 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk outer_ip_src.s_addr = ip->saddr; struct in_addr outer_ip_dest; outer_ip_dest.s_addr = ip->daddr; - printf("Outer ip src: %s,",inet_ntoa(outer_ip_src)); - printf("ip dest: %s\n" - "Outer ip ihl: %d, version: %d\n", - inet_ntoa(outer_ip_dest), - ip->ihl, ip->version); +// printf("Outer ip src: %s,",inet_ntoa(outer_ip_src)); +// printf("ip dest: %s\n" +// "Outer ip ihl: %d, version: %d\n", +// inet_ntoa(outer_ip_dest), +// ip->ihl, ip->version); // parse UDP header struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); - printf("UDP dest: %d, UDP src: %d, == VXL_DSTPORT? 
%s\n", - udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); +// printf("UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", +// udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); // parse VXLAN header struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); - printf("VNI: %ld, \n",trn_get_vni(vxlan->vni)); +// printf("VNI: %ld, \n",trn_get_vni(vxlan->vni)); // parse inner eth header struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); - printf("inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", - inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], - inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], - inner_eth->h_proto); +// printf("inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", +// inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], +// inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], +// inner_eth->h_proto); if (ntohs(inner_eth->h_proto) == ETH_P_ARP) { // parse inner arp header arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); - struct in_addr arp_src_ip; - arp_src_ip.s_addr = arp_msg->spa; +// struct in_addr arp_src_ip; +// arp_src_ip.s_addr = arp_msg->spa; struct in_addr arp_dest_ip; arp_dest_ip.s_addr = arp_msg->tpa; - printf("arp op: %d\n", - bpf_htons(arp_msg->op)); - printf("arp source ip: %s, \n", - inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) - ); - printf("arp dest ip: %s, \n", - inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) - ); +// printf("arp op: %d\n", +// bpf_htons(arp_msg->op)); +// printf("arp source ip: %s, \n", +// inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) +// ); 
+// printf("arp dest ip: %s, \n", +// inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) +// ); endpoint_key_t epkey; epkey.vni = trn_get_vni(vxlan->vni); struct sockaddr_in ep_ip; @@ -817,15 +837,15 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk // inet_ntoa(arp_src_ip), trn_get_vni(vxlan->vni), 0); /* Modify pkt for inner ARP response */ - struct in_addr ep_ip_addr, ep_host_ip_addr; + struct in_addr ep_ip_addr/*, ep_host_ip_addr*/; ep_ip_addr.s_addr = epkey.ip; - ep_host_ip_addr.s_addr = ep_value.hip; - printf("Retrived this endpoint: HIP: %s ", inet_ntoa(ep_host_ip_addr)); - printf("IP: %s, host_mac: %x:%x:%x:%x:%x:%x, mac: %x:%x:%x:%x:%x:%x\n", - inet_ntoa(ep_ip_addr), - ep_value.hmac[0],ep_value.hmac[1],ep_value.hmac[2],ep_value.hmac[3],ep_value.hmac[4],ep_value.hmac[5], - ep_value.mac[0],ep_value.mac[1],ep_value.mac[2],ep_value.mac[3],ep_value.mac[4],ep_value.mac[5] - ); +// ep_host_ip_addr.s_addr = ep_value.hip; +// printf("Retrived this endpoint: HIP: %s ", inet_ntoa(ep_host_ip_addr)); +// printf("IP: %s, host_mac: %x:%x:%x:%x:%x:%x, mac: %x:%x:%x:%x:%x:%x\n", +// inet_ntoa(ep_ip_addr), +// ep_value.hmac[0],ep_value.hmac[1],ep_value.hmac[2],ep_value.hmac[3],ep_value.hmac[4],ep_value.hmac[5], +// ep_value.mac[0],ep_value.mac[1],ep_value.mac[2],ep_value.mac[3],ep_value.mac[4],ep_value.mac[5] +// ); arp_msg->op = bpf_htons(ARPOP_REPLY); trn_set_mac(arp_msg->tha, arp_msg->sha); trn_set_mac(arp_msg->sha, ep_value.mac); @@ -846,67 +866,67 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk * Packet modification finished, read packet content again, in order to verify the mod * */ - struct ethhdr *eth = (struct ethhdr *) pkt; - - if (ntohs(eth->h_proto) != ETH_P_IP) { -// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); - return false; - } - printf("AFTER MOD: Packet length: %ld\n", len); - printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next 
proto: 0x%x\n" - "eth size: %d\n", - eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], - eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], - bpf_ntohs(eth->h_proto), - sizeof(*eth)); - - // parse outer IP header - struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); - struct in_addr outer_ip_src; - outer_ip_src.s_addr = ip->saddr; - struct in_addr outer_ip_dest; - outer_ip_dest.s_addr = ip->daddr; - printf("AFTER MOD: Outer ip src: %s,", inet_ntoa(outer_ip_src)); - printf("ip dest: %s\n" - "AFTER MOD: Outer ip ihl: %d, version: %d\n", - inet_ntoa(outer_ip_dest), - ip->ihl, ip->version); - - // parse UDP header - struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); - printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", - udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); - - // parse VXLAN header - struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); - printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); - - // parse inner eth header - struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); - printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", - inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], - inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], - inner_eth->h_proto); - - // parse inner arp header - arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); - struct in_addr arp_src_ip; - arp_src_ip.s_addr = arp_msg->spa; - struct in_addr arp_dest_ip; - arp_dest_ip.s_addr = arp_msg->tpa; - printf("AFTER MOD: arp op: %d\n", - bpf_htons(arp_msg->op)); - printf("AFTER MOD: arp source ip: %s, \n", - inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) - ); - 
printf("AFTER MOD: arp dest ip: %s, \n", - inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) - ); +// struct ethhdr *eth = (struct ethhdr *) pkt; +// +// if (ntohs(eth->h_proto) != ETH_P_IP) { +//// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); +// return false; +// } +// printf("AFTER MOD: Packet length: %ld\n", len); +// printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" +// "eth size: %d\n", +// eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], +// eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], +// bpf_ntohs(eth->h_proto), +// sizeof(*eth)); +// +// // parse outer IP header +// struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); +// struct in_addr outer_ip_src; +// outer_ip_src.s_addr = ip->saddr; +// struct in_addr outer_ip_dest; +// outer_ip_dest.s_addr = ip->daddr; +// printf("AFTER MOD: Outer ip src: %s,", inet_ntoa(outer_ip_src)); +// printf("ip dest: %s\n" +// "AFTER MOD: Outer ip ihl: %d, version: %d\n", +// inet_ntoa(outer_ip_dest), +// ip->ihl, ip->version); +// +// // parse UDP header +// struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); +// printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", +// udp->dest, udp->source, (udp->dest==VXL_DSTPORT? 
"true" : "false")); +// +// // parse VXLAN header +// struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); +// printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); +// +// // parse inner eth header +// struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); +// printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", +// inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], +// inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], +// inner_eth->h_proto); +// +// // parse inner arp header +// arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); +// struct in_addr arp_src_ip; +// arp_src_ip.s_addr = arp_msg->spa; +// struct in_addr arp_dest_ip; +// arp_dest_ip.s_addr = arp_msg->tpa; +// printf("AFTER MOD: arp op: %d\n", +// bpf_htons(arp_msg->op)); +// printf("AFTER MOD: arp source ip: %s, \n", +// inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) +// ); +// printf("AFTER MOD: arp dest ip: %s, \n", +// inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) +// ); /* Here we sent the packet out of the receive port. Note that * we allocate one entry and schedule it. 
Your design would be * faster if you do batch processing/transmission */ - printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); +// printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); return true; } else { @@ -917,12 +937,12 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk else if (ntohs(inner_eth->h_proto) == ETH_P_IP) { // parse inner IP header struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); - struct in_addr inner_ip_src; - inner_ip_src.s_addr = inner_ip->saddr; +// struct in_addr inner_ip_src; +// inner_ip_src.s_addr = inner_ip->saddr; struct in_addr inner_ip_dest; inner_ip_dest.s_addr = inner_ip->daddr; - printf("Inner IP src: %s\n", inet_ntoa(inner_ip_src)); - printf("Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); +// printf("Inner IP src: %s\n", inet_ntoa(inner_ip_src)); +// printf("Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); endpoint_key_t epkey; epkey.vni = trn_get_vni(vxlan->vni); struct sockaddr_in ep_ip; @@ -942,15 +962,15 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk // printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", // inet_ntoa(inner_ip_dest), trn_get_vni(vxlan->vni), 0); - struct in_addr ep_ip_addr, ep_host_ip_addr; - ep_ip_addr.s_addr = epkey.ip; - ep_host_ip_addr.s_addr = ep_value.hip; - printf("Retrived this endpoint: HIP: %s,", inet_ntoa(ep_host_ip_addr)); - printf("IP: %s, host_mac: %x:%x:%x:%x:%x:%x, mac: %x:%x:%x:%x:%x:%x\n", - inet_ntoa(ep_ip_addr), - ep_value.hmac[0],ep_value.hmac[1],ep_value.hmac[2],ep_value.hmac[3],ep_value.hmac[4],ep_value.hmac[5], - ep_value.mac[0],ep_value.mac[1],ep_value.mac[2],ep_value.mac[3],ep_value.mac[4],ep_value.mac[5] - ); +// struct in_addr ep_ip_addr, ep_host_ip_addr; +// ep_ip_addr.s_addr = epkey.ip; +// ep_host_ip_addr.s_addr = ep_value.hip; +// printf("Retrived this endpoint: HIP: %s,", inet_ntoa(ep_host_ip_addr)); +// printf("IP: %s, 
host_mac: %x:%x:%x:%x:%x:%x, mac: %x:%x:%x:%x:%x:%x\n", +// inet_ntoa(ep_ip_addr), +// ep_value.hmac[0],ep_value.hmac[1],ep_value.hmac[2],ep_value.hmac[3],ep_value.hmac[4],ep_value.hmac[5], +// ep_value.mac[0],ep_value.mac[1],ep_value.mac[2],ep_value.mac[3],ep_value.mac[4],ep_value.mac[5] +// ); /* Modify inner EitherHdr, pretend it's from target */ trn_set_dst_mac(inner_eth, ep_value.mac); @@ -964,60 +984,60 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk * Packet modification finished, read packet content again, in order to verify the mod * */ - struct ethhdr *eth = (struct ethhdr *) pkt; - - if (ntohs(eth->h_proto) != ETH_P_IP) { -// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); - return false; - } - printf("AFTER MOD: Packet length: %ld\n", len); - printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" - "eth size: %d\n", - eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], - eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], - bpf_ntohs(eth->h_proto), - sizeof(*eth)); - -// parse outer IP header - struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); - struct in_addr outer_ip_src; - outer_ip_src.s_addr = ip->saddr; - struct in_addr outer_ip_dest; - outer_ip_dest.s_addr = ip->daddr; - printf("AFTER MOD: Outer ip src: %s", inet_ntoa(outer_ip_src)); - printf("ip dest: %s\n" - "AFTER MOD: Outer ip ihl: %d, version: %d\n", - inet_ntoa(outer_ip_dest), - ip->ihl, ip->version); - - // parse UDP header - struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); - printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", - udp->dest, udp->source, (udp->dest==VXL_DSTPORT? 
"true" : "false")); - - // parse VXLAN header - struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); - printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); - - // parse inner eth header - struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); - printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", - inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], - inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], - inner_eth->h_proto); - - // parse inner IP header - struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); - struct in_addr inner_ip_src, inner_ip_dest; - inner_ip_src.s_addr = inner_ip->saddr; - inner_ip_dest.s_addr = inner_ip->daddr; - printf("AFTER MOD: Inner IP src: %s\n", inet_ntoa(inner_ip_src)); - printf("AFTER MOD: Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); +// struct ethhdr *eth = (struct ethhdr *) pkt; +// +// if (ntohs(eth->h_proto) != ETH_P_IP) { +//// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); +// return false; +// } +// printf("AFTER MOD: Packet length: %ld\n", len); +// printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" +// "eth size: %d\n", +// eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], +// eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], +// bpf_ntohs(eth->h_proto), +// sizeof(*eth)); +// +//// parse outer IP header +// struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); +// struct in_addr outer_ip_src; +// outer_ip_src.s_addr = ip->saddr; +// struct in_addr outer_ip_dest; +// outer_ip_dest.s_addr = ip->daddr; +// printf("AFTER MOD: Outer ip src: %s", 
inet_ntoa(outer_ip_src)); +// printf("ip dest: %s\n" +// "AFTER MOD: Outer ip ihl: %d, version: %d\n", +// inet_ntoa(outer_ip_dest), +// ip->ihl, ip->version); +// +// // parse UDP header +// struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); +// printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", +// udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); +// +// // parse VXLAN header +// struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); +// printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); +// +// // parse inner eth header +// struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); +// printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", +// inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], +// inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], +// inner_eth->h_proto); +// +// // parse inner IP header +// struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); +// struct in_addr inner_ip_src, inner_ip_dest; +// inner_ip_src.s_addr = inner_ip->saddr; +// inner_ip_dest.s_addr = inner_ip->daddr; +// printf("AFTER MOD: Inner IP src: %s\n", inet_ntoa(inner_ip_src)); +// printf("AFTER MOD: Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); /* Here we sent the packet out of the receive port. Note that * we allocate one entry and schedule it. 
Your design would be * faster if you do batch processing/transmission */ - printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); +// printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); return true; } else { @@ -1045,16 +1065,22 @@ thread_func(void *arg) CPU_ZERO(&cpu_cores); CPU_SET(t->cpu_core_id, &cpu_cores); pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_cores); - for (int i = 0 ; i < t->n_ports_rx ; i ++) { - struct port * port_rx = t->ports_rx[i]; + for (int j = 0 ; j < t->n_ports_rx ; j ++) { + struct port * port_rx = t->ports_rx[j]; + printf("port: %ld, tx queue needs wake up: %ld, fill queue needs wake up :%ld\n", + port_rx->params.iface_queue, xsk_ring_prod__needs_wakeup(&port_rx->txq) , xsk_ring_prod__needs_wakeup(&port_rx->umem_fq)); if (port_rx->bc->n_buffers_cons == 0) { port_rx->bc->n_buffers_cons = 4096; printf("Manually setting port %d n_buffer_cons to 4096\n", port_rx->params.iface_queue); } } + + for (i = 0; !t->quit; i = (i + 1) & (t->n_ports_rx - 1)) { struct port *port_rx = t->ports_rx[i]; struct port *port_tx = t->ports_tx[i]; +// printf("Thread: %ld, port rx: %ld, port tx: %ld\n", +// t->cpu_core_id, port_rx->params.iface_queue, port_tx->params.iface_queue); struct burst_rx *brx = &t->burst_rx; struct burst_tx *btx = &t->burst_tx[i]; u32 n_pkts, j; @@ -1068,10 +1094,10 @@ thread_func(void *arg) /* Process & TX. 
*/ for (j = 0; j < n_pkts; j++) { - printf("Queue %ld getting the %ld th packet\n", port_rx->params.iface_queue, j); +// printf("Queue %ld getting the %ld th packet\n", port_rx->params.iface_queue, j); u64 addr = xsk_umem__add_offset_to_addr(brx->addr[j]); u8 *pkt = static_cast(xsk_umem__get_data(port_rx->params.bp->addr, addr)); - printf("Queue %ld processing the %ld th packet\n", port_rx->params.iface_queue, j); +// printf("Queue %ld processing the %ld th packet\n", port_rx->params.iface_queue, j); process_packet(pkt, brx->len[j]); // swap_mac_addresses(pkt); @@ -1079,10 +1105,11 @@ thread_func(void *arg) btx->addr[btx->n_pkts] = brx->addr[j]; btx->len[btx->n_pkts] = brx->len[j]; btx->n_pkts++; - if (btx->n_pkts > 0/*== MAX_BURST_TX*/) { - port_tx_burst(port_tx, btx); - btx->n_pkts = 0; - } + + } + if (btx->n_pkts > 0/*== MAX_BURST_TX*/) { + port_tx_burst(port_tx, btx, port_rx); + btx->n_pkts = 0; } } @@ -1381,6 +1408,7 @@ void* af_xdp_user_multi_thread::run_af_xdp_multi_threaded(void* args/*int argc, sizeof(struct bpool_params)); memcpy(&umem_cfg, &umem_cfg_default, sizeof(struct xsk_umem_config)); + umem_cfg.flags |= (XDP_RING_NEED_WAKEUP/*XDP_USE_NEED_WAKEUP*/ ); for (i = 0; i < MAX_PORTS; i++) memcpy(&port_params[i], &port_params_default, sizeof(struct port_params)); @@ -1442,6 +1470,8 @@ void* af_xdp_user_multi_thread::run_af_xdp_multi_threaded(void* args/*int argc, t->ports_rx[j] = ports[i * n_ports_per_thread + j]; t->ports_tx[j] = ports[i * n_ports_per_thread + (j + 1) % n_ports_per_thread]; +// printf("Thread: %ld has rx port: %ld, tx port: %ld\n", +// i, t->ports_rx[j]->params.iface_queue, t->ports_tx[j]->params.iface_queue); } t->n_ports_rx = n_ports_per_thread; From 13b32507c1deda2cec22019c401305452247be88 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 30 Dec 2022 16:20:20 -0800 Subject: [PATCH 18/33] Fixed the cached drained problem, by setting the UMEM Fill Queue and Completion Queue to have the same size --- src/comm/af_xdp_user_multi_thread.cpp 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/comm/af_xdp_user_multi_thread.cpp b/src/comm/af_xdp_user_multi_thread.cpp index 0d70b7d..8f3180c 100644 --- a/src/comm/af_xdp_user_multi_thread.cpp +++ b/src/comm/af_xdp_user_multi_thread.cpp @@ -1129,7 +1129,7 @@ static const struct bpool_params bpool_params_default = { }; static const struct xsk_umem_config umem_cfg_default = { - .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, //* 2, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, From 357aabb9276d6e7f756c66a465830327fd115fc9 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 13 Jan 2023 16:23:22 -0800 Subject: [PATCH 19/33] Added logic to save 8 cores for the machine and use the rest for AF_XDP sockets. --- include/af_xdp_user.h | 33 - include/db_client.h | 22 +- src/CMakeLists.txt | 3 +- src/comm/af_xdp_user.cpp | 1130 ------------------------- src/comm/af_xdp_user_multi_thread.cpp | 56 +- src/main.cpp | 1 - 6 files changed, 46 insertions(+), 1199 deletions(-) delete mode 100644 include/af_xdp_user.h delete mode 100644 src/comm/af_xdp_user.cpp diff --git a/include/af_xdp_user.h b/include/af_xdp_user.h deleted file mode 100644 index 5c94266..0000000 --- a/include/af_xdp_user.h +++ /dev/null @@ -1,33 +0,0 @@ -// -// Created by ubuntu on 10/4/22. 
-// - -#ifndef ARIONAGENT_AF_XDP_USER_H -#define ARIONAGENT_AF_XDP_USER_H - -#include "logger.h" -#include -#include -#include -#include -#ifdef __cplusplus -extern "C" -{ -#include "common_params.h" -#include "common_user_bpf_xdp.h" -#include "common_libbpf.h" -} -#endif -static const char *__doc__ = "AF_XDP kernel bypass example\n"; - -class af_xdp_user { -public: - af_xdp_user() { - printf("%s", "Start of af_xdp userspace program."); - } - void run_af_xdp(/*std::string table_name_neighbor_ebpf_map*/); -private: - -}; - -#endif //ARIONAGENT_AF_XDP_USER_H diff --git a/include/db_client.h b/include/db_client.h index 65c55a8..0661942 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -105,10 +105,10 @@ class db_client { auto host_ip = get<4>(row).c_str(); auto vpc_mac = get<3>(row).c_str(); auto host_mac = get<2>(row).c_str(); - printf("Retrieved this endpoint from local DB: VNI: %ld, vpc_ip: %s, host_mac: %s, vpc_mac: %s, host_ip: %s\n", -// get<0>(row), get<1>(row).c_str(), get<2>(row).c_str(), get<3>(row).c_str(), get<4>(row).c_str() - vni, vpc_ip, host_mac, vpc_mac, host_ip - ); +// printf("Retrieved this endpoint from local DB: VNI: %ld, vpc_ip: %s, host_mac: %s, vpc_mac: %s, host_ip: %s\n", +//// get<0>(row), get<1>(row).c_str(), get<2>(row).c_str(), get<3>(row).c_str(), get<4>(row).c_str() +// vni, vpc_ip, host_mac, vpc_mac, host_ip +// ); endpoint_key_t key; key.vni = vni; //(get<0>(row)); struct sockaddr_in endpoint_vpc_ip_socket; @@ -127,13 +127,13 @@ class db_client { value.hip = endpoint_host_ip_socket.sin_addr.s_addr; endpoint_cache[key] = value; // endpoint_cache.insert(key, value); - printf("Inserted this endpoint into cache: VNI: %ld, vpc_ip: %s, ", key.vni, inet_ntoa(endpoint_vpc_ip_socket.sin_addr)); - printf("host_mac: %x:%x:%x:%x:%x:%x, vpc_mac: %x:%x:%x:%x:%x:%x, host_ip: %s\n", - value.hmac[0],value.hmac[1],value.hmac[2],value.hmac[3],value.hmac[4],value.hmac[5], - 
value.mac[0],value.mac[1],value.mac[2],value.mac[3],value.mac[4],value.mac[5], - inet_ntoa(endpoint_host_ip_socket.sin_addr) - ); - printf("Finished one endpoint\n"); +// printf("Inserted this endpoint into cache: VNI: %ld, vpc_ip: %s, ", key.vni, inet_ntoa(endpoint_vpc_ip_socket.sin_addr)); +// printf("host_mac: %x:%x:%x:%x:%x:%x, vpc_mac: %x:%x:%x:%x:%x:%x, host_ip: %s\n", +// value.hmac[0],value.hmac[1],value.hmac[2],value.hmac[3],value.hmac[4],value.hmac[5], +// value.mac[0],value.mac[1],value.mac[2],value.mac[3],value.mac[4],value.mac[5], +// inet_ntoa(endpoint_host_ip_socket.sin_addr) +// ); +// printf("Finished one endpoint\n"); } printf("Finished retrieving from local DB, not endpoint cache has %ld endpoints\n", endpoint_cache.size()); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9a42505..0a36db2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,8 +3,7 @@ set(EXECUTABLE_OUTPUT_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../build/bin) set(SOURCES ./util/dispatch_queue.cpp ./comm/grpc_client.cpp - comm/af_xdp_user.cpp -# db/db_client.cpp + # db/db_client.cpp comm/af_xdp_user_multi_thread.cpp ) #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -idirafter /usr/src/linux-headers-5.10.4/include/") diff --git a/src/comm/af_xdp_user.cpp b/src/comm/af_xdp_user.cpp deleted file mode 100644 index 534191d..0000000 --- a/src/comm/af_xdp_user.cpp +++ /dev/null @@ -1,1130 +0,0 @@ -// -// Created by ubuntu on 10/4/22. 
-// -#include -#include -#include -#include -#include -#include -#include -//#include -#include -#include -#include -//#include -#include "af_xdp_user.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "xdp/trn_datamodel.h" -#include "util.h" -//#include "xdp/trn_kern.h" -#include "marl/defer.h" -#include "marl/event.h" -#include "marl/scheduler.h" -#include "marl/waitgroup.h" - -#define NUM_FRAMES 40960//4096 -#define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE -#define RX_BATCH_SIZE 64 -#define INVALID_UMEM_FRAME UINT64_MAX -#define MSG_DONTWAIT = 0x40 -#define VXL_DSTPORT 0xb512 // UDP dport 4789(0x12b5) for VxLAN overlay -/* ARP protocol opcodes. */ -#define ARPOP_REQUEST 1 /* ARP request */ -#define ARPOP_REPLY 2 /* ARP reply */ -#ifndef PATH_MAX - -#define PATH_MAX 4096 -#endif - -struct vxlanhdr_internal { - /* Big endian! */ - __u8 rsvd1 : 3; - __u8 i_flag : 1; - __u8 rsvd2 : 4; - __u8 rsvd3[3]; - __u8 vni[3]; - __u8 rsvd4; -}; - - -///* -// * This structure defines an ethernet arp header. -// */ -// -//struct arphdr { -// __be16 ar_hrd; /* format of hardware address */ -// __be16 ar_pro; /* format of protocol address */ -// unsigned char ar_hln; /* length of hardware address */ -// unsigned char ar_pln; /* length of protocol address */ -// __be16 ar_op; /* ARP opcode (command) */ -// -//#if 0 -// /* -// * Ethernet looks like this : This bit is variable sized however... 
-// */ -// unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ -// unsigned char ar_sip[4]; /* sender IP address */ -// unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ -// unsigned char ar_tip[4]; /* target IP address */ -//#endif -// -//}; - -struct arp_message { - uint16_t hrd; - uint16_t pro; - uint8_t hln; - uint8_t pln; - uint16_t op; - uint8_t sha[6]; - uint32_t spa; - uint8_t tha[6]; - uint32_t tpa; -} __attribute__((__packed__)); - -struct xsk_umem_info { - struct xsk_ring_prod fq; - struct xsk_ring_cons cq; - struct xsk_umem *umem; - void *buffer; -}; - -struct stats_record { - uint64_t timestamp; - uint64_t rx_packets; - uint64_t rx_bytes; - uint64_t tx_packets; - uint64_t tx_bytes; -}; - -struct xsk_socket_info { - struct xsk_ring_cons rx; - struct xsk_ring_prod tx; - struct xsk_umem_info *umem; - struct xsk_socket *xsk; - - uint64_t umem_frame_addr[NUM_FRAMES]; - uint32_t umem_frame_free; - - uint32_t outstanding_tx; - - struct stats_record stats; - struct stats_record prev_stats; -}; - -static uint64_t xsk_alloc_umem_frame(struct xsk_socket_info *xsk) -{ - - uint64_t frame; - if (xsk->umem_frame_free == 0) - return INVALID_UMEM_FRAME; - - frame = xsk->umem_frame_addr[--xsk->umem_frame_free]; - xsk->umem_frame_addr[xsk->umem_frame_free] = INVALID_UMEM_FRAME; - return frame; -} - -static struct xsk_socket_info *xsk_configure_socket(struct config *cfg, - struct xsk_umem_info *umem) -{ - struct xsk_socket_config xsk_cfg; - struct xsk_socket_info *xsk_info; - uint32_t idx; - /* TODO: Fill in the prog_id of the 'transit' xdp program - otherwise, the xsk_socket__create will create a map with the name 'xsk_map' - */ - uint32_t prog_id = 0; - int i; - int ret; - - xsk_info = static_cast(calloc(1, sizeof(*xsk_info))); - if (!xsk_info) - return static_cast(nullptr); - - xsk_info->umem = umem; - xsk_cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; - xsk_cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - xsk_cfg.libbpf_flags = 0; - 
xsk_cfg.xdp_flags = cfg->xdp_flags; - xsk_cfg.bind_flags = cfg->xsk_bind_flags; - if (!umem->umem) { - printf("%s\n", "umem is empty!"); - } - if (!(&xsk_info->xsk)) { - printf("%s\n", "xsk_ptr is empty!"); - } - if (!(&xsk_info->tx)) { - printf("%s\n", "tx is empty!"); - } - if (!(&xsk_info->rx)) { - printf("%s\n", "rx is empty!"); - } - ret = xsk_socket__create(&xsk_info->xsk, cfg->ifname, - cfg->xsk_if_queue, umem->umem, &xsk_info->rx, - &xsk_info->tx, &xsk_cfg); - - if (ret) { - printf("xsk_socket__create failed with ret: [%ld]\n", ret); - goto error_exit; - } - - ret = bpf_get_link_xdp_id(cfg->ifindex, &prog_id, cfg->xdp_flags); - if (ret) { - printf("bpf_get_link_xdp_id failed\n"); - goto error_exit; - } - - /* Initialize umem frame allocation */ - - for (i = 0; i < NUM_FRAMES; i++) - xsk_info->umem_frame_addr[i] = i * FRAME_SIZE; - - xsk_info->umem_frame_free = NUM_FRAMES; - - /* Stuff the receive path with buffers, we assume we have enough */ - ret = xsk_ring_prod__reserve(&xsk_info->umem->fq, - XSK_RING_PROD__DEFAULT_NUM_DESCS, - &idx); - - if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) { - printf("xsk_ring_prod__reserve failed\n"); - goto error_exit; - } - - for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i ++) - *xsk_ring_prod__fill_addr(&xsk_info->umem->fq, idx++) = - xsk_alloc_umem_frame(xsk_info); - - xsk_ring_prod__submit(&xsk_info->umem->fq, - XSK_RING_PROD__DEFAULT_NUM_DESCS); - - return xsk_info; - -error_exit: - errno = -ret; - return static_cast(nullptr); -} - -static const struct option_wrapper long_options[] = { - - {{"help", no_argument, nullptr, 'h' }, - "Show help", "",false}, - - {{"dev", required_argument, nullptr, 'd' }, - "Operate on device ", "", true}, - - {{"skb-mode", no_argument, nullptr, 'S' }, - "Install XDP program in SKB (AKA generic) mode"}, - - {{"native-mode", no_argument, nullptr, 'N' }, - "Install XDP program in native mode"}, - - {{"auto-mode", no_argument, nullptr, 'A' }, - "Auto-detect SKB or native mode"}, - - 
{{"force", no_argument, nullptr, 'F' }, - "Force install, replacing existing program on interface"}, - - {{"copy", no_argument, nullptr, 'c' }, - "Force copy mode"}, - - {{"zero-copy", no_argument, nullptr, 'z' }, - "Force zero-copy mode"}, - - {{"queue", required_argument, nullptr, 'Q' }, - "Configure interface receive queue for AF_XDP, default=0"}, - - {{"poll-mode", no_argument, nullptr, 'p' }, - "Use the poll() API waiting for packets to arrive"}, - - {{"unload", no_argument, nullptr, 'U' }, - "Unload XDP program instead of loading"}, - - {{"quiet", no_argument, nullptr, 'q' }, - "Quiet mode (no output)"}, - - {{"filename", required_argument, nullptr, 1 }, - "Load program from ", ""}, - - {{"progsec", required_argument, nullptr, 2 }, - "Load program in
of the ELF file", "
"}, - - {{0, 0, nullptr, 0 }, nullptr, "",false} -}; - -static bool global_exit; - -static uint64_t xsk_umem_free_frames(struct xsk_socket_info *xsk) -{ - return xsk->umem_frame_free; -} - -static void xsk_free_umem_frame(struct xsk_socket_info *xsk, uint64_t frame) -{ - assert(xsk->umem_frame_free < NUM_FRAMES); - - xsk->umem_frame_addr[xsk->umem_frame_free++] = frame; -} - -static void complete_tx(struct xsk_socket_info *xsk) -{ - unsigned int completed; - uint32_t idx_cq; - - if (!xsk->outstanding_tx) - return; - - sendto(xsk_socket__fd(xsk->xsk), NULL, 0, 0X40/*MSG_DONTWAIT*/, NULL, 0); - - - /* Collect/free completed TX buffers */ - completed = xsk_ring_cons__peek(&xsk->umem->cq, - XSK_RING_CONS__DEFAULT_NUM_DESCS, - &idx_cq); - - if (completed > 0) { - for (int i = 0; i < completed; i++) - xsk_free_umem_frame(xsk, - *xsk_ring_cons__comp_addr(&xsk->umem->cq, - idx_cq++)); - - xsk_ring_cons__release(&xsk->umem->cq, completed); - xsk->outstanding_tx -= completed < xsk->outstanding_tx ? - completed : xsk->outstanding_tx; - } -} - - - -static bool process_packet(struct xsk_socket_info *xsk, - uint64_t addr, uint32_t len, int* fd) -{ -// printf(">>>>>>>>>> Begin processing packet >>>>>>>>>>\n"); - uint8_t *pkt = static_cast(xsk_umem__get_data(xsk->umem->buffer, addr)); - - - if (true) { - /* - * TODO: Parse packet here, get VNI, IP, MAC, lookup locally in DB, and replace neigbor host IP if found; - * if NOT found, drop packet and remotely GET from Arion Master. 
- * */ - int ret; - uint32_t tx_idx = 0; - uint8_t tmp_mac[ETH_ALEN]; - // parse outer eth header - struct ethhdr *eth = (struct ethhdr *) pkt; - - if (ntohs(eth->h_proto) != ETH_P_IP) { -// printf("%s\n", "returning false for this packet as it is NOT IP"); - return false; - } -// printf("Packet length: %ld\n", len); -// printf("Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" -// "eth size: %d\n", -// eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], -// eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], -// bpf_ntohs(eth->h_proto), -// sizeof(*eth)); - - // parse outer IP header - struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); - struct in_addr outer_ip_src; - outer_ip_src.s_addr = ip->saddr; - struct in_addr outer_ip_dest; - outer_ip_dest.s_addr = ip->daddr; -// printf("Outer ip src: %s, ip dest: %s\n" -// "Outer ip ihl: %d, version: %d\n", -// inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), -// ip->ihl, ip->version); - - // parse UDP header - struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); - printf("UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", - udp->dest, udp->source, (udp->dest==VXL_DSTPORT? 
"true" : "false")); - - // parse VXLAN header - struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); -// printf("VNI: %ld, \n",trn_get_vni(vxlan->vni)); - - // parse inner eth header - struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); -// printf("inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", -// inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], -// inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], -// inner_eth->h_proto); - - if (ntohs(inner_eth->h_proto) == ETH_P_ARP) { - // parse inner arp header - arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); - struct in_addr arp_src_ip; - arp_src_ip.s_addr = arp_msg->spa; - struct in_addr arp_dest_ip; - arp_dest_ip.s_addr = arp_msg->tpa; -// printf("arp op: %d\n", -// bpf_htons(arp_msg->op)); -// printf("arp source ip: %s, \n", -// inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) -// ); -// printf("arp dest ip: %s, \n", -// inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) -// ); - endpoint_key_t epkey; - epkey.vni = trn_get_vni(vxlan->vni); - struct sockaddr_in ep_ip; - inet_pton(AF_INET, inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); - epkey.ip = ep_ip.sin_addr.s_addr; - auto ep_value = db_client::get_instance().GetNeighborInMemory(epkey); -// endpoint_t ep_value; -// ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(arp_dest_ip)); - if (ep_value.hip != 0) { - // we now have key and value, can modify the packet and update the map now. 
-// int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); -// printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", -// inet_ntoa(arp_src_ip), trn_get_vni(vxlan->vni), 0); - - /* Modify pkt for inner ARP response */ -// struct in_addr ep_ip_addr, ep_host_ip_addr; -// ep_ip_addr.s_addr = epkey.ip; -// ep_host_ip_addr.s_addr = ep_value.hip; -// printf("Retrived this endpoint: HIP: %s, IP: %s, host_mac: %x:%x:%x:%x:%x:%x, mac: %x:%x:%x:%x:%x:%x\n", -// inet_ntoa(ep_host_ip_addr), inet_ntoa(ep_ip_addr), -// ep_value.hmac[0],ep_value.hmac[1],ep_value.hmac[2],ep_value.hmac[3],ep_value.hmac[4],ep_value.hmac[5], -// ep_value.mac[0],ep_value.mac[1],ep_value.mac[2],ep_value.mac[3],ep_value.mac[4],ep_value.mac[5] -// ); - arp_msg->op = bpf_htons(ARPOP_REPLY); - trn_set_mac(arp_msg->tha, arp_msg->sha); - trn_set_mac(arp_msg->sha, ep_value.mac); - - __u32 tmp_ip = arp_msg->spa;//*sip; - arp_msg->spa = arp_msg->tpa;//*tip; - arp_msg->tpa = tmp_ip; - - /* Modify inner EitherHdr, pretend it's from target */ - trn_set_dst_mac(inner_eth, inner_eth->h_source); - trn_set_src_mac(inner_eth, ep_value.mac); - - /* Keep overlay header, swap outer IP header */ - trn_set_src_dst_ip_csum(ip, ip->daddr, ip->saddr, (eth + len)); - trn_swap_src_dst_mac(pkt); - - /* - * Packet modification finished, read packet content again, in order to verify the mod - * */ - -// struct ethhdr *eth = (struct ethhdr *) pkt; -// -// if (ntohs(eth->h_proto) != ETH_P_IP) { -//// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); -// return false; -// } -// printf("AFTER MOD: Packet length: %ld\n", len); -// printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" -// "eth size: %d\n", -// eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], -// eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], -// 
bpf_ntohs(eth->h_proto), -// sizeof(*eth)); -// -// // parse outer IP header -// struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); -// struct in_addr outer_ip_src; -// outer_ip_src.s_addr = ip->saddr; -// struct in_addr outer_ip_dest; -// outer_ip_dest.s_addr = ip->daddr; -// printf("AFTER MOD: Outer ip src: %s, ip dest: %s\n" -// "AFTER MOD: Outer ip ihl: %d, version: %d\n", -// inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), -// ip->ihl, ip->version); -// -// // parse UDP header -// struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); -// printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", -// udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); -// -// // parse VXLAN header -// struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); -// printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); -// -// // parse inner eth header -// struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); -// printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", -// inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], -// inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], -// inner_eth->h_proto); -// -// // parse inner arp header -// arp_message *arp_msg = (struct arp_message *)(inner_eth + 1); -// struct in_addr arp_src_ip; -// arp_src_ip.s_addr = arp_msg->spa; -// struct in_addr arp_dest_ip; -// arp_dest_ip.s_addr = arp_msg->tpa; -// printf("AFTER MOD: arp op: %d\n", -// bpf_htons(arp_msg->op)); -// printf("AFTER MOD: arp source ip: %s, \n", -// inet_ntoa(arp_src_ip/*inner_arp_dest_ip*/) -// ); -// printf("AFTER MOD: arp dest ip: %s, \n", -// inet_ntoa(arp_dest_ip/*inner_arp_dest_ip*/) -// ); - /* Here we sent the packet out of the receive port. 
Note that - * we allocate one entry and schedule it. Your design would be - * faster if you do batch processing/transmission */ - - ret = xsk_ring_prod__reserve(&xsk->tx, 1, &tx_idx); - if (ret != 1) { - /* No more transmit slots, drop the packet */ - return false; - } - - xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = addr; - xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = len; - xsk_ring_prod__submit(&xsk->tx, 1); - xsk->outstanding_tx++; - - xsk->stats.tx_bytes += len; - xsk->stats.tx_packets++; -// printf("Packet sent via tx queue\n"); -// printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); - - return true; - } else { - printf("Can't find endpoint!\n"); - return false; - } - }else if (ntohs(inner_eth->h_proto) == ETH_P_IP) { - // TODO: Add inner IP support, refer to trn_process_inner_ip - // parse inner IP header - struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); -// struct in_addr inner_ip_src; -// inner_ip_src.s_addr = inner_ip->saddr; - struct in_addr inner_ip_dest; - inner_ip_dest.s_addr = inner_ip->daddr; -// printf("Inner IP src: %s\n", inet_ntoa(inner_ip_src)); -// printf("Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); - endpoint_key_t epkey; - epkey.vni = trn_get_vni(vxlan->vni); - struct sockaddr_in ep_ip; - inet_pton(AF_INET, inet_ntoa(inner_ip_dest/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); - epkey.ip = ep_ip.sin_addr.s_addr; - auto ep_value = db_client::get_instance().GetNeighborInMemory(epkey); -// endpoint_t ep_value; -// ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(inner_ip_dest)); - - if (ep_value.hip != 0) { -// epkey.vni = trn_get_vni(vxlan->vni); -// struct sockaddr_in ep_ip; -// inet_pton(AF_INET, inet_ntoa(inner_ip_dest/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); -// epkey.ip = ep_ip.sin_addr.s_addr; - // we now have key and value, can modify the packet and update the map now. 
-// int ebpf_rc = bpf_map_update_elem((*fd), &epkey, &ep_value, BPF_ANY); -// printf("AF_XDP: Inserted this neighbor into map: vip: %s, vni: %d, ebpf_rc: %d\n", -// inet_ntoa(inner_ip_dest), trn_get_vni(vxlan->vni), 0); - -// struct in_addr ep_ip_addr, ep_host_ip_addr; -// ep_ip_addr.s_addr = epkey.ip; -// ep_host_ip_addr.s_addr = ep_value.hip; -// printf("Retrived this endpoint: HIP: %s, IP: %s, host_mac: %x:%x:%x:%x:%x:%x, mac: %x:%x:%x:%x:%x:%x\n", -// inet_ntoa(ep_host_ip_addr), inet_ntoa(ep_ip_addr), -// ep_value.hmac[0],ep_value.hmac[1],ep_value.hmac[2],ep_value.hmac[3],ep_value.hmac[4],ep_value.hmac[5], -// ep_value.mac[0],ep_value.mac[1],ep_value.mac[2],ep_value.mac[3],ep_value.mac[4],ep_value.mac[5] -// ); - - /* Modify inner EitherHdr, pretend it's from target */ - trn_set_dst_mac(inner_eth, ep_value.mac); - - /* Keep overlay header, update outer header destinations */ - trn_set_src_dst_ip_csum(ip, ip->daddr, ep_value.hip, (eth + len)); - trn_set_src_mac(eth, eth->h_dest); - trn_set_dst_mac(eth, ep_value.hmac); - - /* - * Packet modification finished, read packet content again, in order to verify the mod - * */ - -// struct ethhdr *eth = (struct ethhdr *) pkt; -// -// if (ntohs(eth->h_proto) != ETH_P_IP) { -//// printf("%s\n", "AFTER MOD: returning false for this packet as it is NOT IP"); -// return false; -// } -// printf("AFTER MOD: Packet length: %ld\n", len); -// printf("AFTER MOD: Outer eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n" -// "eth size: %d\n", -// eth->h_source[0],eth->h_source[1],eth->h_source[2],eth->h_source[3],eth->h_source[4],eth->h_source[5], -// eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],eth->h_dest[5], -// bpf_ntohs(eth->h_proto), -// sizeof(*eth)); -// -// // parse outer IP header -// struct iphdr *ip = (struct iphdr *) (eth + 1/*sizeof(*eth)*/); -// struct in_addr outer_ip_src; -// outer_ip_src.s_addr = ip->saddr; -// struct in_addr outer_ip_dest; -// outer_ip_dest.s_addr 
= ip->daddr; -// printf("AFTER MOD: Outer ip src: %s, ip dest: %s\n" -// "AFTER MOD: Outer ip ihl: %d, version: %d\n", -// inet_ntoa(outer_ip_src),inet_ntoa(outer_ip_dest), -// ip->ihl, ip->version); -// -// // parse UDP header -// struct udphdr *udp = (struct udphdr *) (ip + 1/*sizeof(*ip)*/); -// printf("AFTER MOD: UDP dest: %d, UDP src: %d, == VXL_DSTPORT? %s\n", -// udp->dest, udp->source, (udp->dest==VXL_DSTPORT? "true" : "false")); -// -// // parse VXLAN header -// struct vxlanhdr_internal* vxlan = (struct vxlanhdr_internal *)(udp + 1/*sizeof(*udp)*/); -// printf("AFTER MOD: VNI: %ld, \n",trn_get_vni(vxlan->vni)); -// -// // parse inner eth header -// struct ethhdr *inner_eth = (struct ethhdr *)(vxlan + 1/*sizeof(*vxlan)*/); -// printf("AFTER MOD: inner eth src: %x:%x:%x:%x:%x:%x, dest: %x:%x:%x:%x:%x:%x; next proto: 0x%x\n", -// inner_eth->h_source[0],inner_eth->h_source[1],inner_eth->h_source[2],inner_eth->h_source[3],inner_eth->h_source[4],inner_eth->h_source[5], -// inner_eth->h_dest[0],inner_eth->h_dest[1],inner_eth->h_dest[2],inner_eth->h_dest[3],inner_eth->h_dest[4],inner_eth->h_dest[5], -// inner_eth->h_proto); -// -// // parse inner IP header -// struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); -// struct in_addr inner_ip_src, inner_ip_dest; -// inner_ip_src.s_addr = inner_ip->saddr; -// inner_ip_dest.s_addr = inner_ip->daddr; -// printf("AFTER MOD: Inner IP src: %s\n", inet_ntoa(inner_ip_src)); -// printf("AFTER MOD: Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); - /* Here we sent the packet out of the receive port. Note that - * we allocate one entry and schedule it. 
Your design would be - * faster if you do batch processing/transmission */ - - ret = xsk_ring_prod__reserve(&xsk->tx, 1, &tx_idx); - if (ret != 1) { - /* No more transmit slots, drop the packet */ - return false; - } - - xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = addr; - xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = len; - xsk_ring_prod__submit(&xsk->tx, 1); - xsk->outstanding_tx++; - - xsk->stats.tx_bytes += len; - xsk->stats.tx_packets++; -// printf("Packet sent via tx queue\n"); -// printf("<<<<<<<<<< Finished processing packet <<<<<<<<<<\n"); - - return true; - } else { - printf("Can't find endpoint!\n"); - return false; - } - } - -// printf("Endpoing hip == 0, returning false.\n"); - return false; - } - - return false; -} - - -static void handle_receive_packets(struct xsk_socket_info *xsk, int* fd, atomic* processed_packet_count) -{ - unsigned int rcvd, stock_frames, i; - uint32_t idx_rx = 0, idx_fq = 0; - int ret; - - rcvd = xsk_ring_cons__peek(&xsk->rx, RX_BATCH_SIZE, &idx_rx); - if (!rcvd) - return; - - /* Stuff the ring with as much frames as possible */ - stock_frames = xsk_prod_nb_free(&xsk->umem->fq, - xsk_umem_free_frames(xsk)); - - if (stock_frames > 0) { - - ret = xsk_ring_prod__reserve(&xsk->umem->fq, stock_frames, - &idx_fq); - - /* This should not happen, but just in case */ - while (ret != stock_frames) - ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, - &idx_fq); - - for (i = 0; i < stock_frames; i++) - *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = - xsk_alloc_umem_frame(xsk); - - xsk_ring_prod__submit(&xsk->umem->fq, stock_frames); - } - - /* Process received packets */ -// printf("Received %d packets\n", rcvd); - for (i = 0; i < rcvd; i++) { - uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; - uint32_t len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; - - if (!process_packet(xsk, addr, len, fd)) - xsk_free_umem_frame(xsk, addr); - - xsk->stats.rx_bytes += len; - processed_packet_count->fetch_add(1); - } - 
- xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->stats.rx_packets += rcvd; - - /* Do we need to wake up the kernel for transmission */ - complete_tx(xsk); - // printf("tx completed\n"); -} - -static void rx_and_process(struct config *cfg, - struct xsk_socket_info *xsk_socket, int* fd) -{ - struct pollfd fds[2]; - int ret, nfds = 1; - - memset(fds, 0, sizeof(fds)); - fds[0].fd = xsk_socket__fd(xsk_socket->xsk); - fds[0].events = POLLIN; - printf("%s\n", "Entering while loop to process packets."); - atomic processed_packet_count(0); - std::thread t( - [&] { - int ten_seconds = (10 * 1000 * 1000); - while (true){ - usleep(ten_seconds); - auto current_count = processed_packet_count.load(); - printf("Ten seconds passed, processed packet count: %ld\n", - current_count); - - } - } - ); - t.detach(); - while(!global_exit) { - if (cfg->xsk_poll_mode) { - ret = poll(fds, nfds, -1); - if (ret <= 0 || ret > 1) - continue; - } - handle_receive_packets(xsk_socket, fd, &processed_packet_count); - } -} - -static void exit_application(int signal) -{ - signal = signal; - global_exit = true; -} - -static struct xsk_umem_info *configure_xsk_umem(void *buffer, uint64_t size) -{ - struct xsk_umem_info *umem; - int ret; - - umem = static_cast(calloc(1, sizeof(*umem))); - if (!umem) - return nullptr; - - ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, - nullptr); - if (ret) { - errno = -ret; - return nullptr; - } - - umem->buffer = buffer; - return umem; -} - -static struct bpf_object *open_bpf_object(const char *file, int ifindex) -{ - int err; - struct bpf_object *obj; - struct bpf_map *map; - struct bpf_program *prog, *first_prog = NULL; - - struct bpf_object_open_attr open_attr = { - .file = file, - .prog_type = BPF_PROG_TYPE_XDP, - }; - - obj = bpf_object__open_xattr(&open_attr); - - bpf_object__for_each_program(prog, obj) { - bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); - bpf_program__set_ifindex(prog, ifindex); - if (!first_prog) - first_prog = prog; - } 
- - bpf_object__for_each_map(map, obj) { - if (!bpf_map__is_offload_neutral(map)) - bpf_map__set_ifindex(map, ifindex); - } - - if (!first_prog) { - fprintf(stderr, "ERR: file %s contains no programs\n", file); - return NULL; - } - - return obj; -} - -static int reuse_maps(struct bpf_object *obj, const char *path) -{ - struct bpf_map *map; - - if (!obj) - return -ENOENT; - - if (!path) - return -EINVAL; - - bpf_object__for_each_map(map, obj) { - if (bpf_map__name(map) == "xsks_map"){ - printf("Try to reuse map: %s\n", bpf_map__name(map)); - int len, err; - int pinned_map_fd; - char buf[PATH_MAX]; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, bpf_map__name(map)); - if (len < 0) { - return -EINVAL; - } else if (len >= PATH_MAX) { - return -ENAMETOOLONG; - } - - pinned_map_fd = bpf_obj_get(buf); - if (pinned_map_fd < 0) { - printf("failed at bpf_obj_get for map: %s, buf: %s\n", bpf_map__name(map), buf); - return pinned_map_fd; - } - - err = bpf_map__reuse_fd(map, pinned_map_fd); - if (err) { - printf("failed at bpf_map__reuse_fd for map: %s\n", bpf_map__name(map)); - return err; - } - }else { - printf("Skipping map: %s\n", bpf_map__name(map)); - } - } - - return 0; -} - -struct bpf_object *load_bpf_object_file_reuse_maps(const char *file, - int ifindex, - const char *pin_dir) -{ - int err; - struct bpf_object *obj; - - obj = open_bpf_object(file, ifindex); - if (!obj) { - fprintf(stderr, "ERR: failed to open object %s\n", file); - return NULL; - } - - err = reuse_maps(obj, pin_dir); - if (err) { - fprintf(stderr, "ERR: failed to reuse maps for object %s, pin_dir=%s, err=%d\n", - file, pin_dir, err); - return NULL; - } - - err = bpf_object__load(obj); - if (err) { - fprintf(stderr, "ERR: loading BPF-OBJ file(%s) (%d): %s\n", - file, err, strerror(-err)); - return NULL; - } - - return obj; -} - -struct bpf_object *load_bpf_object_file(const char *filename, int ifindex) -{ - int first_prog_fd = -1; - struct bpf_object *obj; - int err; - - /* This struct allow us to 
set ifindex, this features is used for - * hardware offloading XDP programs (note this sets libbpf - * bpf_program->prog_ifindex and foreach bpf_map->map_ifindex). - */ - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - .ifindex = ifindex, - }; - prog_load_attr.file = filename; - - /* Use libbpf for extracting BPF byte-code from BPF-ELF object, and - * loading this into the kernel via bpf-syscall - */ - err = bpf_prog_load_xattr(&prog_load_attr, &obj, &first_prog_fd); - if (err) { - fprintf(stderr, "ERR: loading BPF-OBJ file(%s) (%d): %s\n", - filename, err, strerror(-err)); - return NULL; - } - - /* Notice how a pointer to a libbpf bpf_object is returned */ - return obj; -} - -int xdp_link_attach(int ifindex, __u32 xdp_flags, int prog_fd) -{ - int err; - - /* libbpf provide the XDP net_device link-level hook attach helper */ - err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags); - if (err == -EEXIST && !(xdp_flags & XDP_FLAGS_UPDATE_IF_NOEXIST)) { - /* Force mode didn't work, probably because a program of the - * opposite type is loaded. Let's unload that and try loading - * again. - */ - - __u32 old_flags = xdp_flags; - - xdp_flags &= ~XDP_FLAGS_MODES; - xdp_flags |= (old_flags & XDP_FLAGS_SKB_MODE) ? 
XDP_FLAGS_DRV_MODE : XDP_FLAGS_SKB_MODE; - err = bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); - if (!err) - err = bpf_set_link_xdp_fd(ifindex, prog_fd, old_flags); - } - if (err < 0) { - fprintf(stderr, "ERR: " - "ifindex(%d) link set xdp fd failed (%d): %s\n", - ifindex, -err, strerror(-err)); - - switch (-err) { - case EBUSY: - case EEXIST: - fprintf(stderr, "Hint: XDP already loaded on device" - " use --force to swap/replace\n"); - break; - case EOPNOTSUPP: - fprintf(stderr, "Hint: Native-XDP not supported" - " use --skb-mode or --auto-mode\n"); - break; - default: - break; - } - return EXIT_FAIL_XDP; - } - - return EXIT_OK; -} - -struct bpf_object *load_bpf_and_xdp_attach(struct config *cfg) -{ - struct bpf_program *bpf_prog; - struct bpf_object *bpf_obj; - int offload_ifindex = 0; - int prog_fd = -1; - int err; - - /* If flags indicate hardware offload, supply ifindex */ - if (cfg->xdp_flags & XDP_FLAGS_HW_MODE) - offload_ifindex = cfg->ifindex; - - /* Load the BPF-ELF object file and get back libbpf bpf_object */ - if (cfg->reuse_maps) - bpf_obj = load_bpf_object_file_reuse_maps(cfg->filename, - offload_ifindex, - cfg->pin_dir); - else - bpf_obj = load_bpf_object_file(cfg->filename, offload_ifindex); - if (!bpf_obj) { - fprintf(stderr, "ERR: loading file: %s\n", cfg->filename); - exit(EXIT_FAIL_BPF); - } - /* At this point: All XDP/BPF programs from the cfg->filename have been - * loaded into the kernel, and evaluated by the verifier. Only one of - * these gets attached to XDP hook, the others will get freed once this - * process exit. 
- */ - - if (cfg->progsec[0]) - /* Find a matching BPF prog section name */ - bpf_prog = bpf_object__find_program_by_title(bpf_obj, cfg->progsec); - else - /* Find the first program */ - bpf_prog = bpf_program__next(NULL, bpf_obj); - - if (!bpf_prog) { - fprintf(stderr, "ERR: couldn't find a program in ELF section '%s'\n", cfg->progsec); - exit(EXIT_FAIL_BPF); - } - - strncpy(cfg->progsec, bpf_program__title(bpf_prog, false), sizeof(cfg->progsec)); - - prog_fd = bpf_program__fd(bpf_prog); - if (prog_fd <= 0) { - fprintf(stderr, "ERR: bpf_program__fd failed\n"); - exit(EXIT_FAIL_BPF); - } - - /* At this point: BPF-progs are (only) loaded by the kernel, and prog_fd - * is our select file-descriptor handle. Next step is attaching this FD - * to a kernel hook point, in this case XDP net_device link-level hook. - */ - err = xdp_link_attach(cfg->ifindex, cfg->xdp_flags, prog_fd); - if (err) - exit(err); - - return bpf_obj; -} - -void af_xdp_user::run_af_xdp() -{ - printf("%s", "af_xdp started\n"); - std::string table_name_neighbor_ebpf_map = "/sys/fs/bpf/endpoints_map"; - int fd_neighbor_ebpf_map = bpf_obj_get(table_name_neighbor_ebpf_map.c_str()); -// if (fd_neighbor_ebpf_map < 0) { -// printf("Failed to get xdp neighbor endpoint map fd, exiting\n"); -// return; -// } else { -// printf("Got xdp neighbor endpoint map fd %d\n", fd_neighbor_ebpf_map); -// } - - int ret; - int xsks_map_fd; - void *packet_buffer; - uint64_t packet_buffer_size; - struct rlimit rlim = {RLIM_INFINITY, RLIM_INFINITY}; - struct config cfg; - - cfg.ifindex = -1; - cfg.do_unload = false; - // TODO: fill in the file name and progsec in CPP style - struct xsk_umem_info *umem; - struct xsk_socket_info *xsk_socket; - struct bpf_object *bpf_obj = nullptr; - - /* Global shutdown handler*/ - signal(SIGINT, exit_application); - - /* Command line options can change progsec*/ -// parse_cmdline_args(argc, argv, long_options, &cfg, __doc__); - // TODO: Get rid of getting the config from argc/argv, hardcode it 
for the time being. - // interface name - cfg.ifname = "enp4s0f1"; - cfg.ifindex = if_nametoindex(cfg.ifname); - // skb mode - cfg.xdp_flags &= ~XDP_FLAGS_MODES; - cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;//XDP_FLAGS_DRV_MODE; - cfg.xsk_bind_flags &= XDP_COPY;//XDP_COPY; - cfg.xsk_bind_flags |= XDP_ZEROCOPY;//XDP_ZEROCOPY; - - // queue_id, default = 0 - cfg.xsk_if_queue = 0; - // NOT using poll - cfg.xsk_poll_mode = false; - // not doing unload this time - cfg.do_unload = false; - // progsec of the xdp program - std::string progsec_string = "transit"; - strncpy(cfg.progsec, progsec_string.c_str(), sizeof(cfg.progsec)); -// progsec_string.copy(cfg.progsec, progsec_string.size()); - - // absolute path for the xdp.o file - std::string file_name = "/trn_xdp/trn_transit_xdp_ebpf.o"; -// strncpy(cfg.filename, file_name.c_str(), sizeof(cfg.filename)); -// file_name.copy(cfg.filename, file_name.size()); - // reuse maps, try NOT to create a new map. - cfg.reuse_maps = true; - std::string pin_dir = "/sys/fs/bpf"; - strncpy(cfg.pin_dir, pin_dir.c_str(), sizeof(cfg.pin_dir)); -// pin_dir.copy(cfg.pin_dir, pin_dir.size()); - - /* Required option */ - if (cfg.ifindex == -1) { - printf("%s", "ERROR: Required option --dev missing\n\n"); -// usage(argv[0], __doc__, long_options, (argc == 1)); - exit(EXIT_FAIL_OPTION); - } - - /* Unload XDP program if requested */ - if (cfg.do_unload) { -// int rc = xdp_link_detach(cfg.ifindex, cfg.xdp_flags, 0); - exit(-1); - } - - /* Load custom program if configured */ - if (cfg.filename[0] != 0) { - struct bpf_map *map; - - bpf_obj = load_bpf_and_xdp_attach(&cfg); - if (!bpf_obj) { - /* Error handling done in load_bpf_and_xdp_attach() */ - exit(EXIT_FAILURE); - } - - /* We also need to load the xsks_map */ - map = bpf_object__find_map_by_name(bpf_obj, "xsks_map"); - xsks_map_fd = bpf_map__fd(map); - if (xsks_map_fd < 0) { - fprintf(stderr, "ERROR: no xsks map found: %s\n", - strerror(xsks_map_fd)); - exit(EXIT_FAILURE); - } - } else { - 
printf("%s\n", "Empty config filename, not loading/attaching"); - } - - /* Allow unlimited locking of memory, so all memory needed for packet - * buffers can be locked. - */ - if (setrlimit(RLIMIT_MEMLOCK, &rlim)) { - printf("%s", "ERROR: setrlimit(RLIMIT_MEMLOCK) \n"); - exit(EXIT_FAILURE); - } - - /* Allocate memory for NUM_FRAMES of the default XDP frame size */ - packet_buffer_size = NUM_FRAMES * FRAME_SIZE; - if (posix_memalign(&packet_buffer, - getpagesize(), /* PAGE_SIZE aligned */ - packet_buffer_size)) { - fprintf(stderr, "ERROR: Can't allocate buffer memory \"%s\"\n", - strerror(errno)); - exit(EXIT_FAILURE); - } - - /* Initialize shared packet_buffer for umem usage */ - umem = configure_xsk_umem(packet_buffer, packet_buffer_size); - if (umem == NULL) { - fprintf(stderr, "ERROR: Can't create umem \"%s\"\n", - strerror(errno)); - exit(EXIT_FAILURE); - } - - /* Open and configure the AF_XDP (xsk) socket */ - xsk_socket = xsk_configure_socket(&cfg, umem); - if (xsk_socket == NULL) { - fprintf(stderr, "ERROR: Can't setup AF_XDP socket \"%s\"\n", - strerror(errno)); - exit(EXIT_FAILURE); - } - /* Receive and count packets than drop them */ - rx_and_process(&cfg, xsk_socket, &fd_neighbor_ebpf_map); - - /* Cleanup */ - xsk_socket__delete(xsk_socket->xsk); - xsk_umem__delete(umem->umem); -// xdp_link_detach(cfg.ifindex, cfg.xdp_flags, 0); - - return /*EXIT_OK*/; -} diff --git a/src/comm/af_xdp_user_multi_thread.cpp b/src/comm/af_xdp_user_multi_thread.cpp index 8f3180c..6953e10 100644 --- a/src/comm/af_xdp_user_multi_thread.cpp +++ b/src/comm/af_xdp_user_multi_thread.cpp @@ -44,7 +44,6 @@ #include - #define VXL_DSTPORT 0xb512 // UDP dport 4789(0x12b5) for VxLAN overlay #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) @@ -574,7 +573,7 @@ port_init(struct port_params *params) ¶ms->xsk_cfg); if (status) { port_free(p); - printf("port_init failed because xsk_socket__create_shared failed.\n"); + printf("port_init failed because xsk_socket__create_shared failed. 
Status: %ld\n", status); return NULL; } @@ -754,7 +753,7 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk ) { // printf(">>>>>>>>>> Begin processing packet >>>>>>>>>>\n"); - + bpf_lpm_trie_key k; if (true) { // printf("Process packets: inside if (true)\n"); /* @@ -1061,7 +1060,6 @@ thread_func(void *arg) struct thread_data *t = static_cast(arg); cpu_set_t cpu_cores; u32 i; - CPU_ZERO(&cpu_cores); CPU_SET(t->cpu_core_id, &cpu_cores); pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_cores); @@ -1120,7 +1118,7 @@ thread_func(void *arg) * Process */ static const struct bpool_params bpool_params_default = { - .n_buffers = 64/*96*/ * 1024, + .n_buffers = 192 /* this number should be set to 64 * (number_of_cores / 8)*/ * 1024, .buffer_size = XSK_UMEM__DEFAULT_FRAME_SIZE, .mmap_flags = 0, @@ -1133,7 +1131,7 @@ static const struct xsk_umem_config umem_cfg_default = { .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, - .flags = 0, + .flags = XDP_RING_NEED_WAKEUP, }; static const struct port_params port_params_default = { @@ -1281,9 +1279,12 @@ static void print_port(u32 port_id) { struct port *port = ports[port_id]; - - printf("Port %u: interface = %s, queue = %u\n", - port_id, port->params.iface, port->params.iface_queue); + int option; + socklen_t option_length = sizeof(option); + getsockopt(xsk_socket__fd(port->xsk), SOL_XDP, XDP_OPTIONS, &option, (&option_length)); + printf("Port %u: interface = %s, queue = %u, zero_copy_enabled: %s\n", + port_id, port->params.iface, port->params.iface_queue, + ((option == XDP_OPTIONS_ZEROCOPY ? 
"true" : "false"))); } static void @@ -1408,7 +1409,7 @@ void* af_xdp_user_multi_thread::run_af_xdp_multi_threaded(void* args/*int argc, sizeof(struct bpool_params)); memcpy(&umem_cfg, &umem_cfg_default, sizeof(struct xsk_umem_config)); - umem_cfg.flags |= (XDP_RING_NEED_WAKEUP/*XDP_USE_NEED_WAKEUP*/ ); +// umem_cfg.flags |= (XDP_RING_NEED_WAKEUP/*XDP_USE_NEED_WAKEUP*/ ); for (i = 0; i < MAX_PORTS; i++) memcpy(&port_params[i], &port_params_default, sizeof(struct port_params)); @@ -1417,26 +1418,37 @@ void* af_xdp_user_multi_thread::run_af_xdp_multi_threaded(void* args/*int argc, // print_usage(argv[0]); // return -1; // } - auto number_of_cores = 8;//std::thread::hardware_concurrency(); + auto number_of_cores = std::thread::hardware_concurrency(); printf("This machine has %ld cores\n", number_of_cores); - // 2 ports(interfaces), same name, different q number. - n_ports = number_of_cores; + // leave 8 cores for the rest of the system. + n_ports = number_of_cores > 8 ? (number_of_cores - 8) : 0; + + if (n_ports == 0) { + printf("This machine has too little number of cores(%ld), not good for AF_XDP. Exiting\n", number_of_cores); + exit(-1); + } + + printf("After leaving 8 cores for other applications, we are now setting the interface to have %ld AF_XDP sockets.\n", n_ports); + + string set_nic_queue_command_template = "ethtool -L enp4s0f1 combined %ld"; + char set_nic_queue_command[100]; + sprintf(set_nic_queue_command, "ethtool -L enp4s0f1 combined %ld", n_ports); + printf("Executing system command: %s\n", set_nic_queue_command); + int set_nic_queue_command_rc = system(set_nic_queue_command); + + if (set_nic_queue_command_rc!=EXIT_SUCCESS) { + printf("set nic queue command failed(%ld)! Exiting\n", set_nic_queue_command_rc); + exit(-1); + } // using 1 thread per iface + iface_queue - n_threads = number_of_cores; // get number of cores of this machine. + n_threads = n_ports; // get number of cores of this machine. 
- for ( int i = 0 ; i < number_of_cores ; i ++) { + for ( int i = 0 ; i < n_ports ; i ++) { port_params[i].iface = "enp4s0f1"; port_params[i].iface_queue = i; thread_data[i].cpu_core_id = i; } -// port_params[0].iface = "enp4s0f1"; -// port_params[0].iface_queue = 0; -// thread_data[0].cpu_core_id = 0; -// -// port_params[1].iface = "enp4s0f1"; -// port_params[1].iface_queue = 1; -// thread_data[1].cpu_core_id = 1; /* Buffer pool initialization. */ diff --git a/src/main.cpp b/src/main.cpp index e3f54e7..105c221 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -25,7 +25,6 @@ #include "marl/event.h" #include "marl/scheduler.h" #include "marl/waitgroup.h" -#include "af_xdp_user.h" #include "af_xdp_user_multi_thread.h" #include "grpc_client.h" From b757ea1f4c4cccd99c6ba1c09e4bd4604fa71834 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Thu, 19 Jan 2023 15:54:26 -0800 Subject: [PATCH 20/33] Added data models for security group and connection tracking; added ebpf map fd for security group ebpf table; added sample code to insert data into security group ebpf maps --- include/db_client.h | 32 +++++++++++ include/grpc_client.h | 5 +- include/xdp/trn_datamodel.h | 79 +++++++++++++++++++++++++++ src/comm/af_xdp_user_multi_thread.cpp | 39 +++++++++++++ src/comm/grpc_client.cpp | 32 ++++++++++- src/main.cpp | 12 ++-- 6 files changed, 190 insertions(+), 9 deletions(-) diff --git a/include/db_client.h b/include/db_client.h index 0661942..3fa3c3f 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -91,6 +91,14 @@ class db_client { void FillEndpointCacheFromDB() { + std::string table_name_sg_ebpf_map = "/sys/fs/bpf/security_group_map"; + int fd_security_group_ebpf_map = bpf_obj_get(table_name_sg_ebpf_map.c_str()); + printf("DB Client: sg map fd: %ld\n", fd_security_group_ebpf_map); + + std::string table_name_sg_cidr_map = "/sys/fs/bpf/sg_cidr_map"; + int fd_sg_cidr_ebpf_map = bpf_obj_get(table_name_sg_cidr_map.c_str()); + printf("DB Client: sg cidr map fd: %ld\n", 
fd_security_group_ebpf_map); + // Get all neighbors from SQLite Database auto get_all_neighbors_statement = local_db.prepare( select( @@ -134,6 +142,30 @@ class db_client { // inet_ntoa(endpoint_host_ip_socket.sin_addr) // ); // printf("Finished one endpoint\n"); + security_group_key_t sg_key; + sg_key.vni = vni; + sg_key.ip = endpoint_vpc_ip_socket.sin_addr.s_addr; + sg_key.direction = 0; + security_group_t sg_value; + sg_value.sg_id = 12345; + sg_value.action = 1; +// int sg_map_insert_rc = bpf_map_update_elem(fd_security_group_ebpf_map, &sg_key, &sg_value, BPF_ANY); +// printf("Sg map insert rc: %ld\n", sg_map_insert_rc); + sg_cidr_key_t sg_cidr_key; + sg_cidr_key.prefixlen = 64; +// inet_pton(AF_INET, vpc_ip, sg_cidr_key.lpm_key.data); + sg_cidr_key.ip = endpoint_vpc_ip_socket.sin_addr.s_addr; + sg_cidr_key.vni = vni; + sg_cidr_key.direction = 1; + sg_cidr_key.protocol = IPPROTO_TCP; + sg_cidr_key.port = 888; + int sg_map_insert_rc = bpf_map_update_elem(fd_sg_cidr_ebpf_map, &sg_cidr_key, &sg_value, BPF_ANY); + if (sg_map_insert_rc != 0) { + printf("Error for inserting into lpm map: %s", std::strerror(errno)); + } + printf("Sg map insert rc: %ld\n", sg_map_insert_rc); + + } printf("Finished retrieving from local DB, not endpoint cache has %ld endpoints\n", endpoint_cache.size()); } diff --git a/include/grpc_client.h b/include/grpc_client.h index 90ad6e7..fb73895 100644 --- a/include/grpc_client.h +++ b/include/grpc_client.h @@ -41,12 +41,14 @@ class ArionMasterWatcherImpl final : public Watch::Service { void ConnectToArionMaster(); - void RunClient(std::string ip, std::string port, std::string group, std::string table); + void RunClient(std::string ip, std::string port, std::string group, std::string neighbor_table, std::string security_group_rules_table); bool a = chan_ == nullptr; int fd_neighbor_ebpf_map = -1; + int fd_security_group_ebpf_map = -1; + private: std::string server_address; @@ -56,6 +58,7 @@ class ArionMasterWatcherImpl final : public 
Watch::Service { std::string table_name_neighbor_ebpf_map; + std::string table_name_sg_ebpf_map; // key std::string is '-', value is inserted version of this neighbor folly::ConcurrentHashMap neighbor_task_map; diff --git a/include/xdp/trn_datamodel.h b/include/xdp/trn_datamodel.h index 7ab0321..351833c 100644 --- a/include/xdp/trn_datamodel.h +++ b/include/xdp/trn_datamodel.h @@ -29,6 +29,7 @@ #include #include #include +#include "bpf.h" #define __ALIGNED_64__ __attribute__((aligned(64))) #define __ALWAYS_INLINE__ __attribute__((__always_inline__)) @@ -147,6 +148,22 @@ typedef struct { unsigned char hmac[6]; } __attribute__((packed, aligned(4))) endpoint_t; +typedef struct { + __u8 prefix_len; + __u32 vni; + __u32 ip; + __u16 port; + __u8 protocol; + bool direction; + __u16 cidr[5]; // 192.168.16.0/23, 5 elements +} __attribute__((packed, aligned(4))) security_group_rule_key_t; + +typedef struct { + bool action; // 0 or 1 + __u16 port_range[2]; // assume it supports only 1 range, such as [9000,9016] + __u16 remote_group; // remote group ID +} __attribute__((packed, aligned(4))) security_group_rule_t; + typedef struct { __u32 ip; // IP used for ZGC access __u16 announced; // non-zero indicates the MAC has been announced locally @@ -215,3 +232,65 @@ typedef struct { dp_encap_opdata_t encap; } opdata; } __attribute__((packed, aligned(8))) flow_ctx_t; + +// #if connTrack +struct ipv4_tuple_t { + __u32 saddr; + __u32 daddr; + + /* ports */ + __u16 sport; + __u16 dport; + + /* Addresses */ + __u8 protocol; + + /*TODO: include TCP flags, no use case for the moment! 
*/ + +} __attribute__((packed)); + + +typedef struct { + __u32 vni; + struct ipv4_tuple_t tuple; +} __attribute__((packed)) contrack_key_t; + + +typedef struct { + __u32 hip; + unsigned char mac[6]; + unsigned char hmac[6]; +} __attribute__ ((packed, aligned(4))) contrack_t; + +// #endif + +// #if sgSupport + +typedef struct { + __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6*/ + __u32 vni; + __u32 ip; + __u16 port; + __u8 direction; + __u8 protocol; +} __attribute__((packed, aligned(4))) sg_cidr_key_t; + + +typedef struct { + __u32 sg_id; + __u8 action; +} __attribute__((packed, aligned(4))) sg_cidr_t; + + +typedef struct { + __u32 vni; + __u32 ip; + __u8 direction; +} __attribute__((packed, aligned(4))) security_group_key_t; + + +typedef struct { + __u32 sg_id; + __u8 action; +} __attribute__((packed, aligned(4))) security_group_t; +// #endif \ No newline at end of file diff --git a/src/comm/af_xdp_user_multi_thread.cpp b/src/comm/af_xdp_user_multi_thread.cpp index 6953e10..42f2886 100644 --- a/src/comm/af_xdp_user_multi_thread.cpp +++ b/src/comm/af_xdp_user_multi_thread.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include "util.h" #include "xdp/trn_datamodel.h" #include @@ -936,6 +937,8 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk else if (ntohs(inner_eth->h_proto) == ETH_P_IP) { // parse inner IP header struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); + + // struct in_addr inner_ip_src; // inner_ip_src.s_addr = inner_ip->saddr; struct in_addr inner_ip_dest; @@ -947,6 +950,27 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk struct sockaddr_in ep_ip; inet_pton(AF_INET, inet_ntoa(inner_ip_dest/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); epkey.ip = ep_ip.sin_addr.s_addr; + + + sg_cidr_key_t sg_key; + sg_key.protocol = inner_ip->protocol; + sg_key.ip = ep_ip.sin_addr.s_addr; + sg_key.vni = epkey.vni; + sg_key.direction = 1; // how to 
express goingout/coming in? + if (sg_key.protocol == IPPROTO_TCP) { + struct tcphdr *inner_tcp = (struct tcphdr *)(inner_ip + 1); + sg_key.port = bpf_htons(inner_tcp->dest); + sg_key.prefixlen = 136; + // how about lpm_key.data? + } else if (sg_key.protocol == IPPROTO_UDP) { + struct udphdr *inner_udp = (struct udphdr *)(inner_ip + 1); + sg_key.port = bpf_htons(inner_udp->dest); + sg_key.prefixlen = 136; + // how about lpm_key.data? + } + + + auto ep_value = db_client::get_instance().GetNeighborInMemory(epkey); // endpoint_t ep_value; // ep_value = db_client::get_instance().GetNeighbor(trn_get_vni(vxlan->vni), inet_ntoa(inner_ip_dest)); @@ -1404,6 +1428,21 @@ void* af_xdp_user_multi_thread::run_af_xdp_multi_threaded(void* args/*int argc, u64 ns0; int i; + std::string table_name_neighbor_ebpf_map = "/sys/fs/bpf/endpoints_map"; + int fd_neighbor_ebpf_map = bpf_obj_get(table_name_neighbor_ebpf_map.c_str()); + + std::string table_name_sg_ebpf_map = "/sys/fs/bpf/security_group_map"; + int fd_security_group_ebpf_map = bpf_obj_get(table_name_sg_ebpf_map.c_str()); + + printf("endpoints map fd: %ld, sg map fd: %ld\n", fd_neighbor_ebpf_map, fd_security_group_ebpf_map); + + + if (fd_neighbor_ebpf_map <= 0 || fd_security_group_ebpf_map <= 0 ) { + printf("fd_neighbor_ebpf_map: %ld, fd_security_group_ebpf_map: %ld, exiting\n" + , fd_neighbor_ebpf_map, fd_security_group_ebpf_map); +// exit(-1); + } + /* Parse args. 
*/ memcpy(&bpool_params, &bpool_params_default, sizeof(struct bpool_params)); diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 8bb1da2..18ce4ba 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -230,13 +230,14 @@ void ArionMasterWatcherImpl::ConnectToArionMaster() { printf("After initiating a new sub to connect to the Arion Master: %s\n", (server_address + ":" + server_port).c_str()); } -void ArionMasterWatcherImpl::RunClient(std::string ip, std::string port, std::string group, std::string table) { +void ArionMasterWatcherImpl::RunClient(std::string ip, std::string port, std::string group, std::string endpoints_table, std::string security_group_rules_table) { printf("Running a grpc client in a separate thread id: %ld\n", std::this_thread::get_id()); server_address = ip; server_port = port; group_id = group; - table_name_neighbor_ebpf_map = table; + table_name_neighbor_ebpf_map = endpoints_table; + table_name_sg_ebpf_map = security_group_rules_table; // Retrieve neighbor's ebpf map fd (handle) fd_neighbor_ebpf_map = bpf_obj_get(table_name_neighbor_ebpf_map.c_str()); @@ -247,6 +248,33 @@ void ArionMasterWatcherImpl::RunClient(std::string ip, std::string port, std::st // printf("Got xdp neighbor endpoint map fd %d\n", fd_neighbor_ebpf_map); // } + // check if security group ebpf map exists, and create it if it doesn't + + fd_security_group_ebpf_map = bpf_obj_get(table_name_sg_ebpf_map.c_str()); + + if (fd_security_group_ebpf_map < 0) { + printf("Creating security_group_ebpf_map manually\n"); + + struct bpf_lpm_trie_key *security_group_key; + size_t key_size_security_group; + key_size_security_group = sizeof(*security_group_key) + sizeof(__u32); + + printf("Key size: %ld, value size: %ld\n", key_size_security_group, sizeof(security_group_rule_t)); + fd_security_group_ebpf_map = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, + key_size_security_group/*sizeof(security_group_rule_key_t)*/, + sizeof(security_group_rule_t), + 999, // need 
to change it to a bigger number later. + 0); + + if (fd_security_group_ebpf_map <= 0) { + printf("Tried to manually create security group map, but failed with fd: %ld, and error no: %s, returning\n", + fd_security_group_ebpf_map, std::strerror(errno)); + exit(-1); + } + printf("Manually created security group map with fd: %ld, returning\n", fd_security_group_ebpf_map); + + } + // Create (if db not exists) or connect (if db exists already) to local db db_client::get_instance().local_db.sync_schema(); diff --git a/src/main.cpp b/src/main.cpp index 105c221..372feb7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -42,6 +42,7 @@ ArionMasterWatcherImpl *g_grpc_client = NULL; string g_arion_master_address = EMPTY_STRING; string g_arion_master_port = "9090"; string g_arion_neighbor_table = "/sys/fs/bpf/endpoints_map"; +string g_arion_security_group_table = "/sys/fs/bpf/security_group_map"; //TODO: let ArionMaster figure out group from ArionWing IP (in grpc channel) string g_arion_group = "group1"; @@ -142,17 +143,16 @@ int main(int argc, char *argv[]) { marl::schedule([=] { g_grpc_client->RunClient(g_arion_master_address, g_arion_master_port, - g_arion_group, - g_arion_neighbor_table); + g_arion_group,\ + g_arion_neighbor_table, + g_arion_security_group_table); }); -// marl::schedule([=] { -// auto af = af_xdp_user(); -// af.run_af_xdp(/*g_arion_neighbor_table*/); -// }); + /* auto afm = af_xdp_user_multi_thread(); pthread_t t; pthread_create(&t, NULL, &af_xdp_user_multi_thread::run_af_xdp_multi_threaded, &afm); + */ pause(); cleanup(); From 1653e6df38b29de0471322ae6167b90187cd9417 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 20 Jan 2023 17:45:22 -0800 Subject: [PATCH 21/33] Modified the sg_cidr_key_t, added remote_ip field --- include/db_client.h | 6 ++++-- include/xdp/trn_datamodel.h | 18 +++++++++++++++++- src/comm/af_xdp_user_multi_thread.cpp | 10 ++++++---- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/include/db_client.h b/include/db_client.h 
index 3fa3c3f..51c5996 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -152,9 +152,11 @@ class db_client { // int sg_map_insert_rc = bpf_map_update_elem(fd_security_group_ebpf_map, &sg_key, &sg_value, BPF_ANY); // printf("Sg map insert rc: %ld\n", sg_map_insert_rc); sg_cidr_key_t sg_cidr_key; - sg_cidr_key.prefixlen = 64; + // add the number of bits for all fields, except prefexlen and dst_ip, then add the cidr range, in this case it is /24 + sg_cidr_key.prefixlen = (32 + 16 + 8 + 8 + 32 + 24); // inet_pton(AF_INET, vpc_ip, sg_cidr_key.lpm_key.data); - sg_cidr_key.ip = endpoint_vpc_ip_socket.sin_addr.s_addr; + sg_cidr_key.src_ip = endpoint_vpc_ip_socket.sin_addr.s_addr; + sg_cidr_key.dst_ip = endpoint_vpc_ip_socket.sin_addr.s_addr; sg_cidr_key.vni = vni; sg_cidr_key.direction = 1; sg_cidr_key.protocol = IPPROTO_TCP; diff --git a/include/xdp/trn_datamodel.h b/include/xdp/trn_datamodel.h index 351833c..f140561 100644 --- a/include/xdp/trn_datamodel.h +++ b/include/xdp/trn_datamodel.h @@ -269,10 +269,11 @@ typedef struct { typedef struct { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6*/ __u32 vni; - __u32 ip; __u16 port; __u8 direction; __u8 protocol; + __u32 src_ip; + __u32 dst_ip; } __attribute__((packed, aligned(4))) sg_cidr_key_t; @@ -293,4 +294,19 @@ typedef struct { __u32 sg_id; __u8 action; } __attribute__((packed, aligned(4))) security_group_t; + + +typedef struct { + __u32 vni; + __u32 ip; + __u8 direction; +} __attribute__((packed, aligned(4))) port_range_key_t; + +typedef struct { + __u16 port_min1; + __u16 port_max1; + __u16 port_min2; + __u16 port_max2; +} __attribute__((packed, aligned(4))) port_range_t; + // #endif \ No newline at end of file diff --git a/src/comm/af_xdp_user_multi_thread.cpp b/src/comm/af_xdp_user_multi_thread.cpp index 42f2886..d6fa1ab 100644 --- a/src/comm/af_xdp_user_multi_thread.cpp +++ b/src/comm/af_xdp_user_multi_thread.cpp @@ -939,22 +939,24 @@ static bool process_packet(void *pkt, uint32_t 
len/*,struct xsk_socket_info *xsk struct iphdr *inner_ip = (struct iphdr *)(inner_eth + 1 /*sizeof(*inner_eth)*/); -// struct in_addr inner_ip_src; -// inner_ip_src.s_addr = inner_ip->saddr; + struct in_addr inner_ip_src; + inner_ip_src.s_addr = inner_ip->saddr; struct in_addr inner_ip_dest; inner_ip_dest.s_addr = inner_ip->daddr; // printf("Inner IP src: %s\n", inet_ntoa(inner_ip_src)); // printf("Inner IP dest: %s\n", inet_ntoa(inner_ip_dest)); endpoint_key_t epkey; epkey.vni = trn_get_vni(vxlan->vni); - struct sockaddr_in ep_ip; + struct sockaddr_in ep_ip, src_ip; inet_pton(AF_INET, inet_ntoa(inner_ip_dest/*inner_arp_dest_ip*/), &(ep_ip.sin_addr)); + inet_pton(AF_INET, inet_ntoa(inner_ip_src), &(src_ip.sin_addr)); epkey.ip = ep_ip.sin_addr.s_addr; sg_cidr_key_t sg_key; sg_key.protocol = inner_ip->protocol; - sg_key.ip = ep_ip.sin_addr.s_addr; + sg_key.src_ip = ep_ip.sin_addr.s_addr; + sg_key.dst_ip = src_ip.sin_addr.s_addr; sg_key.vni = epkey.vni; sg_key.direction = 1; // how to express goingout/coming in? if (sg_key.protocol == IPPROTO_TCP) { From 26fc211c4e287b87db97bad9f0189b4f32a81f33 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Mon, 23 Jan 2023 12:27:09 -0800 Subject: [PATCH 22/33] Added security group rule to the local DB and to the gRPC client. 
--- include/db_client.h | 66 ++++-- include/grpc_client.h | 9 +- include/xdp/trn_datamodel.h | 14 +- src/comm/af_xdp_user_multi_thread.cpp | 4 +- src/comm/grpc_client.cpp | 298 ++++++++++++++++---------- 5 files changed, 244 insertions(+), 147 deletions(-) diff --git a/include/db_client.h b/include/db_client.h index 51c5996..3e69f5b 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -32,10 +32,32 @@ struct Neighbor { int version; }; // local db table 1 - neighbor info table that stores the latest neighbors (if there are version updates per neighbor) received from ArionMaster -struct ProgrammingState { +struct NeibghborProgrammingState { int version; }; // local db table 2 - neighbor ebpf programmed version +struct SecurityGroupPortBinding { + std::string port_id; // vni-vpc_ip + std::string security_group_id; +}; // local db table 3, stores the mapping between port and security group, 1 group can have multiple rules. + +struct SecurityGroupRule { + std::string security_group_id; + std::string remote_group_id; + std::string direction; + std::string remote_ip_prefix; + std::string protocol; + int port_range_max; + int port_range_min; + std::string ether_type; + int vni; + int version; +}; // local db table 3, security group rule table that stores the latest security group rules (if there are version updates per neighbor) received from ArionMaster + +struct SecurityGroupProgrammingState { + int version; +}; // local db table 2 - security rule ebpf programmed version + // copied from arp_hash in ACA struct EndpointHash { size_t operator()(const endpoint_key_t &e) const{ @@ -62,17 +84,33 @@ inline auto make_storage_query () { make_column("version", &Neighbor::version), primary_key(&Neighbor::vni, &Neighbor::vpc_ip) ), - make_table("journal", - make_column("version", &ProgrammingState::version), - primary_key(&ProgrammingState::version) + make_table("journal_neighbor", + make_column("version", &NeibghborProgrammingState::version), + 
primary_key(&NeibghborProgrammingState::version) + ), + make_table("security_group_rule", + make_column("security_group_id", &SecurityGroupRule::security_group_id), + make_column("remote_group_id", &SecurityGroupRule::remote_group_id), + make_column("direction", &SecurityGroupRule::direction), + make_column("remote_ip_prefix", &SecurityGroupRule::remote_ip_prefix), + make_column("protocol", &SecurityGroupRule::protocol), + make_column("port_range_max", &SecurityGroupRule::port_range_max), + make_column("port_range_min", &SecurityGroupRule::port_range_min), + make_column("ether_type", &SecurityGroupRule::ether_type), + make_column("vni", &SecurityGroupRule::vni), + make_column("version", &SecurityGroupRule::version), + primary_key(&SecurityGroupRule::remote_group_id) + ), + make_table("journal_security_group_rules", + make_column("version", &SecurityGroupProgrammingState::version), + primary_key(&SecurityGroupProgrammingState::version) ) - ); + ); }; using Storage = decltype(make_storage_query()); class db_client { - public: static db_client &get_instance() { static db_client instance; @@ -90,6 +128,8 @@ class db_client { std::unordered_map endpoint_cache; + // function that will be called at the beginning of the program, reads rows from the neighbor table + // and fills the in-memory endpoint cache, which is used for fast lookup. 
void FillEndpointCacheFromDB() { std::string table_name_sg_ebpf_map = "/sys/fs/bpf/security_group_map"; int fd_security_group_ebpf_map = bpf_obj_get(table_name_sg_ebpf_map.c_str()); @@ -155,8 +195,8 @@ class db_client { // add the number of bits for all fields, except prefexlen and dst_ip, then add the cidr range, in this case it is /24 sg_cidr_key.prefixlen = (32 + 16 + 8 + 8 + 32 + 24); // inet_pton(AF_INET, vpc_ip, sg_cidr_key.lpm_key.data); - sg_cidr_key.src_ip = endpoint_vpc_ip_socket.sin_addr.s_addr; - sg_cidr_key.dst_ip = endpoint_vpc_ip_socket.sin_addr.s_addr; + sg_cidr_key.local_ip = endpoint_vpc_ip_socket.sin_addr.s_addr; + sg_cidr_key.remote_ip = endpoint_vpc_ip_socket.sin_addr.s_addr; sg_cidr_key.vni = vni; sg_cidr_key.direction = 1; sg_cidr_key.protocol = IPPROTO_TCP; @@ -185,14 +225,14 @@ class db_client { ); */ - using als_mo = alias_a; - using als_mi = alias_b; - auto ver_gaps = local_db.select(alias_column(&ProgrammingState::version), + using als_mo = alias_a; + using als_mi = alias_b; + auto ver_gaps = local_db.select(alias_column(&NeibghborProgrammingState::version), from(), where(not exists( - select(0 - c(alias_column(&ProgrammingState::version)), + select(0 - c(alias_column(&NeibghborProgrammingState::version)), from(), - where(is_equal(c(alias_column(&ProgrammingState::version)) + 1, alias_column(&ProgrammingState::version))) + where(is_equal(c(alias_column(&NeibghborProgrammingState::version)) + 1, alias_column(&NeibghborProgrammingState::version))) )))); // lkg version: diff --git a/include/grpc_client.h b/include/grpc_client.h index fb73895..98b186d 100644 --- a/include/grpc_client.h +++ b/include/grpc_client.h @@ -37,7 +37,7 @@ class ArionMasterWatcherImpl final : public Watch::Service { explicit ArionMasterWatcherImpl() {} - void RequestNeighborRules(ArionWingRequest *request, grpc::CompletionQueue *cq); + void RequestArionMaster(std::vector *request_vector, grpc::CompletionQueue *cq); void ConnectToArionMaster(); @@ -62,11 +62,14 @@ 
class ArionMasterWatcherImpl final : public Watch::Service { // key std::string is '-', value is inserted version of this neighbor folly::ConcurrentHashMap neighbor_task_map; + + // key std::string is 'securitygroupid', value is inserted version of this security group rule + folly::ConcurrentHashMap security_group_rule_task_map; }; struct AsyncClientCall { - arion::schema::NeighborRule reply; + arion::schema::ArionWingResponse reply; grpc::ClientContext context; grpc::Status status; - std::unique_ptr > stream; + std::unique_ptr > stream; }; diff --git a/include/xdp/trn_datamodel.h b/include/xdp/trn_datamodel.h index f140561..1048e0d 100644 --- a/include/xdp/trn_datamodel.h +++ b/include/xdp/trn_datamodel.h @@ -148,16 +148,6 @@ typedef struct { unsigned char hmac[6]; } __attribute__((packed, aligned(4))) endpoint_t; -typedef struct { - __u8 prefix_len; - __u32 vni; - __u32 ip; - __u16 port; - __u8 protocol; - bool direction; - __u16 cidr[5]; // 192.168.16.0/23, 5 elements -} __attribute__((packed, aligned(4))) security_group_rule_key_t; - typedef struct { bool action; // 0 or 1 __u16 port_range[2]; // assume it supports only 1 range, such as [9000,9016] @@ -272,8 +262,8 @@ typedef struct { __u16 port; __u8 direction; __u8 protocol; - __u32 src_ip; - __u32 dst_ip; + __u32 local_ip; + __u32 remote_ip; } __attribute__((packed, aligned(4))) sg_cidr_key_t; diff --git a/src/comm/af_xdp_user_multi_thread.cpp b/src/comm/af_xdp_user_multi_thread.cpp index d6fa1ab..70850ad 100644 --- a/src/comm/af_xdp_user_multi_thread.cpp +++ b/src/comm/af_xdp_user_multi_thread.cpp @@ -955,8 +955,8 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk sg_cidr_key_t sg_key; sg_key.protocol = inner_ip->protocol; - sg_key.src_ip = ep_ip.sin_addr.s_addr; - sg_key.dst_ip = src_ip.sin_addr.s_addr; + sg_key.local_ip = ep_ip.sin_addr.s_addr; + sg_key.remote_ip = src_ip.sin_addr.s_addr; sg_key.vni = epkey.vni; sg_key.direction = 1; // how to express goingout/coming in? 
if (sg_key.protocol == IPPROTO_TCP) { diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 18ce4ba..35476c6 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -41,14 +41,16 @@ using namespace arion::schema; -void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, +void ArionMasterWatcherImpl::RequestArionMaster(vector *request_vector, grpc::CompletionQueue *cq) { grpc::ClientContext ctx; arion::schema::NeighborRule reply; // prepared statements for better performance of db writing in completion queue auto add_or_update_neighbor_db_stmt = db_client::get_instance().local_db.prepare(replace(Neighbor{ 0, "", "", "", "", 0 })); - auto add_programmed_version_db_stmt = db_client::get_instance().local_db.prepare(insert(ProgrammingState{ 0 })); + auto add_programmed_neighbor_version_db_stmt = db_client::get_instance().local_db.prepare(insert(NeibghborProgrammingState{ 0 })); + auto add_or_update_security_group_rule_db_stmt = db_client::get_instance().local_db.prepare(replace(::SecurityGroupRule{ "", "", "", "", "", 0, 0, "", 0, 0 })); + auto add_programmed_security_group_rule_version_db_stmt = db_client::get_instance().local_db.prepare(insert(SecurityGroupProgrammingState {0})); // check current grpc channel state, try to connect if needed grpc_connectivity_state current_state = chan_->GetState(true); @@ -66,6 +68,7 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, int tag_watch = 1; printf("Completion queue: initial task, async watch\n"); + stub_->AsyncWatch(&call->context, cq, (void*) tag_watch); call->stream = stub_->AsyncWatch(&call->context, cq, (void*)tag_watch); // start time @@ -80,133 +83,182 @@ void ArionMasterWatcherImpl::RequestNeighborRules(ArionWingRequest *request, printf("Completion queue: initial task response received\n"); printf("Completion queue: write async watch ArionWingRequest of [group, revision] to stream\n"); - call->stream->Write(*request, (void*)tag_watch); 
+ for (auto &request : *request_vector) { + call->stream->Write(*request, (void*)tag_watch); + printf("Just wrote request with rev: [%ld], map: [%ld] and group id: [%ld]\n", + request->rev(), request->map().c_str(), request->group().c_str() + ); + } write_done = true; } else { call->stream->Read(&call->reply, got_tag); - auto vni = call->reply.tunnel_id(); - auto vpc_ip = call->reply.ip(); - auto vpc_mac = call->reply.mac(); - auto host_ip = call->reply.hostip(); - auto host_mac = call->reply.hostmac(); - auto ver = call->reply.version(); - int fd = fd_neighbor_ebpf_map; - - // non-empty rule - if ("" != vpc_ip) { - marl::schedule([this, &i, vni, vpc_ip, vpc_mac, host_ip, host_mac, ver, fd, - &add_or_update_neighbor_db_stmt, &add_programmed_version_db_stmt] { - // step #1 - check and store as in concurrent hash map - std::string neighbor_key = std::to_string(vni) + "-" + vpc_ip; - printf("vpc_ip is NOT empty: [%s]\n", vpc_ip.c_str()); - bool ebpf_ignored = false; - bool map_updated = false; - int update_ct = 0, max_update_ct = 5; - - while (!map_updated && (update_ct < max_update_ct)) { - printf("Inside while loop, map_updated = [%b], update_ct = [%ld], max_update_ct = [%ld]\n", - map_updated, update_ct, max_update_ct); - auto neighbor_pos = neighbor_task_map.find(neighbor_key); - if (neighbor_pos == neighbor_task_map.end()) { - // key not found, try insert. 
The function returns successful only when key not exists when inserting - auto res_insert = - neighbor_task_map.insert(neighbor_key, ver); - if (res_insert.second) { - // means successfully inserted, done with update - map_updated = true; - printf("Found neighbor key in neighbor_task_map\n"); - } // 'else' means another thread already inserted before me, then it's not an insert case and next time in the loop will go to case of update - } else { - printf("Didn't find neighbor key in neighbor_task_map\n"); - // key found, means multi neighbor versions might update at the same time - int cur_ver = neighbor_pos->second; - - if (ver > cur_ver) { - // only update neighbor version - // 1. when received (from ArionMaster) neighbor version is greater than current version in map - // 2. and only if the element to update is the original element (version in 'find') - if (neighbor_task_map.assign_if_equal(neighbor_key, ver, cur_ver)) { + if (call->reply.has_neighbor_rule()) { + auto vni = call->reply.neighbor_rule().tunnel_id(); + auto vpc_ip = call->reply.neighbor_rule().ip(); + auto vpc_mac = call->reply.neighbor_rule().mac(); + auto host_ip = call->reply.neighbor_rule().hostip(); + auto host_mac = call->reply.neighbor_rule().hostmac(); + auto ver = call->reply.neighbor_rule().version(); + int fd = fd_neighbor_ebpf_map; + + // non-empty rule + if ("" != vpc_ip) { + marl::schedule([this, &i, vni, vpc_ip, vpc_mac, host_ip, host_mac, ver, fd, + &add_or_update_neighbor_db_stmt, &add_programmed_neighbor_version_db_stmt] { + // step #1 - check and store as in concurrent hash map + std::string neighbor_key = std::to_string(vni) + "-" + vpc_ip; + printf("vpc_ip is NOT empty: [%s]\n", vpc_ip.c_str()); + bool ebpf_ignored = false; + bool map_updated = false; + int update_ct = 0, max_update_ct = 5; + + while (!map_updated && (update_ct < max_update_ct)) { + printf("Inside while loop, map_updated = [%b], update_ct = [%ld], max_update_ct = [%ld]\n", + map_updated, update_ct, 
max_update_ct); + auto neighbor_pos = neighbor_task_map.find(neighbor_key); + if (neighbor_pos == neighbor_task_map.end()) { + // key not found, try insert. The function returns successful only when key not exists when inserting + auto res_insert = + neighbor_task_map.insert(neighbor_key, ver); + if (res_insert.second) { + // means successfully inserted, done with update map_updated = true; - } + printf("Found neighbor key in neighbor_task_map\n"); + } // 'else' means another thread already inserted before me, then it's not an insert case and next time in the loop will go to case of update } else { - // otherwise - // ignore: - // 1. update concurrent hash map - // 2. update ebpf map to not overwrite new data with out dated data - // 3. update local db table 1 (table 1 is for local lookup) since it is an old version - // update: journal table (since this skipped version is treated as programming succeeded) - ebpf_ignored = true; - map_updated = true; + printf("Didn't find neighbor key in neighbor_task_map\n"); + // key found, means multi neighbor versions might update at the same time + int cur_ver = neighbor_pos->second; + + if (ver > cur_ver) { + // only update neighbor version + // 1. when received (from ArionMaster) neighbor version is greater than current version in map + // 2. and only if the element to update is the original element (version in 'find') + if (neighbor_task_map.assign_if_equal(neighbor_key, ver, cur_ver)) { + map_updated = true; + } + } else { + // otherwise + // ignore: + // 1. update concurrent hash map + // 2. update ebpf map to not overwrite new data with out dated data + // 3. 
update local db table 1 (table 1 is for local lookup) since it is an old version + // update: journal table (since this skipped version is treated as programming succeeded) + ebpf_ignored = true; + map_updated = true; + } } + + update_ct++; } - update_ct++; - } - - if (map_updated) { - if (!ebpf_ignored) { - printf("ebpf_ignored = false\n"); - // step #2 - sync syscall ebpf map programming with return code - endpoint_key_t epkey; - epkey.vni = vni; - struct sockaddr_in ep_ip; - inet_pton(AF_INET, vpc_ip.c_str(), &(ep_ip.sin_addr)); - epkey.ip = ep_ip.sin_addr.s_addr; - printf("Filled in ep.ip\n"); - endpoint_t ep; - struct sockaddr_in ep_hip; - inet_pton(AF_INET, host_ip.c_str(), &(ep_hip.sin_addr)); - ep.hip = ep_hip.sin_addr.s_addr; - printf("Filled in ep.hip\n"); - - std::sscanf(vpc_mac.c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", - &ep.mac[0], &ep.mac[1], &ep.mac[2], - &ep.mac[3], &ep.mac[4], &ep.mac[5]); - printf("Filled in ep.mac\n"); - - std::sscanf(host_mac.c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", - &ep.hmac[0], &ep.hmac[1], &ep.hmac[2], - &ep.hmac[3], &ep.hmac[4], &ep.hmac[5]); - printf("Filled in ep.hmac\n"); - - //disabling the element udpate, so that all packets will be sent to user space program. 
- - int ebpf_rc = 0;//bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); - // also put in local in memory cache - db_client::get_instance().endpoint_cache[epkey] = ep;//.insert(epkey, ep); - printf("GPPC: Inserted this neighbor into map: vip: %s, vni: %d\n", vpc_ip.c_str(), vni); - // step #3 - async call to write/update to local db table 1 - db_client::get_instance().local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { - get<0>(add_or_update_neighbor_db_stmt) = { vni, vpc_ip, host_ip, vpc_mac, host_mac, ver }; - db_client::get_instance().local_db.execute(add_or_update_neighbor_db_stmt); - }); - printf("Dispatched local db neighbor insert\n"); - // step #4 (case 1) - when ebpf programming not ignored, write to table 2 (programming journal) when programming succeeded - if (0 == ebpf_rc) { - db_client::get_instance().local_db_writer_queue.dispatch([ver, &add_programmed_version_db_stmt] { - get<0>(add_programmed_version_db_stmt) = { ver }; - db_client::get_instance().local_db.execute(add_programmed_version_db_stmt); + if (map_updated) { + if (!ebpf_ignored) { + printf("ebpf_ignored = false\n"); + // step #2 - sync syscall ebpf map programming with return code + endpoint_key_t epkey; + epkey.vni = vni; + struct sockaddr_in ep_ip; + inet_pton(AF_INET, vpc_ip.c_str(), &(ep_ip.sin_addr)); + epkey.ip = ep_ip.sin_addr.s_addr; + printf("Filled in ep.ip\n"); + endpoint_t ep; + struct sockaddr_in ep_hip; + inet_pton(AF_INET, host_ip.c_str(), &(ep_hip.sin_addr)); + ep.hip = ep_hip.sin_addr.s_addr; + printf("Filled in ep.hip\n"); + + std::sscanf(vpc_mac.c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", + &ep.mac[0], &ep.mac[1], &ep.mac[2], + &ep.mac[3], &ep.mac[4], &ep.mac[5]); + printf("Filled in ep.mac\n"); + + std::sscanf(host_mac.c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", + &ep.hmac[0], &ep.hmac[1], &ep.hmac[2], + &ep.hmac[3], &ep.hmac[4], &ep.hmac[5]); + printf("Filled in ep.hmac\n"); + + //disabling the element udpate, so that 
all packets will be sent to user space program. + + int ebpf_rc = bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); + // also put in local in memory cache + db_client::get_instance().endpoint_cache[epkey] = ep;//.insert(epkey, ep); + printf("GPPC: Inserted this neighbor into map: vip: %s, vni: %d\n", vpc_ip.c_str(), vni); + // step #3 - async call to write/update to local db table 1 + db_client::get_instance().local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { + get<0>(add_or_update_neighbor_db_stmt) = { vni, vpc_ip, host_ip, vpc_mac, host_mac, ver }; + db_client::get_instance().local_db.execute(add_or_update_neighbor_db_stmt); + }); + printf("Dispatched local db neighbor insert\n"); + // step #4 (case 1) - when ebpf programming not ignored, write to table 2 (programming journal) when programming succeeded + if (0 == ebpf_rc) { + db_client::get_instance().local_db_writer_queue.dispatch([ver, &add_programmed_neighbor_version_db_stmt] { + get<0>(add_programmed_neighbor_version_db_stmt) = { ver }; + db_client::get_instance().local_db.execute( + add_programmed_neighbor_version_db_stmt); + }); + } + printf("Dispatched local db journal insert\n"); + } else { + printf("ebpf_ignored = true\n"); + // step #4 (case 2) - always write to local db table 2 (programming journal) when version intended ignored (no need to program older version) + db_client::get_instance().local_db_writer_queue.dispatch([ver, &add_programmed_neighbor_version_db_stmt] { + get<0>(add_programmed_neighbor_version_db_stmt) = { ver }; + db_client::get_instance().local_db.execute( + add_programmed_neighbor_version_db_stmt); }); } - printf("Dispatched local db journal insert\n"); } else { - printf("ebpf_ignored = true\n"); - // step #4 (case 2) - always write to local db table 2 (programming journal) when version intended ignored (no need to program older version) - db_client::get_instance().local_db_writer_queue.dispatch([ver, 
&add_programmed_version_db_stmt] { - get<0>(add_programmed_version_db_stmt) = { ver }; - db_client::get_instance().local_db.execute(add_programmed_version_db_stmt); - }); + printf("Failed to update neighbor %d %s in map, skipping it\n", vni, vpc_ip.c_str()); } - } else { - printf("Failed to update neighbor %d %s in map, skipping it\n", vni, vpc_ip.c_str()); - } - i++; - }); + i++; + }); + } else { + printf("vpc_ip is empty\n"); + } + } else if (call->reply.has_securitygrouprule()) { + // only write security group rule to local DB, the actually ebpf map insert will + // happen when a port binding message is sent down. We use the port_id (vni-vpc_ip) + // for the SG rule's local IP, and use the securitygroupid to lookup the rest of info + // for sg_cidr_key_t and sg_cidr_t + auto security_group_id = call->reply.securitygrouprule().securitygroupid(); + auto remote_group_id = call->reply.securitygrouprule().remotegroupid(); + auto direction = call->reply.securitygrouprule().direction(); + auto remote_ip_prefix = call->reply.securitygrouprule().remoteipprefix(); + auto protocol = call->reply.securitygrouprule().protocol(); + auto port_range_max = call->reply.securitygrouprule().portrangemax(); + auto port_range_min = call->reply.securitygrouprule().portrangemin(); + auto ether_type = call->reply.securitygrouprule().ethertype(); + auto vni = call->reply.securitygrouprule().vni(); + auto version = call->reply.securitygrouprule().version(); + + // non-empty rule + if ("" != security_group_id) { + db_client::get_instance().local_db_writer_queue.dispatch( + [security_group_id, remote_group_id, direction, remote_ip_prefix, protocol, port_range_max, port_range_min, &add_or_update_security_group_rule_db_stmt, ether_type, vni, version] { + get<0>(add_or_update_security_group_rule_db_stmt) = + { security_group_id, remote_group_id, direction, remote_ip_prefix, protocol, port_range_max, port_range_min, ether_type, vni, version }; + 
db_client::get_instance().local_db.execute(add_or_update_security_group_rule_db_stmt); + }); + printf("Dispatched local db security group rule insert\n"); + // step #4 (case 1) - when ebpf programming not ignored, write to table 2 (programming journal) when programming succeeded + db_client::get_instance().local_db_writer_queue.dispatch([version, &add_programmed_security_group_rule_version_db_stmt] { + get<0>(add_programmed_security_group_rule_version_db_stmt) = { version }; + db_client::get_instance().local_db.execute( + add_programmed_security_group_rule_version_db_stmt); + }); + printf("Dispatched local db security group journal insert\n"); + } else { + printf("security group id is empty\n"); + } + } else if (call->reply.has_securitygroupportbinding()) { + printf("Security group port binding feather to be implemented\n"); } else { - printf("vpc_ip is empty\n"); + printf("This reply doesn't have a neighbor rule, a security group rule, or a security group port binding\n"); } + } } else { printf("NOT okay\n"); @@ -284,8 +336,20 @@ void ArionMasterWatcherImpl::RunClient(std::string ip, std::string port, std::st db_client::get_instance().FillEndpointCacheFromDB(); this->ConnectToArionMaster(); grpc::CompletionQueue cq; - ArionWingRequest watch_req; - watch_req.set_group(group_id); - watch_req.set_rev(rev_lkg); - this->RequestNeighborRules(&watch_req, &cq); + // This vector includes the Arion Requests that will be sent to Arion Master + vector arion_request_vector; + ArionWingRequest neighbor_watch_req; + neighbor_watch_req.set_map("NeighborRule"); + neighbor_watch_req.set_group(group_id); + neighbor_watch_req.set_rev(rev_lkg); + + ArionWingRequest security_group_rule_watch_req; + security_group_rule_watch_req.set_map("SecurityGroupRule"); + // set version 0 for now. + security_group_rule_watch_req.set_rev(0); + // set empty group rule for now. 
+ security_group_rule_watch_req.set_group(""); + arion_request_vector.emplace_back(&neighbor_watch_req); + arion_request_vector.emplace_back(&security_group_rule_watch_req); + this->RequestArionMaster(&arion_request_vector, &cq); } From 8d31868608e222d62872ee2bc9dca53a7dbd20ea Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Mon, 23 Jan 2023 18:05:38 -0800 Subject: [PATCH 23/33] Added support for adding security group port binding to grpc client; need to be tested --- include/db_client.h | 44 ++++++++-- src/comm/grpc_client.cpp | 171 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 200 insertions(+), 15 deletions(-) diff --git a/include/db_client.h b/include/db_client.h index 3e69f5b..dca5a46 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -39,6 +39,7 @@ struct NeibghborProgrammingState { struct SecurityGroupPortBinding { std::string port_id; // vni-vpc_ip std::string security_group_id; + int version; }; // local db table 3, stores the mapping between port and security group, 1 group can have multiple rules. 
struct SecurityGroupRule { @@ -54,7 +55,7 @@ struct SecurityGroupRule { int version; }; // local db table 3, security group rule table that stores the latest security group rules (if there are version updates per neighbor) received from ArionMaster -struct SecurityGroupProgrammingState { +struct SecurityGroupPortBindingProgrammingState { int version; }; // local db table 2 - security rule ebpf programmed version @@ -71,6 +72,21 @@ struct EndpointEqual { } }; +struct SecurityGroupRuleHash { + size_t operator()(const sg_cidr_key_t &e) const{ + return std::hash<__u32>()(e.prefixlen) ^ std::hash<__u32>()(e.vni) ^ std::hash<__u16>()(e.port) ^ + std::hash<__u8>()(e.direction) ^ std::hash<__u8>()(e.protocol) ^ std::hash<__u32>()(e.local_ip) ^ + std::hash<__u32>()(e.remote_ip); + } +}; + +struct SecurityGroupRuleEqual { + bool operator() (const sg_cidr_key_t &e, const sg_cidr_key_t &f) const { + return (e.remote_ip == f.remote_ip) && (e.local_ip == f.local_ip) && (e.protocol == f.protocol) && + (e.direction == f.direction) && (e.port == f.port) && (e.vni == f.vni) && (e.prefixlen == f.prefixlen); + } +}; + static std::string g_local_db_path = "/var/local/arion/arion_wing.db"; inline auto make_storage_query () { @@ -101,9 +117,17 @@ inline auto make_storage_query () { make_column("version", &SecurityGroupRule::version), primary_key(&SecurityGroupRule::remote_group_id) ), + make_table("security_group_port_binding", + make_column("port_id", &SecurityGroupPortBinding::port_id), + make_column("security_group_id", &SecurityGroupPortBinding::security_group_id), + make_column("version", &SecurityGroupPortBinding::version), + primary_key(&SecurityGroupPortBinding::port_id, &SecurityGroupPortBinding::security_group_id, &SecurityGroupPortBinding::version) + ), + // 1 version is written when all related SecurityGroupRules of a SecurityGroupPortBinding + // is programmed into the eBPF map and written into the DB. 
make_table("journal_security_group_rules", - make_column("version", &SecurityGroupProgrammingState::version), - primary_key(&SecurityGroupProgrammingState::version) + make_column("version", &SecurityGroupPortBindingProgrammingState::version), + primary_key(&SecurityGroupPortBindingProgrammingState::version) ) ); }; @@ -120,13 +144,23 @@ class db_client { Storage local_db = make_storage_query(); using NeighborPrepareStatement = decltype(local_db.prepare(select(columns(&Neighbor::host_ip, &Neighbor::vpc_mac, &Neighbor::host_mac), where(is_equal((&Neighbor::vni), 0) and is_equal((&Neighbor::vpc_ip), "127.0.0.1"))))); - NeighborPrepareStatement query_neighbor_statement = local_db.prepare(select(columns(&Neighbor::host_ip, &Neighbor::vpc_mac, &Neighbor::host_mac), - where(is_equal((&Neighbor::vni), 0) and is_equal((&Neighbor::vpc_ip), "127.0.0.1")))); + NeighborPrepareStatement query_neighbor_statement = local_db.prepare( + select( + columns(&Neighbor::host_ip, &Neighbor::vpc_mac, &Neighbor::host_mac), + where( + is_equal((&Neighbor::vni), 0) + and + is_equal((&Neighbor::vpc_ip), "127.0.0.1") + ) + ) + ); // Create local db writer single thread execution queue dispatch_queue local_db_writer_queue = dispatch_queue("Local db background write queue", 1); std::unordered_map endpoint_cache; + std::unordered_map sg_rule_cache; + // function that will be called at the beginning of the program, reads rows from the neighbor table // and fills the in-memory endpoint cache, which is used for fast lookup. 
diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 35476c6..167b09a 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -47,10 +47,17 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ arion::schema::NeighborRule reply; // prepared statements for better performance of db writing in completion queue + + // neighbor state, has ebpf map and local db table auto add_or_update_neighbor_db_stmt = db_client::get_instance().local_db.prepare(replace(Neighbor{ 0, "", "", "", "", 0 })); auto add_programmed_neighbor_version_db_stmt = db_client::get_instance().local_db.prepare(insert(NeibghborProgrammingState{ 0 })); + + // security group rules, has local db, but NOT ebpf map auto add_or_update_security_group_rule_db_stmt = db_client::get_instance().local_db.prepare(replace(::SecurityGroupRule{ "", "", "", "", "", 0, 0, "", 0, 0 })); - auto add_programmed_security_group_rule_version_db_stmt = db_client::get_instance().local_db.prepare(insert(SecurityGroupProgrammingState {0})); + + // security group port binding, has local db, needs to query security group rules to insert into eBPF map. 
+ auto add_or_update_security_group_port_binding_stmt = db_client::get_instance().local_db.prepare(replace(::SecurityGroupPortBinding{"", ""})); + auto add_programmed_security_group_port_binding_version_db_stmt = db_client::get_instance().local_db.prepare(insert(SecurityGroupPortBindingProgrammingState{ 0 })); // check current grpc channel state, try to connect if needed grpc_connectivity_state current_state = chan_->GetState(true); @@ -243,19 +250,163 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ db_client::get_instance().local_db.execute(add_or_update_security_group_rule_db_stmt); }); printf("Dispatched local db security group rule insert\n"); - // step #4 (case 1) - when ebpf programming not ignored, write to table 2 (programming journal) when programming succeeded - db_client::get_instance().local_db_writer_queue.dispatch([version, &add_programmed_security_group_rule_version_db_stmt] { - get<0>(add_programmed_security_group_rule_version_db_stmt) = { version }; - db_client::get_instance().local_db.execute( - add_programmed_security_group_rule_version_db_stmt); - }); - printf("Dispatched local db security group journal insert\n"); } else { printf("security group id is empty\n"); } } else if (call->reply.has_securitygroupportbinding()) { - printf("Security group port binding feather to be implemented\n"); - } else { + auto port_id = call->reply.securitygroupportbinding().portid(); + auto security_group_id = call->reply.securitygroupportbinding().securitygroupid(); + auto version = call->reply.securitygroupportbinding().version(); + int fd = fd_security_group_ebpf_map; + + // non-empty rule + if ("" != port_id && "" != security_group_id) { + marl::schedule([this, &i, port_id, security_group_id, version, fd, + &add_or_update_security_group_port_binding_stmt, &add_programmed_security_group_port_binding_version_db_stmt] { + // step #0 - split the port id into vni and vpc_id, then get the security group rules based on the security group id + 
std::string delimiter = "-"; //because port_id is in the format of "vni-vpc_id" + std::string vni = port_id.substr(0, port_id.find(delimiter)); + std::string vpc_ip = port_id.substr(port_id.find(delimiter) + 1); + // step #1 - check and store as in concurrent hash map + std::string security_group_port_binding_id = port_id + "-" + security_group_id; + printf("vpc_ip is NOT empty: [%s]\n", vpc_ip.c_str()); + bool ebpf_ignored = false; + bool map_updated = false; + int update_ct = 0, max_update_ct = 5; + + while (!map_updated && (update_ct < max_update_ct)) { + printf("Inside while loop, map_updated = [%b], update_ct = [%ld], max_update_ct = [%ld]\n", + map_updated, update_ct, max_update_ct); + auto sg_pos = security_group_rule_task_map.find(security_group_port_binding_id); + if (sg_pos == security_group_rule_task_map.end()) { + // key not found, try insert. The function returns successful only when key not exists when inserting + auto res_insert = + security_group_rule_task_map.insert(security_group_port_binding_id, version); + if (res_insert.second) { + // means successfully inserted, done with update + map_updated = true; + printf("Found neighbor key in security_group_rule_task_map\n"); + } // 'else' means another thread already inserted before me, then it's not an insert case and next time in the loop will go to case of update + } else { + printf("Didn't find neighbor key in security_group_rule_task_map\n"); + // key found, means multi neighbor versions might update at the same time + int cur_ver = sg_pos->second; + + if (version > cur_ver) { + // only update neighbor version + // 1. when received (from ArionMaster) neighbor version is greater than current version in map + // 2. and only if the element to update is the original element (version in 'find') + if (security_group_rule_task_map.assign_if_equal(security_group_port_binding_id, version, cur_ver)) { + map_updated = true; + } + } else { + // otherwise + // ignore: + // 1. update concurrent hash map + // 2. 
update ebpf map to not overwrite new data with out dated data + // 3. update local db table 1 (table 1 is for local lookup) since it is an old version + // update: journal table (since this skipped version is treated as programming succeeded) + ebpf_ignored = true; + map_updated = true; + } + } + + update_ct++; + } + + if (map_updated) { + if (!ebpf_ignored) { + printf("ebpf_ignored = false\n"); + // step 1.5 get all related security group rules. + auto rows = db_client::get_instance().local_db.get_all<::SecurityGroupRule>( + where( +// is_equal( + c(&::SecurityGroupRule::security_group_id) == security_group_id.c_str() +// ) + ) + ); +// printf("Retrieved %ld rows of security group rules with security group id == [%s]\n", rows.size(), security_group_id.c_str()); + int ebpf_rc = 0; + /* + for (auto &rule : rows) { + // step #2 - sync syscall ebpf map programming with return code + string remote_ip; + int prefixlen = 0 ; + remote_ip = rule.remote_ip_prefix.substr(0, rule.remote_ip_prefix.find("/")); + prefixlen = atoi((rule.remote_ip_prefix.substr(rule.remote_ip_prefix.find("/") + 1).c_str())); + + struct sockaddr_in local_ip_sock, remote_ip_sock; + inet_pton(AF_INET, vpc_ip.c_str(), &(local_ip_sock.sin_addr)); + inet_pton(AF_INET, remote_ip.c_str(), &(local_ip_sock.sin_addr)); + sg_cidr_key_t sg_key; + sg_key.vni = atoi(vni.c_str()); + sg_key.prefixlen = prefixlen + 96; // 96 = ( __u32 vni; + __u16 port; + __u8 direction; + __u8 protocol; + __u32 local_ip; ) + sg_key.remote_ip = remote_ip_sock.sin_addr.s_addr; + sg_key.local_ip = local_ip_sock.sin_addr.s_addr; + sg_key.direction = rule.direction == "out" ? 
0 : 1; // going out is 0 and coming in is 1 + + if (rule.protocol == "TCP") { + sg_key.protocol = IPPROTO_TCP; + } else if (rule.protocol == "UDP") { + sg_key.protocol = IPPROTO_UDP; + } else { + sg_key.protocol = IPPROTO_NONE; + } + + sg_key.port = rule.port_range_min; //TODO: see if we should use this or other fields + + sg_cidr_t sg_value; + sg_value.sg_id = 1; + sg_value.action = 1; // 1 for allow and other values for drop + + int single_ebpf_rc = bpf_map_update_elem(fd, &sg_key, &sg_value, BPF_ANY); + if (single_ebpf_rc != 0) { + ebpf_rc = single_ebpf_rc; + printf("Tried to insert into sg rule ebpf map, but got RC: [%ld], errno: [%s]\n", single_ebpf_rc, std::strerror(errno)); + } + // also put in local in memory cache + db_client::get_instance().sg_rule_cache[sg_key] = sg_value;//.insert(epkey, ep); + printf("GPPC: Inserted this neighbor into map: vip: %s, vni: %s\n", vpc_ip.c_str(), vni.c_str()); + + } + // step #3 - async call to write/update to local db table + db_client::get_instance().local_db_writer_queue.dispatch([security_group_id, port_id, version, &add_or_update_security_group_port_binding_stmt] { + get<0>(add_or_update_security_group_port_binding_stmt) = { port_id, security_group_id }; + db_client::get_instance().local_db.execute(add_or_update_security_group_port_binding_stmt); + }); + printf("Dispatched local db neighbor insert\n"); + + // step #4 (case 1) - when ebpf programming not ignored, write to table 2 (programming journal) when programming succeeded + if (0 == ebpf_rc) { + db_client::get_instance().local_db_writer_queue.dispatch([version, &add_programmed_security_group_port_binding_version_db_stmt] { + get<0>(add_programmed_security_group_port_binding_version_db_stmt) = { version }; + db_client::get_instance().local_db.execute( + add_programmed_security_group_port_binding_version_db_stmt); + }); + } else { + printf("ebpf_rc = [%ld], this version isn't finished, NOT updating the local DB.\n", ebpf_rc); + } + printf("Dispatched local db 
journal insert\n"); + */ + + } else { + printf("ebpf_ignored = true\n"); + // step #4 (case 2) - always write to local db table 2 (programming journal) when version intended ignored (no need to program older version) + db_client::get_instance().local_db_writer_queue.dispatch([version, &add_programmed_security_group_port_binding_version_db_stmt] { + get<0>(add_programmed_security_group_port_binding_version_db_stmt) = { version }; + db_client::get_instance().local_db.execute( + add_programmed_security_group_port_binding_version_db_stmt); + }); + } + } else { + printf("Failed to update neighbor %d %s in map, skipping it\n", vni.c_str(), vpc_ip.c_str()); + } + + i++; + }); + } else { + printf("port_id [%s] or security_group_id: [%s] is empty\n", port_id.c_str(), security_group_id.c_str()); + } } else { printf("This reply doesn't have a neighbor rule, a security group rule, or a security group port binding\n"); } From 075f5e1910b58fa6fc1324054e4b3142a6c36382 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Wed, 25 Jan 2023 10:37:26 -0800 Subject: [PATCH 24/33] Moved sync_schema() to the right place; fixed creation of the LPM Trie ebpf map --- include/db_client.h | 83 +++++++++++++++++++++------------------- src/comm/grpc_client.cpp | 14 +++---- 2 files changed, 48 insertions(+), 49 deletions(-) diff --git a/include/db_client.h b/include/db_client.h index dca5a46..1fb0420 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -90,46 +90,48 @@ struct SecurityGroupRuleEqual { static std::string g_local_db_path = "/var/local/arion/arion_wing.db"; inline auto make_storage_query () { - return make_storage(g_local_db_path, - make_table("neighbor", - make_column("vni", &Neighbor::vni), - make_column("vpc_ip", &Neighbor::vpc_ip), - make_column("host_ip", &Neighbor::host_ip), - make_column("vpc_mac", &Neighbor::vpc_mac), - make_column("host_mac", &Neighbor::host_mac), - make_column("version", &Neighbor::version), - primary_key(&Neighbor::vni, &Neighbor::vpc_ip) - ), - 
make_table("journal_neighbor", - make_column("version", &NeibghborProgrammingState::version), - primary_key(&NeibghborProgrammingState::version) - ), - make_table("security_group_rule", - make_column("security_group_id", &SecurityGroupRule::security_group_id), - make_column("remote_group_id", &SecurityGroupRule::remote_group_id), - make_column("direction", &SecurityGroupRule::direction), - make_column("remote_ip_prefix", &SecurityGroupRule::remote_ip_prefix), - make_column("protocol", &SecurityGroupRule::protocol), - make_column("port_range_max", &SecurityGroupRule::port_range_max), - make_column("port_range_min", &SecurityGroupRule::port_range_min), - make_column("ether_type", &SecurityGroupRule::ether_type), - make_column("vni", &SecurityGroupRule::vni), - make_column("version", &SecurityGroupRule::version), - primary_key(&SecurityGroupRule::remote_group_id) - ), - make_table("security_group_port_binding", - make_column("port_id", &SecurityGroupPortBinding::port_id), - make_column("security_group_id", &SecurityGroupPortBinding::security_group_id), - make_column("version", &SecurityGroupPortBinding::version), - primary_key(&SecurityGroupPortBinding::port_id, &SecurityGroupPortBinding::security_group_id, &SecurityGroupPortBinding::version) - ), - // 1 version is written when all related SecurityGroupRules of a SecurityGroupPortBinding - // is programmed into the eBPF map and written into the DB. 
- make_table("journal_security_group_rules", - make_column("version", &SecurityGroupPortBindingProgrammingState::version), - primary_key(&SecurityGroupPortBindingProgrammingState::version) - ) - ); + auto storage = make_storage(g_local_db_path, + make_table("neighbor", + make_column("vni", &Neighbor::vni), + make_column("vpc_ip", &Neighbor::vpc_ip), + make_column("host_ip", &Neighbor::host_ip), + make_column("vpc_mac", &Neighbor::vpc_mac), + make_column("host_mac", &Neighbor::host_mac), + make_column("version", &Neighbor::version), + primary_key(&Neighbor::vni, &Neighbor::vpc_ip) + ), + make_table("journal_neighbor", + make_column("version", &NeibghborProgrammingState::version), + primary_key(&NeibghborProgrammingState::version) + ), + make_table("security_group_rule", + make_column("security_group_id", &SecurityGroupRule::security_group_id), + make_column("remote_group_id", &SecurityGroupRule::remote_group_id), + make_column("direction", &SecurityGroupRule::direction), + make_column("remote_ip_prefix", &SecurityGroupRule::remote_ip_prefix), + make_column("protocol", &SecurityGroupRule::protocol), + make_column("port_range_max", &SecurityGroupRule::port_range_max), + make_column("port_range_min", &SecurityGroupRule::port_range_min), + make_column("ether_type", &SecurityGroupRule::ether_type), + make_column("vni", &SecurityGroupRule::vni), + make_column("version", &SecurityGroupRule::version), + primary_key(&SecurityGroupRule::remote_group_id) + ), + make_table("security_group_port_binding", + make_column("port_id", &SecurityGroupPortBinding::port_id), + make_column("security_group_id", &SecurityGroupPortBinding::security_group_id), + make_column("version", &SecurityGroupPortBinding::version), + primary_key(&SecurityGroupPortBinding::port_id, &SecurityGroupPortBinding::security_group_id, &SecurityGroupPortBinding::version) + ), + // 1 version is written when all related SecurityGroupRules of a SecurityGroupPortBinding + // is programmed into the eBPF map and written 
into the DB. + make_table("journal_security_group_rules", + make_column("version", &SecurityGroupPortBindingProgrammingState::version), + primary_key(&SecurityGroupPortBindingProgrammingState::version) + ) + ); + storage.sync_schema(); + return storage; }; using Storage = decltype(make_storage_query()); @@ -142,6 +144,7 @@ class db_client { }; Storage local_db = make_storage_query(); + using NeighborPrepareStatement = decltype(local_db.prepare(select(columns(&Neighbor::host_ip, &Neighbor::vpc_mac, &Neighbor::host_mac), where(is_equal((&Neighbor::vni), 0) and is_equal((&Neighbor::vpc_ip), "127.0.0.1"))))); NeighborPrepareStatement query_neighbor_statement = local_db.prepare( diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 167b09a..7f392d3 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -458,16 +458,15 @@ void ArionMasterWatcherImpl::RunClient(std::string ip, std::string port, std::st if (fd_security_group_ebpf_map < 0) { printf("Creating security_group_ebpf_map manually\n"); - struct bpf_lpm_trie_key *security_group_key; size_t key_size_security_group; - key_size_security_group = sizeof(*security_group_key) + sizeof(__u32); + key_size_security_group = sizeof(sg_cidr_key_t); - printf("Key size: %ld, value size: %ld\n", key_size_security_group, sizeof(security_group_rule_t)); + printf("Key size: %ld, value size: %ld\n", key_size_security_group, sizeof(sg_cidr_t)); fd_security_group_ebpf_map = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, - key_size_security_group/*sizeof(security_group_rule_key_t)*/, - sizeof(security_group_rule_t), + key_size_security_group, + sizeof(sg_cidr_t), 999, // need to change it to a bigger number later. 
- 0); + BPF_F_NO_PREALLOC); if (fd_security_group_ebpf_map <= 0) { printf("Tried to manually create security group map, but failed with fd: %ld, and error no: %s, returning\n", @@ -478,9 +477,6 @@ void ArionMasterWatcherImpl::RunClient(std::string ip, std::string port, std::st } - // Create (if db not exists) or connect (if db exists already) to local db - db_client::get_instance().local_db.sync_schema(); - // Find lkg version to reconcile/sync from server int rev_lkg = db_client::get_instance().FindLKGVersion(); printf("Found last known good version: %d from local db to sync from server\n", rev_lkg); From af57a0091951a52c457d0ff929a607b05b943466 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Thu, 26 Jan 2023 17:02:42 -0800 Subject: [PATCH 25/33] With this commit, arion agent is able to get security_group_rule and security_group_port_binding info from arion master, and program into the sg_cidr_map ebpf map --- include/db_client.h | 4 +++- src/comm/grpc_client.cpp | 50 +++++++++++++++++++++++++++++----------- src/main.cpp | 3 ++- 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/include/db_client.h b/include/db_client.h index 1fb0420..92aedbb 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -219,6 +219,7 @@ class db_client { // inet_ntoa(endpoint_host_ip_socket.sin_addr) // ); // printf("Finished one endpoint\n"); + /* security_group_key_t sg_key; sg_key.vni = vni; sg_key.ip = endpoint_vpc_ip_socket.sin_addr.s_addr; @@ -242,7 +243,8 @@ class db_client { if (sg_map_insert_rc != 0) { printf("Error for inserting into lpm map: %s", std::strerror(errno)); } - printf("Sg map insert rc: %ld\n", sg_map_insert_rc); + */ +// printf("Sg map insert rc: %ld\n", sg_map_insert_rc); } diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 7f392d3..a4a1fe2 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -43,8 +43,8 @@ using namespace arion::schema; void ArionMasterWatcherImpl::RequestArionMaster(vector *request_vector, 
grpc::CompletionQueue *cq) { - grpc::ClientContext ctx; - arion::schema::NeighborRule reply; +// grpc::ClientContext ctx; +// arion::schema::NeighborRule reply; // prepared statements for better performance of db writing in completion queue @@ -75,7 +75,7 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ int tag_watch = 1; printf("Completion queue: initial task, async watch\n"); - stub_->AsyncWatch(&call->context, cq, (void*) tag_watch); +// stub_->AsyncWatch(&call->context, cq, (void*) tag_watch); call->stream = stub_->AsyncWatch(&call->context, cq, (void*)tag_watch); // start time @@ -83,6 +83,7 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ std::atomic i(tag_watch + 1); bool write_done = false; + int current_write_request_index = 0; while (cq->Next(&got_tag, &ok)) { printf("Read one from grpc stream\n"); if (ok) { @@ -90,14 +91,26 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ printf("Completion queue: initial task response received\n"); printf("Completion queue: write async watch ArionWingRequest of [group, revision] to stream\n"); + call->stream->Write(*(request_vector->at(current_write_request_index)), (void*)tag_watch); + printf("Just wrote request with rev: [%ld], map: [%s] and group id: [%s]\n", + (request_vector->at(current_write_request_index))->rev(), + (request_vector->at(current_write_request_index))->map().c_str(), + (request_vector->at(current_write_request_index)->group().c_str()) + ); + current_write_request_index ++; + if (current_write_request_index == request_vector->size()) { + write_done = true; + } + /* for (auto &request : *request_vector) { call->stream->Write(*request, (void*)tag_watch); - printf("Just wrote request with rev: [%ld], map: [%ld] and group id: [%ld]\n", + printf("Just wrote request with rev: [%ld], map: [%s] and group id: [%s]\n", request->rev(), request->map().c_str(), request->group().c_str() ); } write_done = true; + */ } else { call->stream->Read(&call->reply, got_tag); if 
(call->reply.has_neighbor_rule()) { @@ -206,7 +219,7 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ add_programmed_neighbor_version_db_stmt); }); } - printf("Dispatched local db journal insert\n"); + printf("Dispatched local db neighbor journal insert\n"); } else { printf("ebpf_ignored = true\n"); // step #4 (case 2) - always write to local db table 2 (programming journal) when version intended ignored (no need to program older version) @@ -327,7 +340,7 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ ); // printf("Retrieved %ld rows of security group rules with security group id == [%s]\n", rows.size(), security_group_id.c_str()); int ebpf_rc = 0; - /* + printf("Found %ld sg rules related to this ID: %s\n", rows.size(), security_group_id.c_str()); for (auto &rule : rows) { // step #2 - sync syscall ebpf map programming with return code string remote_ip; @@ -343,7 +356,7 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ sg_key.prefixlen = prefixlen + 96; // 96 = ( __u32 vni; + __u16 port; + __u8 direction; + __u8 protocol; + __u32 local_ip; ) sg_key.remote_ip = remote_ip_sock.sin_addr.s_addr; sg_key.local_ip = local_ip_sock.sin_addr.s_addr; - sg_key.direction = rule.direction == "out" ? 0 : 1; // going out is 0 and coming in is 1 + sg_key.direction = rule.direction == "egress" ? 
0 : 1; // going out is 0 and coming in is 1 if (rule.protocol == "TCP") { sg_key.protocol = IPPROTO_TCP; @@ -358,15 +371,18 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ sg_cidr_t sg_value; sg_value.sg_id = 1; sg_value.action = 1; // 1 for allow and other values for drop - + printf("Inserting sg rule with prefixlen: %ld VNI: %s, port: %ld, direction: %s, protocol: %s, local_ip: %s, remote_ip: %s\n", + prefixlen, vni.c_str(), rule.port_range_min, rule.direction.c_str(), rule.protocol.c_str(), vpc_ip.c_str(), remote_ip.c_str()); int single_ebpf_rc = bpf_map_update_elem(fd, &sg_key, &sg_value, BPF_ANY); if (single_ebpf_rc != 0) { ebpf_rc = single_ebpf_rc; printf("Tried to insert into sg rule ebpf map, but got RC: [%ld], errno: [%s]\n", single_ebpf_rc, std::strerror(errno)); + } else { + printf("Insert into sg eBPF map returned %ld\n", single_ebpf_rc); } // also put in local in memory cache db_client::get_instance().sg_rule_cache[sg_key] = sg_value;//.insert(epkey, ep); - printf("GPPC: Inserted this neighbor into map: vip: %s, vni: %s\n", vpc_ip.c_str(), vni.c_str()); + printf("GPPC: Inserted this sg rule into map: vip: %s, vni: %s\n", vpc_ip.c_str(), vni.c_str()); } // step #3 - async call to write/update to local db table @@ -374,7 +390,7 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ get<0>(add_or_update_security_group_port_binding_stmt) = { port_id, security_group_id }; db_client::get_instance().local_db.execute(add_or_update_security_group_port_binding_stmt); }); - printf("Dispatched local db neighbor insert\n"); + printf("Dispatched local db security group port binding insert\n"); // step #4 (case 1) - when ebpf programming not ignored, write to table 2 (programming journal) when programming succeeded if (0 == ebpf_rc) { @@ -386,8 +402,8 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ } else { printf("ebpf_rc = [%ld], this version isn't finished, NOT updating the local DB.\n", ebpf_rc); } - printf("Dispatched 
local db journal insert\n"); - */ + printf("Dispatched local db sg journal insert\n"); + } else { printf("ebpf_ignored = true\n"); @@ -487,8 +503,8 @@ void ArionMasterWatcherImpl::RunClient(std::string ip, std::string port, std::st vector arion_request_vector; ArionWingRequest neighbor_watch_req; neighbor_watch_req.set_map("NeighborRule"); - neighbor_watch_req.set_group(group_id); - neighbor_watch_req.set_rev(rev_lkg); + neighbor_watch_req.set_group(""/*group_id*/); + neighbor_watch_req.set_rev(0/*rev_lkg*/); ArionWingRequest security_group_rule_watch_req; security_group_rule_watch_req.set_map("SecurityGroupRule"); @@ -496,7 +512,13 @@ void ArionMasterWatcherImpl::RunClient(std::string ip, std::string port, std::st security_group_rule_watch_req.set_rev(0); // set empty group rule for now. security_group_rule_watch_req.set_group(""); + + ArionWingRequest security_group_port_binding_watch_req; + security_group_port_binding_watch_req.set_map("SecurityGroupPortBinding"); + security_group_port_binding_watch_req.set_rev(0); + security_group_port_binding_watch_req.set_group(""); arion_request_vector.emplace_back(&neighbor_watch_req); arion_request_vector.emplace_back(&security_group_rule_watch_req); + arion_request_vector.emplace_back(&security_group_port_binding_watch_req); this->RequestArionMaster(&arion_request_vector, &cq); } diff --git a/src/main.cpp b/src/main.cpp index 372feb7..eee37e0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -42,7 +42,7 @@ ArionMasterWatcherImpl *g_grpc_client = NULL; string g_arion_master_address = EMPTY_STRING; string g_arion_master_port = "9090"; string g_arion_neighbor_table = "/sys/fs/bpf/endpoints_map"; -string g_arion_security_group_table = "/sys/fs/bpf/security_group_map"; +string g_arion_security_group_table = "/sys/fs/bpf/sg_cidr_map"; //TODO: let ArionMaster figure out group from ArionWing IP (in grpc channel) string g_arion_group = "group1"; @@ -133,6 +133,7 @@ int main(int argc, char *argv[]) { // Create marl scheduler using 
all the logical processors available to the process. // Bind this scheduler to the main thread so we can call marl::schedule() marl::Scheduler::Config cfg_bind_hw_cores; + cfg_bind_hw_cores.setWorkerThreadCount(thread_pools_size * 2); marl::Scheduler task_scheduler(cfg_bind_hw_cores); task_scheduler.bind(); From 7dda69e12b0c1b6b76e4adb25a5e296474e5693d Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 27 Jan 2023 13:05:13 -0800 Subject: [PATCH 26/33] Added TODOs and made changes based on feedbacks. --- include/db_client.h | 16 ++++++++-------- src/comm/af_xdp_user_multi_thread.cpp | 11 ++++++++--- src/comm/grpc_client.cpp | 13 +++++++++---- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/include/db_client.h b/include/db_client.h index 92aedbb..72773c8 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -32,7 +32,7 @@ struct Neighbor { int version; }; // local db table 1 - neighbor info table that stores the latest neighbors (if there are version updates per neighbor) received from ArionMaster -struct NeibghborProgrammingState { +struct NeighborProgrammingState { int version; }; // local db table 2 - neighbor ebpf programmed version @@ -101,8 +101,8 @@ inline auto make_storage_query () { primary_key(&Neighbor::vni, &Neighbor::vpc_ip) ), make_table("journal_neighbor", - make_column("version", &NeibghborProgrammingState::version), - primary_key(&NeibghborProgrammingState::version) + make_column("version", &NeighborProgrammingState::version), + primary_key(&NeighborProgrammingState::version) ), make_table("security_group_rule", make_column("security_group_id", &SecurityGroupRule::security_group_id), @@ -264,14 +264,14 @@ class db_client { ); */ - using als_mo = alias_a; - using als_mi = alias_b; - auto ver_gaps = local_db.select(alias_column(&NeibghborProgrammingState::version), + using als_mo = alias_a; + using als_mi = alias_b; + auto ver_gaps = local_db.select(alias_column(&NeighborProgrammingState::version), from(), where(not exists( - 
select(0 - c(alias_column(&NeibghborProgrammingState::version)), + select(0 - c(alias_column(&NeighborProgrammingState::version)), from(), - where(is_equal(c(alias_column(&NeibghborProgrammingState::version)) + 1, alias_column(&NeibghborProgrammingState::version))) + where(is_equal(c(alias_column(&NeighborProgrammingState::version)) + 1, alias_column(&NeighborProgrammingState::version))) )))); // lkg version: diff --git a/src/comm/af_xdp_user_multi_thread.cpp b/src/comm/af_xdp_user_multi_thread.cpp index 70850ad..6118d4f 100644 --- a/src/comm/af_xdp_user_multi_thread.cpp +++ b/src/comm/af_xdp_user_multi_thread.cpp @@ -952,7 +952,9 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk inet_pton(AF_INET, inet_ntoa(inner_ip_src), &(src_ip.sin_addr)); epkey.ip = ep_ip.sin_addr.s_addr; - + /* + * TODO: implement SG Logic + * */ sg_cidr_key_t sg_key; sg_key.protocol = inner_ip->protocol; sg_key.local_ip = ep_ip.sin_addr.s_addr; @@ -1081,7 +1083,7 @@ static bool process_packet(void *pkt, uint32_t len/*,struct xsk_socket_info *xsk static void * -thread_func(void *arg) +run_af_xdp_socket(void *arg) { struct thread_data *t = static_cast(arg); cpu_set_t cpu_cores; @@ -1471,6 +1473,9 @@ void* af_xdp_user_multi_thread::run_af_xdp_multi_threaded(void* args/*int argc, printf("After leaving 8 cores for other applications, we are now setting the interface to have %ld AF_XDP sockets.\n", n_ports); + /* + * TODO: Make NIC name configurable. 
+ * */ string set_nic_queue_command_template = "ethtool -L enp4s0f1 combined %ld"; char set_nic_queue_command[100]; sprintf(set_nic_queue_command, "ethtool -L enp4s0f1 combined %ld", n_ports); @@ -1537,7 +1542,7 @@ void* af_xdp_user_multi_thread::run_af_xdp_multi_threaded(void* args/*int argc, status = pthread_create(&threads[i], NULL, - thread_func, + run_af_xdp_socket, &thread_data[i]); if (status) { printf("Thread %d creation failed.\n", i); diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index a4a1fe2..f487fef 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -50,7 +50,7 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ // neighbor state, has ebpf map and local db table auto add_or_update_neighbor_db_stmt = db_client::get_instance().local_db.prepare(replace(Neighbor{ 0, "", "", "", "", 0 })); - auto add_programmed_neighbor_version_db_stmt = db_client::get_instance().local_db.prepare(insert(NeibghborProgrammingState{ 0 })); + auto add_programmed_neighbor_version_db_stmt = db_client::get_instance().local_db.prepare(insert(NeighborProgrammingState{ 0 })); // security group rules, has local db, but NOT ebpf map auto add_or_update_security_group_rule_db_stmt = db_client::get_instance().local_db.prepare(replace(::SecurityGroupRule{ "", "", "", "", "", 0, 0, "", 0, 0 })); @@ -87,6 +87,10 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ while (cq->Next(&got_tag, &ok)) { printf("Read one from grpc stream\n"); if (ok) { + /* + * TODO: Associate call with the tag_watch, so that the write_done can be got rid of. + * + * */ if (!write_done) { printf("Completion queue: initial task response received\n"); @@ -333,9 +337,7 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ // step 1.5 get all related security group rules. 
auto rows = db_client::get_instance().local_db.get_all<::SecurityGroupRule>( where( -// is_equal( - c(&::SecurityGroupRule::security_group_id) == security_group_id.c_str() -// ) + c(&::SecurityGroupRule::security_group_id) == security_group_id.c_str() ) ); // printf("Retrieved %ld rows of security group rules with security group id == [%s]\n", rows.size(), security_group_id.c_str()); @@ -509,6 +511,9 @@ void ArionMasterWatcherImpl::RunClient(std::string ip, std::string port, std::st ArionWingRequest security_group_rule_watch_req; security_group_rule_watch_req.set_map("SecurityGroupRule"); // set version 0 for now. + /* TODO: Change the version and group name to valid ones, + Same group of neighbor and security group should have the same version + */ security_group_rule_watch_req.set_rev(0); // set empty group rule for now. security_group_rule_watch_req.set_group(""); From 0b101451699b29da4f012aa1febc14f8c5b0d9ca Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 27 Jan 2023 13:39:19 -0800 Subject: [PATCH 27/33] Added logic where when an sg rule is added, arion agent queries the related port bindings and programs the rules --- src/comm/grpc_client.cpp | 141 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 134 insertions(+), 7 deletions(-) diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index f487fef..329cac8 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -89,7 +89,7 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ if (ok) { /* * TODO: Associate call with the tag_watch, so that the write_done can be got rid of. 
- * + * * */ if (!write_done) { printf("Completion queue: initial task response received\n"); @@ -257,16 +257,143 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ auto ether_type = call->reply.securitygrouprule().ethertype(); auto vni = call->reply.securitygrouprule().vni(); auto version = call->reply.securitygrouprule().version(); + int fd = fd_security_group_ebpf_map; // non-empty rule if ("" != security_group_id) { - db_client::get_instance().local_db_writer_queue.dispatch( - [security_group_id, remote_group_id, direction, remote_ip_prefix, protocol, port_range_max, port_range_min, &add_or_update_security_group_rule_db_stmt, ether_type, vni, version] { - get<0>(add_or_update_security_group_rule_db_stmt) = - { security_group_id, remote_group_id, direction, remote_ip_prefix, protocol, port_range_max, port_range_min, ether_type, vni, version }; - db_client::get_instance().local_db.execute(add_or_update_security_group_rule_db_stmt); + marl::schedule([this, &i, security_group_id, version, fd, vni, remote_ip_prefix, direction, protocol, port_range_min, + remote_group_id, port_range_max, ether_type, &add_or_update_security_group_rule_db_stmt] { + // step #1 - check and store as in concurrent hash map + + bool ebpf_ignored = false; + bool map_updated = false; + int update_ct = 0, max_update_ct = 5; + + while (!map_updated && (update_ct < max_update_ct)) { + printf("Inside while loop, map_updated = [%b], update_ct = [%ld], max_update_ct = [%ld]\n", + map_updated, update_ct, max_update_ct); + auto sg_pos = security_group_rule_task_map.find(security_group_id); + if (sg_pos == security_group_rule_task_map.end()) { + // key not found, try insert. 
The function returns successful only when key not exists when inserting + auto res_insert = + security_group_rule_task_map.insert(security_group_id, version); + if (res_insert.second) { + // means successfully inserted, done with update + map_updated = true; + printf("Found neighbor key in security_group_rule_task_map\n"); + } // 'else' means another thread already inserted before me, then it's not an insert case and next time in the loop will go to case of update + } else { + printf("Didn't find neighbor key in security_group_rule_task_map\n"); + // key found, means multi neighbor versions might update at the same time + int cur_ver = sg_pos->second; + + if (version > cur_ver) { + // only update neighbor version + // 1. when received (from ArionMaster) neighbor version is greater than current version in map + // 2. and only if the element to update is the original element (version in 'find') + if (security_group_rule_task_map.assign_if_equal(security_group_id, version, cur_ver)) { + map_updated = true; + } + } else { + // otherwise + // ignore: + // 1. update concurrent hash map + // 2. update ebpf map to not overwrite new data with out dated data + // 3. update local db table 1 (table 1 is for local lookup) since it is an old version + // update: journal table (since this skipped version is treated as programming succeeded) + ebpf_ignored = true; + map_updated = true; + } + } + + update_ct++; + } + + if (map_updated) { + if (!ebpf_ignored) { + printf("ebpf_ignored = false\n"); + // step 1.5 get all related security group rules. 
+ auto rows = db_client::get_instance().local_db.get_all<::SecurityGroupPortBinding>( + where( + c(&::SecurityGroupPortBinding::security_group_id) == security_group_id.c_str() + ) + ); + // printf("Retrieved %ld rows of security group rules with security group id == [%s]\n", rows.size(), security_group_id.c_str()); + int ebpf_rc = 0; + printf("Found %ld sg rules related to this ID: %s\n", rows.size(), security_group_id.c_str()); + for (auto &binding : rows) { + // step #2 - sync syscall ebpf map programming with return code + std::string delimiter = "-"; //because port_id is in the format of "vni-vpc_id" + std::string vpc_ip = binding.port_id.substr(binding.port_id.find(delimiter) + 1); + string remote_ip; + int prefixlen = 0 ; + remote_ip = remote_ip_prefix.substr(0, remote_ip_prefix.find("/")); + prefixlen = atoi((remote_ip_prefix.substr(remote_ip_prefix.find("/") + 1).c_str())); + + struct sockaddr_in local_ip_sock, remote_ip_sock; + inet_pton(AF_INET, vpc_ip.c_str(), &(local_ip_sock.sin_addr)); + inet_pton(AF_INET, remote_ip.c_str(), &(local_ip_sock.sin_addr)); + sg_cidr_key_t sg_key; + sg_key.vni = vni; + sg_key.prefixlen = prefixlen + 96; // 96 = ( __u32 vni; + __u16 port; + __u8 direction; + __u8 protocol; + __u32 local_ip; ) + sg_key.remote_ip = remote_ip_sock.sin_addr.s_addr; + sg_key.local_ip = local_ip_sock.sin_addr.s_addr; + sg_key.direction = direction == "egress" ? 
0 : 1; // going out is 0 and coming in is 1 + + if (protocol == "TCP") { + sg_key.protocol = IPPROTO_TCP; + } else if (protocol == "UDP") { + sg_key.protocol = IPPROTO_UDP; + } else { + sg_key.protocol = IPPROTO_NONE; + } + + sg_key.port = port_range_min; //TODO: see if we should use this or other fields + + sg_cidr_t sg_value; + sg_value.sg_id = 1; + sg_value.action = 1; // 1 for allow and other values for drop + printf("Inserting sg rule with prefixlen: %ld VNI: %s, port: %ld, direction: %s, protocol: %s, local_ip: %s, remote_ip: %s\n", + prefixlen, vni, port_range_min, direction.c_str(), protocol.c_str(), vpc_ip.c_str(), remote_ip.c_str()); + int single_ebpf_rc = bpf_map_update_elem(fd, &sg_key, &sg_value, BPF_ANY); + if (single_ebpf_rc != 0) { + ebpf_rc = single_ebpf_rc; + printf("Tried to insert into sg rule ebpf map, but got RC: [%ld], errno: [%s]\n", single_ebpf_rc, std::strerror(errno)); + } else { + printf("Insert into sg eBPF map returned %ld\n", single_ebpf_rc); + } + // also put in local in memory cache + db_client::get_instance().sg_rule_cache[sg_key] = sg_value;//.insert(epkey, ep); + printf("GPPC: Inserted this sg rule into map: vip: %s, vni: %d\n", vpc_ip.c_str(), vni); + + } + + // step #3 - async call to write/update to local db table + db_client::get_instance().local_db_writer_queue.dispatch( + [security_group_id, remote_group_id, direction, remote_ip_prefix, protocol, port_range_max, port_range_min, &add_or_update_security_group_rule_db_stmt, ether_type, vni, version] { + get<0>(add_or_update_security_group_rule_db_stmt) = + { security_group_id, remote_group_id, direction, remote_ip_prefix, protocol, port_range_max, port_range_min, ether_type, vni, version }; + db_client::get_instance().local_db.execute(add_or_update_security_group_rule_db_stmt); + }); + printf("Dispatched local db security group port binding insert\n"); + + // no journal to write in this case. 
+ } else { + printf("ebpf_ignored = true\n"); + // step #4 (case 2) - always write to local db table 2 (programming journal) when version intended ignored (no need to program older version) + db_client::get_instance().local_db_writer_queue.dispatch( + [security_group_id, remote_group_id, direction, remote_ip_prefix, protocol, port_range_max, port_range_min, &add_or_update_security_group_rule_db_stmt, ether_type, vni, version] { + get<0>(add_or_update_security_group_rule_db_stmt) = + { security_group_id, remote_group_id, direction, remote_ip_prefix, protocol, port_range_max, port_range_min, ether_type, vni, version }; + db_client::get_instance().local_db.execute(add_or_update_security_group_rule_db_stmt); + }); + } + } else { + printf("Failed to update security group rule with vni: %d and id: %s in map, skipping it\n", vni, security_group_id.c_str()); + } + + i++; }); - printf("Dispatched local db security group rule insert\n"); } else { printf("security group id is empty\n"); } From 4f1343218c0f5915da336fb949f4018c4ce90eab Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 27 Jan 2023 15:09:12 -0800 Subject: [PATCH 28/33] Adapted the segment_lock lock for the Neighbor Rules --- src/comm/grpc_client.cpp | 101 +++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 41 deletions(-) diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 89fd8cf..c2d1754 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -132,12 +132,40 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ &add_or_update_neighbor_db_stmt, &add_programmed_neighbor_version_db_stmt] { // step #1 - check and store as in concurrent hash map std::string neighbor_key = std::to_string(vni) + "-" + vpc_ip; + + endpoint_key_t epkey; + epkey.vni = vni; + struct sockaddr_in ep_ip; + inet_pton(AF_INET, vpc_ip.c_str(), &(ep_ip.sin_addr)); + epkey.ip = ep_ip.sin_addr.s_addr; + printf("Filled in ep.ip\n"); + endpoint_t ep; + struct sockaddr_in ep_hip; + 
inet_pton(AF_INET, host_ip.c_str(), &(ep_hip.sin_addr)); + ep.hip = ep_hip.sin_addr.s_addr; + printf("Filled in ep.hip\n"); + + std::sscanf(vpc_mac.c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", + &ep.mac[0], &ep.mac[1], &ep.mac[2], + &ep.mac[3], &ep.mac[4], &ep.mac[5]); + printf("Filled in ep.mac\n"); + + std::sscanf(host_mac.c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", + &ep.hmac[0], &ep.hmac[1], &ep.hmac[2], + &ep.hmac[3], &ep.hmac[4], &ep.hmac[5]); + printf("Filled in ep.hmac\n"); + printf("vpc_ip is NOT empty: [%s]\n", vpc_ip.c_str()); bool ebpf_ignored = false; bool map_updated = false; int update_ct = 0, max_update_ct = 5; + int ebpf_rc = -1; while (!map_updated && (update_ct < max_update_ct)) { + // lock transaction section + // segment lock allows some level of concurrent manipulations of concurrent version map + // as long as the multi-threading version updates' keys are not hashed to the same slot in segment array + segment_lock.lock(neighbor_key); printf("Inside while loop, map_updated = [%b], update_ct = [%ld], max_update_ct = [%ld]\n", map_updated, update_ct, max_update_ct); auto neighbor_pos = neighbor_task_map.find(neighbor_key); @@ -149,6 +177,21 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ // means successfully inserted, done with update map_updated = true; printf("Found neighbor key in neighbor_task_map\n"); + + // step #2 - sync syscall ebpf map programming with return code + ebpf_rc = bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); + if (ebpf_rc < 0) { + // safely rollback + // rollback version, for insertion case let's revert it to 0 + neighbor_task_map.assign(neighbor_key, 0); + + // rollback map status + map_updated = false; + }else { + // if ebpf map programming succeeded, also put in local in memory cache + db_client::get_instance().endpoint_cache[epkey] = ep; + } + } // 'else' means another thread already inserted before me, then it's not an insert case and next time in the loop will go to case of update } else { printf("Didn't 
find neighbor key in neighbor_task_map\n"); @@ -159,8 +202,22 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ // only update neighbor version // 1. when received (from ArionMaster) neighbor version is greater than current version in map // 2. and only if the element to update is the original element (version in 'find') - if (neighbor_task_map.assign_if_equal(neighbor_key, ver, cur_ver)) { + if (neighbor_task_map.assign(neighbor_key, ver)) { map_updated = true; + + // step #2 - sync syscall ebpf map programming with return code + ebpf_rc = bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); + if (ebpf_rc < 0) { + // safely rollback + // rollback version, for insertion case let's revert it to 0 + neighbor_task_map.assign(neighbor_key, cur_ver); + + // rollback map status + map_updated = false; + }else { + // if ebpf map programming succeeded, also put in local in memory cache + db_client::get_instance().endpoint_cache[epkey] = ep; + } } } else { // otherwise @@ -171,54 +228,16 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ // update: journal table (since this skipped version is treated as programming succeeded) ebpf_ignored = true; map_updated = true; - - // step #2 - sync syscall ebpf map programming with return code - ebpf_rc = bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); - if (ebpf_rc < 0) { - // safely rollback - // rollback version - neighbor_task_map.assign(neighbor_key, cur_ver); - - // rollback map status - map_updated = false; - } } } update_ct++; + // exit transaction section + segment_lock.unlock(neighbor_key); } if (map_updated) { if (!ebpf_ignored) { - printf("ebpf_ignored = false\n"); - // step #2 - sync syscall ebpf map programming with return code - endpoint_key_t epkey; - epkey.vni = vni; - struct sockaddr_in ep_ip; - inet_pton(AF_INET, vpc_ip.c_str(), &(ep_ip.sin_addr)); - epkey.ip = ep_ip.sin_addr.s_addr; - printf("Filled in ep.ip\n"); - endpoint_t ep; - struct sockaddr_in ep_hip; - inet_pton(AF_INET, host_ip.c_str(), 
&(ep_hip.sin_addr)); - ep.hip = ep_hip.sin_addr.s_addr; - printf("Filled in ep.hip\n"); - - std::sscanf(vpc_mac.c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", - &ep.mac[0], &ep.mac[1], &ep.mac[2], - &ep.mac[3], &ep.mac[4], &ep.mac[5]); - printf("Filled in ep.mac\n"); - - std::sscanf(host_mac.c_str(), "%02x:%02x:%02x:%02x:%02x:%02x", - &ep.hmac[0], &ep.hmac[1], &ep.hmac[2], - &ep.hmac[3], &ep.hmac[4], &ep.hmac[5]); - printf("Filled in ep.hmac\n"); - - //disabling the element udpate, so that all packets will be sent to user space program. - - int ebpf_rc = bpf_map_update_elem(fd, &epkey, &ep, BPF_ANY); - // also put in local in memory cache - db_client::get_instance().endpoint_cache[epkey] = ep;//.insert(epkey, ep); printf("GPPC: Inserted this neighbor into map: vip: %s, vni: %d\n", vpc_ip.c_str(), vni); // step #3 - async call to write/update to local db table 1 db_client::get_instance().local_db_writer_queue.dispatch([vni, vpc_ip, host_ip, vpc_mac, host_mac, ver, &add_or_update_neighbor_db_stmt] { From 8991fc48722d7ffd68d423bf8708eff389ba86ac Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 27 Jan 2023 15:56:50 -0800 Subject: [PATCH 29/33] Fixed bug for filling in remote ip into sg cidr key --- src/comm/grpc_client.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index c2d1754..0725c39 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -362,7 +362,7 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ struct sockaddr_in local_ip_sock, remote_ip_sock; inet_pton(AF_INET, vpc_ip.c_str(), &(local_ip_sock.sin_addr)); - inet_pton(AF_INET, remote_ip.c_str(), &(local_ip_sock.sin_addr)); + inet_pton(AF_INET, remote_ip.c_str(), &(remote_ip_sock.sin_addr)); sg_cidr_key_t sg_key; sg_key.vni = vni; sg_key.prefixlen = prefixlen + 96; // 96 = ( __u32 vni; + __u16 port; + __u8 direction; + __u8 protocol; + __u32 local_ip; ) @@ -509,7 +509,7 @@ void 
ArionMasterWatcherImpl::RequestArionMaster(vector *requ struct sockaddr_in local_ip_sock, remote_ip_sock; inet_pton(AF_INET, vpc_ip.c_str(), &(local_ip_sock.sin_addr)); - inet_pton(AF_INET, remote_ip.c_str(), &(local_ip_sock.sin_addr)); + inet_pton(AF_INET, remote_ip.c_str(), &(remote_ip_sock.sin_addr)); sg_cidr_key_t sg_key; sg_key.vni = atoi(vni.c_str()); sg_key.prefixlen = prefixlen + 96; // 96 = ( __u32 vni; + __u16 port; + __u8 direction; + __u8 protocol; + __u32 local_ip; ) From b39c10f21bba89dac94e1b8f2db7d738039dfa69 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 27 Jan 2023 16:26:14 -0800 Subject: [PATCH 30/33] Changed protocol to lowercase --- src/comm/grpc_client.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 0725c39..9f5a5e7 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -517,9 +517,9 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ sg_key.local_ip = local_ip_sock.sin_addr.s_addr; sg_key.direction = rule.direction == "egress" ? 
0 : 1; // going out is 0 and coming in is 1 - if (rule.protocol == "TCP") { + if (rule.protocol == "tcp") { sg_key.protocol = IPPROTO_TCP; - } else if (rule.protocol == "UDP") { + } else if (rule.protocol == "udp") { sg_key.protocol = IPPROTO_UDP; } else { sg_key.protocol = IPPROTO_NONE; From df115a12ea4844669a11a792634d1c3c1fc9589b Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 27 Jan 2023 17:12:53 -0800 Subject: [PATCH 31/33] Adapted seglock logic for security group rule --- src/comm/grpc_client.cpp | 221 +++++++++++++++++++++++++++------------ 1 file changed, 155 insertions(+), 66 deletions(-) diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index 9f5a5e7..c620400 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -292,7 +292,7 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ // non-empty rule if ("" != security_group_id) { marl::schedule([this, &i, security_group_id, version, fd, vni, remote_ip_prefix, direction, protocol, port_range_min, - remote_group_id, port_range_max, ether_type, &add_or_update_security_group_rule_db_stmt] { + remote_group_id, port_range_max, ether_type, &add_or_update_security_group_rule_db_stmt, &add_programmed_security_group_port_binding_version_db_stmt] { // step #1 - check and store as in concurrent hash map bool ebpf_ignored = false; @@ -300,6 +300,10 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ int update_ct = 0, max_update_ct = 5; while (!map_updated && (update_ct < max_update_ct)) { + // lock transaction section + // segment lock allows some level of concurrent manipulations of concurrent version map + // as long as the multi-threading version updates' keys are not hashed to the same slot in segment array + segment_lock.lock(security_group_id); printf("Inside while loop, map_updated = [%b], update_ct = [%ld], max_update_ct = [%ld]\n", map_updated, update_ct, max_update_ct); auto sg_pos = security_group_rule_task_map.find(security_group_id); @@ -310,6 +314,71 
@@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ if (res_insert.second) { // means successfully inserted, done with update map_updated = true; + + // step 1.5 get all related security group rules. + auto rows = db_client::get_instance().local_db.get_all<::SecurityGroupPortBinding>( + where( + c(&::SecurityGroupPortBinding::security_group_id) == security_group_id.c_str() + ) + ); + // printf("Retrieved %ld rows of security group rules with security group id == [%s]\n", rows.size(), security_group_id.c_str()); + int ebpf_rc = 0; + printf("Found %ld sg rules related to this ID: %s\n", rows.size(), security_group_id.c_str()); + for (auto &binding : rows) { + // step #2 - sync syscall ebpf map programming with return code + std::string delimiter = "-"; //because port_id is in the format of "vni-vpc_id" + std::string vpc_ip = binding.port_id.substr(binding.port_id.find(delimiter) + 1); + string remote_ip; + int prefixlen = 0 ; + remote_ip = remote_ip_prefix.substr(0, remote_ip_prefix.find("/")); + prefixlen = atoi((remote_ip_prefix.substr(remote_ip_prefix.find("/") + 1).c_str())); + + struct sockaddr_in local_ip_sock, remote_ip_sock; + inet_pton(AF_INET, vpc_ip.c_str(), &(local_ip_sock.sin_addr)); + inet_pton(AF_INET, remote_ip.c_str(), &(remote_ip_sock.sin_addr)); + sg_cidr_key_t sg_key; + sg_key.vni = vni; + sg_key.prefixlen = prefixlen + 96; // 96 = ( __u32 vni; + __u16 port; + __u8 direction; + __u8 protocol; + __u32 local_ip; ) + sg_key.remote_ip = remote_ip_sock.sin_addr.s_addr; + sg_key.local_ip = local_ip_sock.sin_addr.s_addr; + sg_key.direction = direction == "egress" ? 
0 : 1; // going out is 0 and coming in is 1 + + if (protocol == "TCP") { + sg_key.protocol = IPPROTO_TCP; + } else if (protocol == "UDP") { + sg_key.protocol = IPPROTO_UDP; + } else { + sg_key.protocol = IPPROTO_NONE; + } + + sg_key.port = port_range_min; //TODO: see if we should use this or other fields + + sg_cidr_t sg_value; + sg_value.sg_id = 1; + sg_value.action = 1; // 1 for allow and other values for drop + printf("Inserting sg rule with prefixlen: %ld VNI: %s, port: %ld, direction: %s, protocol: %s, local_ip: %s, remote_ip: %s\n", + prefixlen, vni, port_range_min, direction.c_str(), protocol.c_str(), vpc_ip.c_str(), remote_ip.c_str()); + int single_ebpf_rc = bpf_map_update_elem(fd, &sg_key, &sg_value, BPF_ANY); + if (single_ebpf_rc != 0) { + ebpf_rc = single_ebpf_rc; + printf("Tried to insert into sg rule ebpf map, but got RC: [%ld], errno: [%s]\n", single_ebpf_rc, std::strerror(errno)); + } else { + printf("Insert into sg eBPF map returned %ld\n", single_ebpf_rc); + // on success, also put in local in memory cache + db_client::get_instance().sg_rule_cache[sg_key] = sg_value; + } + + printf("GPPC: Inserted this sg rule into map: vip: %s, vni: %d\n", vpc_ip.c_str(), vni); + + } + if (ebpf_rc < 0) { + // safely rollback + // rollback version, for insertion case let's revert it to 0 + security_group_rule_task_map.assign(security_group_id, 0); + + // rollback map status + map_updated = false; + } printf("Found neighbor key in security_group_rule_task_map\n"); } // 'else' means another thread already inserted before me, then it's not an insert case and next time in the loop will go to case of update } else { @@ -318,11 +387,84 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ int cur_ver = sg_pos->second; if (version > cur_ver) { - // only update neighbor version - // 1. when received (from ArionMaster) neighbor version is greater than current version in map + // only update sg rule version + // 1. 
when received (from ArionMaster) sg rule version is greater than current version in map // 2. and only if the element to update is the original element (version in 'find') - if (security_group_rule_task_map.assign_if_equal(security_group_id, version, cur_ver)) { + if (security_group_rule_task_map.assign(security_group_id, version)) { map_updated = true; + + // step 1.5 get all related security group rules. + auto rows = db_client::get_instance().local_db.get_all<::SecurityGroupPortBinding>( + where( + c(&::SecurityGroupPortBinding::security_group_id) == security_group_id.c_str() + ) + ); + // printf("Retrieved %ld rows of security group rules with security group id == [%s]\n", rows.size(), security_group_id.c_str()); + int ebpf_rc = 0; + printf("Found %ld sg rules related to this ID: %s\n", rows.size(), security_group_id.c_str()); + for (auto &binding : rows) { + // step #2 - sync syscall ebpf map programming with return code + std::string delimiter = "-"; //because port_id is in the format of "vni-vpc_id" + std::string vpc_ip = binding.port_id.substr(binding.port_id.find(delimiter) + 1); + string remote_ip; + int prefixlen = 0 ; + remote_ip = remote_ip_prefix.substr(0, remote_ip_prefix.find("/")); + prefixlen = atoi((remote_ip_prefix.substr(remote_ip_prefix.find("/") + 1).c_str())); + + struct sockaddr_in local_ip_sock, remote_ip_sock; + inet_pton(AF_INET, vpc_ip.c_str(), &(local_ip_sock.sin_addr)); + inet_pton(AF_INET, remote_ip.c_str(), &(remote_ip_sock.sin_addr)); + sg_cidr_key_t sg_key; + sg_key.vni = vni; + sg_key.prefixlen = prefixlen + 96; // 96 = ( __u32 vni; + __u16 port; + __u8 direction; + __u8 protocol; + __u32 local_ip; ) + sg_key.remote_ip = remote_ip_sock.sin_addr.s_addr; + sg_key.local_ip = local_ip_sock.sin_addr.s_addr; + sg_key.direction = direction == "egress" ? 
0 : 1; // going out is 0 and coming in is 1 + + if (protocol == "TCP") { + sg_key.protocol = IPPROTO_TCP; + } else if (protocol == "UDP") { + sg_key.protocol = IPPROTO_UDP; + } else { + sg_key.protocol = IPPROTO_NONE; + } + + sg_key.port = port_range_min; //TODO: see if we should use this or other fields + + sg_cidr_t sg_value; + sg_value.sg_id = 1; + sg_value.action = 1; // 1 for allow and other values for drop + printf("Inserting sg rule with prefixlen: %ld VNI: %s, port: %ld, direction: %s, protocol: %s, local_ip: %s, remote_ip: %s\n", + prefixlen, vni, port_range_min, direction.c_str(), protocol.c_str(), vpc_ip.c_str(), remote_ip.c_str()); + int single_ebpf_rc = bpf_map_update_elem(fd, &sg_key, &sg_value, BPF_ANY); + if (single_ebpf_rc != 0) { + ebpf_rc = single_ebpf_rc; + printf("Tried to insert into sg rule ebpf map, but got RC: [%ld], errno: [%s]\n", single_ebpf_rc, std::strerror(errno)); + } else { + printf("Insert into sg eBPF map returned %ld\n", single_ebpf_rc); + // on success, also put in local in memory cache + db_client::get_instance().sg_rule_cache[sg_key] = sg_value; + } + + printf("GPPC: Inserted this sg rule into map: vip: %s, vni: %d\n", vpc_ip.c_str(), vni); + if (single_ebpf_rc == 0) { + // also update the security group rule version journal + db_client::get_instance().local_db_writer_queue.dispatch([version, &add_programmed_security_group_port_binding_version_db_stmt] { + get<0>(add_programmed_security_group_port_binding_version_db_stmt) = { version }; + db_client::get_instance().local_db.execute( + add_programmed_security_group_port_binding_version_db_stmt); + }); + } + + } + if (ebpf_rc < 0) { + // safely rollback + // rollback version, for insertion case let's revert it to 0 + security_group_rule_task_map.assign(security_group_id, 0); + + // rollback map status + map_updated = false; + } } } else { // otherwise @@ -337,66 +479,13 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ } update_ct++; + // exit transaction section + 
segment_lock.unlock(security_group_id); } if (map_updated) { if (!ebpf_ignored) { printf("ebpf_ignored = false\n"); - // step 1.5 get all related security group rules. - auto rows = db_client::get_instance().local_db.get_all<::SecurityGroupPortBinding>( - where( - c(&::SecurityGroupPortBinding::security_group_id) == security_group_id.c_str() - ) - ); - // printf("Retrieved %ld rows of security group rules with security group id == [%s]\n", rows.size(), security_group_id.c_str()); - int ebpf_rc = 0; - printf("Found %ld sg rules related to this ID: %s\n", rows.size(), security_group_id.c_str()); - for (auto &binding : rows) { - // step #2 - sync syscall ebpf map programming with return code - std::string delimiter = "-"; //because port_id is in the format of "vni-vpc_id" - std::string vpc_ip = binding.port_id.substr(binding.port_id.find(delimiter) + 1); - string remote_ip; - int prefixlen = 0 ; - remote_ip = remote_ip_prefix.substr(0, remote_ip_prefix.find("/")); - prefixlen = atoi((remote_ip_prefix.substr(remote_ip_prefix.find("/") + 1).c_str())); - - struct sockaddr_in local_ip_sock, remote_ip_sock; - inet_pton(AF_INET, vpc_ip.c_str(), &(local_ip_sock.sin_addr)); - inet_pton(AF_INET, remote_ip.c_str(), &(remote_ip_sock.sin_addr)); - sg_cidr_key_t sg_key; - sg_key.vni = vni; - sg_key.prefixlen = prefixlen + 96; // 96 = ( __u32 vni; + __u16 port; + __u8 direction; + __u8 protocol; + __u32 local_ip; ) - sg_key.remote_ip = remote_ip_sock.sin_addr.s_addr; - sg_key.local_ip = local_ip_sock.sin_addr.s_addr; - sg_key.direction = direction == "egress" ? 
0 : 1; // going out is 0 and coming in is 1 - - if (protocol == "TCP") { - sg_key.protocol = IPPROTO_TCP; - } else if (protocol == "UDP") { - sg_key.protocol = IPPROTO_UDP; - } else { - sg_key.protocol = IPPROTO_NONE; - } - - sg_key.port = port_range_min; //TODO: see if we should use this or other fields - - sg_cidr_t sg_value; - sg_value.sg_id = 1; - sg_value.action = 1; // 1 for allow and other values for drop - printf("Inserting sg rule with prefixlen: %ld VNI: %s, port: %ld, direction: %s, protocol: %s, local_ip: %s, remote_ip: %s\n", - prefixlen, vni, port_range_min, direction.c_str(), protocol.c_str(), vpc_ip.c_str(), remote_ip.c_str()); - int single_ebpf_rc = bpf_map_update_elem(fd, &sg_key, &sg_value, BPF_ANY); - if (single_ebpf_rc != 0) { - ebpf_rc = single_ebpf_rc; - printf("Tried to insert into sg rule ebpf map, but got RC: [%ld], errno: [%s]\n", single_ebpf_rc, std::strerror(errno)); - } else { - printf("Insert into sg eBPF map returned %ld\n", single_ebpf_rc); - } - // also put in local in memory cache - db_client::get_instance().sg_rule_cache[sg_key] = sg_value;//.insert(epkey, ep); - printf("GPPC: Inserted this sg rule into map: vip: %s, vni: %d\n", vpc_ip.c_str(), vni); - - } // step #3 - async call to write/update to local db table db_client::get_instance().local_db_writer_queue.dispatch( @@ -410,13 +499,13 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ // no journal to write in this case. 
} else { printf("ebpf_ignored = true\n"); - // step #4 (case 2) - always write to local db table 2 (programming journal) when version intended ignored (no need to program older version) - db_client::get_instance().local_db_writer_queue.dispatch( - [security_group_id, remote_group_id, direction, remote_ip_prefix, protocol, port_range_max, port_range_min, &add_or_update_security_group_rule_db_stmt, ether_type, vni, version] { - get<0>(add_or_update_security_group_rule_db_stmt) = - { security_group_id, remote_group_id, direction, remote_ip_prefix, protocol, port_range_max, port_range_min, ether_type, vni, version }; - db_client::get_instance().local_db.execute(add_or_update_security_group_rule_db_stmt); - }); + // step #4 (case 2) - always write to local db table 5 (programming journal) when version intended ignored (no need to program older version) + // also update the security group rule version journal + db_client::get_instance().local_db_writer_queue.dispatch([version, &add_programmed_security_group_port_binding_version_db_stmt] { + get<0>(add_programmed_security_group_port_binding_version_db_stmt) = { version }; + db_client::get_instance().local_db.execute( + add_programmed_security_group_port_binding_version_db_stmt); + }); } } else { printf("Failed to update security group rule with vni: %d and id: %s in map, skipping it\n", vni, security_group_id.c_str()); From f755f8fd9a3b6a6eb8bdb4f9212312a2a75f3f52 Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Fri, 27 Jan 2023 17:40:48 -0800 Subject: [PATCH 32/33] Adapted the segment lock logic to the security group port binding --- src/comm/grpc_client.cpp | 221 ++++++++++++++++++++++++++------------- 1 file changed, 147 insertions(+), 74 deletions(-) diff --git a/src/comm/grpc_client.cpp b/src/comm/grpc_client.cpp index c620400..c1e6beb 100644 --- a/src/comm/grpc_client.cpp +++ b/src/comm/grpc_client.cpp @@ -538,6 +538,10 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ int update_ct = 0, max_update_ct = 5; 
while (!map_updated && (update_ct < max_update_ct)) { + // lock transaction section + // segment lock allows some level of concurrent manipulations of concurrent version map + // as long as the multi-threading version updates' keys are not hashed to the same slot in segment array + segment_lock.lock(security_group_id); printf("Inside while loop, map_updated = [%b], update_ct = [%ld], max_update_ct = [%ld]\n", map_updated, update_ct, max_update_ct); auto sg_pos = security_group_rule_task_map.find(security_group_port_binding_id); @@ -549,6 +553,77 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ // means successfully inserted, done with update map_updated = true; printf("Found neighbor key in security_group_rule_task_map\n"); + // step 1.5 get all related security group rules. + auto rows = db_client::get_instance().local_db.get_all<::SecurityGroupRule>( + where( + c(&::SecurityGroupRule::security_group_id) == security_group_id.c_str() + ) + ); + // printf("Retrieved %ld rows of security group rules with security group id == [%s]\n", rows.size(), security_group_id.c_str()); + int ebpf_rc = 0; + printf("Found %ld sg rules related to this ID: %s\n", rows.size(), security_group_id.c_str()); + for (auto &rule : rows) { + // step #2 - sync syscall ebpf map programming with return code + string remote_ip; + int prefixlen = 0 ; + remote_ip = rule.remote_ip_prefix.substr(0, rule.remote_ip_prefix.find("/")); + prefixlen = atoi((rule.remote_ip_prefix.substr(rule.remote_ip_prefix.find("/") + 1).c_str())); + + struct sockaddr_in local_ip_sock, remote_ip_sock; + inet_pton(AF_INET, vpc_ip.c_str(), &(local_ip_sock.sin_addr)); + inet_pton(AF_INET, remote_ip.c_str(), &(remote_ip_sock.sin_addr)); + sg_cidr_key_t sg_key; + sg_key.vni = atoi(vni.c_str()); + sg_key.prefixlen = prefixlen + 96; // 96 = ( __u32 vni; + __u16 port; + __u8 direction; + __u8 protocol; + __u32 local_ip; ) + sg_key.remote_ip = remote_ip_sock.sin_addr.s_addr; + sg_key.local_ip = 
local_ip_sock.sin_addr.s_addr; + sg_key.direction = rule.direction == "egress" ? 0 : 1; // going out is 0 and coming in is 1 + + if (rule.protocol == "tcp") { + sg_key.protocol = IPPROTO_TCP; + } else if (rule.protocol == "udp") { + sg_key.protocol = IPPROTO_UDP; + } else { + sg_key.protocol = IPPROTO_NONE; + } + + sg_key.port = rule.port_range_min; //TODO: see if we should use this or other fields + + sg_cidr_t sg_value; + sg_value.sg_id = 1; + sg_value.action = 1; // 1 for allow and other values for drop + printf("Inserting sg rule with prefixlen: %ld VNI: %s, port: %ld, direction: %s, protocol: %s, local_ip: %s, remote_ip: %s\n", + prefixlen, vni.c_str(), rule.port_range_min, rule.direction.c_str(), rule.protocol.c_str(), vpc_ip.c_str(), remote_ip.c_str()); + int single_ebpf_rc = bpf_map_update_elem(fd, &sg_key, &sg_value, BPF_ANY); + if (single_ebpf_rc != 0) { + ebpf_rc = single_ebpf_rc; + printf("Tried to insert into sg rule ebpf map, but got RC: [%ld], errno: [%s]\n", single_ebpf_rc, std::strerror(errno)); + } else { + printf("Insert into sg eBPF map returned %ld\n", single_ebpf_rc); + } + // also put in local in memory cache + db_client::get_instance().sg_rule_cache[sg_key] = sg_value;//.insert(epkey, ep); + printf("GPPC: Inserted this sg rule into map: vip: %s, vni: %s\n", vpc_ip.c_str(), vni.c_str()); + + } + // step #4 (case 1) - when ebpf programming not ignored, write to table 2 (programming journal) when programming succeeded + if (0 == ebpf_rc) { + db_client::get_instance().local_db_writer_queue.dispatch([version, &add_programmed_security_group_port_binding_version_db_stmt] { + get<0>(add_programmed_security_group_port_binding_version_db_stmt) = { version }; + db_client::get_instance().local_db.execute( + add_programmed_security_group_port_binding_version_db_stmt); + }); + } else { + printf("ebpf_rc = [%ld], this version isn't finished, NOT updating the local DB.\n", ebpf_rc); + } + printf("Dispatched local db sg journal insert\n"); + // step #3 - 
async call to write/update to local db table + db_client::get_instance().local_db_writer_queue.dispatch([security_group_id, port_id, version, &add_or_update_security_group_port_binding_stmt] { + get<0>(add_or_update_security_group_port_binding_stmt) = { port_id, security_group_id }; + db_client::get_instance().local_db.execute(add_or_update_security_group_port_binding_stmt); + }); + printf("Dispatched local db security group port binding insert\n"); + } // 'else' means another thread already inserted before me, then it's not an insert case and next time in the loop will go to case of update } else { printf("Didn't find neighbor key in security_group_rule_task_map\n"); @@ -559,8 +634,78 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ // only update neighbor version // 1. when received (from ArionMaster) neighbor version is greater than current version in map // 2. and only if the element to update is the original element (version in 'find') - if (security_group_rule_task_map.assign_if_equal(security_group_port_binding_id, version, cur_ver)) { + if (security_group_rule_task_map.assign(security_group_port_binding_id, version)) { map_updated = true; + // step 1.5 get all related security group rules. 
+ auto rows = db_client::get_instance().local_db.get_all<::SecurityGroupRule>( + where( + c(&::SecurityGroupRule::security_group_id) == security_group_id.c_str() + ) + ); + // printf("Retrieved %ld rows of security group rules with security group id == [%s]\n", rows.size(), security_group_id.c_str()); + int ebpf_rc = 0; + printf("Found %ld sg rules related to this ID: %s\n", rows.size(), security_group_id.c_str()); + for (auto &rule : rows) { + // step #2 - sync syscall ebpf map programming with return code + string remote_ip; + int prefixlen = 0 ; + remote_ip = rule.remote_ip_prefix.substr(0, rule.remote_ip_prefix.find("/")); + prefixlen = atoi((rule.remote_ip_prefix.substr(rule.remote_ip_prefix.find("/") + 1).c_str())); + + struct sockaddr_in local_ip_sock, remote_ip_sock; + inet_pton(AF_INET, vpc_ip.c_str(), &(local_ip_sock.sin_addr)); + inet_pton(AF_INET, remote_ip.c_str(), &(remote_ip_sock.sin_addr)); + sg_cidr_key_t sg_key; + sg_key.vni = atoi(vni.c_str()); + sg_key.prefixlen = prefixlen + 96; // 96 = ( __u32 vni; + __u16 port; + __u8 direction; + __u8 protocol; + __u32 local_ip; ) + sg_key.remote_ip = remote_ip_sock.sin_addr.s_addr; + sg_key.local_ip = local_ip_sock.sin_addr.s_addr; + sg_key.direction = rule.direction == "egress" ? 
0 : 1; // going out is 0 and coming in is 1 + + if (rule.protocol == "tcp") { + sg_key.protocol = IPPROTO_TCP; + } else if (rule.protocol == "udp") { + sg_key.protocol = IPPROTO_UDP; + } else { + sg_key.protocol = IPPROTO_NONE; + } + + sg_key.port = rule.port_range_min; //TODO: see if we should use this or other fields + + sg_cidr_t sg_value; + sg_value.sg_id = 1; + sg_value.action = 1; // 1 for allow and other values for drop + printf("Inserting sg rule with prefixlen: %ld VNI: %s, port: %ld, direction: %s, protocol: %s, local_ip: %s, remote_ip: %s\n", + prefixlen, vni.c_str(), rule.port_range_min, rule.direction.c_str(), rule.protocol.c_str(), vpc_ip.c_str(), remote_ip.c_str()); + int single_ebpf_rc = bpf_map_update_elem(fd, &sg_key, &sg_value, BPF_ANY); + if (single_ebpf_rc != 0) { + ebpf_rc = single_ebpf_rc; + printf("Tried to insert into sg rule ebpf map, but got RC: [%ld], errno: [%s]\n", single_ebpf_rc, std::strerror(errno)); + } else { + printf("Insert into sg eBPF map returned %ld\n", single_ebpf_rc); + } + // also put in local in memory cache + db_client::get_instance().sg_rule_cache[sg_key] = sg_value;//.insert(epkey, ep); + printf("GPPC: Inserted this sg rule into map: vip: %s, vni: %s\n", vpc_ip.c_str(), vni.c_str()); + + } + // step #4 (case 1) - when ebpf programming not ignored, write to table 2 (programming journal) when programming succeeded + if (0 == ebpf_rc) { + db_client::get_instance().local_db_writer_queue.dispatch([version, &add_programmed_security_group_port_binding_version_db_stmt] { + get<0>(add_programmed_security_group_port_binding_version_db_stmt) = { version }; + db_client::get_instance().local_db.execute( + add_programmed_security_group_port_binding_version_db_stmt); + }); + } else { + printf("ebpf_rc = [%ld], this version isn't finished, NOT updating the local DB.\n", ebpf_rc); + } + printf("Dispatched local db sg journal insert\n"); + // step #3 - async call to write/update to local db table + 
db_client::get_instance().local_db_writer_queue.dispatch([security_group_id, port_id, version, &add_or_update_security_group_port_binding_stmt] { + get<0>(add_or_update_security_group_port_binding_stmt) = { port_id, security_group_id }; + db_client::get_instance().local_db.execute(add_or_update_security_group_port_binding_stmt); + }); + printf("Dispatched local db security group port binding insert\n"); } } else { // otherwise @@ -575,84 +720,12 @@ void ArionMasterWatcherImpl::RequestArionMaster(vector *requ } update_ct++; + segment_lock.unlock(security_group_id); } if (map_updated) { if (!ebpf_ignored) { printf("ebpf_ignored = false\n"); - // step 1.5 get all related security group rules. - auto rows = db_client::get_instance().local_db.get_all<::SecurityGroupRule>( - where( - c(&::SecurityGroupRule::security_group_id) == security_group_id.c_str() - ) - ); -// printf("Retrieved %ld rows of security group rules with security group id == [%s]\n", rows.size(), security_group_id.c_str()); - int ebpf_rc = 0; - printf("Found %ld sg rules related to this ID: %s\n", rows.size(), security_group_id.c_str()); - for (auto &rule : rows) { - // step #2 - sync syscall ebpf map programming with return code - string remote_ip; - int prefixlen = 0 ; - remote_ip = rule.remote_ip_prefix.substr(0, rule.remote_ip_prefix.find("/")); - prefixlen = atoi((rule.remote_ip_prefix.substr(rule.remote_ip_prefix.find("/") + 1).c_str())); - - struct sockaddr_in local_ip_sock, remote_ip_sock; - inet_pton(AF_INET, vpc_ip.c_str(), &(local_ip_sock.sin_addr)); - inet_pton(AF_INET, remote_ip.c_str(), &(remote_ip_sock.sin_addr)); - sg_cidr_key_t sg_key; - sg_key.vni = atoi(vni.c_str()); - sg_key.prefixlen = prefixlen + 96; // 96 = ( __u32 vni; + __u16 port; + __u8 direction; + __u8 protocol; + __u32 local_ip; ) - sg_key.remote_ip = remote_ip_sock.sin_addr.s_addr; - sg_key.local_ip = local_ip_sock.sin_addr.s_addr; - sg_key.direction = rule.direction == "egress" ? 
0 : 1; // going out is 0 and coming in is 1 - - if (rule.protocol == "tcp") { - sg_key.protocol = IPPROTO_TCP; - } else if (rule.protocol == "udp") { - sg_key.protocol = IPPROTO_UDP; - } else { - sg_key.protocol = IPPROTO_NONE; - } - - sg_key.port = rule.port_range_min; //TODO: see if we should use this or other fields - - sg_cidr_t sg_value; - sg_value.sg_id = 1; - sg_value.action = 1; // 1 for allow and other values for drop - printf("Inserting sg rule with prefixlen: %ld VNI: %s, port: %ld, direction: %s, protocol: %s, local_ip: %s, remote_ip: %s\n", - prefixlen, vni.c_str(), rule.port_range_min, rule.direction.c_str(), rule.protocol.c_str(), vpc_ip.c_str(), remote_ip.c_str()); - int single_ebpf_rc = bpf_map_update_elem(fd, &sg_key, &sg_value, BPF_ANY); - if (single_ebpf_rc != 0) { - ebpf_rc = single_ebpf_rc; - printf("Tried to insert into sg rule ebpf map, but got RC: [%ld], errno: [%s]\n", single_ebpf_rc, std::strerror(errno)); - } else { - printf("Insert into sg eBPF map returned %ld\n", single_ebpf_rc); - } - // also put in local in memory cache - db_client::get_instance().sg_rule_cache[sg_key] = sg_value;//.insert(epkey, ep); - printf("GPPC: Inserted this sg rule into map: vip: %s, vni: %s\n", vpc_ip.c_str(), vni.c_str()); - - } - // step #3 - async call to write/update to local db table - db_client::get_instance().local_db_writer_queue.dispatch([security_group_id, port_id, version, &add_or_update_security_group_port_binding_stmt] { - get<0>(add_or_update_security_group_port_binding_stmt) = { port_id, security_group_id }; - db_client::get_instance().local_db.execute(add_or_update_security_group_port_binding_stmt); - }); - printf("Dispatched local db security group port binding insert\n"); - - // step #4 (case 1) - when ebpf programming not ignored, write to table 2 (programming journal) when programming succeeded - if (0 == ebpf_rc) { - db_client::get_instance().local_db_writer_queue.dispatch([version, 
&add_programmed_security_group_port_binding_version_db_stmt] { - get<0>(add_programmed_security_group_port_binding_version_db_stmt) = { version }; - db_client::get_instance().local_db.execute( - add_programmed_security_group_port_binding_version_db_stmt); - }); - } else { - printf("ebpf_rc = [%ld], this version isn't finished, NOT updating the local DB.\n", ebpf_rc); - } - printf("Dispatched local db sg journal insert\n"); - - } else { printf("ebpf_ignored = true\n"); // step #4 (case 2) - always write to local db table 2 (programming journal) when version intended ignored (no need to program older version) From 3737b91a3a058a7ca2bb643bde673c0a0903fd3e Mon Sep 17 00:00:00 2001 From: Rio Zhu Date: Tue, 31 Jan 2023 16:22:50 -0800 Subject: [PATCH 33/33] Updated sg rule schema --- include/db_client.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/db_client.h b/include/db_client.h index 72773c8..7ed221d 100644 --- a/include/db_client.h +++ b/include/db_client.h @@ -43,6 +43,7 @@ struct SecurityGroupPortBinding { }; // local db table 3, stores the mapping between port and security group, 1 group can have multiple rules. 
struct SecurityGroupRule { + std::string id; //UUID, should be key in DB std::string security_group_id; std::string remote_group_id; std::string direction; @@ -105,6 +106,7 @@ inline auto make_storage_query () { primary_key(&NeighborProgrammingState::version) ), make_table("security_group_rule", + make_column("id", &SecurityGroupRule::id), make_column("security_group_id", &SecurityGroupRule::security_group_id), make_column("remote_group_id", &SecurityGroupRule::remote_group_id), make_column("direction", &SecurityGroupRule::direction), @@ -115,7 +117,7 @@ inline auto make_storage_query () { make_column("ether_type", &SecurityGroupRule::ether_type), make_column("vni", &SecurityGroupRule::vni), make_column("version", &SecurityGroupRule::version), - primary_key(&SecurityGroupRule::remote_group_id) + primary_key(&SecurityGroupRule::id) ), make_table("security_group_port_binding", make_column("port_id", &SecurityGroupPortBinding::port_id),