#define KBUILD_MODNAME "xdp_l3fwd"

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#ifndef AF_INET
#define AF_INET 2
#endif
#ifndef AF_INET6
#define AF_INET6 10
#endif

#define IPV6_FLOWINFO_MASK bpf_htonl(0x0FFFFFFF)
#define VLAN_MAX_DEPTH 2
#define IPV6_EXT_MAX_CHAIN 6

/* Egress devices eligible for redirect; key and value are ifindexes. */
struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 512);
} xdp_l3fwd_ports SEC(".maps");

struct flow_key {
	__u8 proto;
	__u8 pad[3];
	__u16 vlan_id;
	__u16 pad2;
	union {
		__u32 ipv4_src;
		__u8 ipv6_src[16];
	};
	union {
		__u32 ipv4_dst;
		__u8 ipv6_dst[16];
	};
	__u16 sport;
	__u16 dport;
};

struct flow_stats {
	__u64 packets;
	__u64 bytes;
};

/* Per-flow packet/byte counters, keyed by struct flow_key. */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
	__uint(key_size, sizeof(struct flow_key));
	__uint(value_size, sizeof(struct flow_stats));
	__uint(max_entries, 65536);
} xdp_flow_stats SEC(".maps");

struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* Decrement the TTL and apply the incremental checksum update. */
static __always_inline int ip_decrease_ttl(struct iphdr *iph)
{
	__u32 check = (__u32)iph->check;

	check += (__u32)bpf_htons(0x0100);
	iph->check = (__sum16)(check + (check >= 0xFFFF));
	return --iph->ttl;
}

static __always_inline void record_stats(struct xdp_md *ctx,
					 struct flow_key *key, __u64 bytes)
{
	struct flow_stats *stats;

	stats = bpf_map_lookup_elem(&xdp_flow_stats, key);
	if (stats) {
		stats->packets++;
		stats->bytes += bytes;
	} else {
		struct flow_stats new_stats = {
			.packets = 1,
			.bytes = bytes,
		};

		bpf_map_update_elem(&xdp_flow_stats, key, &new_stats, BPF_ANY);
	}
}

/* Walk up to VLAN_MAX_DEPTH 802.1Q/802.1ad tags. Advances *nh_off and
 * *h_proto past the tags, records the outermost VLAN ID, and returns the
 * number of tags parsed, or -1 on a bounds failure.
 */
static __always_inline int parse_vlan(void *data, void *data_end,
				      __u64 *nh_off, __u16 *h_proto,
				      __u16 *vlan_id)
{
	struct vlan_hdr *vhdr;
	int i, vlan_count = 0;

#pragma unroll
	for (i = 0; i < VLAN_MAX_DEPTH; i++) {
		if (*h_proto != bpf_htons(ETH_P_8021Q) &&
		    *h_proto != bpf_htons(ETH_P_8021AD))
			break;

		vhdr = data + *nh_off;
		if ((void *)(vhdr + 1) > data_end)
			return -1;

		if (i == 0)
			*vlan_id = bpf_ntohs(vhdr->h_vlan_TCI) & 0x0FFF;

		*nh_off += sizeof(*vhdr);
		*h_proto = vhdr->h_vlan_encapsulated_proto;
		vlan_count++;
	}

	return vlan_count;
}

/* Skip IPv6 extension headers (up to IPV6_EXT_MAX_CHAIN of them).
 * Returns the L4 protocol number, or -1 if the chain is too long or
 * runs past data_end.
 */
static __always_inline int skip_ip6hdrext(void *data, void *data_end,
					  __u64 *nh_off, __u8 next_hdr_type)
{
	struct ipv6_opt_hdr {
		__u8 nexthdr;
		__u8 hdrlen;
	} *hdr;
	int i;

#pragma unroll
	for (i = 0; i < IPV6_EXT_MAX_CHAIN; i++) {
		hdr = data + *nh_off;
		if ((void *)(hdr + 1) > data_end)
			return -1;

		switch (next_hdr_type) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_DSTOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_MH:
			*nh_off += (hdr->hdrlen + 1) * 8;
			next_hdr_type = hdr->nexthdr;
			break;
		case IPPROTO_AH:
			*nh_off += (hdr->hdrlen + 2) * 4;
			next_hdr_type = hdr->nexthdr;
			break;
		case IPPROTO_FRAGMENT:
			*nh_off += 8;
			next_hdr_type = hdr->nexthdr;
			break;
		default:
			return next_hdr_type;
		}
	}

	return -1;
}

static __always_inline int xdp_l3fwd_flags(struct xdp_md *ctx, __u32 flags)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct bpf_fib_lookup fib_params;
	struct ethhdr *eth = data;
	struct ipv6hdr *ip6h;
	struct iphdr *iph;
	__u16 h_proto;
	__u64 nh_off;
	int rc, vlan_count;
	__u16 vlan_id = 0;

	nh_off = sizeof(*eth);
	if (data + nh_off > data_end)
		return XDP_DROP;

	__builtin_memset(&fib_params, 0, sizeof(fib_params));

	h_proto = eth->h_proto;
	vlan_count = parse_vlan(data, data_end, &nh_off, &h_proto, &vlan_id);
	if (vlan_count < 0)
		return XDP_DROP;

	struct flow_key key = {};
	key.vlan_id = vlan_id;
	__u64 bytes = data_end - data;

	if (h_proto == bpf_htons(ETH_P_IP)) {
		iph = data + nh_off;
		if ((void *)(iph + 1) > data_end)
			return XDP_DROP;

		if (iph->ttl <= 1)
			return XDP_PASS;

		key.proto = iph->protocol;
		key.ipv4_src = iph->saddr;
		key.ipv4_dst = iph->daddr;

		/* Calculate the L4 offset using pointer arithmetic from iph */
		__u8 ihl = iph->ihl;
		if (ihl < 5)
			return XDP_DROP;
		void *l4ptr = (void *)iph + (ihl * 4);

		if (iph->protocol == IPPROTO_TCP) {
			struct tcphdr *tcph = l4ptr;

			if ((void *)(tcph + 1) > data_end)
				goto skip_v4_ports;
			key.sport = tcph->source;
			key.dport = tcph->dest;
		} else if (iph->protocol == IPPROTO_UDP) {
			struct udphdr *udph = l4ptr;

			if ((void *)(udph + 1) > data_end)
				goto skip_v4_ports;
			key.sport = udph->source;
			key.dport = udph->dest;
		}
skip_v4_ports:
		fib_params.family = AF_INET;
		fib_params.tos = iph->tos;
		fib_params.l4_protocol = iph->protocol;
		fib_params.tot_len = bpf_ntohs(iph->tot_len);
		fib_params.ipv4_src = iph->saddr;
		fib_params.ipv4_dst = iph->daddr;
	} else if (h_proto == bpf_htons(ETH_P_IPV6)) {
		ip6h = data + nh_off;
		if ((void *)(ip6h + 1) > data_end)
			return XDP_DROP;

		if (ip6h->hop_limit <= 1)
			return XDP_PASS;

		__builtin_memcpy(key.ipv6_src, &ip6h->saddr, 16);
		__builtin_memcpy(key.ipv6_dst, &ip6h->daddr, 16);

		__u64 l4_off = nh_off + sizeof(*ip6h);
		int l4_proto = skip_ip6hdrext(data, data_end, &l4_off,
					      ip6h->nexthdr);
		if (l4_proto < 0)
			l4_proto = ip6h->nexthdr;
		key.proto = l4_proto;

		void *l4ptr = data + l4_off;

		if (l4_proto == IPPROTO_TCP) {
			struct tcphdr *tcph = l4ptr;

			if ((void *)(tcph + 1) > data_end)
				goto skip_v6_ports;
			key.sport = tcph->source;
			key.dport = tcph->dest;
		} else if (l4_proto == IPPROTO_UDP) {
			struct udphdr *udph = l4ptr;

			if ((void *)(udph + 1) > data_end)
				goto skip_v6_ports;
			key.sport = udph->source;
			key.dport = udph->dest;
		}
skip_v6_ports:
		fib_params.family = AF_INET6;
		fib_params.flowinfo = *(__be32 *)ip6h & IPV6_FLOWINFO_MASK;
		fib_params.l4_protocol = l4_proto;
		fib_params.tot_len = bpf_ntohs(ip6h->payload_len);
		__builtin_memcpy(fib_params.ipv6_src, &ip6h->saddr, 16);
		__builtin_memcpy(fib_params.ipv6_dst, &ip6h->daddr, 16);
	} else {
		return XDP_PASS;
	}

	fib_params.ifindex = ctx->ingress_ifindex;

	rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
	if (rc == BPF_FIB_LKUP_RET_SUCCESS) {
		/* Only forward if the egress device is in the devmap. */
		if (!bpf_map_lookup_elem(&xdp_l3fwd_ports, &fib_params.ifindex))
			return XDP_PASS;

		record_stats(ctx, &key, bytes);

		if (h_proto == bpf_htons(ETH_P_IP))
			ip_decrease_ttl(iph);
		else if (h_proto == bpf_htons(ETH_P_IPV6))
			ip6h->hop_limit--;

		__builtin_memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
		__builtin_memcpy(eth->h_source, fib_params.smac, ETH_ALEN);

		return bpf_redirect_map(&xdp_l3fwd_ports, fib_params.ifindex, 0);
	}

	return XDP_PASS;
}

SEC("xdp")
int xdp_l3fwd_prog(struct xdp_md *ctx)
{
	return xdp_l3fwd_flags(ctx, 0);
}

SEC("xdp")
int xdp_l3fwd_direct_prog(struct xdp_md *ctx)
{
	return xdp_l3fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT);
}

char _license[] SEC("license") = "GPL";
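
/*
 * Usage sketch (not part of the program): one way to build, attach, and
 * populate the devmap, assuming clang and bpftool are available and that
 * this file is compiled to xdp_l3fwd.o. Interface names, ifindex values,
 * and pin paths below are illustrative, not requirements of this code.
 *
 *   clang -O2 -g -target bpf -c xdp_l3fwd.c -o xdp_l3fwd.o
 *
 *   # Load both programs and pin them under /sys/fs/bpf/xdp_l3fwd
 *   bpftool prog loadall xdp_l3fwd.o /sys/fs/bpf/xdp_l3fwd
 *
 *   # Attach the FIB-lookup variant on each forwarding interface
 *   bpftool net attach xdp pinned /sys/fs/bpf/xdp_l3fwd/xdp_l3fwd_prog dev eth0
 *   bpftool net attach xdp pinned /sys/fs/bpf/xdp_l3fwd/xdp_l3fwd_prog dev eth1
 *
 *   # Allow redirects to an interface by adding its ifindex to the devmap;
 *   # the program looks up the FIB egress ifindex as the key, so key and
 *   # value are the same little-endian ifindex (2 and 3 are examples).
 *   bpftool map update name xdp_l3fwd_ports key 2 0 0 0 value 2 0 0 0
 *   bpftool map update name xdp_l3fwd_ports key 3 0 0 0 value 3 0 0 0
 */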