#define KBUILD_MODNAME "xdp_l3fwd" #include #include #include #include #include #include #include #include #include #include #include #include #ifndef AF_INET #define AF_INET 2 #endif #ifndef AF_INET6 #define AF_INET6 10 #endif #define IPV6_FLOWINFO_MASK bpf_htonl(0x0FFFFFFF) #define VLAN_MAX_DEPTH 2 #define IPV6_EXT_MAX_CHAIN 6 struct vlan_hdr { __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; /* Auto-learned VLAN info */ struct vlan_learning_entry { __u16 vlan_id; __u16 confidence; __u32 last_seen; }; struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, __u32); __type(value, struct vlan_learning_entry); __uint(max_entries, 512); } xdp_vlan_learning SEC(".maps"); struct vlan_parent_info { __u32 parent_ifindex; __u16 vlan_id; __u16 pad; }; struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, __u32); __type(value, struct vlan_parent_info); __uint(max_entries, 512); } xdp_vlan_parents SEC(".maps"); struct flow_key { __u8 proto; __u8 pad[3]; __u16 vlan_id; __u16 pad2; union { __u32 ipv4_src; __u8 ipv6_src[16]; }; union { __u32 ipv4_dst; __u8 ipv6_dst[16]; }; __u16 sport; __u16 dport; }; struct flow_stats { __u64 packets; __u64 bytes; }; struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(key_size, sizeof(struct flow_key)); __uint(value_size, sizeof(struct flow_stats)); __uint(max_entries, 65536); } xdp_flow_stats SEC(".maps"); static __always_inline int ip_decrease_ttl(struct iphdr *iph) { __u32 check = (__u32)iph->check; check += (__u32)bpf_htons(0x0100); iph->check = (__sum16)(check + (check >= 0xFFFF)); return --iph->ttl; } static __always_inline void record_stats(struct flow_key *key, __u64 bytes) { struct flow_stats *stats = bpf_map_lookup_elem(&xdp_flow_stats, key); if (stats) { stats->packets++; stats->bytes += bytes; } else { struct flow_stats new_stats = { .packets = 1, .bytes = bytes, }; bpf_map_update_elem(&xdp_flow_stats, key, &new_stats, BPF_ANY); } } static __always_inline void learn_vlan(struct xdp_md *ctx, __u16 vlan_id) { __u32 ifindex = ctx->ingress_ifindex; struct vlan_learning_entry *entry = bpf_map_lookup_elem(&xdp_vlan_learning, &ifindex); if (entry) { if (vlan_id > 0) { if (entry->vlan_id == vlan_id) { if (entry->confidence < 65535) entry->confidence++; } else if (entry->confidence > 0) { entry->confidence--; if (entry->confidence == 0) { entry->vlan_id = vlan_id; entry->confidence = 1; } } } } else if (vlan_id > 0) { struct vlan_learning_entry new_entry = { .vlan_id = vlan_id, .confidence = 1, .last_seen = 0, }; bpf_map_update_elem(&xdp_vlan_learning, &ifindex, &new_entry, BPF_ANY); } } static __always_inline __u16 get_interface_vlan(struct xdp_md *ctx, __u32 ifindex) { struct vlan_parent_info *parent_info = bpf_map_lookup_elem(&xdp_vlan_parents, &ifindex); if (parent_info && parent_info->vlan_id > 0) { return parent_info->vlan_id; } struct vlan_learning_entry *learned = bpf_map_lookup_elem(&xdp_vlan_learning, &ifindex); if (learned && learned->confidence > 5) { return learned->vlan_id; } __u32 ingress_idx = ctx->ingress_ifindex; if (ingress_idx != ifindex) { struct vlan_learning_entry *ingress_learned = bpf_map_lookup_elem(&xdp_vlan_learning, &ingress_idx); if (ingress_learned && ingress_learned->confidence > 10) { struct vlan_learning_entry *egress_learned = bpf_map_lookup_elem(&xdp_vlan_learning, &ifindex); if (!egress_learned || egress_learned->confidence < 3) { return 0; } } } return 0; } static __always_inline int parse_vlan(void *data, void *data_end, __u64 *nh_off, __u16 *h_proto, __u16 *vlan_id) { struct vlan_hdr *vh; #pragma unroll for (int i = 0; i < VLAN_MAX_DEPTH; i++) { if (*h_proto != bpf_htons(ETH_P_8021Q) && *h_proto != bpf_htons(ETH_P_8021AD)) break; vh = (void *)((char *)data + *nh_off); if ((void *)(vh + 1) > data_end) return -1; if (i == 0) *vlan_id = bpf_ntohs(vh->h_vlan_TCI) & 0x0FFF; *nh_off += sizeof(*vh); *h_proto = vh->h_vlan_encapsulated_proto; } return 0; } static __always_inline int skip_ip6hdrext(void *data, void *data_end, __u64 *nh_off, __u8 next) { struct ipv6_opt_hdr *hdr; #pragma unroll for (int i = 0; i < IPV6_EXT_MAX_CHAIN; i++) { hdr = (void *)((char *)data + *nh_off); if ((void *)(hdr + 1) > data_end) return -1; switch (next) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_MH: *nh_off += (hdr->hdrlen + 1) * 8; next = hdr->nexthdr; break; case IPPROTO_AH: *nh_off += (hdr->hdrlen + 2) * 4; next = hdr->nexthdr; break; case IPPROTO_FRAGMENT: *nh_off += 8; next = hdr->nexthdr; break; default: return next; } } return -1; } /* Insert VLAN tag using head adjustment */ static __always_inline int insert_vlan_tag(struct xdp_md *ctx, __u16 vlan_id) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ethhdr *old_eth = data; if ((void *)(old_eth + 1) > data_end) return -1; struct ethhdr orig_eth; __builtin_memcpy(&orig_eth, old_eth, sizeof(orig_eth)); /* Expand headroom */ if (bpf_xdp_adjust_head(ctx, -(int)sizeof(struct vlan_hdr))) return -1; /* Re-read pointers after head adjustment */ data = (void *)(long)ctx->data; data_end = (void *)(long)ctx->data_end; struct ethhdr *new_eth = data; struct vlan_hdr *vlan = (struct vlan_hdr *)(new_eth + 1); if ((void *)(vlan + 1) > data_end) return -1; /* Copy ethernet header to new position */ __builtin_memcpy(new_eth->h_dest, orig_eth.h_dest, ETH_ALEN); __builtin_memcpy(new_eth->h_source, orig_eth.h_source, ETH_ALEN); /* Set up VLAN header */ vlan->h_vlan_TCI = bpf_htons(vlan_id & 0x0FFF); vlan->h_vlan_encapsulated_proto = orig_eth.h_proto; /* Update ethernet proto to VLAN */ new_eth->h_proto = bpf_htons(ETH_P_8021Q); return 0; } /* Remove VLAN tag */ static __always_inline int remove_vlan_tag(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ethhdr *eth = data; struct vlan_hdr *vlan = (struct vlan_hdr *)(eth + 1); if ((void *)(vlan + 1) > data_end) return -1; __be16 encap_proto = vlan->h_vlan_encapsulated_proto; struct ethhdr tmp_eth; __builtin_memcpy(&tmp_eth, eth, sizeof(tmp_eth)); /* Adjust head to remove VLAN header */ if (bpf_xdp_adjust_head(ctx, (int)sizeof(struct vlan_hdr))) return -1; /* Re-read pointers after head adjustment */ data = (void *)(long)ctx->data; data_end = (void *)(long)ctx->data_end; eth = data; if ((void *)(eth + 1) > data_end) return -1; __builtin_memcpy(eth->h_dest, tmp_eth.h_dest, ETH_ALEN); __builtin_memcpy(eth->h_source, tmp_eth.h_source, ETH_ALEN); eth->h_proto = encap_proto; return 0; } static __always_inline int xdp_l3fwd_flags(struct xdp_md *ctx, __u32 flags) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ethhdr *eth = data; __u64 nh_off = sizeof(*eth); if ((void *)((char *)data + nh_off) > data_end) return XDP_DROP; struct bpf_fib_lookup fib_params = {}; __u16 h_proto = eth->h_proto; __u16 vlan_id = 0; __u16 orig_vlan_id = 0; int had_vlan = 0; if (h_proto == bpf_htons(ETH_P_8021Q) || h_proto == bpf_htons(ETH_P_8021AD)) had_vlan = 1; if (parse_vlan(data, data_end, &nh_off, &h_proto, &vlan_id) < 0) return XDP_DROP; orig_vlan_id = vlan_id; if (vlan_id > 0) learn_vlan(ctx, vlan_id); struct flow_key key = {}; key.vlan_id = vlan_id; __u64 bytes = (char *)data_end - (char *)data; if (h_proto == bpf_htons(ETH_P_IP)) { struct iphdr *iph = (void *)((char *)data + nh_off); if ((void *)(iph + 1) > data_end) return XDP_DROP; if (iph->ttl <= 1) return XDP_PASS; key.proto = iph->protocol; key.ipv4_src = iph->saddr; key.ipv4_dst = iph->daddr; __u8 ihl = iph->ihl; if (ihl < 5) return XDP_DROP; __u64 l4_off = nh_off + (ihl * 4); void *l4_hdr = (void *)((char *)data + l4_off); if ((void *)((char *)l4_hdr + 4) <= data_end) { if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP) { __u16 *ports = l4_hdr; key.sport = ports[0]; key.dport = ports[1]; fib_params.sport = ports[0]; fib_params.dport = ports[1]; } } fib_params.family = AF_INET; fib_params.tos = iph->tos; fib_params.l4_protocol = iph->protocol; fib_params.tot_len = bpf_ntohs(iph->tot_len); fib_params.ipv4_src = iph->saddr; fib_params.ipv4_dst = iph->daddr; } else if (h_proto == bpf_htons(ETH_P_IPV6)) { struct ipv6hdr *ip6h = (void *)((char *)data + nh_off); if ((void *)(ip6h + 1) > data_end) return XDP_DROP; if (ip6h->hop_limit <= 1) return XDP_PASS; __builtin_memcpy(key.ipv6_src, &ip6h->saddr, 16); __builtin_memcpy(key.ipv6_dst, &ip6h->daddr, 16); __u64 l4_off = nh_off + sizeof(*ip6h); int l4_proto = skip_ip6hdrext(data, data_end, &l4_off, ip6h->nexthdr); if (l4_proto < 0) l4_proto = ip6h->nexthdr; key.proto = l4_proto; void *l4_hdr = (void *)((char *)data + l4_off); if ((void *)((char *)l4_hdr + 4) <= data_end) { if (l4_proto == IPPROTO_TCP || l4_proto == IPPROTO_UDP) { __u16 *ports = l4_hdr; key.sport = ports[0]; key.dport = ports[1]; fib_params.sport = ports[0]; fib_params.dport = ports[1]; } } fib_params.family = AF_INET6; __be32 flow = *(__be32 *)ip6h & IPV6_FLOWINFO_MASK; fib_params.flowinfo = flow; fib_params.l4_protocol = l4_proto; fib_params.tot_len = bpf_ntohs(ip6h->payload_len); __builtin_memcpy(fib_params.ipv6_src, &ip6h->saddr, 16); __builtin_memcpy(fib_params.ipv6_dst, &ip6h->daddr, 16); } else { return XDP_PASS; } fib_params.ifindex = ctx->ingress_ifindex; int rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); if (rc == 0) { record_stats(&key, bytes); __u16 egress_vlan = get_interface_vlan(ctx, fib_params.ifindex); if (egress_vlan > 0 && !had_vlan) { /* Need to add VLAN tag */ if (insert_vlan_tag(ctx, egress_vlan) < 0) return XDP_DROP; } else if (egress_vlan == 0 && had_vlan) { /* Need to remove VLAN tag */ if (remove_vlan_tag(ctx) < 0) { /* Keep VLAN if removal fails */ } } else if (egress_vlan > 0 && had_vlan && egress_vlan != orig_vlan_id) { /* Need to change VLAN ID - reload pointers first */ data = (void *)(long)ctx->data; data_end = (void *)(long)ctx->data_end; eth = data; if ((void *)(eth + 1) > data_end) return XDP_DROP; if (eth->h_proto == bpf_htons(ETH_P_8021Q) || eth->h_proto == bpf_htons(ETH_P_8021AD)) { struct vlan_hdr *vlan = (struct vlan_hdr *)(eth + 1); if ((void *)(vlan + 1) > data_end) return XDP_DROP; vlan->h_vlan_TCI = bpf_htons(egress_vlan & 0x0FFF); } } /* CRITICAL: Always reload pointers after FIB lookup to satisfy verifier */ data = (void *)(long)ctx->data; data_end = (void *)(long)ctx->data_end; eth = data; /* Re-establish packet bounds for verifier */ if ((void *)(eth + 1) > data_end) return XDP_DROP; nh_off = sizeof(*eth); /* Skip VLAN header if present */ if (eth->h_proto == bpf_htons(ETH_P_8021Q) || eth->h_proto == bpf_htons(ETH_P_8021AD)) { nh_off += sizeof(struct vlan_hdr); } /* Verify nh_off is within bounds */ if ((void *)((char *)data + nh_off) > data_end) return XDP_DROP; /* Decrease TTL/hop_limit */ if (h_proto == bpf_htons(ETH_P_IP)) { struct iphdr *iph = (void *)((char *)data + nh_off); if ((void *)(iph + 1) > data_end) return XDP_DROP; ip_decrease_ttl(iph); } else if (h_proto == bpf_htons(ETH_P_IPV6)) { struct ipv6hdr *ip6h = (void *)((char *)data + nh_off); if ((void *)(ip6h + 1) > data_end) return XDP_DROP; ip6h->hop_limit--; } /* Update MAC addresses - verify eth is still valid */ if ((void *)(eth + 1) > data_end) return XDP_DROP; __builtin_memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); __builtin_memcpy(eth->h_source, fib_params.smac, ETH_ALEN); return bpf_redirect(fib_params.ifindex, 0); } return XDP_PASS; } SEC("xdp") int xdp_l3fwd_prog(struct xdp_md *ctx) { return xdp_l3fwd_flags(ctx, 0); } SEC("xdp") int xdp_l3fwd_direct_prog(struct xdp_md *ctx) { return xdp_l3fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT); } char _license[] SEC("license") = "GPL";