huatuo/bpf/dropwatch.c

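// dropwatch: trace TCP packet drops via the skb:kfree_skb tracepoint and push
// per-drop events (4-tuple, TCP seq/ack, socket state, kernel stack) to
// userspace through a perf event buffer. The kprobe-based SYN-flood and
// listen-overflow probes at the bottom of this file are currently compiled out.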

#include "vmlinux.h"
#include "vmlinux_net.h"
#include "bpf_common.h"
#include "bpf_ratelimit.h"
#define TYPE_TCP_COMMON_DROP 1
#define TYPE_TCP_SYN_FLOOD 2
#define TYPE_TCP_LISTEN_OVERFLOW_HANDSHAKE1 3
#define TYPE_TCP_LISTEN_OVERFLOW_HANDSHAKE3 4
#define SK_FL_PROTO_SHIFT 8
#define SK_FL_PROTO_MASK 0x0000ff00
#define SK_FL_TYPE_SHIFT 16
#define SK_FL_TYPE_MASK 0xffff0000
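
// Event record pushed to userspace for every reported drop: the task that freed
// the skb, the packet's addresses/ports and TCP seq/ack, skb length and queue,
// the socket state, the drop type, and the kernel stack at the free site.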
struct perf_event_t {
        u64 tgid_pid;
        u32 saddr;
        u32 daddr;
        u16 sport;
        u16 dport;
        u32 seq;
        u32 ack_seq;
        u32 queue_mapping;
        u64 pkt_len;
        s64 stack_size;
        u64 stack[PERF_MAX_STACK_DEPTH];
        u32 sk_max_ack_backlog;
        u8 state;
        u8 type;
        char comm[TASK_COMM_LEN];
};

/* format: /sys/kernel/debug/tracing/events/skb/kfree_skb/format */
struct kfree_skb_args {
        unsigned long long pad;
        void *skbaddr;
        void *location;
        u16 protocol;
};
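
// Perf event array used to stream drop events to the userspace reader.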
struct {
        __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(u32));
} perf_events SEC(".maps");
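
// Single-entry per-CPU scratch buffer used to assemble the event: struct
// perf_event_t embeds a full stack trace and is too large to build on the
// 512-byte BPF stack, so each CPU stages it here instead.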
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(max_entries, 1);
        __uint(key_size, sizeof(u32));
        __uint(value_size, sizeof(struct perf_event_t));
} dropwatch_stackmap SEC(".maps");

char __license[] SEC("license") = "Dual MIT/GPL";
static const struct perf_event_t zero_data = {};
static const u32 stackmap_key = 0;
BPF_RATELIMIT(rate, 1, 100); // 100/s
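
// CO-RE struct "flavor": libbpf strips the ___5_10 suffix when matching against
// the running kernel's BTF, so these fields relocate onto struct sock on kernels
// where sk_type/sk_protocol are plain u16 members.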
struct sock___5_10 {
        u16 sk_type;
        u16 sk_protocol;
} __attribute__((preserve_access_index));

static void sk_get_type_and_protocol(struct sock *sk, u16 *protocol, u16 *type)
{
        // kernel version <= 4.18
        //
        // struct sock {
        //         unsigned int __sk_flags_offset[0];
        // #ifdef __BIG_ENDIAN_BITFIELD
        // #define SK_FL_PROTO_SHIFT 16
        // #define SK_FL_PROTO_MASK 0x00ff0000
        // #
        // #define SK_FL_TYPE_SHIFT 0
        // #define SK_FL_TYPE_MASK 0x0000ffff
        // #else
        // #define SK_FL_PROTO_SHIFT 8
        // #define SK_FL_PROTO_MASK 0x0000ff00
        // #
        // #define SK_FL_TYPE_SHIFT 16
        // #define SK_FL_TYPE_MASK 0xffff0000
        // #endif
        //
        //         unsigned int sk_padding : 1,
        //                      sk_kern_sock : 1,
        //                      sk_no_check_tx : 1,
        //                      sk_no_check_rx : 1,
        //                      sk_userlocks : 4,
        //                      sk_protocol : 8,
        //                      sk_type : 16;
        // }
        if (bpf_core_field_exists(sk->__sk_flags_offset)) {
                u32 sk_flags;

                bpf_probe_read(&sk_flags, sizeof(sk_flags), &sk->__sk_flags_offset);
                *protocol = (sk_flags & SK_FL_PROTO_MASK) >> SK_FL_PROTO_SHIFT;
                *type = (sk_flags & SK_FL_TYPE_MASK) >> SK_FL_TYPE_SHIFT;
                return;
        }

        // kernel version >= 5.10
        //
        // struct sock {
        //         u16 sk_type;
        //         u16 sk_protocol;
        // }
        struct sock___5_10 *sk_new = (struct sock___5_10 *)sk;

        *protocol = BPF_CORE_READ(sk_new, sk_protocol);
        *type = BPF_CORE_READ(sk_new, sk_type);
        return;
}
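
// Attached to the skb:kfree_skb tracepoint: fires for every freed skb, keeps
// only IPv4/TCP packets on SOCK_STREAM sockets that are not in TCP_CLOSE,
// applies the rate limit, then emits a TYPE_TCP_COMMON_DROP event including
// the kernel stack at the free site.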
SEC("tracepoint/skb/kfree_skb")
int bpf_kfree_skb_prog(struct kfree_skb_args *ctx)
{
        struct sk_buff *skb = ctx->skbaddr;
        struct perf_event_t *data = NULL;
        struct sock_common *sk_common;
        struct tcphdr tcphdr;
        struct iphdr iphdr;
        struct sock *sk;
        u16 protocol = 0;
        u16 type = 0;
        u8 state = 0;

        /* only for IP && TCP */
        if (ctx->protocol != ETH_P_IP)
                return 0;

        bpf_probe_read(&iphdr, sizeof(iphdr), skb_network_header(skb));
        if (iphdr.protocol != IPPROTO_TCP)
                return 0;

        sk = BPF_CORE_READ(skb, sk);
        if (!sk)
                return 0;

        sk_common = (struct sock_common *)sk;
        // filter the sock by AF_INET, SOCK_STREAM, IPPROTO_TCP
        if (BPF_CORE_READ(sk_common, skc_family) != AF_INET)
                return 0;

        sk_get_type_and_protocol(sk, &protocol, &type);
        if ((u8)protocol != IPPROTO_TCP || type != SOCK_STREAM)
                return 0;

        // skip sockets already in TCP_CLOSE or with an uninitialized state
        state = BPF_CORE_READ(sk_common, skc_state);
        if (state == TCP_CLOSE || state == 0)
                return 0;

        // ratelimit
        if (bpf_ratelimited(&rate))
                return 0;

        data = bpf_map_lookup_elem(&dropwatch_stackmap, &stackmap_key);
        if (!data)
                return 0;

        bpf_probe_read(&tcphdr, sizeof(tcphdr), skb_transport_header(skb));

        /* event */
        data->tgid_pid = bpf_get_current_pid_tgid();
        bpf_get_current_comm(&data->comm, sizeof(data->comm));
        data->type = TYPE_TCP_COMMON_DROP;
        data->state = state;
        data->saddr = iphdr.saddr;
        data->daddr = iphdr.daddr;
        data->sport = tcphdr.source;
        data->dport = tcphdr.dest;
        data->seq = tcphdr.seq;
        data->ack_seq = tcphdr.ack_seq;
        data->pkt_len = BPF_CORE_READ(skb, len);
        data->queue_mapping = BPF_CORE_READ(skb, queue_mapping);
        data->stack_size = bpf_get_stack(ctx, data->stack, sizeof(data->stack), 0);
        data->sk_max_ack_backlog = 0; // ignore sk_max_ack_backlog in dropwatch case.

        // output
        bpf_perf_event_output(ctx, &perf_events, BPF_F_CURRENT_CPU, data, sizeof(*data));
        // clean
        bpf_map_update_elem(&dropwatch_stackmap, &stackmap_key, &zero_data, BPF_EXIST);
        return 0;
}

// The current kernel does not handle kprobe+offset probes well; this code stays
// disabled until kpatch support lands.
#if 0
static int fill_overflow_event(void *ctx, u8 type, struct sock *sk, struct sk_buff *skb)
{
        struct perf_event_t *data = NULL;
        struct iphdr iphdr;
        struct tcphdr tcphdr;

        data = bpf_map_lookup_elem(&dropwatch_stackmap, &stackmap_key);
        if (!data)
                return 0;

        bpf_probe_read(&iphdr, sizeof(iphdr), skb_network_header(skb));
        bpf_probe_read(&tcphdr, sizeof(tcphdr), skb_transport_header(skb));

        /* event */
        data->tgid_pid = bpf_get_current_pid_tgid();
        bpf_get_current_comm(&data->comm, sizeof(data->comm));
        data->type = type;
        data->state = 0;
        data->saddr = iphdr.saddr;
        data->daddr = iphdr.daddr;
        data->sport = tcphdr.source;
        data->dport = tcphdr.dest;
        data->seq = tcphdr.seq;
        data->ack_seq = tcphdr.ack_seq;
        data->pkt_len = BPF_CORE_READ(skb, len);
        data->queue_mapping = BPF_CORE_READ(skb, queue_mapping);
        data->stack_size = 0; // the stack trace is not collected for overflow events.
        data->sk_max_ack_backlog = BPF_CORE_READ(sk, sk_max_ack_backlog);

        // output
        bpf_perf_event_output(ctx, &perf_events, BPF_F_CURRENT_CPU, data, sizeof(*data));
        // clean
        bpf_map_update_elem(&dropwatch_stackmap, &stackmap_key, &zero_data, BPF_EXIST);
        return 0;
}
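
// These probes attach at fixed instruction offsets inside the functions, so the
// arguments cannot be taken via PT_REGS_PARM(); they are recovered from the
// registers noted at each probe site, and both the offsets and the register
// mapping are specific to a particular kernel build.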
// the dropwatch case: syn_flood.
SEC("kprobe/tcp_conn_request+1290")
int bpf_tcp_syn_flood_action_prog(struct pt_regs *ctx)
{
        // at this offset the registers hold the arguments of tcp_syn_flood_action:
        // %r15: struct sock *sk
        // %r13: struct sk_buff *skb
        struct sock *sk = (void *)ctx->r15;
        struct sk_buff *skb = (void *)ctx->r13;

        // ratelimit
        if (bpf_ratelimited(&rate))
                return 0;

        // fill
        return fill_overflow_event(ctx, TYPE_TCP_SYN_FLOOD, sk, skb);
}

// the dropwatch case: listen-overflow in the TCP_CLOSE state (client: TCP_SYN_SENT).
SEC("kprobe/tcp_conn_request+167")
int bpf_tcp_listen_overflow_handshake1_prog(struct pt_regs *ctx)
{
        // registers at this offset:
        // %r15: struct sock *sk
        // %r13: struct sk_buff *skb
        struct sock *sk = (void *)ctx->r15;
        struct sk_buff *skb = (void *)ctx->r13;

        // ratelimit
        if (bpf_ratelimited(&rate))
                return 0;

        // fill
        return fill_overflow_event(ctx, TYPE_TCP_LISTEN_OVERFLOW_HANDSHAKE1, sk, skb);
}

// the dropwatch case: listen-overflow in the TCP_NEW_SYN_RECV state (client: TCP_ESTABLISHED).
SEC("kprobe/tcp_v4_syn_recv_sock+700")
int bpf_tcp_listen_overflow_handshake3_prog(struct pt_regs *ctx)
{
        // registers at this offset:
        // %rdi: struct sock *sk
        // %rsi: struct sk_buff *skb
        // %r15: struct request_sock *req
        struct sock *sk = (void *)ctx->di;
        struct sk_buff *skb = (void *)ctx->si;

        // ratelimit
        if (bpf_ratelimited(&rate))
                return 0;

        // fill
        return fill_overflow_event(ctx, TYPE_TCP_LISTEN_OVERFLOW_HANDSHAKE3, sk, skb);
}
#endif