anolis: bpf, cgroup: Introduce CGROUP_RICH_CONTAINER program type

ANBZ: #6193

The old rich container functionality cannot be used with cgroup v2 and
cannot meet the requirements of different container resource views.
To address this, the cgroup v2 eBPF infrastructure is leveraged: a new
program type is introduced so that the resources of the container are
calculated inside an eBPF program, meeting the customized requirements
of the container.

With BPF_PROG_TYPE_CGROUP_RICH_CONTAINER, the kernel passes a pointer to
a struct bpf_rich_container_info to the user's eBPF program, which can
customize the CPU and memory information held in that structure, allowing
the processes inside the container to see the specified information.

Signed-off-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Signed-off-by: Yi Tao <escape@linux.alibaba.com>
Reviewed-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/2065
This commit is contained in:
Yi Tao 2023-08-21 11:32:26 +08:00 committed by maqiao-alibaba
parent 95f19b8867
commit c716adc815
7 changed files with 111 additions and 0 deletions

View File

@ -22,6 +22,7 @@ struct bpf_cgroup_storage;
struct ctl_table;
struct ctl_table_header;
struct task_struct;
struct bpf_rich_container_info;
#ifdef CONFIG_CGROUP_BPF
enum cgroup_bpf_attach_type {
@ -49,6 +50,8 @@ enum cgroup_bpf_attach_type {
CGROUP_INET4_GETSOCKNAME,
CGROUP_INET6_GETSOCKNAME,
CGROUP_INET_SOCK_RELEASE,
CGROUP_RICH_CONTAINER_CPU,
CGROUP_RICH_CONTAINER_MEM,
MAX_CGROUP_BPF_ATTACH_TYPE,
ANOLIS_KABI_MAX_CG_BPF_ATTACH = 38
};
@ -89,6 +92,8 @@ to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type)
CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME);
CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME);
CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE);
CGROUP_ATYPE(CGROUP_RICH_CONTAINER_CPU);
CGROUP_ATYPE(CGROUP_RICH_CONTAINER_MEM);
default:
return CGROUP_BPF_ATTACH_TYPE_INVALID;
}
@ -225,6 +230,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
char **buf, size_t *pcount, loff_t *ppos,
enum cgroup_bpf_attach_type atype);
/*
 * Run the effective cgroup BPF programs attached for @atype, with @info as
 * their context, and return the result of that run.
 *
 * Invoked through the BPF_CGROUP_RUN_PROG_RICH_CONTAINER_CPU() and
 * BPF_CGROUP_RUN_PROG_RICH_CONTAINER_MEM() wrappers.
 */
int __cgroup_bpf_run_filter_rich_container(
struct bpf_rich_container_info *info,
enum cgroup_bpf_attach_type atype);
int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
int *optname, char __user *optval,
int *optlen, char **kernel_optval);
@ -486,6 +495,22 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
__ret; \
})
/*
 * Evaluates to @retval unless cgroup BPF is enabled, in which case the
 * CGROUP_RICH_CONTAINER_CPU programs are run on @info and their result is
 * used instead. @retval is always evaluated.
 */
#define BPF_CGROUP_RUN_PROG_RICH_CONTAINER_CPU(info, retval)		\
({									\
	int __rc = (retval);						\
									\
	if (cgroup_bpf_enabled)						\
		__rc = __cgroup_bpf_run_filter_rich_container((info),	\
					CGROUP_RICH_CONTAINER_CPU);	\
	__rc;								\
})
/*
 * Evaluates to @retval unless cgroup BPF is enabled, in which case the
 * CGROUP_RICH_CONTAINER_MEM programs are run on @info and their result is
 * used instead. @retval is always evaluated.
 */
#define BPF_CGROUP_RUN_PROG_RICH_CONTAINER_MEM(info, retval)		\
({									\
	int __rc = (retval);						\
									\
	if (cgroup_bpf_enabled)						\
		__rc = __cgroup_bpf_run_filter_rich_container((info),	\
					CGROUP_RICH_CONTAINER_MEM);	\
	__rc;								\
})
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog);
int cgroup_bpf_prog_detach(const union bpf_attr *attr,

View File

@ -56,6 +56,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl,
struct bpf_sysctl, struct bpf_sysctl_kern)
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt,
struct bpf_sockopt, struct bpf_sockopt_kern)
/*
 * Rich-container programs: the same struct bpf_rich_container_info is given
 * for both context-type arguments of this entry.
 */
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_RICH_CONTAINER, cg_rich_container,
struct bpf_rich_container_info, struct bpf_rich_container_info)
#endif
#ifdef CONFIG_BPF_LIRC_MODE2
BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,

View File

@ -671,6 +671,12 @@ struct cftype {
CK_KABI_RESERVE(1)
};
/*
 * Context handed to BPF_PROG_TYPE_CGROUP_RICH_CONTAINER programs.
 *
 * Field order is relied upon by rich_container_is_valid_access():
 * BPF_CGROUP_RICH_CONTAINER_CPU programs may access cpus_mask only, while
 * BPF_CGROUP_RICH_CONTAINER_MEM programs may access the contiguous range that
 * starts at sysinfo and ends at the end of sysinfo_ext. Keep sysinfo and
 * sysinfo_ext adjacent and in this order.
 */
struct bpf_rich_container_info {
/* Window accessible to BPF_CGROUP_RICH_CONTAINER_CPU programs. */
cpumask_t cpus_mask;
/* Start of the window accessible to BPF_CGROUP_RICH_CONTAINER_MEM programs. */
struct sysinfo sysinfo;
/* Last field of the BPF_CGROUP_RICH_CONTAINER_MEM window. */
struct sysinfo_ext sysinfo_ext;
};
/*
* Control Group subsystem type.
* See Documentation/admin-guide/cgroup-v1/cgroups.rst for details

View File

@ -201,6 +201,8 @@ enum bpf_prog_type {
BPF_PROG_TYPE_EXT,
BPF_PROG_TYPE_LSM,
BPF_PROG_TYPE_SK_LOOKUP,
/* ======== anolis own features ======== */
BPF_PROG_TYPE_CGROUP_RICH_CONTAINER = 0x1000,
};
enum bpf_attach_type {
@ -242,6 +244,9 @@ enum bpf_attach_type {
BPF_XDP_CPUMAP,
BPF_SK_LOOKUP,
BPF_XDP,
/* ======== anolis own features ======== */
BPF_CGROUP_RICH_CONTAINER_CPU = 0x1000,
BPF_CGROUP_RICH_CONTAINER_MEM,
__MAX_BPF_ATTACH_TYPE
};

View File

@ -1313,6 +1313,19 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
return ret == 1 ? 0 : -EPERM;
}
/*
 * __cgroup_bpf_run_filter_rich_container - run rich-container BPF programs
 * @info: context passed to every program in the array
 * @atype: selects which effective program array of the cgroup is run
 *
 * Looks up the cgroup returned by task_dfl_cgroup(current) and runs its
 * effective programs for @atype under the RCU read lock.
 *
 * Return: the BPF_PROG_RUN_ARRAY() result, passed through unmodified.
 */
int __cgroup_bpf_run_filter_rich_container(struct bpf_rich_container_info *info,
					   enum cgroup_bpf_attach_type atype)
{
	struct cgroup *task_cgrp;
	int verdict;

	rcu_read_lock();
	task_cgrp = task_dfl_cgroup(current);
	verdict = BPF_PROG_RUN_ARRAY(task_cgrp->bpf.effective[atype], info,
				     BPF_PROG_RUN);
	rcu_read_unlock();

	return verdict;
}
#ifdef CONFIG_NET
static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
enum cgroup_bpf_attach_type attach_type)
@ -1941,3 +1954,46 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
const struct bpf_prog_ops cg_sockopt_prog_ops = {
};
/*
 * Resolve the helper prototype for @func_id on behalf of a rich-container
 * program: cgroup_base_func_proto() is consulted first, and
 * bpf_tracing_func_proto() is used only when that lookup yields NULL.
 */
static const struct bpf_func_proto *
rich_container_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	const struct bpf_func_proto *proto;

	proto = cgroup_base_func_proto(func_id, prog);
	if (proto)
		return proto;

	return bpf_tracing_func_proto(func_id, prog);
}
/*
 * Verifier callback: decide whether a context access covering the bytes
 * [off, off + size) is permitted for a rich-container program.
 *
 * The accessible window depends on the program's expected attach type:
 *   - BPF_CGROUP_RICH_CONTAINER_CPU: the cpus_mask field only;
 *   - BPF_CGROUP_RICH_CONTAINER_MEM: from sysinfo up to the end of
 *     sysinfo_ext.
 * Any other expected attach type is rejected.
 *
 * @type is not consulted, so reads and writes are treated alike, and @info
 * is left untouched.
 *
 * Returns true when the access lies entirely inside the window and @off is
 * a multiple of @size; false otherwise.
 */
static bool rich_container_is_valid_access(int off, int size,
					   enum bpf_access_type type,
					   const struct bpf_prog *prog,
					   struct bpf_insn_access_aux *info)
{
	int start_off, end_off;

	switch (prog->expected_attach_type) {
	case BPF_CGROUP_RICH_CONTAINER_CPU:
		start_off = offsetof(struct bpf_rich_container_info, cpus_mask);
		end_off = offsetofend(struct bpf_rich_container_info, cpus_mask);
		break;
	case BPF_CGROUP_RICH_CONTAINER_MEM:
		start_off = offsetof(struct bpf_rich_container_info, sysinfo);
		end_off = offsetofend(struct bpf_rich_container_info, sysinfo_ext);
		break;
	default:
		return false;
	}

	/*
	 * A non-positive size is never a valid access, and a zero size would
	 * make the modulo below undefined.
	 */
	if (size <= 0)
		return false;

	if (off < start_off || off >= end_off)
		return false;

	/*
	 * The last byte of the access must be inside the window too, not just
	 * the first one. Written as a subtraction so that off + size cannot
	 * overflow; end_off - off is positive here.
	 */
	if (size > end_off - off)
		return false;

	if (off % size != 0)
		return false;

	return true;
}
/*
 * Verifier hooks for rich-container programs: context access checking and
 * helper lookup. No other callbacks are set.
 */
const struct bpf_verifier_ops cg_rich_container_verifier_ops = {
	.is_valid_access	= rich_container_is_valid_access,
	.get_func_proto		= rich_container_func_proto,
};
/* Intentionally empty: this program type sets no bpf_prog_ops callbacks. */
const struct bpf_prog_ops cg_rich_container_prog_ops = {
};

View File

@ -2096,6 +2096,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
default:
return -EINVAL;
}
case BPF_PROG_TYPE_CGROUP_RICH_CONTAINER:
switch (expected_attach_type) {
case BPF_CGROUP_RICH_CONTAINER_CPU:
case BPF_CGROUP_RICH_CONTAINER_MEM:
return 0;
default:
return -EINVAL;
}
case BPF_PROG_TYPE_SK_LOOKUP:
if (expected_attach_type == BPF_SK_LOOKUP)
return 0;
@ -2152,6 +2160,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_LSM:
case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
case BPF_PROG_TYPE_EXT: /* extends any prog */
case BPF_PROG_TYPE_CGROUP_RICH_CONTAINER:
return true;
default:
return false;
@ -2958,6 +2967,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_SK_LOOKUP:
case BPF_PROG_TYPE_CGROUP_RICH_CONTAINER:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
case BPF_PROG_TYPE_CGROUP_SKB:
if (!capable(CAP_NET_ADMIN))
@ -3013,6 +3023,9 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_FLOW_DISSECTOR;
case BPF_CGROUP_SYSCTL:
return BPF_PROG_TYPE_CGROUP_SYSCTL;
case BPF_CGROUP_RICH_CONTAINER_CPU:
case BPF_CGROUP_RICH_CONTAINER_MEM:
return BPF_PROG_TYPE_CGROUP_RICH_CONTAINER;
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
@ -3075,6 +3088,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_CGROUP_RICH_CONTAINER:
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
default:
@ -3112,6 +3126,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_CGROUP_RICH_CONTAINER:
return cgroup_bpf_prog_detach(attr, ptype);
default:
return -EINVAL;
@ -4132,6 +4147,7 @@ static int link_create(union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_RICH_CONTAINER:
ret = cgroup_bpf_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_TRACING:

View File

@ -8394,6 +8394,7 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_RICH_CONTAINER:
break;
case BPF_PROG_TYPE_RAW_TRACEPOINT:
if (!env->prog->aux->attach_btf_id)