anolis: pidns: add pid_max per namespace

ANBZ: #4226

On x86_64 hosts with a lot of RAM, people running containers set pid_max on
the host to large values to be able to launch more containers. At the same
time, containers running 32-bit software experience problems with large
pids: ps calls readdir/stat on proc entries, and the inode's i_ino happens
to be too big for the 32-bit API.
Thus, the ability to limit the pid value inside a container is required.
The per-namespace pid_max can be set through '/proc/sys/kernel/pid_max'.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Chao Wu <chaowu@linux.alibaba.com>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1288
This commit is contained in:
Chao Wu 2023-02-27 10:58:53 +08:00 committed by 小龙
parent 4293ddbee4
commit 79e997af0a
6 changed files with 81 additions and 4 deletions

View File

@ -110,8 +110,10 @@ extern void transfer_pid(struct task_struct *old, struct task_struct *new,
struct pid_namespace;
extern struct pid_namespace init_pid_ns;
/*
 * The global pid_max and its limits are only declared here when
 * CONFIG_MAX_PID_PER_NS is not set.
 */
#ifndef CONFIG_MAX_PID_PER_NS
extern int pid_max;
extern int pid_max_min, pid_max_max;
#endif
/*
* look up a PID in the hash table. Must be called with the tasklist_lock

View File

@ -21,6 +21,9 @@ struct pid_namespace {
struct kref kref;
struct idr idr;
struct rcu_head rcu;
#ifdef CONFIG_MAX_PID_PER_NS
int pid_max;
#endif
unsigned int pid_allocated;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;

View File

@ -1144,6 +1144,14 @@ config RICH_CONTAINER_CG_SWITCH
in rich container. If Y, some new interface will be created in per
cgroup directory, and if N, nothing will be created.
config MAX_PID_PER_NS
bool "Max Pid per namespace"
default n
help
This gives each pid namespace its own pid_max value, set through
"/proc/sys/kernel/pid_max". It may break compatibility with the existing
pid_max sysctl; say N if you don't know what this feature means.
config CGROUP_CPUACCT
bool "Simple CPU accounting controller"
help

View File

@ -63,8 +63,13 @@ int pid_max = PID_MAX_DEFAULT;
#define RESERVED_PIDS 300
/*
 * Lower and upper limits for pid_max, referenced as .extra1/.extra2 by the
 * "pid_max" sysctl entry. With CONFIG_MAX_PID_PER_NS that sysctl entry is
 * defined in this file, so the limits are file-local; otherwise they have
 * external linkage so a table defined elsewhere can reference them.
 */
#ifdef CONFIG_MAX_PID_PER_NS
static int pid_max_min = RESERVED_PIDS + 1;
static int pid_max_max = PID_MAX_LIMIT;
#else
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
#endif
/*
* PID-map pages start out as NULL, they get allocated upon
@ -187,12 +192,21 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
for (i = ns->level; i >= 0; i--) {
int tid = 0;
#ifdef CONFIG_MAX_PID_PER_NS
int pid_max_local = tmp->pid_max;
#endif
if (set_tid_size) {
tid = set_tid[ns->level - i];
retval = -EINVAL;
#ifdef CONFIG_MAX_PID_PER_NS
if (tid < 1 || tid >= pid_max_local)
goto out_free;
#else
if (tid < 1 || tid >= pid_max)
goto out_free;
#endif
/*
* Also fail if a PID != 1 is requested and
* no PID 1 exists.
@ -230,8 +244,13 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* Store a null pointer so find_pid_ns does not find
* a partially initialized PID (see below).
*/
#ifdef CONFIG_MAX_PID_PER_NS
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
pid_max_local, GFP_ATOMIC);
#else
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max,
GFP_ATOMIC);
#endif
}
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
@ -605,22 +624,62 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
return fd;
}
#ifdef CONFIG_MAX_PID_PER_NS
/*
 * sysctl handler for "pid_max" when the limit is kept per PID namespace.
 *
 * Works on a private copy of @table whose data pointer is redirected to the
 * pid_max field of the calling task's active PID namespace, then lets
 * proc_dointvec_minmax() do the actual read or write. The registered table
 * itself is never modified.
 *
 * Returns whatever proc_dointvec_minmax() returns.
 */
static int proc_dointvec_pidmax(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct pid_namespace *active_ns = task_active_pid_ns(current);
	struct ctl_table ns_table = *table;

	ns_table.data = &active_ns->pid_max;

	return proc_dointvec_minmax(&ns_table, write, buffer, lenp, ppos);
}
/*
 * sysctl table holding the per-namespace "pid_max" entry.
 *
 * .data names init_pid_ns.pid_max, but proc_dointvec_pidmax() replaces it
 * with the pid_max of the caller's active PID namespace on every access.
 * .extra1/.extra2 point at pid_max_min/pid_max_max, which
 * proc_dointvec_pidmax() passes on to proc_dointvec_minmax() unchanged.
 */
static struct ctl_table pid_ctl_table[] = {
{
.procname = "pid_max",
.data = &init_pid_ns.pid_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_pidmax,
.extra1 = &pid_max_min,
.extra2 = &pid_max_max,
},
{ }
};
/* Path ("kernel") under which pid_ctl_table is registered in pid_idr_init(). */
static struct ctl_path pid_kern_path[] = { { .procname = "kernel" }, {} };
#endif
/*
 * Boot-time setup for PID allocation in the initial PID namespace.
 *
 * Scales the default pid_max and the pid_max lower bound by the number of
 * possible CPUs, logs the result, initializes init_pid_ns.idr and
 * init_pid_ns.pid_cachep, and, with CONFIG_MAX_PID_PER_NS, registers the
 * "pid_max" sysctl table under "kernel".
 */
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
#ifdef CONFIG_MAX_PID_PER_NS
/* Per-namespace mode: the scaled default is stored in init_pid_ns. */
init_pid_ns.pid_max = min(pid_max_max, max_t(int, pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
#else
pid_max = min(pid_max_max, max_t(int, pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
#endif
/*
 * NOTE(review): pid_max_min is raised here for both configurations using
 * the same expression as the statement inside the #else branch above, so
 * that earlier statement looks redundant -- confirm it is not a leftover.
 */
pid_max_min =
max_t(int, pid_max_min, PIDS_PER_CPU_MIN * num_possible_cpus());
#ifdef CONFIG_MAX_PID_PER_NS
pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max,
pid_max_min);
#else
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
#endif
idr_init(&init_pid_ns.idr);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
#ifdef CONFIG_MAX_PID_PER_NS
/* Expose the per-namespace pid_max sysctl under the "kernel" directory. */
register_sysctl_paths(pid_kern_path, pid_ctl_table);
#endif
}
static struct file *__pidfd_fget(struct task_struct *task, int fd)

View File

@ -131,6 +131,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
kref_init(&ns->kref);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
#ifdef CONFIG_MAX_PID_PER_NS
ns->pid_max = parent_pid_ns->pid_max;
#endif
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;

View File

@ -2371,6 +2371,7 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SMP */
#ifndef CONFIG_MAX_PID_PER_NS
{
.procname = "pid_max",
.data = &pid_max,
@ -2380,6 +2381,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &pid_max_min,
.extra2 = &pid_max_max,
},
#endif /* CONFIG_MAX_PID_PER_NS */
{
.procname = "panic_on_oops",
.data = &panic_on_oops,