anolis: pidns: add pid_max per namespace
ANBZ: #4226 On x86_64 machines with large amounts of RAM, people running containers set pid_max on the host to large values so that more containers can be launched. At the same time, containers running 32-bit software experience problems with large pids: ps calls readdir/stat on proc entries, and the inode's i_ino happens to be too big for the 32-bit API. Thus, the ability to limit the pid value inside a container is required. The per-namespace pid_max can be set through '/proc/sys/kernel/pid_max'. Signed-off-by: Pavel Emelyanov <xemul@parallels.com> Signed-off-by: Chao Wu <chaowu@linux.alibaba.com> Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com> Link: https://gitee.com/anolis/cloud-kernel/pulls/1288
This commit is contained in:
parent
4293ddbee4
commit
79e997af0a
|
@ -110,8 +110,10 @@ extern void transfer_pid(struct task_struct *old, struct task_struct *new,
|
|||
struct pid_namespace;
|
||||
extern struct pid_namespace init_pid_ns;
|
||||
|
||||
#ifndef CONFIG_MAX_PID_PER_NS
|
||||
extern int pid_max;
|
||||
extern int pid_max_min, pid_max_max;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* look up a PID in the hash table. Must be called with the tasklist_lock
|
||||
|
|
|
@ -21,6 +21,9 @@ struct pid_namespace {
|
|||
struct kref kref;
|
||||
struct idr idr;
|
||||
struct rcu_head rcu;
|
||||
#ifdef CONFIG_MAX_PID_PER_NS
|
||||
int pid_max;
|
||||
#endif
|
||||
unsigned int pid_allocated;
|
||||
struct task_struct *child_reaper;
|
||||
struct kmem_cache *pid_cachep;
|
||||
|
|
|
@ -1144,6 +1144,14 @@ config RICH_CONTAINER_CG_SWITCH
|
|||
in rich container. If Y, some new interface will be created in per
|
||||
cgroup directory, and if N, nothing will be created.
|
||||
|
||||
config MAX_PID_PER_NS
|
||||
bool "Max Pid per namespace"
|
||||
default n
|
||||
help
|
||||
This gives each PID namespace its own pid_max value, set through
|
||||
"/proc/sys/kernel/pid_max". It may break compatibility with sysctl.pid_max;
|
||||
say N if you don't know what this feature means.
|
||||
|
||||
config CGROUP_CPUACCT
|
||||
bool "Simple CPU accounting controller"
|
||||
help
|
||||
|
|
67
kernel/pid.c
67
kernel/pid.c
|
@ -63,8 +63,13 @@ int pid_max = PID_MAX_DEFAULT;
|
|||
|
||||
#define RESERVED_PIDS 300
|
||||
|
||||
/*
 * Lower and upper bounds accepted for pid_max.
 *
 * With CONFIG_MAX_PID_PER_NS the only sysctl table that references these
 * bounds (pid_ctl_table) lives in this file, so they are file-local.
 * Without it they stay global because the "pid_max" entry in kern_table
 * takes their addresses as extra1/extra2.
 */
#ifdef CONFIG_MAX_PID_PER_NS
|
||||
static int pid_max_min = RESERVED_PIDS + 1;
|
||||
static int pid_max_max = PID_MAX_LIMIT;
|
||||
#else
|
||||
int pid_max_min = RESERVED_PIDS + 1;
|
||||
int pid_max_max = PID_MAX_LIMIT;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* PID-map pages start out as NULL, they get allocated upon
|
||||
|
@ -187,12 +192,21 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
|
|||
for (i = ns->level; i >= 0; i--) {
|
||||
int tid = 0;
|
||||
|
||||
#ifdef CONFIG_MAX_PID_PER_NS
|
||||
int pid_max_local = tmp->pid_max;
|
||||
#endif
|
||||
|
||||
if (set_tid_size) {
|
||||
tid = set_tid[ns->level - i];
|
||||
|
||||
retval = -EINVAL;
|
||||
#ifdef CONFIG_MAX_PID_PER_NS
|
||||
if (tid < 1 || tid >= pid_max_local)
|
||||
goto out_free;
|
||||
#else
|
||||
if (tid < 1 || tid >= pid_max)
|
||||
goto out_free;
|
||||
#endif
|
||||
/*
|
||||
* Also fail if a PID != 1 is requested and
|
||||
* no PID 1 exists.
|
||||
|
@ -230,8 +244,13 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
|
|||
* Store a null pointer so find_pid_ns does not find
|
||||
* a partially initialized PID (see below).
|
||||
*/
|
||||
#ifdef CONFIG_MAX_PID_PER_NS
|
||||
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
|
||||
pid_max, GFP_ATOMIC);
|
||||
pid_max_local, GFP_ATOMIC);
|
||||
#else
|
||||
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max,
|
||||
GFP_ATOMIC);
|
||||
#endif
|
||||
}
|
||||
spin_unlock_irq(&pidmap_lock);
|
||||
idr_preload_end();
|
||||
|
@ -605,22 +624,62 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
|
|||
return fd;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MAX_PID_PER_NS
|
||||
/*
 * sysctl handler for "pid_max" when pid_max is kept per pid namespace.
 *
 * Works on a stack copy of @table whose .data is redirected to the pid_max
 * field of the calling task's active pid namespace, then hands the copy to
 * proc_dointvec_minmax(). All other fields, including the extra1/extra2
 * bounds, are taken unchanged from @table.
 *
 * Returns whatever proc_dointvec_minmax() returns.
 */
static int proc_dointvec_pidmax(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
struct ctl_table tmp;
|
||||
|
||||
tmp = *table;
|
||||
/* Target the caller's namespace instead of the .data set in the table. */
tmp.data = &(task_active_pid_ns(current)->pid_max);
|
||||
|
||||
return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
|
||||
}
|
||||
|
||||
/*
 * sysctl table providing the per-namespace "pid_max" entry.
 *
 * .data names init_pid_ns.pid_max, but proc_dointvec_pidmax() replaces it
 * on every access with the pid_max of the caller's active pid namespace.
 * Accepted values are bounded by pid_max_min and pid_max_max.
 */
static struct ctl_table pid_ctl_table[] = {
|
||||
{
|
||||
.procname = "pid_max",
|
||||
.data = &init_pid_ns.pid_max,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_pidmax,
|
||||
.extra1 = &pid_max_min,
|
||||
.extra2 = &pid_max_max,
|
||||
},
|
||||
{ }
|
||||
};
|
||||
|
||||
/* sysctl path "kernel", under which pid_idr_init() registers pid_ctl_table. */
static struct ctl_path pid_kern_path[] = { { .procname = "kernel" }, {} };
|
||||
#endif
|
||||
|
||||
/*
 * Boot-time setup of pid allocation state for init_pid_ns.
 *
 * Scales the default pid_max and the minimum allowed pid_max by the number
 * of possible CPUs, logs the result, initialises init_pid_ns.idr and creates
 * the struct pid slab cache. With CONFIG_MAX_PID_PER_NS the scaled default
 * is stored in init_pid_ns.pid_max rather than the global pid_max, and the
 * per-namespace "pid_max" sysctl table is registered.
 */
void __init pid_idr_init(void)
|
||||
{
|
||||
/* Verify no one has done anything silly: */
|
||||
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
|
||||
|
||||
/* bump default and minimum pid_max based on number of cpus */
|
||||
#ifdef CONFIG_MAX_PID_PER_NS
|
||||
init_pid_ns.pid_max = min(pid_max_max, max_t(int, pid_max,
|
||||
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
|
||||
#else
|
||||
pid_max = min(pid_max_max, max_t(int, pid_max,
|
||||
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
|
||||
/*
 * NOTE(review): the same pid_max_min assignment is repeated unconditionally
 * right after the #endif below. This copy is presumably a removed diff line
 * that the rendering kept - confirm against the actual tree.
 */
pid_max_min = max_t(int, pid_max_min,
|
||||
PIDS_PER_CPU_MIN * num_possible_cpus());
|
||||
#endif
|
||||
/* The minimum bound is raised for both configurations. */
pid_max_min =
|
||||
max_t(int, pid_max_min, PIDS_PER_CPU_MIN * num_possible_cpus());
|
||||
/* NOTE(review): both pr_info() calls print int values with %u. */
#ifdef CONFIG_MAX_PID_PER_NS
|
||||
pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max,
|
||||
pid_max_min);
|
||||
#else
|
||||
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
|
||||
|
||||
#endif
|
||||
idr_init(&init_pid_ns.idr);
|
||||
|
||||
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
|
||||
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
|
||||
/* Expose the per-namespace pid_max entry under the "kernel" sysctl path. */
#ifdef CONFIG_MAX_PID_PER_NS
|
||||
register_sysctl_paths(pid_kern_path, pid_ctl_table);
|
||||
#endif
|
||||
}
|
||||
|
||||
static struct file *__pidfd_fget(struct task_struct *task, int fd)
|
||||
|
|
|
@ -131,6 +131,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
|
|||
kref_init(&ns->kref);
|
||||
ns->level = level;
|
||||
ns->parent = get_pid_ns(parent_pid_ns);
|
||||
#ifdef CONFIG_MAX_PID_PER_NS
|
||||
ns->pid_max = parent_pid_ns->pid_max;
|
||||
#endif
|
||||
ns->user_ns = get_user_ns(user_ns);
|
||||
ns->ucounts = ucounts;
|
||||
ns->pid_allocated = PIDNS_ADDING;
|
||||
|
|
|
@ -2371,6 +2371,7 @@ static struct ctl_table kern_table[] = {
|
|||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
#endif /* CONFIG_SMP */
|
||||
#ifndef CONFIG_MAX_PID_PER_NS
|
||||
{
|
||||
.procname = "pid_max",
|
||||
.data = &pid_max,
|
||||
|
@ -2380,6 +2381,7 @@ static struct ctl_table kern_table[] = {
|
|||
.extra1 = &pid_max_min,
|
||||
.extra2 = &pid_max_max,
|
||||
},
|
||||
#endif /* CONFIG_MAX_PID_PER_NS */
|
||||
{
|
||||
.procname = "panic_on_oops",
|
||||
.data = &panic_on_oops,
|
||||
|
|
Loading…
Reference in New Issue