anolis: genirq/affinity: add support for limiting managed interrupts

ANBZ: #10929

Commit c410abbbac ("genirq/affinity: Add is_managed to struct irq_affinity_desc")
introduced the is_managed bit in struct irq_affinity_desc. Because that commit
treats queue interrupts as managed interrupts, systems with a large number of
devices (and therefore a massive number of MSI-X queue interrupts) reserve an
excessive number of IRQ matrix bits during interrupt allocation. This in turn
leads to situations where interrupts for some devices cannot be allocated.

Add support for limiting the number of managed interrupts on each node.

Signed-off-by: Guanjun <guanjun@linux.alibaba.com>
Reviewed-by: Zelin Deng <zelin.deng@linux.alibaba.com>
Reviewed-by: Cruz Zhao <CruzZhao@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/3856

Documentation/admin-guide/kernel-parameters.txt

@ -2717,6 +2717,15 @@
			different yeeloong laptops.
			Example: machtype=lemote-yeeloong-2f-7inch

	managed_irqs_per_node=
			[KNL] Limit the number of managed interrupts
			allocated on each NUMA node, to prevent interrupt
			allocation failures on systems with a large number
			of devices. The default is 0, which means the number
			of managed irqs is not limited.
			Format: integer between 0 and num_possible_cpus() / num_possible_nodes()
			Default: 0

	max_addr=nn[KMG]	[KNL,BOOT,ia64] All physical memory greater
			than or equal to this physical address is ignored.

include/linux/irq.h

@ -648,6 +648,7 @@ static inline void irq_force_complete_move(struct irq_desc *desc) { }
#endif
extern int no_irq_affinity;
extern unsigned int managed_irqs_per_node;
#ifdef CONFIG_HARDIRQS_SW_RESEND
int irq_set_parent(int irq, int parent_irq);

kernel/irq/affinity.c

@ -9,6 +9,31 @@
#include <linux/cpu.h>
#include <linux/sort.h>
unsigned int __read_mostly managed_irqs_per_node;

static struct cpumask managed_irqs_free_cpumsk[MAX_NUMNODES] __cacheline_aligned_in_smp = {
	[0 ... MAX_NUMNODES-1] = {CPU_BITS_ALL}
};

static int __init irq_managed_setup(char *str)
{
	int ret;

	ret = kstrtouint(str, 10, &managed_irqs_per_node);
	if (ret < 0) {
		pr_warn("managed_irqs_per_node= cannot be parsed, ignored\n");
		return 0;
	}

	if (managed_irqs_per_node * num_possible_nodes() > num_possible_cpus()) {
		managed_irqs_per_node = num_possible_cpus() / num_possible_nodes();
		pr_warn("managed_irqs_per_node= cannot be larger than %u\n",
			managed_irqs_per_node);
	}

	return 1;
}
__setup("managed_irqs_per_node=", irq_managed_setup);
static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
				unsigned int cpus_per_vec)
{
@ -244,6 +269,39 @@ static void alloc_nodes_vectors(unsigned int numvecs,
	}
}
static void __irq_prepare_affinity_mask(struct cpumask *premask,
					cpumask_var_t *node_to_cpumask)
{
	nodemask_t nodemsk = NODE_MASK_NONE;
	unsigned int ncpus, n;

	get_nodes_in_cpumask(node_to_cpumask, cpu_present_mask, &nodemsk);

	for_each_node_mask(n, nodemsk) {
		/*
		 * Try to reserve managed_irqs_per_node CPU bits on each NUMA
		 * node. If not enough free bits are left on a node, its free
		 * mask is reset to the node's full cpumask for the next
		 * allocation and the node is skipped this time. This design
		 * stays lockless and favors load balancing over exact
		 * accounting.
		 */
		cpumask_and(&managed_irqs_free_cpumsk[n],
			    &managed_irqs_free_cpumsk[n], cpu_present_mask);
		cpumask_and(&managed_irqs_free_cpumsk[n],
			    &managed_irqs_free_cpumsk[n], node_to_cpumask[n]);

		ncpus = cpumask_weight(&managed_irqs_free_cpumsk[n]);
		if (ncpus < managed_irqs_per_node) {
			/* Reset to the full cpumask of node n */
			cpumask_copy(&managed_irqs_free_cpumsk[n], node_to_cpumask[n]);
			continue;
		}

		irq_spread_init_one(premask, &managed_irqs_free_cpumsk[n],
				    managed_irqs_per_node);
	}
}
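(Not part of the patch; editorial illustration.) A minimal userspace sketch of
how the per-node free mask above is consumed by successive devices and
replenished once fewer than managed_irqs_per_node bits remain. Cpumasks are
modeled as plain bit masks, with one hypothetical 8-CPU node and
managed_irqs_per_node=4:

#include <stdio.h>

#define NODE_CPUS 0xffu	/* hypothetical node owning CPUs 0-7 */
#define PER_NODE  4u	/* managed_irqs_per_node */

/* Take `count` CPUs out of the free mask, the way irq_spread_init_one()
 * clears the CPUs it hands out. */
static unsigned int take_cpus(unsigned int *free, unsigned int count)
{
	unsigned int picked = 0;

	while (count-- && *free) {
		unsigned int bit = *free & -*free;	/* lowest set bit */

		*free &= ~bit;
		picked |= bit;
	}
	return picked;
}

int main(void)
{
	unsigned int free = NODE_CPUS;

	for (int dev = 0; dev < 4; dev++) {
		if (__builtin_popcount(free) < (int)PER_NODE) {
			/* Too few bits left: replenish for the next device and
			 * contribute nothing this round (cpumask_copy() +
			 * continue in the kernel code above). */
			free = NODE_CPUS;
			printf("device %d: node skipped, free mask replenished\n", dev);
			continue;
		}
		printf("device %d spreads over mask 0x%02x\n",
		       dev, take_cpus(&free, PER_NODE));
	}
	return 0;
}

Because the free mask is only replenished and never locked, concurrent
allocations may pick overlapping CPUs; as the comment in
__irq_prepare_affinity_mask() notes, the design trades exact accounting for
lockless operation and rough load balancing.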
static int __irq_build_affinity_masks(unsigned int startvec,
				      unsigned int numvecs,
				      unsigned int firstvec,
@ -359,9 +417,14 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
	get_online_cpus();
	build_node_to_cpumask(node_to_cpumask);

	/* Limit the number of managed interrupts on each node */
	if (masks[startvec].is_managed && managed_irqs_per_node)
		__irq_prepare_affinity_mask(npresmsk, node_to_cpumask);

	/* Spread on present CPUs starting from affd->pre_vectors */
	ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
-				node_to_cpumask, cpu_present_mask,
+				node_to_cpumask,
+				cpumask_empty(npresmsk) ? cpu_present_mask : npresmsk,
				nmsk, masks);
	if (ret < 0)
		goto fail_build_affinity;
@ -455,6 +518,10 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
		cpumask_copy(&masks[curvec].mask, irq_default_affinity);

+	/* Mark the managed interrupts */
+	for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
+		masks[i].is_managed = 1;
+
	/*
	 * Spread on present CPUs starting from affd->pre_vectors. If we
	 * have multiple sets, build each sets affinity mask separately.
@ -481,10 +548,6 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
	for (; curvec < nvecs; curvec++)
		cpumask_copy(&masks[curvec].mask, irq_default_affinity);

-	/* Mark the managed interrupts */
-	for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
-		masks[i].is_managed = 1;
-
	return masks;
}