2014-10-27 16:12:00 +08:00
|
|
|
/*
|
|
|
|
* Local APIC related interfaces to support IOAPIC, MSI, HT_IRQ etc.
|
|
|
|
*
|
|
|
|
* Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
|
|
|
|
* Moved from arch/x86/kernel/apic/io_apic.c.
|
2015-04-13 14:11:24 +08:00
|
|
|
* Jiang Liu <jiang.liu@linux.intel.com>
|
|
|
|
* Enable support of hierarchical irqdomains
|
2014-10-27 16:12:00 +08:00
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*/
|
|
|
|
#include <linux/interrupt.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <linux/slab.h>
|
2015-04-14 10:30:09 +08:00
|
|
|
#include <asm/irqdomain.h>
|
2014-10-27 16:12:00 +08:00
|
|
|
#include <asm/hw_irq.h>
|
|
|
|
#include <asm/apic.h>
|
|
|
|
#include <asm/i8259.h>
|
|
|
|
#include <asm/desc.h>
|
|
|
|
#include <asm/irq_remapping.h>
|
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
struct apic_chip_data {
|
|
|
|
struct irq_cfg cfg;
|
|
|
|
cpumask_var_t domain;
|
|
|
|
cpumask_var_t old_domain;
|
|
|
|
u8 move_in_progress : 1;
|
|
|
|
};
|
|
|
|
|
2015-04-13 14:11:24 +08:00
|
|
|
struct irq_domain *x86_vector_domain;
|
2015-12-11 01:52:59 +08:00
|
|
|
EXPORT_SYMBOL_GPL(x86_vector_domain);
|
2014-10-27 16:12:00 +08:00
|
|
|
static DEFINE_RAW_SPINLOCK(vector_lock);
|
2016-01-01 00:30:48 +08:00
|
|
|
static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask;
|
2015-04-13 14:11:24 +08:00
|
|
|
static struct irq_chip lapic_controller;
|
2015-04-13 14:11:56 +08:00
|
|
|
#ifdef CONFIG_X86_IO_APIC
|
2015-04-14 10:30:03 +08:00
|
|
|
static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
|
2015-04-13 14:11:56 +08:00
|
|
|
#endif
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
void lock_vector_lock(void)
|
|
|
|
{
|
|
|
|
/* Used to the online set of cpus does not change
|
|
|
|
* during assign_irq_vector.
|
|
|
|
*/
|
|
|
|
raw_spin_lock(&vector_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
void unlock_vector_lock(void)
|
|
|
|
{
|
|
|
|
raw_spin_unlock(&vector_lock);
|
|
|
|
}
|
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data)
|
2014-10-27 16:12:00 +08:00
|
|
|
{
|
2015-04-13 14:11:24 +08:00
|
|
|
if (!irq_data)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
while (irq_data->parent_data)
|
|
|
|
irq_data = irq_data->parent_data;
|
|
|
|
|
2014-10-27 16:12:00 +08:00
|
|
|
return irq_data->chip_data;
|
|
|
|
}
|
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
|
|
|
|
{
|
|
|
|
struct apic_chip_data *data = apic_chip_data(irq_data);
|
|
|
|
|
|
|
|
return data ? &data->cfg : NULL;
|
|
|
|
}
|
2015-12-11 01:52:59 +08:00
|
|
|
EXPORT_SYMBOL_GPL(irqd_cfg);
|
2015-04-14 10:30:03 +08:00
|
|
|
|
|
|
|
struct irq_cfg *irq_cfg(unsigned int irq)
|
2014-10-27 16:12:00 +08:00
|
|
|
{
|
2015-04-14 10:30:03 +08:00
|
|
|
return irqd_cfg(irq_get_irq_data(irq));
|
|
|
|
}
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
static struct apic_chip_data *alloc_apic_chip_data(int node)
|
|
|
|
{
|
|
|
|
struct apic_chip_data *data;
|
|
|
|
|
|
|
|
data = kzalloc_node(sizeof(*data), GFP_KERNEL, node);
|
|
|
|
if (!data)
|
2014-10-27 16:12:00 +08:00
|
|
|
return NULL;
|
2015-04-14 10:30:03 +08:00
|
|
|
if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node))
|
|
|
|
goto out_data;
|
|
|
|
if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
|
2014-10-27 16:12:00 +08:00
|
|
|
goto out_domain;
|
2015-04-14 10:30:03 +08:00
|
|
|
return data;
|
2014-10-27 16:12:00 +08:00
|
|
|
out_domain:
|
2015-04-14 10:30:03 +08:00
|
|
|
free_cpumask_var(data->domain);
|
|
|
|
out_data:
|
|
|
|
kfree(data);
|
2014-10-27 16:12:00 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
static void free_apic_chip_data(struct apic_chip_data *data)
|
2014-10-27 16:12:00 +08:00
|
|
|
{
|
2015-04-14 10:30:03 +08:00
|
|
|
if (data) {
|
|
|
|
free_cpumask_var(data->domain);
|
|
|
|
free_cpumask_var(data->old_domain);
|
|
|
|
kfree(data);
|
2015-04-13 14:11:24 +08:00
|
|
|
}
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
static int __assign_irq_vector(int irq, struct apic_chip_data *d,
|
|
|
|
const struct cpumask *mask)
|
2014-10-27 16:12:00 +08:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* NOTE! The local APIC isn't very good at handling
|
|
|
|
* multiple interrupts at the same interrupt level.
|
|
|
|
* As the interrupt level is determined by taking the
|
|
|
|
* vector number and shifting that right by 4, we
|
|
|
|
* want to spread these out a bit so that they don't
|
|
|
|
* all fall in the same interrupt level.
|
|
|
|
*
|
|
|
|
* Also, we've got to be careful not to trash gate
|
|
|
|
* 0x80, because int 0x80 is hm, kind of importantish. ;)
|
|
|
|
*/
|
|
|
|
static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
|
|
|
|
static int current_offset = VECTOR_OFFSET_START % 16;
|
2016-01-01 00:30:49 +08:00
|
|
|
int cpu, vector;
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2016-01-01 00:30:54 +08:00
|
|
|
/*
|
|
|
|
* If there is still a move in progress or the previous move has not
|
|
|
|
* been cleaned up completely, tell the caller to come back later.
|
|
|
|
*/
|
|
|
|
if (d->move_in_progress ||
|
|
|
|
cpumask_intersects(d->old_domain, cpu_online_mask))
|
2014-10-27 16:12:00 +08:00
|
|
|
return -EBUSY;
|
|
|
|
|
|
|
|
/* Only try and allocate irqs on cpus that are present */
|
2015-04-14 10:30:03 +08:00
|
|
|
cpumask_clear(d->old_domain);
|
2016-01-01 00:30:46 +08:00
|
|
|
cpumask_clear(searched_cpumask);
|
2014-10-27 16:12:00 +08:00
|
|
|
cpu = cpumask_first_and(mask, cpu_online_mask);
|
|
|
|
while (cpu < nr_cpu_ids) {
|
2016-01-01 00:30:49 +08:00
|
|
|
int new_cpu, offset;
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2016-01-01 00:30:48 +08:00
|
|
|
/* Get the possible target cpus for @mask/@cpu from the apic */
|
2015-04-14 10:30:10 +08:00
|
|
|
apic->vector_allocation_domain(cpu, vector_cpumask, mask);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2016-01-01 00:30:48 +08:00
|
|
|
/*
|
|
|
|
* Clear the offline cpus from @vector_cpumask for searching
|
|
|
|
* and verify whether the result overlaps with @mask. If true,
|
|
|
|
* then the call to apic->cpu_mask_to_apicid_and() will
|
|
|
|
* succeed as well. If not, no point in trying to find a
|
|
|
|
* vector in this mask.
|
|
|
|
*/
|
|
|
|
cpumask_and(vector_searchmask, vector_cpumask, cpu_online_mask);
|
|
|
|
if (!cpumask_intersects(vector_searchmask, mask))
|
|
|
|
goto next_cpu;
|
|
|
|
|
2015-04-14 10:30:10 +08:00
|
|
|
if (cpumask_subset(vector_cpumask, d->domain)) {
|
|
|
|
if (cpumask_equal(vector_cpumask, d->domain))
|
2016-01-01 00:30:46 +08:00
|
|
|
goto success;
|
2014-10-27 16:12:00 +08:00
|
|
|
/*
|
2016-01-01 00:30:49 +08:00
|
|
|
* Mark the cpus which are not longer in the mask for
|
|
|
|
* cleanup.
|
2014-10-27 16:12:00 +08:00
|
|
|
*/
|
2016-01-01 00:30:49 +08:00
|
|
|
cpumask_andnot(d->old_domain, d->domain, vector_cpumask);
|
|
|
|
vector = d->cfg.vector;
|
|
|
|
goto update;
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
vector = current_vector;
|
|
|
|
offset = current_offset;
|
|
|
|
next:
|
|
|
|
vector += 16;
|
|
|
|
if (vector >= first_system_vector) {
|
|
|
|
offset = (offset + 1) % 16;
|
|
|
|
vector = FIRST_EXTERNAL_VECTOR + offset;
|
|
|
|
}
|
|
|
|
|
2016-01-01 00:30:47 +08:00
|
|
|
/* If the search wrapped around, try the next cpu */
|
|
|
|
if (unlikely(current_vector == vector))
|
|
|
|
goto next_cpu;
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
if (test_bit(vector, used_vectors))
|
|
|
|
goto next;
|
|
|
|
|
2016-01-01 00:30:48 +08:00
|
|
|
for_each_cpu(new_cpu, vector_searchmask) {
|
2015-08-03 04:38:27 +08:00
|
|
|
if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
|
2014-10-27 16:12:00 +08:00
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
/* Found one! */
|
|
|
|
current_vector = vector;
|
|
|
|
current_offset = offset;
|
2016-01-01 00:30:49 +08:00
|
|
|
/* Schedule the old vector for cleanup on all cpus */
|
|
|
|
if (d->cfg.vector)
|
2015-04-14 10:30:03 +08:00
|
|
|
cpumask_copy(d->old_domain, d->domain);
|
2016-01-01 00:30:48 +08:00
|
|
|
for_each_cpu(new_cpu, vector_searchmask)
|
2015-08-03 04:38:27 +08:00
|
|
|
per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
|
2016-01-01 00:30:49 +08:00
|
|
|
goto update;
|
2016-01-01 00:30:47 +08:00
|
|
|
|
|
|
|
next_cpu:
|
|
|
|
/*
|
|
|
|
* We exclude the current @vector_cpumask from the requested
|
|
|
|
* @mask and try again with the next online cpu in the
|
|
|
|
* result. We cannot modify @mask, so we use @vector_cpumask
|
|
|
|
* as a temporary buffer here as it will be reassigned when
|
|
|
|
* calling apic->vector_allocation_domain() above.
|
|
|
|
*/
|
|
|
|
cpumask_or(searched_cpumask, searched_cpumask, vector_cpumask);
|
|
|
|
cpumask_andnot(vector_cpumask, mask, searched_cpumask);
|
|
|
|
cpu = cpumask_first_and(vector_cpumask, cpu_online_mask);
|
|
|
|
continue;
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
2016-01-01 00:30:46 +08:00
|
|
|
return -ENOSPC;
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2016-01-01 00:30:49 +08:00
|
|
|
update:
|
2016-01-01 00:30:50 +08:00
|
|
|
/*
|
|
|
|
* Exclude offline cpus from the cleanup mask and set the
|
|
|
|
* move_in_progress flag when the result is not empty.
|
|
|
|
*/
|
|
|
|
cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
|
|
|
|
d->move_in_progress = !cpumask_empty(d->old_domain);
|
x86/irq: Cure live lock in fixup_irqs()
Harry reported, that he's able to trigger a system freeze with cpu hot
unplug. The freeze turned out to be a live lock caused by recent changes in
irq_force_complete_move().
When fixup_irqs() and from there irq_force_complete_move() is called on the
dying cpu, then all other cpus are in stop machine an wait for the dying cpu
to complete the teardown. If there is a move of an interrupt pending then
irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain
mask and waits for them to clear the mask. That's obviously impossible as
those cpus are firmly stuck in stop machine with interrupts disabled.
I should have known that, but I completely overlooked it being concentrated on
the locking issues around the vectors. And the existance of the call to
__irq_complete_move() in the code, which actually sends the cleanup IPI made
it reasonable to wait for that cleanup to complete. That call was bogus even
before the recent changes as it was just a pointless distraction.
We have to look at two cases:
1) The move_in_progress flag of the interrupt is set
This means the ioapic has been updated with the new vector, but it has not
fired yet. In theory there is a race:
set_ioapic(new_vector) <-- Interrupt is raised before update is effective,
i.e. it's raised on the old vector.
So if the target cpu cannot handle that interrupt before the old vector is
cleaned up, we get a spurious interrupt and in the worst case the ioapic
irq line becomes stale, but my experiments so far have only resulted in
spurious interrupts.
But in case of cpu hotplug this should be a non issue because if the
affinity update happens right before all cpus rendevouz in stop machine,
there is no way that the interrupt can be blocked on the target cpu because
all cpus loops first with interrupts enabled in stop machine, so the old
vector is not yet cleaned up when the interrupt fires.
So the only way to run into this issue is if the delivery of the interrupt
on the apic/system bus would be delayed beyond the point where the target
cpu disables interrupts in stop machine. I doubt that it can happen, but at
least there is a theroretical chance. Virtualization might be able to
expose this, but AFAICT the IOAPIC emulation is not as stupid as the real
hardware.
I've spent quite some time over the weekend to enforce that situation,
though I was not able to trigger the delayed case.
2) The move_in_progress flag is not set and the old_domain cpu mask is not
empty.
That means, that an interrupt was delivered after the change and the
cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have
responded to it yet.
In both cases we can assume that the next interrupt will arrive on the new
vector, so we can cleanup the old vectors on the cpus in the old_domain cpu
mask.
Fixes: 98229aa36caa "x86/irq: Plug vector cleanup race"
Reported-by: Harry Junior <harryjr@outlook.fr>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Joe Lawrence <joe.lawrence@stratus.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-03-14 16:40:46 +08:00
|
|
|
d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0;
|
2016-01-01 00:30:49 +08:00
|
|
|
d->cfg.vector = vector;
|
|
|
|
cpumask_copy(d->domain, vector_cpumask);
|
2016-01-01 00:30:46 +08:00
|
|
|
success:
|
2016-01-01 00:30:48 +08:00
|
|
|
/*
|
|
|
|
* Cache destination APIC IDs into cfg->dest_apicid. This cannot fail
|
|
|
|
* as we already established, that mask & d->domain & cpu_online_mask
|
|
|
|
* is not empty.
|
|
|
|
*/
|
|
|
|
BUG_ON(apic->cpu_mask_to_apicid_and(mask, d->domain,
|
|
|
|
&d->cfg.dest_apicid));
|
|
|
|
return 0;
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
static int assign_irq_vector(int irq, struct apic_chip_data *data,
|
2015-04-14 10:30:00 +08:00
|
|
|
const struct cpumask *mask)
|
2014-10-27 16:12:00 +08:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
raw_spin_lock_irqsave(&vector_lock, flags);
|
2015-04-14 10:30:03 +08:00
|
|
|
err = __assign_irq_vector(irq, data, mask);
|
2014-10-27 16:12:00 +08:00
|
|
|
raw_spin_unlock_irqrestore(&vector_lock, flags);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2015-05-07 10:53:56 +08:00
|
|
|
static int assign_irq_vector_policy(int irq, int node,
|
|
|
|
struct apic_chip_data *data,
|
|
|
|
struct irq_alloc_info *info)
|
|
|
|
{
|
|
|
|
if (info && info->mask)
|
|
|
|
return assign_irq_vector(irq, data, info->mask);
|
|
|
|
if (node != NUMA_NO_NODE &&
|
|
|
|
assign_irq_vector(irq, data, cpumask_of_node(node)) == 0)
|
|
|
|
return 0;
|
|
|
|
return assign_irq_vector(irq, data, apic->target_cpus());
|
|
|
|
}
|
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
static void clear_irq_vector(int irq, struct apic_chip_data *data)
|
2014-10-27 16:12:00 +08:00
|
|
|
{
|
2015-08-03 04:38:27 +08:00
|
|
|
struct irq_desc *desc;
|
|
|
|
int cpu, vector;
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
BUG_ON(!data->cfg.vector);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
vector = data->cfg.vector;
|
|
|
|
for_each_cpu_and(cpu, data->domain, cpu_online_mask)
|
2015-08-03 04:38:25 +08:00
|
|
|
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
data->cfg.vector = 0;
|
|
|
|
cpumask_clear(data->domain);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2016-01-01 00:30:54 +08:00
|
|
|
/*
|
|
|
|
* If move is in progress or the old_domain mask is not empty,
|
|
|
|
* i.e. the cleanup IPI has not been processed yet, we need to remove
|
|
|
|
* the old references to desc from all cpus vector tables.
|
|
|
|
*/
|
|
|
|
if (!data->move_in_progress && cpumask_empty(data->old_domain))
|
2014-10-27 16:12:00 +08:00
|
|
|
return;
|
|
|
|
|
2015-08-03 04:38:27 +08:00
|
|
|
desc = irq_to_desc(irq);
|
2015-04-14 10:30:03 +08:00
|
|
|
for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
|
2014-10-27 16:12:00 +08:00
|
|
|
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
|
|
|
|
vector++) {
|
2015-08-03 04:38:27 +08:00
|
|
|
if (per_cpu(vector_irq, cpu)[vector] != desc)
|
2014-10-27 16:12:00 +08:00
|
|
|
continue;
|
2015-08-03 04:38:25 +08:00
|
|
|
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
|
2014-10-27 16:12:00 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2015-04-14 10:30:03 +08:00
|
|
|
data->move_in_progress = 0;
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
|
2015-04-13 14:11:24 +08:00
|
|
|
void init_irq_alloc_info(struct irq_alloc_info *info,
|
|
|
|
const struct cpumask *mask)
|
|
|
|
{
|
|
|
|
memset(info, 0, sizeof(*info));
|
|
|
|
info->mask = mask;
|
|
|
|
}
|
|
|
|
|
|
|
|
void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
|
|
|
|
{
|
|
|
|
if (src)
|
|
|
|
*dst = *src;
|
|
|
|
else
|
|
|
|
memset(dst, 0, sizeof(*dst));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void x86_vector_free_irqs(struct irq_domain *domain,
|
|
|
|
unsigned int virq, unsigned int nr_irqs)
|
|
|
|
{
|
2016-01-01 00:30:44 +08:00
|
|
|
struct apic_chip_data *apic_data;
|
2015-04-13 14:11:24 +08:00
|
|
|
struct irq_data *irq_data;
|
2016-01-01 00:30:44 +08:00
|
|
|
unsigned long flags;
|
2015-04-13 14:11:24 +08:00
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_irqs; i++) {
|
|
|
|
irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
|
|
|
|
if (irq_data && irq_data->chip_data) {
|
2016-01-01 00:30:44 +08:00
|
|
|
raw_spin_lock_irqsave(&vector_lock, flags);
|
2015-04-13 14:11:24 +08:00
|
|
|
clear_irq_vector(virq + i, irq_data->chip_data);
|
2016-01-01 00:30:44 +08:00
|
|
|
apic_data = irq_data->chip_data;
|
|
|
|
irq_domain_reset_irq_data(irq_data);
|
|
|
|
raw_spin_unlock_irqrestore(&vector_lock, flags);
|
|
|
|
free_apic_chip_data(apic_data);
|
2015-04-13 14:11:56 +08:00
|
|
|
#ifdef CONFIG_X86_IO_APIC
|
|
|
|
if (virq + i < nr_legacy_irqs())
|
2015-04-14 10:30:03 +08:00
|
|
|
legacy_irq_data[virq + i] = NULL;
|
2015-04-13 14:11:56 +08:00
|
|
|
#endif
|
2015-04-13 14:11:24 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
|
|
|
|
unsigned int nr_irqs, void *arg)
|
|
|
|
{
|
|
|
|
struct irq_alloc_info *info = arg;
|
2015-04-14 10:30:03 +08:00
|
|
|
struct apic_chip_data *data;
|
2015-04-13 14:11:24 +08:00
|
|
|
struct irq_data *irq_data;
|
2015-06-01 16:05:14 +08:00
|
|
|
int i, err, node;
|
2015-04-13 14:11:24 +08:00
|
|
|
|
|
|
|
if (disable_apic)
|
|
|
|
return -ENXIO;
|
|
|
|
|
|
|
|
/* Currently vector allocator can't guarantee contiguous allocations */
|
|
|
|
if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
|
|
|
|
return -ENOSYS;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_irqs; i++) {
|
|
|
|
irq_data = irq_domain_get_irq_data(domain, virq + i);
|
|
|
|
BUG_ON(!irq_data);
|
2015-06-01 16:05:14 +08:00
|
|
|
node = irq_data_get_node(irq_data);
|
2015-04-13 14:11:56 +08:00
|
|
|
#ifdef CONFIG_X86_IO_APIC
|
2015-04-14 10:30:03 +08:00
|
|
|
if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
|
|
|
|
data = legacy_irq_data[virq + i];
|
2015-04-13 14:11:56 +08:00
|
|
|
else
|
|
|
|
#endif
|
2015-06-01 16:05:14 +08:00
|
|
|
data = alloc_apic_chip_data(node);
|
2015-04-14 10:30:03 +08:00
|
|
|
if (!data) {
|
2015-04-13 14:11:24 +08:00
|
|
|
err = -ENOMEM;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
irq_data->chip = &lapic_controller;
|
2015-04-14 10:30:03 +08:00
|
|
|
irq_data->chip_data = data;
|
2015-04-13 14:11:24 +08:00
|
|
|
irq_data->hwirq = virq + i;
|
2015-09-02 06:20:51 +08:00
|
|
|
err = assign_irq_vector_policy(virq + i, node, data, info);
|
2015-04-13 14:11:24 +08:00
|
|
|
if (err)
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error:
|
|
|
|
x86_vector_free_irqs(domain, virq, i + 1);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2015-05-05 17:10:11 +08:00
|
|
|
static const struct irq_domain_ops x86_vector_domain_ops = {
|
|
|
|
.alloc = x86_vector_alloc_irqs,
|
|
|
|
.free = x86_vector_free_irqs,
|
2015-04-13 14:11:24 +08:00
|
|
|
};
|
|
|
|
|
2014-10-27 16:12:05 +08:00
|
|
|
int __init arch_probe_nr_irqs(void)
|
|
|
|
{
|
|
|
|
int nr;
|
|
|
|
|
|
|
|
if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
|
|
|
|
nr_irqs = NR_VECTORS * nr_cpu_ids;
|
|
|
|
|
|
|
|
nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
|
|
|
|
#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
|
|
|
|
/*
|
|
|
|
* for MSI and HT dyn irq
|
|
|
|
*/
|
|
|
|
if (gsi_top <= NR_IRQS_LEGACY)
|
|
|
|
nr += 8 * nr_cpu_ids;
|
|
|
|
else
|
|
|
|
nr += gsi_top * 16;
|
|
|
|
#endif
|
|
|
|
if (nr < nr_irqs)
|
|
|
|
nr_irqs = nr;
|
|
|
|
|
2015-11-03 17:40:14 +08:00
|
|
|
/*
|
|
|
|
* We don't know if PIC is present at this point so we need to do
|
|
|
|
* probe() to get the right number of legacy IRQs.
|
|
|
|
*/
|
|
|
|
return legacy_pic->probe();
|
2014-10-27 16:12:05 +08:00
|
|
|
}
|
|
|
|
|
2015-04-13 14:11:56 +08:00
|
|
|
#ifdef CONFIG_X86_IO_APIC
|
|
|
|
static void init_legacy_irqs(void)
|
|
|
|
{
|
|
|
|
int i, node = cpu_to_node(0);
|
2015-04-14 10:30:03 +08:00
|
|
|
struct apic_chip_data *data;
|
2015-04-13 14:11:56 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For legacy IRQ's, start with assigning irq0 to irq15 to
|
2015-05-11 22:05:09 +08:00
|
|
|
* ISA_IRQ_VECTOR(i) for all cpu's.
|
2015-04-13 14:11:56 +08:00
|
|
|
*/
|
|
|
|
for (i = 0; i < nr_legacy_irqs(); i++) {
|
2015-04-14 10:30:03 +08:00
|
|
|
data = legacy_irq_data[i] = alloc_apic_chip_data(node);
|
|
|
|
BUG_ON(!data);
|
2015-05-11 22:05:09 +08:00
|
|
|
|
|
|
|
data->cfg.vector = ISA_IRQ_VECTOR(i);
|
2015-04-14 10:30:03 +08:00
|
|
|
cpumask_setall(data->domain);
|
|
|
|
irq_set_chip_data(i, data);
|
2015-04-13 14:11:56 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
static void init_legacy_irqs(void) { }
|
|
|
|
#endif
|
|
|
|
|
2014-10-27 16:12:05 +08:00
|
|
|
int __init arch_early_irq_init(void)
|
|
|
|
{
|
2015-04-13 14:11:56 +08:00
|
|
|
init_legacy_irqs();
|
|
|
|
|
2015-04-13 14:11:24 +08:00
|
|
|
x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops,
|
|
|
|
NULL);
|
|
|
|
BUG_ON(x86_vector_domain == NULL);
|
|
|
|
irq_set_default_host(x86_vector_domain);
|
|
|
|
|
2015-04-13 14:11:35 +08:00
|
|
|
arch_init_msi_domain(x86_vector_domain);
|
2015-04-13 14:11:43 +08:00
|
|
|
arch_init_htirq_domain(x86_vector_domain);
|
2015-04-13 14:11:35 +08:00
|
|
|
|
2015-04-14 10:30:10 +08:00
|
|
|
BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
|
2016-01-01 00:30:48 +08:00
|
|
|
BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
|
2016-01-01 00:30:46 +08:00
|
|
|
BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL));
|
2015-04-14 10:30:10 +08:00
|
|
|
|
2014-10-27 16:12:05 +08:00
|
|
|
return arch_early_ioapic_init();
|
|
|
|
}
|
|
|
|
|
2015-08-03 04:38:27 +08:00
|
|
|
/* Initialize vector_irq on a new cpu */
|
2014-10-27 16:12:00 +08:00
|
|
|
static void __setup_vector_irq(int cpu)
|
|
|
|
{
|
2015-04-14 10:30:03 +08:00
|
|
|
struct apic_chip_data *data;
|
2015-08-03 04:38:27 +08:00
|
|
|
struct irq_desc *desc;
|
|
|
|
int irq, vector;
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
/* Mark the inuse vectors */
|
2015-08-03 04:38:27 +08:00
|
|
|
for_each_irq_desc(irq, desc) {
|
|
|
|
struct irq_data *idata = irq_desc_get_irq_data(desc);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2015-08-03 04:38:27 +08:00
|
|
|
data = apic_chip_data(idata);
|
|
|
|
if (!data || !cpumask_test_cpu(cpu, data->domain))
|
2014-10-27 16:12:00 +08:00
|
|
|
continue;
|
2015-04-14 10:30:03 +08:00
|
|
|
vector = data->cfg.vector;
|
2015-08-03 04:38:27 +08:00
|
|
|
per_cpu(vector_irq, cpu)[vector] = desc;
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
/* Mark the free vectors */
|
|
|
|
for (vector = 0; vector < NR_VECTORS; ++vector) {
|
2015-08-03 04:38:27 +08:00
|
|
|
desc = per_cpu(vector_irq, cpu)[vector];
|
|
|
|
if (IS_ERR_OR_NULL(desc))
|
2014-10-27 16:12:00 +08:00
|
|
|
continue;
|
|
|
|
|
2015-08-03 04:38:27 +08:00
|
|
|
data = apic_chip_data(irq_desc_get_irq_data(desc));
|
2015-04-14 10:30:03 +08:00
|
|
|
if (!cpumask_test_cpu(cpu, data->domain))
|
2015-08-03 04:38:25 +08:00
|
|
|
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-07-06 01:12:32 +08:00
|
|
|
* Setup the vector to irq mappings. Must be called with vector_lock held.
|
2014-10-27 16:12:00 +08:00
|
|
|
*/
|
|
|
|
void setup_vector_irq(int cpu)
|
|
|
|
{
|
|
|
|
int irq;
|
|
|
|
|
2015-07-06 01:12:32 +08:00
|
|
|
lockdep_assert_held(&vector_lock);
|
2014-10-27 16:12:00 +08:00
|
|
|
/*
|
|
|
|
* On most of the platforms, legacy PIC delivers the interrupts on the
|
|
|
|
* boot cpu. But there are certain platforms where PIC interrupts are
|
|
|
|
* delivered to multiple cpu's. If the legacy IRQ is handled by the
|
|
|
|
* legacy PIC, for the new cpu that is coming online, setup the static
|
|
|
|
* legacy vector to irq mapping:
|
|
|
|
*/
|
|
|
|
for (irq = 0; irq < nr_legacy_irqs(); irq++)
|
2015-08-03 04:38:27 +08:00
|
|
|
per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
__setup_vector_irq(cpu);
|
|
|
|
}
|
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
static int apic_retrigger_irq(struct irq_data *irq_data)
|
2014-10-27 16:12:00 +08:00
|
|
|
{
|
2015-04-14 10:30:03 +08:00
|
|
|
struct apic_chip_data *data = apic_chip_data(irq_data);
|
2014-10-27 16:12:00 +08:00
|
|
|
unsigned long flags;
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
raw_spin_lock_irqsave(&vector_lock, flags);
|
2015-04-14 10:30:03 +08:00
|
|
|
cpu = cpumask_first_and(data->domain, cpu_online_mask);
|
|
|
|
apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector);
|
2014-10-27 16:12:00 +08:00
|
|
|
raw_spin_unlock_irqrestore(&vector_lock, flags);
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
void apic_ack_edge(struct irq_data *data)
|
|
|
|
{
|
2014-10-27 16:12:07 +08:00
|
|
|
irq_complete_move(irqd_cfg(data));
|
2014-10-27 16:12:00 +08:00
|
|
|
irq_move_irq(data);
|
|
|
|
ack_APIC_irq();
|
|
|
|
}
|
|
|
|
|
2015-04-14 10:30:01 +08:00
|
|
|
static int apic_set_affinity(struct irq_data *irq_data,
|
|
|
|
const struct cpumask *dest, bool force)
|
2015-04-13 14:11:24 +08:00
|
|
|
{
|
2015-04-14 10:30:03 +08:00
|
|
|
struct apic_chip_data *data = irq_data->chip_data;
|
2015-04-13 14:11:24 +08:00
|
|
|
int err, irq = irq_data->irq;
|
|
|
|
|
|
|
|
if (!config_enabled(CONFIG_SMP))
|
|
|
|
return -EPERM;
|
|
|
|
|
|
|
|
if (!cpumask_intersects(dest, cpu_online_mask))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
err = assign_irq_vector(irq, data, dest);
|
2016-01-01 00:30:48 +08:00
|
|
|
return err ? err : IRQ_SET_MASK_OK;
|
2015-04-13 14:11:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct irq_chip lapic_controller = {
|
|
|
|
.irq_ack = apic_ack_edge,
|
2015-04-14 10:30:01 +08:00
|
|
|
.irq_set_affinity = apic_set_affinity,
|
2015-04-13 14:11:24 +08:00
|
|
|
.irq_retrigger = apic_retrigger_irq,
|
|
|
|
};
|
|
|
|
|
2014-10-27 16:12:00 +08:00
|
|
|
#ifdef CONFIG_SMP
|
2015-04-14 10:30:03 +08:00
|
|
|
static void __send_cleanup_vector(struct apic_chip_data *data)
|
2014-10-27 16:12:00 +08:00
|
|
|
{
|
2016-01-01 00:30:51 +08:00
|
|
|
raw_spin_lock(&vector_lock);
|
2016-01-01 00:30:52 +08:00
|
|
|
cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
|
2016-01-01 00:30:51 +08:00
|
|
|
data->move_in_progress = 0;
|
2016-01-01 00:30:52 +08:00
|
|
|
if (!cpumask_empty(data->old_domain))
|
|
|
|
apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR);
|
2016-01-01 00:30:51 +08:00
|
|
|
raw_spin_unlock(&vector_lock);
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
|
2015-04-14 10:30:02 +08:00
|
|
|
void send_cleanup_vector(struct irq_cfg *cfg)
|
|
|
|
{
|
2015-04-14 10:30:03 +08:00
|
|
|
struct apic_chip_data *data;
|
|
|
|
|
|
|
|
data = container_of(cfg, struct apic_chip_data, cfg);
|
|
|
|
if (data->move_in_progress)
|
|
|
|
__send_cleanup_vector(data);
|
2015-04-14 10:30:02 +08:00
|
|
|
}
|
|
|
|
|
2014-10-27 16:12:00 +08:00
|
|
|
asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
|
|
|
|
{
|
|
|
|
unsigned vector, me;
|
|
|
|
|
2015-05-15 21:48:25 +08:00
|
|
|
entering_ack_irq();
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2015-08-03 04:38:23 +08:00
|
|
|
/* Prevent vectors vanishing under us */
|
|
|
|
raw_spin_lock(&vector_lock);
|
|
|
|
|
2014-10-27 16:12:00 +08:00
|
|
|
me = smp_processor_id();
|
|
|
|
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
|
2015-04-14 10:30:03 +08:00
|
|
|
struct apic_chip_data *data;
|
2015-08-03 04:38:27 +08:00
|
|
|
struct irq_desc *desc;
|
|
|
|
unsigned int irr;
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2015-08-03 04:38:23 +08:00
|
|
|
retry:
|
2015-08-03 04:38:27 +08:00
|
|
|
desc = __this_cpu_read(vector_irq[vector]);
|
|
|
|
if (IS_ERR_OR_NULL(desc))
|
2014-10-27 16:12:00 +08:00
|
|
|
continue;
|
|
|
|
|
2015-08-03 04:38:23 +08:00
|
|
|
if (!raw_spin_trylock(&desc->lock)) {
|
|
|
|
raw_spin_unlock(&vector_lock);
|
|
|
|
cpu_relax();
|
|
|
|
raw_spin_lock(&vector_lock);
|
|
|
|
goto retry;
|
|
|
|
}
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2015-08-03 04:38:27 +08:00
|
|
|
data = apic_chip_data(irq_desc_get_irq_data(desc));
|
2015-04-14 10:30:03 +08:00
|
|
|
if (!data)
|
2015-08-03 04:38:23 +08:00
|
|
|
goto unlock;
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
/*
|
2016-01-01 00:30:54 +08:00
|
|
|
* Nothing to cleanup if irq migration is in progress
|
|
|
|
* or this cpu is not set in the cleanup mask.
|
2014-10-27 16:12:00 +08:00
|
|
|
*/
|
2016-01-01 00:30:54 +08:00
|
|
|
if (data->move_in_progress ||
|
|
|
|
!cpumask_test_cpu(me, data->old_domain))
|
2014-10-27 16:12:00 +08:00
|
|
|
goto unlock;
|
|
|
|
|
2016-01-01 00:30:54 +08:00
|
|
|
/*
|
|
|
|
* We have two cases to handle here:
|
|
|
|
* 1) vector is unchanged but the target mask got reduced
|
|
|
|
* 2) vector and the target mask has changed
|
|
|
|
*
|
|
|
|
* #1 is obvious, but in #2 we have two vectors with the same
|
|
|
|
* irq descriptor: the old and the new vector. So we need to
|
|
|
|
* make sure that we only cleanup the old vector. The new
|
|
|
|
* vector has the current @vector number in the config and
|
|
|
|
* this cpu is part of the target mask. We better leave that
|
|
|
|
* one alone.
|
|
|
|
*/
|
2015-04-14 10:30:03 +08:00
|
|
|
if (vector == data->cfg.vector &&
|
|
|
|
cpumask_test_cpu(me, data->domain))
|
2014-10-27 16:12:00 +08:00
|
|
|
goto unlock;
|
|
|
|
|
|
|
|
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
|
|
|
|
/*
|
|
|
|
* Check if the vector that needs to be cleanedup is
|
|
|
|
* registered at the cpu's IRR. If so, then this is not
|
|
|
|
* the best time to clean it up. Lets clean it up in the
|
|
|
|
* next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
|
|
|
|
* to myself.
|
|
|
|
*/
|
|
|
|
if (irr & (1 << (vector % 32))) {
|
|
|
|
apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
|
|
|
|
goto unlock;
|
|
|
|
}
|
2015-08-03 04:38:25 +08:00
|
|
|
__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
|
2016-01-01 00:30:54 +08:00
|
|
|
cpumask_clear_cpu(me, data->old_domain);
|
2014-10-27 16:12:00 +08:00
|
|
|
unlock:
|
|
|
|
raw_spin_unlock(&desc->lock);
|
|
|
|
}
|
|
|
|
|
2015-08-03 04:38:23 +08:00
|
|
|
raw_spin_unlock(&vector_lock);
|
|
|
|
|
2015-05-15 21:48:25 +08:00
|
|
|
exiting_irq();
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
|
|
|
|
{
|
|
|
|
unsigned me;
|
2015-04-14 10:30:03 +08:00
|
|
|
struct apic_chip_data *data;
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2015-04-14 10:30:03 +08:00
|
|
|
data = container_of(cfg, struct apic_chip_data, cfg);
|
|
|
|
if (likely(!data->move_in_progress))
|
2014-10-27 16:12:00 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
me = smp_processor_id();
|
2015-04-14 10:30:03 +08:00
|
|
|
if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain))
|
|
|
|
__send_cleanup_vector(data);
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void irq_complete_move(struct irq_cfg *cfg)
|
|
|
|
{
|
|
|
|
__irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
|
|
|
|
}
|
|
|
|
|
2016-01-01 00:30:53 +08:00
|
|
|
/*
|
x86/irq: Cure live lock in fixup_irqs()
Harry reported, that he's able to trigger a system freeze with cpu hot
unplug. The freeze turned out to be a live lock caused by recent changes in
irq_force_complete_move().
When fixup_irqs() and from there irq_force_complete_move() is called on the
dying cpu, then all other cpus are in stop machine an wait for the dying cpu
to complete the teardown. If there is a move of an interrupt pending then
irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain
mask and waits for them to clear the mask. That's obviously impossible as
those cpus are firmly stuck in stop machine with interrupts disabled.
I should have known that, but I completely overlooked it being concentrated on
the locking issues around the vectors. And the existance of the call to
__irq_complete_move() in the code, which actually sends the cleanup IPI made
it reasonable to wait for that cleanup to complete. That call was bogus even
before the recent changes as it was just a pointless distraction.
We have to look at two cases:
1) The move_in_progress flag of the interrupt is set
This means the ioapic has been updated with the new vector, but it has not
fired yet. In theory there is a race:
set_ioapic(new_vector) <-- Interrupt is raised before update is effective,
i.e. it's raised on the old vector.
So if the target cpu cannot handle that interrupt before the old vector is
cleaned up, we get a spurious interrupt and in the worst case the ioapic
irq line becomes stale, but my experiments so far have only resulted in
spurious interrupts.
But in case of cpu hotplug this should be a non issue because if the
affinity update happens right before all cpus rendevouz in stop machine,
there is no way that the interrupt can be blocked on the target cpu because
all cpus loops first with interrupts enabled in stop machine, so the old
vector is not yet cleaned up when the interrupt fires.
So the only way to run into this issue is if the delivery of the interrupt
on the apic/system bus would be delayed beyond the point where the target
cpu disables interrupts in stop machine. I doubt that it can happen, but at
least there is a theroretical chance. Virtualization might be able to
expose this, but AFAICT the IOAPIC emulation is not as stupid as the real
hardware.
I've spent quite some time over the weekend to enforce that situation,
though I was not able to trigger the delayed case.
2) The move_in_progress flag is not set and the old_domain cpu mask is not
empty.
That means, that an interrupt was delivered after the change and the
cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have
responded to it yet.
In both cases we can assume that the next interrupt will arrive on the new
vector, so we can cleanup the old vectors on the cpus in the old_domain cpu
mask.
Fixes: 98229aa36caa "x86/irq: Plug vector cleanup race"
Reported-by: Harry Junior <harryjr@outlook.fr>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Joe Lawrence <joe.lawrence@stratus.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-03-14 16:40:46 +08:00
|
|
|
* Called from fixup_irqs() with @desc->lock held and interrupts disabled.
|
2016-01-01 00:30:53 +08:00
|
|
|
*/
|
|
|
|
void irq_force_complete_move(struct irq_desc *desc)
|
2014-10-27 16:12:00 +08:00
|
|
|
{
|
2016-01-01 00:30:53 +08:00
|
|
|
struct irq_data *irqdata = irq_desc_get_irq_data(desc);
|
|
|
|
struct apic_chip_data *data = apic_chip_data(irqdata);
|
|
|
|
struct irq_cfg *cfg = data ? &data->cfg : NULL;
|
x86/irq: Cure live lock in fixup_irqs()
Harry reported, that he's able to trigger a system freeze with cpu hot
unplug. The freeze turned out to be a live lock caused by recent changes in
irq_force_complete_move().
When fixup_irqs() and from there irq_force_complete_move() is called on the
dying cpu, then all other cpus are in stop machine an wait for the dying cpu
to complete the teardown. If there is a move of an interrupt pending then
irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain
mask and waits for them to clear the mask. That's obviously impossible as
those cpus are firmly stuck in stop machine with interrupts disabled.
I should have known that, but I completely overlooked it being concentrated on
the locking issues around the vectors. And the existance of the call to
__irq_complete_move() in the code, which actually sends the cleanup IPI made
it reasonable to wait for that cleanup to complete. That call was bogus even
before the recent changes as it was just a pointless distraction.
We have to look at two cases:
1) The move_in_progress flag of the interrupt is set
This means the ioapic has been updated with the new vector, but it has not
fired yet. In theory there is a race:
set_ioapic(new_vector) <-- Interrupt is raised before update is effective,
i.e. it's raised on the old vector.
So if the target cpu cannot handle that interrupt before the old vector is
cleaned up, we get a spurious interrupt and in the worst case the ioapic
irq line becomes stale, but my experiments so far have only resulted in
spurious interrupts.
But in case of cpu hotplug this should be a non issue because if the
affinity update happens right before all cpus rendevouz in stop machine,
there is no way that the interrupt can be blocked on the target cpu because
all cpus loops first with interrupts enabled in stop machine, so the old
vector is not yet cleaned up when the interrupt fires.
So the only way to run into this issue is if the delivery of the interrupt
on the apic/system bus would be delayed beyond the point where the target
cpu disables interrupts in stop machine. I doubt that it can happen, but at
least there is a theroretical chance. Virtualization might be able to
expose this, but AFAICT the IOAPIC emulation is not as stupid as the real
hardware.
I've spent quite some time over the weekend to enforce that situation,
though I was not able to trigger the delayed case.
2) The move_in_progress flag is not set and the old_domain cpu mask is not
empty.
That means, that an interrupt was delivered after the change and the
cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have
responded to it yet.
In both cases we can assume that the next interrupt will arrive on the new
vector, so we can cleanup the old vectors on the cpus in the old_domain cpu
mask.
Fixes: 98229aa36caa "x86/irq: Plug vector cleanup race"
Reported-by: Harry Junior <harryjr@outlook.fr>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Joe Lawrence <joe.lawrence@stratus.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-03-14 16:40:46 +08:00
|
|
|
unsigned int cpu;
|
2016-01-01 00:30:52 +08:00
|
|
|
|
|
|
|
if (!cfg)
|
|
|
|
return;
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2016-01-01 00:30:52 +08:00
|
|
|
/*
|
2016-01-01 00:30:54 +08:00
|
|
|
* This is tricky. If the cleanup of @data->old_domain has not been
|
|
|
|
* done yet, then the following setaffinity call will fail with
|
|
|
|
* -EBUSY. This can leave the interrupt in a stale state.
|
|
|
|
*
|
x86/irq: Cure live lock in fixup_irqs()
Harry reported, that he's able to trigger a system freeze with cpu hot
unplug. The freeze turned out to be a live lock caused by recent changes in
irq_force_complete_move().
When fixup_irqs() and from there irq_force_complete_move() is called on the
dying cpu, then all other cpus are in stop machine an wait for the dying cpu
to complete the teardown. If there is a move of an interrupt pending then
irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain
mask and waits for them to clear the mask. That's obviously impossible as
those cpus are firmly stuck in stop machine with interrupts disabled.
I should have known that, but I completely overlooked it being concentrated on
the locking issues around the vectors. And the existance of the call to
__irq_complete_move() in the code, which actually sends the cleanup IPI made
it reasonable to wait for that cleanup to complete. That call was bogus even
before the recent changes as it was just a pointless distraction.
We have to look at two cases:
1) The move_in_progress flag of the interrupt is set
This means the ioapic has been updated with the new vector, but it has not
fired yet. In theory there is a race:
set_ioapic(new_vector) <-- Interrupt is raised before update is effective,
i.e. it's raised on the old vector.
So if the target cpu cannot handle that interrupt before the old vector is
cleaned up, we get a spurious interrupt and in the worst case the ioapic
irq line becomes stale, but my experiments so far have only resulted in
spurious interrupts.
But in case of cpu hotplug this should be a non issue because if the
affinity update happens right before all cpus rendevouz in stop machine,
there is no way that the interrupt can be blocked on the target cpu because
all cpus loops first with interrupts enabled in stop machine, so the old
vector is not yet cleaned up when the interrupt fires.
So the only way to run into this issue is if the delivery of the interrupt
on the apic/system bus would be delayed beyond the point where the target
cpu disables interrupts in stop machine. I doubt that it can happen, but at
least there is a theroretical chance. Virtualization might be able to
expose this, but AFAICT the IOAPIC emulation is not as stupid as the real
hardware.
I've spent quite some time over the weekend to enforce that situation,
though I was not able to trigger the delayed case.
2) The move_in_progress flag is not set and the old_domain cpu mask is not
empty.
That means, that an interrupt was delivered after the change and the
cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have
responded to it yet.
In both cases we can assume that the next interrupt will arrive on the new
vector, so we can cleanup the old vectors on the cpus in the old_domain cpu
mask.
Fixes: 98229aa36caa "x86/irq: Plug vector cleanup race"
Reported-by: Harry Junior <harryjr@outlook.fr>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Joe Lawrence <joe.lawrence@stratus.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-03-14 16:40:46 +08:00
|
|
|
* All CPUs are stuck in stop machine with interrupts disabled so
|
|
|
|
* calling __irq_complete_move() would be completely pointless.
|
2016-01-01 00:30:52 +08:00
|
|
|
*/
|
|
|
|
raw_spin_lock(&vector_lock);
|
x86/irq: Cure live lock in fixup_irqs()
Harry reported, that he's able to trigger a system freeze with cpu hot
unplug. The freeze turned out to be a live lock caused by recent changes in
irq_force_complete_move().
When fixup_irqs() and from there irq_force_complete_move() is called on the
dying cpu, then all other cpus are in stop machine an wait for the dying cpu
to complete the teardown. If there is a move of an interrupt pending then
irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain
mask and waits for them to clear the mask. That's obviously impossible as
those cpus are firmly stuck in stop machine with interrupts disabled.
I should have known that, but I completely overlooked it being concentrated on
the locking issues around the vectors. And the existance of the call to
__irq_complete_move() in the code, which actually sends the cleanup IPI made
it reasonable to wait for that cleanup to complete. That call was bogus even
before the recent changes as it was just a pointless distraction.
We have to look at two cases:
1) The move_in_progress flag of the interrupt is set
This means the ioapic has been updated with the new vector, but it has not
fired yet. In theory there is a race:
set_ioapic(new_vector) <-- Interrupt is raised before update is effective,
i.e. it's raised on the old vector.
So if the target cpu cannot handle that interrupt before the old vector is
cleaned up, we get a spurious interrupt and in the worst case the ioapic
irq line becomes stale, but my experiments so far have only resulted in
spurious interrupts.
But in case of cpu hotplug this should be a non issue because if the
affinity update happens right before all cpus rendevouz in stop machine,
there is no way that the interrupt can be blocked on the target cpu because
all cpus loops first with interrupts enabled in stop machine, so the old
vector is not yet cleaned up when the interrupt fires.
So the only way to run into this issue is if the delivery of the interrupt
on the apic/system bus would be delayed beyond the point where the target
cpu disables interrupts in stop machine. I doubt that it can happen, but at
least there is a theroretical chance. Virtualization might be able to
expose this, but AFAICT the IOAPIC emulation is not as stupid as the real
hardware.
I've spent quite some time over the weekend to enforce that situation,
though I was not able to trigger the delayed case.
2) The move_in_progress flag is not set and the old_domain cpu mask is not
empty.
That means, that an interrupt was delivered after the change and the
cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have
responded to it yet.
In both cases we can assume that the next interrupt will arrive on the new
vector, so we can cleanup the old vectors on the cpus in the old_domain cpu
mask.
Fixes: 98229aa36caa "x86/irq: Plug vector cleanup race"
Reported-by: Harry Junior <harryjr@outlook.fr>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Joe Lawrence <joe.lawrence@stratus.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-03-14 16:40:46 +08:00
|
|
|
/*
|
|
|
|
* Clean out all offline cpus (including the outgoing one) from the
|
|
|
|
* old_domain mask.
|
|
|
|
*/
|
2016-01-01 00:30:54 +08:00
|
|
|
cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
|
x86/irq: Cure live lock in fixup_irqs()
Harry reported, that he's able to trigger a system freeze with cpu hot
unplug. The freeze turned out to be a live lock caused by recent changes in
irq_force_complete_move().
When fixup_irqs() and from there irq_force_complete_move() is called on the
dying cpu, then all other cpus are in stop machine an wait for the dying cpu
to complete the teardown. If there is a move of an interrupt pending then
irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain
mask and waits for them to clear the mask. That's obviously impossible as
those cpus are firmly stuck in stop machine with interrupts disabled.
I should have known that, but I completely overlooked it being concentrated on
the locking issues around the vectors. And the existance of the call to
__irq_complete_move() in the code, which actually sends the cleanup IPI made
it reasonable to wait for that cleanup to complete. That call was bogus even
before the recent changes as it was just a pointless distraction.
We have to look at two cases:
1) The move_in_progress flag of the interrupt is set
This means the ioapic has been updated with the new vector, but it has not
fired yet. In theory there is a race:
set_ioapic(new_vector) <-- Interrupt is raised before update is effective,
i.e. it's raised on the old vector.
So if the target cpu cannot handle that interrupt before the old vector is
cleaned up, we get a spurious interrupt and in the worst case the ioapic
irq line becomes stale, but my experiments so far have only resulted in
spurious interrupts.
But in case of cpu hotplug this should be a non issue because if the
affinity update happens right before all cpus rendevouz in stop machine,
there is no way that the interrupt can be blocked on the target cpu because
all cpus loops first with interrupts enabled in stop machine, so the old
vector is not yet cleaned up when the interrupt fires.
So the only way to run into this issue is if the delivery of the interrupt
on the apic/system bus would be delayed beyond the point where the target
cpu disables interrupts in stop machine. I doubt that it can happen, but at
least there is a theroretical chance. Virtualization might be able to
expose this, but AFAICT the IOAPIC emulation is not as stupid as the real
hardware.
I've spent quite some time over the weekend to enforce that situation,
though I was not able to trigger the delayed case.
2) The move_in_progress flag is not set and the old_domain cpu mask is not
empty.
That means, that an interrupt was delivered after the change and the
cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have
responded to it yet.
In both cases we can assume that the next interrupt will arrive on the new
vector, so we can cleanup the old vectors on the cpus in the old_domain cpu
mask.
Fixes: 98229aa36caa "x86/irq: Plug vector cleanup race"
Reported-by: Harry Junior <harryjr@outlook.fr>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Joe Lawrence <joe.lawrence@stratus.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-03-14 16:40:46 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If move_in_progress is cleared and the old_domain mask is empty,
|
|
|
|
* then there is nothing to cleanup. fixup_irqs() will take care of
|
|
|
|
* the stale vectors on the outgoing cpu.
|
|
|
|
*/
|
|
|
|
if (!data->move_in_progress && cpumask_empty(data->old_domain)) {
|
2016-01-01 00:30:54 +08:00
|
|
|
raw_spin_unlock(&vector_lock);
|
x86/irq: Cure live lock in fixup_irqs()
Harry reported, that he's able to trigger a system freeze with cpu hot
unplug. The freeze turned out to be a live lock caused by recent changes in
irq_force_complete_move().
When fixup_irqs() and from there irq_force_complete_move() is called on the
dying cpu, then all other cpus are in stop machine an wait for the dying cpu
to complete the teardown. If there is a move of an interrupt pending then
irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain
mask and waits for them to clear the mask. That's obviously impossible as
those cpus are firmly stuck in stop machine with interrupts disabled.
I should have known that, but I completely overlooked it being concentrated on
the locking issues around the vectors. And the existance of the call to
__irq_complete_move() in the code, which actually sends the cleanup IPI made
it reasonable to wait for that cleanup to complete. That call was bogus even
before the recent changes as it was just a pointless distraction.
We have to look at two cases:
1) The move_in_progress flag of the interrupt is set
This means the ioapic has been updated with the new vector, but it has not
fired yet. In theory there is a race:
set_ioapic(new_vector) <-- Interrupt is raised before update is effective,
i.e. it's raised on the old vector.
So if the target cpu cannot handle that interrupt before the old vector is
cleaned up, we get a spurious interrupt and in the worst case the ioapic
irq line becomes stale, but my experiments so far have only resulted in
spurious interrupts.
But in case of cpu hotplug this should be a non issue because if the
affinity update happens right before all cpus rendevouz in stop machine,
there is no way that the interrupt can be blocked on the target cpu because
all cpus loops first with interrupts enabled in stop machine, so the old
vector is not yet cleaned up when the interrupt fires.
So the only way to run into this issue is if the delivery of the interrupt
on the apic/system bus would be delayed beyond the point where the target
cpu disables interrupts in stop machine. I doubt that it can happen, but at
least there is a theroretical chance. Virtualization might be able to
expose this, but AFAICT the IOAPIC emulation is not as stupid as the real
hardware.
I've spent quite some time over the weekend to enforce that situation,
though I was not able to trigger the delayed case.
2) The move_in_progress flag is not set and the old_domain cpu mask is not
empty.
That means, that an interrupt was delivered after the change and the
cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have
responded to it yet.
In both cases we can assume that the next interrupt will arrive on the new
vector, so we can cleanup the old vectors on the cpus in the old_domain cpu
mask.
Fixes: 98229aa36caa "x86/irq: Plug vector cleanup race"
Reported-by: Harry Junior <harryjr@outlook.fr>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Joe Lawrence <joe.lawrence@stratus.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-03-14 16:40:46 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 1) The interrupt is in move_in_progress state. That means that we
|
|
|
|
* have not seen an interrupt since the io_apic was reprogrammed to
|
|
|
|
* the new vector.
|
|
|
|
*
|
|
|
|
* 2) The interrupt has fired on the new vector, but the cleanup IPIs
|
|
|
|
* have not been processed yet.
|
|
|
|
*/
|
|
|
|
if (data->move_in_progress) {
|
2016-01-01 00:30:54 +08:00
|
|
|
/*
|
x86/irq: Cure live lock in fixup_irqs()
Harry reported, that he's able to trigger a system freeze with cpu hot
unplug. The freeze turned out to be a live lock caused by recent changes in
irq_force_complete_move().
When fixup_irqs() and from there irq_force_complete_move() is called on the
dying cpu, then all other cpus are in stop machine an wait for the dying cpu
to complete the teardown. If there is a move of an interrupt pending then
irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain
mask and waits for them to clear the mask. That's obviously impossible as
those cpus are firmly stuck in stop machine with interrupts disabled.
I should have known that, but I completely overlooked it being concentrated on
the locking issues around the vectors. And the existance of the call to
__irq_complete_move() in the code, which actually sends the cleanup IPI made
it reasonable to wait for that cleanup to complete. That call was bogus even
before the recent changes as it was just a pointless distraction.
We have to look at two cases:
1) The move_in_progress flag of the interrupt is set
This means the ioapic has been updated with the new vector, but it has not
fired yet. In theory there is a race:
set_ioapic(new_vector) <-- Interrupt is raised before update is effective,
i.e. it's raised on the old vector.
So if the target cpu cannot handle that interrupt before the old vector is
cleaned up, we get a spurious interrupt and in the worst case the ioapic
irq line becomes stale, but my experiments so far have only resulted in
spurious interrupts.
But in case of cpu hotplug this should be a non issue because if the
affinity update happens right before all cpus rendevouz in stop machine,
there is no way that the interrupt can be blocked on the target cpu because
all cpus loops first with interrupts enabled in stop machine, so the old
vector is not yet cleaned up when the interrupt fires.
So the only way to run into this issue is if the delivery of the interrupt
on the apic/system bus would be delayed beyond the point where the target
cpu disables interrupts in stop machine. I doubt that it can happen, but at
least there is a theroretical chance. Virtualization might be able to
expose this, but AFAICT the IOAPIC emulation is not as stupid as the real
hardware.
I've spent quite some time over the weekend to enforce that situation,
though I was not able to trigger the delayed case.
2) The move_in_progress flag is not set and the old_domain cpu mask is not
empty.
That means, that an interrupt was delivered after the change and the
cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have
responded to it yet.
In both cases we can assume that the next interrupt will arrive on the new
vector, so we can cleanup the old vectors on the cpus in the old_domain cpu
mask.
Fixes: 98229aa36caa "x86/irq: Plug vector cleanup race"
Reported-by: Harry Junior <harryjr@outlook.fr>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Joe Lawrence <joe.lawrence@stratus.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-03-14 16:40:46 +08:00
|
|
|
* In theory there is a race:
|
|
|
|
*
|
|
|
|
* set_ioapic(new_vector) <-- Interrupt is raised before update
|
|
|
|
* is effective, i.e. it's raised on
|
|
|
|
* the old vector.
|
|
|
|
*
|
|
|
|
* So if the target cpu cannot handle that interrupt before
|
|
|
|
* the old vector is cleaned up, we get a spurious interrupt
|
|
|
|
* and in the worst case the ioapic irq line becomes stale.
|
|
|
|
*
|
|
|
|
* But in case of cpu hotplug this should be a non issue
|
|
|
|
* because if the affinity update happens right before all
|
|
|
|
* cpus rendevouz in stop machine, there is no way that the
|
|
|
|
* interrupt can be blocked on the target cpu because all cpus
|
|
|
|
* loops first with interrupts enabled in stop machine, so the
|
|
|
|
* old vector is not yet cleaned up when the interrupt fires.
|
|
|
|
*
|
|
|
|
* So the only way to run into this issue is if the delivery
|
|
|
|
* of the interrupt on the apic/system bus would be delayed
|
|
|
|
* beyond the point where the target cpu disables interrupts
|
|
|
|
* in stop machine. I doubt that it can happen, but at least
|
|
|
|
* there is a theroretical chance. Virtualization might be
|
|
|
|
* able to expose this, but AFAICT the IOAPIC emulation is not
|
|
|
|
* as stupid as the real hardware.
|
|
|
|
*
|
|
|
|
* Anyway, there is nothing we can do about that at this point
|
|
|
|
* w/o refactoring the whole fixup_irq() business completely.
|
|
|
|
* We print at least the irq number and the old vector number,
|
|
|
|
* so we have the necessary information when a problem in that
|
|
|
|
* area arises.
|
2016-01-01 00:30:54 +08:00
|
|
|
*/
|
x86/irq: Cure live lock in fixup_irqs()
Harry reported, that he's able to trigger a system freeze with cpu hot
unplug. The freeze turned out to be a live lock caused by recent changes in
irq_force_complete_move().
When fixup_irqs() and from there irq_force_complete_move() is called on the
dying cpu, then all other cpus are in stop machine an wait for the dying cpu
to complete the teardown. If there is a move of an interrupt pending then
irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain
mask and waits for them to clear the mask. That's obviously impossible as
those cpus are firmly stuck in stop machine with interrupts disabled.
I should have known that, but I completely overlooked it being concentrated on
the locking issues around the vectors. And the existance of the call to
__irq_complete_move() in the code, which actually sends the cleanup IPI made
it reasonable to wait for that cleanup to complete. That call was bogus even
before the recent changes as it was just a pointless distraction.
We have to look at two cases:
1) The move_in_progress flag of the interrupt is set
This means the ioapic has been updated with the new vector, but it has not
fired yet. In theory there is a race:
set_ioapic(new_vector) <-- Interrupt is raised before update is effective,
i.e. it's raised on the old vector.
So if the target cpu cannot handle that interrupt before the old vector is
cleaned up, we get a spurious interrupt and in the worst case the ioapic
irq line becomes stale, but my experiments so far have only resulted in
spurious interrupts.
But in case of cpu hotplug this should be a non issue because if the
affinity update happens right before all cpus rendevouz in stop machine,
there is no way that the interrupt can be blocked on the target cpu because
all cpus loops first with interrupts enabled in stop machine, so the old
vector is not yet cleaned up when the interrupt fires.
So the only way to run into this issue is if the delivery of the interrupt
on the apic/system bus would be delayed beyond the point where the target
cpu disables interrupts in stop machine. I doubt that it can happen, but at
least there is a theroretical chance. Virtualization might be able to
expose this, but AFAICT the IOAPIC emulation is not as stupid as the real
hardware.
I've spent quite some time over the weekend to enforce that situation,
though I was not able to trigger the delayed case.
2) The move_in_progress flag is not set and the old_domain cpu mask is not
empty.
That means, that an interrupt was delivered after the change and the
cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have
responded to it yet.
In both cases we can assume that the next interrupt will arrive on the new
vector, so we can cleanup the old vectors on the cpus in the old_domain cpu
mask.
Fixes: 98229aa36caa "x86/irq: Plug vector cleanup race"
Reported-by: Harry Junior <harryjr@outlook.fr>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Joe Lawrence <joe.lawrence@stratus.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-03-14 16:40:46 +08:00
|
|
|
pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
|
|
|
|
irqdata->irq, cfg->old_vector);
|
2016-01-01 00:30:54 +08:00
|
|
|
}
|
x86/irq: Cure live lock in fixup_irqs()
Harry reported, that he's able to trigger a system freeze with cpu hot
unplug. The freeze turned out to be a live lock caused by recent changes in
irq_force_complete_move().
When fixup_irqs() and from there irq_force_complete_move() is called on the
dying cpu, then all other cpus are in stop machine an wait for the dying cpu
to complete the teardown. If there is a move of an interrupt pending then
irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain
mask and waits for them to clear the mask. That's obviously impossible as
those cpus are firmly stuck in stop machine with interrupts disabled.
I should have known that, but I completely overlooked it being concentrated on
the locking issues around the vectors. And the existance of the call to
__irq_complete_move() in the code, which actually sends the cleanup IPI made
it reasonable to wait for that cleanup to complete. That call was bogus even
before the recent changes as it was just a pointless distraction.
We have to look at two cases:
1) The move_in_progress flag of the interrupt is set
This means the ioapic has been updated with the new vector, but it has not
fired yet. In theory there is a race:
set_ioapic(new_vector) <-- Interrupt is raised before update is effective,
i.e. it's raised on the old vector.
So if the target cpu cannot handle that interrupt before the old vector is
cleaned up, we get a spurious interrupt and in the worst case the ioapic
irq line becomes stale, but my experiments so far have only resulted in
spurious interrupts.
But in case of cpu hotplug this should be a non issue because if the
affinity update happens right before all cpus rendevouz in stop machine,
there is no way that the interrupt can be blocked on the target cpu because
all cpus loops first with interrupts enabled in stop machine, so the old
vector is not yet cleaned up when the interrupt fires.
So the only way to run into this issue is if the delivery of the interrupt
on the apic/system bus would be delayed beyond the point where the target
cpu disables interrupts in stop machine. I doubt that it can happen, but at
least there is a theroretical chance. Virtualization might be able to
expose this, but AFAICT the IOAPIC emulation is not as stupid as the real
hardware.
I've spent quite some time over the weekend to enforce that situation,
though I was not able to trigger the delayed case.
2) The move_in_progress flag is not set and the old_domain cpu mask is not
empty.
That means, that an interrupt was delivered after the change and the
cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have
responded to it yet.
In both cases we can assume that the next interrupt will arrive on the new
vector, so we can cleanup the old vectors on the cpus in the old_domain cpu
mask.
Fixes: 98229aa36caa "x86/irq: Plug vector cleanup race"
Reported-by: Harry Junior <harryjr@outlook.fr>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Joe Lawrence <joe.lawrence@stratus.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-03-14 16:40:46 +08:00
|
|
|
/*
|
|
|
|
* If old_domain is not empty, then other cpus still have the irq
|
|
|
|
* descriptor set in their vector array. Clean it up.
|
|
|
|
*/
|
|
|
|
for_each_cpu(cpu, data->old_domain)
|
|
|
|
per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED;
|
|
|
|
|
|
|
|
/* Cleanup the left overs of the (half finished) move */
|
|
|
|
cpumask_clear(data->old_domain);
|
|
|
|
data->move_in_progress = 0;
|
2016-01-01 00:30:52 +08:00
|
|
|
raw_spin_unlock(&vector_lock);
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static void __init print_APIC_field(int base)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
printk(KERN_DEBUG);
|
|
|
|
|
|
|
|
for (i = 0; i < 8; i++)
|
|
|
|
pr_cont("%08x", apic_read(base + i*0x10));
|
|
|
|
|
|
|
|
pr_cont("\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __init print_local_APIC(void *dummy)
|
|
|
|
{
|
|
|
|
unsigned int i, v, ver, maxlvt;
|
|
|
|
u64 icr;
|
|
|
|
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("printing local APIC contents on CPU#%d/%d:\n",
|
|
|
|
smp_processor_id(), hard_smp_processor_id());
|
2014-10-27 16:12:00 +08:00
|
|
|
v = apic_read(APIC_ID);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id());
|
2014-10-27 16:12:00 +08:00
|
|
|
v = apic_read(APIC_LVR);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_info("... APIC VERSION: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
ver = GET_APIC_VERSION(v);
|
|
|
|
maxlvt = lapic_get_maxlvt();
|
|
|
|
|
|
|
|
v = apic_read(APIC_TASKPRI);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
/* !82489DX */
|
|
|
|
if (APIC_INTEGRATED(ver)) {
|
|
|
|
if (!APIC_XAPIC(ver)) {
|
|
|
|
v = apic_read(APIC_ARBPRI);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC ARBPRI: %08x (%02x)\n",
|
|
|
|
v, v & APIC_ARBPRI_MASK);
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
v = apic_read(APIC_PROCPRI);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC PROCPRI: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remote read supported only in the 82489DX and local APIC for
|
|
|
|
* Pentium processors.
|
|
|
|
*/
|
|
|
|
if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
|
|
|
|
v = apic_read(APIC_RRR);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC RRR: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
v = apic_read(APIC_LDR);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC LDR: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
if (!x2apic_enabled()) {
|
|
|
|
v = apic_read(APIC_DFR);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC DFR: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
v = apic_read(APIC_SPIV);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC SPIV: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC ISR field:\n");
|
2014-10-27 16:12:00 +08:00
|
|
|
print_APIC_field(APIC_ISR);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC TMR field:\n");
|
2014-10-27 16:12:00 +08:00
|
|
|
print_APIC_field(APIC_TMR);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC IRR field:\n");
|
2014-10-27 16:12:00 +08:00
|
|
|
print_APIC_field(APIC_IRR);
|
|
|
|
|
|
|
|
/* !82489DX */
|
|
|
|
if (APIC_INTEGRATED(ver)) {
|
|
|
|
/* Due to the Pentium erratum 3AP. */
|
|
|
|
if (maxlvt > 3)
|
|
|
|
apic_write(APIC_ESR, 0);
|
|
|
|
|
|
|
|
v = apic_read(APIC_ESR);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC ESR: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
icr = apic_icr_read();
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC ICR: %08x\n", (u32)icr);
|
|
|
|
pr_debug("... APIC ICR2: %08x\n", (u32)(icr >> 32));
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
v = apic_read(APIC_LVTT);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC LVTT: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
if (maxlvt > 3) {
|
|
|
|
/* PC is LVT#4. */
|
|
|
|
v = apic_read(APIC_LVTPC);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC LVTPC: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
v = apic_read(APIC_LVT0);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC LVT0: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
v = apic_read(APIC_LVT1);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC LVT1: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
if (maxlvt > 2) {
|
|
|
|
/* ERR is LVT#3. */
|
|
|
|
v = apic_read(APIC_LVTERR);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC LVTERR: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
v = apic_read(APIC_TMICT);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC TMICT: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
v = apic_read(APIC_TMCCT);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC TMCCT: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
v = apic_read(APIC_TDCR);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC TDCR: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
|
|
|
|
v = apic_read(APIC_EFEAT);
|
|
|
|
maxlvt = (v >> 16) & 0xff;
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC EFEAT: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
v = apic_read(APIC_ECTRL);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC ECTRL: %08x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
for (i = 0; i < maxlvt; i++) {
|
|
|
|
v = apic_read(APIC_EILVTn(i));
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... APIC EILVT%d: %08x\n", i, v);
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
pr_cont("\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __init print_local_APICs(int maxcpu)
|
|
|
|
{
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
if (!maxcpu)
|
|
|
|
return;
|
|
|
|
|
|
|
|
preempt_disable();
|
|
|
|
for_each_online_cpu(cpu) {
|
|
|
|
if (cpu >= maxcpu)
|
|
|
|
break;
|
|
|
|
smp_call_function_single(cpu, print_local_APIC, NULL, 1);
|
|
|
|
}
|
|
|
|
preempt_enable();
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __init print_PIC(void)
|
|
|
|
{
|
|
|
|
unsigned int v;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
if (!nr_legacy_irqs())
|
|
|
|
return;
|
|
|
|
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("\nprinting PIC contents\n");
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
raw_spin_lock_irqsave(&i8259A_lock, flags);
|
|
|
|
|
|
|
|
v = inb(0xa1) << 8 | inb(0x21);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... PIC IMR: %04x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
v = inb(0xa0) << 8 | inb(0x20);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... PIC IRR: %04x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
outb(0x0b, 0xa0);
|
|
|
|
outb(0x0b, 0x20);
|
|
|
|
v = inb(0xa0) << 8 | inb(0x20);
|
|
|
|
outb(0x0a, 0xa0);
|
|
|
|
outb(0x0a, 0x20);
|
|
|
|
|
|
|
|
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
|
|
|
|
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... PIC ISR: %04x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
|
|
|
|
v = inb(0x4d1) << 8 | inb(0x4d0);
|
2014-10-27 16:12:01 +08:00
|
|
|
pr_debug("... PIC ELCR: %04x\n", v);
|
2014-10-27 16:12:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int show_lapic __initdata = 1;
|
|
|
|
static __init int setup_show_lapic(char *arg)
|
|
|
|
{
|
|
|
|
int num = -1;
|
|
|
|
|
|
|
|
if (strcmp(arg, "all") == 0) {
|
|
|
|
show_lapic = CONFIG_NR_CPUS;
|
|
|
|
} else {
|
|
|
|
get_option(&arg, &num);
|
|
|
|
if (num >= 0)
|
|
|
|
show_lapic = num;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
__setup("show_lapic=", setup_show_lapic);
|
|
|
|
|
|
|
|
static int __init print_ICs(void)
|
|
|
|
{
|
|
|
|
if (apic_verbosity == APIC_QUIET)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
print_PIC();
|
|
|
|
|
|
|
|
/* don't print out if apic is not there */
|
|
|
|
if (!cpu_has_apic && !apic_from_smp_config())
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
print_local_APICs(show_lapic);
|
|
|
|
print_IO_APICs();
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
late_initcall(print_ICs);
|