IOMMU Fixes for Linux v5.4-rc1

A couple of fixes for the AMD IOMMU driver have piled up:
 
 	* Some fixes for the reworked IO page-table, which caused memory
 	  leaks or did not allow downgrading mappings under some
 	  conditions.
 
 	* Locking fixes for a couple of possible races around
 	  accessing 'struct protection_domain'. The races were introduced
 	  when the dma-ops path became lock-less in the fast-path.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEr9jSbILcajRFYWYyK/BELZcBGuMFAl2PrpoACgkQK/BELZcB
 GuNo6A/9EpxNUllqaPLvGYJYPN1ye2kx9QOCYZW6vo+at10X9ywf69IqYtjP9cSe
 x5uWUy0BFjBhqHvMvQ+9m6begFsue/+csUZDmeP+KvBHwNxUOxFS/fb4P0WlmmNF
 /zzsjQbt+r1FRIdYodH2CvBJKyuxNxou0W1aARvs9iggoXVG5Es+WG9+kwnixBE+
 WB1gpuX0zKWlu31z2+i+JrVtdjMqoupfR/T40C4OsMD3NjfNi0bkCqmnqJ3CpNh9
 RWPmNlnd29imPhMYQonZcUFD6Ru4NOUCfEFCjHEK/nk9kSHMYjgkKFgOzvA8h1xG
 Nkzd0dRw39UMNYzKDGHHaE/xXRJV+kOFxZBcABnxfx2r+9EgXBD36AUOsfpeOdVi
 9ab75ok7Ly+tkCgdK7sEeuDD0HJiZkUYT7BqMTdBOt64BK/GtRvepF1Zv15hG6Xn
 imlAfyE4q+avTAJkrXeIu6IgdvF4XvorsIdeF5dKjCBTdTkj8DLXq/gejAo0g1NO
 shOz9E2lde1IdeT+U580nZy9JmkKDFjyeG4QkwSz7Oln/gHIFQS1K8A4i30kGiok
 vMsJzBidtUuqRWupwymtobCAggZE86O2XLOwnxolarJAFOqg5V2j7fSyL+XxXUDC
 r85Ve/jtAhMho5594X72CumoNzzr0bDyCcGerzvT0wBRXcKLIsw=
 =xajX
 -----END PGP SIGNATURE-----

Merge tag 'iommu-fixes-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu

Pull iommu fixes from Joerg Roedel:
 "A couple of fixes for the AMD IOMMU driver have piled up:

   - Some fixes for the reworked IO page-table, which caused memory
     leaks or did not allow downgrading mappings under some conditions.

   - Locking fixes for a couple of possible races around accessing
     'struct protection_domain'. The races were introduced when the
     dma-ops path became lock-less in the fast-path"

* tag 'iommu-fixes-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu:
  iommu/amd: Lock code paths traversing protection_domain->dev_list
  iommu/amd: Lock dev_data in attach/detach code paths
  iommu/amd: Check for busy devices earlier in attach_device()
  iommu/amd: Take domain->lock for complete attach/detach path
  iommu/amd: Remove amd_iommu_devtable_lock
  iommu/amd: Remove domain->updated
  iommu/amd: Wait for completion of IOTLB flush in attach_device
  iommu/amd: Unmap all L7 PTEs when downgrading page-sizes
  iommu/amd: Introduce first_pte_l7() helper
  iommu/amd: Fix downgrading default page-sizes in alloc_pte()
  iommu/amd: Fix pages leak in free_pagetable()
Linus Torvalds 2019-09-29 10:00:14 -07:00
commit 4d2af08ed0
2 changed files with 141 additions and 96 deletions
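For the locking fixes, the series removes the global amd_iommu_devtable_lock and instead takes the per-domain lock plus a new per-device lock, in that order, with the busy check done only once both are held (see attach_device()/detach_device() in the diff below). A minimal user-space sketch of that ordering follows, with pthread mutexes standing in for the kernel spinlocks; the structures only mirror the kernel ones and everything beyond the attach/detach skeleton is omitted.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

/* Mirrors only the fields the sketch needs. */
struct protection_domain {
	pthread_mutex_t lock;		/* stands in for the kernel spinlock */
	unsigned int dev_cnt;
};

struct iommu_dev_data {
	pthread_mutex_t lock;		/* the new per-device lock from the series */
	struct protection_domain *domain;
};

static int attach_device(struct iommu_dev_data *dev_data,
			 struct protection_domain *domain)
{
	int ret = 0;

	/* domain lock first, then the per-device lock */
	pthread_mutex_lock(&domain->lock);
	pthread_mutex_lock(&dev_data->lock);

	/* busy check done early, with both locks held */
	if (dev_data->domain) {
		ret = -EBUSY;
		goto out;
	}

	dev_data->domain = domain;
	domain->dev_cnt++;
out:
	pthread_mutex_unlock(&dev_data->lock);
	pthread_mutex_unlock(&domain->lock);
	return ret;
}

static void detach_device(struct iommu_dev_data *dev_data)
{
	struct protection_domain *domain = dev_data->domain;

	pthread_mutex_lock(&domain->lock);
	pthread_mutex_lock(&dev_data->lock);

	if (dev_data->domain) {
		dev_data->domain = NULL;
		domain->dev_cnt--;
	}

	pthread_mutex_unlock(&dev_data->lock);
	pthread_mutex_unlock(&domain->lock);
}

int main(void)
{
	struct protection_domain dom = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct iommu_dev_data dev = { .lock = PTHREAD_MUTEX_INITIALIZER };
	int ret;

	ret = attach_device(&dev, &dom);
	printf("first attach: %d, dev_cnt=%u\n", ret, dom.dev_cnt);

	ret = attach_device(&dev, &dom);
	printf("second attach: %d (expect -EBUSY)\n", ret);

	detach_device(&dev);
	printf("after detach: dev_cnt=%u\n", dom.dev_cnt);
	return 0;
}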

drivers/iommu/amd_iommu.c

@ -70,7 +70,6 @@
*/
#define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38))
static DEFINE_SPINLOCK(amd_iommu_devtable_lock);
static DEFINE_SPINLOCK(pd_bitmap_lock);
/* List of all available dev_data structures */
@ -202,6 +201,7 @@ static struct iommu_dev_data *alloc_dev_data(u16 devid)
if (!dev_data)
return NULL;
spin_lock_init(&dev_data->lock);
dev_data->devid = devid;
ratelimit_default_init(&dev_data->rs);
@ -501,6 +501,29 @@ static void iommu_uninit_device(struct device *dev)
*/
}
/*
* Helper function to get the first pte of a large mapping
*/
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
unsigned long *count)
{
unsigned long pte_mask, pg_size, cnt;
u64 *fpte;
pg_size = PTE_PAGE_SIZE(*pte);
cnt = PAGE_SIZE_PTE_COUNT(pg_size);
pte_mask = ~((cnt << 3) - 1);
fpte = (u64 *)(((unsigned long)pte) & pte_mask);
if (page_size)
*page_size = pg_size;
if (count)
*count = cnt;
return fpte;
}
/****************************************************************************
*
* Interrupt handling functions
@ -1311,8 +1334,12 @@ static void domain_flush_np_cache(struct protection_domain *domain,
dma_addr_t iova, size_t size)
{
if (unlikely(amd_iommu_np_cache)) {
unsigned long flags;
spin_lock_irqsave(&domain->lock, flags);
domain_flush_pages(domain, iova, size);
domain_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
}
}
@ -1425,7 +1452,7 @@ static void free_pagetable(struct protection_domain *domain)
BUG_ON(domain->mode < PAGE_MODE_NONE ||
domain->mode > PAGE_MODE_6_LEVEL);
free_sub_pt(root, domain->mode, freelist);
freelist = free_sub_pt(root, domain->mode, freelist);
free_page_list(freelist);
}
@ -1435,10 +1462,11 @@ static void free_pagetable(struct protection_domain *domain)
* another level increases the size of the address space by 9 bits to a size up
* to 64 bits.
*/
static void increase_address_space(struct protection_domain *domain,
static bool increase_address_space(struct protection_domain *domain,
gfp_t gfp)
{
unsigned long flags;
bool ret = false;
u64 *pte;
spin_lock_irqsave(&domain->lock, flags);
@ -1455,19 +1483,21 @@ static void increase_address_space(struct protection_domain *domain,
iommu_virt_to_phys(domain->pt_root));
domain->pt_root = pte;
domain->mode += 1;
domain->updated = true;
ret = true;
out:
spin_unlock_irqrestore(&domain->lock, flags);
return;
return ret;
}
static u64 *alloc_pte(struct protection_domain *domain,
unsigned long address,
unsigned long page_size,
u64 **pte_page,
gfp_t gfp)
gfp_t gfp,
bool *updated)
{
int level, end_lvl;
u64 *pte, *page;
@ -1475,7 +1505,7 @@ static u64 *alloc_pte(struct protection_domain *domain,
BUG_ON(!is_power_of_2(page_size));
while (address > PM_LEVEL_SIZE(domain->mode))
increase_address_space(domain, gfp);
*updated = increase_address_space(domain, gfp) || *updated;
level = domain->mode - 1;
pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
@ -1489,9 +1519,32 @@ static u64 *alloc_pte(struct protection_domain *domain,
__pte = *pte;
pte_level = PM_PTE_LEVEL(__pte);
if (!IOMMU_PTE_PRESENT(__pte) ||
/*
* If we replace a series of large PTEs, we need
* to tear down all of them.
*/
if (IOMMU_PTE_PRESENT(__pte) &&
pte_level == PAGE_MODE_7_LEVEL) {
unsigned long count, i;
u64 *lpte;
lpte = first_pte_l7(pte, NULL, &count);
/*
* Unmap the replicated PTEs that still match the
* original large mapping
*/
for (i = 0; i < count; ++i)
cmpxchg64(&lpte[i], __pte, 0ULL);
*updated = true;
continue;
}
if (!IOMMU_PTE_PRESENT(__pte) ||
pte_level == PAGE_MODE_NONE) {
page = (u64 *)get_zeroed_page(gfp);
if (!page)
return NULL;
@ -1500,8 +1553,8 @@ static u64 *alloc_pte(struct protection_domain *domain,
/* pte could have been changed somewhere. */
if (cmpxchg64(pte, __pte, __npte) != __pte)
free_page((unsigned long)page);
else if (pte_level == PAGE_MODE_7_LEVEL)
domain->updated = true;
else if (IOMMU_PTE_PRESENT(__pte))
*updated = true;
continue;
}
@ -1566,17 +1619,12 @@ static u64 *fetch_pte(struct protection_domain *domain,
*page_size = PTE_LEVEL_PAGE_SIZE(level);
}
if (PM_PTE_LEVEL(*pte) == 0x07) {
unsigned long pte_mask;
/*
* If we have a series of large PTEs, make
* sure to return a pointer to the first one.
*/
*page_size = pte_mask = PTE_PAGE_SIZE(*pte);
pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
pte = (u64 *)(((unsigned long)pte) & pte_mask);
}
/*
* If we have a series of large PTEs, make
* sure to return a pointer to the first one.
*/
if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
pte = first_pte_l7(pte, page_size, NULL);
return pte;
}
@ -1615,26 +1663,29 @@ static int iommu_map_page(struct protection_domain *dom,
gfp_t gfp)
{
struct page *freelist = NULL;
bool updated = false;
u64 __pte, *pte;
int i, count;
int ret, i, count;
BUG_ON(!IS_ALIGNED(bus_addr, page_size));
BUG_ON(!IS_ALIGNED(phys_addr, page_size));
ret = -EINVAL;
if (!(prot & IOMMU_PROT_MASK))
return -EINVAL;
goto out;
count = PAGE_SIZE_PTE_COUNT(page_size);
pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp);
pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp, &updated);
ret = -ENOMEM;
if (!pte)
return -ENOMEM;
goto out;
for (i = 0; i < count; ++i)
freelist = free_clear_pte(&pte[i], pte[i], freelist);
if (freelist != NULL)
dom->updated = true;
updated = true;
if (count > 1) {
__pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
@ -1650,12 +1701,21 @@ static int iommu_map_page(struct protection_domain *dom,
for (i = 0; i < count; ++i)
pte[i] = __pte;
update_domain(dom);
ret = 0;
out:
if (updated) {
unsigned long flags;
spin_lock_irqsave(&dom->lock, flags);
update_domain(dom);
spin_unlock_irqrestore(&dom->lock, flags);
}
/* Everything flushed out, free pages now */
free_page_list(freelist);
return 0;
return ret;
}
static unsigned long iommu_unmap_page(struct protection_domain *dom,
@ -1806,8 +1866,12 @@ static void free_gcr3_table(struct protection_domain *domain)
static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom)
{
unsigned long flags;
spin_lock_irqsave(&dom->domain.lock, flags);
domain_flush_tlb(&dom->domain);
domain_flush_complete(&dom->domain);
spin_unlock_irqrestore(&dom->domain.lock, flags);
}
static void iova_domain_flush_tlb(struct iova_domain *iovad)
@ -2022,36 +2086,6 @@ static void do_detach(struct iommu_dev_data *dev_data)
domain->dev_cnt -= 1;
}
/*
* If a device is not yet associated with a domain, this function makes the
* device visible in the domain
*/
static int __attach_device(struct iommu_dev_data *dev_data,
struct protection_domain *domain)
{
int ret;
/* lock domain */
spin_lock(&domain->lock);
ret = -EBUSY;
if (dev_data->domain != NULL)
goto out_unlock;
/* Attach alias group root */
do_attach(dev_data, domain);
ret = 0;
out_unlock:
/* ready */
spin_unlock(&domain->lock);
return ret;
}
static void pdev_iommuv2_disable(struct pci_dev *pdev)
{
pci_disable_ats(pdev);
@ -2133,19 +2167,28 @@ static int attach_device(struct device *dev,
unsigned long flags;
int ret;
spin_lock_irqsave(&domain->lock, flags);
dev_data = get_dev_data(dev);
spin_lock(&dev_data->lock);
ret = -EBUSY;
if (dev_data->domain != NULL)
goto out;
if (!dev_is_pci(dev))
goto skip_ats_check;
pdev = to_pci_dev(dev);
if (domain->flags & PD_IOMMUV2_MASK) {
ret = -EINVAL;
if (!dev_data->passthrough)
return -EINVAL;
goto out;
if (dev_data->iommu_v2) {
if (pdev_iommuv2_enable(pdev) != 0)
return -EINVAL;
goto out;
dev_data->ats.enabled = true;
dev_data->ats.qdep = pci_ats_queue_depth(pdev);
@ -2158,9 +2201,9 @@ static int attach_device(struct device *dev,
}
skip_ats_check:
spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
ret = __attach_device(dev_data, domain);
spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
ret = 0;
do_attach(dev_data, domain);
/*
* We might boot into a crash-kernel here. The crashed kernel
@ -2169,25 +2212,16 @@ static int attach_device(struct device *dev,
*/
domain_flush_tlb_pde(domain);
domain_flush_complete(domain);
out:
spin_unlock(&dev_data->lock);
spin_unlock_irqrestore(&domain->lock, flags);
return ret;
}
/*
* Removes a device from a protection domain (unlocked)
*/
static void __detach_device(struct iommu_dev_data *dev_data)
{
struct protection_domain *domain;
domain = dev_data->domain;
spin_lock(&domain->lock);
do_detach(dev_data);
spin_unlock(&domain->lock);
}
/*
* Removes a device from a protection domain (with devtable_lock held)
*/
@ -2200,6 +2234,10 @@ static void detach_device(struct device *dev)
dev_data = get_dev_data(dev);
domain = dev_data->domain;
spin_lock_irqsave(&domain->lock, flags);
spin_lock(&dev_data->lock);
/*
* First check if the device is still attached. It might already
* be detached from its domain because the generic
@ -2207,15 +2245,12 @@ static void detach_device(struct device *dev)
* our alias handling.
*/
if (WARN_ON(!dev_data->domain))
return;
goto out;
/* lock device table */
spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
__detach_device(dev_data);
spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
do_detach(dev_data);
if (!dev_is_pci(dev))
return;
goto out;
if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
pdev_iommuv2_disable(to_pci_dev(dev));
@ -2223,6 +2258,11 @@ static void detach_device(struct device *dev)
pci_disable_ats(to_pci_dev(dev));
dev_data->ats.enabled = false;
out:
spin_unlock(&dev_data->lock);
spin_unlock_irqrestore(&domain->lock, flags);
}
static int amd_iommu_add_device(struct device *dev)
@ -2354,15 +2394,10 @@ static void update_device_table(struct protection_domain *domain)
static void update_domain(struct protection_domain *domain)
{
if (!domain->updated)
return;
update_device_table(domain);
domain_flush_devices(domain);
domain_flush_tlb_pde(domain);
domain->updated = false;
}
static int dir2prot(enum dma_data_direction direction)
@ -2392,6 +2427,7 @@ static dma_addr_t __map_single(struct device *dev,
{
dma_addr_t offset = paddr & ~PAGE_MASK;
dma_addr_t address, start, ret;
unsigned long flags;
unsigned int pages;
int prot = 0;
int i;
@ -2429,8 +2465,10 @@ static dma_addr_t __map_single(struct device *dev,
iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE);
}
spin_lock_irqsave(&dma_dom->domain.lock, flags);
domain_flush_tlb(&dma_dom->domain);
domain_flush_complete(&dma_dom->domain);
spin_unlock_irqrestore(&dma_dom->domain.lock, flags);
dma_ops_free_iova(dma_dom, address, pages);
@ -2459,8 +2497,12 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
}
if (amd_iommu_unmap_flush) {
unsigned long flags;
spin_lock_irqsave(&dma_dom->domain.lock, flags);
domain_flush_tlb(&dma_dom->domain);
domain_flush_complete(&dma_dom->domain);
spin_unlock_irqrestore(&dma_dom->domain.lock, flags);
dma_ops_free_iova(dma_dom, dma_addr, pages);
} else {
pages = __roundup_pow_of_two(pages);
@ -2866,16 +2908,16 @@ static void cleanup_domain(struct protection_domain *domain)
struct iommu_dev_data *entry;
unsigned long flags;
spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
spin_lock_irqsave(&domain->lock, flags);
while (!list_empty(&domain->dev_list)) {
entry = list_first_entry(&domain->dev_list,
struct iommu_dev_data, list);
BUG_ON(!entry->domain);
__detach_device(entry);
do_detach(entry);
}
spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
spin_unlock_irqrestore(&domain->lock, flags);
}
static void protection_domain_free(struct protection_domain *domain)
@ -3226,9 +3268,12 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
{
struct protection_domain *dom = to_pdomain(domain);
unsigned long flags;
spin_lock_irqsave(&dom->lock, flags);
domain_flush_tlb_pde(dom);
domain_flush_complete(dom);
spin_unlock_irqrestore(&dom->lock, flags);
}
static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
@ -3290,7 +3335,6 @@ void amd_iommu_domain_direct_map(struct iommu_domain *dom)
/* Update data structure */
domain->mode = PAGE_MODE_NONE;
domain->updated = true;
/* Make changes visible to IOMMUs */
update_domain(domain);
@ -3336,7 +3380,6 @@ int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
domain->glx = levels;
domain->flags |= PD_IOMMUV2_MASK;
domain->updated = true;
update_domain(domain);

drivers/iommu/amd_iommu_types.h

@ -475,7 +475,6 @@ struct protection_domain {
int glx; /* Number of levels for GCR3 table */
u64 *gcr3_tbl; /* Guest CR3 table */
unsigned long flags; /* flags to find out type of domain */
bool updated; /* complete domain flush required */
unsigned dev_cnt; /* devices assigned to this domain */
unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
};
@ -634,6 +633,9 @@ struct devid_map {
* This struct contains device specific data for the IOMMU
*/
struct iommu_dev_data {
/* Protect against attach/detach races */
spinlock_t lock;
struct list_head list; /* For domain->dev_list */
struct llist_node dev_data_list; /* For global dev_data_list */
struct protection_domain *domain; /* Domain the device is bound to */