From 0e78a87306a6f55b1c7bbafad1de62c3975953ca Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 3 May 2017 08:44:27 +0200 Subject: [PATCH 001/153] esp4: Fix udpencap for local TCP packets. Locally generated TCP packets are usually cloned, so we do skb_cow_data() on this packets. After that we need to reload the pointer to the esp header. On udpencap this header has an offset to skb_transport_header, so take this offset into account. Fixes: 67d349ed603 ("net/esp4: Fix invalid esph pointer crash") Fixes: fca11ebde3f0 ("esp4: Reorganize esp_output") Reported-by: Don Bowman Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 65cc02bd82bc..93322f895eab 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -248,6 +248,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * u8 *tail; u8 *vaddr; int nfrags; + int esph_offset; struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; @@ -313,11 +314,13 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } cow: + esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb); + nfrags = skb_cow_data(skb, tailen, &trailer); if (nfrags < 0) goto out; tail = skb_tail_pointer(trailer); - esp->esph = ip_esp_hdr(skb); + esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset); skip_cow: esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); From 9b3eb54106cf6acd03f07cf0ab01c13676a226c2 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 3 May 2017 16:43:19 +0200 Subject: [PATCH 002/153] xfrm: fix stack access out of bounds with CONFIG_XFRM_SUB_POLICY When CONFIG_XFRM_SUB_POLICY=y, xfrm_dst stores a copy of the flowi for that dst. Unfortunately, the code that allocates and fills this copy doesn't care about what type of flowi (flowi, flowi4, flowi6) gets passed. 
In multiple code paths (from raw_sendmsg, from TCP when replying to a FIN, in vxlan, geneve, and gre), the flowi that gets passed to xfrm is actually an on-stack flowi4, so we end up reading stuff from the stack past the end of the flowi4 struct. Since xfrm_dst->origin isn't used anywhere following commit ca116922afa8 ("xfrm: Eliminate "fl" and "pol" args to xfrm_bundle_ok()."), just get rid of it. xfrm_dst->partner isn't used either, so get rid of that too. Fixes: 9d6ec938019c ("ipv4: Use flowi4 in public route lookup interfaces.") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 10 --------- net/xfrm/xfrm_policy.c | 47 ------------------------------------------ 2 files changed, 57 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6793a30c66b1..7e7e2b0d2915 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -979,10 +979,6 @@ struct xfrm_dst { struct flow_cache_object flo; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols, num_xfrms; -#ifdef CONFIG_XFRM_SUB_POLICY - struct flowi *origin; - struct xfrm_selector *partner; -#endif u32 xfrm_genid; u32 policy_genid; u32 route_mtu_cached; @@ -998,12 +994,6 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) dst_release(xdst->route); if (likely(xdst->u.dst.xfrm)) xfrm_state_put(xdst->u.dst.xfrm); -#ifdef CONFIG_XFRM_SUB_POLICY - kfree(xdst->origin); - xdst->origin = NULL; - kfree(xdst->partner); - xdst->partner = NULL; -#endif } #endif diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b00a1d5a7f52..ed4e52d95172 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1797,43 +1797,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, goto out; } -#ifdef CONFIG_XFRM_SUB_POLICY -static int xfrm_dst_alloc_copy(void **target, const void *src, int size) -{ - if (!*target) { - *target = kmalloc(size, GFP_ATOMIC); - if (!*target) - return -ENOMEM; - } - - memcpy(*target, src, size); - 
return 0; -} -#endif - -static int xfrm_dst_update_parent(struct dst_entry *dst, - const struct xfrm_selector *sel) -{ -#ifdef CONFIG_XFRM_SUB_POLICY - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - return xfrm_dst_alloc_copy((void **)&(xdst->partner), - sel, sizeof(*sel)); -#else - return 0; -#endif -} - -static int xfrm_dst_update_origin(struct dst_entry *dst, - const struct flowi *fl) -{ -#ifdef CONFIG_XFRM_SUB_POLICY - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl)); -#else - return 0; -#endif -} - static int xfrm_expand_policies(const struct flowi *fl, u16 family, struct xfrm_policy **pols, int *num_pols, int *num_xfrms) @@ -1905,16 +1868,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, xdst = (struct xfrm_dst *)dst; xdst->num_xfrms = err; - if (num_pols > 1) - err = xfrm_dst_update_parent(dst, &pols[1]->selector); - else - err = xfrm_dst_update_origin(dst, fl); - if (unlikely(err)) { - dst_free(dst); - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); - return ERR_PTR(err); - } - xdst->num_pols = num_pols; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); From d90c902449a7561f1b1d58ba5a0d11728ce8b0b2 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 5 May 2017 07:40:42 +0200 Subject: [PATCH 003/153] af_key: Fix slab-out-of-bounds in pfkey_compile_policy. The sadb_x_sec_len is stored in the unit 'byte divided by eight'. So we have to multiply this value by eight before we can do size checks. Otherwise we may get a slab-out-of-bounds when we memcpy the user sec_ctx. 
Fixes: df71837d502 ("[LSM-IPSec]: Security association restriction.") Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Steffen Klassert --- net/key/af_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index c1950bb14735..512dc43d0ce6 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3285,7 +3285,7 @@ static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, p += pol->sadb_x_policy_len*8; sec_ctx = (struct sadb_x_sec_ctx *)p; if (len < pol->sadb_x_policy_len*8 + - sec_ctx->sadb_x_sec_len) { + sec_ctx->sadb_x_sec_len*8) { *dir = -EINVAL; goto out; } From 2c1497bbc8fdee897341ab48ee9c9209b421b8c0 Mon Sep 17 00:00:00 2001 From: Ilan Tayari Date: Mon, 8 May 2017 10:30:18 +0300 Subject: [PATCH 004/153] xfrm: Fix NETDEV_DOWN with IPSec offload Upon NETDEV_DOWN event, all xfrm_state objects which are bound to the device are flushed. The condition for this is wrong, though, testing dev->hw_features instead of dev->features. If a device has non-user-modifiable NETIF_F_HW_ESP, then its xfrm_state objects are not flushed, causing a crash later on after the device is deleted. Check dev->features instead of dev->hw_features. 
Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Ilan Tayari Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 8ec8a3fcf8d4..574e6f32f94f 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -170,7 +170,7 @@ static int xfrm_dev_feat_change(struct net_device *dev) static int xfrm_dev_down(struct net_device *dev) { - if (dev->hw_features & NETIF_F_HW_ESP) + if (dev->features & NETIF_F_HW_ESP) xfrm_dev_state_flush(dev_net(dev), dev, true); xfrm_garbage_collect(dev_net(dev)); From 29f6ca6916e29fc46f1418885374d9ed50430687 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 May 2017 14:59:02 +0900 Subject: [PATCH 005/153] scsi: sd: Unlock zone in case of error in sd_setup_write_same_cmnd() scsi_io_init() may fail, leaving a zone of a zoned block device locked. Fix this by properly unlocking the write same request target zone if scsi_io_init() fails. Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index f9d1432d7cc5..e60a309b26bf 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -948,6 +948,10 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) rq->__data_len = sdp->sector_size; ret = scsi_init_io(cmd); rq->__data_len = nr_bytes; + + if (sd_is_zoned(sdkp) && ret != BLKPREP_OK) + sd_zbc_write_unlock_zone(cmd); + return ret; } From ed44fd7fd8a6785b73cfc6d44594c434e578d724 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 May 2017 15:48:19 +0900 Subject: [PATCH 006/153] scsi: sd: Write lock zone for REQ_OP_WRITE_ZEROES For a zoned block device, sd_zbc_complete() handles zone write unlock on completion of a REQ_OP_WRITE_ZEROES command but the zone write locking is missing from sd_setup_write_zeroes_cmnd(). 
This patch fixes this problem by locking the target zone of a REQ_OP_WRITE_ZEROES request. Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index e60a309b26bf..de9e2f2ef662 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -827,21 +827,32 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9); u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9); + int ret; if (!(rq->cmd_flags & REQ_NOUNMAP)) { switch (sdkp->zeroing_mode) { case SD_ZERO_WS16_UNMAP: - return sd_setup_write_same16_cmnd(cmd, true); + ret = sd_setup_write_same16_cmnd(cmd, true); + goto out; case SD_ZERO_WS10_UNMAP: - return sd_setup_write_same10_cmnd(cmd, true); + ret = sd_setup_write_same10_cmnd(cmd, true); + goto out; } } if (sdp->no_write_same) return BLKPREP_INVALID; + if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff) - return sd_setup_write_same16_cmnd(cmd, false); - return sd_setup_write_same10_cmnd(cmd, false); + ret = sd_setup_write_same16_cmnd(cmd, false); + else + ret = sd_setup_write_same10_cmnd(cmd, false); + +out: + if (sd_is_zoned(sdkp) && ret == BLKPREP_OK) + return sd_zbc_write_lock_zone(cmd); + + return ret; } static void sd_config_write_same(struct scsi_disk *sdkp) From 48ae8484e9fc324b4968d33c585e54bc98e44d61 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 10 May 2017 09:53:40 +0200 Subject: [PATCH 007/153] scsi: sg: don't return bogus Sg_requests If the list search in sg_get_rq_mark() fails to find a valid request, we return a bogus element. This then can later lead to a GPF in sg_remove_scat(). So don't return bogus Sg_requests in sg_get_rq_mark() but NULL in case the list search doesn't find a valid request. 
Signed-off-by: Johannes Thumshirn Reported-by: Andrey Konovalov Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Doug Gilbert Reviewed-by: Hannes Reinecke Acked-by: Doug Gilbert Signed-off-by: Martin K. Petersen --- drivers/scsi/sg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0a38ba01b7b4..82c33a6edbea 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -2074,11 +2074,12 @@ sg_get_rq_mark(Sg_fd * sfp, int pack_id) if ((1 == resp->done) && (!resp->sg_io_owned) && ((-1 == pack_id) || (resp->header.pack_id == pack_id))) { resp->done = 2; /* guard against other readers */ - break; + write_unlock_irqrestore(&sfp->rq_list_lock, iflags); + return resp; } } write_unlock_irqrestore(&sfp->rq_list_lock, iflags); - return resp; + return NULL; } /* always adds to end of list */ From f83914fdfcc3ecb62a5a83eeb609ff59a9c2052d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 12 May 2017 14:34:37 +0200 Subject: [PATCH 008/153] ALSA: usb-audio: fix Amanero Combo384 quirk on big-endian hosts Add missing endianness conversion when using the USB device-descriptor bcdDevice field when applying the Amanero Combo384 (endianness!) quirk. 
Fixes: 3eff682d765b ("ALSA: usb-audio: Support both DSD LE/BE Amanero firmware versions") Cc: Jussi Laako Signed-off-by: Johan Hovold Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 01eff6ce6401..d7b0b0a3a2db 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1364,7 +1364,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, /* Amanero Combo384 USB interface with native DSD support */ case USB_ID(0x16d0, 0x071a): if (fp->altsetting == 2) { - switch (chip->dev->descriptor.bcdDevice) { + switch (le16_to_cpu(chip->dev->descriptor.bcdDevice)) { case 0x199: return SNDRV_PCM_FMTBIT_DSD_U32_LE; case 0x19b: From fa16b69f1299004b60b625f181143500a246e5cb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 16 May 2017 09:11:33 +0200 Subject: [PATCH 009/153] ALSA: hda - No loopback on ALC299 codec ALC299 has no loopback mixer, but the driver still tries to add a beep control over the mixer NID which leads to the error at accessing it. This patch fixes it by properly declaring mixer_nid=0 for this codec. 
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=195775 Fixes: 28f1f9b26cee ("ALSA: hda/realtek - Add new codec ID ALC299") Cc: stable@vger.kernel.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 58df440013c5..9c22ad694534 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6465,8 +6465,11 @@ static int patch_alc269(struct hda_codec *codec) break; case 0x10ec0225: case 0x10ec0295: + spec->codec_variant = ALC269_TYPE_ALC225; + break; case 0x10ec0299: spec->codec_variant = ALC269_TYPE_ALC225; + spec->gen.mixer_nid = 0; /* no loopback on ALC299 */ break; case 0x10ec0234: case 0x10ec0274: From 0daaecacb83bc6b656a56393ab77a31c28139bc7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 12 May 2017 10:44:08 -0700 Subject: [PATCH 010/153] xfs: fix indlen accounting error on partial delalloc conversion The delalloc -> real block conversion path uses an incorrect calculation in the case where the middle part of a delalloc extent is being converted. This is documented as a rare situation because XFS generally attempts to maximize contiguity by converting as much of a delalloc extent as possible. If this situation does occur, the indlen reservation for the two new delalloc extents left behind by the conversion of the middle range is calculated and compared with the original reservation. If more blocks are required, the delta is allocated from the global block pool. This delta value can be characterized as the difference between the new total requirement (temp + temp2) and the currently available reservation minus those blocks that have already been allocated (startblockval(PREV.br_startblock) - allocated). The problem is that the current code does not account for previously allocated blocks correctly. 
It subtracts the current allocation count from the (new - old) delta rather than the old indlen reservation. This means that more indlen blocks than have been allocated end up stashed in the remaining extents and free space accounting is broken as a result. Fix up the calculation to subtract the allocated block count from the original extent indlen and thus correctly allocate the reservation delta based on the difference between the new total requirement and the unused blocks from the original reservation. Also remove a bogus assert that contradicts the fact that the new indlen reservation can be larger than the original indlen reservation. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f02eb7673392..8adb91b05588 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -2065,8 +2065,10 @@ xfs_bmap_add_extent_delay_real( } temp = xfs_bmap_worst_indlen(bma->ip, temp); temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); - diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + diff = (int)(temp + temp2 - + (startblockval(PREV.br_startblock) - + (bma->cur ? + bma->cur->bc_private.b.allocated : 0))); if (diff > 0) { error = xfs_mod_fdblocks(bma->ip->i_mount, -((int64_t)diff), false); @@ -2123,7 +2125,6 @@ xfs_bmap_add_extent_delay_real( temp = da_new; if (bma->cur) temp += bma->cur->bc_private.b.allocated; - ASSERT(temp <= da_old); if (temp < da_old) xfs_mod_fdblocks(bma->ip->i_mount, (int64_t)(da_old - temp), false); From 6eadbf4c8ba816c10d1c97bed9aa861d9fd17809 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Fri, 12 May 2017 10:44:08 -0700 Subject: [PATCH 011/153] xfs: BMAPX shouldn't barf on inline-format directories When we're fulfilling a BMAPX request, jump out early if the data fork is in local format. This prevents us from hitting a debugging check in bmapi_read and barfing errors back to userspace. The on-disk extent count check later isn't sufficient for IF_DELALLOC mode because da extents are in memory and not on disk. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_bmap_util.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2b954308a1d6..2e8851ee6759 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -582,9 +582,13 @@ xfs_getbmap( } break; default: + /* Local format data forks report no extents. */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + bmv->bmv_entries = 0; + return 0; + } if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) return -EINVAL; if (xfs_get_extsz_hint(ip) || From 6e747506dde195d3d05fe2bb8ef78aceba28a5e3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 12 May 2017 10:44:11 -0700 Subject: [PATCH 012/153] xfs: fix warnings about unused stack variables Reduce stack usage and get rid of compiler warnings by eliminating unused variables. Signed-off-by: Darrick J. 
Wong Reviewed-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_bmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8adb91b05588..a7048eafa8e6 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1280,7 +1280,6 @@ xfs_bmap_read_extents( xfs_bmbt_rec_t *frp; xfs_fsblock_t nextbno; xfs_extnum_t num_recs; - xfs_extnum_t start; num_recs = xfs_btree_get_numrecs(block); if (unlikely(i + num_recs > room)) { @@ -1303,7 +1302,6 @@ xfs_bmap_read_extents( * Copy records into the extent records. */ frp = XFS_BMBT_REC_ADDR(mp, block, 1); - start = i; for (j = 0; j < num_recs; j++, i++, frp++) { xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); trp->l0 = be64_to_cpu(frp->l0); From 892d2a5f705723b2cb488bfb38bcbdcf83273184 Mon Sep 17 00:00:00 2001 From: Zorro Lang Date: Mon, 15 May 2017 08:40:02 -0700 Subject: [PATCH 013/153] xfs: bad assertion for delalloc an extent that start at i_size By running fsstress for a long enough time in RHEL-7, I found an assertion failure (harder to reproduce on linux-4.11, but the problem is still there): XFS: Assertion failed: (iflags & BMV_IF_DELALLOC) != 0, file: fs/xfs/xfs_bmap_util.c The assertion is in the xfs_getbmap() function: if (map[i].br_startblock == DELAYSTARTBLOCK && --> map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ASSERT((iflags & BMV_IF_DELALLOC) != 0); When map[i].br_startoff == XFS_B_TO_FSB(mp, XFS_ISIZE(ip)), the startoff is just at EOF. But we only need to check delalloc extents that are within EOF, not including EOF. Signed-off-by: Zorro Lang Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2e8851ee6759..9e3cc2146d5b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -716,7 +716,7 @@ xfs_getbmap( * extents. 
*/ if (map[i].br_startblock == DELAYSTARTBLOCK && - map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ASSERT((iflags & BMV_IF_DELALLOC) != 0); if (map[i].br_startblock == HOLESTARTBLOCK && From ea9a46e1c49251331dbfda19ced7114337966178 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 12 May 2017 10:44:10 -0700 Subject: [PATCH 014/153] xfs: only return detailed fsmap info if the caller has CAP_SYS_ADMIN There were a number of handwaving complaints that one could "possibly" use inode numbers and extent maps to fingerprint a filesystem hosting multiple containers and somehow use the information to guess at the contents of other containers and attack them. Despite the total lack of any demonstration that this is actually possible, it's easier to restrict access now and broaden it later, so use the rmapbt fsmap backends only if the caller has CAP_SYS_ADMIN. Unprivileged users will just have to make do with only getting the free space and static metadata placement information. Signed-off-by: Darrick J. Wong Reviewed-by: Carlos Maiolino --- fs/xfs/xfs_fsmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 3683819887a5..814ed729881d 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -828,6 +828,7 @@ xfs_getfsmap( struct xfs_fsmap dkeys[2]; /* per-dev keys */ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; struct xfs_getfsmap_info info = { NULL }; + bool use_rmap; int i; int error = 0; @@ -837,12 +838,14 @@ xfs_getfsmap( !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) return -EINVAL; + use_rmap = capable(CAP_SYS_ADMIN) && + xfs_sb_version_hasrmapbt(&mp->m_sb); head->fmh_entries = 0; /* Set up our device handlers. 
*/ memset(handlers, 0, sizeof(handlers)); handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else handlers[0].fn = xfs_getfsmap_datadev_bnobt; From 4fa8324461b824eaea9b6695464395710fe20c44 Mon Sep 17 00:00:00 2001 From: Derek Basehore Date: Thu, 11 May 2017 14:34:24 +0200 Subject: [PATCH 015/153] scsi: sd: Ignore sync cache failures when not supported Some external hard drives don't support the sync command even though the hard drive has write cache enabled. In this case, upon suspend request, sync cache failures are ignored if the error code in the sense header is ILLEGAL_REQUEST. There's not much we can do for these drives, so we shouldn't fail to suspend for this error case. The drive may stay powered if that's the setup for the port it's plugged into. Signed-off-by: Derek Basehore Signed-off-by: Thierry Escande Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index de9e2f2ef662..b6bb4e0ce0e3 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1582,17 +1582,21 @@ static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing) return retval; } -static int sd_sync_cache(struct scsi_disk *sdkp) +static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { int retries, res; struct scsi_device *sdp = sdkp->device; const int timeout = sdp->request_queue->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; - struct scsi_sense_hdr sshdr; + struct scsi_sense_hdr my_sshdr; if (!scsi_device_online(sdp)) return -ENODEV; + /* caller might not be interested in sense, but we need it */ + if (!sshdr) + sshdr = &my_sshdr; + for (retries = 3; retries > 0; --retries) { unsigned char cmd[10] = { 0 }; @@ -1601,7 +1605,7 @@ static int sd_sync_cache(struct 
scsi_disk *sdkp) * Leave the rest of the command zero to indicate * flush everything. */ - res = scsi_execute(sdp, cmd, DMA_NONE, NULL, 0, NULL, &sshdr, + res = scsi_execute(sdp, cmd, DMA_NONE, NULL, 0, NULL, sshdr, timeout, SD_MAX_RETRIES, 0, RQF_PM, NULL); if (res == 0) break; @@ -1611,11 +1615,12 @@ static int sd_sync_cache(struct scsi_disk *sdkp) sd_print_result(sdkp, "Synchronize Cache(10) failed", res); if (driver_byte(res) & DRIVER_SENSE) - sd_print_sense_hdr(sdkp, &sshdr); + sd_print_sense_hdr(sdkp, sshdr); + /* we need to evaluate the error return */ - if (scsi_sense_valid(&sshdr) && - (sshdr.asc == 0x3a || /* medium not present */ - sshdr.asc == 0x20)) /* invalid command */ + if (scsi_sense_valid(sshdr) && + (sshdr->asc == 0x3a || /* medium not present */ + sshdr->asc == 0x20)) /* invalid command */ /* this is no error here */ return 0; @@ -3459,7 +3464,7 @@ static void sd_shutdown(struct device *dev) if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); - sd_sync_cache(sdkp); + sd_sync_cache(sdkp, NULL); } if (system_state != SYSTEM_RESTART && sdkp->device->manage_start_stop) { @@ -3471,6 +3476,7 @@ static void sd_shutdown(struct device *dev) static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) { struct scsi_disk *sdkp = dev_get_drvdata(dev); + struct scsi_sense_hdr sshdr; int ret = 0; if (!sdkp) /* E.g.: runtime suspend following sd_remove() */ @@ -3478,12 +3484,23 @@ static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); - ret = sd_sync_cache(sdkp); + ret = sd_sync_cache(sdkp, &sshdr); + if (ret) { /* ignore OFFLINE device */ if (ret == -ENODEV) - ret = 0; - goto done; + return 0; + + if (!scsi_sense_valid(&sshdr) || + sshdr.sense_key != ILLEGAL_REQUEST) + return ret; + + /* + * sshdr.sense_key == ILLEGAL_REQUEST means this drive + * doesn't support sync. 
There's not much to do and + * suspend shouldn't fail. + */ + ret = 0; } } @@ -3495,7 +3512,6 @@ static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) ret = 0; } -done: return ret; } From dd6e1f71b785a6ac2511e2ddb86315f292873e59 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 11 May 2017 17:24:44 -0500 Subject: [PATCH 016/153] scsi: libfc: fix incorrect variable assignment Previous assignment was causing the use of the uninitialized variable _explan_ inside fc_seq_ls_rjt() function, which in this particular case is being called by fc_seq_els_rsp_send(). [mkp: fixed typo] Addresses-Coverity-ID: 1398125 Signed-off-by: Gustavo A. R. Silva Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/libfc/fc_rport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c index b44c3136eb51..520325867e2b 100644 --- a/drivers/scsi/libfc/fc_rport.c +++ b/drivers/scsi/libfc/fc_rport.c @@ -1422,7 +1422,7 @@ static void fc_rport_recv_rtv_req(struct fc_rport_priv *rdata, fp = fc_frame_alloc(lport, sizeof(*rtv)); if (!fp) { rjt_data.reason = ELS_RJT_UNAB; - rjt_data.reason = ELS_EXPL_INSUF_RES; + rjt_data.explan = ELS_EXPL_INSUF_RES; fc_seq_els_rsp_send(in_fp, ELS_LS_RJT, &rjt_data); goto drop; } From 845d9e8df2fa879e6494e786f290e1fd5560ac8c Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:38 -0700 Subject: [PATCH 017/153] scsi: lpfc: Fix used-RPI accounting problem. With 255 vports created, a link transition can cause a crash. When going through discovery after a link bounce the driver is using rpis before the cmd FCOE_POST_HDR_TEMPLATES completes. By doing that the next rpi bumps the rpi range out of the boundary. The fix is to increment the next_rpi only when the FCOE_POST_HDR_TEMPLATE succeeds. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. 
Petersen --- drivers/scsi/lpfc/lpfc_els.c | 3 ++- drivers/scsi/lpfc/lpfc_init.c | 24 +++++------------------- drivers/scsi/lpfc/lpfc_sli.c | 8 ++++++++ drivers/scsi/lpfc/lpfc_sli4.h | 1 + 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 67827e397431..3f9f6d5f8c69 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -8667,7 +8667,8 @@ lpfc_cmpl_els_fdisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, lpfc_do_scr_ns_plogi(phba, vport); goto out; fdisc_failed: - if (vport->fc_vport->vport_state != FC_VPORT_NO_FABRIC_RSCS) + if (vport->fc_vport && + (vport->fc_vport->vport_state != FC_VPORT_NO_FABRIC_RSCS)) lpfc_vport_set_state(vport, FC_VPORT_FAILED); /* Cancel discovery timer */ lpfc_can_disctmo(vport); diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 4b1eb98c228d..b1b181a756dc 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -6525,7 +6525,6 @@ lpfc_sli4_create_rpi_hdr(struct lpfc_hba *phba) uint16_t rpi_limit, curr_rpi_range; struct lpfc_dmabuf *dmabuf; struct lpfc_rpi_hdr *rpi_hdr; - uint32_t rpi_count; /* * If the SLI4 port supports extents, posting the rpi header isn't @@ -6538,8 +6537,7 @@ lpfc_sli4_create_rpi_hdr(struct lpfc_hba *phba) return NULL; /* The limit on the logical index is just the max_rpi count. */ - rpi_limit = phba->sli4_hba.max_cfg_param.rpi_base + - phba->sli4_hba.max_cfg_param.max_rpi - 1; + rpi_limit = phba->sli4_hba.max_cfg_param.max_rpi; spin_lock_irq(&phba->hbalock); /* @@ -6550,18 +6548,10 @@ lpfc_sli4_create_rpi_hdr(struct lpfc_hba *phba) curr_rpi_range = phba->sli4_hba.next_rpi; spin_unlock_irq(&phba->hbalock); - /* - * The port has a limited number of rpis. The increment here - * is LPFC_RPI_HDR_COUNT - 1 to account for the starting value - * and to allow the full max_rpi range per port. 
- */ - if ((curr_rpi_range + (LPFC_RPI_HDR_COUNT - 1)) > rpi_limit) - rpi_count = rpi_limit - curr_rpi_range; - else - rpi_count = LPFC_RPI_HDR_COUNT; - - if (!rpi_count) + /* Reached full RPI range */ + if (curr_rpi_range == rpi_limit) return NULL; + /* * First allocate the protocol header region for the port. The * port expects a 4KB DMA-mapped memory region that is 4K aligned. @@ -6595,13 +6585,9 @@ lpfc_sli4_create_rpi_hdr(struct lpfc_hba *phba) /* The rpi_hdr stores the logical index only. */ rpi_hdr->start_rpi = curr_rpi_range; + rpi_hdr->next_rpi = phba->sli4_hba.next_rpi + LPFC_RPI_HDR_COUNT; list_add_tail(&rpi_hdr->list, &phba->sli4_hba.lpfc_rpi_hdr_list); - /* - * The next_rpi stores the next logical module-64 rpi value used - * to post physical rpis in subsequent rpi postings. - */ - phba->sli4_hba.next_rpi += rpi_count; spin_unlock_irq(&phba->hbalock); return rpi_hdr; diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 2a4fc00dfa9b..e2d25ae5ba45 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -17137,6 +17137,14 @@ lpfc_sli4_post_rpi_hdr(struct lpfc_hba *phba, struct lpfc_rpi_hdr *rpi_page) "status x%x add_status x%x, mbx status x%x\n", shdr_status, shdr_add_status, rc); rc = -ENXIO; + } else { + /* + * The next_rpi stores the next logical module-64 rpi value used + * to post physical rpis in subsequent rpi postings. 
+ */ + spin_lock_irq(&phba->hbalock); + phba->sli4_hba.next_rpi = rpi_page->next_rpi; + spin_unlock_irq(&phba->hbalock); } return rc; } diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index da46471337c8..915e8d5581bd 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -698,6 +698,7 @@ struct lpfc_rpi_hdr { struct lpfc_dmabuf *dmabuf; uint32_t page_count; uint32_t start_rpi; + uint16_t next_rpi; }; struct lpfc_rsrc_blks { From 0c9c6a75141810acade82add4f4708959a5d3a1d Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:39 -0700 Subject: [PATCH 018/153] scsi: lpfc: Fix system crash when port is reset. The driver panics when using the els_wq during port reset. Check for NULL els_wq before dereferencing. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc.h | 8 ++++++-- drivers/scsi/lpfc/lpfc_hbadisc.c | 6 +++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index 6d7840b096e6..62571fa9c6ad 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -1228,7 +1228,11 @@ lpfc_sli_read_hs(struct lpfc_hba *phba) static inline struct lpfc_sli_ring * lpfc_phba_elsring(struct lpfc_hba *phba) { - if (phba->sli_rev == LPFC_SLI_REV4) - return phba->sli4_hba.els_wq->pring; + if (phba->sli_rev == LPFC_SLI_REV4) { + if (phba->sli4_hba.els_wq) + return phba->sli4_hba.els_wq->pring; + else + return NULL; + } return &phba->sli.sli3_ring[LPFC_ELS_RING]; } diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 0482c5580331..dcc9b3858778 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -693,9 +693,9 @@ lpfc_work_done(struct lpfc_hba *phba) pring = lpfc_phba_elsring(phba); status = (ha_copy & (HA_RXMASK << (4*LPFC_ELS_RING))); status >>= (4*LPFC_ELS_RING); - if ((status & 
HA_RXMASK) || - (pring->flag & LPFC_DEFERRED_RING_EVENT) || - (phba->hba_flag & HBA_SP_QUEUE_EVT)) { + if (pring && (status & HA_RXMASK || + pring->flag & LPFC_DEFERRED_RING_EVENT || + phba->hba_flag & HBA_SP_QUEUE_EVT)) { if (pring->flag & LPFC_STOP_IOCB_EVENT) { pring->flag |= LPFC_DEFERRED_RING_EVENT; /* Set the lpfc data pending flag */ From 547077a44b3b49f56c0f05c0b46c8c617dea591d Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:40 -0700 Subject: [PATCH 019/153] scsi: lpfc: Adding additional stats counters for nvme. More debug messages added for nvme statistics. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_attr.c | 24 ++++++++++------- drivers/scsi/lpfc/lpfc_debugfs.c | 27 +++++++++++-------- drivers/scsi/lpfc/lpfc_nvmet.c | 46 ++++++++++++++++++++++++-------- drivers/scsi/lpfc/lpfc_nvmet.h | 12 +++++---- drivers/scsi/lpfc/lpfc_sli.c | 38 ++++++++++++++++++++++---- drivers/scsi/lpfc/lpfc_sli4.h | 2 +- 6 files changed, 106 insertions(+), 43 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 4830370bfab1..41ec7451689b 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -205,8 +205,9 @@ lpfc_nvme_info_show(struct device *dev, struct device_attribute *attr, atomic_read(&tgtp->xmt_ls_rsp_error)); len += snprintf(buf+len, PAGE_SIZE-len, - "FCP: Rcv %08x Drop %08x\n", + "FCP: Rcv %08x Release %08x Drop %08x\n", atomic_read(&tgtp->rcv_fcp_cmd_in), + atomic_read(&tgtp->xmt_fcp_release), atomic_read(&tgtp->rcv_fcp_cmd_drop)); if (atomic_read(&tgtp->rcv_fcp_cmd_in) != @@ -218,15 +219,12 @@ lpfc_nvme_info_show(struct device *dev, struct device_attribute *attr, } len += snprintf(buf+len, PAGE_SIZE-len, - "FCP Rsp: RD %08x rsp %08x WR %08x rsp %08x\n", + "FCP Rsp: RD %08x rsp %08x WR %08x rsp %08x " + "drop %08x\n", atomic_read(&tgtp->xmt_fcp_read), 
atomic_read(&tgtp->xmt_fcp_read_rsp), atomic_read(&tgtp->xmt_fcp_write), - atomic_read(&tgtp->xmt_fcp_rsp)); - - len += snprintf(buf+len, PAGE_SIZE-len, - "FCP Rsp: abort %08x drop %08x\n", - atomic_read(&tgtp->xmt_fcp_abort), + atomic_read(&tgtp->xmt_fcp_rsp), atomic_read(&tgtp->xmt_fcp_drop)); len += snprintf(buf+len, PAGE_SIZE-len, @@ -236,10 +234,16 @@ lpfc_nvme_info_show(struct device *dev, struct device_attribute *attr, atomic_read(&tgtp->xmt_fcp_rsp_drop)); len += snprintf(buf+len, PAGE_SIZE-len, - "ABORT: Xmt %08x Err %08x Cmpl %08x", + "ABORT: Xmt %08x Cmpl %08x\n", + atomic_read(&tgtp->xmt_fcp_abort), + atomic_read(&tgtp->xmt_fcp_abort_cmpl)); + + len += snprintf(buf + len, PAGE_SIZE - len, + "ABORT: Sol %08x Usol %08x Err %08x Cmpl %08x", + atomic_read(&tgtp->xmt_abort_sol), + atomic_read(&tgtp->xmt_abort_unsol), atomic_read(&tgtp->xmt_abort_rsp), - atomic_read(&tgtp->xmt_abort_rsp_error), - atomic_read(&tgtp->xmt_abort_cmpl)); + atomic_read(&tgtp->xmt_abort_rsp_error)); len += snprintf(buf+len, PAGE_SIZE-len, "\n"); return len; diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index fce549a91911..a41daedeb967 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -797,11 +797,6 @@ lpfc_debugfs_nvmestat_data(struct lpfc_vport *vport, char *buf, int size) atomic_read(&tgtp->xmt_fcp_write), atomic_read(&tgtp->xmt_fcp_rsp)); - len += snprintf(buf + len, size - len, - "FCP Rsp: abort %08x drop %08x\n", - atomic_read(&tgtp->xmt_fcp_abort), - atomic_read(&tgtp->xmt_fcp_drop)); - len += snprintf(buf + len, size - len, "FCP Rsp Cmpl: %08x err %08x drop %08x\n", atomic_read(&tgtp->xmt_fcp_rsp_cmpl), @@ -809,10 +804,16 @@ lpfc_debugfs_nvmestat_data(struct lpfc_vport *vport, char *buf, int size) atomic_read(&tgtp->xmt_fcp_rsp_drop)); len += snprintf(buf + len, size - len, - "ABORT: Xmt %08x Err %08x Cmpl %08x", + "ABORT: Xmt %08x Cmpl %08x\n", + atomic_read(&tgtp->xmt_fcp_abort), + 
atomic_read(&tgtp->xmt_fcp_abort_cmpl)); + + len += snprintf(buf + len, size - len, + "ABORT: Sol %08x Usol %08x Err %08x Cmpl %08x", + atomic_read(&tgtp->xmt_abort_sol), + atomic_read(&tgtp->xmt_abort_unsol), atomic_read(&tgtp->xmt_abort_rsp), - atomic_read(&tgtp->xmt_abort_rsp_error), - atomic_read(&tgtp->xmt_abort_cmpl)); + atomic_read(&tgtp->xmt_abort_rsp_error)); len += snprintf(buf + len, size - len, "\n"); @@ -1959,6 +1960,7 @@ lpfc_debugfs_nvmestat_write(struct file *file, const char __user *buf, atomic_set(&tgtp->rcv_ls_req_out, 0); atomic_set(&tgtp->rcv_ls_req_drop, 0); atomic_set(&tgtp->xmt_ls_abort, 0); + atomic_set(&tgtp->xmt_ls_abort_cmpl, 0); atomic_set(&tgtp->xmt_ls_rsp, 0); atomic_set(&tgtp->xmt_ls_drop, 0); atomic_set(&tgtp->xmt_ls_rsp_error, 0); @@ -1967,19 +1969,22 @@ lpfc_debugfs_nvmestat_write(struct file *file, const char __user *buf, atomic_set(&tgtp->rcv_fcp_cmd_in, 0); atomic_set(&tgtp->rcv_fcp_cmd_out, 0); atomic_set(&tgtp->rcv_fcp_cmd_drop, 0); - atomic_set(&tgtp->xmt_fcp_abort, 0); atomic_set(&tgtp->xmt_fcp_drop, 0); atomic_set(&tgtp->xmt_fcp_read_rsp, 0); atomic_set(&tgtp->xmt_fcp_read, 0); atomic_set(&tgtp->xmt_fcp_write, 0); atomic_set(&tgtp->xmt_fcp_rsp, 0); + atomic_set(&tgtp->xmt_fcp_release, 0); atomic_set(&tgtp->xmt_fcp_rsp_cmpl, 0); atomic_set(&tgtp->xmt_fcp_rsp_error, 0); atomic_set(&tgtp->xmt_fcp_rsp_drop, 0); + atomic_set(&tgtp->xmt_fcp_abort, 0); + atomic_set(&tgtp->xmt_fcp_abort_cmpl, 0); + atomic_set(&tgtp->xmt_abort_sol, 0); + atomic_set(&tgtp->xmt_abort_unsol, 0); atomic_set(&tgtp->xmt_abort_rsp, 0); atomic_set(&tgtp->xmt_abort_rsp_error, 0); - atomic_set(&tgtp->xmt_abort_cmpl, 0); } return nbytes; } @@ -3143,7 +3148,7 @@ __lpfc_idiag_print_rqpair(struct lpfc_queue *qp, struct lpfc_queue *datqp, "\t\t%s RQ info: ", rqtype); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "AssocCQID[%02d]: RQ-STAT[nopost:x%x nobuf:x%x " - "trunc:x%x rcv:x%llx]\n", + "posted:x%x rcv:x%llx]\n", qp->assoc_qid, qp->q_cnt_1, 
qp->q_cnt_2, qp->q_cnt_3, (unsigned long long)qp->q_cnt_4); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index 94434e621c33..bb12e2c9fbf4 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -502,6 +502,7 @@ lpfc_nvmet_xmt_ls_rsp(struct nvmet_fc_target_port *tgtport, "6150 LS Drop IO x%x: Prep\n", ctxp->oxid); lpfc_in_buf_free(phba, &nvmebuf->dbuf); + atomic_inc(&nvmep->xmt_ls_abort); lpfc_nvmet_unsol_ls_issue_abort(phba, ctxp, ctxp->sid, ctxp->oxid); return -ENOMEM; @@ -545,6 +546,7 @@ lpfc_nvmet_xmt_ls_rsp(struct nvmet_fc_target_port *tgtport, lpfc_nlp_put(nvmewqeq->context1); lpfc_in_buf_free(phba, &nvmebuf->dbuf); + atomic_inc(&nvmep->xmt_ls_abort); lpfc_nvmet_unsol_ls_issue_abort(phba, ctxp, ctxp->sid, ctxp->oxid); return -ENXIO; } @@ -692,6 +694,7 @@ static void lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *rsp) { + struct lpfc_nvmet_tgtport *lpfc_nvmep = tgtport->private; struct lpfc_nvmet_rcv_ctx *ctxp = container_of(rsp, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req); struct lpfc_hba *phba = ctxp->phba; @@ -710,6 +713,8 @@ lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport, lpfc_nvmeio_data(phba, "NVMET FCP FREE: xri x%x ste %d\n", ctxp->oxid, ctxp->state, 0); + atomic_inc(&lpfc_nvmep->xmt_fcp_release); + if (aborting) return; @@ -796,6 +801,7 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba) atomic_set(&tgtp->rcv_ls_req_out, 0); atomic_set(&tgtp->rcv_ls_req_drop, 0); atomic_set(&tgtp->xmt_ls_abort, 0); + atomic_set(&tgtp->xmt_ls_abort_cmpl, 0); atomic_set(&tgtp->xmt_ls_rsp, 0); atomic_set(&tgtp->xmt_ls_drop, 0); atomic_set(&tgtp->xmt_ls_rsp_error, 0); @@ -803,18 +809,21 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba) atomic_set(&tgtp->rcv_fcp_cmd_in, 0); atomic_set(&tgtp->rcv_fcp_cmd_out, 0); atomic_set(&tgtp->rcv_fcp_cmd_drop, 0); - atomic_set(&tgtp->xmt_fcp_abort, 
0); atomic_set(&tgtp->xmt_fcp_drop, 0); atomic_set(&tgtp->xmt_fcp_read_rsp, 0); atomic_set(&tgtp->xmt_fcp_read, 0); atomic_set(&tgtp->xmt_fcp_write, 0); atomic_set(&tgtp->xmt_fcp_rsp, 0); + atomic_set(&tgtp->xmt_fcp_release, 0); atomic_set(&tgtp->xmt_fcp_rsp_cmpl, 0); atomic_set(&tgtp->xmt_fcp_rsp_error, 0); atomic_set(&tgtp->xmt_fcp_rsp_drop, 0); + atomic_set(&tgtp->xmt_fcp_abort, 0); + atomic_set(&tgtp->xmt_fcp_abort_cmpl, 0); + atomic_set(&tgtp->xmt_abort_unsol, 0); + atomic_set(&tgtp->xmt_abort_sol, 0); atomic_set(&tgtp->xmt_abort_rsp, 0); atomic_set(&tgtp->xmt_abort_rsp_error, 0); - atomic_set(&tgtp->xmt_abort_cmpl, 0); } return error; } @@ -1011,6 +1020,7 @@ lpfc_nvmet_unsol_ls_buffer(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, oxid = 0; size = 0; sid = 0; + ctxp = NULL; goto dropit; } @@ -1117,6 +1127,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, oxid = 0; size = 0; sid = 0; + ctxp = NULL; goto dropit; } @@ -1193,8 +1204,11 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, atomic_inc(&tgtp->rcv_fcp_cmd_drop); lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, - "6159 FCP Drop IO x%x: err x%x\n", - ctxp->oxid, rc); + "6159 FCP Drop IO x%x: err x%x: x%x x%x x%x\n", + ctxp->oxid, rc, + atomic_read(&tgtp->rcv_fcp_cmd_in), + atomic_read(&tgtp->rcv_fcp_cmd_out), + atomic_read(&tgtp->xmt_fcp_release)); dropit: lpfc_nvmeio_data(phba, "NVMET FCP DROP: xri x%x sz %d from %06x\n", oxid, size, sid); @@ -1206,7 +1220,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, if (nvmebuf) { nvmebuf->iocbq->hba_wqidx = 0; /* We assume a rcv'ed cmd ALWAYs fits into 1 buffer */ - lpfc_nvmet_rq_post(phba, NULL, &nvmebuf->hbuf); + lpfc_nvmet_rq_post(phba, ctxp, &nvmebuf->hbuf); } #endif } @@ -1812,7 +1826,8 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, result = wcqe->parameter; tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; - atomic_inc(&tgtp->xmt_abort_cmpl); + if (ctxp->flag & LPFC_NVMET_ABORT_OP) + 
atomic_inc(&tgtp->xmt_fcp_abort_cmpl); ctxp->state = LPFC_NVMET_STE_DONE; @@ -1827,6 +1842,7 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, } ctxp->flag &= ~LPFC_NVMET_ABORT_OP; spin_unlock_irqrestore(&ctxp->ctxlock, flags); + atomic_inc(&tgtp->xmt_abort_rsp); lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS, "6165 ABORT cmpl: xri x%x flg x%x (%d) " @@ -1877,7 +1893,8 @@ lpfc_nvmet_unsol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, result = wcqe->parameter; tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; - atomic_inc(&tgtp->xmt_abort_cmpl); + if (ctxp->flag & LPFC_NVMET_ABORT_OP) + atomic_inc(&tgtp->xmt_fcp_abort_cmpl); if (!ctxp) { /* if context is clear, related io alrady complete */ @@ -1907,6 +1924,7 @@ lpfc_nvmet_unsol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, } ctxp->flag &= ~LPFC_NVMET_ABORT_OP; spin_unlock_irqrestore(&ctxp->ctxlock, flags); + atomic_inc(&tgtp->xmt_abort_rsp); lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS, "6316 ABTS cmpl xri x%x flg x%x (%x) " @@ -1953,7 +1971,7 @@ lpfc_nvmet_xmt_ls_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, result = wcqe->parameter; tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; - atomic_inc(&tgtp->xmt_abort_cmpl); + atomic_inc(&tgtp->xmt_ls_abort_cmpl); lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS, "6083 Abort cmpl: ctx %p WCQE: %08x %08x %08x %08x\n", @@ -2104,6 +2122,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba, /* Issue ABTS for this WQE based on iotag */ ctxp->abort_wqeq = lpfc_sli_get_iocbq(phba); if (!ctxp->abort_wqeq) { + atomic_inc(&tgtp->xmt_abort_rsp_error); lpfc_printf_log(phba, KERN_WARNING, LOG_NVME_ABTS, "6161 ABORT failed: No wqeqs: " "xri: x%x\n", ctxp->oxid); @@ -2128,6 +2147,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba, /* driver queued commands are in process of being flushed */ if (phba->hba_flag & HBA_NVME_IOQ_FLUSH) { 
spin_unlock_irqrestore(&phba->hbalock, flags); + atomic_inc(&tgtp->xmt_abort_rsp_error); lpfc_printf_log(phba, KERN_ERR, LOG_NVME, "6163 Driver in reset cleanup - flushing " "NVME Req now. hba_flag x%x oxid x%x\n", @@ -2140,6 +2160,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba, /* Outstanding abort is in progress */ if (abts_wqeq->iocb_flag & LPFC_DRIVER_ABORTED) { spin_unlock_irqrestore(&phba->hbalock, flags); + atomic_inc(&tgtp->xmt_abort_rsp_error); lpfc_printf_log(phba, KERN_ERR, LOG_NVME, "6164 Outstanding NVME I/O Abort Request " "still pending on oxid x%x\n", @@ -2190,9 +2211,12 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba, abts_wqeq->context2 = ctxp; rc = lpfc_sli4_issue_wqe(phba, LPFC_FCP_RING, abts_wqeq); spin_unlock_irqrestore(&phba->hbalock, flags); - if (rc == WQE_SUCCESS) + if (rc == WQE_SUCCESS) { + atomic_inc(&tgtp->xmt_abort_sol); return 0; + } + atomic_inc(&tgtp->xmt_abort_rsp_error); ctxp->flag &= ~LPFC_NVMET_ABORT_OP; lpfc_sli_release_iocbq(phba, abts_wqeq); lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS, @@ -2231,11 +2255,11 @@ lpfc_nvmet_unsol_fcp_issue_abort(struct lpfc_hba *phba, rc = lpfc_sli4_issue_wqe(phba, LPFC_FCP_RING, abts_wqeq); spin_unlock_irqrestore(&phba->hbalock, flags); if (rc == WQE_SUCCESS) { - atomic_inc(&tgtp->xmt_abort_rsp); return 0; } aerr: + atomic_inc(&tgtp->xmt_abort_rsp_error); ctxp->flag &= ~LPFC_NVMET_ABORT_OP; atomic_inc(&tgtp->xmt_abort_rsp_error); lpfc_printf_log(phba, KERN_WARNING, LOG_NVME_ABTS, @@ -2279,7 +2303,7 @@ lpfc_nvmet_unsol_ls_issue_abort(struct lpfc_hba *phba, rc = lpfc_sli4_issue_wqe(phba, LPFC_ELS_RING, abts_wqeq); spin_unlock_irqrestore(&phba->hbalock, flags); if (rc == WQE_SUCCESS) { - atomic_inc(&tgtp->xmt_abort_rsp); + atomic_inc(&tgtp->xmt_abort_unsol); return 0; } diff --git a/drivers/scsi/lpfc/lpfc_nvmet.h b/drivers/scsi/lpfc/lpfc_nvmet.h index 128759fe6650..837210a3e7c8 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.h +++ b/drivers/scsi/lpfc/lpfc_nvmet.h @@ -34,6 +34,7 @@ 
struct lpfc_nvmet_tgtport { atomic_t rcv_ls_req_out; atomic_t rcv_ls_req_drop; atomic_t xmt_ls_abort; + atomic_t xmt_ls_abort_cmpl; /* Stats counters - lpfc_nvmet_xmt_ls_rsp */ atomic_t xmt_ls_rsp; @@ -47,9 +48,9 @@ struct lpfc_nvmet_tgtport { atomic_t rcv_fcp_cmd_in; atomic_t rcv_fcp_cmd_out; atomic_t rcv_fcp_cmd_drop; + atomic_t xmt_fcp_release; /* Stats counters - lpfc_nvmet_xmt_fcp_op */ - atomic_t xmt_fcp_abort; atomic_t xmt_fcp_drop; atomic_t xmt_fcp_read_rsp; atomic_t xmt_fcp_read; @@ -62,12 +63,13 @@ struct lpfc_nvmet_tgtport { atomic_t xmt_fcp_rsp_drop; - /* Stats counters - lpfc_nvmet_unsol_issue_abort */ + /* Stats counters - lpfc_nvmet_xmt_fcp_abort */ + atomic_t xmt_fcp_abort; + atomic_t xmt_fcp_abort_cmpl; + atomic_t xmt_abort_sol; + atomic_t xmt_abort_unsol; atomic_t xmt_abort_rsp; atomic_t xmt_abort_rsp_error; - - /* Stats counters - lpfc_nvmet_xmt_abort_cmp */ - atomic_t xmt_abort_cmpl; }; struct lpfc_nvmet_rcv_ctx { diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index e2d25ae5ba45..333c5094b97d 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -512,6 +512,7 @@ lpfc_sli4_rq_put(struct lpfc_queue *hq, struct lpfc_queue *dq, } else { return -EINVAL; } + hq->RQ_buf_posted += hq->entry_repost; writel(doorbell.word0, hq->db_regaddr); } return put_index; @@ -12788,6 +12789,7 @@ lpfc_sli4_sp_handle_rcqe(struct lpfc_hba *phba, struct lpfc_rcqe *rcqe) struct fc_frame_header *fc_hdr; struct lpfc_queue *hrq = phba->sli4_hba.hdr_rq; struct lpfc_queue *drq = phba->sli4_hba.dat_rq; + struct lpfc_nvmet_tgtport *tgtp; struct hbq_dmabuf *dma_buf; uint32_t status, rq_id; unsigned long iflags; @@ -12808,7 +12810,6 @@ lpfc_sli4_sp_handle_rcqe(struct lpfc_hba *phba, struct lpfc_rcqe *rcqe) case FC_STATUS_RQ_BUF_LEN_EXCEEDED: lpfc_printf_log(phba, KERN_ERR, LOG_SLI, "2537 Receive Frame Truncated!!\n"); - hrq->RQ_buf_trunc++; case FC_STATUS_RQ_SUCCESS: lpfc_sli4_rq_release(hrq, drq); 
spin_lock_irqsave(&phba->hbalock, iflags); @@ -12819,6 +12820,7 @@ lpfc_sli4_sp_handle_rcqe(struct lpfc_hba *phba, struct lpfc_rcqe *rcqe) goto out; } hrq->RQ_rcv_buf++; + hrq->RQ_buf_posted--; memcpy(&dma_buf->cq_event.cqe.rcqe_cmpl, rcqe, sizeof(*rcqe)); /* If a NVME LS event (type 0x28), treat it as Fast path */ @@ -12832,8 +12834,21 @@ lpfc_sli4_sp_handle_rcqe(struct lpfc_hba *phba, struct lpfc_rcqe *rcqe) spin_unlock_irqrestore(&phba->hbalock, iflags); workposted = true; break; - case FC_STATUS_INSUFF_BUF_NEED_BUF: case FC_STATUS_INSUFF_BUF_FRM_DISC: + if (phba->nvmet_support) { + tgtp = phba->targetport->private; + lpfc_printf_log(phba, KERN_ERR, LOG_SLI | LOG_NVME, + "6402 RQE Error x%x, posted %d err_cnt " + "%d: %x %x %x\n", + status, hrq->RQ_buf_posted, + hrq->RQ_no_posted_buf, + atomic_read(&tgtp->rcv_fcp_cmd_in), + atomic_read(&tgtp->rcv_fcp_cmd_out), + atomic_read(&tgtp->xmt_fcp_release)); + } + /* fallthrough */ + + case FC_STATUS_INSUFF_BUF_NEED_BUF: hrq->RQ_no_posted_buf++; /* Post more buffers if possible */ spin_lock_irqsave(&phba->hbalock, iflags); @@ -13135,6 +13150,7 @@ lpfc_sli4_nvmet_handle_rcqe(struct lpfc_hba *phba, struct lpfc_queue *cq, struct lpfc_queue *drq; struct rqb_dmabuf *dma_buf; struct fc_frame_header *fc_hdr; + struct lpfc_nvmet_tgtport *tgtp; uint32_t status, rq_id; unsigned long iflags; uint32_t fctl, idx; @@ -13165,8 +13181,6 @@ lpfc_sli4_nvmet_handle_rcqe(struct lpfc_hba *phba, struct lpfc_queue *cq, case FC_STATUS_RQ_BUF_LEN_EXCEEDED: lpfc_printf_log(phba, KERN_ERR, LOG_SLI, "6126 Receive Frame Truncated!!\n"); - hrq->RQ_buf_trunc++; - break; case FC_STATUS_RQ_SUCCESS: lpfc_sli4_rq_release(hrq, drq); spin_lock_irqsave(&phba->hbalock, iflags); @@ -13178,6 +13192,7 @@ lpfc_sli4_nvmet_handle_rcqe(struct lpfc_hba *phba, struct lpfc_queue *cq, } spin_unlock_irqrestore(&phba->hbalock, iflags); hrq->RQ_rcv_buf++; + hrq->RQ_buf_posted--; fc_hdr = (struct fc_frame_header *)dma_buf->hbuf.virt; /* Just some basic sanity checks on FCP 
Command frame */ @@ -13200,8 +13215,21 @@ lpfc_sli4_nvmet_handle_rcqe(struct lpfc_hba *phba, struct lpfc_queue *cq, drop: lpfc_in_buf_free(phba, &dma_buf->dbuf); break; - case FC_STATUS_INSUFF_BUF_NEED_BUF: case FC_STATUS_INSUFF_BUF_FRM_DISC: + if (phba->nvmet_support) { + tgtp = phba->targetport->private; + lpfc_printf_log(phba, KERN_ERR, LOG_SLI | LOG_NVME, + "6401 RQE Error x%x, posted %d err_cnt " + "%d: %x %x %x\n", + status, hrq->RQ_buf_posted, + hrq->RQ_no_posted_buf, + atomic_read(&tgtp->rcv_fcp_cmd_in), + atomic_read(&tgtp->rcv_fcp_cmd_out), + atomic_read(&tgtp->xmt_fcp_release)); + } + /* fallthrough */ + + case FC_STATUS_INSUFF_BUF_NEED_BUF: hrq->RQ_no_posted_buf++; /* Post more buffers if possible */ spin_lock_irqsave(&phba->hbalock, iflags); diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index 915e8d5581bd..7a8cbeb6a745 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -195,7 +195,7 @@ struct lpfc_queue { /* defines for RQ stats */ #define RQ_no_posted_buf q_cnt_1 #define RQ_no_buf_found q_cnt_2 -#define RQ_buf_trunc q_cnt_3 +#define RQ_buf_posted q_cnt_3 #define RQ_rcv_buf q_cnt_4 uint64_t isr_timestamp; From 61f3d4bf4f8f062cf6be143c9b7adbc3a017ea6e Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:41 -0700 Subject: [PATCH 020/153] scsi: lpfc: Fix nvmet RQ resource needs for large block writes. Large block writes to the nvme target were failing because the default number of RQs posted was insufficient. Expand the NVMET RQs to 2048 RQEs and ensure a minimum of 512 RQEs are posted, no matter how many MRQs are configured. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. 
Petersen --- drivers/scsi/lpfc/lpfc_attr.c | 6 ++--- drivers/scsi/lpfc/lpfc_init.c | 23 ++++++++++------ drivers/scsi/lpfc/lpfc_nvmet.c | 2 +- drivers/scsi/lpfc/lpfc_nvmet.h | 1 + drivers/scsi/lpfc/lpfc_sli.c | 49 +++++++--------------------------- drivers/scsi/lpfc/lpfc_sli4.h | 2 +- 6 files changed, 31 insertions(+), 52 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 41ec7451689b..129d6cd7635b 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -60,9 +60,9 @@ #define LPFC_MIN_DEVLOSS_TMO 1 #define LPFC_MAX_DEVLOSS_TMO 255 -#define LPFC_DEF_MRQ_POST 256 -#define LPFC_MIN_MRQ_POST 32 -#define LPFC_MAX_MRQ_POST 512 +#define LPFC_DEF_MRQ_POST 512 +#define LPFC_MIN_MRQ_POST 512 +#define LPFC_MAX_MRQ_POST 2048 /* * Write key size should be multiple of 4. If write key is changed diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index b1b181a756dc..5f62e3a1dff6 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -3390,6 +3390,11 @@ lpfc_sli4_nvmet_sgl_update(struct lpfc_hba *phba) */ els_xri_cnt = lpfc_sli4_get_els_iocb_cnt(phba); nvmet_xri_cnt = phba->cfg_nvmet_mrq * phba->cfg_nvmet_mrq_post; + + /* Ensure we at least meet the minimun for the system */ + if (nvmet_xri_cnt < LPFC_NVMET_RQE_DEF_COUNT) + nvmet_xri_cnt = LPFC_NVMET_RQE_DEF_COUNT; + tot_cnt = phba->sli4_hba.max_cfg_param.max_xri - els_xri_cnt; if (nvmet_xri_cnt > tot_cnt) { phba->cfg_nvmet_mrq_post = tot_cnt / phba->cfg_nvmet_mrq; @@ -8158,7 +8163,7 @@ lpfc_sli4_queue_create(struct lpfc_hba *phba) /* Create NVMET Receive Queue for header */ qdesc = lpfc_sli4_queue_alloc(phba, phba->sli4_hba.rq_esize, - phba->sli4_hba.rq_ecount); + LPFC_NVMET_RQE_DEF_COUNT); if (!qdesc) { lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "3146 Failed allocate " @@ -8180,7 +8185,7 @@ lpfc_sli4_queue_create(struct lpfc_hba *phba) /* Create NVMET Receive Queue for data */ qdesc = lpfc_sli4_queue_alloc(phba, 
phba->sli4_hba.rq_esize, - phba->sli4_hba.rq_ecount); + LPFC_NVMET_RQE_DEF_COUNT); if (!qdesc) { lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "3156 Failed allocate " @@ -8770,9 +8775,6 @@ lpfc_sli4_queue_setup(struct lpfc_hba *phba) goto out_destroy; } - lpfc_rq_adjust_repost(phba, phba->sli4_hba.hdr_rq, LPFC_ELS_HBQ); - lpfc_rq_adjust_repost(phba, phba->sli4_hba.dat_rq, LPFC_ELS_HBQ); - rc = lpfc_rq_create(phba, phba->sli4_hba.hdr_rq, phba->sli4_hba.dat_rq, phba->sli4_hba.els_cq, LPFC_USOL); if (rc) { @@ -11096,7 +11098,7 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) struct lpfc_hba *phba; struct lpfc_vport *vport = NULL; struct Scsi_Host *shost = NULL; - int error, cnt; + int error, cnt, num; uint32_t cfg_mode, intr_mode; /* Allocate memory for HBA structure */ @@ -11131,8 +11133,13 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) } cnt = phba->cfg_iocb_cnt * 1024; - if (phba->nvmet_support) - cnt += phba->cfg_nvmet_mrq_post * phba->cfg_nvmet_mrq; + if (phba->nvmet_support) { + /* Ensure we at least meet the minimun for the system */ + num = (phba->cfg_nvmet_mrq_post * phba->cfg_nvmet_mrq); + if (num < LPFC_NVMET_RQE_DEF_COUNT) + num = LPFC_NVMET_RQE_DEF_COUNT; + cnt += num; + } /* Initialize and populate the iocb list per host */ lpfc_printf_log(phba, KERN_INFO, LOG_INIT, diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index bb12e2c9fbf4..dfa7296499cf 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -614,9 +614,9 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport, lpfc_nvmeio_data(phba, "NVMET FCP CMND: xri x%x op x%x len x%x\n", ctxp->oxid, rsp->op, rsp->rsplen); + ctxp->flag |= LPFC_NVMET_IO_INP; rc = lpfc_sli4_issue_wqe(phba, LPFC_FCP_RING, nvmewqeq); if (rc == WQE_SUCCESS) { - ctxp->flag |= LPFC_NVMET_IO_INP; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS if (!phba->ktime_on) return 0; diff --git a/drivers/scsi/lpfc/lpfc_nvmet.h 
b/drivers/scsi/lpfc/lpfc_nvmet.h index 837210a3e7c8..55f2a859dc70 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.h +++ b/drivers/scsi/lpfc/lpfc_nvmet.h @@ -22,6 +22,7 @@ ********************************************************************/ #define LPFC_NVMET_DEFAULT_SEGS (64 + 1) /* 256K IOs */ +#define LPFC_NVMET_RQE_DEF_COUNT 512 #define LPFC_NVMET_SUCCESS_LEN 12 /* Used for NVME Target */ diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 333c5094b97d..f344abce4949 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -479,22 +479,23 @@ lpfc_sli4_rq_put(struct lpfc_queue *hq, struct lpfc_queue *dq, if (unlikely(!hq) || unlikely(!dq)) return -ENOMEM; put_index = hq->host_index; - temp_hrqe = hq->qe[hq->host_index].rqe; + temp_hrqe = hq->qe[put_index].rqe; temp_drqe = dq->qe[dq->host_index].rqe; if (hq->type != LPFC_HRQ || dq->type != LPFC_DRQ) return -EINVAL; - if (hq->host_index != dq->host_index) + if (put_index != dq->host_index) return -EINVAL; /* If the host has not yet processed the next entry then we are done */ - if (((hq->host_index + 1) % hq->entry_count) == hq->hba_index) + if (((put_index + 1) % hq->entry_count) == hq->hba_index) return -EBUSY; lpfc_sli_pcimem_bcopy(hrqe, temp_hrqe, hq->entry_size); lpfc_sli_pcimem_bcopy(drqe, temp_drqe, dq->entry_size); /* Update the host index to point to the next slot */ - hq->host_index = ((hq->host_index + 1) % hq->entry_count); + hq->host_index = ((put_index + 1) % hq->entry_count); dq->host_index = ((dq->host_index + 1) % dq->entry_count); + hq->RQ_buf_posted++; /* Ring The Header Receive Queue Doorbell */ if (!(hq->host_index % hq->entry_repost)) { @@ -512,7 +513,6 @@ lpfc_sli4_rq_put(struct lpfc_queue *hq, struct lpfc_queue *dq, } else { return -EINVAL; } - hq->RQ_buf_posted += hq->entry_repost; writel(doorbell.word0, hq->db_regaddr); } return put_index; @@ -6905,14 +6905,9 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) INIT_LIST_HEAD(&rqbp->rqb_buffer_list); 
rqbp->rqb_alloc_buffer = lpfc_sli4_nvmet_alloc; rqbp->rqb_free_buffer = lpfc_sli4_nvmet_free; - rqbp->entry_count = 256; + rqbp->entry_count = LPFC_NVMET_RQE_DEF_COUNT; rqbp->buffer_count = 0; - /* Divide by 4 and round down to multiple of 16 */ - rc = (phba->cfg_nvmet_mrq_post >> 2) & 0xfff8; - phba->sli4_hba.nvmet_mrq_hdr[i]->entry_repost = rc; - phba->sli4_hba.nvmet_mrq_data[i]->entry_repost = rc; - lpfc_post_rq_buffer( phba, phba->sli4_hba.nvmet_mrq_hdr[i], phba->sli4_hba.nvmet_mrq_data[i], @@ -14892,34 +14887,6 @@ lpfc_wq_create(struct lpfc_hba *phba, struct lpfc_queue *wq, return status; } -/** - * lpfc_rq_adjust_repost - Adjust entry_repost for an RQ - * @phba: HBA structure that indicates port to create a queue on. - * @rq: The queue structure to use for the receive queue. - * @qno: The associated HBQ number - * - * - * For SLI4 we need to adjust the RQ repost value based on - * the number of buffers that are initially posted to the RQ. - */ -void -lpfc_rq_adjust_repost(struct lpfc_hba *phba, struct lpfc_queue *rq, int qno) -{ - uint32_t cnt; - - /* sanity check on queue memory */ - if (!rq) - return; - cnt = lpfc_hbq_defs[qno]->entry_count; - - /* Recalc repost for RQs based on buffers initially posted */ - cnt = (cnt >> 3); - if (cnt < LPFC_QUEUE_MIN_REPOST) - cnt = LPFC_QUEUE_MIN_REPOST; - - rq->entry_repost = cnt; -} - /** * lpfc_rq_create - Create a Receive Queue on the HBA * @phba: HBA structure that indicates port to create a queue on. 
@@ -15105,6 +15072,7 @@ lpfc_rq_create(struct lpfc_hba *phba, struct lpfc_queue *hrq, hrq->subtype = subtype; hrq->host_index = 0; hrq->hba_index = 0; + hrq->entry_repost = LPFC_RQ_REPOST; /* now create the data queue */ lpfc_sli4_config(phba, mbox, LPFC_MBOX_SUBSYSTEM_FCOE, @@ -15186,6 +15154,7 @@ lpfc_rq_create(struct lpfc_hba *phba, struct lpfc_queue *hrq, drq->subtype = subtype; drq->host_index = 0; drq->hba_index = 0; + drq->entry_repost = LPFC_RQ_REPOST; /* link the header and data RQs onto the parent cq child list */ list_add_tail(&hrq->list, &cq->child_list); @@ -15343,6 +15312,7 @@ lpfc_mrq_create(struct lpfc_hba *phba, struct lpfc_queue **hrqp, hrq->subtype = subtype; hrq->host_index = 0; hrq->hba_index = 0; + hrq->entry_repost = LPFC_RQ_REPOST; drq->db_format = LPFC_DB_RING_FORMAT; drq->db_regaddr = phba->sli4_hba.RQDBregaddr; @@ -15351,6 +15321,7 @@ lpfc_mrq_create(struct lpfc_hba *phba, struct lpfc_queue **hrqp, drq->subtype = subtype; drq->host_index = 0; drq->hba_index = 0; + drq->entry_repost = LPFC_RQ_REPOST; list_add_tail(&hrq->list, &cq->child_list); list_add_tail(&drq->list, &cq->child_list); diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index 7a8cbeb6a745..422bde85c9f1 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -156,6 +156,7 @@ struct lpfc_queue { uint32_t entry_size; /* Size of each queue entry. 
*/ uint32_t entry_repost; /* Count of entries before doorbell is rung */ #define LPFC_QUEUE_MIN_REPOST 8 +#define LPFC_RQ_REPOST 64 uint32_t queue_id; /* Queue ID assigned by the hardware */ uint32_t assoc_qid; /* Queue ID associated with, for CQ/WQ/MQ */ uint32_t page_count; /* Number of pages allocated for this queue */ @@ -763,7 +764,6 @@ int lpfc_rq_create(struct lpfc_hba *, struct lpfc_queue *, int lpfc_mrq_create(struct lpfc_hba *phba, struct lpfc_queue **hrqp, struct lpfc_queue **drqp, struct lpfc_queue **cqp, uint32_t subtype); -void lpfc_rq_adjust_repost(struct lpfc_hba *, struct lpfc_queue *, int); int lpfc_eq_destroy(struct lpfc_hba *, struct lpfc_queue *); int lpfc_cq_destroy(struct lpfc_hba *, struct lpfc_queue *); int lpfc_mq_destroy(struct lpfc_hba *, struct lpfc_queue *); From 3120046a970aee08a0787fb6792590f1e0047f62 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:42 -0700 Subject: [PATCH 021/153] scsi: lpfc: Fix NVMEI driver not decrementing counter causing bad rport state. During driver boot, a latency in the NVMET driver side causes the incoming NVMEI PRLI to get rejected by the NVMET driver. When this happens, the NVMEI driver runs out of PRLI retries. Bouncing the link does not fix the situation. If the NVMEI driver decides, on PRLI completion failures, to retry the PRLI, always decrement the fc4_prli_sent counter. This allows the PRLI completion to resolve to UNMAPPED when NVMET rejects the PRLI. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. 
Petersen --- drivers/scsi/lpfc/lpfc_els.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 3f9f6d5f8c69..3085895464d9 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -2077,16 +2077,19 @@ lpfc_cmpl_els_prli(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, if (irsp->ulpStatus) { /* Check for retry */ + ndlp->fc4_prli_sent--; if (lpfc_els_retry(phba, cmdiocb, rspiocb)) { /* ELS command is being retried */ - ndlp->fc4_prli_sent--; goto out; } + /* PRLI failed */ lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS, - "2754 PRLI failure DID:%06X Status:x%x/x%x\n", + "2754 PRLI failure DID:%06X Status:x%x/x%x, " + "data: x%x\n", ndlp->nlp_DID, irsp->ulpStatus, - irsp->un.ulpWord[4]); + irsp->un.ulpWord[4], ndlp->fc4_prli_sent); + /* Do not call DSM for lpfc_els_abort'ed ELS cmds */ if (lpfc_error_lost_link(irsp)) goto out; From 7869da183a7cfc8a2189f6eddd3bc558be40d5e3 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:43 -0700 Subject: [PATCH 022/153] scsi: lpfc: Fix NMI watchdog assertions when running nvmet IOPS tests After running IOPS test for 30 seconds we get kernel:NMI watchdog: Watchdog detected hard LOCKUP on cpu 0 The driver is spending too much time in its ISR. In ISR EQ and CQ processing routines, if we hit the entry_repost numbers of EQE/CQEs just break out of the routine as opposed to hitting the doorbell with NOARM and continuing processing. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K.
Petersen --- drivers/scsi/lpfc/lpfc_debugfs.c | 30 +++++++++++++++--------------- drivers/scsi/lpfc/lpfc_sli.c | 16 ++++++---------- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index a41daedeb967..7284533f4df2 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -3075,11 +3075,11 @@ __lpfc_idiag_print_wq(struct lpfc_queue *qp, char *wqtype, qp->assoc_qid, qp->q_cnt_1, (unsigned long long)qp->q_cnt_4); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, - "\t\tWQID[%02d], QE-CNT[%04d], QE-SIZE[%04d], " - "HOST-IDX[%04d], PORT-IDX[%04d]", + "\t\tWQID[%02d], QE-CNT[%04d], QE-SZ[%04d], " + "HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]", qp->queue_id, qp->entry_count, qp->entry_size, qp->host_index, - qp->hba_index); + qp->hba_index, qp->entry_repost); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n"); return len; @@ -3126,11 +3126,11 @@ __lpfc_idiag_print_cq(struct lpfc_queue *qp, char *cqtype, qp->assoc_qid, qp->q_cnt_1, qp->q_cnt_2, qp->q_cnt_3, (unsigned long long)qp->q_cnt_4); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, - "\tCQID[%02d], QE-CNT[%04d], QE-SIZE[%04d], " - "HOST-IDX[%04d], PORT-IDX[%04d]", + "\tCQID[%02d], QE-CNT[%04d], QE-SZ[%04d], " + "HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]", qp->queue_id, qp->entry_count, qp->entry_size, qp->host_index, - qp->hba_index); + qp->hba_index, qp->entry_repost); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n"); @@ -3152,16 +3152,16 @@ __lpfc_idiag_print_rqpair(struct lpfc_queue *qp, struct lpfc_queue *datqp, qp->assoc_qid, qp->q_cnt_1, qp->q_cnt_2, qp->q_cnt_3, (unsigned long long)qp->q_cnt_4); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, - "\t\tHQID[%02d], QE-CNT[%04d], QE-SIZE[%04d], " - "HOST-IDX[%04d], PORT-IDX[%04d]\n", + "\t\tHQID[%02d], QE-CNT[%04d], QE-SZ[%04d], " + "HST-IDX[%04d], PRT-IDX[%04d], 
PST[%03d]\n", qp->queue_id, qp->entry_count, qp->entry_size, - qp->host_index, qp->hba_index); + qp->host_index, qp->hba_index, qp->entry_repost); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, - "\t\tDQID[%02d], QE-CNT[%04d], QE-SIZE[%04d], " - "HOST-IDX[%04d], PORT-IDX[%04d]\n", + "\t\tDQID[%02d], QE-CNT[%04d], QE-SZ[%04d], " + "HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]\n", datqp->queue_id, datqp->entry_count, datqp->entry_size, datqp->host_index, - datqp->hba_index); + datqp->hba_index, datqp->entry_repost); return len; } @@ -3247,10 +3247,10 @@ __lpfc_idiag_print_eq(struct lpfc_queue *qp, char *eqtype, eqtype, qp->q_cnt_1, qp->q_cnt_2, qp->q_cnt_3, (unsigned long long)qp->q_cnt_4); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, - "EQID[%02d], QE-CNT[%04d], QE-SIZE[%04d], " - "HOST-IDX[%04d], PORT-IDX[%04d]", + "EQID[%02d], QE-CNT[%04d], QE-SZ[%04d], " + "HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]", qp->queue_id, qp->entry_count, qp->entry_size, - qp->host_index, qp->hba_index); + qp->host_index, qp->hba_index, qp->entry_repost); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n"); return len; diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index f344abce4949..cc45e9191062 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -12961,7 +12961,7 @@ lpfc_sli4_sp_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe, while ((cqe = lpfc_sli4_cq_get(cq))) { workposted |= lpfc_sli4_sp_handle_mcqe(phba, cqe); if (!(++ecount % cq->entry_repost)) - lpfc_sli4_cq_release(cq, LPFC_QUEUE_NOARM); + break; cq->CQ_mbox++; } break; @@ -12975,7 +12975,7 @@ lpfc_sli4_sp_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe, workposted |= lpfc_sli4_sp_handle_cqe(phba, cq, cqe); if (!(++ecount % cq->entry_repost)) - lpfc_sli4_cq_release(cq, LPFC_QUEUE_NOARM); + break; } /* Track the max number of CQEs processed in 1 EQ */ @@ -13227,10 +13227,6 @@ lpfc_sli4_nvmet_handle_rcqe(struct 
lpfc_hba *phba, struct lpfc_queue *cq, case FC_STATUS_INSUFF_BUF_NEED_BUF: hrq->RQ_no_posted_buf++; /* Post more buffers if possible */ - spin_lock_irqsave(&phba->hbalock, iflags); - phba->hba_flag |= HBA_POST_RECEIVE_BUFFER; - spin_unlock_irqrestore(&phba->hbalock, iflags); - workposted = true; break; } out: @@ -13384,7 +13380,7 @@ lpfc_sli4_hba_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe, while ((cqe = lpfc_sli4_cq_get(cq))) { workposted |= lpfc_sli4_fp_handle_cqe(phba, cq, cqe); if (!(++ecount % cq->entry_repost)) - lpfc_sli4_cq_release(cq, LPFC_QUEUE_NOARM); + break; } /* Track the max number of CQEs processed in 1 EQ */ @@ -13475,7 +13471,7 @@ lpfc_sli4_fof_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe) while ((cqe = lpfc_sli4_cq_get(cq))) { workposted |= lpfc_sli4_fp_handle_cqe(phba, cq, cqe); if (!(++ecount % cq->entry_repost)) - lpfc_sli4_cq_release(cq, LPFC_QUEUE_NOARM); + break; } /* Track the max number of CQEs processed in 1 EQ */ @@ -13557,7 +13553,7 @@ lpfc_sli4_fof_intr_handler(int irq, void *dev_id) while ((eqe = lpfc_sli4_eq_get(eq))) { lpfc_sli4_fof_handle_eqe(phba, eqe); if (!(++ecount % eq->entry_repost)) - lpfc_sli4_eq_release(eq, LPFC_QUEUE_NOARM); + break; eq->EQ_processed++; } @@ -13674,7 +13670,7 @@ lpfc_sli4_hba_intr_handler(int irq, void *dev_id) lpfc_sli4_hba_handle_eqe(phba, eqe, hba_eqidx); if (!(++ecount % fpeq->entry_repost)) - lpfc_sli4_eq_release(fpeq, LPFC_QUEUE_NOARM); + break; fpeq->EQ_processed++; } From 3c603be9798758dde794daa622e0f7017dbff3a7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:44 -0700 Subject: [PATCH 023/153] scsi: lpfc: Separate NVMET data buffer pool fir ELS/CT. Using 2048 byte buffer and only 128 bytes is needed. Create new LPFC_NVMET_DATA_BUF_SIZE define to use for NVMET RQ/MRQs. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K.
Petersen --- drivers/scsi/lpfc/lpfc.h | 1 + drivers/scsi/lpfc/lpfc_crtn.h | 1 + drivers/scsi/lpfc/lpfc_hw4.h | 1 + drivers/scsi/lpfc/lpfc_init.c | 7 ++++++- drivers/scsi/lpfc/lpfc_mem.c | 33 ++++++++++++++++++++++++++------- drivers/scsi/lpfc/lpfc_sli.c | 19 +++++++++++++++---- 6 files changed, 50 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index 62571fa9c6ad..c4b38491da8e 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -943,6 +943,7 @@ struct lpfc_hba { struct pci_pool *lpfc_mbuf_pool; struct pci_pool *lpfc_hrb_pool; /* header receive buffer pool */ struct pci_pool *lpfc_drb_pool; /* data receive buffer pool */ + struct pci_pool *lpfc_nvmet_drb_pool; /* data receive buffer pool */ struct pci_pool *lpfc_hbq_pool; /* SLI3 hbq buffer pool */ struct pci_pool *txrdy_payload_pool; struct lpfc_dma_pool lpfc_mbuf_safety_pool; diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index 1c55408ac718..fb7fc48a1324 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -271,6 +271,7 @@ int lpfc_sli4_fcf_rr_next_proc(struct lpfc_vport *, uint16_t); void lpfc_sli4_clear_fcf_rr_bmask(struct lpfc_hba *); int lpfc_mem_alloc(struct lpfc_hba *, int align); +int lpfc_nvmet_mem_alloc(struct lpfc_hba *phba); int lpfc_mem_alloc_active_rrq_pool_s4(struct lpfc_hba *); void lpfc_mem_free(struct lpfc_hba *); void lpfc_mem_free_all(struct lpfc_hba *); diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h index 1d12f2be36bc..df97c6b7433b 100644 --- a/drivers/scsi/lpfc/lpfc_hw4.h +++ b/drivers/scsi/lpfc/lpfc_hw4.h @@ -1356,6 +1356,7 @@ struct lpfc_mbx_wq_destroy { #define LPFC_HDR_BUF_SIZE 128 #define LPFC_DATA_BUF_SIZE 2048 +#define LPFC_NVMET_DATA_BUF_SIZE 128 struct rq_context { uint32_t word0; #define lpfc_rq_context_rqe_count_SHIFT 16 /* Version 0 Only */ diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 
5f62e3a1dff6..26b6a843d32d 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -5956,16 +5956,21 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) for (i = 0; i < lpfc_enable_nvmet_cnt; i++) { if (wwn == lpfc_enable_nvmet[i]) { #if (IS_ENABLED(CONFIG_NVME_TARGET_FC)) + if (lpfc_nvmet_mem_alloc(phba)) + break; + + phba->nvmet_support = 1; /* a match */ + lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "6017 NVME Target %016llx\n", wwn); - phba->nvmet_support = 1; /* a match */ #else lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "6021 Can't enable NVME Target." " NVME_TARGET_FC infrastructure" " is not in kernel\n"); #endif + break; } } } diff --git a/drivers/scsi/lpfc/lpfc_mem.c b/drivers/scsi/lpfc/lpfc_mem.c index 5986c7957199..91060afc9721 100644 --- a/drivers/scsi/lpfc/lpfc_mem.c +++ b/drivers/scsi/lpfc/lpfc_mem.c @@ -214,6 +214,21 @@ lpfc_mem_alloc(struct lpfc_hba *phba, int align) return -ENOMEM; } +int +lpfc_nvmet_mem_alloc(struct lpfc_hba *phba) +{ + phba->lpfc_nvmet_drb_pool = + pci_pool_create("lpfc_nvmet_drb_pool", + phba->pcidev, LPFC_NVMET_DATA_BUF_SIZE, + SGL_ALIGN_SZ, 0); + if (!phba->lpfc_nvmet_drb_pool) { + lpfc_printf_log(phba, KERN_ERR, LOG_INIT, + "6024 Can't enable NVME Target - no memory\n"); + return -ENOMEM; + } + return 0; +} + /** * lpfc_mem_free - Frees memory allocated by lpfc_mem_alloc * @phba: HBA to free memory for @@ -232,6 +247,9 @@ lpfc_mem_free(struct lpfc_hba *phba) /* Free HBQ pools */ lpfc_sli_hbqbuf_free_all(phba); + if (phba->lpfc_nvmet_drb_pool) + pci_pool_destroy(phba->lpfc_nvmet_drb_pool); + phba->lpfc_nvmet_drb_pool = NULL; if (phba->lpfc_drb_pool) pci_pool_destroy(phba->lpfc_drb_pool); phba->lpfc_drb_pool = NULL; @@ -624,20 +642,20 @@ lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) kfree(dma_buf); return NULL; } - dma_buf->dbuf.virt = pci_pool_alloc(phba->lpfc_drb_pool, GFP_KERNEL, - &dma_buf->dbuf.phys); + dma_buf->dbuf.virt = pci_pool_alloc(phba->lpfc_nvmet_drb_pool, + GFP_KERNEL, 
&dma_buf->dbuf.phys); if (!dma_buf->dbuf.virt) { pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, dma_buf->hbuf.phys); kfree(dma_buf); return NULL; } - dma_buf->total_size = LPFC_DATA_BUF_SIZE; + dma_buf->total_size = LPFC_NVMET_DATA_BUF_SIZE; dma_buf->context = kzalloc(sizeof(struct lpfc_nvmet_rcv_ctx), GFP_KERNEL); if (!dma_buf->context) { - pci_pool_free(phba->lpfc_drb_pool, dma_buf->dbuf.virt, + pci_pool_free(phba->lpfc_nvmet_drb_pool, dma_buf->dbuf.virt, dma_buf->dbuf.phys); pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, dma_buf->hbuf.phys); @@ -648,7 +666,7 @@ lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) dma_buf->iocbq = lpfc_sli_get_iocbq(phba); if (!dma_buf->iocbq) { kfree(dma_buf->context); - pci_pool_free(phba->lpfc_drb_pool, dma_buf->dbuf.virt, + pci_pool_free(phba->lpfc_nvmet_drb_pool, dma_buf->dbuf.virt, dma_buf->dbuf.phys); pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, dma_buf->hbuf.phys); @@ -678,7 +696,7 @@ lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) if (!dma_buf->sglq) { lpfc_sli_release_iocbq(phba, dma_buf->iocbq); kfree(dma_buf->context); - pci_pool_free(phba->lpfc_drb_pool, dma_buf->dbuf.virt, + pci_pool_free(phba->lpfc_nvmet_drb_pool, dma_buf->dbuf.virt, dma_buf->dbuf.phys); pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, dma_buf->hbuf.phys); @@ -718,7 +736,8 @@ lpfc_sli4_nvmet_free(struct lpfc_hba *phba, struct rqb_dmabuf *dmab) lpfc_sli_release_iocbq(phba, dmab->iocbq); kfree(dmab->context); pci_pool_free(phba->lpfc_hrb_pool, dmab->hbuf.virt, dmab->hbuf.phys); - pci_pool_free(phba->lpfc_drb_pool, dmab->dbuf.virt, dmab->dbuf.phys); + pci_pool_free(phba->lpfc_nvmet_drb_pool, + dmab->dbuf.virt, dmab->dbuf.phys); kfree(dmab); } diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index cc45e9191062..49d5c4700054 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -15079,7 +15079,12 @@ lpfc_rq_create(struct lpfc_hba *phba, struct lpfc_queue *hrq, if 
(phba->sli4_hba.pc_sli4_params.rqv == LPFC_Q_CREATE_VERSION_1) { bf_set(lpfc_rq_context_rqe_count_1, &rq_create->u.request.context, hrq->entry_count); - rq_create->u.request.context.buffer_size = LPFC_DATA_BUF_SIZE; + if (subtype == LPFC_NVMET) + rq_create->u.request.context.buffer_size = + LPFC_NVMET_DATA_BUF_SIZE; + else + rq_create->u.request.context.buffer_size = + LPFC_DATA_BUF_SIZE; bf_set(lpfc_rq_context_rqe_size, &rq_create->u.request.context, LPFC_RQE_SIZE_8); bf_set(lpfc_rq_context_page_size, &rq_create->u.request.context, @@ -15116,8 +15121,14 @@ lpfc_rq_create(struct lpfc_hba *phba, struct lpfc_queue *hrq, LPFC_RQ_RING_SIZE_4096); break; } - bf_set(lpfc_rq_context_buf_size, &rq_create->u.request.context, - LPFC_DATA_BUF_SIZE); + if (subtype == LPFC_NVMET) + bf_set(lpfc_rq_context_buf_size, + &rq_create->u.request.context, + LPFC_NVMET_DATA_BUF_SIZE); + else + bf_set(lpfc_rq_context_buf_size, + &rq_create->u.request.context, + LPFC_DATA_BUF_SIZE); } bf_set(lpfc_rq_context_cq_id, &rq_create->u.request.context, cq->queue_id); @@ -15263,7 +15274,7 @@ lpfc_mrq_create(struct lpfc_hba *phba, struct lpfc_queue **hrqp, cq->queue_id); bf_set(lpfc_rq_context_data_size, &rq_create->u.request.context, - LPFC_DATA_BUF_SIZE); + LPFC_NVMET_DATA_BUF_SIZE); bf_set(lpfc_rq_context_hdr_size, &rq_create->u.request.context, LPFC_HDR_BUF_SIZE); From 6c621a2229b084da0d926967f84b059a10c26ede Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:45 -0700 Subject: [PATCH 024/153] scsi: lpfc: Separate NVMET RQ buffer posting from IO resources SGL/iocbq/context Currently IO resources are mapped 1 to 1 with RQ buffers posted Added logic to separate RQE buffers from IO op resources (sgl/iocbq/context). During initialization, the driver will determine how many SGLs it will allocate for NVMET (based on what the firmware reports) and associate a NVMET IOCBq and NVMET context structure with each one. 
Now that hdr/data buffers are immediately reposted back to the RQ, 512 RQEs for each MRQ is sufficient. Also, since NVMET data buffers are now 128 bytes, lpfc_nvmet_mrq_post is not necessary anymore as we will always post the max (512) buffers per NVMET MRQ. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc.h | 11 +- drivers/scsi/lpfc/lpfc_attr.c | 11 -- drivers/scsi/lpfc/lpfc_crtn.h | 8 +- drivers/scsi/lpfc/lpfc_init.c | 92 ++----------- drivers/scsi/lpfc/lpfc_mem.c | 73 +--------- drivers/scsi/lpfc/lpfc_nvmet.c | 244 +++++++++++++++++++++++++-------- drivers/scsi/lpfc/lpfc_nvmet.h | 1 + drivers/scsi/lpfc/lpfc_sli.c | 78 ++++++++++- drivers/scsi/lpfc/lpfc_sli4.h | 4 +- 9 files changed, 290 insertions(+), 232 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index c4b38491da8e..72641b1d3ab8 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -141,6 +141,13 @@ struct lpfc_dmabuf { uint32_t buffer_tag; /* used for tagged queue ring */ }; +struct lpfc_nvmet_ctxbuf { + struct list_head list; + struct lpfc_nvmet_rcv_ctx *context; + struct lpfc_iocbq *iocbq; + struct lpfc_sglq *sglq; +}; + struct lpfc_dma_pool { struct lpfc_dmabuf *elements; uint32_t max_count; @@ -163,9 +170,6 @@ struct rqb_dmabuf { struct lpfc_dmabuf dbuf; uint16_t total_size; uint16_t bytes_recv; - void *context; - struct lpfc_iocbq *iocbq; - struct lpfc_sglq *sglq; struct lpfc_queue *hrq; /* ptr to associated Header RQ */ struct lpfc_queue *drq; /* ptr to associated Data RQ */ }; @@ -777,7 +781,6 @@ struct lpfc_hba { uint32_t cfg_nvme_oas; uint32_t cfg_nvme_io_channel; uint32_t cfg_nvmet_mrq; - uint32_t cfg_nvmet_mrq_post; uint32_t cfg_enable_nvmet; uint32_t cfg_nvme_enable_fb; uint32_t cfg_nvmet_fb_size; diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 129d6cd7635b..65264582915a 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ 
b/drivers/scsi/lpfc/lpfc_attr.c @@ -3315,14 +3315,6 @@ LPFC_ATTR_R(nvmet_mrq, 1, 1, 16, "Specify number of RQ pairs for processing NVMET cmds"); -/* - * lpfc_nvmet_mrq_post: Specify number buffers to post on every MRQ - * - */ -LPFC_ATTR_R(nvmet_mrq_post, LPFC_DEF_MRQ_POST, - LPFC_MIN_MRQ_POST, LPFC_MAX_MRQ_POST, - "Specify number of buffers to post on every MRQ"); - /* * lpfc_enable_fc4_type: Defines what FC4 types are supported. * Supported Values: 1 - register just FCP @@ -5158,7 +5150,6 @@ struct device_attribute *lpfc_hba_attrs[] = { &dev_attr_lpfc_suppress_rsp, &dev_attr_lpfc_nvme_io_channel, &dev_attr_lpfc_nvmet_mrq, - &dev_attr_lpfc_nvmet_mrq_post, &dev_attr_lpfc_nvme_enable_fb, &dev_attr_lpfc_nvmet_fb_size, &dev_attr_lpfc_enable_bg, @@ -6198,7 +6189,6 @@ lpfc_get_cfgparam(struct lpfc_hba *phba) lpfc_enable_fc4_type_init(phba, lpfc_enable_fc4_type); lpfc_nvmet_mrq_init(phba, lpfc_nvmet_mrq); - lpfc_nvmet_mrq_post_init(phba, lpfc_nvmet_mrq_post); /* Initialize first burst. Target vs Initiator are different. */ lpfc_nvme_enable_fb_init(phba, lpfc_nvme_enable_fb); @@ -6295,7 +6285,6 @@ lpfc_nvme_mod_param_dep(struct lpfc_hba *phba) /* Not NVME Target mode. Turn off Target parameters. 
*/ phba->nvmet_support = 0; phba->cfg_nvmet_mrq = 0; - phba->cfg_nvmet_mrq_post = 0; phba->cfg_nvmet_fb_size = 0; } diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index fb7fc48a1324..cc95abd130b4 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -75,6 +75,8 @@ void lpfc_init_vpi_cmpl(struct lpfc_hba *, LPFC_MBOXQ_t *); void lpfc_cancel_all_vport_retry_delay_timer(struct lpfc_hba *); void lpfc_retry_pport_discovery(struct lpfc_hba *); void lpfc_release_rpi(struct lpfc_hba *, struct lpfc_vport *, uint16_t); +int lpfc_init_iocb_list(struct lpfc_hba *phba, int cnt); +void lpfc_free_iocb_list(struct lpfc_hba *phba); void lpfc_mbx_cmpl_local_config_link(struct lpfc_hba *, LPFC_MBOXQ_t *); void lpfc_mbx_cmpl_reg_login(struct lpfc_hba *, LPFC_MBOXQ_t *); @@ -246,16 +248,14 @@ struct hbq_dmabuf *lpfc_sli4_rb_alloc(struct lpfc_hba *); void lpfc_sli4_rb_free(struct lpfc_hba *, struct hbq_dmabuf *); struct rqb_dmabuf *lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba); void lpfc_sli4_nvmet_free(struct lpfc_hba *phba, struct rqb_dmabuf *dmab); -void lpfc_nvmet_rq_post(struct lpfc_hba *phba, struct lpfc_nvmet_rcv_ctx *ctxp, - struct lpfc_dmabuf *mp); +void lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, + struct lpfc_nvmet_ctxbuf *ctxp); int lpfc_nvmet_rcv_unsol_abort(struct lpfc_vport *vport, struct fc_frame_header *fc_hdr); void lpfc_sli4_build_dflt_fcf_record(struct lpfc_hba *, struct fcf_record *, uint16_t); int lpfc_sli4_rq_put(struct lpfc_queue *hq, struct lpfc_queue *dq, struct lpfc_rqe *hrqe, struct lpfc_rqe *drqe); -int lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hq, - struct lpfc_queue *dq, int count); int lpfc_free_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hq); void lpfc_unregister_fcf(struct lpfc_hba *); void lpfc_unregister_fcf_rescan(struct lpfc_hba *); diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 26b6a843d32d..86b0b26dfeea 100644 --- 
a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -1099,7 +1099,7 @@ lpfc_hba_down_post_s4(struct lpfc_hba *phba) list_for_each_entry_safe(ctxp, ctxp_next, &nvmet_aborts, list) { ctxp->flag &= ~(LPFC_NVMET_XBUSY | LPFC_NVMET_ABORT_OP); - lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); + lpfc_nvmet_ctxbuf_post(phba, ctxp->ctxbuf); } } @@ -3381,7 +3381,7 @@ lpfc_sli4_nvmet_sgl_update(struct lpfc_hba *phba) { struct lpfc_sglq *sglq_entry = NULL, *sglq_entry_next = NULL; uint16_t i, lxri, xri_cnt, els_xri_cnt; - uint16_t nvmet_xri_cnt, tot_cnt; + uint16_t nvmet_xri_cnt; LIST_HEAD(nvmet_sgl_list); int rc; @@ -3389,20 +3389,9 @@ lpfc_sli4_nvmet_sgl_update(struct lpfc_hba *phba) * update on pci function's nvmet xri-sgl list */ els_xri_cnt = lpfc_sli4_get_els_iocb_cnt(phba); - nvmet_xri_cnt = phba->cfg_nvmet_mrq * phba->cfg_nvmet_mrq_post; - /* Ensure we at least meet the minimun for the system */ - if (nvmet_xri_cnt < LPFC_NVMET_RQE_DEF_COUNT) - nvmet_xri_cnt = LPFC_NVMET_RQE_DEF_COUNT; - - tot_cnt = phba->sli4_hba.max_cfg_param.max_xri - els_xri_cnt; - if (nvmet_xri_cnt > tot_cnt) { - phba->cfg_nvmet_mrq_post = tot_cnt / phba->cfg_nvmet_mrq; - nvmet_xri_cnt = phba->cfg_nvmet_mrq * phba->cfg_nvmet_mrq_post; - lpfc_printf_log(phba, KERN_INFO, LOG_SLI, - "6301 NVMET post-sgl count changed to %d\n", - phba->cfg_nvmet_mrq_post); - } + /* For NVMET, ALL remaining XRIs are dedicated for IO processing */ + nvmet_xri_cnt = phba->sli4_hba.max_cfg_param.max_xri - els_xri_cnt; if (nvmet_xri_cnt > phba->sli4_hba.nvmet_xri_cnt) { /* els xri-sgl expanded */ @@ -5835,6 +5824,8 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) spin_lock_init(&phba->sli4_hba.abts_nvme_buf_list_lock); INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvme_buf_list); INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvmet_ctx_list); + INIT_LIST_HEAD(&phba->sli4_hba.lpfc_nvmet_ctx_list); + /* Fast-path XRI aborted CQ Event work queue list */ 
INIT_LIST_HEAD(&phba->sli4_hba.sp_nvme_xri_aborted_work_queue); } @@ -6279,7 +6270,7 @@ lpfc_unset_driver_resource_phase2(struct lpfc_hba *phba) * * This routine is invoked to free the driver's IOCB list and memory. **/ -static void +void lpfc_free_iocb_list(struct lpfc_hba *phba) { struct lpfc_iocbq *iocbq_entry = NULL, *iocbq_next = NULL; @@ -6307,7 +6298,7 @@ lpfc_free_iocb_list(struct lpfc_hba *phba) * 0 - successful * other values - error **/ -static int +int lpfc_init_iocb_list(struct lpfc_hba *phba, int iocb_count) { struct lpfc_iocbq *iocbq_entry = NULL; @@ -8321,46 +8312,6 @@ lpfc_sli4_queue_destroy(struct lpfc_hba *phba) INIT_LIST_HEAD(&phba->sli4_hba.lpfc_wq_list); } -int -lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hrq, - struct lpfc_queue *drq, int count) -{ - int rc, i; - struct lpfc_rqe hrqe; - struct lpfc_rqe drqe; - struct lpfc_rqb *rqbp; - struct rqb_dmabuf *rqb_buffer; - LIST_HEAD(rqb_buf_list); - - rqbp = hrq->rqbp; - for (i = 0; i < count; i++) { - rqb_buffer = (rqbp->rqb_alloc_buffer)(phba); - if (!rqb_buffer) - break; - rqb_buffer->hrq = hrq; - rqb_buffer->drq = drq; - list_add_tail(&rqb_buffer->hbuf.list, &rqb_buf_list); - } - while (!list_empty(&rqb_buf_list)) { - list_remove_head(&rqb_buf_list, rqb_buffer, struct rqb_dmabuf, - hbuf.list); - - hrqe.address_lo = putPaddrLow(rqb_buffer->hbuf.phys); - hrqe.address_hi = putPaddrHigh(rqb_buffer->hbuf.phys); - drqe.address_lo = putPaddrLow(rqb_buffer->dbuf.phys); - drqe.address_hi = putPaddrHigh(rqb_buffer->dbuf.phys); - rc = lpfc_sli4_rq_put(hrq, drq, &hrqe, &drqe); - if (rc < 0) { - (rqbp->rqb_free_buffer)(phba, rqb_buffer); - } else { - list_add_tail(&rqb_buffer->hbuf.list, - &rqbp->rqb_buffer_list); - rqbp->buffer_count++; - } - } - return 1; -} - int lpfc_free_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *rq) { @@ -11103,7 +11054,7 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) struct lpfc_hba *phba; struct lpfc_vport *vport = NULL; 
struct Scsi_Host *shost = NULL; - int error, cnt, num; + int error; uint32_t cfg_mode, intr_mode; /* Allocate memory for HBA structure */ @@ -11137,27 +11088,6 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) goto out_unset_pci_mem_s4; } - cnt = phba->cfg_iocb_cnt * 1024; - if (phba->nvmet_support) { - /* Ensure we at least meet the minimun for the system */ - num = (phba->cfg_nvmet_mrq_post * phba->cfg_nvmet_mrq); - if (num < LPFC_NVMET_RQE_DEF_COUNT) - num = LPFC_NVMET_RQE_DEF_COUNT; - cnt += num; - } - - /* Initialize and populate the iocb list per host */ - lpfc_printf_log(phba, KERN_INFO, LOG_INIT, - "2821 initialize iocb list %d total %d\n", - phba->cfg_iocb_cnt, cnt); - error = lpfc_init_iocb_list(phba, cnt); - - if (error) { - lpfc_printf_log(phba, KERN_ERR, LOG_INIT, - "1413 Failed to initialize iocb list.\n"); - goto out_unset_driver_resource_s4; - } - INIT_LIST_HEAD(&phba->active_rrq_list); INIT_LIST_HEAD(&phba->fcf.fcf_pri_list); @@ -11166,7 +11096,7 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) if (error) { lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "1414 Failed to set up driver resource.\n"); - goto out_free_iocb_list; + goto out_unset_driver_resource_s4; } /* Get the default values for Model Name and Description */ @@ -11266,8 +11196,6 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) lpfc_destroy_shost(phba); out_unset_driver_resource: lpfc_unset_driver_resource_phase2(phba); -out_free_iocb_list: - lpfc_free_iocb_list(phba); out_unset_driver_resource_s4: lpfc_sli4_driver_resource_unset(phba); out_unset_pci_mem_s4: diff --git a/drivers/scsi/lpfc/lpfc_mem.c b/drivers/scsi/lpfc/lpfc_mem.c index 91060afc9721..fcc05a1517c2 100644 --- a/drivers/scsi/lpfc/lpfc_mem.c +++ b/drivers/scsi/lpfc/lpfc_mem.c @@ -629,8 +629,6 @@ struct rqb_dmabuf * lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) { struct rqb_dmabuf *dma_buf; - struct lpfc_iocbq *nvmewqe; - union lpfc_wqe128 
*wqe; dma_buf = kzalloc(sizeof(struct rqb_dmabuf), GFP_KERNEL); if (!dma_buf) @@ -651,60 +649,6 @@ lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) return NULL; } dma_buf->total_size = LPFC_NVMET_DATA_BUF_SIZE; - - dma_buf->context = kzalloc(sizeof(struct lpfc_nvmet_rcv_ctx), - GFP_KERNEL); - if (!dma_buf->context) { - pci_pool_free(phba->lpfc_nvmet_drb_pool, dma_buf->dbuf.virt, - dma_buf->dbuf.phys); - pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, - dma_buf->hbuf.phys); - kfree(dma_buf); - return NULL; - } - - dma_buf->iocbq = lpfc_sli_get_iocbq(phba); - if (!dma_buf->iocbq) { - kfree(dma_buf->context); - pci_pool_free(phba->lpfc_nvmet_drb_pool, dma_buf->dbuf.virt, - dma_buf->dbuf.phys); - pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, - dma_buf->hbuf.phys); - kfree(dma_buf); - lpfc_printf_log(phba, KERN_ERR, LOG_NVME, - "2621 Ran out of nvmet iocb/WQEs\n"); - return NULL; - } - dma_buf->iocbq->iocb_flag = LPFC_IO_NVMET; - nvmewqe = dma_buf->iocbq; - wqe = (union lpfc_wqe128 *)&nvmewqe->wqe; - /* Initialize WQE */ - memset(wqe, 0, sizeof(union lpfc_wqe)); - /* Word 7 */ - bf_set(wqe_ct, &wqe->generic.wqe_com, SLI4_CT_RPI); - bf_set(wqe_class, &wqe->generic.wqe_com, CLASS3); - bf_set(wqe_pu, &wqe->generic.wqe_com, 1); - /* Word 10 */ - bf_set(wqe_nvme, &wqe->fcp_tsend.wqe_com, 1); - bf_set(wqe_ebde_cnt, &wqe->generic.wqe_com, 0); - bf_set(wqe_qosd, &wqe->generic.wqe_com, 0); - - dma_buf->iocbq->context1 = NULL; - spin_lock(&phba->sli4_hba.sgl_list_lock); - dma_buf->sglq = __lpfc_sli_get_nvmet_sglq(phba, dma_buf->iocbq); - spin_unlock(&phba->sli4_hba.sgl_list_lock); - if (!dma_buf->sglq) { - lpfc_sli_release_iocbq(phba, dma_buf->iocbq); - kfree(dma_buf->context); - pci_pool_free(phba->lpfc_nvmet_drb_pool, dma_buf->dbuf.virt, - dma_buf->dbuf.phys); - pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, - dma_buf->hbuf.phys); - kfree(dma_buf); - lpfc_printf_log(phba, KERN_ERR, LOG_NVME, - "6132 Ran out of nvmet XRIs\n"); - return NULL; - } return 
dma_buf; } @@ -723,18 +667,6 @@ lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) void lpfc_sli4_nvmet_free(struct lpfc_hba *phba, struct rqb_dmabuf *dmab) { - unsigned long flags; - - __lpfc_clear_active_sglq(phba, dmab->sglq->sli4_lxritag); - dmab->sglq->state = SGL_FREED; - dmab->sglq->ndlp = NULL; - - spin_lock_irqsave(&phba->sli4_hba.sgl_list_lock, flags); - list_add_tail(&dmab->sglq->list, &phba->sli4_hba.lpfc_nvmet_sgl_list); - spin_unlock_irqrestore(&phba->sli4_hba.sgl_list_lock, flags); - - lpfc_sli_release_iocbq(phba, dmab->iocbq); - kfree(dmab->context); pci_pool_free(phba->lpfc_hrb_pool, dmab->hbuf.virt, dmab->hbuf.phys); pci_pool_free(phba->lpfc_nvmet_drb_pool, dmab->dbuf.virt, dmab->dbuf.phys); @@ -822,6 +754,11 @@ lpfc_rq_buf_free(struct lpfc_hba *phba, struct lpfc_dmabuf *mp) rc = lpfc_sli4_rq_put(rqb_entry->hrq, rqb_entry->drq, &hrqe, &drqe); if (rc < 0) { (rqbp->rqb_free_buffer)(phba, rqb_entry); + lpfc_printf_log(phba, KERN_ERR, LOG_INIT, + "6409 Cannot post to RQ %d: %x %x\n", + rqb_entry->hrq->queue_id, + rqb_entry->hrq->host_index, + rqb_entry->hrq->hba_index); } else { list_add_tail(&rqb_entry->hbuf.list, &rqbp->rqb_buffer_list); rqbp->buffer_count++; diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index dfa7296499cf..fcc77ae0c71c 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -142,7 +142,7 @@ lpfc_nvmet_xmt_ls_rsp_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, } /** - * lpfc_nvmet_rq_post - Repost a NVMET RQ DMA buffer and clean up context + * lpfc_nvmet_ctxbuf_post - Repost a NVMET RQ DMA buffer and clean up context * @phba: HBA buffer is associated with * @ctxp: context to clean up * @mp: Buffer to free @@ -155,24 +155,24 @@ lpfc_nvmet_xmt_ls_rsp_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, * Returns: None **/ void -lpfc_nvmet_rq_post(struct lpfc_hba *phba, struct lpfc_nvmet_rcv_ctx *ctxp, - struct lpfc_dmabuf *mp) +lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, 
struct lpfc_nvmet_ctxbuf *ctx_buf) { - if (ctxp) { - if (ctxp->flag) - lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS, - "6314 rq_post ctx xri x%x flag x%x\n", - ctxp->oxid, ctxp->flag); + struct lpfc_nvmet_rcv_ctx *ctxp = ctx_buf->context; + unsigned long iflag; - if (ctxp->txrdy) { - pci_pool_free(phba->txrdy_payload_pool, ctxp->txrdy, - ctxp->txrdy_phys); - ctxp->txrdy = NULL; - ctxp->txrdy_phys = 0; - } - ctxp->state = LPFC_NVMET_STE_FREE; + if (ctxp->txrdy) { + pci_pool_free(phba->txrdy_payload_pool, ctxp->txrdy, + ctxp->txrdy_phys); + ctxp->txrdy = NULL; + ctxp->txrdy_phys = 0; } - lpfc_rq_buf_free(phba, mp); + ctxp->state = LPFC_NVMET_STE_FREE; + + spin_lock_irqsave(&phba->sli4_hba.nvmet_io_lock, iflag); + list_add_tail(&ctx_buf->list, + &phba->sli4_hba.lpfc_nvmet_ctx_list); + phba->sli4_hba.nvmet_ctx_cnt++; + spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_lock, iflag); } #ifdef CONFIG_SCSI_LPFC_DEBUG_FS @@ -718,7 +718,7 @@ lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport, if (aborting) return; - lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); + lpfc_nvmet_ctxbuf_post(phba, ctxp->ctxbuf); } static struct nvmet_fc_target_template lpfc_tgttemplate = { @@ -739,17 +739,128 @@ static struct nvmet_fc_target_template lpfc_tgttemplate = { .target_priv_sz = sizeof(struct lpfc_nvmet_tgtport), }; +void +lpfc_nvmet_cleanup_io_context(struct lpfc_hba *phba) +{ + struct lpfc_nvmet_ctxbuf *ctx_buf, *next_ctx_buf; + unsigned long flags; + + list_for_each_entry_safe( + ctx_buf, next_ctx_buf, + &phba->sli4_hba.lpfc_nvmet_ctx_list, list) { + spin_lock_irqsave( + &phba->sli4_hba.abts_nvme_buf_list_lock, flags); + list_del_init(&ctx_buf->list); + spin_unlock_irqrestore( + &phba->sli4_hba.abts_nvme_buf_list_lock, flags); + __lpfc_clear_active_sglq(phba, + ctx_buf->sglq->sli4_lxritag); + ctx_buf->sglq->state = SGL_FREED; + ctx_buf->sglq->ndlp = NULL; + + spin_lock_irqsave(&phba->sli4_hba.sgl_list_lock, flags); + list_add_tail(&ctx_buf->sglq->list, + 
&phba->sli4_hba.lpfc_nvmet_sgl_list); + spin_unlock_irqrestore(&phba->sli4_hba.sgl_list_lock, + flags); + + lpfc_sli_release_iocbq(phba, ctx_buf->iocbq); + kfree(ctx_buf->context); + } +} + +int +lpfc_nvmet_setup_io_context(struct lpfc_hba *phba) +{ + struct lpfc_nvmet_ctxbuf *ctx_buf; + struct lpfc_iocbq *nvmewqe; + union lpfc_wqe128 *wqe; + int i; + + lpfc_printf_log(phba, KERN_INFO, LOG_NVME, + "6403 Allocate NVMET resources for %d XRIs\n", + phba->sli4_hba.nvmet_xri_cnt); + + /* For all nvmet xris, allocate resources needed to process a + * received command on a per xri basis. + */ + for (i = 0; i < phba->sli4_hba.nvmet_xri_cnt; i++) { + ctx_buf = kzalloc(sizeof(*ctx_buf), GFP_KERNEL); + if (!ctx_buf) { + lpfc_printf_log(phba, KERN_ERR, LOG_NVME, + "6404 Ran out of memory for NVMET\n"); + return -ENOMEM; + } + + ctx_buf->context = kzalloc(sizeof(*ctx_buf->context), + GFP_KERNEL); + if (!ctx_buf->context) { + kfree(ctx_buf); + lpfc_printf_log(phba, KERN_ERR, LOG_NVME, + "6405 Ran out of NVMET " + "context memory\n"); + return -ENOMEM; + } + ctx_buf->context->ctxbuf = ctx_buf; + + ctx_buf->iocbq = lpfc_sli_get_iocbq(phba); + if (!ctx_buf->iocbq) { + kfree(ctx_buf->context); + kfree(ctx_buf); + lpfc_printf_log(phba, KERN_ERR, LOG_NVME, + "6406 Ran out of NVMET iocb/WQEs\n"); + return -ENOMEM; + } + ctx_buf->iocbq->iocb_flag = LPFC_IO_NVMET; + nvmewqe = ctx_buf->iocbq; + wqe = (union lpfc_wqe128 *)&nvmewqe->wqe; + /* Initialize WQE */ + memset(wqe, 0, sizeof(union lpfc_wqe)); + /* Word 7 */ + bf_set(wqe_ct, &wqe->generic.wqe_com, SLI4_CT_RPI); + bf_set(wqe_class, &wqe->generic.wqe_com, CLASS3); + bf_set(wqe_pu, &wqe->generic.wqe_com, 1); + /* Word 10 */ + bf_set(wqe_nvme, &wqe->fcp_tsend.wqe_com, 1); + bf_set(wqe_ebde_cnt, &wqe->generic.wqe_com, 0); + bf_set(wqe_qosd, &wqe->generic.wqe_com, 0); + + ctx_buf->iocbq->context1 = NULL; + spin_lock(&phba->sli4_hba.sgl_list_lock); + ctx_buf->sglq = __lpfc_sli_get_nvmet_sglq(phba, ctx_buf->iocbq); + 
spin_unlock(&phba->sli4_hba.sgl_list_lock); + if (!ctx_buf->sglq) { + lpfc_sli_release_iocbq(phba, ctx_buf->iocbq); + kfree(ctx_buf->context); + kfree(ctx_buf); + lpfc_printf_log(phba, KERN_ERR, LOG_NVME, + "6407 Ran out of NVMET XRIs\n"); + return -ENOMEM; + } + spin_lock(&phba->sli4_hba.nvmet_io_lock); + list_add_tail(&ctx_buf->list, + &phba->sli4_hba.lpfc_nvmet_ctx_list); + spin_unlock(&phba->sli4_hba.nvmet_io_lock); + } + phba->sli4_hba.nvmet_ctx_cnt = phba->sli4_hba.nvmet_xri_cnt; + return 0; +} + int lpfc_nvmet_create_targetport(struct lpfc_hba *phba) { struct lpfc_vport *vport = phba->pport; struct lpfc_nvmet_tgtport *tgtp; struct nvmet_fc_port_info pinfo; - int error = 0; + int error; if (phba->targetport) return 0; + error = lpfc_nvmet_setup_io_context(phba); + if (error) + return error; + memset(&pinfo, 0, sizeof(struct nvmet_fc_port_info)); pinfo.node_name = wwn_to_u64(vport->fc_nodename.u.wwn); pinfo.port_name = wwn_to_u64(vport->fc_portname.u.wwn); @@ -778,13 +889,16 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba) &phba->pcidev->dev, &phba->targetport); #else - error = -ENOMEM; + error = -ENOENT; #endif if (error) { lpfc_printf_log(phba, KERN_ERR, LOG_NVME_DISC, "6025 Cannot register NVME targetport " "x%x\n", error); phba->targetport = NULL; + + lpfc_nvmet_cleanup_io_context(phba); + } else { tgtp = (struct lpfc_nvmet_tgtport *) phba->targetport->private; @@ -874,7 +988,7 @@ lpfc_sli4_nvmet_xri_aborted(struct lpfc_hba *phba, list_for_each_entry_safe(ctxp, next_ctxp, &phba->sli4_hba.lpfc_abts_nvmet_ctx_list, list) { - if (ctxp->rqb_buffer->sglq->sli4_xritag != xri) + if (ctxp->ctxbuf->sglq->sli4_xritag != xri) continue; /* Check if we already received a free context call @@ -895,7 +1009,7 @@ lpfc_sli4_nvmet_xri_aborted(struct lpfc_hba *phba, (ndlp->nlp_state == NLP_STE_UNMAPPED_NODE || ndlp->nlp_state == NLP_STE_MAPPED_NODE)) { lpfc_set_rrq_active(phba, ndlp, - ctxp->rqb_buffer->sglq->sli4_lxritag, + ctxp->ctxbuf->sglq->sli4_lxritag, rxid, 1); 
lpfc_sli4_abts_err_handler(phba, ndlp, axri); } @@ -904,8 +1018,8 @@ lpfc_sli4_nvmet_xri_aborted(struct lpfc_hba *phba, "6318 XB aborted %x flg x%x (%x)\n", ctxp->oxid, ctxp->flag, released); if (released) - lpfc_nvmet_rq_post(phba, ctxp, - &ctxp->rqb_buffer->hbuf); + lpfc_nvmet_ctxbuf_post(phba, ctxp->ctxbuf); + if (rrq_empty) lpfc_worker_wake_up(phba); return; @@ -933,7 +1047,7 @@ lpfc_nvmet_rcv_unsol_abort(struct lpfc_vport *vport, list_for_each_entry_safe(ctxp, next_ctxp, &phba->sli4_hba.lpfc_abts_nvmet_ctx_list, list) { - if (ctxp->rqb_buffer->sglq->sli4_xritag != xri) + if (ctxp->ctxbuf->sglq->sli4_xritag != xri) continue; spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock); @@ -985,6 +1099,7 @@ lpfc_nvmet_destroy_targetport(struct lpfc_hba *phba) init_completion(&tgtp->tport_unreg_done); nvmet_fc_unregister_targetport(phba->targetport); wait_for_completion_timeout(&tgtp->tport_unreg_done, 5); + lpfc_nvmet_cleanup_io_context(phba); } phba->targetport = NULL; #endif @@ -1115,15 +1230,18 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, struct lpfc_nvmet_rcv_ctx *ctxp; struct lpfc_nvmet_tgtport *tgtp; struct fc_frame_header *fc_hdr; + struct lpfc_nvmet_ctxbuf *ctx_buf; uint32_t *payload; uint32_t size, oxid, sid, rc; + unsigned long iflag; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS uint32_t id; #endif + ctx_buf = NULL; if (!nvmebuf || !phba->targetport) { lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, - "6157 FCP Drop IO\n"); + "6157 NVMET FCP Drop IO\n"); oxid = 0; size = 0; sid = 0; @@ -1131,6 +1249,23 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, goto dropit; } + spin_lock_irqsave(&phba->sli4_hba.nvmet_io_lock, iflag); + if (phba->sli4_hba.nvmet_ctx_cnt) { + list_remove_head(&phba->sli4_hba.lpfc_nvmet_ctx_list, + ctx_buf, struct lpfc_nvmet_ctxbuf, list); + phba->sli4_hba.nvmet_ctx_cnt--; + } + spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_lock, iflag); + + if (!ctx_buf) { + lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, + "6408 No NVMET ctx Drop 
IO\n"); + oxid = 0; + size = 0; + sid = 0; + ctxp = NULL; + goto dropit; + } tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; payload = (uint32_t *)(nvmebuf->dbuf.virt); @@ -1139,16 +1274,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, oxid = be16_to_cpu(fc_hdr->fh_ox_id); sid = sli4_sid_from_fc_hdr(fc_hdr); - ctxp = (struct lpfc_nvmet_rcv_ctx *)nvmebuf->context; - if (ctxp == NULL) { - atomic_inc(&tgtp->rcv_fcp_cmd_drop); - lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, - "6158 FCP Drop IO x%x: Alloc\n", - oxid); - lpfc_nvmet_rq_post(phba, NULL, &nvmebuf->hbuf); - /* Cannot send ABTS without context */ - return; - } + ctxp = (struct lpfc_nvmet_rcv_ctx *)ctx_buf->context; memset(ctxp, 0, sizeof(ctxp->ctx)); ctxp->wqeq = NULL; ctxp->txrdy = NULL; @@ -1158,9 +1284,9 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, ctxp->oxid = oxid; ctxp->sid = sid; ctxp->state = LPFC_NVMET_STE_RCV; - ctxp->rqb_buffer = nvmebuf; ctxp->entry_cnt = 1; ctxp->flag = 0; + ctxp->ctxbuf = ctx_buf; spin_lock_init(&ctxp->ctxlock); #ifdef CONFIG_SCSI_LPFC_DEBUG_FS @@ -1192,6 +1318,9 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, * The calling sequence should be: * nvmet_fc_rcv_fcp_req -> lpfc_nvmet_xmt_fcp_op/cmp -> req->done * lpfc_nvmet_xmt_fcp_op_cmp should free the allocated ctxp. + * When we return from nvmet_fc_rcv_fcp_req, all relevant info in + * the NVME command / FC header is stored, so we are free to repost + * the buffer. 
*/ rc = nvmet_fc_rcv_fcp_req(phba->targetport, &ctxp->ctx.fcp_req, payload, size); @@ -1199,6 +1328,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, /* Process FCP command */ if (rc == 0) { atomic_inc(&tgtp->rcv_fcp_cmd_out); + lpfc_rq_buf_free(phba, &nvmebuf->hbuf); /* repost */ return; } @@ -1213,15 +1343,17 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, lpfc_nvmeio_data(phba, "NVMET FCP DROP: xri x%x sz %d from %06x\n", oxid, size, sid); if (oxid) { + lpfc_nvmet_defer_release(phba, ctxp); lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, sid, oxid); + lpfc_rq_buf_free(phba, &nvmebuf->hbuf); /* repost */ return; } - if (nvmebuf) { - nvmebuf->iocbq->hba_wqidx = 0; - /* We assume a rcv'ed cmd ALWAYs fits into 1 buffer */ - lpfc_nvmet_rq_post(phba, ctxp, &nvmebuf->hbuf); - } + if (ctx_buf) + lpfc_nvmet_ctxbuf_post(phba, ctx_buf); + + if (nvmebuf) + lpfc_rq_buf_free(phba, &nvmebuf->hbuf); /* repost */ #endif } @@ -1273,7 +1405,7 @@ lpfc_nvmet_unsol_fcp_event(struct lpfc_hba *phba, uint64_t isr_timestamp) { if (phba->nvmet_support == 0) { - lpfc_nvmet_rq_post(phba, NULL, &nvmebuf->hbuf); + lpfc_rq_buf_free(phba, &nvmebuf->hbuf); return; } lpfc_nvmet_unsol_fcp_buffer(phba, pring, nvmebuf, @@ -1474,7 +1606,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba, nvmewqe = ctxp->wqeq; if (nvmewqe == NULL) { /* Allocate buffer for command wqe */ - nvmewqe = ctxp->rqb_buffer->iocbq; + nvmewqe = ctxp->ctxbuf->iocbq; if (nvmewqe == NULL) { lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, "6110 lpfc_nvmet_prep_fcp_wqe: No " @@ -1501,7 +1633,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba, return NULL; } - sgl = (struct sli4_sge *)ctxp->rqb_buffer->sglq->sgl; + sgl = (struct sli4_sge *)ctxp->ctxbuf->sglq->sgl; switch (rsp->op) { case NVMET_FCOP_READDATA: case NVMET_FCOP_READDATA_RSP: @@ -1851,15 +1983,16 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, wcqe->word0, wcqe->total_data_placed, result, wcqe->word3); + cmdwqe->context2 = NULL; 
+ cmdwqe->context3 = NULL; /* * if transport has released ctx, then can reuse it. Otherwise, * will be recycled by transport release call. */ if (released) - lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); + lpfc_nvmet_ctxbuf_post(phba, ctxp->ctxbuf); - cmdwqe->context2 = NULL; - cmdwqe->context3 = NULL; + /* This is the iocbq for the abort, not the command */ lpfc_sli_release_iocbq(phba, cmdwqe); /* Since iaab/iaar are NOT set, there is no work left. @@ -1932,15 +2065,15 @@ lpfc_nvmet_unsol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, ctxp->oxid, ctxp->flag, released, wcqe->word0, wcqe->total_data_placed, result, wcqe->word3); + + cmdwqe->context2 = NULL; + cmdwqe->context3 = NULL; /* * if transport has released ctx, then can reuse it. Otherwise, * will be recycled by transport release call. */ if (released) - lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); - - cmdwqe->context2 = NULL; - cmdwqe->context3 = NULL; + lpfc_nvmet_ctxbuf_post(phba, ctxp->ctxbuf); /* Since iaab/iaar are NOT set, there is no work left. 
* For LPFC_NVMET_XBUSY, lpfc_sli4_nvmet_xri_aborted @@ -2002,10 +2135,6 @@ lpfc_nvmet_unsol_issue_abort(struct lpfc_hba *phba, sid, xri, ctxp->wqeq->sli4_xritag); tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; - if (!ctxp->wqeq) { - ctxp->wqeq = ctxp->rqb_buffer->iocbq; - ctxp->wqeq->hba_wqidx = 0; - } ndlp = lpfc_findnode_did(phba->pport, sid); if (!ndlp || !NLP_CHK_NODE_ACT(ndlp) || @@ -2101,7 +2230,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba, tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; if (!ctxp->wqeq) { - ctxp->wqeq = ctxp->rqb_buffer->iocbq; + ctxp->wqeq = ctxp->ctxbuf->iocbq; ctxp->wqeq->hba_wqidx = 0; } @@ -2239,7 +2368,7 @@ lpfc_nvmet_unsol_fcp_issue_abort(struct lpfc_hba *phba, tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; if (!ctxp->wqeq) { - ctxp->wqeq = ctxp->rqb_buffer->iocbq; + ctxp->wqeq = ctxp->ctxbuf->iocbq; ctxp->wqeq->hba_wqidx = 0; } @@ -2294,6 +2423,7 @@ lpfc_nvmet_unsol_ls_issue_abort(struct lpfc_hba *phba, } abts_wqeq = ctxp->wqeq; wqe_abts = &abts_wqeq->wqe; + lpfc_nvmet_unsol_issue_abort(phba, ctxp, sid, xri); spin_lock_irqsave(&phba->hbalock, flags); diff --git a/drivers/scsi/lpfc/lpfc_nvmet.h b/drivers/scsi/lpfc/lpfc_nvmet.h index 55f2a859dc70..6eb2f5d8d4ed 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.h +++ b/drivers/scsi/lpfc/lpfc_nvmet.h @@ -106,6 +106,7 @@ struct lpfc_nvmet_rcv_ctx { #define LPFC_NVMET_CTX_RLS 0x8 /* ctx free requested */ #define LPFC_NVMET_ABTS_RCV 0x10 /* ABTS received on exchange */ struct rqb_dmabuf *rqb_buffer; + struct lpfc_nvmet_ctxbuf *ctxbuf; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS uint64_t ts_isr_cmd; diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 49d5c4700054..d68ee3ee299a 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -6513,6 +6513,49 @@ lpfc_set_host_data(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox) (phba->hba_flag & HBA_FCOE_MODE) ? 
"FCoE" : "FC"); } +static int +lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hrq, + struct lpfc_queue *drq, int count) +{ + int rc, i; + struct lpfc_rqe hrqe; + struct lpfc_rqe drqe; + struct lpfc_rqb *rqbp; + struct rqb_dmabuf *rqb_buffer; + LIST_HEAD(rqb_buf_list); + + rqbp = hrq->rqbp; + for (i = 0; i < count; i++) { + /* IF RQ is already full, don't bother */ + if (rqbp->buffer_count + i >= rqbp->entry_count - 1) + break; + rqb_buffer = rqbp->rqb_alloc_buffer(phba); + if (!rqb_buffer) + break; + rqb_buffer->hrq = hrq; + rqb_buffer->drq = drq; + list_add_tail(&rqb_buffer->hbuf.list, &rqb_buf_list); + } + while (!list_empty(&rqb_buf_list)) { + list_remove_head(&rqb_buf_list, rqb_buffer, struct rqb_dmabuf, + hbuf.list); + + hrqe.address_lo = putPaddrLow(rqb_buffer->hbuf.phys); + hrqe.address_hi = putPaddrHigh(rqb_buffer->hbuf.phys); + drqe.address_lo = putPaddrLow(rqb_buffer->dbuf.phys); + drqe.address_hi = putPaddrHigh(rqb_buffer->dbuf.phys); + rc = lpfc_sli4_rq_put(hrq, drq, &hrqe, &drqe); + if (rc < 0) { + rqbp->rqb_free_buffer(phba, rqb_buffer); + } else { + list_add_tail(&rqb_buffer->hbuf.list, + &rqbp->rqb_buffer_list); + rqbp->buffer_count++; + } + } + return 1; +} + /** * lpfc_sli4_hba_setup - SLI4 device initialization PCI function * @phba: Pointer to HBA context object. 
@@ -6525,7 +6568,7 @@ lpfc_set_host_data(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox) int lpfc_sli4_hba_setup(struct lpfc_hba *phba) { - int rc, i; + int rc, i, cnt; LPFC_MBOXQ_t *mboxq; struct lpfc_mqe *mqe; uint8_t *vpd; @@ -6876,6 +6919,21 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) goto out_destroy_queue; } phba->sli4_hba.nvmet_xri_cnt = rc; + + cnt = phba->cfg_iocb_cnt * 1024; + /* We need 1 iocbq for every SGL, for IO processing */ + cnt += phba->sli4_hba.nvmet_xri_cnt; + /* Initialize and populate the iocb list per host */ + lpfc_printf_log(phba, KERN_INFO, LOG_INIT, + "2821 initialize iocb list %d total %d\n", + phba->cfg_iocb_cnt, cnt); + rc = lpfc_init_iocb_list(phba, cnt); + if (rc) { + lpfc_printf_log(phba, KERN_ERR, LOG_INIT, + "1413 Failed to init iocb list.\n"); + goto out_destroy_queue; + } + lpfc_nvmet_create_targetport(phba); } else { /* update host scsi xri-sgl sizes and mappings */ @@ -6895,10 +6953,21 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) "and mapping: %d\n", rc); goto out_destroy_queue; } + + cnt = phba->cfg_iocb_cnt * 1024; + /* Initialize and populate the iocb list per host */ + lpfc_printf_log(phba, KERN_INFO, LOG_INIT, + "2820 initialize iocb list %d total %d\n", + phba->cfg_iocb_cnt, cnt); + rc = lpfc_init_iocb_list(phba, cnt); + if (rc) { + lpfc_printf_log(phba, KERN_ERR, LOG_INIT, + "6301 Failed to init iocb list.\n"); + goto out_destroy_queue; + } } if (phba->nvmet_support && phba->cfg_nvmet_mrq) { - /* Post initial buffers to all RQs created */ for (i = 0; i < phba->cfg_nvmet_mrq; i++) { rqbp = phba->sli4_hba.nvmet_mrq_hdr[i]->rqbp; @@ -6911,7 +6980,7 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) lpfc_post_rq_buffer( phba, phba->sli4_hba.nvmet_mrq_hdr[i], phba->sli4_hba.nvmet_mrq_data[i], - phba->cfg_nvmet_mrq_post); + LPFC_NVMET_RQE_DEF_COUNT); } } @@ -7078,6 +7147,7 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) /* Unset all the queues set up in this routine when error out */ lpfc_sli4_queue_unset(phba); out_destroy_queue: + 
lpfc_free_iocb_list(phba); lpfc_sli4_queue_destroy(phba); out_stop_timers: lpfc_stop_hba_timers(phba); @@ -18731,7 +18801,7 @@ lpfc_sli4_issue_wqe(struct lpfc_hba *phba, uint32_t ring_number, spin_lock_irqsave(&pring->ring_lock, iflags); ctxp = pwqe->context2; - sglq = ctxp->rqb_buffer->sglq; + sglq = ctxp->ctxbuf->sglq; if (pwqe->sli4_xritag == NO_XRI) { pwqe->sli4_lxritag = sglq->sli4_lxritag; pwqe->sli4_xritag = sglq->sli4_xritag; diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index 422bde85c9f1..19e2f190ea2e 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -618,10 +618,12 @@ struct lpfc_sli4_hba { uint16_t scsi_xri_start; uint16_t els_xri_cnt; uint16_t nvmet_xri_cnt; + uint16_t nvmet_ctx_cnt; struct list_head lpfc_els_sgl_list; struct list_head lpfc_abts_els_sgl_list; struct list_head lpfc_nvmet_sgl_list; struct list_head lpfc_abts_nvmet_ctx_list; + struct list_head lpfc_nvmet_ctx_list; struct list_head lpfc_abts_scsi_buf_list; struct list_head lpfc_abts_nvme_buf_list; struct lpfc_sglq **lpfc_sglq_active_list; @@ -662,8 +664,6 @@ struct lpfc_sli4_hba { uint16_t num_online_cpu; uint16_t num_present_cpu; uint16_t curr_disp_cpu; - - uint16_t nvmet_mrq_post_idx; }; enum lpfc_sge_type { From a8cf5dfeb4d84248c0ad12386ae0cb36ee21589a Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:46 -0700 Subject: [PATCH 025/153] scsi: lpfc: Added recovery logic for running out of NVMET IO context resources Previous logic would just drop the IO. Added logic to queue the IO to wait for an IO context resource from an IO that's already in progress. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K.
Petersen --- drivers/scsi/lpfc/lpfc.h | 1 + drivers/scsi/lpfc/lpfc_attr.c | 6 ++ drivers/scsi/lpfc/lpfc_crtn.h | 2 + drivers/scsi/lpfc/lpfc_debugfs.c | 6 ++ drivers/scsi/lpfc/lpfc_init.c | 2 + drivers/scsi/lpfc/lpfc_nvmet.c | 138 ++++++++++++++++++++++++++----- drivers/scsi/lpfc/lpfc_sli.c | 7 +- drivers/scsi/lpfc/lpfc_sli4.h | 6 +- 8 files changed, 144 insertions(+), 24 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index 72641b1d3ab8..c47bde6205c9 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -170,6 +170,7 @@ struct rqb_dmabuf { struct lpfc_dmabuf dbuf; uint16_t total_size; uint16_t bytes_recv; + uint16_t idx; struct lpfc_queue *hrq; /* ptr to associated Header RQ */ struct lpfc_queue *drq; /* ptr to associated Data RQ */ }; diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 65264582915a..bb2d9e238225 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -245,6 +245,12 @@ lpfc_nvme_info_show(struct device *dev, struct device_attribute *attr, atomic_read(&tgtp->xmt_abort_rsp), atomic_read(&tgtp->xmt_abort_rsp_error)); + len += snprintf(buf + len, PAGE_SIZE - len, + "IO_CTX: %08x outstanding %08x total %x", + phba->sli4_hba.nvmet_ctx_cnt, + phba->sli4_hba.nvmet_io_wait_cnt, + phba->sli4_hba.nvmet_io_wait_total); + len += snprintf(buf+len, PAGE_SIZE-len, "\n"); return len; } diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index cc95abd130b4..8912767e7bc8 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -77,6 +77,8 @@ void lpfc_retry_pport_discovery(struct lpfc_hba *); void lpfc_release_rpi(struct lpfc_hba *, struct lpfc_vport *, uint16_t); int lpfc_init_iocb_list(struct lpfc_hba *phba, int cnt); void lpfc_free_iocb_list(struct lpfc_hba *phba); +int lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hrq, + struct lpfc_queue *drq, int count, int idx); void 
lpfc_mbx_cmpl_local_config_link(struct lpfc_hba *, LPFC_MBOXQ_t *); void lpfc_mbx_cmpl_reg_login(struct lpfc_hba *, LPFC_MBOXQ_t *); diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 7284533f4df2..c7d1c9d37a64 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -842,6 +842,12 @@ lpfc_debugfs_nvmestat_data(struct lpfc_vport *vport, char *buf, int size) } spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock); } + + len += snprintf(buf + len, size - len, + "IO_CTX: %08x outstanding %08x total %08x\n", + phba->sli4_hba.nvmet_ctx_cnt, + phba->sli4_hba.nvmet_io_wait_cnt, + phba->sli4_hba.nvmet_io_wait_total); } else { if (!(phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)) return len; diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 86b0b26dfeea..9f6c7e71814b 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -5825,6 +5825,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvme_buf_list); INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvmet_ctx_list); INIT_LIST_HEAD(&phba->sli4_hba.lpfc_nvmet_ctx_list); + INIT_LIST_HEAD(&phba->sli4_hba.lpfc_nvmet_io_wait_list); /* Fast-path XRI aborted CQ Event work queue list */ INIT_LIST_HEAD(&phba->sli4_hba.sp_nvme_xri_aborted_work_queue); @@ -5833,6 +5834,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) /* This abort list used by worker thread */ spin_lock_init(&phba->sli4_hba.sgl_list_lock); spin_lock_init(&phba->sli4_hba.nvmet_io_lock); + spin_lock_init(&phba->sli4_hba.nvmet_io_wait_lock); /* * Initialize driver internal slow-path work queues diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index fcc77ae0c71c..312f54278bd4 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -158,6 +158,12 @@ void lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, struct lpfc_nvmet_ctxbuf *ctx_buf) { struct 
lpfc_nvmet_rcv_ctx *ctxp = ctx_buf->context; + struct lpfc_nvmet_tgtport *tgtp; + struct fc_frame_header *fc_hdr; + struct rqb_dmabuf *nvmebuf; + struct lpfc_dmabuf *hbufp; + uint32_t *payload; + uint32_t size, oxid, sid, rc; unsigned long iflag; if (ctxp->txrdy) { @@ -168,6 +174,87 @@ lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, struct lpfc_nvmet_ctxbuf *ctx_buf) } ctxp->state = LPFC_NVMET_STE_FREE; + spin_lock_irqsave(&phba->sli4_hba.nvmet_io_wait_lock, iflag); + if (phba->sli4_hba.nvmet_io_wait_cnt) { + hbufp = &nvmebuf->hbuf; + list_remove_head(&phba->sli4_hba.lpfc_nvmet_io_wait_list, + nvmebuf, struct rqb_dmabuf, + hbuf.list); + phba->sli4_hba.nvmet_io_wait_cnt--; + spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_wait_lock, + iflag); + + fc_hdr = (struct fc_frame_header *)(nvmebuf->hbuf.virt); + oxid = be16_to_cpu(fc_hdr->fh_ox_id); + tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; + payload = (uint32_t *)(nvmebuf->dbuf.virt); + size = nvmebuf->bytes_recv; + sid = sli4_sid_from_fc_hdr(fc_hdr); + + ctxp = (struct lpfc_nvmet_rcv_ctx *)ctx_buf->context; + memset(ctxp, 0, sizeof(ctxp->ctx)); + ctxp->wqeq = NULL; + ctxp->txrdy = NULL; + ctxp->offset = 0; + ctxp->phba = phba; + ctxp->size = size; + ctxp->oxid = oxid; + ctxp->sid = sid; + ctxp->state = LPFC_NVMET_STE_RCV; + ctxp->entry_cnt = 1; + ctxp->flag = 0; + ctxp->ctxbuf = ctx_buf; + spin_lock_init(&ctxp->ctxlock); + +#ifdef CONFIG_SCSI_LPFC_DEBUG_FS + if (phba->ktime_on) { + ctxp->ts_cmd_nvme = ktime_get_ns(); + ctxp->ts_isr_cmd = ctxp->ts_cmd_nvme; + ctxp->ts_nvme_data = 0; + ctxp->ts_data_wqput = 0; + ctxp->ts_isr_data = 0; + ctxp->ts_data_nvme = 0; + ctxp->ts_nvme_status = 0; + ctxp->ts_status_wqput = 0; + ctxp->ts_isr_status = 0; + ctxp->ts_status_nvme = 0; + } +#endif + atomic_inc(&tgtp->rcv_fcp_cmd_in); + /* + * The calling sequence should be: + * nvmet_fc_rcv_fcp_req->lpfc_nvmet_xmt_fcp_op/cmp- req->done + * lpfc_nvmet_xmt_fcp_op_cmp should free the allocated ctxp. 
+ * When we return from nvmet_fc_rcv_fcp_req, all relevant info + * the NVME command / FC header is stored. + * A buffer has already been reposted for this IO, so just free + * the nvmebuf. + */ + rc = nvmet_fc_rcv_fcp_req(phba->targetport, &ctxp->ctx.fcp_req, + payload, size); + + /* Process FCP command */ + if (rc == 0) { + atomic_inc(&tgtp->rcv_fcp_cmd_out); + nvmebuf->hrq->rqbp->rqb_free_buffer(phba, nvmebuf); + return; + } + + atomic_inc(&tgtp->rcv_fcp_cmd_drop); + lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, + "2582 FCP Drop IO x%x: err x%x: x%x x%x x%x\n", + ctxp->oxid, rc, + atomic_read(&tgtp->rcv_fcp_cmd_in), + atomic_read(&tgtp->rcv_fcp_cmd_out), + atomic_read(&tgtp->xmt_fcp_release)); + + lpfc_nvmet_defer_release(phba, ctxp); + lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, sid, oxid); + nvmebuf->hrq->rqbp->rqb_free_buffer(phba, nvmebuf); + return; + } + spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_wait_lock, iflag); + spin_lock_irqsave(&phba->sli4_hba.nvmet_io_lock, iflag); list_add_tail(&ctx_buf->list, &phba->sli4_hba.lpfc_nvmet_ctx_list); @@ -1232,7 +1319,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr; struct lpfc_nvmet_ctxbuf *ctx_buf; uint32_t *payload; - uint32_t size, oxid, sid, rc; + uint32_t size, oxid, sid, rc, qno; unsigned long iflag; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS uint32_t id; @@ -1257,21 +1344,41 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, } spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_lock, iflag); + fc_hdr = (struct fc_frame_header *)(nvmebuf->hbuf.virt); + oxid = be16_to_cpu(fc_hdr->fh_ox_id); + size = nvmebuf->bytes_recv; + +#ifdef CONFIG_SCSI_LPFC_DEBUG_FS + if (phba->cpucheck_on & LPFC_CHECK_NVMET_RCV) { + id = smp_processor_id(); + if (id < LPFC_CHECK_CPU_CNT) + phba->cpucheck_rcv_io[id]++; + } +#endif + + lpfc_nvmeio_data(phba, "NVMET FCP RCV: xri x%x sz %d CPU %02x\n", + oxid, size, smp_processor_id()); + if (!ctx_buf) { - lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, - 
"6408 No NVMET ctx Drop IO\n"); - oxid = 0; - size = 0; - sid = 0; - ctxp = NULL; - goto dropit; + /* Queue this NVME IO to process later */ + spin_lock_irqsave(&phba->sli4_hba.nvmet_io_wait_lock, iflag); + list_add_tail(&nvmebuf->hbuf.list, + &phba->sli4_hba.lpfc_nvmet_io_wait_list); + phba->sli4_hba.nvmet_io_wait_cnt++; + phba->sli4_hba.nvmet_io_wait_total++; + spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_wait_lock, + iflag); + + /* Post a brand new DMA buffer to RQ */ + qno = nvmebuf->idx; + lpfc_post_rq_buffer( + phba, phba->sli4_hba.nvmet_mrq_hdr[qno], + phba->sli4_hba.nvmet_mrq_data[qno], 1, qno); + return; } tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; payload = (uint32_t *)(nvmebuf->dbuf.virt); - fc_hdr = (struct fc_frame_header *)(nvmebuf->hbuf.virt); - size = nvmebuf->bytes_recv; - oxid = be16_to_cpu(fc_hdr->fh_ox_id); sid = sli4_sid_from_fc_hdr(fc_hdr); ctxp = (struct lpfc_nvmet_rcv_ctx *)ctx_buf->context; @@ -1302,17 +1409,8 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, ctxp->ts_isr_status = 0; ctxp->ts_status_nvme = 0; } - - if (phba->cpucheck_on & LPFC_CHECK_NVMET_RCV) { - id = smp_processor_id(); - if (id < LPFC_CHECK_CPU_CNT) - phba->cpucheck_rcv_io[id]++; - } #endif - lpfc_nvmeio_data(phba, "NVMET FCP RCV: xri x%x sz %d CPU %02x\n", - oxid, size, smp_processor_id()); - atomic_inc(&tgtp->rcv_fcp_cmd_in); /* * The calling sequence should be: diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index d68ee3ee299a..3fb4e715bfa2 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -6513,9 +6513,9 @@ lpfc_set_host_data(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox) (phba->hba_flag & HBA_FCOE_MODE) ? 
"FCoE" : "FC"); } -static int +int lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hrq, - struct lpfc_queue *drq, int count) + struct lpfc_queue *drq, int count, int idx) { int rc, i; struct lpfc_rqe hrqe; @@ -6534,6 +6534,7 @@ lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hrq, break; rqb_buffer->hrq = hrq; rqb_buffer->drq = drq; + rqb_buffer->idx = idx; list_add_tail(&rqb_buffer->hbuf.list, &rqb_buf_list); } while (!list_empty(&rqb_buf_list)) { @@ -6980,7 +6981,7 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) lpfc_post_rq_buffer( phba, phba->sli4_hba.nvmet_mrq_hdr[i], phba->sli4_hba.nvmet_mrq_data[i], - LPFC_NVMET_RQE_DEF_COUNT); + LPFC_NVMET_RQE_DEF_COUNT, i); } } diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index 19e2f190ea2e..c1c9a9125266 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -619,13 +619,16 @@ struct lpfc_sli4_hba { uint16_t els_xri_cnt; uint16_t nvmet_xri_cnt; uint16_t nvmet_ctx_cnt; + uint16_t nvmet_io_wait_cnt; + uint16_t nvmet_io_wait_total; struct list_head lpfc_els_sgl_list; struct list_head lpfc_abts_els_sgl_list; struct list_head lpfc_nvmet_sgl_list; struct list_head lpfc_abts_nvmet_ctx_list; - struct list_head lpfc_nvmet_ctx_list; struct list_head lpfc_abts_scsi_buf_list; struct list_head lpfc_abts_nvme_buf_list; + struct list_head lpfc_nvmet_ctx_list; + struct list_head lpfc_nvmet_io_wait_list; struct lpfc_sglq **lpfc_sglq_active_list; struct list_head lpfc_rpi_hdr_list; unsigned long *rpi_bmask; @@ -657,6 +660,7 @@ struct lpfc_sli4_hba { spinlock_t abts_scsi_buf_list_lock; /* list of aborted SCSI IOs */ spinlock_t sgl_list_lock; /* list of aborted els IOs */ spinlock_t nvmet_io_lock; + spinlock_t nvmet_io_wait_lock; /* IOs waiting for ctx resources */ uint32_t physical_port; /* CPU to vector mapping information */ From 82820f0cf19aa62e2608c2909bd44e7a68268ff5 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:47 -0700 Subject: 
[PATCH 026/153] scsi: lpfc: Fix NVME I+T not registering NVME as a supported FC4 type When the driver sends the RPA command, it does not send supported FC4 Type NVME to the management server. Encode NVME (type x28) in the AttribEntry in the RPA command. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_ct.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c index c7962dae4dab..f2cd19c6c2df 100644 --- a/drivers/scsi/lpfc/lpfc_ct.c +++ b/drivers/scsi/lpfc/lpfc_ct.c @@ -2092,6 +2092,7 @@ lpfc_fdmi_port_attr_fc4type(struct lpfc_vport *vport, ae->un.AttrTypes[3] = 0x02; /* Type 1 - ELS */ ae->un.AttrTypes[2] = 0x01; /* Type 8 - FCP */ + ae->un.AttrTypes[6] = 0x01; /* Type 40 - NVME */ ae->un.AttrTypes[7] = 0x01; /* Type 32 - CT */ size = FOURBYTES + 32; ad->AttrLen = cpu_to_be16(size); From 667a7662529bf0afb1d84a32ceb0da0a875a3b6c Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:48 -0700 Subject: [PATCH 027/153] scsi: lpfc: Fix debugfs root inode "lpfc" not getting deleted on driver unload. When unloading and reloading the driver, the driver fails to recreate the lpfc root inode in the debugfs tree. The driver is incorrectly removing the lpfc root inode in lpfc_debugfs_terminate in the first driver instance that unloads and then sets the lpfc_debugfs_root global parameter to NULL. When the final driver instance unloads, the debugfs calls quietly ignore the remove on a NULL pointer. The bug is that the debugfs_remove call returns void so the driver doesn't know to correctly set the global parameter to NULL. Base the debugfs_remove of the lpfc_debugfs_root parameter on lpfc_debugfs_hba_count because this parameter tracks the fnX instance tracked per driver instance. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K.
Petersen --- drivers/scsi/lpfc/lpfc_debugfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index c7d1c9d37a64..4bcb92c844ca 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -5866,8 +5866,10 @@ lpfc_debugfs_terminate(struct lpfc_vport *vport) atomic_dec(&lpfc_debugfs_hba_count); } - debugfs_remove(lpfc_debugfs_root); /* lpfc */ - lpfc_debugfs_root = NULL; + if (atomic_read(&lpfc_debugfs_hba_count) == 0) { + debugfs_remove(lpfc_debugfs_root); /* lpfc */ + lpfc_debugfs_root = NULL; + } } #endif return; From 64eb4dcb140a7c5547f6e965fb471b1b75c01108 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:49 -0700 Subject: [PATCH 028/153] scsi: lpfc: Cleanup entry_repost settings on SLI4 queues Too many work items being processed in IRQ context take a lot of CPU time and cause problems. With a recent change, we get out of the ISR after hitting entry_repost work items on a queue. However, the actual values for entry repost are still high. EQ is 128 and CQ is 128, this could translate into processing 128 * 128 (16384) work items under IRQ context. Set entry_repost in the actual queue creation routine now. Limit EQ repost to 8 and CQ repost to 64 to further limit the amount of time spent in the IRQ. Fix fof IRQ routines as well. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. 
Petersen --- drivers/scsi/lpfc/lpfc_sli.c | 19 ++++++++----------- drivers/scsi/lpfc/lpfc_sli4.h | 6 ++++-- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 3fb4e715bfa2..903c06ff828a 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -13922,17 +13922,10 @@ lpfc_sli4_queue_alloc(struct lpfc_hba *phba, uint32_t entry_size, } queue->entry_size = entry_size; queue->entry_count = entry_count; - - /* - * entry_repost is calculated based on the number of entries in the - * queue. This works out except for RQs. If buffers are NOT initially - * posted for every RQE, entry_repost should be adjusted accordingly. - */ - queue->entry_repost = (entry_count >> 3); - if (queue->entry_repost < LPFC_QUEUE_MIN_REPOST) - queue->entry_repost = LPFC_QUEUE_MIN_REPOST; queue->phba = phba; + /* entry_repost will be set during q creation */ + return queue; out_fail: lpfc_sli4_queue_free(queue); @@ -14163,6 +14156,7 @@ lpfc_eq_create(struct lpfc_hba *phba, struct lpfc_queue *eq, uint32_t imax) status = -ENXIO; eq->host_index = 0; eq->hba_index = 0; + eq->entry_repost = LPFC_EQ_REPOST; mempool_free(mbox, phba->mbox_mem_pool); return status; @@ -14236,9 +14230,9 @@ lpfc_cq_create(struct lpfc_hba *phba, struct lpfc_queue *cq, default: lpfc_printf_log(phba, KERN_ERR, LOG_SLI, "0361 Unsupported CQ count: " - "entry cnt %d sz %d pg cnt %d repost %d\n", + "entry cnt %d sz %d pg cnt %d\n", cq->entry_count, cq->entry_size, - cq->page_count, cq->entry_repost); + cq->page_count); if (cq->entry_count < 256) { status = -EINVAL; goto out; @@ -14291,6 +14285,7 @@ lpfc_cq_create(struct lpfc_hba *phba, struct lpfc_queue *cq, cq->assoc_qid = eq->queue_id; cq->host_index = 0; cq->hba_index = 0; + cq->entry_repost = LPFC_CQ_REPOST; out: mempool_free(mbox, phba->mbox_mem_pool); @@ -14482,6 +14477,7 @@ lpfc_cq_create_set(struct lpfc_hba *phba, struct lpfc_queue **cqp, cq->assoc_qid = eq->queue_id; 
cq->host_index = 0; cq->hba_index = 0; + cq->entry_repost = LPFC_CQ_REPOST; rc = 0; list_for_each_entry(dmabuf, &cq->page_list, list) { @@ -14730,6 +14726,7 @@ lpfc_mq_create(struct lpfc_hba *phba, struct lpfc_queue *mq, mq->subtype = subtype; mq->host_index = 0; mq->hba_index = 0; + mq->entry_repost = LPFC_MQ_REPOST; /* link the mq onto the parent cq child list */ list_add_tail(&mq->list, &cq->child_list); diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index c1c9a9125266..cf863db27700 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -24,7 +24,6 @@ #define LPFC_XRI_EXCH_BUSY_WAIT_TMO 10000 #define LPFC_XRI_EXCH_BUSY_WAIT_T1 10 #define LPFC_XRI_EXCH_BUSY_WAIT_T2 30000 -#define LPFC_RELEASE_NOTIFICATION_INTERVAL 32 #define LPFC_RPI_LOW_WATER_MARK 10 #define LPFC_UNREG_FCF 1 @@ -155,8 +154,11 @@ struct lpfc_queue { uint32_t entry_count; /* Number of entries to support on the queue */ uint32_t entry_size; /* Size of each queue entry. */ uint32_t entry_repost; /* Count of entries before doorbell is rung */ -#define LPFC_QUEUE_MIN_REPOST 8 +#define LPFC_EQ_REPOST 8 +#define LPFC_MQ_REPOST 8 +#define LPFC_CQ_REPOST 64 #define LPFC_RQ_REPOST 64 +#define LPFC_RELEASE_NOTIFICATION_INTERVAL 32 /* For WQs */ uint32_t queue_id; /* Queue ID assigned by the hardware */ uint32_t assoc_qid; /* Queue ID associated with, for CQ/WQ/MQ */ uint32_t page_count; /* Number of pages allocated for this queue */ From dc53a61852279f25909d99dad4638b4aee0b2d82 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:50 -0700 Subject: [PATCH 029/153] scsi: lpfc: Fix NVMEI's handling of NVMET's PRLI response attributes Code review of NVMEI's FC_PORT_ROLE_NVME_DISCOVERY looked wrong. Discussions with storage architecture team clarified NVMEI's audit of the PRLI response port roles. Following up discussion with code review showed a few minor corrections were required - especially in anticipation of NVME auto discovery. 
During PRLI, NVMEI should send prli_init - which it does. NVMET should send prli_tgt and prli_disc - which it does. When NVMEI receives a PRLI Response now, it audits the incoming target bits and stores the attributes in the corresponding NDLP. Later, when NVMEI registers the NVME rport, it uses the stored ndlp attributes to set the rport port_roles correctly. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_disc.h | 1 + drivers/scsi/lpfc/lpfc_nportdisc.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_disc.h b/drivers/scsi/lpfc/lpfc_disc.h index 9d5a379f4b15..094c97b9e5f7 100644 --- a/drivers/scsi/lpfc/lpfc_disc.h +++ b/drivers/scsi/lpfc/lpfc_disc.h @@ -90,6 +90,7 @@ struct lpfc_nodelist { #define NLP_FCP_INITIATOR 0x10 /* entry is an FCP Initiator */ #define NLP_NVME_TARGET 0x20 /* entry is a NVME Target */ #define NLP_NVME_INITIATOR 0x40 /* entry is a NVME Initiator */ +#define NLP_NVME_DISCOVERY 0x80 /* entry has NVME disc srvc */ uint16_t nlp_fc4_type; /* FC types node supports. */ /* Assigned from GID_FF, only diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c index 8777c2d5f50d..bff3de053df4 100644 --- a/drivers/scsi/lpfc/lpfc_nportdisc.c +++ b/drivers/scsi/lpfc/lpfc_nportdisc.c @@ -1944,7 +1944,13 @@ lpfc_cmpl_prli_prli_issue(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, /* Target driver cannot solicit NVME FB. */ if (bf_get_be32(prli_tgt, nvpr)) { + /* Complete the nvme target roles. The transport + * needs to know if the rport is capable of + * discovery in addition to its role.
+ */ ndlp->nlp_type |= NLP_NVME_TARGET; + if (bf_get_be32(prli_disc, nvpr)) + ndlp->nlp_type |= NLP_NVME_DISCOVERY; if ((bf_get_be32(prli_fba, nvpr) == 1) && (bf_get_be32(prli_fb_sz, nvpr) > 0) && (phba->cfg_nvme_enable_fb) && From ae9e28f36a6cca4e5760f4927b70b6c9e588db1a Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:51 -0700 Subject: [PATCH 030/153] scsi: lpfc: Add MDS Diagnostic support. Added code to support Cisco MDS loopback diagnostic. The diagnostics run various loopbacks including one which loops-back frame through the driver. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc.h | 2 + drivers/scsi/lpfc/lpfc_els.c | 7 ++ drivers/scsi/lpfc/lpfc_hbadisc.c | 3 +- drivers/scsi/lpfc/lpfc_hw4.h | 15 +++- drivers/scsi/lpfc/lpfc_init.c | 13 +++ drivers/scsi/lpfc/lpfc_sli.c | 131 +++++++++++++++++++++++++++++-- 6 files changed, 161 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index c47bde6205c9..f2c0ba6ced78 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -675,6 +675,8 @@ struct lpfc_hba { /* INIT_LINK mailbox command */ #define LS_NPIV_FAB_SUPPORTED 0x2 /* Fabric supports NPIV */ #define LS_IGNORE_ERATT 0x4 /* intr handler should ignore ERATT */ +#define LS_MDS_LINK_DOWN 0x8 /* MDS Diagnostics Link Down */ +#define LS_MDS_LOOPBACK 0x16 /* MDS Diagnostics Link Up (Loopback) */ uint32_t hba_flag; /* hba generic flags */ #define HBA_ERATT_HANDLED 0x1 /* This flag is set when eratt handled */ diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 3085895464d9..1d36f82fa369 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -1047,6 +1047,13 @@ lpfc_cmpl_els_flogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, irsp->ulpStatus, irsp->un.ulpWord[4], irsp->ulpTimeout); + + /* If this is not a loop open failure, bail out */ + if 
(!(irsp->ulpStatus == IOSTAT_LOCAL_REJECT && + ((irsp->un.ulpWord[4] & IOERR_PARAM_MASK) == + IOERR_LOOP_OPEN_FAILURE))) + goto flogifail; + /* FLOGI failed, so there is no fabric */ spin_lock_irq(shost->host_lock); vport->fc_flag &= ~(FC_FABRIC | FC_PUBLIC_LOOP); diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index dcc9b3858778..3ffcd9215ca8 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -701,7 +701,8 @@ lpfc_work_done(struct lpfc_hba *phba) /* Set the lpfc data pending flag */ set_bit(LPFC_DATA_READY, &phba->data_flags); } else { - if (phba->link_state >= LPFC_LINK_UP) { + if (phba->link_state >= LPFC_LINK_UP || + phba->link_flag & LS_MDS_LOOPBACK) { pring->flag &= ~LPFC_DEFERRED_RING_EVENT; lpfc_sli_handle_slow_ring_event(phba, pring, (status & diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h index df97c6b7433b..e0a5fce416ae 100644 --- a/drivers/scsi/lpfc/lpfc_hw4.h +++ b/drivers/scsi/lpfc/lpfc_hw4.h @@ -4421,6 +4421,19 @@ struct fcp_treceive64_wqe { }; #define TXRDY_PAYLOAD_LEN 12 +#define CMD_SEND_FRAME 0xE1 + +struct send_frame_wqe { + struct ulp_bde64 bde; /* words 0-2 */ + uint32_t frame_len; /* word 3 */ + uint32_t fc_hdr_wd0; /* word 4 */ + uint32_t fc_hdr_wd1; /* word 5 */ + struct wqe_common wqe_com; /* words 6-11 */ + uint32_t fc_hdr_wd2; /* word 12 */ + uint32_t fc_hdr_wd3; /* word 13 */ + uint32_t fc_hdr_wd4; /* word 14 */ + uint32_t fc_hdr_wd5; /* word 15 */ +}; union lpfc_wqe { uint32_t words[16]; @@ -4439,7 +4452,7 @@ union lpfc_wqe { struct fcp_trsp64_wqe fcp_trsp; struct fcp_tsend64_wqe fcp_tsend; struct fcp_treceive64_wqe fcp_treceive; - + struct send_frame_wqe send_frame; }; union lpfc_wqe128 { diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 9f6c7e71814b..9add9473cae5 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -4540,6 +4540,19 @@ lpfc_sli4_async_fc_evt(struct lpfc_hba *phba, 
struct lpfc_acqe_fc_la *acqe_fc) pmb->vport = phba->pport; if (phba->sli4_hba.link_state.status != LPFC_FC_LA_TYPE_LINK_UP) { + phba->link_flag &= ~(LS_MDS_LINK_DOWN | LS_MDS_LOOPBACK); + + switch (phba->sli4_hba.link_state.status) { + case LPFC_FC_LA_TYPE_MDS_LINK_DOWN: + phba->link_flag |= LS_MDS_LINK_DOWN; + break; + case LPFC_FC_LA_TYPE_MDS_LOOPBACK: + phba->link_flag |= LS_MDS_LOOPBACK; + break; + default: + break; + } + /* Parse and translate status field */ mb = &pmb->u.mb; mb->mbxStatus = lpfc_sli4_parse_latt_fault(phba, diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 903c06ff828a..d6b184839bc2 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -74,6 +74,8 @@ static struct lpfc_iocbq *lpfc_sli4_els_wcqe_to_rspiocbq(struct lpfc_hba *, struct lpfc_iocbq *); static void lpfc_sli4_send_seq_to_ulp(struct lpfc_vport *, struct hbq_dmabuf *); +static void lpfc_sli4_handle_mds_loopback(struct lpfc_vport *vport, + struct hbq_dmabuf *dmabuf); static int lpfc_sli4_fp_handle_cqe(struct lpfc_hba *, struct lpfc_queue *, struct lpfc_cqe *); static int lpfc_sli4_post_sgl_list(struct lpfc_hba *, struct list_head *, @@ -5907,7 +5909,7 @@ lpfc_set_features(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox, bf_set(lpfc_mbx_set_feature_mds, &mbox->u.mqe.un.set_feature, 1); bf_set(lpfc_mbx_set_feature_mds_deep_loopbk, - &mbox->u.mqe.un.set_feature, 0); + &mbox->u.mqe.un.set_feature, 1); mbox->u.mqe.un.set_feature.feature = LPFC_SET_MDS_DIAGS; mbox->u.mqe.un.set_feature.param_len = 8; break; @@ -8688,8 +8690,11 @@ lpfc_sli4_iocb2wqe(struct lpfc_hba *phba, struct lpfc_iocbq *iocbq, memset(wqe, 0, sizeof(union lpfc_wqe128)); /* Some of the fields are in the right position already */ memcpy(wqe, &iocbq->iocb, sizeof(union lpfc_wqe)); - wqe->generic.wqe_com.word7 = 0; /* The ct field has moved so reset */ - wqe->generic.wqe_com.word10 = 0; + if (iocbq->iocb.ulpCommand != CMD_SEND_FRAME) { + /* The ct field has moved so reset */ + 
wqe->generic.wqe_com.word7 = 0; + wqe->generic.wqe_com.word10 = 0; + } abort_tag = (uint32_t) iocbq->iotag; xritag = iocbq->sli4_xritag; @@ -9183,6 +9188,10 @@ lpfc_sli4_iocb2wqe(struct lpfc_hba *phba, struct lpfc_iocbq *iocbq, } break; + case CMD_SEND_FRAME: + bf_set(wqe_xri_tag, &wqe->generic.wqe_com, xritag); + bf_set(wqe_reqtag, &wqe->generic.wqe_com, iocbq->iotag); + return 0; case CMD_XRI_ABORTED_CX: case CMD_CREATE_XRI_CR: /* Do we expect to use this? */ case CMD_IOCB_FCP_IBIDIR64_CR: /* bidirectional xfer */ @@ -16137,6 +16146,8 @@ lpfc_fc_frame_check(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr) struct fc_vft_header *fc_vft_hdr; uint32_t *header = (uint32_t *) fc_hdr; +#define FC_RCTL_MDS_DIAGS 0xF4 + switch (fc_hdr->fh_r_ctl) { case FC_RCTL_DD_UNCAT: /* uncategorized information */ case FC_RCTL_DD_SOL_DATA: /* solicited data */ @@ -16164,6 +16175,7 @@ lpfc_fc_frame_check(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr) case FC_RCTL_F_BSY: /* fabric busy to data frame */ case FC_RCTL_F_BSYL: /* fabric busy to link control frame */ case FC_RCTL_LCR: /* link credit reset */ + case FC_RCTL_MDS_DIAGS: /* MDS Diagnostics */ case FC_RCTL_END: /* end */ break; case FC_RCTL_VFTH: /* Virtual Fabric tagging Header */ @@ -16173,12 +16185,16 @@ lpfc_fc_frame_check(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr) default: goto drop; } + +#define FC_TYPE_VENDOR_UNIQUE 0xFF + switch (fc_hdr->fh_type) { case FC_TYPE_BLS: case FC_TYPE_ELS: case FC_TYPE_FCP: case FC_TYPE_CT: case FC_TYPE_NVME: + case FC_TYPE_VENDOR_UNIQUE: break; case FC_TYPE_IP: case FC_TYPE_ILS: @@ -16189,12 +16205,14 @@ lpfc_fc_frame_check(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr) lpfc_printf_log(phba, KERN_INFO, LOG_ELS, "2538 Received frame rctl:%s (x%x), type:%s (x%x), " "frame Data:%08x %08x %08x %08x %08x %08x %08x\n", + (fc_hdr->fh_r_ctl == FC_RCTL_MDS_DIAGS) ? 
"MDS Diags" : lpfc_rctl_names[fc_hdr->fh_r_ctl], fc_hdr->fh_r_ctl, - lpfc_type_names[fc_hdr->fh_type], fc_hdr->fh_type, - be32_to_cpu(header[0]), be32_to_cpu(header[1]), - be32_to_cpu(header[2]), be32_to_cpu(header[3]), - be32_to_cpu(header[4]), be32_to_cpu(header[5]), - be32_to_cpu(header[6])); + (fc_hdr->fh_type == FC_TYPE_VENDOR_UNIQUE) ? + "Vendor Unique" : lpfc_type_names[fc_hdr->fh_type], + fc_hdr->fh_type, be32_to_cpu(header[0]), + be32_to_cpu(header[1]), be32_to_cpu(header[2]), + be32_to_cpu(header[3]), be32_to_cpu(header[4]), + be32_to_cpu(header[5]), be32_to_cpu(header[6])); return 0; drop: lpfc_printf_log(phba, KERN_WARNING, LOG_ELS, @@ -17000,6 +17018,96 @@ lpfc_sli4_send_seq_to_ulp(struct lpfc_vport *vport, lpfc_sli_release_iocbq(phba, iocbq); } +static void +lpfc_sli4_mds_loopback_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, + struct lpfc_iocbq *rspiocb) +{ + struct lpfc_dmabuf *pcmd = cmdiocb->context2; + + if (pcmd && pcmd->virt) + pci_pool_free(phba->lpfc_drb_pool, pcmd->virt, pcmd->phys); + kfree(pcmd); + lpfc_sli_release_iocbq(phba, cmdiocb); +} + +static void +lpfc_sli4_handle_mds_loopback(struct lpfc_vport *vport, + struct hbq_dmabuf *dmabuf) +{ + struct fc_frame_header *fc_hdr; + struct lpfc_hba *phba = vport->phba; + struct lpfc_iocbq *iocbq = NULL; + union lpfc_wqe *wqe; + struct lpfc_dmabuf *pcmd = NULL; + uint32_t frame_len; + int rc; + + fc_hdr = (struct fc_frame_header *)dmabuf->hbuf.virt; + frame_len = bf_get(lpfc_rcqe_length, &dmabuf->cq_event.cqe.rcqe_cmpl); + + /* Send the received frame back */ + iocbq = lpfc_sli_get_iocbq(phba); + if (!iocbq) + goto exit; + + /* Allocate buffer for command payload */ + pcmd = kmalloc(sizeof(struct lpfc_dmabuf), GFP_KERNEL); + if (pcmd) + pcmd->virt = pci_pool_alloc(phba->lpfc_drb_pool, GFP_KERNEL, + &pcmd->phys); + if (!pcmd || !pcmd->virt) + goto exit; + + INIT_LIST_HEAD(&pcmd->list); + + /* copyin the payload */ + memcpy(pcmd->virt, dmabuf->dbuf.virt, frame_len); + + /* fill in BDE's 
for command */ + iocbq->iocb.un.xseq64.bdl.addrHigh = putPaddrHigh(pcmd->phys); + iocbq->iocb.un.xseq64.bdl.addrLow = putPaddrLow(pcmd->phys); + iocbq->iocb.un.xseq64.bdl.bdeFlags = BUFF_TYPE_BDE_64; + iocbq->iocb.un.xseq64.bdl.bdeSize = frame_len; + + iocbq->context2 = pcmd; + iocbq->vport = vport; + iocbq->iocb_flag &= ~LPFC_FIP_ELS_ID_MASK; + iocbq->iocb_flag |= LPFC_USE_FCPWQIDX; + + /* + * Setup rest of the iocb as though it were a WQE + * Build the SEND_FRAME WQE + */ + wqe = (union lpfc_wqe *)&iocbq->iocb; + + wqe->send_frame.frame_len = frame_len; + wqe->send_frame.fc_hdr_wd0 = be32_to_cpu(*((uint32_t *)fc_hdr)); + wqe->send_frame.fc_hdr_wd1 = be32_to_cpu(*((uint32_t *)fc_hdr + 1)); + wqe->send_frame.fc_hdr_wd2 = be32_to_cpu(*((uint32_t *)fc_hdr + 2)); + wqe->send_frame.fc_hdr_wd3 = be32_to_cpu(*((uint32_t *)fc_hdr + 3)); + wqe->send_frame.fc_hdr_wd4 = be32_to_cpu(*((uint32_t *)fc_hdr + 4)); + wqe->send_frame.fc_hdr_wd5 = be32_to_cpu(*((uint32_t *)fc_hdr + 5)); + + iocbq->iocb.ulpCommand = CMD_SEND_FRAME; + iocbq->iocb.ulpLe = 1; + iocbq->iocb_cmpl = lpfc_sli4_mds_loopback_cmpl; + rc = lpfc_sli_issue_iocb(phba, LPFC_ELS_RING, iocbq, 0); + if (rc == IOCB_ERROR) + goto exit; + + lpfc_in_buf_free(phba, &dmabuf->dbuf); + return; + +exit: + lpfc_printf_log(phba, KERN_WARNING, LOG_SLI, + "2023 Unable to process MDS loopback frame\n"); + if (pcmd && pcmd->virt) + pci_pool_free(phba->lpfc_drb_pool, pcmd->virt, pcmd->phys); + kfree(pcmd); + lpfc_sli_release_iocbq(phba, iocbq); + lpfc_in_buf_free(phba, &dmabuf->dbuf); +} + /** * lpfc_sli4_handle_received_buffer - Handle received buffers from firmware * @phba: Pointer to HBA context object. 
@@ -17038,6 +17146,13 @@ lpfc_sli4_handle_received_buffer(struct lpfc_hba *phba, fcfi = bf_get(lpfc_rcqe_fcf_id, &dmabuf->cq_event.cqe.rcqe_cmpl); + if (fc_hdr->fh_r_ctl == 0xF4 && fc_hdr->fh_type == 0xFF) { + vport = phba->pport; + /* Handle MDS Loopback frames */ + lpfc_sli4_handle_mds_loopback(vport, dmabuf); + return; + } + /* d_id this frame is directed to */ did = sli4_did_from_fc_hdr(fc_hdr); From 2848e1d503d60955ff51ae9ec8d5eada6bd9ba6d Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:52 -0700 Subject: [PATCH 031/153] scsi: lpfc: update version to 11.2.0.14 Change driver version to 11.2.0.14. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_version.h b/drivers/scsi/lpfc/lpfc_version.h index 1c26dc67151b..c2653244221c 100644 --- a/drivers/scsi/lpfc/lpfc_version.h +++ b/drivers/scsi/lpfc/lpfc_version.h @@ -20,7 +20,7 @@ * included with this package. * *******************************************************************/ -#define LPFC_DRIVER_VERSION "11.2.0.12" +#define LPFC_DRIVER_VERSION "11.2.0.14" #define LPFC_DRIVER_NAME "lpfc" /* Used for SLI 2/3 */ From 5667c86acf021e6dcf02584408b4484a273ac68f Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Sun, 14 May 2017 21:41:55 -0700 Subject: [PATCH 032/153] mac80211: strictly check mesh address extension mode Mesh forwarding path checks for address extension mode to fetch appropriate proxied address and MPP address. Existing condition that looks for 6 address format is not strict enough so that frames with improper values are processed and invalid entries are added into MPP table. Fix that by adding a stricter check before processing the packet. Per IEEE Std 802.11s-2011 spec. Table 7-6g1 lists address extension mode 0x3 as reserved one. 
And also Table 9-13 does not specify 0x3 as valid address field. Fixes: 9b395bc3be1c ("mac80211: verify that skb data is present") Signed-off-by: Rajkumar Manoharan Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 3 ++- net/wireless/util.c | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 35f4c7d7a500..1f75280ba26c 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2492,7 +2492,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) if (is_multicast_ether_addr(hdr->addr1)) { mpp_addr = hdr->addr3; proxied_addr = mesh_hdr->eaddr1; - } else if (mesh_hdr->flags & MESH_FLAGS_AE_A5_A6) { + } else if ((mesh_hdr->flags & MESH_FLAGS_AE) == + MESH_FLAGS_AE_A5_A6) { /* has_a4 already checked in ieee80211_rx_mesh_check */ mpp_addr = hdr->addr4; proxied_addr = mesh_hdr->eaddr2; diff --git a/net/wireless/util.c b/net/wireless/util.c index 7198373e2920..4992f1025c9d 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -454,6 +454,8 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, if (iftype == NL80211_IFTYPE_MESH_POINT) skb_copy_bits(skb, hdrlen, &mesh_flags, 1); + mesh_flags &= MESH_FLAGS_AE; + switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { case cpu_to_le16(IEEE80211_FCTL_TODS): @@ -469,9 +471,9 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, iftype != NL80211_IFTYPE_STATION)) return -1; if (iftype == NL80211_IFTYPE_MESH_POINT) { - if (mesh_flags & MESH_FLAGS_AE_A4) + if (mesh_flags == MESH_FLAGS_AE_A4) return -1; - if (mesh_flags & MESH_FLAGS_AE_A5_A6) { + if (mesh_flags == MESH_FLAGS_AE_A5_A6) { skb_copy_bits(skb, hdrlen + offsetof(struct ieee80211s_hdr, eaddr1), tmp.h_dest, 2 * ETH_ALEN); @@ -487,9 +489,9 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, ether_addr_equal(tmp.h_source, addr))) return -1; if (iftype ==
NL80211_IFTYPE_MESH_POINT) { - if (mesh_flags & MESH_FLAGS_AE_A5_A6) + if (mesh_flags == MESH_FLAGS_AE_A5_A6) return -1; - if (mesh_flags & MESH_FLAGS_AE_A4) + if (mesh_flags == MESH_FLAGS_AE_A4) skb_copy_bits(skb, hdrlen + offsetof(struct ieee80211s_hdr, eaddr1), tmp.h_source, ETH_ALEN); From 53cf29d3b1bc5b86fcff5fdc52f873d79d908ef4 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 17 May 2017 19:02:17 -0300 Subject: [PATCH 033/153] scsi: lpfc: Fix NULL pointer dereference during PCI error recovery Recent commit on patchset "lpfc updates for 11.2.0.14" fixed an issue about dereferencing a NULL pointer on port reset. The specific commit, named "lpfc: Fix system crash when port is reset.", is missing a check against NULL pointer on lpfc_els_flush_cmd() though. Since we destroy the queues on adapter resets, like in PCI error recovery path, we need the validation present on this patch in order to avoid a NULL pointer dereference when trying to flush commands of ELS wq, after it has been destroyed (which would lead to a kernel oops). Tested-by: Raphael Silva Signed-off-by: Guilherme G. Piccoli Acked-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_els.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 1d36f82fa369..8e532b39ae93 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -7451,6 +7451,13 @@ lpfc_els_flush_cmd(struct lpfc_vport *vport) */ spin_lock_irq(&phba->hbalock); pring = lpfc_phba_elsring(phba); + + /* Bail out if we've no ELS wq, like in PCI error recovery case. 
*/ + if (unlikely(!pring)) { + spin_unlock_irq(&phba->hbalock); + return; + } + if (phba->sli_rev == LPFC_SLI_REV4) spin_lock(&pring->ring_lock); From eeeb51d834d76c66784e7fe1a9ace3ce3f8d2af1 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 16 May 2017 20:52:29 -0700 Subject: [PATCH 034/153] scsi: lpfc: fix build issue if NVME_FC_TARGET is not defined fix build issue if NVME_FC_TARGET is not defined. noop the code. The code will never be invoked if target mode is not enabled. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_nvmet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index 312f54278bd4..f94294b77b7b 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -157,6 +157,7 @@ lpfc_nvmet_xmt_ls_rsp_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, void lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, struct lpfc_nvmet_ctxbuf *ctx_buf) { +#if (IS_ENABLED(CONFIG_NVME_TARGET_FC)) struct lpfc_nvmet_rcv_ctx *ctxp = ctx_buf->context; struct lpfc_nvmet_tgtport *tgtp; struct fc_frame_header *fc_hdr; @@ -260,6 +261,7 @@ lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, struct lpfc_nvmet_ctxbuf *ctx_buf) &phba->sli4_hba.lpfc_nvmet_ctx_list); phba->sli4_hba.nvmet_ctx_cnt++; spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_lock, iflag); +#endif } #ifdef CONFIG_SCSI_LPFC_DEBUG_FS From 9933e113c2e87a9f46a40fde8dafbf801dca1ab9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 10 May 2017 03:48:23 +0800 Subject: [PATCH 035/153] crypto: skcipher - Add missing API setkey checks The API setkey checks for key sizes and alignment went AWOL during the skcipher conversion. This patch restores them. 
Cc: Fixes: 4e6c3df4d729 ("crypto: skcipher - Add low-level skcipher...") Reported-by: Baozeng Signed-off-by: Herbert Xu --- crypto/skcipher.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 014af741fc6a..4faa0fd53b0c 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -764,6 +764,44 @@ static int crypto_init_skcipher_ops_ablkcipher(struct crypto_tfm *tfm) return 0; } +static int skcipher_setkey_unaligned(struct crypto_skcipher *tfm, + const u8 *key, unsigned int keylen) +{ + unsigned long alignmask = crypto_skcipher_alignmask(tfm); + struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); + u8 *buffer, *alignbuffer; + unsigned long absize; + int ret; + + absize = keylen + alignmask; + buffer = kmalloc(absize, GFP_ATOMIC); + if (!buffer) + return -ENOMEM; + + alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); + memcpy(alignbuffer, key, keylen); + ret = cipher->setkey(tfm, alignbuffer, keylen); + kzfree(buffer); + return ret; +} + +static int skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); + unsigned long alignmask = crypto_skcipher_alignmask(tfm); + + if (keylen < cipher->min_keysize || keylen > cipher->max_keysize) { + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + if ((unsigned long)key & alignmask) + return skcipher_setkey_unaligned(tfm, key, keylen); + + return cipher->setkey(tfm, key, keylen); +} + static void crypto_skcipher_exit_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); @@ -784,7 +822,7 @@ static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm) tfm->__crt_alg->cra_type == &crypto_givcipher_type) return crypto_init_skcipher_ops_ablkcipher(tfm); - skcipher->setkey = alg->setkey; + skcipher->setkey = skcipher_setkey; skcipher->encrypt = alg->encrypt; 
skcipher->decrypt = alg->decrypt; skcipher->ivsize = alg->ivsize; From 463f620b1256e0488d932088e04a372817e8c42e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Potomski?= Date: Fri, 12 May 2017 08:36:27 +0200 Subject: [PATCH 036/153] scsi: ufs: Clean up some rpm/spm level SysFS nodes upon remove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When reloading module these two attributes aren't cleaned up properly and they persist causing warnings when trying to load module again. Additionally they are not recreated properly due to that. Signed-off-by: Michał Potomski Reviewed-by: Subhash Jadavani Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index abc7e87937cc..ffe8d8608818 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -7698,6 +7698,12 @@ static inline void ufshcd_add_sysfs_nodes(struct ufs_hba *hba) ufshcd_add_spm_lvl_sysfs_nodes(hba); } +static inline void ufshcd_remove_sysfs_nodes(struct ufs_hba *hba) +{ + device_remove_file(hba->dev, &hba->rpm_lvl_attr); + device_remove_file(hba->dev, &hba->spm_lvl_attr); +} + /** * ufshcd_shutdown - shutdown routine * @hba: per adapter instance @@ -7735,6 +7741,7 @@ EXPORT_SYMBOL(ufshcd_shutdown); */ void ufshcd_remove(struct ufs_hba *hba) { + ufshcd_remove_sysfs_nodes(hba); scsi_remove_host(hba->host); /* disable interrupts */ ufshcd_disable_intr(hba, hba->intr_mask); From a351e40b6de550049423a26f7ded7b639e363d89 Mon Sep 17 00:00:00 2001 From: Varun Prakash Date: Wed, 17 May 2017 20:30:43 +0530 Subject: [PATCH 037/153] scsi: csiostor: fix use after free in csio_hw_use_fwconfig() mbp pointer is passed to csio_hw_validate_caps() so call mempool_free() after calling csio_hw_validate_caps(). 
Signed-off-by: Varun Prakash Fixes: 541c571fa2fd ("csiostor:Use firmware version from cxgb4/t4fw_version.h") Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/csiostor/csio_hw.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/csiostor/csio_hw.c b/drivers/scsi/csiostor/csio_hw.c index 622bdabc8894..dab195f04da7 100644 --- a/drivers/scsi/csiostor/csio_hw.c +++ b/drivers/scsi/csiostor/csio_hw.c @@ -1769,7 +1769,6 @@ csio_hw_use_fwconfig(struct csio_hw *hw, int reset, u32 *fw_cfg_param) goto bye; } - mempool_free(mbp, hw->mb_mempool); if (finicsum != cfcsum) { csio_warn(hw, "Config File checksum mismatch: csum=%#x, computed=%#x\n", @@ -1780,6 +1779,10 @@ csio_hw_use_fwconfig(struct csio_hw *hw, int reset, u32 *fw_cfg_param) rv = csio_hw_validate_caps(hw, mbp); if (rv != 0) goto bye; + + mempool_free(mbp, hw->mb_mempool); + mbp = NULL; + /* * Note that we're operating with parameters * not supplied by the driver, rather than from hard-wired From 1bad6c4a57efda0d5f5bf8a2403b21b1ed24875c Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 18 May 2017 15:40:05 -0700 Subject: [PATCH 038/153] scsi: zero per-cmd private driver data for each MQ I/O In lower layer driver's (LLD) scsi_host_template, the driver may optionally ask SCSI to allocate its private driver memory for each command, by specifying cmd_size. This memory is allocated at the end of scsi_cmnd by SCSI. Later when SCSI queues a command, the LLD can use scsi_cmd_priv to get to its private data. Some LLD, e.g. hv_storvsc, doesn't clear its private data before use. In this case, the LLD may get to stale or uninitialized data in its private driver memory. This may result in unexpected driver and hardware behavior. Fix this problem by also zeroing the private driver memory before passing them to LLD. Signed-off-by: Long Li Reviewed-by: Bart Van Assche Reviewed-by: KY Srinivasan Reviewed-by: Christoph Hellwig CC: # 4.11+ Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index e31f1cc90b81..99e16ac479e3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1851,7 +1851,7 @@ static int scsi_mq_prep_fn(struct request *req) /* zero out the cmd, except for the embedded scsi_request */ memset((char *)cmd + sizeof(cmd->req), 0, - sizeof(*cmd) - sizeof(cmd->req)); + sizeof(*cmd) - sizeof(cmd->req) + shost->hostt->cmd_size); req->special = cmd; From bae3dee0992dcb336a591468376b046e5447997b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 16 May 2017 14:17:20 +0800 Subject: [PATCH 039/153] mmc: sdhci-xenon: kill xenon_clean_phy() Currently, the xenon_clean_phy() is only used for freeing phy_params. The phy_params is allocated by devm_kzalloc(), there's no need to free it explicitly. Signed-off-by: Jisheng Zhang Acked-by: Hu Ziji Acked-by: Adrian Hunter --- drivers/mmc/host/sdhci-xenon-phy.c | 14 +------------- drivers/mmc/host/sdhci-xenon.c | 6 +----- drivers/mmc/host/sdhci-xenon.h | 1 - 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/drivers/mmc/host/sdhci-xenon-phy.c b/drivers/mmc/host/sdhci-xenon-phy.c index 6356781f1cca..f7e26b031e76 100644 --- a/drivers/mmc/host/sdhci-xenon-phy.c +++ b/drivers/mmc/host/sdhci-xenon-phy.c @@ -787,14 +787,6 @@ int xenon_phy_adj(struct sdhci_host *host, struct mmc_ios *ios) return ret; } -void xenon_clean_phy(struct sdhci_host *host) -{ - struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct xenon_priv *priv = sdhci_pltfm_priv(pltfm_host); - - kfree(priv->phy_params); -} - static int xenon_add_phy(struct device_node *np, struct sdhci_host *host, const char *phy_name) { @@ -819,11 +811,7 @@ static int xenon_add_phy(struct device_node *np, struct sdhci_host *host, if (ret) return ret; - ret = xenon_emmc_phy_parse_param_dt(host, np, priv->phy_params); - if (ret) - xenon_clean_phy(host); - - return ret; +
return xenon_emmc_phy_parse_param_dt(host, np, priv->phy_params); } int xenon_phy_parse_dt(struct device_node *np, struct sdhci_host *host) diff --git a/drivers/mmc/host/sdhci-xenon.c b/drivers/mmc/host/sdhci-xenon.c index 67246655315b..bc1781bb070b 100644 --- a/drivers/mmc/host/sdhci-xenon.c +++ b/drivers/mmc/host/sdhci-xenon.c @@ -486,7 +486,7 @@ static int xenon_probe(struct platform_device *pdev) err = xenon_sdhc_prepare(host); if (err) - goto clean_phy_param; + goto err_clk; err = sdhci_add_host(host); if (err) @@ -496,8 +496,6 @@ static int xenon_probe(struct platform_device *pdev) remove_sdhc: xenon_sdhc_unprepare(host); -clean_phy_param: - xenon_clean_phy(host); err_clk: clk_disable_unprepare(pltfm_host->clk); free_pltfm: @@ -510,8 +508,6 @@ static int xenon_remove(struct platform_device *pdev) struct sdhci_host *host = platform_get_drvdata(pdev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - xenon_clean_phy(host); - sdhci_remove_host(host, 0); xenon_sdhc_unprepare(host); diff --git a/drivers/mmc/host/sdhci-xenon.h b/drivers/mmc/host/sdhci-xenon.h index 6e6523ea01ce..73debb42dc2f 100644 --- a/drivers/mmc/host/sdhci-xenon.h +++ b/drivers/mmc/host/sdhci-xenon.h @@ -93,7 +93,6 @@ struct xenon_priv { }; int xenon_phy_adj(struct sdhci_host *host, struct mmc_ios *ios); -void xenon_clean_phy(struct sdhci_host *host); int xenon_phy_parse_dt(struct device_node *np, struct sdhci_host *host); void xenon_soc_pad_ctrl(struct sdhci_host *host, From aca69344c8a99e7374d913e42ba9120c398ee16f Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 16 May 2017 11:36:51 +0200 Subject: [PATCH 040/153] mmc: cavium-octeon: Fix interrupt enable code OCTEON SoCs with CIU3 do not have interrupt masking local to the MMC bus interface. Unfortunately, some even have a diagnostic register at the same address of the enable register, which causes the interrupts to fire immediately if stored to, thus breaking the driver. 
The proper action on these SoCs is not to touch this register. Fixes: 01d95843335c ("mmc: cavium: Add MMC support for Octeon SOCs.") Signed-off-by: David Daney [jglauber@cavium.com: removed point after subject line] Signed-off-by: Jan Glauber Signed-off-by: Ulf Hansson --- drivers/mmc/host/cavium-octeon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/cavium-octeon.c b/drivers/mmc/host/cavium-octeon.c index 772d0900026d..d698d66e3327 100644 --- a/drivers/mmc/host/cavium-octeon.c +++ b/drivers/mmc/host/cavium-octeon.c @@ -108,7 +108,7 @@ static void octeon_mmc_release_bus(struct cvm_mmc_host *host) static void octeon_mmc_int_enable(struct cvm_mmc_host *host, u64 val) { writeq(val, host->base + MIO_EMM_INT(host)); - if (!host->dma_active || (host->dma_active && !host->has_ciu3)) + if (!host->has_ciu3) writeq(val, host->base + MIO_EMM_INT_EN(host)); } From 899e4aad15e93315fa18ab9e9c88904ad237cfa0 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 16 May 2017 11:36:52 +0200 Subject: [PATCH 041/153] mmc: cavium-octeon: Use proper GPIO name for power control The devm_gpiod_get_optional() function appends a "-gpios" to the string passed to it, so if we want to find the "power-gpios" signal, we must pass "power" to this function. 
Fixes: 01d95843335c ("mmc: cavium: Add MMC support for Octeon SOCs.") Signed-off-by: David Daney [jglauber@cavium.com: removed point after subject line] Signed-off-by: Jan Glauber Signed-off-by: Ulf Hansson --- drivers/mmc/host/cavium-octeon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/cavium-octeon.c b/drivers/mmc/host/cavium-octeon.c index d698d66e3327..cbb566377508 100644 --- a/drivers/mmc/host/cavium-octeon.c +++ b/drivers/mmc/host/cavium-octeon.c @@ -267,7 +267,7 @@ static int octeon_mmc_probe(struct platform_device *pdev) } host->global_pwr_gpiod = devm_gpiod_get_optional(&pdev->dev, - "power-gpios", + "power", GPIOD_OUT_HIGH); if (IS_ERR(host->global_pwr_gpiod)) { dev_err(&pdev->dev, "Invalid power GPIO\n"); From a486cd23661c9387fb076c3f6ae8b2aa9d20d54a Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Fri, 19 May 2017 12:47:00 +0200 Subject: [PATCH 042/153] xfrm: fix state migration copy replay sequence numbers During xfrm migration copy replay and preplay sequence numbers from the previous state. Here is a tcpdump output showing the problem. 10.0.10.46 is running vanilla kernel, is the IKE/IPsec responder. After the migration it sent wrong sequence number, reset to 1. The migration is from 10.0.0.52 to 10.0.0.53. 
IP 10.0.0.52.4500 > 10.0.10.46.4500: UDP-encap: ESP(spi=0x43ef462d,seq=0x7cf), length 136 IP 10.0.10.46.4500 > 10.0.0.52.4500: UDP-encap: ESP(spi=0xca1c282d,seq=0x7cf), length 136 IP 10.0.0.52.4500 > 10.0.10.46.4500: UDP-encap: ESP(spi=0x43ef462d,seq=0x7d0), length 136 IP 10.0.10.46.4500 > 10.0.0.52.4500: UDP-encap: ESP(spi=0xca1c282d,seq=0x7d0), length 136 IP 10.0.0.53.4500 > 10.0.10.46.4500: NONESP-encap: isakmp: child_sa inf2[I] IP 10.0.10.46.4500 > 10.0.0.53.4500: NONESP-encap: isakmp: child_sa inf2[R] IP 10.0.0.53.4500 > 10.0.10.46.4500: NONESP-encap: isakmp: child_sa inf2[I] IP 10.0.10.46.4500 > 10.0.0.53.4500: NONESP-encap: isakmp: child_sa inf2[R] IP 10.0.0.53.4500 > 10.0.10.46.4500: UDP-encap: ESP(spi=0x43ef462d,seq=0x7d1), length 136 NOTE: next sequence is wrong 0x1 IP 10.0.10.46.4500 > 10.0.0.53.4500: UDP-encap: ESP(spi=0xca1c282d,seq=0x1), length 136 IP 10.0.0.53.4500 > 10.0.10.46.4500: UDP-encap: ESP(spi=0x43ef462d,seq=0x7d2), length 136 IP 10.0.10.46.4500 > 10.0.0.53.4500: UDP-encap: ESP(spi=0xca1c282d,seq=0x2), length 136 Signed-off-by: Antony Antony Reviewed-by: Richard Guy Briggs Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index fc3c5aa38754..2e291bc5f1fc 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1383,6 +1383,8 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig) x->curlft.add_time = orig->curlft.add_time; x->km.state = orig->km.state; x->km.seq = orig->km.seq; + x->replay = orig->replay; + x->preplay = orig->preplay; return x; From 3ecb3ac7b950ff8f6c6a61e8b7b0d6e3546429a0 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Mon, 15 May 2017 19:16:15 -0700 Subject: [PATCH 043/153] xfs: avoid mount-time deadlock in CoW extent recovery If a malicious user corrupts the refcount btree to cause a cycle between different levels of the tree, the next mount attempt will deadlock in the CoW recovery routine while grabbing buffer locks. We can use the ability to re-grab a buffer that was previous locked to a transaction to avoid deadlocks, so do that here. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_refcount.c | 43 ++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index b177ef33cd4c..82a38d86ebad 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1629,13 +1629,28 @@ xfs_refcount_recover_cow_leftovers( if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) return -EOPNOTSUPP; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + INIT_LIST_HEAD(&debris); + + /* + * In this first part, we use an empty transaction to gather up + * all the leftover CoW extents so that we can subsequently + * delete them. The empty transaction is used to avoid + * a buffer lock deadlock if there happens to be a loop in the + * refcountbt because we're allowed to re-grab a buffer that is + * already attached to our transaction. When we're done + * recording the CoW debris we cancel the (empty) transaction + * and everything goes away cleanly. + */ + error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (error) + goto out_trans; + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); /* Find all the leftover CoW staging extents. 
*/ - INIT_LIST_HEAD(&debris); memset(&low, 0, sizeof(low)); memset(&high, 0, sizeof(high)); low.rc.rc_startblock = XFS_REFC_COW_START; @@ -1645,10 +1660,11 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_cursor; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - xfs_buf_relse(agbp); + xfs_trans_brelse(tp, agbp); + xfs_trans_cancel(tp); /* Now iterate the list to free the leftovers */ - list_for_each_entry(rr, &debris, rr_list) { + list_for_each_entry_safe(rr, n, &debris, rr_list) { /* Set up transaction. */ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); if (error) @@ -1676,8 +1692,16 @@ xfs_refcount_recover_cow_leftovers( error = xfs_trans_commit(tp); if (error) goto out_free; + + list_del(&rr->rr_list); + kmem_free(rr); } + return error; +out_defer: + xfs_defer_cancel(&dfops); +out_trans: + xfs_trans_cancel(tp); out_free: /* Free the leftover list */ list_for_each_entry_safe(rr, n, &debris, rr_list) { @@ -1688,11 +1712,6 @@ xfs_refcount_recover_cow_leftovers( out_cursor: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_buf_relse(agbp); - goto out_free; - -out_defer: - xfs_defer_cancel(&dfops); - xfs_trans_cancel(tp); - goto out_free; + xfs_trans_brelse(tp, agbp); + goto out_trans; } From e480eabae232b92ff44ce63678280373713920a4 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 18 May 2017 21:33:44 +0200 Subject: [PATCH 044/153] drm/radeon: Fix oops upon driver load on PowerXpress laptops Nicolai Stange reports the following oops which is caused by dereferencing rdev->pdev before it's subsequently set by radeon_device_init(). Fix it. BUG: unable to handle kernel NULL pointer dereference at 00000000000007cb IP: radeon_driver_load_kms+0xeb/0x230 [radeon] ... Call Trace: drm_dev_register+0x146/0x1d0 [drm] drm_get_pci_dev+0x9a/0x180 [drm] radeon_pci_probe+0xb8/0xe0 [radeon] local_pci_probe+0x45/0xa0 pci_device_probe+0x14f/0x1a0 driver_probe_device+0x29c/0x450 __driver_attach+0xdf/0xf0 ? 
driver_probe_device+0x450/0x450 bus_for_each_dev+0x6c/0xc0 driver_attach+0x1e/0x20 bus_add_driver+0x170/0x270 driver_register+0x60/0xe0 ? 0xffffffffc0508000 __pci_register_driver+0x4c/0x50 drm_pci_init+0xeb/0x100 [drm] ? vga_switcheroo_register_handler+0x6a/0x90 ? 0xffffffffc0508000 radeon_init+0x98/0xb6 [radeon] do_one_initcall+0x52/0x1a0 ? __vunmap+0x81/0xb0 ? kmem_cache_alloc_trace+0x159/0x1b0 ? do_init_module+0x27/0x1f8 do_init_module+0x5f/0x1f8 load_module+0x27ce/0x2be0 SYSC_finit_module+0xdf/0x110 ? SYSC_finit_module+0xdf/0x110 SyS_finit_module+0xe/0x10 do_syscall_64+0x67/0x150 entry_SYSCALL64_slow_path+0x25/0x25 Fixes: 7ffb0ce31cf9 ("drm/radeon: Don't register Thunderbolt eGPU with vga_switcheroo") Reported-and-tested-by: Nicolai Stange Signed-off-by: Lukas Wunner Link: http://patchwork.freedesktop.org/patch/msgid/cfb91ba052af06117137eec0637543a2626a7979.1495135190.git.lukas@wunner.de --- drivers/gpu/drm/radeon/radeon_kms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index e3e7cb1d10a2..4761f27f2ca2 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -116,7 +116,7 @@ int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags) if ((radeon_runtime_pm != 0) && radeon_has_atpx() && ((flags & RADEON_IS_IGP) == 0) && - !pci_is_thunderbolt_attached(rdev->pdev)) + !pci_is_thunderbolt_attached(dev->pdev)) flags |= RADEON_IS_PX; /* radeon_device_init should report only fatal error From 5165da5923d6c7df6f2927b0113b2e4d9288661e Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Fri, 5 May 2017 11:06:50 +0200 Subject: [PATCH 045/153] i2c: i2c-tiny-usb: fix buffer not being DMA capable Since v4.9 i2c-tiny-usb generates the below call trace and no longer works, since it can't communicate with the USB device. The reason is, that since v4.9 the USB stack checks, that the buffer it should transfer is DMA capable.
This was a requirement since v2.2 days, but it usually worked nevertheless. [ 17.504959] ------------[ cut here ]------------ [ 17.505488] WARNING: CPU: 0 PID: 93 at drivers/usb/core/hcd.c:1587 usb_hcd_map_urb_for_dma+0x37c/0x570 [ 17.506545] transfer buffer not dma capable [ 17.507022] Modules linked in: [ 17.507370] CPU: 0 PID: 93 Comm: i2cdetect Not tainted 4.11.0-rc8+ #10 [ 17.508103] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 17.509039] Call Trace: [ 17.509320] ? dump_stack+0x5c/0x78 [ 17.509714] ? __warn+0xbe/0xe0 [ 17.510073] ? warn_slowpath_fmt+0x5a/0x80 [ 17.510532] ? nommu_map_sg+0xb0/0xb0 [ 17.510949] ? usb_hcd_map_urb_for_dma+0x37c/0x570 [ 17.511482] ? usb_hcd_submit_urb+0x336/0xab0 [ 17.511976] ? wait_for_completion_timeout+0x12f/0x1a0 [ 17.512549] ? wait_for_completion_timeout+0x65/0x1a0 [ 17.513125] ? usb_start_wait_urb+0x65/0x160 [ 17.513604] ? usb_control_msg+0xdc/0x130 [ 17.514061] ? usb_xfer+0xa4/0x2a0 [ 17.514445] ? __i2c_transfer+0x108/0x3c0 [ 17.514899] ? i2c_transfer+0x57/0xb0 [ 17.515310] ? i2c_smbus_xfer_emulated+0x12f/0x590 [ 17.515851] ? _raw_spin_unlock_irqrestore+0x11/0x20 [ 17.516408] ? i2c_smbus_xfer+0x125/0x330 [ 17.516876] ? i2c_smbus_xfer+0x125/0x330 [ 17.517329] ? i2cdev_ioctl_smbus+0x1c1/0x2b0 [ 17.517824] ? i2cdev_ioctl+0x75/0x1c0 [ 17.518248] ? do_vfs_ioctl+0x9f/0x600 [ 17.518671] ? vfs_write+0x144/0x190 [ 17.519078] ? SyS_ioctl+0x74/0x80 [ 17.519463] ? 
entry_SYSCALL_64_fastpath+0x1e/0xad [ 17.519959] ---[ end trace d047c04982f5ac50 ]--- Cc: Signed-off-by: Sebastian Reichel Reviewed-by: Greg Kroah-Hartman Acked-by: Till Harbaum Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-tiny-usb.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-tiny-usb.c b/drivers/i2c/busses/i2c-tiny-usb.c index 0ed77eeff31e..a2e3dd715380 100644 --- a/drivers/i2c/busses/i2c-tiny-usb.c +++ b/drivers/i2c/busses/i2c-tiny-usb.c @@ -178,22 +178,39 @@ static int usb_read(struct i2c_adapter *adapter, int cmd, int value, int index, void *data, int len) { struct i2c_tiny_usb *dev = (struct i2c_tiny_usb *)adapter->algo_data; + void *dmadata = kmalloc(len, GFP_KERNEL); + int ret; + + if (!dmadata) + return -ENOMEM; /* do control transfer */ - return usb_control_msg(dev->usb_dev, usb_rcvctrlpipe(dev->usb_dev, 0), + ret = usb_control_msg(dev->usb_dev, usb_rcvctrlpipe(dev->usb_dev, 0), cmd, USB_TYPE_VENDOR | USB_RECIP_INTERFACE | - USB_DIR_IN, value, index, data, len, 2000); + USB_DIR_IN, value, index, dmadata, len, 2000); + + memcpy(data, dmadata, len); + kfree(dmadata); + return ret; } static int usb_write(struct i2c_adapter *adapter, int cmd, int value, int index, void *data, int len) { struct i2c_tiny_usb *dev = (struct i2c_tiny_usb *)adapter->algo_data; + void *dmadata = kmemdup(data, len, GFP_KERNEL); + int ret; + + if (!dmadata) + return -ENOMEM; /* do control transfer */ - return usb_control_msg(dev->usb_dev, usb_sndctrlpipe(dev->usb_dev, 0), + ret = usb_control_msg(dev->usb_dev, usb_sndctrlpipe(dev->usb_dev, 0), cmd, USB_TYPE_VENDOR | USB_RECIP_INTERFACE, - value, index, data, len, 2000); + value, index, dmadata, len, 2000); + + kfree(dmadata); + return ret; } static void i2c_tiny_usb_free(struct i2c_tiny_usb *dev) From e2c824924cdb41528932c550647406ad81336b18 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 22 May 2017 07:46:55 +0200 Subject: [PATCH 046/153] i2c: 
designware: Fix bogus sda_hold_time due to uninitialized vars We need to initialize those variables to 0 for platforms that do not provide ACPI parameters. Otherwise, we set sda_hold_time to random values, breaking e.g. Galileo and IOT2000 boards. Fixes: 9d6408433019 ("i2c: designware: don't infer timings described by ACPI from clock rate") Signed-off-by: Jan Kiszka Reviewed-by: Ard Biesheuvel Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 6283b99d2b17..d1263b82d646 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -94,9 +94,9 @@ static void dw_i2c_acpi_params(struct platform_device *pdev, char method[], static int dw_i2c_acpi_configure(struct platform_device *pdev) { struct dw_i2c_dev *dev = platform_get_drvdata(pdev); + u32 ss_ht = 0, fp_ht = 0, hs_ht = 0, fs_ht = 0; acpi_handle handle = ACPI_HANDLE(&pdev->dev); const struct acpi_device_id *id; - u32 ss_ht, fp_ht, hs_ht, fs_ht; struct acpi_device *adev; const char *uid; From 63691587f7b0028326ddd1226c378aaaeca4d4e4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 22 May 2017 16:32:46 +0200 Subject: [PATCH 047/153] ALSA: hda - Apply dual-codec quirk for MSI Z270-Gaming mobo MSI Z270-Gaming mobo also has two ALC1220 codecs like Gigabyte AZ370-Gaming mobo. Apply the same quirk to this one.
Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 9c22ad694534..3fdd5af190a4 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2328,6 +2328,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK_VENDOR(0x1462, "MSI", ALC882_FIXUP_GPIO3), SND_PCI_QUIRK(0x1458, 0xa002, "Gigabyte EP45-DS3/Z87X-UD3H", ALC889_FIXUP_FRONT_HP_NO_PRESENCE), SND_PCI_QUIRK(0x1458, 0xa0b8, "Gigabyte AZ370-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), + SND_PCI_QUIRK(0x1462, 0xda57, "MSI Z270-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), SND_PCI_QUIRK(0x147b, 0x107a, "Abit AW9D-MAX", ALC882_FIXUP_ABIT_AW9D_MAX), SND_PCI_QUIRK_VENDOR(0x1558, "Clevo laptop", ALC882_FIXUP_EAPD), SND_PCI_QUIRK(0x161f, 0x2054, "Medion laptop", ALC883_FIXUP_EAPD), From ba90d6a6b00a84f2a18112145c113e5ef628e561 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 22 May 2017 16:38:47 +0200 Subject: [PATCH 048/153] ALSA: hda - Provide dual-codecs model option for a few Realtek codecs Recently some laptops and mobos are equipped with the dual Realtek codecs that require special quirks. For making the debugging easier, add the model "dual-codecs" to be passed via module option. 
Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 3fdd5af190a4..918e45268915 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2343,6 +2343,7 @@ static const struct hda_model_fixup alc882_fixup_models[] = { {.id = ALC883_FIXUP_ACER_EAPD, .name = "acer-aspire"}, {.id = ALC882_FIXUP_INV_DMIC, .name = "inv-dmic"}, {.id = ALC882_FIXUP_NO_PRIMARY_HP, .name = "no-primary-hp"}, + {.id = ALC1220_FIXUP_GB_DUAL_CODECS, .name = "dual-codecs"}, {} }; @@ -6015,6 +6016,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC292_FIXUP_TPT440_DOCK, .name = "tpt440-dock"}, {.id = ALC292_FIXUP_TPT440, .name = "tpt440"}, {.id = ALC292_FIXUP_TPT460, .name = "tpt460"}, + {.id = ALC233_FIXUP_LENOVO_MULTI_CODECS, .name = "dual-codecs"}, {} }; #define ALC225_STANDARD_PINS \ @@ -7342,6 +7344,7 @@ static const struct hda_model_fixup alc662_fixup_models[] = { {.id = ALC662_FIXUP_ASUS_MODE8, .name = "asus-mode8"}, {.id = ALC662_FIXUP_INV_DMIC, .name = "inv-dmic"}, {.id = ALC668_FIXUP_DELL_MIC_NO_PRESENCE, .name = "dell-headset-multi"}, + {.id = ALC662_FIXUP_LENOVO_MULTI_CODECS, .name = "dual-codecs"}, {} }; From a79e7df97592b2326be81d5dae286bdb5c529a01 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 22 May 2017 16:41:24 +0200 Subject: [PATCH 049/153] ALSA: hda - Update the list of quirk models I've forgotten to sync the documentation with the actually available options for some time. Now all updated. 
Signed-off-by: Takashi Iwai --- Documentation/sound/hd-audio/models.rst | 114 ++++++++++++++---------- 1 file changed, 65 insertions(+), 49 deletions(-) diff --git a/Documentation/sound/hd-audio/models.rst b/Documentation/sound/hd-audio/models.rst index 5338673c88d9..773d2bfacc6c 100644 --- a/Documentation/sound/hd-audio/models.rst +++ b/Documentation/sound/hd-audio/models.rst @@ -16,6 +16,8 @@ ALC880 6-jack in back, 2-jack in front 6stack-digout 6-jack with a SPDIF out +6stack-automute + 6-jack with headphone jack detection ALC260 ====== @@ -62,6 +64,8 @@ lenovo-dock Enables docking station I/O for some Lenovos hp-gpio-led GPIO LED support on HP laptops +hp-dock-gpio-mic1-led + HP dock with mic LED support dell-headset-multi Headset jack, which can also be used as mic-in dell-headset-dock @@ -72,6 +76,12 @@ alc283-sense-combo Combo jack sensing on ALC283 tpt440-dock Pin configs for Lenovo Thinkpad Dock support +tpt440 + Lenovo Thinkpad T440s setup +tpt460 + Lenovo Thinkpad T460/560 setup +dual-codecs + Lenovo laptops with dual codecs ALC66x/67x/892 ============== @@ -97,6 +107,8 @@ inv-dmic Inverted internal mic workaround dell-headset-multi Headset jack, which can also be used as mic-in +dual-codecs + Lenovo laptops with dual codecs ALC680 ====== @@ -114,6 +126,8 @@ inv-dmic Inverted internal mic workaround no-primary-hp VAIO Z/VGC-LN51JGB workaround (for fixed speaker DAC) +dual-codecs + ALC1220 dual codecs for Gaming mobos ALC861/660 ========== @@ -206,65 +220,47 @@ auto Conexant 5045 ============= -laptop-hpsense - Laptop with HP sense (old model laptop) -laptop-micsense - Laptop with Mic sense (old model fujitsu) -laptop-hpmicsense - Laptop with HP and Mic senses -benq - Benq R55E -laptop-hp530 - HP 530 laptop -test - for testing/debugging purpose, almost all controls can be - adjusted. 
Appearing only when compiled with $CONFIG_SND_DEBUG=y +cap-mix-amp + Fix max input level on mixer widget +toshiba-p105 + Toshiba P105 quirk +hp-530 + HP 530 quirk Conexant 5047 ============= -laptop - Basic Laptop config -laptop-hp - Laptop config for some HP models (subdevice 30A5) -laptop-eapd - Laptop config with EAPD support -test - for testing/debugging purpose, almost all controls can be - adjusted. Appearing only when compiled with $CONFIG_SND_DEBUG=y +cap-mix-amp + Fix max input level on mixer widget Conexant 5051 ============= -laptop - Basic Laptop config (default) -hp - HP Spartan laptop -hp-dv6736 - HP dv6736 -hp-f700 - HP Compaq Presario F700 -ideapad - Lenovo IdeaPad laptop -toshiba - Toshiba Satellite M300 +lenovo-x200 + Lenovo X200 quirk Conexant 5066 ============= -laptop - Basic Laptop config (default) -hp-laptop - HP laptops, e g G60 -asus - Asus K52JU, Lenovo G560 -dell-laptop - Dell laptops -dell-vostro - Dell Vostro -olpc-xo-1_5 - OLPC XO 1.5 -ideapad - Lenovo IdeaPad U150 +stereo-dmic + Workaround for inverted stereo digital mic +gpio1 + Enable GPIO1 pin +headphone-mic-pin + Enable headphone mic NID 0x18 without detection +tp410 + Thinkpad T400 & co quirks thinkpad - Lenovo Thinkpad + Thinkpad mute/mic LED quirk +lemote-a1004 + Lemote A1004 quirk +lemote-a1205 + Lemote A1205 quirk +olpc-xo + OLPC XO quirk +mute-led-eapd + Mute LED control via EAPD +hp-dock + HP dock support +mute-led-gpio + Mute LED control via GPIO STAC9200 ======== @@ -444,6 +440,8 @@ dell-eq Dell desktops/laptops alienware Alienware M17x +asus-mobo + Pin configs for ASUS mobo with 5.1/SPDIF out auto BIOS setup (default) @@ -477,6 +475,8 @@ hp-envy-ts-bass Pin fixup for HP Envy TS bass speaker (NID 0x10) hp-bnb13-eq Hardware equalizer setup for HP laptops +hp-envy-ts-bass + HP Envy TS bass support auto BIOS setup (default) @@ -496,10 +496,22 @@ auto Cirrus Logic CS4206/4207 ======================== +mbp53 + MacBook Pro 5,3 mbp55 MacBook Pro 5,5 imac27 IMac 27 Inch 
+imac27_122 + iMac 12,2 +apple + Generic Apple quirk +mbp101 + MacBookPro 10,1 +mbp81 + MacBookPro 8,1 +mba42 + MacBookAir 4,2 auto BIOS setup (default) @@ -509,6 +521,10 @@ mba6 MacBook Air 6,1 and 6,2 gpio0 Enable GPIO 0 amp +mbp11 + MacBookPro 11,2 +macmini + MacMini 7,1 auto BIOS setup (default) From 9e7b9a25e170722f15ed54f5b963e9867f79195d Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Mon, 22 May 2017 13:09:19 +0200 Subject: [PATCH 050/153] mmc: cavium: Prevent crash with incomplete DT In case the DT specifies neither a regulator nor a gpio for the shared power the driver will crash accessing the regulator. Prevent the crash by checking the regulator before use. Use mmc_regulator_get_supply() instead of open coding the same logic. Signed-off-by: Jan Glauber Signed-off-by: Ulf Hansson --- drivers/mmc/host/cavium.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/mmc/host/cavium.c b/drivers/mmc/host/cavium.c index 58b51ba6aabd..b8aaf0fdb77c 100644 --- a/drivers/mmc/host/cavium.c +++ b/drivers/mmc/host/cavium.c @@ -839,14 +839,14 @@ static void cvm_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) cvm_mmc_reset_bus(slot); if (host->global_pwr_gpiod) host->set_shared_power(host, 0); - else + else if (!IS_ERR(mmc->supply.vmmc)) mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, 0); break; case MMC_POWER_UP: if (host->global_pwr_gpiod) host->set_shared_power(host, 1); - else + else if (!IS_ERR(mmc->supply.vmmc)) mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, ios->vdd); break; } @@ -968,20 +968,15 @@ static int cvm_mmc_of_parse(struct device *dev, struct cvm_mmc_slot *slot) return -EINVAL; } - mmc->supply.vmmc = devm_regulator_get_optional(dev, "vmmc"); - if (IS_ERR(mmc->supply.vmmc)) { - if (PTR_ERR(mmc->supply.vmmc) == -EPROBE_DEFER) - return -EPROBE_DEFER; - /* - * Legacy Octeon firmware has no regulator entry, fall-back to - * a hard-coded voltage to get a sane OCR. 
- */ + ret = mmc_regulator_get_supply(mmc); + if (ret == -EPROBE_DEFER) + return ret; + /* + * Legacy Octeon firmware has no regulator entry, fall-back to + * a hard-coded voltage to get a sane OCR. + */ + if (IS_ERR(mmc->supply.vmmc)) mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; - } else { - ret = mmc_regulator_get_ocrmask(mmc->supply.vmmc); - if (ret > 0) - mmc->ocr_avail = ret; - } /* Common MMC bindings */ ret = mmc_of_parse(mmc); From c2372c20425bd75a5527b3e2204059762190f6ca Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Mon, 22 May 2017 13:09:20 +0200 Subject: [PATCH 051/153] of/platform: Make of_platform_device_destroy globally visible of_platform_device_destroy is the counterpart to of_platform_device_create which is a non-static function. After creating a platform device it might be necessary to destroy it to deal with -EPROBE_DEFER where a repeated of_platform_device_create call would fail otherwise. Therefore also make of_platform_device_destroy globally visible. Signed-off-by: Jan Glauber Acked-by: Rob Herring Signed-off-by: Ulf Hansson --- drivers/of/platform.c | 3 ++- include/linux/of_platform.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 71fecc2debfc..703a42118ffc 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -523,7 +523,7 @@ static int __init of_platform_default_populate_init(void) arch_initcall_sync(of_platform_default_populate_init); #endif -static int of_platform_device_destroy(struct device *dev, void *data) +int of_platform_device_destroy(struct device *dev, void *data) { /* Do not touch devices not populated from the device tree */ if (!dev->of_node || !of_node_check_flag(dev->of_node, OF_POPULATED)) @@ -544,6 +544,7 @@ static int of_platform_device_destroy(struct device *dev, void *data) of_node_clear_flag(dev->of_node, OF_POPULATED_BUS); return 0; } +EXPORT_SYMBOL_GPL(of_platform_device_destroy); /** * of_platform_depopulate() - Remove
devices populated from device tree diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index dc8224ae28d5..e0d1946270f3 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -64,6 +64,7 @@ extern struct platform_device *of_platform_device_create(struct device_node *np, const char *bus_id, struct device *parent); +extern int of_platform_device_destroy(struct device *dev, void *data); extern int of_platform_bus_probe(struct device_node *root, const struct of_device_id *matches, struct device *parent); From 8fb83b142823cdd1f85f78dcf9e861e9033919f9 Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Mon, 22 May 2017 13:09:21 +0200 Subject: [PATCH 052/153] mmc: cavium: Fix probing race with regulator If the regulator probing is not yet finished this driver might catch a -EPROBE_DEFER. Returning after this condition did not remove the created platform device. On a repeated call to the probe function the of_platform_device_create fails. Calling of_platform_device_destroy after EPROBE_DEFER resolves this bug. 
Signed-off-by: Jan Glauber Signed-off-by: Ulf Hansson --- drivers/mmc/host/cavium-octeon.c | 11 ++++++++++- drivers/mmc/host/cavium-thunderx.c | 6 ++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/cavium-octeon.c b/drivers/mmc/host/cavium-octeon.c index cbb566377508..951d2cdd7888 100644 --- a/drivers/mmc/host/cavium-octeon.c +++ b/drivers/mmc/host/cavium-octeon.c @@ -288,11 +288,20 @@ static int octeon_mmc_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "Error populating slots\n"); octeon_mmc_set_shared_power(host, 0); - return ret; + goto error; } i++; } return 0; + +error: + for (i = 0; i < CAVIUM_MAX_MMC; i++) { + if (host->slot[i]) + cvm_mmc_of_slot_remove(host->slot[i]); + if (host->slot_pdev[i]) + of_platform_device_destroy(&host->slot_pdev[i]->dev, NULL); + } + return ret; } static int octeon_mmc_remove(struct platform_device *pdev) diff --git a/drivers/mmc/host/cavium-thunderx.c b/drivers/mmc/host/cavium-thunderx.c index fe3d77267cd6..b9cc95998799 100644 --- a/drivers/mmc/host/cavium-thunderx.c +++ b/drivers/mmc/host/cavium-thunderx.c @@ -146,6 +146,12 @@ static int thunder_mmc_probe(struct pci_dev *pdev, return 0; error: + for (i = 0; i < CAVIUM_MAX_MMC; i++) { + if (host->slot[i]) + cvm_mmc_of_slot_remove(host->slot[i]); + if (host->slot_pdev[i]) + of_platform_device_destroy(&host->slot_pdev[i]->dev, NULL); + } clk_disable_unprepare(host->clk); return ret; } From f5f968f2371ccdebb8a365487649673c9af68d09 Mon Sep 17 00:00:00 2001 From: Srinath Mannam Date: Thu, 18 May 2017 22:27:40 +0530 Subject: [PATCH 053/153] mmc: sdhci-iproc: suppress spurious interrupt with Multiblock read The stingray SDHCI hardware supports ACMD12 and automatically issues after multi block transfer completed. If ACMD12 in SDHCI is disabled, spurious tx done interrupts are seen on multi block read command with below error message: Got data interrupt 0x00000002 even though no data operation was in progress. 
This patch uses SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12 to enable ACMD12 support in SDHCI hardware and suppress spurious interrupt. Signed-off-by: Srinath Mannam Reviewed-by: Ray Jui Reviewed-by: Scott Branden Acked-by: Adrian Hunter Fixes: b580c52d58d9 ("mmc: sdhci-iproc: add IPROC SDHCI driver") Cc: Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-iproc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-iproc.c b/drivers/mmc/host/sdhci-iproc.c index 3275d4995812..61666d269771 100644 --- a/drivers/mmc/host/sdhci-iproc.c +++ b/drivers/mmc/host/sdhci-iproc.c @@ -187,7 +187,8 @@ static const struct sdhci_iproc_data iproc_cygnus_data = { }; static const struct sdhci_pltfm_data sdhci_iproc_pltfm_data = { - .quirks = SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK, + .quirks = SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | + SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12, .quirks2 = SDHCI_QUIRK2_ACMD23_BROKEN, .ops = &sdhci_iproc_ops, }; From 0544f5494a03b8846db74e02be5685d1f32b06c9 Mon Sep 17 00:00:00 2001 From: Marta Rybczynska Date: Mon, 10 Apr 2017 17:12:34 +0200 Subject: [PATCH 054/153] nvme-rdma: support devices with queue size < 32 In the case of small NVMe-oF queue size (<32) we may enter a deadlock caused by the fact that the IB completions aren't sent waiting for 32 and the send queue will fill up. The error is seen as (using mlx5): [ 2048.693355] mlx5_0:mlx5_ib_post_send:3765:(pid 7273): [ 2048.693360] nvme nvme1: nvme_rdma_post_send failed with error code -12 This patch changes the way the signaling is done so that it depends on the queue depth now. The magic define has been removed completely.
Cc: stable@vger.kernel.org Signed-off-by: Marta Rybczynska Signed-off-by: Samuel Jones Acked-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index dd1c6deef82f..e2c18f3d9dcf 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1038,6 +1038,19 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) nvme_rdma_wr_error(cq, wc, "SEND"); } +static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) +{ + int sig_limit; + + /* + * We signal completion every queue depth/2 and also handle the + * degenerated case of a device with queue_depth=1, where we + * would need to signal every message. + */ + sig_limit = max(queue->queue_size / 2, 1); + return (++queue->sig_count % sig_limit) == 0; +} + static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, struct ib_send_wr *first, bool flush) @@ -1065,9 +1078,6 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, * Would have been way to obvious to handle this in hardware or * at least the RDMA stack.. * - * This messy and racy code sniplet is copy and pasted from the iSER - * initiator, and the magic '32' comes from there as well. - * * Always signal the flushes. The magic request used for the flush * sequencer is not allocated in our driver's tagset and it's * triggered to be freed by blk_cleanup_queue(). So we need to @@ -1075,7 +1085,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, * embedded in request's payload, is not freed when __ib_process_cq() * calls wr_cqe->done(). 
*/ - if ((++queue->sig_count % 32) == 0 || flush) + if (nvme_rdma_queue_sig_limit(queue) || flush) wr.send_flags |= IB_SEND_SIGNALED; if (first) From 806f026f9b901eaf1a6baeb48b5da18d6a4f818e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 22 May 2017 23:05:03 +0800 Subject: [PATCH 055/153] nvme: use blk_mq_start_hw_queues() in nvme_kill_queues() Inside nvme_kill_queues(), we have to start hw queues for draining requests in sw queues, .dispatch list and requeue list, so use blk_mq_start_hw_queues() instead of blk_mq_start_stopped_hw_queues() which only run queues if queues are stopped, but the queues may have been started already, for example nvme_start_queues() is called in reset work function. blk_mq_start_hw_queues() run hw queues in current context, instead of running asynchronously like before. Given nvme_kill_queues() is run from either remove context or reset worker context, both are fine to run hw queue directly. And the mutex of namespaces_mutex isn't a problem either because nvme_start_freeze() runs hw queue in this way already. Cc: stable@vger.kernel.org Reported-by: Zhang Yi Reviewed-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d5e0906262ea..40d5e4a9e8d7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2437,7 +2437,13 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); blk_mq_abort_requeue_list(ns->queue); - blk_mq_start_stopped_hw_queues(ns->queue, true); + + /* + * Forcibly start all queues to avoid having stuck requests. + * Note that we must ensure the queues are not stopped + * when the final removal happens.
+ */ + blk_mq_start_hw_queues(ns->queue); } mutex_unlock(&ctrl->namespaces_mutex); } From 986f75c876dbafed98eba7cb516c5118f155db23 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 22 May 2017 23:05:04 +0800 Subject: [PATCH 056/153] nvme: avoid to use blk_mq_abort_requeue_list() NVMe may add request into requeue list simply and not kick off the requeue if hw queues are stopped. Then blk_mq_abort_requeue_list() is called in both nvme_kill_queues() and nvme_ns_remove() for dealing with this issue. Unfortunately blk_mq_abort_requeue_list() is absolutely a race maker, for example, one request may be requeued during the aborting. So this patch just calls blk_mq_kick_requeue_list() in nvme_kill_queues() to handle this issue like what nvme_start_queues() does. Now all requests in requeue list when queues are stopped will be handled by blk_mq_kick_requeue_list() when queues are restarted, either in nvme_start_queues() or in nvme_kill_queues(). Cc: stable@vger.kernel.org Reported-by: Zhang Yi Reviewed-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 40d5e4a9e8d7..04e115834702 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2098,7 +2098,6 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (ns->ndev) nvme_nvm_unregister_sysfs(ns); del_gendisk(ns->disk); - blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); } @@ -2436,7 +2435,6 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) continue; revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); - blk_mq_abort_requeue_list(ns->queue); /* * Forcibly start all queues to avoid having stuck requests. @@ -2444,6 +2442,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) * when the final removal happens. 
*/ blk_mq_start_hw_queues(ns->queue); + + /* draining requests in requeue list */ + blk_mq_kick_requeue_list(ns->queue); } mutex_unlock(&ctrl->namespaces_mutex); } From 7254a50a5db40ca6739ddf37e0a45e6912532b2c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 22 May 2017 23:05:05 +0800 Subject: [PATCH 057/153] blk-mq: remove blk_mq_abort_requeue_list() No one uses it any more, so remove it. Reviewed-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig --- block/blk-mq.c | 19 ------------------- include/linux/blk-mq.h | 1 - 2 files changed, 20 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a69ad122ed66..f2224ffd225d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -628,25 +628,6 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -void blk_mq_abort_requeue_list(struct request_queue *q) -{ - unsigned long flags; - LIST_HEAD(rq_list); - - spin_lock_irqsave(&q->requeue_lock, flags); - list_splice_init(&q->requeue_list, &rq_list); - spin_unlock_irqrestore(&q->requeue_lock, flags); - - while (!list_empty(&rq_list)) { - struct request *rq; - - rq = list_first_entry(&rq_list, struct request, queuelist); - list_del_init(&rq->queuelist); - blk_mq_end_request(rq, -EIO); - } -} -EXPORT_SYMBOL(blk_mq_abort_requeue_list); - struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { if (tag < tags->nr_tags) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c47aa248c640..fcd641032f8d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -238,7 +238,6 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); -void blk_mq_abort_requeue_list(struct request_queue *q); void blk_mq_complete_request(struct request *rq); bool 
blk_mq_queue_stopped(struct request_queue *q); From 0ce872bf8b5c4d425a41940a523ff1b8daa0b275 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 17:10:15 -0700 Subject: [PATCH 058/153] nvme_fc: get rid of local reconnect_delay Remove the local copy of reconnect_delay. Use the value in the controller options directly. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index dca7165fabcf..c3ab1043efbd 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -165,7 +165,6 @@ struct nvme_fc_ctrl { struct work_struct delete_work; struct work_struct reset_work; struct delayed_work connect_work; - int reconnect_delay; int connect_attempts; struct kref ref; @@ -2615,9 +2614,9 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", - ctrl->cnum, ctrl->reconnect_delay); + ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, - ctrl->reconnect_delay * HZ); + ctrl->ctrl.opts->reconnect_delay * HZ); } else dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); @@ -2695,9 +2694,9 @@ nvme_fc_connect_ctrl_work(struct work_struct *work) dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", - ctrl->cnum, ctrl->reconnect_delay); + ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, - ctrl->reconnect_delay * HZ); + ctrl->ctrl.opts->reconnect_delay * HZ); } else dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller reconnect complete\n", @@ -2755,7 +2754,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work); INIT_WORK(&ctrl->reset_work, 
nvme_fc_reset_ctrl_work); INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); - ctrl->reconnect_delay = opts->reconnect_delay; spin_lock_init(&ctrl->lock); /* io queue count */ From 5bbecdbc8e7ffaaf47ac1f02014bf3bedda3fd11 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 17:10:16 -0700 Subject: [PATCH 059/153] nvme_fc: Support ctrl_loss_tmo Sync with Sagi's recent addition of ctrl_loss_tmo in the core fabrics layer. Remove local connect limits and connect_attempts variable. Use fabrics new nr_connects variable and use of nvmf_should_reconnect() Refactor duplicate reconnect failure code. Addresses review comment by Sagi on controller reset support: http://lists.infradead.org/pipermail/linux-nvme/2017-April/009261.html Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 120 ++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 69 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index c3ab1043efbd..a0f05d5e966c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -45,8 +45,6 @@ enum nvme_fc_queue_flags { #define NVMEFC_QUEUE_DELAY 3 /* ms units */ -#define NVME_FC_MAX_CONNECT_ATTEMPTS 1 - struct nvme_fc_queue { struct nvme_fc_ctrl *ctrl; struct device *dev; @@ -165,7 +163,6 @@ struct nvme_fc_ctrl { struct work_struct delete_work; struct work_struct reset_work; struct delayed_work connect_work; - int connect_attempts; struct kref ref; u32 flags; @@ -2305,7 +2302,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) int ret; bool changed; - ctrl->connect_attempts++; + ++ctrl->ctrl.opts->nr_reconnects; /* * Create the admin queue @@ -2402,7 +2399,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - ctrl->connect_attempts = 0; + ctrl->ctrl.opts->nr_reconnects = 0; 
kref_get(&ctrl->ctrl.kref); @@ -2545,16 +2542,22 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) nvme_put_ctrl(&ctrl->ctrl); } +static bool +__nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) + return true; + + if (!queue_work(nvme_fc_wq, &ctrl->delete_work)) + return true; + + return false; +} + static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl) { - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - - if (!queue_work(nvme_fc_wq, &ctrl->delete_work)) - return -EBUSY; - - return 0; + return __nvme_fc_schedule_delete_work(ctrl) ? -EBUSY : 0; } /* @@ -2579,6 +2582,35 @@ nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) return ret; } +static void +nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) +{ + /* If we are resetting/deleting then do nothing */ + if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) { + WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || + ctrl->ctrl.state == NVME_CTRL_LIVE); + return; + } + + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", + ctrl->cnum, status); + + if (nvmf_should_reconnect(&ctrl->ctrl)) { + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", + ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); + queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, + ctrl->ctrl.opts->reconnect_delay * HZ); + } else { + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: Max reconnect attempts (%d) " + "reached. 
Removing controller\n", + ctrl->cnum, ctrl->ctrl.opts->nr_reconnects); + WARN_ON(__nvme_fc_schedule_delete_work(ctrl)); + } +} + static void nvme_fc_reset_ctrl_work(struct work_struct *work) { @@ -2590,34 +2622,9 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) nvme_fc_delete_association(ctrl); ret = nvme_fc_create_association(ctrl); - if (ret) { - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", - ctrl->cnum, ret); - if (ctrl->connect_attempts >= NVME_FC_MAX_CONNECT_ATTEMPTS) { - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Max reconnect attempts (%d) " - "reached. Removing controller\n", - ctrl->cnum, ctrl->connect_attempts); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, - NVME_CTRL_DELETING)) { - dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: failed to change state " - "to DELETING\n", ctrl->cnum); - return; - } - - WARN_ON(!queue_work(nvme_fc_wq, &ctrl->delete_work)); - return; - } - - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", - ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, - ctrl->ctrl.opts->reconnect_delay * HZ); - } else + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); } @@ -2670,34 +2677,9 @@ nvme_fc_connect_ctrl_work(struct work_struct *work) struct nvme_fc_ctrl, connect_work); ret = nvme_fc_create_association(ctrl); - if (ret) { - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Reconnect attempt failed (%d)\n", - ctrl->cnum, ret); - if (ctrl->connect_attempts >= NVME_FC_MAX_CONNECT_ATTEMPTS) { - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Max reconnect attempts (%d) " - "reached. 
Removing controller\n", - ctrl->cnum, ctrl->connect_attempts); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, - NVME_CTRL_DELETING)) { - dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: failed to change state " - "to DELETING\n", ctrl->cnum); - return; - } - - WARN_ON(!queue_work(nvme_fc_wq, &ctrl->delete_work)); - return; - } - - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", - ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, - ctrl->ctrl.opts->reconnect_delay * HZ); - } else + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller reconnect complete\n", ctrl->cnum); @@ -2969,7 +2951,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) static struct nvmf_transport_ops nvme_fc_transport = { .name = "fc", .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR, - .allowed_opts = NVMF_OPT_RECONNECT_DELAY, + .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO, .create_ctrl = nvme_fc_create_ctrl, }; From a5321aa5efea05ae748dc5b3e8053584213325ca Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 17:10:18 -0700 Subject: [PATCH 060/153] nvme_fc: revise comment on teardown Per the recommendation by Sagi on: http://lists.infradead.org/pipermail/linux-nvme/2017-April/009261.html An extra reference was pointed out. There's no issue with the references, but rather a literal interpretation of what the comment is saying. Reword the comment to avoid confusion. 
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index a0f05d5e966c..0b7f7dd2779a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2532,10 +2532,10 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) /* * tear down the controller - * This will result in the last reference on the nvme ctrl to - * expire, calling the transport nvme_fc_nvme_ctrl_freed() callback. - * From there, the transport will tear down it's logical queues and - * association. + * After the last reference on the nvme ctrl is removed, + * the transport nvme_fc_nvme_ctrl_freed() callback will be + * invoked. From there, the transport will tear down it's + * logical queues and association. */ nvme_uninit_ctrl(&ctrl->ctrl); From 589ff7753bb54edd3ee4a9399ccc3ac48d9b22d7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 17:10:19 -0700 Subject: [PATCH 061/153] nvme_fc: set logging level on resets/deletes Per the review by Sagi on: http://lists.infradead.org/pipermail/linux-nvme/2017-April/009261.html Looked at existing warn vs info vs err dev_xxx levels for the messages printed on reconnects and deletes: - Resets due to error and resets transitioned to deletes are dev_warn - Other reset/disconnect messages are dev_info - Removed chatty io queue related messages Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 0b7f7dd2779a..e4817f9f4323 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1747,7 +1747,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) dev_warn(ctrl->ctrl.device, 
"NVME-FC{%d}: transport association error detected: %s\n", ctrl->cnum, errmsg); - dev_info(ctrl->ctrl.device, + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: resetting controller\n", ctrl->cnum); /* stop the queues on error, cleanup is in reset thread */ @@ -2191,9 +2191,6 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) if (!opts->nr_io_queues) return 0; - dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", - opts->nr_io_queues); - nvme_fc_init_io_queues(ctrl); memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); @@ -2264,9 +2261,6 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) if (ctrl->queue_count == 1) return 0; - dev_info(ctrl->ctrl.device, "Recreating %d I/O queues.\n", - opts->nr_io_queues); - nvme_fc_init_io_queues(ctrl); ret = blk_mq_reinit_tagset(&ctrl->tag_set); @@ -2592,7 +2586,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) return; } - dev_warn(ctrl->ctrl.device, + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", ctrl->cnum, status); @@ -2603,7 +2597,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, ctrl->ctrl.opts->reconnect_delay * HZ); } else { - dev_info(ctrl->ctrl.device, + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", ctrl->cnum, ctrl->ctrl.opts->nr_reconnects); @@ -2638,7 +2632,7 @@ nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - dev_warn(ctrl->ctrl.device, + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: admin requested controller reset\n", ctrl->cnum); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) From e392e1f1f408fe8baf1046c970d05cbf1f0ec945 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 17:10:24 -0700 Subject: [PATCH 062/153] nvme_fc: correct nvme status set on abort correct nvme status set on abort. 
Patch that changed status to being actual nvme status crossed in the night with the patch that added abort values. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e4817f9f4323..775869c69df6 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1372,9 +1372,9 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); if (!complete_rq) { if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { - status = cpu_to_le16(NVME_SC_ABORT_REQ); + status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); if (blk_queue_dying(rq->q)) - status |= cpu_to_le16(NVME_SC_DNR); + status |= cpu_to_le16(NVME_SC_DNR << 1); } nvme_end_request(rq, status, result); } else From 2cb657bc0242dfdca20869685bf179774ef1a6fb Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 17:10:22 -0700 Subject: [PATCH 063/153] nvme_fc: remove extra controller reference taken on reconnect fix extra controller reference taken on reconnect by moving reference to initial controller create Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 775869c69df6..14a009e43aa5 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2395,8 +2395,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.opts->nr_reconnects = 0; - kref_get(&ctrl->ctrl.kref); - if (ctrl->queue_count > 1) { nvme_start_queues(&ctrl->ctrl); nvme_queue_scan(&ctrl->ctrl); @@ -2793,7 +2791,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.opts = NULL; /* initiate nvme ctrl ref counting teardown */ 
nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); /* as we're past the point where we transition to the ref * counting teardown path, if we return a bad pointer here, @@ -2809,6 +2806,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, return ERR_PTR(ret); } + kref_get(&ctrl->ctrl.kref); + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", ctrl->cnum, ctrl->ctrl.opts->subsysnqn); From aace34c0bb8ea3c8bdcec865b6a4be4db0a68e33 Mon Sep 17 00:00:00 2001 From: Tin Huynh Date: Mon, 22 May 2017 16:19:20 +0700 Subject: [PATCH 064/153] leds: pca955x: Correct I2C Functionality The driver checks an incorrect flag of functionality of adapter. When a driver requires i2c_smbus_read_byte_data and i2c_smbus_write_byte_data, it should check I2C_FUNC_SMBUS_BYTE_DATA instead I2C_FUNC_I2C. This patch fixes the problem. Signed-off-by: Tin Huynh Signed-off-by: Jacek Anaszewski --- drivers/leds/leds-pca955x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/leds-pca955x.c b/drivers/leds/leds-pca955x.c index 78a7ce816a47..9a873118ea5f 100644 --- a/drivers/leds/leds-pca955x.c +++ b/drivers/leds/leds-pca955x.c @@ -285,7 +285,7 @@ static int pca955x_probe(struct i2c_client *client, "slave address 0x%02x\n", client->name, chip->bits, client->addr); - if (!i2c_check_functionality(adapter, I2C_FUNC_I2C)) + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA)) return -EIO; if (pdata) { From 5b81fc3cc625e857275573cb4240bbab553f919c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 17 May 2017 13:07:24 -0700 Subject: [PATCH 065/153] blk-throttle: add hierarchy support for latency target and idle time For idle time, children's setting should not be bigger than parent's. For latency target, children's setting should not be smaller than parent's. The leaf nodes will adjust their settings according to the hierarchy and compare their IO with the settings and do upgrade/downgrade. 
parents nodes don't need to track their IO latency/idle time. Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 50 +++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b78db2e5fdff..16174f8cb0a1 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -157,6 +157,7 @@ struct throtl_grp { unsigned long last_check_time; unsigned long latency_target; /* us */ + unsigned long latency_target_conf; /* us */ /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; @@ -165,6 +166,7 @@ struct throtl_grp { unsigned long checked_last_finish_time; /* ns / 1024 */ unsigned long avg_idletime; /* ns / 1024 */ unsigned long idletime_threshold; /* us */ + unsigned long idletime_threshold_conf; /* us */ unsigned int bio_cnt; /* total bios */ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ @@ -482,6 +484,7 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) /* LIMIT_LOW will have default value 0 */ tg->latency_target = DFL_LATENCY_TARGET; + tg->latency_target_conf = DFL_LATENCY_TARGET; return &tg->pd; } @@ -512,6 +515,7 @@ static void throtl_pd_init(struct blkg_policy_data *pd) tg->td = td; tg->idletime_threshold = td->dft_idletime_threshold; + tg->idletime_threshold_conf = td->dft_idletime_threshold; } /* @@ -1367,8 +1371,25 @@ static void tg_conf_updated(struct throtl_grp *tg) * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. 
*/ - blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) - tg_update_has_rules(blkg_to_tg(blkg)); + blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) { + struct throtl_grp *this_tg = blkg_to_tg(blkg); + struct throtl_grp *parent_tg; + + tg_update_has_rules(this_tg); + /* ignore root/second level */ + if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || + !blkg->parent->parent) + continue; + parent_tg = blkg_to_tg(blkg->parent); + /* + * make sure all children has lower idle time threshold and + * higher latency target + */ + this_tg->idletime_threshold = min(this_tg->idletime_threshold, + parent_tg->idletime_threshold); + this_tg->latency_target = max(this_tg->latency_target, + parent_tg->latency_target); + } /* * We're already holding queue_lock and know @tg is valid. Let's @@ -1497,8 +1518,8 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, tg->iops_conf[READ][off] == iops_dft && tg->iops_conf[WRITE][off] == iops_dft && (off != LIMIT_LOW || - (tg->idletime_threshold == tg->td->dft_idletime_threshold && - tg->latency_target == DFL_LATENCY_TARGET))) + (tg->idletime_threshold_conf == tg->td->dft_idletime_threshold && + tg->latency_target_conf == DFL_LATENCY_TARGET))) return 0; if (tg->bps_conf[READ][off] != bps_dft) @@ -1514,17 +1535,17 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops_conf[WRITE][off]); if (off == LIMIT_LOW) { - if (tg->idletime_threshold == ULONG_MAX) + if (tg->idletime_threshold_conf == ULONG_MAX) strcpy(idle_time, " idle=max"); else snprintf(idle_time, sizeof(idle_time), " idle=%lu", - tg->idletime_threshold); + tg->idletime_threshold_conf); - if (tg->latency_target == ULONG_MAX) + if (tg->latency_target_conf == ULONG_MAX) strcpy(latency_time, " latency=max"); else snprintf(latency_time, sizeof(latency_time), - " latency=%lu", tg->latency_target); + " latency=%lu", tg->latency_target_conf); } seq_printf(sf, "%s 
rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", @@ -1563,8 +1584,8 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, v[2] = tg->iops_conf[READ][index]; v[3] = tg->iops_conf[WRITE][index]; - idle_time = tg->idletime_threshold; - latency_time = tg->latency_target; + idle_time = tg->idletime_threshold_conf; + latency_time = tg->latency_target_conf; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; @@ -1628,10 +1649,10 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, blk_throtl_update_limit_valid(tg->td); if (tg->td->limit_valid[LIMIT_LOW]) tg->td->limit_index = LIMIT_LOW; - tg->idletime_threshold = (idle_time == ULONG_MAX) ? - ULONG_MAX : idle_time; - tg->latency_target = (latency_time == ULONG_MAX) ? - ULONG_MAX : latency_time; + tg->idletime_threshold_conf = idle_time; + tg->idletime_threshold = tg->idletime_threshold_conf; + tg->latency_target_conf = latency_time; + tg->latency_target = tg->latency_target_conf; } tg_conf_updated(tg); ret = 0; @@ -2385,6 +2406,7 @@ void blk_throtl_register_queue(struct request_queue *q) struct throtl_grp *tg = blkg_to_tg(blkg); tg->idletime_threshold = td->dft_idletime_threshold; + tg->idletime_threshold_conf = td->dft_idletime_threshold; } rcu_read_unlock(); } From 4cff729f62d1bd433178f1ffe09db5718835e925 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 17 May 2017 13:07:25 -0700 Subject: [PATCH 066/153] blk-throttle: output some debug info in trace These info are important to understand what's happening and help debug. 
Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 16174f8cb0a1..1f8d62f5e808 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1748,12 +1748,18 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) * - IO latency is largely below threshold */ unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice); + bool ret; time = min_t(unsigned long, MAX_IDLE_TIME, time); - return (ktime_get_ns() >> 10) - tg->last_finish_time > time || + ret = (ktime_get_ns() >> 10) - tg->last_finish_time > time || tg->avg_idletime > tg->idletime_threshold || (tg->latency_target && tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt); + throtl_log(&tg->service_queue, + "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d", + tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt, + tg->bio_cnt, ret, tg->td->scale); + return ret; } static bool throtl_tg_can_upgrade(struct throtl_grp *tg) @@ -1849,6 +1855,7 @@ static void throtl_upgrade_state(struct throtl_data *td) struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; + throtl_log(&td->service_queue, "upgrade to max"); td->limit_index = LIMIT_MAX; td->low_upgrade_time = jiffies; td->scale = 0; @@ -1871,6 +1878,7 @@ static void throtl_downgrade_state(struct throtl_data *td, int new) { td->scale /= 2; + throtl_log(&td->service_queue, "downgrade, scale %d", td->scale); if (td->scale) { td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; return; @@ -2044,6 +2052,11 @@ static void throtl_update_latency_buckets(struct throtl_data *td) td->avg_buckets[i].valid = true; last_latency = td->avg_buckets[i].latency; } + + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) + throtl_log(&td->service_queue, + "Latency bucket %d: latency=%ld, valid=%d", i, + td->avg_buckets[i].latency, td->avg_buckets[i].valid); } #else 
static inline void throtl_update_latency_buckets(struct throtl_data *td) From 9bb67aeb96784527dbc784c7a1b234461299363c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 17 May 2017 13:07:26 -0700 Subject: [PATCH 067/153] blk-throttle: respect 0 bps/iops settings for io.low If a cgroup with low limit 0 for both bps/iops, the cgroup's low limit is ignored and we throttle the cgroup with its max limit. In this way, other cgroups with a low limit will not get protected. To fix this, we don't do the exception any more. cgroup will be throttled to a limit 0 if it uses default setting. To avoid complete stall, we give such cgroup tiny IO resources. Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 1f8d62f5e808..f6a9f42a0ad7 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -27,6 +27,8 @@ static int throtl_quantum = 32; #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ /* default latency target is 0, eg, guarantee IO latency by default */ #define DFL_LATENCY_TARGET (0) +#define MIN_THROTL_BPS (320 * 1024) +#define MIN_THROTL_IOPS (10) #define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT) @@ -296,8 +298,14 @@ static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) td = tg->td; ret = tg->bps[rw][td->limit_index]; - if (ret == 0 && td->limit_index == LIMIT_LOW) - return tg->bps[rw][LIMIT_MAX]; + if (ret == 0 && td->limit_index == LIMIT_LOW) { + /* intermediate node or iops isn't 0 */ + if (!list_empty(&blkg->blkcg->css.children) || + tg->iops[rw][td->limit_index]) + return U64_MAX; + else + return MIN_THROTL_BPS; + } if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { @@ -317,10 +325,17 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) if
(cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return UINT_MAX; + td = tg->td; ret = tg->iops[rw][td->limit_index]; - if (ret == 0 && tg->td->limit_index == LIMIT_LOW) - return tg->iops[rw][LIMIT_MAX]; + if (ret == 0 && tg->td->limit_index == LIMIT_LOW) { + /* intermediate node or bps isn't 0 */ + if (!list_empty(&blkg->blkcg->css.children) || + tg->bps[rw][td->limit_index]) + return UINT_MAX; + else + return MIN_THROTL_IOPS; + } if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { @@ -1353,7 +1368,7 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static void tg_conf_updated(struct throtl_grp *tg) +static void tg_conf_updated(struct throtl_grp *tg, bool global) { struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; @@ -1371,7 +1386,8 @@ static void tg_conf_updated(struct throtl_grp *tg) * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) { + blkg_for_each_descendant_pre(blkg, pos_css, + global ? 
tg->td->queue->root_blkg : tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg); struct throtl_grp *parent_tg; @@ -1434,7 +1450,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, else *(unsigned int *)((void *)tg + of_cft(of)->private) = v; - tg_conf_updated(tg); + tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_finish(&ctx); @@ -1522,16 +1538,16 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, tg->latency_target_conf == DFL_LATENCY_TARGET))) return 0; - if (tg->bps_conf[READ][off] != bps_dft) + if (tg->bps_conf[READ][off] != U64_MAX) snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps_conf[READ][off]); - if (tg->bps_conf[WRITE][off] != bps_dft) + if (tg->bps_conf[WRITE][off] != U64_MAX) snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps_conf[WRITE][off]); - if (tg->iops_conf[READ][off] != iops_dft) + if (tg->iops_conf[READ][off] != UINT_MAX) snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops_conf[READ][off]); - if (tg->iops_conf[WRITE][off] != iops_dft) + if (tg->iops_conf[WRITE][off] != UINT_MAX) snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops_conf[WRITE][off]); if (off == LIMIT_LOW) { @@ -1654,7 +1670,8 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, tg->latency_target_conf = latency_time; tg->latency_target = tg->latency_target_conf; } - tg_conf_updated(tg); + tg_conf_updated(tg, index == LIMIT_LOW && + tg->td->limit_valid[LIMIT_LOW]); ret = 0; out_finish: blkg_conf_finish(&ctx); From b4f428ef2844e9fa8154f2faaca249aa74e222a7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 17 May 2017 13:07:27 -0700 Subject: [PATCH 068/153] blk-throttle: force user to configure all settings for io.low Default value of io.low limit is 0. If user doesn't configure the limit, last patch makes cgroup be throttled to very tiny bps/iops, which could stall the system. 
A cgroup with default settings of io.low limit really means nothing, so we force user to configure all settings, otherwise io.low limit doesn't take effect. With this stragety, default setting of latency/idle isn't important, so just set them to very conservative and safe value. Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 78 ++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 42 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f6a9f42a0ad7..fc13dd0c6e39 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -22,13 +22,11 @@ static int throtl_quantum = 32; #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) -#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */ -#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */ #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ -/* default latency target is 0, eg, guarantee IO latency by default */ -#define DFL_LATENCY_TARGET (0) #define MIN_THROTL_BPS (320 * 1024) #define MIN_THROTL_IOPS (10) +#define DFL_LATENCY_TARGET (-1L) +#define DFL_IDLE_THRESHOLD (0) #define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT) @@ -205,8 +203,6 @@ struct throtl_data unsigned int limit_index; bool limit_valid[LIMIT_CNT]; - unsigned long dft_idletime_threshold; /* us */ - unsigned long low_upgrade_time; unsigned long low_downgrade_time; @@ -500,6 +496,8 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->latency_target = DFL_LATENCY_TARGET; tg->latency_target_conf = DFL_LATENCY_TARGET; + tg->idletime_threshold = DFL_IDLE_THRESHOLD; + tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; return &tg->pd; } @@ -528,9 +526,6 @@ static void throtl_pd_init(struct blkg_policy_data *pd) if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; - - tg->idletime_threshold = 
td->dft_idletime_threshold; - tg->idletime_threshold_conf = td->dft_idletime_threshold; } /* @@ -1534,7 +1529,7 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, tg->iops_conf[READ][off] == iops_dft && tg->iops_conf[WRITE][off] == iops_dft && (off != LIMIT_LOW || - (tg->idletime_threshold_conf == tg->td->dft_idletime_threshold && + (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD && tg->latency_target_conf == DFL_LATENCY_TARGET))) return 0; @@ -1660,16 +1655,31 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, tg->iops_conf[READ][LIMIT_MAX]); tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], tg->iops_conf[WRITE][LIMIT_MAX]); + tg->idletime_threshold_conf = idle_time; + tg->latency_target_conf = latency_time; - if (index == LIMIT_LOW) { - blk_throtl_update_limit_valid(tg->td); - if (tg->td->limit_valid[LIMIT_LOW]) - tg->td->limit_index = LIMIT_LOW; - tg->idletime_threshold_conf = idle_time; + /* force user to configure all settings for low limit */ + if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] || + tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) || + tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD || + tg->latency_target_conf == DFL_LATENCY_TARGET) { + tg->bps[READ][LIMIT_LOW] = 0; + tg->bps[WRITE][LIMIT_LOW] = 0; + tg->iops[READ][LIMIT_LOW] = 0; + tg->iops[WRITE][LIMIT_LOW] = 0; + tg->idletime_threshold = DFL_IDLE_THRESHOLD; + tg->latency_target = DFL_LATENCY_TARGET; + } else if (index == LIMIT_LOW) { tg->idletime_threshold = tg->idletime_threshold_conf; - tg->latency_target_conf = latency_time; tg->latency_target = tg->latency_target_conf; } + + blk_throtl_update_limit_valid(tg->td); + if (tg->td->limit_valid[LIMIT_LOW]) { + if (index == LIMIT_LOW) + tg->td->limit_index = LIMIT_LOW; + } else + tg->td->limit_index = LIMIT_MAX; tg_conf_updated(tg, index == LIMIT_LOW && tg->td->limit_valid[LIMIT_LOW]); ret = 0; @@ -1760,17 +1770,19 @@ static bool throtl_tg_is_idle(struct throtl_grp 
*tg) /* * cgroup is idle if: * - single idle is too long, longer than a fixed value (in case user - * configure a too big threshold) or 4 times of slice + * configure a too big threshold) or 4 times of idletime threshold * - average think time is more than threshold * - IO latency is largely below threshold */ - unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice); + unsigned long time; bool ret; - time = min_t(unsigned long, MAX_IDLE_TIME, time); - ret = (ktime_get_ns() >> 10) - tg->last_finish_time > time || - tg->avg_idletime > tg->idletime_threshold || - (tg->latency_target && tg->bio_cnt && + time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); + ret = tg->latency_target == DFL_LATENCY_TARGET || + tg->idletime_threshold == DFL_IDLE_THRESHOLD || + (ktime_get_ns() >> 10) - tg->last_finish_time > time || + tg->avg_idletime > tg->idletime_threshold || + (tg->latency_target && tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt); throtl_log(&tg->service_queue, "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d", @@ -2405,19 +2417,14 @@ void blk_throtl_exit(struct request_queue *q) void blk_throtl_register_queue(struct request_queue *q) { struct throtl_data *td; - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; td = q->td; BUG_ON(!td); - if (blk_queue_nonrot(q)) { + if (blk_queue_nonrot(q)) td->throtl_slice = DFL_THROTL_SLICE_SSD; - td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD; - } else { + else td->throtl_slice = DFL_THROTL_SLICE_HD; - td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD; - } #ifndef CONFIG_BLK_DEV_THROTTLING_LOW /* if no low limit, use previous default */ td->throtl_slice = DFL_THROTL_SLICE_HD; @@ -2426,19 +2433,6 @@ void blk_throtl_register_queue(struct request_queue *q) td->track_bio_latency = !q->mq_ops && !q->request_fn; if (!td->track_bio_latency) blk_stat_enable_accounting(q); - - /* - * some tg are created before queue is fully initialized, eg, nonrot - * isn't 
initialized yet - */ - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - - tg->idletime_threshold = td->dft_idletime_threshold; - tg->idletime_threshold_conf = td->dft_idletime_threshold; - } - rcu_read_unlock(); } #ifdef CONFIG_BLK_DEV_THROTTLING_LOW From c849e55178f559c4bbed43efb113cb7602aade89 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 16 May 2017 19:21:08 +0200 Subject: [PATCH 069/153] PCI: endpoint: Make PCI_ENDPOINT depend on HAS_DMA If NO_DMA=y: drivers/built-in.o: In function `__pci_epc_create': (.text+0xef4e): undefined reference to `bad_dma_ops' drivers/built-in.o: In function `pci_epc_add_epf': (.text+0xf676): undefined reference to `bad_dma_ops' drivers/built-in.o: In function `pci_epf_alloc_space': (.text+0xfa32): undefined reference to `bad_dma_ops' drivers/built-in.o: In function `pci_epf_free_space': (.text+0xfac4): undefined reference to `bad_dma_ops' Add a dependency on HAS_DMA to fix this. Signed-off-by: Geert Uytterhoeven Signed-off-by: Bjorn Helgaas --- drivers/pci/endpoint/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/endpoint/Kconfig b/drivers/pci/endpoint/Kconfig index c23f146fb5a6..c09623ca8c3b 100644 --- a/drivers/pci/endpoint/Kconfig +++ b/drivers/pci/endpoint/Kconfig @@ -6,6 +6,7 @@ menu "PCI Endpoint" config PCI_ENDPOINT bool "PCI Endpoint Support" + depends on HAS_DMA help Enable this configuration option to support configurable PCI endpoint. This should be enabled if the platform has a PCI From e40cf640b8f632091a30ef0b030c83546f07c902 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 22 May 2017 16:52:24 -0500 Subject: [PATCH 070/153] switchtec: Use new cdev_device_add() helper function Convert from "cdev_add() + device_add()" to cdev_device_add(), and from "device_del() + cdev_del()" to cdev_device_del(). 
[bhelgaas: changelog] Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas --- drivers/pci/switch/switchtec.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index cc6e085008fb..abaa227a5f34 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -1291,7 +1291,6 @@ static struct switchtec_dev *stdev_create(struct pci_dev *pdev) cdev = &stdev->cdev; cdev_init(cdev, &switchtec_fops); cdev->owner = THIS_MODULE; - cdev->kobj.parent = &dev->kobj; return stdev; @@ -1479,11 +1478,7 @@ static int switchtec_pci_probe(struct pci_dev *pdev, SWITCHTEC_EVENT_EN_IRQ, &stdev->mmio_part_cfg->mrpc_comp_hdr); - rc = cdev_add(&stdev->cdev, stdev->dev.devt, 1); - if (rc) - goto err_put; - - rc = device_add(&stdev->dev); + rc = cdev_device_add(&stdev->cdev, &stdev->dev); if (rc) goto err_devadd; @@ -1492,7 +1487,6 @@ static int switchtec_pci_probe(struct pci_dev *pdev, return 0; err_devadd: - cdev_del(&stdev->cdev); stdev_kill(stdev); err_put: ida_simple_remove(&switchtec_minor_ida, MINOR(stdev->dev.devt)); @@ -1506,8 +1500,7 @@ static void switchtec_pci_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); - device_del(&stdev->dev); - cdev_del(&stdev->cdev); + cdev_device_del(&stdev->cdev, &stdev->dev); ida_simple_remove(&switchtec_minor_ida, MINOR(stdev->dev.devt)); dev_info(&stdev->dev, "unregistered.\n"); From 9871e9bb5cf6ff0b51457ca74c270c5c5230b224 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 22 May 2017 16:52:30 -0500 Subject: [PATCH 071/153] switchtec: Fix minor bug with partition ID register When a switch endpoint is configured without NTB, the mmio_ntb registers will read all zeros. However, in corner case configurations where the partition ID is not zero and NTB is not enabled, the code will have the wrong partition ID and this causes the driver to use the wrong set of drivers. 
To fix this we simply take the partition ID from the system info region. Reported-by: Dingbao Chen Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas --- drivers/pci/switch/switchtec.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index abaa227a5f34..f6a63406c76e 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -1441,12 +1441,15 @@ static int switchtec_init_pci(struct switchtec_dev *stdev, stdev->mmio_sys_info = stdev->mmio + SWITCHTEC_GAS_SYS_INFO_OFFSET; stdev->mmio_flash_info = stdev->mmio + SWITCHTEC_GAS_FLASH_INFO_OFFSET; stdev->mmio_ntb = stdev->mmio + SWITCHTEC_GAS_NTB_OFFSET; - stdev->partition = ioread8(&stdev->mmio_ntb->partition_id); + stdev->partition = ioread8(&stdev->mmio_sys_info->partition_id); stdev->partition_count = ioread8(&stdev->mmio_ntb->partition_count); stdev->mmio_part_cfg_all = stdev->mmio + SWITCHTEC_GAS_PART_CFG_OFFSET; stdev->mmio_part_cfg = &stdev->mmio_part_cfg_all[stdev->partition]; stdev->mmio_pff_csr = stdev->mmio + SWITCHTEC_GAS_PFF_CSR_OFFSET; + if (stdev->partition_count < 1) + stdev->partition_count = 1; + init_pff(stdev); pci_set_drvdata(pdev, stdev); From 415b6185c541dc0a21457ff307cdb61950a6eb9f Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 22 May 2017 17:06:30 -0500 Subject: [PATCH 072/153] PCI: imx6: Fix config read timeout handling Commit cc7b0d495589 ("PCI: designware: Update PCI config space remap function") made PCI configuration requests non-posted, which means we now get a synchronous abort when the CFG space read to probe for downstream devices times out. Synchronous aborts need to be handled differently from the async aborts we were getting before, in particular the PC needs to be advanced when resolving the abort. This is mostly a copy of what other PCI drivers do on ARM to handle those aborts. 
[bhelgaas: changelog, "Fixes"] Fixes: cc7b0d495589 ("PCI: designware: Update PCI config space remap function") Tested-by: Fabio Estevam Tested-by: Peter Senna Tschudin Signed-off-by: Lucas Stach Signed-off-by: Bjorn Helgaas Acked-by: Richard Zhu --- drivers/pci/dwc/pci-imx6.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/drivers/pci/dwc/pci-imx6.c b/drivers/pci/dwc/pci-imx6.c index a98cba55c7f0..19a289b8cc94 100644 --- a/drivers/pci/dwc/pci-imx6.c +++ b/drivers/pci/dwc/pci-imx6.c @@ -252,7 +252,34 @@ static void imx6_pcie_reset_phy(struct imx6_pcie *imx6_pcie) static int imx6q_pcie_abort_handler(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { - return 0; + unsigned long pc = instruction_pointer(regs); + unsigned long instr = *(unsigned long *)pc; + int reg = (instr >> 12) & 15; + + /* + * If the instruction being executed was a read, + * make it look like it read all-ones. + */ + if ((instr & 0x0c100000) == 0x04100000) { + unsigned long val; + + if (instr & 0x00400000) + val = 255; + else + val = -1; + + regs->uregs[reg] = val; + regs->ARM_pc += 4; + return 0; + } + + if ((instr & 0x0e100090) == 0x00100090) { + regs->uregs[reg] = -1; + regs->ARM_pc += 4; + return 0; + } + + return 1; } static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) @@ -819,8 +846,8 @@ static int __init imx6_pcie_init(void) * we can install the handler here without risking it * accessing some uninitialized driver state. 
*/ - hook_fault_code(16 + 6, imx6q_pcie_abort_handler, SIGBUS, 0, - "imprecise external abort"); + hook_fault_code(8, imx6q_pcie_abort_handler, SIGBUS, 0, + "external abort on non-linefetch"); return platform_driver_register(&imx6_pcie_driver); } From 1fc2e41f7af4572b07190f9dec28396b418e9a36 Mon Sep 17 00:00:00 2001 From: Alexander Tsoy Date: Mon, 22 May 2017 20:58:11 +0300 Subject: [PATCH 073/153] ALSA: hda - apply STAC_9200_DELL_M22 quirk for Dell Latitude D430 This model is actually called 92XXM2-8 in Windows driver. But since pin configs for M22 and M28 are identical, just reuse M22 quirk. Fixes external microphone (tested) and probably docking station ports (not tested). Signed-off-by: Alexander Tsoy Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_sigmatel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index faa3d38bac0b..6cefdf6c0b75 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -1559,6 +1559,8 @@ static const struct snd_pci_quirk stac9200_fixup_tbl[] = { "Dell Inspiron 1501", STAC_9200_DELL_M26), SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x01f6, "unknown Dell", STAC_9200_DELL_M26), + SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0201, + "Dell Latitude D430", STAC_9200_DELL_M22), /* Panasonic */ SND_PCI_QUIRK(0x10f7, 0x8338, "Panasonic CF-74", STAC_9200_PANASONIC), /* Gateway machines needs EAPD to be set on resume */ From 429030bc944ee9a8bbe5d9bb23dcda0ae2205450 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 19 May 2017 14:58:19 -0300 Subject: [PATCH 074/153] drm: qxl: Delay entering atomic context during cursor update qxl_release_map will enter an atomic context, but since we still need to alloc memory for BOs, we better delay that until we have everything we need, in case we need to sleep inside the allocation. This avoids the Sleep in atomic state below, which was reported by Mike. 
[ 43.910362] BUG: sleeping function called from invalid context at mm/slab.h:432 [ 43.910955] in_atomic(): 1, irqs_disabled(): 0, pid: 2077, name: Xorg [ 43.911472] Preemption disabled at: [ 43.911478] [] qxl_bo_kmap_atomic_page+0xa5/0x100 [qxl] [ 43.912103] CPU: 0 PID: 2077 Comm: Xorg Tainted: G E 4.12.0-master #38 [ 43.912550] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20161202_174313-build11a 04/01/2014 [ 43.913202] Call Trace: [ 43.913371] dump_stack+0x65/0x89 [ 43.913581] ? qxl_bo_kmap_atomic_page+0xa5/0x100 [qxl] [ 43.913876] ___might_sleep+0x11a/0x190 [ 43.914095] __might_sleep+0x4a/0x80 [ 43.914319] ? qxl_bo_create+0x50/0x190 [qxl] [ 43.914565] kmem_cache_alloc_trace+0x46/0x180 [ 43.914836] qxl_bo_create+0x50/0x190 [qxl] [ 43.915082] ? refcount_dec_and_test+0x11/0x20 [ 43.915332] ? ttm_mem_io_reserve+0x41/0xe0 [ttm] [ 43.915595] qxl_alloc_bo_reserved+0x37/0xb0 [qxl] [ 43.915884] qxl_cursor_atomic_update+0x8f/0x260 [qxl] [ 43.916172] ? drm_atomic_helper_update_legacy_modeset_state+0x1d6/0x210 [drm_kms_helper] [ 43.916623] drm_atomic_helper_commit_planes+0xec/0x230 [drm_kms_helper] [ 43.916995] drm_atomic_helper_commit_tail+0x2b/0x60 [drm_kms_helper] [ 43.917398] commit_tail+0x65/0x70 [drm_kms_helper] [ 43.917693] drm_atomic_helper_commit+0xa9/0x100 [drm_kms_helper] [ 43.918039] drm_atomic_commit+0x4b/0x50 [drm] [ 43.918334] drm_atomic_helper_update_plane+0xf1/0x110 [drm_kms_helper] [ 43.918902] __setplane_internal+0x19f/0x280 [drm] [ 43.919240] drm_mode_cursor_universal+0x101/0x1c0 [drm] [ 43.919541] drm_mode_cursor_common+0x15b/0x1d0 [drm] [ 43.919858] drm_mode_cursor2_ioctl+0xe/0x10 [drm] [ 43.920157] drm_ioctl+0x211/0x460 [drm] [ 43.920383] ? drm_mode_cursor_ioctl+0x50/0x50 [drm] [ 43.920664] ? handle_mm_fault+0x93/0x160 [ 43.920893] do_vfs_ioctl+0x96/0x6e0 [ 43.921117] ? 
__fget+0x73/0xa0 [ 43.921322] SyS_ioctl+0x41/0x70 [ 43.921545] entry_SYSCALL_64_fastpath+0x1a/0xa5 [ 43.922188] RIP: 0033:0x7f1145804bc7 [ 43.922526] RSP: 002b:00007ffcd3e50508 EFLAGS: 00003246 ORIG_RAX: 0000000000000010 [ 43.923367] RAX: ffffffffffffffda RBX: 0000000000000040 RCX: 00007f1145804bc7 [ 43.923852] RDX: 00007ffcd3e50540 RSI: 00000000c02464bb RDI: 000000000000000b [ 43.924299] RBP: 0000000000000040 R08: 0000000000000040 R09: 000000000000000c [ 43.924694] R10: 00007ffcd3e50340 R11: 0000000000003246 R12: 0000000000000018 [ 43.925128] R13: 00000000022bc390 R14: 0000000000000040 R15: 00007ffcd3e5062c Reported-by: Mike Galbraith Signed-off-by: Gabriel Krisman Bertazi Link: http://patchwork.freedesktop.org/patch/msgid/20170519175819.15682-1-krisman@collabora.co.uk Signed-off-by: Gerd Hoffmann --- drivers/gpu/drm/qxl/qxl_display.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c index 058340a002c2..4a340efd8ba6 100644 --- a/drivers/gpu/drm/qxl/qxl_display.c +++ b/drivers/gpu/drm/qxl/qxl_display.c @@ -575,8 +575,6 @@ static void qxl_cursor_atomic_update(struct drm_plane *plane, if (ret) return; - cmd = (struct qxl_cursor_cmd *) qxl_release_map(qdev, release); - if (fb != old_state->fb) { obj = to_qxl_framebuffer(fb)->obj; user_bo = gem_to_qxl_bo(obj); @@ -614,6 +612,7 @@ static void qxl_cursor_atomic_update(struct drm_plane *plane, qxl_bo_kunmap(cursor_bo); qxl_bo_kunmap(user_bo); + cmd = (struct qxl_cursor_cmd *) qxl_release_map(qdev, release); cmd->u.set.visible = 1; cmd->u.set.shape = qxl_bo_physical_address(qdev, cursor_bo, 0); @@ -624,6 +623,7 @@ static void qxl_cursor_atomic_update(struct drm_plane *plane, if (ret) goto out_free_release; + cmd = (struct qxl_cursor_cmd *) qxl_release_map(qdev, release); cmd->type = QXL_CURSOR_MOVE; } From f928543404bdf6bb4e8d6a6c3ced5edebd0d6f38 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 22 May 2017 15:59:45 +0200 
Subject: [PATCH 075/153] drm: Fix deadlock retry loop in page_flip_ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I failed to properly onion-wrap the unwind code: We acquire the vblank reference before we start with the wait-wound locking dance, hence we must make sure we retry before we drop the reference. Oops. v2: The vblank_put must be after the framebuffer_put (Michel). I suck at unwrapping code that doesn't use separate labels for each stage, but checks each pointer first ... While re-reading everything I also realized that we must clean up the fb refcounts, and specifically plane->old_fb before we drop the locks, either in the final unlocking, or in the w/w retry path. Hence the correct fix is to drop the vblank_put to the very bottom. Fixes: 29dc0d1de182 ("drm: Roll out acquire context for the page_flip ioctl") Cc: Harry Wentland Cc: Daniel Vetter Cc: Jani Nikula Cc: Sean Paul Cc: David Airlie Cc: dri-devel@lists.freedesktop.org Reported-by: Tommi Rantala Cc: Tommi Rantala Cc: Michel Dänzer Tested-by: Tommi Rantala Reviewed-by: Michel Dänzer Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20170522135945.28831-1-daniel.vetter@ffwll.ch --- drivers/gpu/drm/drm_plane.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c index fedd4d60d9cd..5dc8c4350602 100644 --- a/drivers/gpu/drm/drm_plane.c +++ b/drivers/gpu/drm/drm_plane.c @@ -948,8 +948,6 @@ int drm_mode_page_flip_ioctl(struct drm_device *dev, } out: - if (ret && crtc->funcs->page_flip_target) - drm_crtc_vblank_put(crtc); if (fb) drm_framebuffer_put(fb); if (crtc->primary->old_fb) @@ -964,5 +962,8 @@ int drm_mode_page_flip_ioctl(struct drm_device *dev, drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); + if (ret && crtc->funcs->page_flip_target) + drm_crtc_vblank_put(crtc); + return ret; } From c477ebe21fabe0010a2ed324ce3a1762c757d867 Mon Sep
17 00:00:00 2001 From: Ulf Hansson Date: Sat, 6 May 2017 11:41:30 +0200 Subject: [PATCH 076/153] mmc: dt: pwrseq-simple: Invent power-off-delay-us During power off, after the GPIO pin has been asserted, some devices like the Wifi chip from TI, Wl18xx, needs a delay before the host continues with clock gating and turning off regulators as to follow a graceful shutdown sequence. Therefore invent an optional power-off-delay-us DT binding for mmc-pwrseq-simple, to allow us to support this constraint. Cc: devicetree@vger.kernel.org Cc: Rob Herring Cc: linux-mmc@vger.kernel.org Signed-off-by: Ulf Hansson Acked-by: Arnd Bergmann --- Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt b/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt index e25436861867..9029b45b8a22 100644 --- a/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt +++ b/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt @@ -18,6 +18,8 @@ Optional properties: "ext_clock" (External clock provided to the card). - post-power-on-delay-ms : Delay in ms after powering the card and de-asserting the reset-gpios (if any) +- power-off-delay-us : Delay in us after asserting the reset-gpios (if any) + during power off of the card. Example: From e9256e142f597edf90c68cec22db4c4aebaa27de Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Sat, 6 May 2017 11:43:05 +0200 Subject: [PATCH 077/153] mmc: pwrseq_simple: Parse DTS for the power-off-delay-us property If the optional power-off-delay-us property is found, insert the corresponding delay after asserting the GPIO during power off. This enables a graceful shutdown sequence for some devices. 
Cc: linux-mmc@vger.kernel.org Signed-off-by: Ulf Hansson Acked-by: Arnd Bergmann --- drivers/mmc/core/pwrseq_simple.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/mmc/core/pwrseq_simple.c b/drivers/mmc/core/pwrseq_simple.c index 1304160de168..13ef162cf066 100644 --- a/drivers/mmc/core/pwrseq_simple.c +++ b/drivers/mmc/core/pwrseq_simple.c @@ -27,6 +27,7 @@ struct mmc_pwrseq_simple { struct mmc_pwrseq pwrseq; bool clk_enabled; u32 post_power_on_delay_ms; + u32 power_off_delay_us; struct clk *ext_clk; struct gpio_descs *reset_gpios; }; @@ -78,6 +79,10 @@ static void mmc_pwrseq_simple_power_off(struct mmc_host *host) mmc_pwrseq_simple_set_gpios_value(pwrseq, 1); + if (pwrseq->power_off_delay_us) + usleep_range(pwrseq->power_off_delay_us, + 2 * pwrseq->power_off_delay_us); + if (!IS_ERR(pwrseq->ext_clk) && pwrseq->clk_enabled) { clk_disable_unprepare(pwrseq->ext_clk); pwrseq->clk_enabled = false; @@ -119,6 +124,8 @@ static int mmc_pwrseq_simple_probe(struct platform_device *pdev) device_property_read_u32(dev, "post-power-on-delay-ms", &pwrseq->post_power_on_delay_ms); + device_property_read_u32(dev, "power-off-delay-us", + &pwrseq->power_off_delay_us); pwrseq->pwrseq.dev = dev; pwrseq->pwrseq.ops = &mmc_pwrseq_simple_ops; From f74ac688c981138c914f9afba50b646146e35585 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 24 Apr 2017 22:40:22 +0200 Subject: [PATCH 078/153] mfd: dts: hi655x: Add clock binding for the pmic The hi655x PMIC provides the regulators but also a clock. The latter is missing in the definition, so extend the documentation to include this as well. 
Signed-off-by: Daniel Lezcano Acked-by: Rob Herring Acked-by: Lee Jones [Ulf: Split patch and updated changelog] Signed-off-by: Ulf Hansson Acked-by: Arnd Bergmann --- Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt b/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt index 05485699d70e..9630ac0e4b56 100644 --- a/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt +++ b/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt @@ -16,6 +16,11 @@ Required properties: - reg: Base address of PMIC on Hi6220 SoC. - interrupt-controller: Hi655x has internal IRQs (has own IRQ domain). - pmic-gpios: The GPIO used by PMIC IRQ. +- #clock-cells: From common clock binding; shall be set to 0 + +Optional properties: +- clock-output-names: From common clock binding to override the + default output clock name Example: pmic: pmic@f8000000 { @@ -24,4 +29,5 @@ Example: interrupt-controller; #interrupt-cells = <2>; pmic-gpios = <&gpio1 2 GPIO_ACTIVE_HIGH>; + #clock-cells = <0>; } From 307ded8968868e55343e063fbe96cff1efd77eb6 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 24 Apr 2017 22:40:22 +0200 Subject: [PATCH 079/153] arm64: dts: hikey: Add clock for the pmic mfd The hi655x PMIC provides the regulators but also a clock. The latter is missing so let's add it. This clock is used by WiFi/Bluetooth chip, but that connection is done in a separate change on top of this one. 
Signed-off-by: Daniel Lezcano Acked-by: Rob Herring Acked-by: Lee Jones [Ulf: Split patch and updated changelog] Signed-off-by: Ulf Hansson Acked-by: Arnd Bergmann --- arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts index 75bce2d0b1a8..d22eb3a646c4 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts +++ b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts @@ -330,6 +330,7 @@ bt_active_led { pmic: pmic@f8000000 { compatible = "hisilicon,hi655x-pmic"; reg = <0x0 0xf8000000 0x0 0x1000>; + #clock-cells = <0>; interrupt-controller; #interrupt-cells = <2>; pmic-gpios = <&gpio1 2 GPIO_ACTIVE_HIGH>; From 1b32a5ff98fbb271d2235ddcfe3b58f514f8260a Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 May 2017 12:46:55 +0200 Subject: [PATCH 080/153] arm64: dts: hi6220: Move the fixed_5v_hub regulator to the hikey dts The regulator is a part of the hikey board, therefore let's move it from the hi6220 SoC dtsi file into the hikey dts file . Let's also rename the regulator according to the datasheet (5V_HUB) to better reflect the HW. 
Signed-off-by: Ulf Hansson Acked-by: Daniel Lezcano Acked-by: Arnd Bergmann --- arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts | 10 ++++++++++ arch/arm64/boot/dts/hisilicon/hi6220.dtsi | 12 +----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts index d22eb3a646c4..0f6cba77fc76 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts +++ b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts @@ -81,6 +81,16 @@ reboot-mode { }; }; + reg_5v_hub: regulator@0 { + compatible = "regulator-fixed"; + regulator-name = "5V_HUB"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-boot-on; + gpio = <&gpio0 7 0>; + regulator-always-on; + }; + soc { spi0: spi@f7106000 { status = "ok"; diff --git a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi index 1e5129b19280..951152d44c02 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi +++ b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi @@ -725,20 +725,10 @@ i2c2: i2c@f7102000 { status = "disabled"; }; - fixed_5v_hub: regulator@0 { - compatible = "regulator-fixed"; - regulator-name = "fixed_5v_hub"; - regulator-min-microvolt = <5000000>; - regulator-max-microvolt = <5000000>; - regulator-boot-on; - gpio = <&gpio0 7 0>; - regulator-always-on; - }; - usb_phy: usbphy { compatible = "hisilicon,hi6220-usb-phy"; #phy-cells = <0>; - phy-supply = <&fixed_5v_hub>; + phy-supply = <&reg_5v_hub>; hisilicon,peripheral-syscon = <&sys_ctrl>; }; From 84f7c60b31f10e3a438153bc7408ad536f585641 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 May 2017 13:51:27 +0200 Subject: [PATCH 081/153] arm64: dts: hikey: Add the SYS_5V and the VDD_3V3 regulators Add these regulators to better describe the HW, but also because those are needed in following changes.
Signed-off-by: Ulf Hansson Acked-by: Daniel Lezcano Acked-by: Arnd Bergmann --- .../arm64/boot/dts/hisilicon/hi6220-hikey.dts | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts index 0f6cba77fc76..802f4a4bed30 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts +++ b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts @@ -81,7 +81,26 @@ reboot-mode { }; }; - reg_5v_hub: regulator@0 { + reg_sys_5v: regulator@0 { + compatible = "regulator-fixed"; + regulator-name = "SYS_5V"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-boot-on; + regulator-always-on; + }; + + reg_vdd_3v3: regulator@1 { + compatible = "regulator-fixed"; + regulator-name = "VDD_3V3"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + vin-supply = <&reg_sys_5v>; + }; + + reg_5v_hub: regulator@2 { compatible = "regulator-fixed"; regulator-name = "5V_HUB"; regulator-min-microvolt = <5000000>; @@ -89,6 +108,7 @@ reg_5v_hub: regulator@0 { regulator-boot-on; gpio = <&gpio0 7 0>; regulator-always-on; + vin-supply = <&reg_sys_5v>; }; soc { From 76f1dfb687150e852aa74573962cfc158a9570cc Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 May 2017 14:18:26 +0200 Subject: [PATCH 082/153] arm64: dts: hi6220: Move board data from the dwmmc nodes to hikey dts Move the board specific descriptions for the dwmmc nodes in the hi6220 SoC dtsi into the hikey dts, as that is where they belong. While changing this, let's take the opportunity to drop the use of the "ti,non-removable" binding for one of the dwmmc device nodes, as it's not a valid binding and not used. Drop also the unnecessary use of "num-slots = <0x1>" for all of the dwmmc nodes, as there is no need to set this since the default number of slots is one.
Signed-off-by: Ulf Hansson Acked-by: Daniel Lezcano Acked-by: Arnd Bergmann --- .../arm64/boot/dts/hisilicon/hi6220-hikey.dts | 23 ++++++++++++++++++- arch/arm64/boot/dts/hisilicon/hi6220.dtsi | 19 --------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts index 802f4a4bed30..5132d8ed4664 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts +++ b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts @@ -286,8 +286,29 @@ gpio15: gpio@f702b000 { /* GPIO blocks 16 thru 19 do not appear to be routed to pins */ + dwmmc_0: dwmmc0@f723d000 { + cap-mmc-highspeed; + non-removable; + bus-width = <0x8>; + vmmc-supply = <&ldo19>; + }; + + dwmmc_1: dwmmc1@f723e000 { + card-detect-delay = <200>; + cap-sd-highspeed; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; + vqmmc-supply = <&ldo7>; + vmmc-supply = <&ldo10>; + bus-width = <0x4>; + disable-wp; + cd-gpios = <&gpio1 0 1>; + }; + dwmmc_2: dwmmc2@f723f000 { - ti,non-removable; + broken-cd; + bus-width = <0x4>; non-removable; /* WL_EN */ vmmc-supply = <&wlan_en_reg>; diff --git a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi index 951152d44c02..5013e4b2ea71 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi +++ b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi @@ -756,17 +756,12 @@ mailbox: mailbox@f7510000 { dwmmc_0: dwmmc0@f723d000 { compatible = "hisilicon,hi6220-dw-mshc"; - num-slots = <0x1>; - cap-mmc-highspeed; - non-removable; reg = <0x0 0xf723d000 0x0 0x1000>; interrupts = <0x0 0x48 0x4>; clocks = <&sys_ctrl 2>, <&sys_ctrl 1>; clock-names = "ciu", "biu"; resets = <&sys_ctrl PERIPH_RSTDIS0_MMC0>; reset-names = "reset"; - bus-width = <0x8>; - vmmc-supply = <&ldo19>; pinctrl-names = "default"; pinctrl-0 = <&emmc_pmx_func &emmc_clk_cfg_func &emmc_cfg_func &emmc_rst_cfg_func>; @@ -774,13 +769,7 @@ dwmmc_0: dwmmc0@f723d000 { dwmmc_1: dwmmc1@f723e000 { compatible = 
"hisilicon,hi6220-dw-mshc"; - num-slots = <0x1>; - card-detect-delay = <200>; hisilicon,peripheral-syscon = <&ao_ctrl>; - cap-sd-highspeed; - sd-uhs-sdr12; - sd-uhs-sdr25; - sd-uhs-sdr50; reg = <0x0 0xf723e000 0x0 0x1000>; interrupts = <0x0 0x49 0x4>; #address-cells = <0x1>; @@ -789,11 +778,6 @@ dwmmc_1: dwmmc1@f723e000 { clock-names = "ciu", "biu"; resets = <&sys_ctrl PERIPH_RSTDIS0_MMC1>; reset-names = "reset"; - vqmmc-supply = <&ldo7>; - vmmc-supply = <&ldo10>; - bus-width = <0x4>; - disable-wp; - cd-gpios = <&gpio1 0 1>; pinctrl-names = "default", "idle"; pinctrl-0 = <&sd_pmx_func &sd_clk_cfg_func &sd_cfg_func>; pinctrl-1 = <&sd_pmx_idle &sd_clk_cfg_idle &sd_cfg_idle>; @@ -801,15 +785,12 @@ dwmmc_1: dwmmc1@f723e000 { dwmmc_2: dwmmc2@f723f000 { compatible = "hisilicon,hi6220-dw-mshc"; - num-slots = <0x1>; reg = <0x0 0xf723f000 0x0 0x1000>; interrupts = <0x0 0x4a 0x4>; clocks = <&sys_ctrl HI6220_MMC2_CIUCLK>, <&sys_ctrl HI6220_MMC2_CLK>; clock-names = "ciu", "biu"; resets = <&sys_ctrl PERIPH_RSTDIS0_MMC2>; reset-names = "reset"; - bus-width = <0x4>; - broken-cd; pinctrl-names = "default", "idle"; pinctrl-0 = <&sdio_pmx_func &sdio_clk_cfg_func &sdio_cfg_func>; pinctrl-1 = <&sdio_pmx_idle &sdio_clk_cfg_idle &sdio_cfg_idle>; From ea452678734eb782126f999bf5c4fb3e71d3b196 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 May 2017 16:11:33 +0200 Subject: [PATCH 083/153] arm64: dts: hikey: Fix WiFi support The description of the connection between the dwmmc (SDIO) controller and the Wifi chip, which is attached to the SDIO bus is wrong. Currently the SDIO card can't be detected and thus the Wifi doesn't work. Let's fix this by assigning the correct vmmc supply, which is the always on regulator VDD_3V3 and remove the WLAN enable regulator altogether. Then to properly deal with the power on/off sequence, add a mmc-pwrseq node to describe the resources needed to detect the SDIO card. 
In addition to the WLAN enable GPIO and its corresponding assert/de-assert delays, the mmc-pwrseq node also contains a handle to a clock provided by the hi655x pmic. This clock is also needed to be able to turn on the WiFi chip. Signed-off-by: Ulf Hansson Acked-by: Arnd Bergmann --- .../arm64/boot/dts/hisilicon/hi6220-hikey.dts | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts index 5132d8ed4664..49f6a6242cf9 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts +++ b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts @@ -111,6 +111,15 @@ reg_5v_hub: regulator@2 { vin-supply = <&reg_sys_5v>; }; + wl1835_pwrseq: wl1835-pwrseq { + compatible = "mmc-pwrseq-simple"; + /* WLAN_EN GPIO */ + reset-gpios = <&gpio0 5 GPIO_ACTIVE_LOW>; + clocks = <&pmic>; + clock-names = "ext_clock"; + power-off-delay-us = <10>; + }; + soc { spi0: spi@f7106000 { status = "ok"; @@ -307,11 +316,10 @@ dwmmc_1: dwmmc1@f723e000 { }; dwmmc_2: dwmmc2@f723f000 { - broken-cd; bus-width = <0x4>; non-removable; - /* WL_EN */ - vmmc-supply = <&wlan_en_reg>; + vmmc-supply = <&reg_vdd_3v3>; + mmc-pwrseq = <&wl1835_pwrseq>; #address-cells = <0x1>; #size-cells = <0x0>; @@ -323,18 +331,6 @@ wlcore: wlcore@2 { interrupts = <3 IRQ_TYPE_EDGE_RISING>; }; }; - - wlan_en_reg: regulator@1 { - compatible = "regulator-fixed"; - regulator-name = "wlan-en-regulator"; - regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <1800000>; - /* WLAN_EN GPIO */ - gpio = <&gpio0 5 0>; - /* WLAN card specific delay */ - startup-delay-us = <70000>; - enable-active-high; - }; }; leds { From 1b57b6210f4e52904393be97c62122aae69bc8aa Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 23 May 2017 09:58:07 +0100 Subject: [PATCH 084/153] cfg80211: make cfg80211_sched_scan_results() work from atomic context Drivers should be able to call cfg80211_sched_scan_results() from atomic context.
However, with the introduction of multiple scheduled scan feature this requirement was not taken into account resulting in regression shown below. [ 119.021594] BUG: scheduling while atomic: irq/47-iwlwifi/517/0x00000200 [ 119.021604] Modules linked in: [...] [ 119.021759] CPU: 1 PID: 517 Comm: irq/47-iwlwifi Not tainted 4.12.0-rc2-t440s-20170522+ #1 [ 119.021763] Hardware name: LENOVO 20AQS03H00/20AQS03H00, BIOS GJET91WW (2.41 ) 09/21/2016 [ 119.021766] Call Trace: [ 119.021778] ? dump_stack+0x5c/0x84 [ 119.021784] ? __schedule_bug+0x4c/0x70 [ 119.021792] ? __schedule+0x496/0x5c0 [ 119.021798] ? schedule+0x2d/0x80 [ 119.021804] ? schedule_preempt_disabled+0x5/0x10 [ 119.021810] ? __mutex_lock.isra.0+0x18e/0x4c0 [ 119.021817] ? __wake_up+0x2f/0x50 [ 119.021833] ? cfg80211_sched_scan_results+0x19/0x60 [cfg80211] [ 119.021844] ? cfg80211_sched_scan_results+0x19/0x60 [cfg80211] [ 119.021859] ? iwl_mvm_rx_lmac_scan_iter_complete_notif+0x17/0x30 [iwlmvm] [ 119.021869] ? iwl_pcie_rx_handle+0x2a9/0x7e0 [iwlwifi] [ 119.021878] ? iwl_pcie_irq_handler+0x17c/0x730 [iwlwifi] [ 119.021884] ? irq_forced_thread_fn+0x60/0x60 [ 119.021887] ? irq_thread_fn+0x16/0x40 [ 119.021892] ? irq_thread+0x109/0x180 [ 119.021896] ? wake_threads_waitq+0x30/0x30 [ 119.021901] ? kthread+0xf2/0x130 [ 119.021905] ? irq_thread_dtor+0x90/0x90 [ 119.021910] ? kthread_create_on_node+0x40/0x40 [ 119.021915] ? 
ret_from_fork+0x26/0x40 Fixes: b34939b98369 ("cfg80211: add request id to cfg80211_sched_scan_*() api") Reported-by: Sander Eikelenboom Signed-off-by: Arend van Spriel Signed-off-by: Johannes Berg --- net/wireless/scan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 14d5f0c8c45f..9f0901f3e42b 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -322,9 +322,9 @@ cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid) { struct cfg80211_sched_scan_request *pos; - ASSERT_RTNL(); + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); - list_for_each_entry(pos, &rdev->sched_scan_req_list, list) { + list_for_each_entry_rcu(pos, &rdev->sched_scan_req_list, list) { if (pos->reqid == reqid) return pos; } @@ -398,13 +398,13 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) trace_cfg80211_sched_scan_results(wiphy, reqid); /* ignore if we're not scanning */ - rtnl_lock(); + rcu_read_lock(); request = cfg80211_find_sched_scan_req(rdev, reqid); if (request) { request->report_results = true; queue_work(cfg80211_wq, &rdev->sched_scan_res_wk); } - rtnl_unlock(); + rcu_read_unlock(); } EXPORT_SYMBOL(cfg80211_sched_scan_results); From c70d9d809fdeecedb96972457ee45c49a232d97f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 May 2017 15:40:12 -0500 Subject: [PATCH 085/153] ptrace: Properly initialize ptracer_cred on fork When I introduced ptracer_cred I failed to consider the weirdness of fork where the task_struct copies the old value by default. This winds up leaving ptracer_cred set even when a process forks and the child process does not wind up being ptraced. Because ptracer_cred is not set on non-ptraced processes whose parents were ptraced this has broken the ability of the enlightenment window manager to start setuid children. 
Fix this by properly initializing ptracer_cred in ptrace_init_task This must be done with a little bit of care to preserve the current value of ptracer_cred when ptrace carries through fork. Re-reading the ptracer_cred from the ptracing process at this point is inconsistent with how PT_PTRACE_CAP has been maintained all of these years. Tested-by: Takashi Iwai Fixes: 64b875f7ac8a ("ptrace: Capture the ptracer's creds not PT_PTRACE_CAP") Signed-off-by: "Eric W. Biederman" --- include/linux/ptrace.h | 7 +++++-- kernel/ptrace.c | 20 +++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 422bc2e4cb6a..ef3eb8bbfee4 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -54,7 +54,8 @@ extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, - struct task_struct *new_parent); + struct task_struct *new_parent, + const struct cred *ptracer_cred); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 @@ -206,7 +207,7 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) if (unlikely(ptrace) && current->ptrace) { child->ptrace = current->ptrace; - __ptrace_link(child, current->parent); + __ptrace_link(child, current->parent, current->ptracer_cred); if (child->ptrace & PT_SEIZED) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); @@ -215,6 +216,8 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) set_tsk_thread_flag(child, TIF_SIGPENDING); } + else + child->ptracer_cred = NULL; } /** diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 266ddcc1d8bb..60f356d91060 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -60,19 +60,25 @@ int ptrace_access_vm(struct task_struct *tsk, 
unsigned long addr, } +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, + const struct cred *ptracer_cred) +{ + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); + child->parent = new_parent; + child->ptracer_cred = get_cred(ptracer_cred); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. * * Must be called with the tasklist lock write-held. */ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) +static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_entry)); - list_add(&child->ptrace_entry, &new_parent->ptraced); - child->parent = new_parent; rcu_read_lock(); - child->ptracer_cred = get_cred(__task_cred(new_parent)); + __ptrace_link(child, new_parent, __task_cred(new_parent)); rcu_read_unlock(); } @@ -386,7 +392,7 @@ static int ptrace_attach(struct task_struct *task, long request, flags |= PT_SEIZED; task->ptrace = flags; - __ptrace_link(task, current); + ptrace_link(task, current); /* SEIZE doesn't trap tracee on attach */ if (!seize) @@ -459,7 +465,7 @@ static int ptrace_traceme(void) */ if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace = PT_PTRACED; - __ptrace_link(current, current->real_parent); + ptrace_link(current, current->real_parent); } } write_unlock_irq(&tasklist_lock); From cdc5a7f363be34287ac6c2345e5d1d3b37cf4a23 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 9 May 2017 20:45:06 +0300 Subject: [PATCH 086/153] net/mlx5e: Use the correct delete call on offloaded TC encap entry detach We wrongly directly invoke hlist_del_rcu() and not hash_del_rcu() which does a slightly different call now and may change later, fix that.
Fixes: a54e20b4fcae ('net/mlx5e: Add basic TC tunnel set action for SRIOV offloads') Signed-off-by: Or Gerlitz Reported-by: Paul Blakey Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 11c27e4fadf6..a90dd26ea51c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -384,7 +384,7 @@ static void mlx5e_detach_encap(struct mlx5e_priv *priv, if (e->flags & MLX5_ENCAP_ENTRY_VALID) mlx5_encap_dealloc(priv->mdev, e->encap_id); - hlist_del_rcu(&e->encap_hlist); + hash_del_rcu(&e->encap_hlist); kfree(e->encap_header); kfree(e); } From 3aa4266405a6c2e03eb0ff12d7c573d3d903da4c Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 10 May 2017 13:48:41 +0300 Subject: [PATCH 087/153] net/sched: act_csum: Add accessors for offloading drivers Add the accessors for realizing if this is a csum action, and for which fields checksum is needed. 
Signed-off-by: Or Gerlitz Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- include/net/tc_act/tc_csum.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index f31fb6331a53..3248beaf16b0 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -3,6 +3,7 @@ #include <linux/types.h> #include <net/act_api.h> +#include <linux/tc_act/tc_csum.h> struct tcf_csum { struct tc_action common; @@ -11,4 +12,18 @@ struct tcf_csum { }; #define to_tcf_csum(a) ((struct tcf_csum *)a) +static inline bool is_tcf_csum(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + if (a->ops && a->ops->type == TCA_ACT_CSUM) + return true; +#endif + return false; +} + +static inline u32 tcf_csum_update_flags(const struct tc_action *a) +{ + return to_tcf_csum(a)->update_flags; +} + #endif /* __NET_TC_CSUM_H */ From 26c02749936f064abf771a0f5f49b280fcfd8b66 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 10 May 2017 13:59:54 +0300 Subject: [PATCH 088/153] net/mlx5e: Allow TC csum offload if applied together with pedit action When offloading header re-writes, the HW re-calculates the relevant L3/L4 checksums. Hence, when upper layers (as done by OVS) ask for TC checksum action offload together with pedit offload, don't err.
This command now works: tc filter add dev ens1f0 protocol ip parent ffff: prio 20 flower skip_sw ip_proto tcp dst_port 9001 action pedit ex munge tcp dport set 0x1234 pipe action csum tcp Signed-off-by: Or Gerlitz Reported-by: Paul Blakey Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index a90dd26ea51c..9dd83c7e4c51 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -43,6 +43,7 @@ #include <net/tc_act/tc_vlan.h> #include <net/tc_act/tc_tunnel_key.h> #include <net/tc_act/tc_pedit.h> +#include <net/tc_act/tc_csum.h> #include <net/vxlan.h> #include <net/arp.h> #include "en.h" @@ -1109,6 +1110,28 @@ static int parse_tc_pedit_action(struct mlx5e_priv *priv, return err; } +static bool csum_offload_supported(struct mlx5e_priv *priv, u32 action, u32 update_flags) +{ + u32 prot_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR | TCA_CSUM_UPDATE_FLAG_TCP | + TCA_CSUM_UPDATE_FLAG_UDP; + + /* The HW recalcs checksums only if re-writing headers */ + if (!(action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)) { + netdev_warn(priv->netdev, + "TC csum action is only offloaded with pedit\n"); + return false; + } + + if (update_flags & ~prot_flags) { + netdev_warn(priv->netdev, + "can't offload TC csum action for some header/s - flags %#x\n", + update_flags); + return false; + } + + return true; +} + static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, struct mlx5e_tc_flow_parse_attr *parse_attr, struct mlx5e_tc_flow *flow) @@ -1149,6 +1172,14 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, continue; } + if (is_tcf_csum(a)) { + if (csum_offload_supported(priv, attr->action, + tcf_csum_update_flags(a))) + continue; + + return -EOPNOTSUPP; + } + if (is_tcf_skbedit_mark(a)) { u32 mark = tcf_skbedit_mark(a); @@ -1651,6 +1682,14 @@ static int parse_tc_fdb_actions(struct mlx5e_priv
*priv, struct tcf_exts *exts, continue; } + if (is_tcf_csum(a)) { + if (csum_offload_supported(priv, attr->action, + tcf_csum_update_flags(a))) + continue; + + return -EOPNOTSUPP; + } + if (is_tcf_mirred_egress_redirect(a)) { int ifindex = tcf_mirred_ifindex(a); struct net_device *out_dev, *encap_dev = NULL; From d824bf3fe2d352fc2c52b7ede05b1a0e95d946be Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 9 May 2017 19:02:42 +0300 Subject: [PATCH 089/153] net/mlx5e: Properly enforce disallowing of partial field re-write offload Currently we don't support partial header re-writes through TC pedit action offloading. However, the code that enforces that wasn't err-ing on cases where the first and last bits of the mask are set but there is some zero bit between them, such as in the below example, fix that! tc filter add dev enp1s0 protocol ip parent ffff: prio 10 flower ip_proto udp dst_port 2001 skip_sw action pedit munge ip src set 1.0.0.1 retain 0xff0000ff Fixes: d79b6df6b10a ('net/mlx5e: Add parsing of TC pedit actions to HW format') Signed-off-by: Or Gerlitz Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9dd83c7e4c51..0387c321f0a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -926,7 +926,7 @@ static int offload_pedit_fields(struct pedit_headers *masks, struct mlx5e_tc_flow_parse_attr *parse_attr) { struct pedit_headers *set_masks, *add_masks, *set_vals, *add_vals; - int i, action_size, nactions, max_actions, first, last; + int i, action_size, nactions, max_actions, first, last, first_z; void *s_masks_p, *a_masks_p, *vals_p; u32 s_mask, a_mask, val; struct mlx5_fields *f; @@ -985,9 +985,10 @@ static int offload_pedit_fields(struct pedit_headers *masks, 
memcpy(&val, vals_p, f->size); field_bsize = f->size * BITS_PER_BYTE; + first_z = find_first_zero_bit(&mask, field_bsize); first = find_first_bit(&mask, field_bsize); last = find_last_bit(&mask, field_bsize); - if (first > 0 || last != (field_bsize - 1)) { + if (first > 0 || last != (field_bsize - 1) || first_z < last) { printk(KERN_WARNING "mlx5: partial rewrite (mask %lx) is currently not offloaded\n", mask); return -EOPNOTSUPP; From e3ca4e0583a02a04503d9c827fb5c5d50abc4ff5 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 9 May 2017 13:37:26 +0300 Subject: [PATCH 090/153] net/mlx5e: Fix warnings around parsing of TC pedit actions The sparse tool emits these correct complaints: drivers/net/ethernet/mellanox/mlx5/core//en_tc.c:1005:25: warning: cast to restricted __be32 drivers/net/ethernet/mellanox/mlx5/core//en_tc.c:1007:25: warning: cast to restricted __be16 The value is provided from user-space in network order, but there's no way for them to realize that, avoid the warnings by casting to the appropriate type. 
Fixes: d79b6df6b10a ('net/mlx5e: Add parsing of TC pedit actions to HW format') Signed-off-by: Or Gerlitz Reported-by: Leon Romanovsky Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 0387c321f0a2..ec63158ab643 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -928,9 +928,9 @@ static int offload_pedit_fields(struct pedit_headers *masks, struct pedit_headers *set_masks, *add_masks, *set_vals, *add_vals; int i, action_size, nactions, max_actions, first, last, first_z; void *s_masks_p, *a_masks_p, *vals_p; - u32 s_mask, a_mask, val; struct mlx5_fields *f; u8 cmd, field_bsize; + u32 s_mask, a_mask; unsigned long mask; void *action; @@ -947,7 +947,8 @@ static int offload_pedit_fields(struct pedit_headers *masks, for (i = 0; i < ARRAY_SIZE(fields); i++) { f = &fields[i]; /* avoid seeing bits set from previous iterations */ - s_mask = a_mask = mask = val = 0; + s_mask = 0; + a_mask = 0; s_masks_p = (void *)set_masks + f->offset; a_masks_p = (void *)add_masks + f->offset; @@ -982,9 +983,8 @@ static int offload_pedit_fields(struct pedit_headers *masks, memset(a_masks_p, 0, f->size); } - memcpy(&val, vals_p, f->size); - field_bsize = f->size * BITS_PER_BYTE; + first_z = find_first_zero_bit(&mask, field_bsize); first = find_first_bit(&mask, field_bsize); last = find_last_bit(&mask, field_bsize); @@ -1004,11 +1004,11 @@ static int offload_pedit_fields(struct pedit_headers *masks, } if (field_bsize == 32) - MLX5_SET(set_action_in, action, data, ntohl(val)); + MLX5_SET(set_action_in, action, data, ntohl(*(__be32 *)vals_p)); else if (field_bsize == 16) - MLX5_SET(set_action_in, action, data, ntohs(val)); + MLX5_SET(set_action_in, action, data, ntohs(*(__be16 *)vals_p)); else 
if (field_bsize == 8) - MLX5_SET(set_action_in, action, data, val); + MLX5_SET(set_action_in, action, data, *(u8 *)vals_p); action += action_size; nactions++; From b57fe691961cc8f00541f9a435c70df45d41e514 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Thu, 27 Apr 2017 17:59:00 +0300 Subject: [PATCH 091/153] net/mlx5e: IPoIB, handle RX packet correctly IPoIB packet contains the pseudo header area, we need to pull it prior to reset_mac_header in order to let the GRO work well. In more details: GRO checks the mac address of the new coming packet, it does that by comparing the hard_header_len size of the current packet to the previous one in that session, the comparison is over hard_header_len size. Now, the driver prepares that area in the skb by allocating area from the reserved part and resetting the correct mac header to it. Fixes: 9d6bd752c63c ("net/mlx5e: IPoIB, RX handler") Signed-off-by: Erez Shitrit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 7b1566f0ae58..66b5fec15313 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1041,6 +1041,8 @@ void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) #define MLX5_IB_GRH_BYTES 40 #define MLX5_IPOIB_ENCAP_LEN 4 #define MLX5_GID_SIZE 16 +#define MLX5_IPOIB_PSEUDO_LEN 20 +#define MLX5_IPOIB_HARD_LEN (MLX5_IPOIB_PSEUDO_LEN + MLX5_IPOIB_ENCAP_LEN) static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, @@ -1048,6 +1050,7 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, struct sk_buff *skb) { struct net_device *netdev = rq->netdev; + char *pseudo_header; u8 *dgid; u8 g; @@ -1076,8 +1079,11 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, if (likely(netdev->features & NETIF_F_RXHASH)) 
mlx5e_skb_set_hash(cqe, skb); + /* 20 bytes of ipoib header and 4 for encap existing */ + pseudo_header = skb_push(skb, MLX5_IPOIB_PSEUDO_LEN); + memset(pseudo_header, 0, MLX5_IPOIB_PSEUDO_LEN); skb_reset_mac_header(skb); - skb_pull(skb, MLX5_IPOIB_ENCAP_LEN); + skb_pull(skb, MLX5_IPOIB_HARD_LEN); skb->dev = netdev; From 73dd3a4839c1d27c36d4dcc92e1ff44225ecbeb7 Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Thu, 23 Feb 2017 11:19:36 +0200 Subject: [PATCH 092/153] net/mlx5: Avoid using pending command interface slots Currently when firmware command gets stuck or it takes long time to complete, the driver command will get timeout and the command slot is freed and can be used for new commands, and if the firmware receive new command on the old busy slot its behavior is unexpected and this could be harmful. To fix this when the driver command gets timeout we return failure, but we don't free the command slot and we wait for the firmware to explicitly respond to that command. Once all the entries are busy we will stop processing new firmware commands. Fixes: 9cba4ebcf374 ('net/mlx5: Fix potential deadlock in command mode change') Signed-off-by: Mohamad Haj Yahia Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 41 ++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- .../net/ethernet/mellanox/mlx5/core/health.c | 2 +- include/linux/mlx5/driver.h | 7 +++- 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 5bdaf3d545b2..10d282841f5b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -774,7 +774,7 @@ static void cb_timeout_handler(struct work_struct *work) mlx5_core_warn(dev, "%s(0x%x) timeout. 
Will cause a leak of a command resource\n", mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); } static void cmd_work_handler(struct work_struct *work) @@ -804,6 +804,7 @@ static void cmd_work_handler(struct work_struct *work) } cmd->ent_arr[ent->idx] = ent; + set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); lay = get_inst(cmd, ent->idx); ent->lay = lay; memset(lay, 0, sizeof(*lay)); @@ -825,6 +826,20 @@ static void cmd_work_handler(struct work_struct *work) if (ent->callback) schedule_delayed_work(&ent->cb_timeout_work, cb_timeout); + /* Skip sending command to fw if internal error */ + if (pci_channel_offline(dev->pdev) || + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + u8 status = 0; + u32 drv_synd; + + ent->ret = mlx5_internal_err_ret_value(dev, msg_to_opcode(ent->in), &drv_synd, &status); + MLX5_SET(mbox_out, ent->out, status, status); + MLX5_SET(mbox_out, ent->out, syndrome, drv_synd); + + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); + return; + } + /* ring doorbell after the descriptor is valid */ mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx); wmb(); @@ -835,7 +850,7 @@ static void cmd_work_handler(struct work_struct *work) poll_timeout(ent); /* make sure we read the descriptor after ownership is SW */ rmb(); - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, (ent->ret == -ETIMEDOUT)); } } @@ -879,7 +894,7 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) wait_for_completion(&ent->done); } else if (!wait_for_completion_timeout(&ent->done, timeout)) { ent->ret = -ETIMEDOUT; - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); } err = ent->ret; @@ -1375,7 +1390,7 @@ static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg) } } -void mlx5_cmd_comp_handler(struct 
mlx5_core_dev *dev, u64 vec) +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced) { struct mlx5_cmd *cmd = &dev->cmd; struct mlx5_cmd_work_ent *ent; @@ -1395,6 +1410,19 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) struct semaphore *sem; ent = cmd->ent_arr[i]; + + /* if we already completed the command, ignore it */ + if (!test_and_clear_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, + &ent->state)) { + /* only real completion can free the cmd slot */ + if (!forced) { + mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n", + ent->idx); + free_ent(cmd, ent->idx); + } + continue; + } + if (ent->callback) cancel_delayed_work(&ent->cb_timeout_work); if (ent->page_queue) @@ -1417,7 +1445,10 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) mlx5_core_dbg(dev, "command completed. ret 0x%x, delivery status %s(0x%x)\n", ent->ret, deliv_status_to_str(ent->status), ent->status); } - free_ent(cmd, ent->idx); + + /* only real completion will free the entry slot */ + if (!forced) + free_ent(cmd, ent->idx); if (ent->callback) { ds = ent->ts2 - ent->ts1; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index ea5d8d37a75c..33eae5ad2fb0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -422,7 +422,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) break; case MLX5_EVENT_TYPE_CMD: - mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector)); + mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector), false); break; case MLX5_EVENT_TYPE_PORT_CHANGE: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index d0515391d33b..44f59b1d6f0f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -90,7 +90,7 @@ static void trigger_cmd_completions(struct 
mlx5_core_dev *dev) spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); mlx5_core_dbg(dev, "vector 0x%llx\n", vector); - mlx5_cmd_comp_handler(dev, vector); + mlx5_cmd_comp_handler(dev, vector, true); return; no_trig: diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index bcdf739ee41a..93273d9ea4d1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -787,7 +787,12 @@ enum { typedef void (*mlx5_cmd_cbk_t)(int status, void *context); +enum { + MLX5_CMD_ENT_STATE_PENDING_COMP, +}; + struct mlx5_cmd_work_ent { + unsigned long state; struct mlx5_cmd_msg *in; struct mlx5_cmd_msg *out; void *uout; @@ -976,7 +981,7 @@ void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); -void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec); +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced); void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type); int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, int nent, u64 mask, const char *name, From b665d98edc9ab295169be2fc5bb4e89a46de0a1a Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 18 May 2017 13:34:43 +0300 Subject: [PATCH 093/153] net/mlx5: Tolerate irq_set_affinity_hint() failures Add tolerance to failures of irq_set_affinity_hint(). Its role is to give hints that optimizes performance, and should not block the driver load. In non-SMP systems, functionality is not available as there is a single core, and all these calls definitely fail. Hence, do not call the function and avoid the warning prints. 
Fixes: db058a186f98 ("net/mlx5_core: Set irq affinity hints") Signed-off-by: Tariq Toukan Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 0c123d571b4c..fe5546bb4153 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -612,7 +612,6 @@ static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) struct mlx5_priv *priv = &mdev->priv; struct msix_entry *msix = priv->msix_arr; int irq = msix[i + MLX5_EQ_VEC_COMP_BASE].vector; - int err; if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { mlx5_core_warn(mdev, "zalloc_cpumask_var failed"); @@ -622,18 +621,12 @@ static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node), priv->irq_info[i].mask); - err = irq_set_affinity_hint(irq, priv->irq_info[i].mask); - if (err) { - mlx5_core_warn(mdev, "irq_set_affinity_hint failed,irq 0x%.4x", - irq); - goto err_clear_mask; - } +#ifdef CONFIG_SMP + if (irq_set_affinity_hint(irq, priv->irq_info[i].mask)) + mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq); +#endif return 0; - -err_clear_mask: - free_cpumask_var(priv->irq_info[i].mask); - return err; } static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i) From 7bd897cfce1eb373892d35d7f73201b0f9b221c4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 23 May 2017 17:28:36 +0300 Subject: [PATCH 094/153] block: fix an error code in add_partition() We don't set an error code on this path. It means that we return NULL instead of an error pointer and the caller does a NULL dereference. 
Fixes: 6d1d8050b4bc ("block, partition: add partition_meta_info to hd_struct") Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- block/partition-generic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index ff07b9143ca4..c5ec8246e25e 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -320,8 +320,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (info) { struct partition_meta_info *pinfo = alloc_part_info(disk); - if (!pinfo) + if (!pinfo) { + err = -ENOMEM; goto out_free_stats; + } memcpy(pinfo, info, sizeof(*info)); p->info = pinfo; } From 7f65b1f5adc5f8496ca8bec4947de66fefe36220 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Mon, 22 May 2017 14:50:30 +0200 Subject: [PATCH 095/153] cdc-ether: divorce initialisation with a filter reset and a generic method Some devices need their multicast filter reset but others are crashed by that. So the methods need to be separated. Signed-off-by: Oliver Neukum Reported-by: "Ridgway, Keith" Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 33 +++++++++++++++++++++++++-------- include/linux/usb/usbnet.h | 1 + 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index f3ae88fdf332..8ab281b478f2 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -310,13 +310,6 @@ int usbnet_generic_cdc_bind(struct usbnet *dev, struct usb_interface *intf) return -ENODEV; } - /* Some devices don't initialise properly. In particular - * the packet filter is not reset. There are devices that - * don't do reset all the way. So the packet filter should - * be set to a sane initial value. 
- */ - usbnet_cdc_update_filter(dev); - return 0; bad_desc: @@ -325,6 +318,30 @@ int usbnet_generic_cdc_bind(struct usbnet *dev, struct usb_interface *intf) } EXPORT_SYMBOL_GPL(usbnet_generic_cdc_bind); + +/* like usbnet_generic_cdc_bind() but handles filter initialization + * correctly + */ +int usbnet_ether_cdc_bind(struct usbnet *dev, struct usb_interface *intf) +{ + int rv; + + rv = usbnet_generic_cdc_bind(dev, intf); + if (rv < 0) + goto bail_out; + + /* Some devices don't initialise properly. In particular + * the packet filter is not reset. There are devices that + * don't do reset all the way. So the packet filter should + * be set to a sane initial value. + */ + usbnet_cdc_update_filter(dev); + +bail_out: + return rv; +} +EXPORT_SYMBOL_GPL(usbnet_ether_cdc_bind); + void usbnet_cdc_unbind(struct usbnet *dev, struct usb_interface *intf) { struct cdc_state *info = (void *) &dev->data; @@ -417,7 +434,7 @@ int usbnet_cdc_bind(struct usbnet *dev, struct usb_interface *intf) BUILD_BUG_ON((sizeof(((struct usbnet *)0)->data) < sizeof(struct cdc_state))); - status = usbnet_generic_cdc_bind(dev, intf); + status = usbnet_ether_cdc_bind(dev, intf); if (status < 0) return status; diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 7dffa5624ea6..97116379db5f 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -206,6 +206,7 @@ struct cdc_state { }; extern int usbnet_generic_cdc_bind(struct usbnet *, struct usb_interface *); +extern int usbnet_ether_cdc_bind(struct usbnet *dev, struct usb_interface *intf); extern int usbnet_cdc_bind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_unbind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_status(struct usbnet *, struct urb *); From 12e8b570e732eaa5eae3a2895ba3fbcf91bde2b4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 22 May 2017 20:13:07 +0200 Subject: [PATCH 096/153] mlx5: fix bug reading rss_hash_type from CQE Masks for 
extracting part of the Completion Queue Entry (CQE) field rss_hash_type were swapped, namely CQE_RSS_HTYPE_IP and CQE_RSS_HTYPE_L4. The bug resulted in setting skb->l4_hash, even though the rss_hash_type indicated that hash was NOT computed over the L4 (UDP or TCP) part of the packet. Added comments from the datasheet, to make it more clear what these masks are selecting. Signed-off-by: Jesper Dangaard Brouer Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index dd9a263ed368..a940ec6a046c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -787,8 +787,14 @@ enum { }; enum { - CQE_RSS_HTYPE_IP = 0x3 << 6, - CQE_RSS_HTYPE_L4 = 0x3 << 2, + CQE_RSS_HTYPE_IP = 0x3 << 2, + /* cqe->rss_hash_type[3:2] - IP destination selected for hash + * (00 = none, 01 = IPv4, 10 = IPv6, 11 = Reserved) + */ + CQE_RSS_HTYPE_L4 = 0x3 << 6, + /* cqe->rss_hash_type[7:6] - L4 destination selected for hash + * (00 = none, 01 = TCP. 10 = UDP, 11 = IPSEC.SPI + */ }; enum { From 223220356d5ebc05ead9a8d697abb0c0a906fc81 Mon Sep 17 00:00:00 2001 From: Richard Date: Sun, 21 May 2017 12:27:00 -0700 Subject: [PATCH 097/153] partitions/msdos: FreeBSD UFS2 file systems are not recognized The code in block/partitions/msdos.c recognizes FreeBSD, OpenBSD and NetBSD partitions and does a reasonable job picking out OpenBSD and NetBSD UFS subpartitions. But for FreeBSD the subpartitions are always "bad". 
Kernel: Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/msdos.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 93e7c1b32edd..5610cd537da7 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -300,6 +300,8 @@ static void parse_bsd(struct parsed_partitions *state, continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); + if (memcmp(flavour, "bsd\0", 4) == 0) + bsd_start += offset; if (offset == bsd_start && size == bsd_size) /* full parent partition, we have it already */ continue; From 6f4dbd149d2a151b89d1a5bbf7530ee5546c7908 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 11:33:16 +0200 Subject: [PATCH 098/153] libceph: use kbasename() and kill ceph_file_part() Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/ceph_debug.h | 6 +++--- net/ceph/ceph_common.c | 13 ------------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h index aa2e19182d99..51c5bd64bd00 100644 --- a/include/linux/ceph/ceph_debug.h +++ b/include/linux/ceph/ceph_debug.h @@ -3,6 +3,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include + #ifdef CONFIG_CEPH_LIB_PRETTYDEBUG /* @@ -12,12 +14,10 @@ */ # if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) -extern const char *ceph_file_part(const char *s, int len); # define dout(fmt, ...) \ pr_debug("%.*s %12.12s:%-4d : " fmt, \ 8 - (int)sizeof(KBUILD_MODNAME), " ", \ - ceph_file_part(__FILE__, sizeof(__FILE__)), \ - __LINE__, ##__VA_ARGS__) + kbasename(__FILE__), __LINE__, ##__VA_ARGS__) # else /* faux printk call just to see any compiler warnings. */ # define dout(fmt, ...) 
do { \ diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4fd02831beed..47e94b560ba0 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -56,19 +56,6 @@ static const struct kernel_param_ops param_ops_supported_features = { module_param_cb(supported_features, &param_ops_supported_features, NULL, S_IRUGO); -/* - * find filename portion of a path (/foo/bar/baz -> baz) - */ -const char *ceph_file_part(const char *s, int len) -{ - const char *e = s + len; - - while (e != s && *(e-1) != '/') - e--; - return e; -} -EXPORT_SYMBOL(ceph_file_part); - const char *ceph_msg_type_name(int type) { switch (type) { From 1759f7b0e3fab1d1882d7c680af9d12c5c111b0e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 11:38:17 +0200 Subject: [PATCH 099/153] libceph: make ceph_msg_data_advance() return void Both callers ignore the returned bool. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/messenger.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 5766a6c896c4..d7ab481b2508 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1174,8 +1174,8 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, * Returns true if the result moves the cursor on to the next piece * of the data item. 
*/ -static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, - size_t bytes) +static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) { bool new_piece; @@ -1207,8 +1207,6 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, new_piece = true; } cursor->need_crc = new_piece; - - return new_piece; } static size_t sizeof_footer(struct ceph_connection *con) @@ -1577,7 +1575,6 @@ static int write_partial_message_data(struct ceph_connection *con) size_t page_offset; size_t length; bool last_piece; - bool need_crc; int ret; page = ceph_msg_data_next(cursor, &page_offset, &length, @@ -1592,7 +1589,7 @@ static int write_partial_message_data(struct ceph_connection *con) } if (do_datacrc && cursor->need_crc) crc = ceph_crc32c_page(crc, page, page_offset, length); - need_crc = ceph_msg_data_advance(cursor, (size_t)ret); + ceph_msg_data_advance(cursor, (size_t)ret); } dout("%s %p msg %p done\n", __func__, con, msg); @@ -2299,7 +2296,7 @@ static int read_partial_msg_data(struct ceph_connection *con) if (do_datacrc) crc = ceph_crc32c_page(crc, page, page_offset, ret); - (void) ceph_msg_data_advance(cursor, (size_t)ret); + ceph_msg_data_advance(cursor, (size_t)ret); } if (do_datacrc) con->in_data_crc = crc; From f3b4e55ded9b3c52831a7d2ab9e511293c99fc11 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 11:59:22 +0200 Subject: [PATCH 100/153] libceph: drop version variable from ceph_monmap_decode() It's set but not used: CEPH_FEATURE_MONNAMES feature bit isn't advertised, which guarantees a v1 MonMap. 
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/mon_client.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 29a0ef351c5e..250f11f78609 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -43,15 +43,13 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end) int i, err = -EINVAL; struct ceph_fsid fsid; u32 epoch, num_mon; - u16 version; u32 len; ceph_decode_32_safe(&p, end, len, bad); ceph_decode_need(&p, end, len, bad); dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); - - ceph_decode_16_safe(&p, end, version, bad); + p += sizeof(u16); /* skip version */ ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); From d18a1247c4070390fc0c2d83d89a72afe921882e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 12:21:56 +0200 Subject: [PATCH 101/153] libceph: validate blob_struct_v in process_one_ticket() None of these are validated in userspace, but since we do validate reply_struct_v in ceph_x_proc_ticket_reply(), tkt_struct_v (first) and CephXServiceTicket struct_v (second) in process_one_ticket(), validate CephXTicketBlob struct_v as well. 
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/auth_x.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 2034fb926670..d0126df33f1f 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -215,6 +215,9 @@ static int process_one_ticket(struct ceph_auth_client *ac, dout(" ticket blob is %d bytes\n", dlen); ceph_decode_need(ptp, tpend, 1 + sizeof(u64), bad); blob_struct_v = ceph_decode_8(ptp); + if (blob_struct_v != 1) + goto bad; + new_secret_id = ceph_decode_64(ptp); ret = ceph_decode_buffer(&new_ticket_blob, ptp, tpend); if (ret) From b51456a6096ebf9f4ceb2cc7e176b471d4b70af0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 14:24:36 +0200 Subject: [PATCH 102/153] libceph: fix error handling in process_one_ticket() Don't leak key internals after new_session_key is populated. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/auth_x.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index d0126df33f1f..8757fb87dab8 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -151,7 +151,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, struct timespec validity; void *tp, *tpend; void **ptp; - struct ceph_crypto_key new_session_key; + struct ceph_crypto_key new_session_key = { 0 }; struct ceph_buffer *new_ticket_blob; unsigned long new_expires, new_renew_after; u64 new_secret_id; @@ -237,13 +237,13 @@ static int process_one_ticket(struct ceph_auth_client *ac, type, ceph_entity_type_name(type), th->secret_id, (int)th->ticket_blob->vec.iov_len); xi->have_keys |= th->service; - -out: - return ret; + return 0; bad: ret = -EINVAL; - goto out; +out: + ceph_crypto_key_destroy(&new_session_key); + return ret; } static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, From 293dffaad8d500e1a5336eeb90d544cf40d4fbd8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 23 May 2017 17:25:10 +0300 
Subject: [PATCH 103/153] libceph: NULL deref on crush_decode() error path If there is not enough space then ceph_decode_32_safe() does a goto bad. We need to return an error code in that situation. The current code returns ERR_PTR(0) which is NULL. The callers are not expecting that and it results in a NULL dereference. Fixes: f24e9980eb86 ("ceph: OSD client") Signed-off-by: Dan Carpenter Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index ffe9e904d4d1..55e3a477f92d 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -317,6 +317,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) u32 yes; struct crush_rule *r; + err = -EINVAL; ceph_decode_32_safe(p, end, yes, bad); if (!yes) { dout("crush_decode NO rule %d off %x %p to %p\n", From 4d071c3238987325b9e50e33051a40d1cce311cc Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 23 May 2017 14:18:17 -0500 Subject: [PATCH 104/153] PCI/PM: Add needs_resume flag to avoid suspend complete optimization Some drivers - like i915 - may not support the system suspend direct complete optimization due to differences in their runtime and system suspend sequence. Add a flag that when set resumes the device before calling the driver's system suspend handlers which effectively disables the optimization. Needed by a future patch fixing suspend/resume on i915. Suggested by Rafael. Signed-off-by: Imre Deak Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. 
Wysocki Cc: stable@vger.kernel.org --- drivers/pci/pci.c | 3 ++- include/linux/pci.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b01bd5bba8e6..563901cd9c06 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2144,7 +2144,8 @@ bool pci_dev_keep_suspended(struct pci_dev *pci_dev) if (!pm_runtime_suspended(dev) || pci_target_state(pci_dev) != pci_dev->current_state - || platform_pci_need_resume(pci_dev)) + || platform_pci_need_resume(pci_dev) + || (pci_dev->dev_flags & PCI_DEV_FLAGS_NEEDS_RESUME)) return false; /* diff --git a/include/linux/pci.h b/include/linux/pci.h index 33c2b0b77429..df7dd9021646 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -183,6 +183,11 @@ enum pci_dev_flags { PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT = (__force pci_dev_flags_t) (1 << 9), /* Do not use FLR even if device advertises PCI_AF_CAP */ PCI_DEV_FLAGS_NO_FLR_RESET = (__force pci_dev_flags_t) (1 << 10), + /* + * Resume before calling the driver's system suspend hooks, disabling + * the direct_complete optimization. + */ + PCI_DEV_FLAGS_NEEDS_RESUME = (__force pci_dev_flags_t) (1 << 11), }; enum pci_irq_reroute_variant { From 82bc9a42cf854fdf63155759c0aa790bd1f361b0 Mon Sep 17 00:00:00 2001 From: Patrik Jakobsson Date: Tue, 18 Apr 2017 13:43:32 +0200 Subject: [PATCH 105/153] drm/gma500/psb: Actually use VBT mode when it is found With LVDS we were incorrectly picking the pre-programmed mode instead of the preferred mode provided by VBT. Make sure we pick the VBT mode if one is provided. It is likely that the mode read-out code is still wrong but this patch fixes the immediate problem on most machines. 
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78562 Cc: Signed-off-by: Patrik Jakobsson Link: http://patchwork.freedesktop.org/patch/msgid/20170418114332.12183-1-patrik.r.jakobsson@gmail.com --- drivers/gpu/drm/gma500/psb_intel_lvds.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/gma500/psb_intel_lvds.c b/drivers/gpu/drm/gma500/psb_intel_lvds.c index 0066fe7e622e..be3eefec5152 100644 --- a/drivers/gpu/drm/gma500/psb_intel_lvds.c +++ b/drivers/gpu/drm/gma500/psb_intel_lvds.c @@ -759,20 +759,23 @@ void psb_intel_lvds_init(struct drm_device *dev, if (scan->type & DRM_MODE_TYPE_PREFERRED) { mode_dev->panel_fixed_mode = drm_mode_duplicate(dev, scan); + DRM_DEBUG_KMS("Using mode from DDC\n"); goto out; /* FIXME: check for quirks */ } } /* Failed to get EDID, what about VBT? do we need this? */ - if (mode_dev->vbt_mode) + if (dev_priv->lfp_lvds_vbt_mode) { mode_dev->panel_fixed_mode = - drm_mode_duplicate(dev, mode_dev->vbt_mode); + drm_mode_duplicate(dev, dev_priv->lfp_lvds_vbt_mode); - if (!mode_dev->panel_fixed_mode) - if (dev_priv->lfp_lvds_vbt_mode) - mode_dev->panel_fixed_mode = - drm_mode_duplicate(dev, - dev_priv->lfp_lvds_vbt_mode); + if (mode_dev->panel_fixed_mode) { + mode_dev->panel_fixed_mode->type |= + DRM_MODE_TYPE_PREFERRED; + DRM_DEBUG_KMS("Using mode from VBT\n"); + goto out; + } + } /* * If we didn't get EDID, try checking if the panel is already turned @@ -789,6 +792,7 @@ void psb_intel_lvds_init(struct drm_device *dev, if (mode_dev->panel_fixed_mode) { mode_dev->panel_fixed_mode->type |= DRM_MODE_TYPE_PREFERRED; + DRM_DEBUG_KMS("Using pre-programmed mode\n"); goto out; /* FIXME: check for quirks */ } } From 0a2ad541071f99eaf4589c3551176fca191c1ee2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 5 May 2017 18:47:37 +0800 Subject: [PATCH 106/153] libceph: cleanup old messages according to reconnect seq when reopen a connection, use 'reconnect seq' to clean up messages that have 
already been received by peer. Link: http://tracker.ceph.com/issues/18690 Signed-off-by: "Yan, Zheng" Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d7ab481b2508..588a91930051 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2228,10 +2228,18 @@ static void process_ack(struct ceph_connection *con) struct ceph_msg *m; u64 ack = le64_to_cpu(con->in_temp_ack); u64 seq; + bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ); + struct list_head *list = reconnect ? &con->out_queue : &con->out_sent; - while (!list_empty(&con->out_sent)) { - m = list_first_entry(&con->out_sent, struct ceph_msg, - list_head); + /* + * In the reconnect case, con_fault() has requeued messages + * in out_sent. We should cleanup old messages according to + * the reconnect seq. + */ + while (!list_empty(list)) { + m = list_first_entry(list, struct ceph_msg, list_head); + if (reconnect && m->needs_out_seq) + break; seq = le64_to_cpu(m->hdr.seq); if (seq > ack) break; @@ -2240,6 +2248,7 @@ static void process_ack(struct ceph_connection *con) m->ack_stamp = jiffies; ceph_msg_remove(m); } + prepare_read_tag(con); } From 42c99fc4c7069371da7b04b9099319dd1c633ee2 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 5 May 2017 18:28:44 +0100 Subject: [PATCH 107/153] ceph: check that the new inode size is within limits in ceph_fallocate() Currently the ceph client doesn't respect the rlimit in fallocate. This means that a user can allocate a file with size > RLIMIT_FSIZE. This patch adds the call to inode_newsize_ok() to verify filesystem limits and ulimits. This should make ceph successfully run xfstest generic/228. 
Signed-off-by: Luis Henriques Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3fdde0b283c9..29308a80d66f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1671,8 +1671,12 @@ static long ceph_fallocate(struct file *file, int mode, } size = i_size_read(inode); - if (!(mode & FALLOC_FL_KEEP_SIZE)) + if (!(mode & FALLOC_FL_KEEP_SIZE)) { endoff = offset + length; + ret = inode_newsize_ok(inode, endoff); + if (ret) + goto unlock; + } if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; From 3ab2137915aea0ce7b3ec02e0f260ecc0f1c289d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 23 May 2017 13:28:54 +0800 Subject: [PATCH 108/153] sctp: fix stream update when processing dupcookie Since commit 3dbcc105d556 ("sctp: alloc stream info when initializing asoc"), stream and stream.out info are always alloced when creating an asoc. So it's not correct to check !asoc->stream before updating stream info when processing dupcookie, but would be better to check asoc state instead. Fixes: 3dbcc105d556 ("sctp: alloc stream info when initializing asoc") Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- net/sctp/associola.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index a9708da28eb5..95238284c422 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1176,7 +1176,9 @@ void sctp_assoc_update(struct sctp_association *asoc, asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; - if (!asoc->stream) { + + if (sctp_state(asoc, COOKIE_WAIT)) { + sctp_stream_free(asoc->stream); asoc->stream = new->stream; new->stream = NULL; } From 7e06297768886337707f5833942b3bd524a6d3d5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 23 May 2017 13:28:55 +0800 Subject: [PATCH 109/153] sctp: set new_asoc temp when processing dupcookie After sctp changed to use transport hashtable, a transport would be added into global hashtable when adding the peer to an asoc, then the asoc can be got by searching the transport in the hashtable. The problem is when processing dupcookie in sctp_sf_do_5_2_4_dupcook, a new asoc would be created. A peer with the same addr and port as the one in the old asoc might be added into the new asoc, but fail to be added into the hashtable, as they also belong to the same sk. It causes that sctp's dupcookie processing can not really work. Since the new asoc will be freed after copying its information to the old asoc, it's more like a temp asoc. So this patch is to fix it by setting it as a temp asoc to avoid adding any of its transports into the hashtable and also avoid allocing assoc_id. An extra thing it has to do is to also alloc stream info for any temp asoc, as sctp dupcookie process needs it to update old asoc. But I don't think it would hurt something, as a temp asoc would always be freed after finishing processing cookie echo packet. Reported-by: Jianwen Ji Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- net/sctp/sm_make_chunk.c | 13 ++++--------- net/sctp/sm_statefuns.c | 3 +++ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 8a08f13469c4..92e332e17391 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2454,16 +2454,11 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, * stream sequence number shall be set to 0. */ - /* Allocate storage for the negotiated streams if it is not a temporary - * association. - */ - if (!asoc->temp) { - if (sctp_stream_init(asoc, gfp)) - goto clean_up; + if (sctp_stream_init(asoc, gfp)) + goto clean_up; - if (sctp_assoc_set_id(asoc, gfp)) - goto clean_up; - } + if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) + goto clean_up; /* ADDIP Section 4.1 ASCONF Chunk Procedures * diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 4f5e6cfc7f60..f863b5573e42 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2088,6 +2088,9 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net, } } + /* Set temp so that it won't be added into hashtable */ + new_asoc->temp = 1; + /* Compare the tie_tag in cookie with the verification tag of * current association. */ From 159a07604a99bd01e7db112de08d53dc4fcad109 Mon Sep 17 00:00:00 2001 From: Quentin Schulz Date: Tue, 23 May 2017 11:48:08 +0200 Subject: [PATCH 110/153] net: fec: add post PHY reset delay DT property Some PHY require to wait for a bit after the reset GPIO has been toggled. This adds support for the DT property `phy-reset-post-delay` which gives the delay in milliseconds to wait after reset. If the DT property is not given, no delay is observed. Post reset delay greater than 1000ms are invalid. Signed-off-by: Quentin Schulz Reviewed-by: Andrew Lunn Acked-by: Fugang Duan Signed-off-by: David S. 
Miller --- .../devicetree/bindings/net/fsl-fec.txt | 4 ++++ drivers/net/ethernet/freescale/fec_main.c | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/fsl-fec.txt b/Documentation/devicetree/bindings/net/fsl-fec.txt index a1e3693cca16..6f55bdd52f8a 100644 --- a/Documentation/devicetree/bindings/net/fsl-fec.txt +++ b/Documentation/devicetree/bindings/net/fsl-fec.txt @@ -15,6 +15,10 @@ Optional properties: - phy-reset-active-high : If present then the reset sequence using the GPIO specified in the "phy-reset-gpios" property is reversed (H=reset state, L=operation state). +- phy-reset-post-delay : Post reset delay in milliseconds. If present then + a delay of phy-reset-post-delay milliseconds will be observed after the + phy-reset-gpios has been toggled. Can be omitted thus no delay is + observed. Delay is in range of 1ms to 1000ms. Other delays are invalid. - phy-supply : regulator that powers the Ethernet PHY. - phy-handle : phandle to the PHY device connected to this device. - fixed-link : Assume a fixed link. See fixed-link.txt in the same directory. 
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 56a563f90b0b..f7c8649fd28f 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3192,7 +3192,7 @@ static int fec_reset_phy(struct platform_device *pdev) { int err, phy_reset; bool active_high = false; - int msec = 1; + int msec = 1, phy_post_delay = 0; struct device_node *np = pdev->dev.of_node; if (!np) @@ -3209,6 +3209,11 @@ static int fec_reset_phy(struct platform_device *pdev) else if (!gpio_is_valid(phy_reset)) return 0; + err = of_property_read_u32(np, "phy-reset-post-delay", &phy_post_delay); + /* valid reset duration should be less than 1s */ + if (!err && phy_post_delay > 1000) + return -EINVAL; + active_high = of_property_read_bool(np, "phy-reset-active-high"); err = devm_gpio_request_one(&pdev->dev, phy_reset, @@ -3226,6 +3231,15 @@ static int fec_reset_phy(struct platform_device *pdev) gpio_set_value_cansleep(phy_reset, !active_high); + if (!phy_post_delay) + return 0; + + if (phy_post_delay > 20) + msleep(phy_post_delay); + else + usleep_range(phy_post_delay * 1000, + phy_post_delay * 1000 + 1000); + return 0; } #else /* CONFIG_OF */ From 0ff50e83b5122e836ca492fefb11656b225ac29c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Tue, 23 May 2017 13:20:28 +0200 Subject: [PATCH 111/153] net: rtnetlink: bail out from rtnl_fdb_dump() on parse error rtnl_fdb_dump() failed to check the result of nlmsg_parse(), which led to contents of |ifm| being uninitialized because nlh->nlmsglen was too small to accommodate |ifm|. The uninitialized data may affect some branches and result in unwanted effects, although kernel data doesn't seem to leak to the userspace directly. The bug has been detected with KMSAN and syzkaller. 
For the record, here is the KMSAN report: ================================================================== BUG: KMSAN: use of unitialized memory in rtnl_fdb_dump+0x5dc/0x1000 CPU: 0 PID: 1039 Comm: probe Not tainted 4.11.0-rc5+ #2727 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x143/0x1b0 lib/dump_stack.c:52 kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:1007 __kmsan_warning_32+0x66/0xb0 mm/kmsan/kmsan_instr.c:491 rtnl_fdb_dump+0x5dc/0x1000 net/core/rtnetlink.c:3230 netlink_dump+0x84f/0x1190 net/netlink/af_netlink.c:2168 __netlink_dump_start+0xc97/0xe50 net/netlink/af_netlink.c:2258 netlink_dump_start ./include/linux/netlink.h:165 rtnetlink_rcv_msg+0xae9/0xb40 net/core/rtnetlink.c:4094 netlink_rcv_skb+0x339/0x5a0 net/netlink/af_netlink.c:2339 rtnetlink_rcv+0x83/0xa0 net/core/rtnetlink.c:4110 netlink_unicast_kernel net/netlink/af_netlink.c:1272 netlink_unicast+0x13b7/0x1480 net/netlink/af_netlink.c:1298 netlink_sendmsg+0x10b8/0x10f0 net/netlink/af_netlink.c:1844 sock_sendmsg_nosec net/socket.c:633 sock_sendmsg net/socket.c:643 ___sys_sendmsg+0xd4b/0x10f0 net/socket.c:1997 __sys_sendmsg net/socket.c:2031 SYSC_sendmsg+0x2c6/0x3f0 net/socket.c:2042 SyS_sendmsg+0x87/0xb0 net/socket.c:2038 do_syscall_64+0x102/0x150 arch/x86/entry/common.c:285 entry_SYSCALL64_slow_path+0x25/0x25 arch/x86/entry/entry_64.S:246 RIP: 0033:0x401300 RSP: 002b:00007ffc3b0e6d58 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002b0 RCX: 0000000000401300 RDX: 0000000000000000 RSI: 00007ffc3b0e6d80 RDI: 0000000000000003 RBP: 00007ffc3b0e6e00 R08: 000000000000000b R09: 0000000000000004 R10: 000000000000000d R11: 0000000000000246 R12: 0000000000000000 R13: 00000000004065a0 R14: 0000000000406630 R15: 0000000000000000 origin: 000000008fe00056 save_stack_trace+0x59/0x60 arch/x86/kernel/stacktrace.c:59 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:352 
kmsan_internal_poison_shadow+0xb1/0x1a0 mm/kmsan/kmsan.c:247 kmsan_poison_shadow+0x6d/0xc0 mm/kmsan/kmsan.c:260 slab_alloc_node mm/slub.c:2743 __kmalloc_node_track_caller+0x1f4/0x390 mm/slub.c:4349 __kmalloc_reserve net/core/skbuff.c:138 __alloc_skb+0x2cd/0x740 net/core/skbuff.c:231 alloc_skb ./include/linux/skbuff.h:933 netlink_alloc_large_skb net/netlink/af_netlink.c:1144 netlink_sendmsg+0x934/0x10f0 net/netlink/af_netlink.c:1819 sock_sendmsg_nosec net/socket.c:633 sock_sendmsg net/socket.c:643 ___sys_sendmsg+0xd4b/0x10f0 net/socket.c:1997 __sys_sendmsg net/socket.c:2031 SYSC_sendmsg+0x2c6/0x3f0 net/socket.c:2042 SyS_sendmsg+0x87/0xb0 net/socket.c:2038 do_syscall_64+0x102/0x150 arch/x86/entry/common.c:285 return_from_SYSCALL_64+0x0/0x6a arch/x86/entry/entry_64.S:246 ================================================================== and the reproducer: ================================================================== #include #include #include #include int main() { int sock = socket(PF_NETLINK, SOCK_DGRAM | SOCK_NONBLOCK, 0); struct msghdr msg; memset(&msg, 0, sizeof(msg)); char nlmsg_buf[32]; memset(nlmsg_buf, 0, sizeof(nlmsg_buf)); struct nlmsghdr *nlmsg = nlmsg_buf; nlmsg->nlmsg_len = 0x11; nlmsg->nlmsg_type = 0x1e; // RTM_NEWROUTE = RTM_BASE + 0x0e // type = 0x0e = 1110b // kind = 2 nlmsg->nlmsg_flags = 0x101; // NLM_F_ROOT | NLM_F_REQUEST nlmsg->nlmsg_seq = 0; nlmsg->nlmsg_pid = 0; nlmsg_buf[16] = (char)7; struct iovec iov; iov.iov_base = nlmsg_buf; iov.iov_len = 17; msg.msg_iov = &iov; msg.msg_iovlen = 1; sendmsg(sock, &msg, 0); return 0; } ================================================================== Signed-off-by: Alexander Potapenko Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 49a279a7cc15..9e2c0a7cb325 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3231,8 +3231,11 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int err = 0; int fidx = 0; - if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, - IFLA_MAX, ifla_policy, NULL) == 0) { + err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, NULL); + if (err < 0) { + return -EINVAL; + } else if (err == 0) { if (tb[IFLA_MASTER]) br_idx = nla_get_u32(tb[IFLA_MASTER]); } From cd47512e51190efc34a6b90d5c6b54de036ea421 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 23 May 2017 08:19:49 -0700 Subject: [PATCH 112/153] net/phy: fix mdio-octeon dependency and build Fix build errors by making this driver depend on OF_MDIO, like several other similar drivers do. drivers/built-in.o: In function `octeon_mdiobus_remove': mdio-octeon.c:(.text+0x196ee0): undefined reference to `mdiobus_unregister' mdio-octeon.c:(.text+0x196ee8): undefined reference to `mdiobus_free' drivers/built-in.o: In function `octeon_mdiobus_probe': mdio-octeon.c:(.text+0x196f1d): undefined reference to `devm_mdiobus_alloc_size' mdio-octeon.c:(.text+0x196ffe): undefined reference to `of_mdiobus_register' mdio-octeon.c:(.text+0x197010): undefined reference to `mdiobus_free' Signed-off-by: Randy Dunlap Cc: Andrew Lunn Cc: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 60ffc9da6a28..c360dd6ead22 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -108,7 +108,7 @@ config MDIO_MOXART config MDIO_OCTEON tristate "Octeon and some ThunderX SOCs MDIO buses" depends on 64BIT - depends on HAS_IOMEM + depends on HAS_IOMEM && OF_MDIO select MDIO_CAVIUM help This module provides a driver for the Octeon and ThunderX MDIO From f2899788353c13891412b273fdff5f02d49aa40f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 23 May 2017 17:49:13 +0200 Subject: [PATCH 113/153] net: phy: marvell: Limit errata to 88m1101 The 88m1101 has an errata when configuring autoneg. However, it was being applied to many other Marvell PHYs as well. Limit its scope to just the 88m1101. Fixes: 76884679c644 ("phylib: Add support for Marvell 88e1111S and 88e1145") Reported-by: Daniel Walker Signed-off-by: Andrew Lunn Acked-by: Harini Katakam Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/marvell.c | 66 ++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 272b051a0199..9097e42bec2e 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -255,34 +255,6 @@ static int marvell_config_aneg(struct phy_device *phydev) { int err; - /* The Marvell PHY has an errata which requires - * that certain registers get written in order - * to restart autonegotiation */ - err = phy_write(phydev, MII_BMCR, BMCR_RESET); - - if (err < 0) - return err; - - err = phy_write(phydev, 0x1d, 0x1f); - if (err < 0) - return err; - - err = phy_write(phydev, 0x1e, 0x200c); - if (err < 0) - return err; - - err = phy_write(phydev, 0x1d, 0x5); - if (err < 0) - return err; - - err = phy_write(phydev, 0x1e, 0); - if (err < 0) - return err; - - err = phy_write(phydev, 0x1e, 0x100); - if (err < 0) - return err; - err = marvell_set_polarity(phydev, phydev->mdix_ctrl); if (err < 0) return err; @@ -316,6 +288,42 @@ static int marvell_config_aneg(struct phy_device *phydev) return 0; } +static int m88e1101_config_aneg(struct phy_device *phydev) +{ + int err; + + /* This Marvell PHY has an errata which requires + * that certain registers get written in order + * to restart autonegotiation + */ + err = phy_write(phydev, MII_BMCR, BMCR_RESET); + + if (err < 0) + return err; + + err = phy_write(phydev, 0x1d, 0x1f); + if (err < 0) + return err; + + err = phy_write(phydev, 0x1e, 0x200c); + if (err < 0) + return err; + + err = phy_write(phydev, 0x1d, 0x5); + if (err < 0) + return err; + + err = phy_write(phydev, 0x1e, 0); + if (err < 0) + return err; + + err = phy_write(phydev, 0x1e, 0x100); + if (err < 0) + return err; + + return marvell_config_aneg(phydev); +} + static int m88e1111_config_aneg(struct phy_device *phydev) { int err; @@ -1892,7 +1900,7 @@ static struct phy_driver marvell_drivers[] = { .flags = PHY_HAS_INTERRUPT, .probe = 
marvell_probe, .config_init = &marvell_config_init, - .config_aneg = &marvell_config_aneg, + .config_aneg = &m88e1101_config_aneg, .read_status = &genphy_read_status, .ack_interrupt = &marvell_ack_interrupt, .config_intr = &marvell_config_intr, From b3c85a0fb2c79f2c945fa1305b39974d0acf3105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 10 May 2017 20:06:58 +0200 Subject: [PATCH 114/153] drm/amdgpu: fix fundamental suspend/resume issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reinitializing the VM manager during suspend/resume is a very very bad idea since all the VMs are still active and kicking. This can lead to random VM faults after resume when new processes are assigned the same client ID. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 22 +++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 1 + drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 15 ++------------- drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 15 ++------------- drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 15 ++------------- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 16 ++-------------- 6 files changed, 30 insertions(+), 54 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 07ff3b1514f1..1bf36c3542c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -672,6 +672,7 @@ void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub, struct amdgpu_vm_id_manager *id_mgr = &adev->vm_manager.id_mgr[vmhub]; struct amdgpu_vm_id *id = &id_mgr->ids[vmid]; + atomic64_set(&id->owner, 0); id->gds_base = 0; id->gds_size = 0; id->gws_base = 0; @@ -680,6 +681,26 @@ void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub, id->oa_size = 0; } +/** + * amdgpu_vm_reset_all_id - reset VMID to zero + * + * @adev: amdgpu 
device structure + * + * Reset VMID to force flush on next use + */ +void amdgpu_vm_reset_all_ids(struct amdgpu_device *adev) +{ + unsigned i, j; + + for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) { + struct amdgpu_vm_id_manager *id_mgr = + &adev->vm_manager.id_mgr[i]; + + for (j = 1; j < id_mgr->num_ids; ++j) + amdgpu_vm_reset_id(adev, i, j); + } +} + /** * amdgpu_vm_bo_find - find the bo_va for a specific vm & bo * @@ -2270,7 +2291,6 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev) for (i = 0; i < AMDGPU_MAX_RINGS; ++i) adev->vm_manager.seqno[i] = 0; - atomic_set(&adev->vm_manager.vm_pte_next_ring, 0); atomic64_set(&adev->vm_manager.client_counter, 0); spin_lock_init(&adev->vm_manager.prt_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index d97e28b4bdc4..e1d951ece433 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -204,6 +204,7 @@ int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring, int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job); void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub, unsigned vmid); +void amdgpu_vm_reset_all_ids(struct amdgpu_device *adev); int amdgpu_vm_update_directories(struct amdgpu_device *adev, struct amdgpu_vm *vm); int amdgpu_vm_clear_freed(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c index a572979f186c..d860939152df 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c @@ -950,10 +950,6 @@ static int gmc_v6_0_suspend(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->vm_manager.enabled) { - gmc_v6_0_vm_fini(adev); - adev->vm_manager.enabled = false; - } gmc_v6_0_hw_fini(adev); return 0; @@ -968,16 +964,9 @@ static int gmc_v6_0_resume(void *handle) if (r) return r; - if (!adev->vm_manager.enabled) { - r = gmc_v6_0_vm_init(adev); - 
if (r) { - dev_err(adev->dev, "vm manager initialization failed (%d).\n", r); - return r; - } - adev->vm_manager.enabled = true; - } + amdgpu_vm_reset_all_ids(adev); - return r; + return 0; } static bool gmc_v6_0_is_idle(void *handle) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c index a9083a16a250..2750e5c23813 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c @@ -1117,10 +1117,6 @@ static int gmc_v7_0_suspend(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->vm_manager.enabled) { - gmc_v7_0_vm_fini(adev); - adev->vm_manager.enabled = false; - } gmc_v7_0_hw_fini(adev); return 0; @@ -1135,16 +1131,9 @@ static int gmc_v7_0_resume(void *handle) if (r) return r; - if (!adev->vm_manager.enabled) { - r = gmc_v7_0_vm_init(adev); - if (r) { - dev_err(adev->dev, "vm manager initialization failed (%d).\n", r); - return r; - } - adev->vm_manager.enabled = true; - } + amdgpu_vm_reset_all_ids(adev); - return r; + return 0; } static bool gmc_v7_0_is_idle(void *handle) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index 4ac99784160a..f56b4089ee9f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -1209,10 +1209,6 @@ static int gmc_v8_0_suspend(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->vm_manager.enabled) { - gmc_v8_0_vm_fini(adev); - adev->vm_manager.enabled = false; - } gmc_v8_0_hw_fini(adev); return 0; @@ -1227,16 +1223,9 @@ static int gmc_v8_0_resume(void *handle) if (r) return r; - if (!adev->vm_manager.enabled) { - r = gmc_v8_0_vm_init(adev); - if (r) { - dev_err(adev->dev, "vm manager initialization failed (%d).\n", r); - return r; - } - adev->vm_manager.enabled = true; - } + amdgpu_vm_reset_all_ids(adev); - return r; + return 0; } static bool gmc_v8_0_is_idle(void *handle) diff --git 
a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index dc1e1c1d6b24..f936332a069d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -791,10 +791,6 @@ static int gmc_v9_0_suspend(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->vm_manager.enabled) { - gmc_v9_0_vm_fini(adev); - adev->vm_manager.enabled = false; - } gmc_v9_0_hw_fini(adev); return 0; @@ -809,17 +805,9 @@ static int gmc_v9_0_resume(void *handle) if (r) return r; - if (!adev->vm_manager.enabled) { - r = gmc_v9_0_vm_init(adev); - if (r) { - dev_err(adev->dev, - "vm manager initialization failed (%d).\n", r); - return r; - } - adev->vm_manager.enabled = true; - } + amdgpu_vm_reset_all_ids(adev); - return r; + return 0; } static bool gmc_v9_0_is_idle(void *handle) From 35d2f80b07bbe03fb358afb0bdeff7437a7d67ff Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 23 May 2017 13:38:41 -0400 Subject: [PATCH 115/153] vlan: Fix tcp checksum offloads in Q-in-Q vlans It appears that TCP checksum offloading has been broken for Q-in-Q vlans. The behavior was exacerbated by the series commit afb0bc972b52 ("Merge branch 'stacked_vlan_tso'") that enabled acceleration features on stacked vlans. However, even without that series, it is possible to trigger this issue. It just requires a lot more specialized configuration. The root cause is the interaction between how netdev_intersect_features() works, the features actually set on the vlan devices and HW having the ability to run checksum with longer headers. The issue starts when netdev_intersect_features() replaces NETIF_F_HW_CSUM with a combination of NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM, if the HW advertises IP|IPV6 specific checksums. This happens for tagged and multi-tagged packets. However, HW that enables IP|IPV6 checksum offloading doesn't guarantee that packets with arbitrarily long headers can be checksummed. 
This patch disables IP|IPV6 checksums on the packet for multi-tagged packets. CC: Toshiaki Makita CC: Michal Kubecek Signed-off-by: Vladislav Yasevich Acked-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 8d5fcd6284ce..283dc2f5364d 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -614,14 +614,16 @@ static inline bool skb_vlan_tagged_multi(const struct sk_buff *skb) static inline netdev_features_t vlan_features_check(const struct sk_buff *skb, netdev_features_t features) { - if (skb_vlan_tagged_multi(skb)) - features = netdev_intersect_features(features, - NETIF_F_SG | - NETIF_F_HIGHDMA | - NETIF_F_FRAGLIST | - NETIF_F_HW_CSUM | - NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); + if (skb_vlan_tagged_multi(skb)) { + /* In the case of multi-tagged packets, use a direct mask + * instead of using netdev_interesect_features(), to make + * sure that only devices supporting NETIF_F_HW_CSUM will + * have checksum offloading support. + */ + features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | + NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + } return features; } From cc6e9de62a7f84c9293a2ea41bc412b55bb46e85 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 23 May 2017 13:38:42 -0400 Subject: [PATCH 116/153] be2net: Fix offload features for Q-in-Q packets At least some of the be2net cards do not seem to be capable of performing checksum offload computations on Q-in-Q packets. In these cases, the received checksum on the remote is invalid and TCP syn packets are dropped. This patch adds a call to check disabled acceleration features on Q-in-Q tagged traffic. CC: Sathya Perla CC: Ajit Khaparde CC: Sriharsha Basavapatna CC: Somnath Kotur Signed-off-by: Vladislav Yasevich Signed-off-by: David S. 
Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index f3a09ab55900..4eee18ce9be4 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5078,9 +5078,11 @@ static netdev_features_t be_features_check(struct sk_buff *skb, struct be_adapter *adapter = netdev_priv(dev); u8 l4_hdr = 0; - /* The code below restricts offload features for some tunneled packets. + /* The code below restricts offload features for some tunneled and + * Q-in-Q packets. * Offload features for normal (non tunnel) packets are unchanged. */ + features = vlan_features_check(skb, features); if (!skb->encapsulation || !(adapter->flags & BE_FLAGS_VXLAN_OFFLOADS)) return features; From 2836b4f224d4fd7d1a2b23c3eecaf0f0ae199a74 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 23 May 2017 13:38:43 -0400 Subject: [PATCH 117/153] virtio-net: enable TSO/checksum offloads for Q-in-Q vlans Since virtio does not provide its own ndo_features_check handler, TSO, and now checksum offload, are disabled for stacked vlans. Re-enable the support and let the host take care of it. This restores/improves Guest-to-Guest performance over Q-in-Q vlans. Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: Vladislav Yasevich Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 9320d96a1632..3e9246cc49c3 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1989,6 +1989,7 @@ static const struct net_device_ops virtnet_netdev = { .ndo_poll_controller = virtnet_netpoll, #endif .ndo_xdp = virtnet_xdp, + .ndo_features_check = passthru_features_check, }; static void virtnet_config_changed_work(struct work_struct *work) From 0a646f331db0eb9efc8d3a95a44872036d441d58 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 11 May 2017 13:10:02 -0400 Subject: [PATCH 118/153] drm/amdgpu/ci: disable mclk switching for high refresh rates (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even if the vblank period would allow it, it still seems to be problematic on some cards. v2: fix logic inversion (Nils) bug: https://bugs.freedesktop.org/show_bug.cgi?id=96868 Cc: stable@vger.kernel.org Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/ci_dpm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/ci_dpm.c b/drivers/gpu/drm/amd/amdgpu/ci_dpm.c index 6dc1410b380f..ec93714e4524 100644 --- a/drivers/gpu/drm/amd/amdgpu/ci_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/ci_dpm.c @@ -906,6 +906,12 @@ static bool ci_dpm_vblank_too_short(struct amdgpu_device *adev) u32 vblank_time = amdgpu_dpm_get_vblank_time(adev); u32 switch_limit = adev->mc.vram_type == AMDGPU_VRAM_TYPE_GDDR5 ? 
450 : 300; + /* disable mclk switching if the refresh is >120Hz, even if the + * blanking period would allow it + */ + if (amdgpu_dpm_get_vrefresh(adev) > 120) + return true; + if (vblank_time < switch_limit) return true; else From 58d7e3e427db1bd68f33025519a9468140280a75 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 11 May 2017 13:14:14 -0400 Subject: [PATCH 119/153] drm/radeon/ci: disable mclk switching for high refresh rates (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even if the vblank period would allow it, it still seems to be problematic on some cards. v2: fix logic inversion (Nils) bug: https://bugs.freedesktop.org/show_bug.cgi?id=96868 Cc: stable@vger.kernel.org Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/ci_dpm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/radeon/ci_dpm.c b/drivers/gpu/drm/radeon/ci_dpm.c index 7ba450832e6b..ea36dc4dd5d2 100644 --- a/drivers/gpu/drm/radeon/ci_dpm.c +++ b/drivers/gpu/drm/radeon/ci_dpm.c @@ -776,6 +776,12 @@ bool ci_dpm_vblank_too_short(struct radeon_device *rdev) u32 vblank_time = r600_dpm_get_vblank_time(rdev); u32 switch_limit = pi->mem_gddr5 ? 450 : 300; + /* disable mclk switching if the refresh is >120Hz, even if the + * blanking period would allow it + */ + if (r600_dpm_get_vrefresh(rdev) > 120) + return true; + if (vblank_time < switch_limit) return true; else From 09be4a5219610a6fae3215d4f51f948d6f5d2609 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 11 May 2017 13:46:12 -0400 Subject: [PATCH 120/153] drm/amd/powerplay/smu7: add vblank check for mclk switching (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check to make sure the vblank period is long enough to support mclk switching. 
v2: drop needless initial assignment (Nils) bug: https://bugs.freedesktop.org/show_bug.cgi?id=96868 Cc: stable@vger.kernel.org Acked-by: Christian König Reviewed-by: Rex Zhu Signed-off-by: Alex Deucher --- .../gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c index a74a3db3056c..1445c51b6d05 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c @@ -2655,6 +2655,28 @@ static int smu7_get_power_state_size(struct pp_hwmgr *hwmgr) return sizeof(struct smu7_power_state); } +static int smu7_vblank_too_short(struct pp_hwmgr *hwmgr, + uint32_t vblank_time_us) +{ + struct smu7_hwmgr *data = (struct smu7_hwmgr *)(hwmgr->backend); + uint32_t switch_limit_us; + + switch (hwmgr->chip_id) { + case CHIP_POLARIS10: + case CHIP_POLARIS11: + case CHIP_POLARIS12: + switch_limit_us = data->is_memory_gddr5 ? 190 : 150; + break; + default: + switch_limit_us = data->is_memory_gddr5 ? 
450 : 150; + break; + } + + if (vblank_time_us < switch_limit_us) + return true; + else + return false; +} static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, struct pp_power_state *request_ps, @@ -2669,6 +2691,7 @@ static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, bool disable_mclk_switching; bool disable_mclk_switching_for_frame_lock; struct cgs_display_info info = {0}; + struct cgs_mode_info mode_info = {0}; const struct phm_clock_and_voltage_limits *max_limits; uint32_t i; struct smu7_hwmgr *data = (struct smu7_hwmgr *)(hwmgr->backend); @@ -2677,6 +2700,7 @@ static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, int32_t count; int32_t stable_pstate_sclk = 0, stable_pstate_mclk = 0; + info.mode_info = &mode_info; data->battery_state = (PP_StateUILabel_Battery == request_ps->classification.ui_label); @@ -2703,8 +2727,6 @@ static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, cgs_get_active_displays_info(hwmgr->device, &info); - /*TO DO result = PHM_CheckVBlankTime(hwmgr, &vblankTooShort);*/ - minimum_clocks.engineClock = hwmgr->display_config.min_core_set_clock; minimum_clocks.memoryClock = hwmgr->display_config.min_mem_set_clock; @@ -2769,8 +2791,9 @@ static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, PHM_PlatformCaps_DisableMclkSwitchingForFrameLock); - disable_mclk_switching = (1 < info.display_count) || - disable_mclk_switching_for_frame_lock; + disable_mclk_switching = ((1 < info.display_count) || + disable_mclk_switching_for_frame_lock || + smu7_vblank_too_short(hwmgr, mode_info.vblank_time_us)); sclk = smu7_ps->performance_levels[0].engine_clock; mclk = smu7_ps->performance_levels[0].memory_clock; From 2275a3a2fe9914ba6d76c8ea490da3c08342bd19 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 11 May 2017 13:57:41 -0400 Subject: [PATCH 121/153] drm/amd/powerplay/smu7: disable mclk switching for high refresh rates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Even if the vblank period would allow it, it still seems to be problematic on some cards. bug: https://bugs.freedesktop.org/show_bug.cgi?id=96868 Cc: stable@vger.kernel.org Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c index 1445c51b6d05..102eb6d029fa 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c @@ -2793,7 +2793,8 @@ static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, disable_mclk_switching = ((1 < info.display_count) || disable_mclk_switching_for_frame_lock || - smu7_vblank_too_short(hwmgr, mode_info.vblank_time_us)); + smu7_vblank_too_short(hwmgr, mode_info.vblank_time_us) || + (mode_info.refresh_rate > 120)); sclk = smu7_ps->performance_levels[0].engine_clock; mclk = smu7_ps->performance_levels[0].memory_clock; From 3d18e33735a02b1a90aecf14410bf3edbfd4d3dc Mon Sep 17 00:00:00 2001 From: Lyude Date: Thu, 11 May 2017 19:31:12 -0400 Subject: [PATCH 122/153] drm/radeon: Unbreak HPD handling for r600+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We end up reading the interrupt register for HPD5, and then writing it to HPD6 which on systems without anything using HPD5 results in permanently disabling hotplug on one of the display outputs after the first time we acknowledge a hotplug interrupt from the GPU. This code is really bad. But for now, let's just fix this. I will hopefully have a large patch series to refactor all of this soon. 
Reviewed-by: Christian König Signed-off-by: Lyude Cc: stable@vger.kernel.org Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/cik.c | 4 ++-- drivers/gpu/drm/radeon/evergreen.c | 4 ++-- drivers/gpu/drm/radeon/r600.c | 2 +- drivers/gpu/drm/radeon/si.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index ccebe0f8d2e1..008c145b7f29 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -7401,7 +7401,7 @@ static inline void cik_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.cik.disp_int_cont5 & DC_HPD6_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } @@ -7431,7 +7431,7 @@ static inline void cik_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.cik.disp_int_cont5 & DC_HPD6_RX_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_RX_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c index f130ec41ee4b..0bf103536404 100644 --- a/drivers/gpu/drm/radeon/evergreen.c +++ b/drivers/gpu/drm/radeon/evergreen.c @@ -4927,7 +4927,7 @@ static void evergreen_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.evergreen.disp_int_cont5 & DC_HPD6_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } @@ -4958,7 +4958,7 @@ static void evergreen_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.evergreen.disp_int_cont5 & DC_HPD6_RX_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_RX_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } diff 
--git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c index 0a085176e79b..e06e2d8feab3 100644 --- a/drivers/gpu/drm/radeon/r600.c +++ b/drivers/gpu/drm/radeon/r600.c @@ -3988,7 +3988,7 @@ static void r600_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.r600.disp_int_cont2 & DC_HPD6_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index ceee87f029d9..76d1888528e6 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -6317,7 +6317,7 @@ static inline void si_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.evergreen.disp_int_cont5 & DC_HPD6_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } @@ -6348,7 +6348,7 @@ static inline void si_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.evergreen.disp_int_cont5 & DC_HPD6_RX_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_RX_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } From 7c4378f4523d4af05b5941ea906e7032631eb753 Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Thu, 11 May 2017 18:22:17 +0800 Subject: [PATCH 123/153] drm/amdgpu: fix NULL pointer panic of emit_gds_switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ 338.384770] BUG: unable to handle kernel NULL pointer dereference at (null) [ 338.384817] IP: [< (null)>] (null) [ 338.385505] RIP: 0010:[<0000000000000000>] [< (null)>] (null) [ 338.385950] Call Trace: [ 338.385993] [] ? amdgpu_vm_flush+0x283/0x400 [amdgpu] [ 338.386025] [] ? 
printk+0x4d/0x4f [ 338.386074] [] amdgpu_ib_schedule+0x4a6/0x4d0 [amdgpu] [ 338.386140] [] amdgpu_job_run+0x64/0x180 [amdgpu] [ 338.386203] [] amd_sched_main+0x2e9/0x4a0 [amdgpu] [ 338.386232] [] ? prepare_to_wait_event+0x110/0x110 [ 338.386295] [] ? amd_sched_select_entity+0xe0/0xe0 [amdgpu] [ 338.386327] [] kthread+0xd3/0xf0 [ 338.386349] [] ? kthread_park+0x60/0x60 [ 338.386376] [] ret_from_fork+0x25/0x30 [ 338.386401] Code: Bad RIP value. [ 338.386420] RIP [< (null)>] (null) [ 338.386443] RSP [ 338.386458] CR2: 0000000000000000 [ 338.398508] ---[ end trace 4c66fcdc74b9a0a2 ]--- Signed-off-by: Chunming Zhou Reviewed-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 1bf36c3542c1..8ecf82c5fe74 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -634,7 +634,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job) mutex_unlock(&id_mgr->lock); } - if (gds_switch_needed) { + if (ring->funcs->emit_gds_switch && gds_switch_needed) { id->gds_base = job->gds_base; id->gds_size = job->gds_size; id->gws_base = job->gws_base; From 3083696a1ee68f4845f8e9a21b91e343ff25eff3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 23 May 2017 13:13:45 +0800 Subject: [PATCH 124/153] drm/amd/powerplay: fix a signedness bugs Smatch complains about a signedness bug here: vega10_hwmgr.c:4202 vega10_force_clock_level() warn: always true condition '(i >= 0) => (0-u32max >= 0)' Fixes: 7b52db39a4c2 ("drm/amd/powerplay: fix bug sclk/mclk level can't be set on vega10.") Signed-off-by: Dan Carpenter Reviewed-by: Eric Huang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c index ad30f5d3a10d..2614af2f553f 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c @@ -4186,7 +4186,7 @@ static int vega10_force_clock_level(struct pp_hwmgr *hwmgr, enum pp_clock_type type, uint32_t mask) { struct vega10_hwmgr *data = (struct vega10_hwmgr *)(hwmgr->backend); - uint32_t i; + int i; if (hwmgr->dpm_level != AMD_DPM_FORCED_LEVEL_MANUAL) return -EINVAL; From a4d768e702de224cc85e0c8eac9311763403b368 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 22 May 2017 19:54:10 -0700 Subject: [PATCH 125/153] xfs: fix unaligned access in xfs_btree_visit_blocks This structure copy was throwing unaligned access warnings on sparc64: Kernel unaligned access at TPC[1043c088] xfs_btree_visit_blocks+0x88/0xe0 [xfs] xfs_btree_copy_ptrs does a memcpy, which avoids it. Signed-off-by: Eric Sandeen Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5392674bf893..3a673ba201aa 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4395,7 +4395,7 @@ xfs_btree_visit_blocks( xfs_btree_readahead_ptr(cur, ptr, 1); /* save for the next iteration of the loop */ - lptr = *ptr; + xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); } /* for each buffer in the level */ From 8affebe16d79ebefb1d9d6d56a46dc89716f9453 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 23 May 2017 08:30:46 -0700 Subject: [PATCH 126/153] xfs: fix off-by-one on max nr_pages in xfs_find_get_desired_pgoff() xfs_find_get_desired_pgoff() is used to search for offset of hole or data in page range [index, end] (both inclusive), and the max number of pages to search should be at least one, if end == index. 
Otherwise the only page is missed and no hole or data is found, which is not correct. When block size is smaller than page size, this can be demonstrated by preallocating a file with size smaller than page size and writing data to the last block. E.g. run this xfs_io command on a 1k block size XFS on x86_64 host. # xfs_io -fc "falloc 0 3k" -c "pwrite 2k 1k" \ -c "seek -d 0" /mnt/xfs/testfile wrote 1024/1024 bytes at offset 2048 1 KiB, 1 ops; 0.0000 sec (33.675 MiB/sec and 34482.7586 ops/sec) Whence Result DATA EOF Data at offset 2k was missed, and lseek(2) returned ENXIO. This is uncovered by generic/285 subtest 07 and 08 on ppc64 host, where pagesize is 64k. Because a recent change to generic/285 reduced the preallocated file size to smaller than 64k. Cc: stable@vger.kernel.org # v3.7+ Signed-off-by: Eryu Guan Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 35703a801372..aefa2134a8cb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1049,7 +1049,7 @@ xfs_find_get_desired_pgoff( unsigned nr_pages; unsigned int i; - want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); /* From 5375023ae1266553a7baa0845e82917d8803f48c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 May 2017 16:36:22 -0700 Subject: [PATCH 127/153] xfs: Fix missed holes in SEEK_HOLE implementation XFS SEEK_HOLE implementation could miss a hole in an unwritten extent as can be seen by the following command: xfs_io -c "falloc 0 256k" -c "pwrite 0 56k" -c "pwrite 128k 8k" -c "seek -h 0" file wrote 57344/57344 bytes at offset 0 56 KiB, 14 ops; 0.0000 sec (49.312 MiB/sec and 12623.9856 ops/sec) wrote 8192/8192 bytes at offset 131072 8 KiB, 2 ops; 0.0000 sec (70.383 MiB/sec and 18018.0180 ops/sec) Whence Result 
HOLE 139264 Where we can see that hole at offset 56k was just ignored by SEEK_HOLE implementation. The bug is in xfs_find_get_desired_pgoff() which does not properly detect the case when pages are not contiguous. Fix the problem by properly detecting when found page has larger offset than expected. CC: stable@vger.kernel.org Fixes: d126d43f631f996daeee5006714fed914be32368 Signed-off-by: Jan Kara Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aefa2134a8cb..f1517e9928c7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1076,17 +1076,6 @@ xfs_find_get_desired_pgoff( break; } - /* - * At lease we found one page. If this is the first time we - * step into the loop, and if the first page index offset is - * greater than the given search offset, a hole was found. - */ - if (type == HOLE_OFF && lastoff == startoff && - lastoff < page_offset(pvec.pages[0])) { - found = true; - break; - } - for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; loff_t b_offset; @@ -1098,18 +1087,18 @@ xfs_find_get_desired_pgoff( * file mapping. However, page->index will not change * because we have a reference on the page. * - * Searching done if the page index is out of range. - * If the current offset is not reaches the end of - * the specified search range, there should be a hole - * between them. + * If current page offset is beyond where we've ended, + * we've found a hole. */ - if (page->index > end) { - if (type == HOLE_OFF && lastoff < endoff) { - *offset = lastoff; - found = true; - } + if (type == HOLE_OFF && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { + found = true; + *offset = lastoff; goto out; } + /* Searching done if the page index is out of range. 
*/ + if (page->index > end) + goto out; lock_page(page); /* From d7fd24257aa60316bf81093f7f909dc9475ae974 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 May 2017 16:36:23 -0700 Subject: [PATCH 128/153] xfs: Fix off-by-in in loop termination in xfs_find_get_desired_pgoff() There is an off-by-one error in loop termination conditions in xfs_find_get_desired_pgoff() since 'end' may index a page beyond end of desired range if 'endoff' is page aligned. It doesn't have any visible effects but still it is good to fix it. Signed-off-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f1517e9928c7..dc0e4cb7029b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1043,7 +1043,7 @@ xfs_find_get_desired_pgoff( index = startoff >> PAGE_SHIFT; endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); - end = endoff >> PAGE_SHIFT; + end = (endoff - 1) >> PAGE_SHIFT; do { int want; unsigned nr_pages; From a54fba8f5a0dc36161cacdf2aa90f007f702ec1a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 May 2017 16:36:24 -0700 Subject: [PATCH 129/153] xfs: Move handling of missing page into one place in xfs_find_get_desired_pgoff() Currently several places in xfs_find_get_desired_pgoff() handle the case of a missing page. Make them all handled in one place after the loop has terminated. Signed-off-by: Jan Kara Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_file.c | 38 ++++++++------------------------------ 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index dc0e4cb7029b..5fb5a0958a14 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1052,29 +1052,8 @@ xfs_find_get_desired_pgoff( want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); - /* - * No page mapped into given range. If we are searching holes - * and if this is the first time we got into the loop, it means - * that the given offset is landed in a hole, return it. - * - * If we have already stepped through some block buffers to find - * holes but they all contains data. In this case, the last - * offset is already updated and pointed to the end of the last - * mapped page, if it does not reach the endpoint to search, - * that means there should be a hole between them. - */ - if (nr_pages == 0) { - /* Data search found nothing */ - if (type == DATA_OFF) - break; - - ASSERT(type == HOLE_OFF); - if (lastoff == startoff || lastoff < endoff) { - found = true; - *offset = lastoff; - } + if (nr_pages == 0) break; - } for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1140,21 +1119,20 @@ xfs_find_get_desired_pgoff( /* * The number of returned pages less than our desired, search - * done. In this case, nothing was found for searching data, - * but we found a hole behind the last offset. + * done. */ - if (nr_pages < want) { - if (type == HOLE_OFF) { - *offset = lastoff; - found = true; - } + if (nr_pages < want) break; - } index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + /* No page at lastoff and we are not done - we found a hole. 
*/ + if (type == HOLE_OFF && lastoff < endoff) { + *offset = lastoff; + found = true; + } out: pagevec_release(&pvec); return found; From 11387fe4a98f75d1f4cdb3efe3b42b19205c9df5 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Tue, 23 May 2017 18:37:27 -0400 Subject: [PATCH 130/153] geneve: fix fill_info when using collect_metadata Since 9b4437a5b870 ("geneve: Unify LWT and netdev handling.") fill_info does not return UDP_ZERO_CSUM6_RX when using COLLECT_METADATA. This is because it uses ip_tunnel_info_af() with the device level info, which is not valid for COLLECT_METADATA. Fix by checking for the presence of the actual sockets. Fixes: 9b4437a5b870 ("geneve: Unify LWT and netdev handling.") Signed-off-by: Eric Garver Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/geneve.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index dec5d563ab19..959fd12d2e67 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1293,7 +1293,7 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u32(skb, IFLA_GENEVE_ID, vni)) goto nla_put_failure; - if (ip_tunnel_info_af(info) == AF_INET) { + if (rtnl_dereference(geneve->sock4)) { if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE, info->key.u.ipv4.dst)) goto nla_put_failure; @@ -1302,8 +1302,10 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(info->key.tun_flags & TUNNEL_CSUM))) goto nla_put_failure; + } + #if IS_ENABLED(CONFIG_IPV6) - } else { + if (rtnl_dereference(geneve->sock6)) { if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6, &info->key.u.ipv6.dst)) goto nla_put_failure; @@ -1315,8 +1317,8 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, !geneve->use_udp6_rx_checksums)) goto nla_put_failure; -#endif } +#endif if (nla_put_u8(skb, IFLA_GENEVE_TTL, info->key.ttl) || 
nla_put_u8(skb, IFLA_GENEVE_TOS, info->key.tos) || From b62ce397675502325d4282924bf70cfb6a005c3a Mon Sep 17 00:00:00 2001 From: Rex Zhu Date: Mon, 22 May 2017 13:11:41 +0800 Subject: [PATCH 131/153] drm/amdgpu: fix null point error when rmmod amdgpu. this bug happened when amdgpu load failed. [ 75.740951] BUG: unable to handle kernel paging request at 00000000000031c0 [ 75.748167] IP: [] amdgpu_fbdev_restore_mode+0x20/0x60 [amdgpu] [ 75.755774] PGD 0 [ 75.759185] Oops: 0000 [#1] SMP [ 75.762408] Modules linked in: amdgpu(OE-) ttm(OE) drm_kms_helper(OE) drm(OE) i2c_algo_bit(E) fb_sys_fops(E) syscopyarea(E) sysfillrect(E) sysimgblt(E) rpcsec_gss_krb5(E) nfsv4(E) nfs(E) fscache(E) eeepc_wmi(E) asus_wmi(E) sparse_keymap(E) intel_rapl(E) snd_hda_codec_hdmi(E) snd_hda_codec_realtek(E) snd_hda_codec_generic(E) snd_hda_intel(E) snd_hda_codec(E) snd_hda_core(E) x86_pkg_temp_thermal(E) intel_powerclamp(E) snd_hwdep(E) snd_pcm(E) snd_seq_midi(E) coretemp(E) kvm_intel(E) snd_seq_midi_event(E) snd_rawmidi(E) kvm(E) snd_seq(E) joydev(E) snd_seq_device(E) snd_timer(E) irqbypass(E) crct10dif_pclmul(E) crc32_pclmul(E) mei_me(E) ghash_clmulni_intel(E) snd(E) aesni_intel(E) mei(E) soundcore(E) aes_x86_64(E) shpchp(E) serio_raw(E) lrw(E) acpi_pad(E) gf128mul(E) glue_helper(E) ablk_helper(E) mac_hid(E) [ 75.835574] cryptd(E) parport_pc(E) ppdev(E) lp(E) nfsd(E) parport(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E) sunrpc(E) autofs4(E) hid_generic(E) usbhid(E) mxm_wmi(E) psmouse(E) e1000e(E) ptp(E) pps_core(E) ahci(E) libahci(E) wmi(E) video(E) i2c_hid(E) hid(E) [ 75.858489] CPU: 5 PID: 1603 Comm: rmmod Tainted: G OE 4.9.0-custom #2 [ 75.866183] Hardware name: System manufacturer System Product Name/Z170-A, BIOS 0901 08/31/2015 [ 75.875050] task: ffff88045d1bbb80 task.stack: ffffc90002de4000 [ 75.881094] RIP: 0010:[] [] amdgpu_fbdev_restore_mode+0x20/0x60 [amdgpu] [ 75.891238] RSP: 0018:ffffc90002de7d48 EFLAGS: 00010286 [ 75.896648] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 
0000000000000001 [ 75.903933] RDX: 0000000000000000 RSI: ffff88045d1bbb80 RDI: 0000000000000286 [ 75.911183] RBP: ffffc90002de7d50 R08: 0000000000000502 R09: 0000000000000004 [ 75.918449] R10: 0000000000000000 R11: 0000000000000001 R12: ffff880464bf0000 [ 75.925675] R13: ffffffffa0853000 R14: 0000000000000000 R15: 0000564e44f88210 [ 75.932980] FS: 00007f13d5400700(0000) GS:ffff880476540000(0000) knlGS:0000000000000000 [ 75.941238] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 75.947088] CR2: 00000000000031c0 CR3: 000000045fd0b000 CR4: 00000000003406e0 [ 75.954332] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 75.961566] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 75.968834] Stack: [ 75.970881] ffff880464bf0000 ffffc90002de7d60 ffffffffa0636592 ffffc90002de7d80 [ 75.978454] ffffffffa059015f ffff880464bf0000 ffff880464bf0000 ffffc90002de7da8 [ 75.986076] ffffffffa0595216 ffff880464bf0000 ffff880460f4d000 ffffffffa0853000 [ 75.993692] Call Trace: [ 75.996177] [] amdgpu_driver_lastclose_kms+0x12/0x20 [amdgpu] [ 76.003700] [] drm_lastclose+0x2f/0xd0 [drm] [ 76.009777] [] drm_dev_unregister+0x16/0xd0 [drm] [ 76.016255] [] drm_put_dev+0x34/0x70 [drm] [ 76.022139] [] amdgpu_pci_remove+0x15/0x20 [amdgpu] [ 76.028800] [] pci_device_remove+0x39/0xc0 [ 76.034661] [] __device_release_driver+0x9a/0x140 [ 76.041121] [] driver_detach+0xb8/0xc0 [ 76.046575] [] bus_remove_driver+0x55/0xd0 [ 76.052401] [] driver_unregister+0x2c/0x50 [ 76.058244] [] pci_unregister_driver+0x29/0x90 [ 76.064466] [] drm_pci_exit+0x9e/0xb0 [drm] [ 76.070507] [] amdgpu_exit+0x1c/0x32 [amdgpu] [ 76.076609] [] SyS_delete_module+0x1a0/0x200 [ 76.082627] [] ? 
rcu_eqs_enter.isra.36+0x4a/0x50 [ 76.089001] [] do_syscall_64+0x6e/0x180 [ 76.094583] [] entry_SYSCALL64_slow_path+0x25/0x25 [ 76.101114] Code: 94 c0 c3 31 c0 5d c3 0f 1f 40 00 0f 1f 44 00 00 55 31 c0 48 89 e5 53 48 89 fb 48 c7 c7 1d 21 84 a0 e8 ab 77 b3 e0 e8 fc 8b d7 e0 <48> 8b bb c0 31 00 00 48 85 ff 74 09 e8 ff eb fc ff 85 c0 75 03 [ 76.121432] RIP [] amdgpu_fbdev_restore_mode+0x20/0x60 [amdgpu] Signed-off-by: Rex Zhu Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c index 236d9950221b..c0d8c6ff6380 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c @@ -425,10 +425,15 @@ bool amdgpu_fbdev_robj_is_fb(struct amdgpu_device *adev, struct amdgpu_bo *robj) void amdgpu_fbdev_restore_mode(struct amdgpu_device *adev) { - struct amdgpu_fbdev *afbdev = adev->mode_info.rfbdev; + struct amdgpu_fbdev *afbdev; struct drm_fb_helper *fb_helper; int ret; + if (!adev) + return; + + afbdev = adev->mode_info.rfbdev; + if (!afbdev) return; From 65d786c21bf8140dac83563306f46fe0b13a9aaa Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 23 May 2017 18:18:37 -0500 Subject: [PATCH 132/153] net: fix potential null pointer dereference Add null check to avoid a potential null pointer dereference. Addresses-Coverity-ID: 1408831 Signed-off-by: Gustavo A. R. Silva Acked-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- drivers/net/gtp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 4fea1b3dfbb4..7b652bb7ebe4 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -873,7 +873,7 @@ static struct gtp_dev *gtp_find_dev(struct net *src_net, struct nlattr *nla[]) /* Check if there's an existing gtpX device to configure */ dev = dev_get_by_index_rcu(net, nla_get_u32(nla[GTPA_LINK])); - if (dev->netdev_ops == &gtp_netdev_ops) + if (dev && dev->netdev_ops == &gtp_netdev_ops) gtp = netdev_priv(dev); put_net(net); From 7c3f1875c66fbc19762760097cabc91849ea0bbb Mon Sep 17 00:00:00 2001 From: Roman Kapl Date: Wed, 24 May 2017 10:22:22 +0200 Subject: [PATCH 133/153] net: move somaxconn init from sysctl code The default value for somaxconn is set in sysctl_core_net_init(), but this function is not called when kernel is configured without CONFIG_SYSCTL. This results in the kernel not being able to accept TCP connections, because the backlog has zero size. Usually, the user ends up with: "TCP: request_sock_TCP: Possible SYN flooding on port 7. Dropping request. Check SNMP counters." If SYN cookies are not enabled the connection is rejected. Before ef547f2ac16 (tcp: remove max_qlen_log), the effects were less severe, because the backlog was always at least eight slots long. Signed-off-by: Roman Kapl Signed-off-by: David S. 
Miller --- net/core/net_namespace.c | 19 +++++++++++++++++++ net/core/sysctl_net_core.c | 2 -- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1934efd4a9d4..26bbfababff2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -315,6 +315,25 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) goto out; } +static int __net_init net_defaults_init_net(struct net *net) +{ + net->core.sysctl_somaxconn = SOMAXCONN; + return 0; +} + +static struct pernet_operations net_defaults_ops = { + .init = net_defaults_init_net, +}; + +static __init int net_defaults_init(void) +{ + if (register_pernet_subsys(&net_defaults_ops)) + panic("Cannot initialize net default settings"); + + return 0; +} + +core_initcall(net_defaults_init); #ifdef CONFIG_NET_NS static struct ucounts *inc_net_namespaces(struct user_namespace *ns) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index ea23254b2457..b7cd9aafe99e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -479,8 +479,6 @@ static __net_init int sysctl_core_net_init(struct net *net) { struct ctl_table *tbl; - net->core.sysctl_somaxconn = SOMAXCONN; - tbl = netns_core_table; if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); From ba615f675281d76fd19aa03558777f81fb6b6084 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 24 May 2017 09:59:31 -0700 Subject: [PATCH 134/153] tcp: avoid fastopen API to be used on AF_UNSPEC Fastopen API should be used to perform fastopen operations on the TCP socket. It does not make sense to use fastopen API to perform disconnect by calling it with AF_UNSPEC. The fastopen data path is also prone to race conditions and bugs when using with AF_UNSPEC. 
One issue reported and analyzed by Vegard Nossum is as follows: +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Thread A: Thread B: ------------------------------------------------------------------------ sendto() - tcp_sendmsg() - sk_stream_memory_free() = 0 - goto wait_for_sndbuf - sk_stream_wait_memory() - sk_wait_event() // sleep | sendto(flags=MSG_FASTOPEN, dest_addr=AF_UNSPEC) | - tcp_sendmsg() | - tcp_sendmsg_fastopen() | - __inet_stream_connect() | - tcp_disconnect() //because of AF_UNSPEC | - tcp_transmit_skb()// send RST | - return 0; // no reconnect! | - sk_stream_wait_connect() | - sock_error() | - xchg(&sk->sk_err, 0) | - return -ECONNRESET - ... // wake up, see sk->sk_err == 0 - skb_entail() on TCP_CLOSE socket If the connection is reopened then we will send a brand new SYN packet after thread A has already queued a buffer. At this point I think the socket internal state (sequence numbers etc.) becomes messed up. When the new connection is closed, the FIN-ACK is rejected because the sequence number is outside the window. The other side tries to retransmit, but __tcp_retransmit_skb() calls tcp_trim_head() on an empty skb which corrupts the skb data length and hits a BUG() in copy_and_csum_bits(). +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Hence, this patch adds a check for AF_UNSPEC in the fastopen data path and return EOPNOTSUPP to user if such case happens. Fixes: cf60af03ca4e7 ("tcp: Fast Open client - sendmsg(MSG_FASTOPEN)") Reported-by: Vegard Nossum Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 842b575f8fdd..59792d283ff8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1084,9 +1084,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); + struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) + if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && + uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ @@ -1108,7 +1111,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, } } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; - err = __inet_stream_connect(sk->sk_socket, msg->msg_name, + err = __inet_stream_connect(sk->sk_socket, uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst From 5990baaa6d7b437dfcf58b7021ca56b1d6b35869 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Wed, 24 May 2017 15:19:35 -0700 Subject: [PATCH 135/153] arp: fixed -Wuninitialized compiler warning Commit 7d472a59c0e5ec117220a05de6b370447fb6cb66 ("arp: always override existing neigh entries with gratuitous ARP") introduced a compiler warning: net/ipv4/arp.c:880:35: warning: 'addr_type' may be used uninitialized in this function [-Wmaybe-uninitialized] While the code logic seems to be correct and doesn't allow the variable to be used uninitialized, and the warning is not consistently reproducible, it's still worth fixing it for other people not to waste time looking at the warning in case it pops up in the build environment. Yes, compiler is probably at fault, but we will need to accommodate. 
Fixes: 7d472a59c0e5 ("arp: always override existing neigh entries with gratuitous ARP") Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. Miller --- net/ipv4/arp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index ae96e6f3e0cb..e9f3386a528b 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -863,8 +863,8 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); + addr_type = -1; if (n || IN_DEV_ARP_ACCEPT(in_dev)) { - addr_type = -1; is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } From 1ad2f5838d345e1c102bd1cd27c4f4c1349b0dc8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:05 +0200 Subject: [PATCH 136/153] bpf: fix incorrect pruning decision when alignment must be tracked Currently, when we enforce alignment tracking on direct packet access, the verifier lets the following program pass despite doing a packet write with unaligned access: 0: (61) r2 = *(u32 *)(r1 +76) 1: (61) r3 = *(u32 *)(r1 +80) 2: (61) r7 = *(u32 *)(r1 +8) 3: (bf) r0 = r2 4: (07) r0 += 14 5: (25) if r7 > 0x1 goto pc+4 R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 6: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=14,r=14) R1=ctx R2=pkt(id=0,off=0,r=14) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 7: (63) *(u32 *)(r0 -4) = r0 8: (b7) r0 = 0 9: (95) exit from 6 to 8: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 8: (b7) r0 = 0 9: (95) exit from 5 to 10: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=2 R10=fp 10: (07) r0 += 1 11: (05) goto pc-6 6: safe <----- here, wrongly found safe processed 15 insns However, if we enforce a pruning mismatch by adding state into r8 which is then being mismatched in states_equal(), we find that for the otherwise same program, the verifier 
detects a misaligned packet access when actually walking that path: 0: (61) r2 = *(u32 *)(r1 +76) 1: (61) r3 = *(u32 *)(r1 +80) 2: (61) r7 = *(u32 *)(r1 +8) 3: (b7) r8 = 1 4: (bf) r0 = r2 5: (07) r0 += 14 6: (25) if r7 > 0x1 goto pc+4 R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 7: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=14,r=14) R1=ctx R2=pkt(id=0,off=0,r=14) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 8: (63) *(u32 *)(r0 -4) = r0 9: (b7) r0 = 0 10: (95) exit from 7 to 9: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 9: (b7) r0 = 0 10: (95) exit from 6 to 11: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=2 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 11: (07) r0 += 1 12: (b7) r8 = 0 13: (05) goto pc-7 <----- mismatch due to r8 7: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=15,r=15) R1=ctx R2=pkt(id=0,off=0,r=15) R3=pkt_end R7=inv,min_value=2 R8=imm0,min_value=0,max_value=0,min_align=2147483648 R10=fp 8: (63) *(u32 *)(r0 -4) = r0 misaligned packet access off 2+15+-4 size 4 The reason why we fail to see it in states_equal() is that the third test in compare_ptrs_to_packet() ... if (old->off <= cur->off && old->off >= old->range && cur->off >= cur->range) return true; ... will let the above pass. The situation we run into is that old->off <= cur->off (14 <= 15), meaning that prior walked paths went with smaller offset, which was later used in the packet access after successful packet range check and found to be safe already. For example: Given is R0=pkt(id=0,off=0,r=0). Adding offset 14 as in above program to it, results in R0=pkt(id=0,off=14,r=0) before the packet range test. 
Now, testing this against R3=pkt_end with 'if r0 > r3 goto out' will transform R0 into R0=pkt(id=0,off=14,r=14) for the case when we're within bounds. A write into the packet at offset *(u32 *)(r0 -4), that is, 2 + 14 -4, is valid and aligned (2 is for NET_IP_ALIGN). After processing this with all fall-through paths, we later on check paths from branches. When the above skb->mark test is true, then we jump near the end of the program, perform r0 += 1, and jump back to the 'if r0 > r3 goto out' test we've visited earlier already. This time, R0 is of type R0=pkt(id=0,off=15,r=0), and we'll prune that part because this time we'll have a larger safe packet range, and we already found that with off=14 all further insn were already safe, so it's safe as well with a larger off. However, the problem is that the subsequent write into the packet with 2 + 15 -4 is then unaligned, and not caught by the alignment tracking. Note that min_align, aux_off, and aux_off_align were all 0 in this example. Since we cannot tell at this time what kind of packet access was performed in the prior walk and what minimal requirements it has (we might do so in the future, but that requires more complexity), fix it to disable this pruning case for strict alignment for now, and let the verifier do check such paths instead. With that applied, the test cases pass and reject the program due to misalignment. Fixes: d1174416747d ("bpf: Track alignment of register values in the verifier.") Reference: http://patchwork.ozlabs.org/patch/761909/ Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c72cd41f5b8b..e37e06b1229d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -843,9 +843,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, { bool strict = env->strict_alignment; - if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - strict = true; - switch (reg->type) { case PTR_TO_PACKET: return check_pkt_ptr_alignment(reg, off, size, strict); @@ -2696,7 +2693,8 @@ static int check_cfg(struct bpf_verifier_env *env) /* the following conditions reduce the number of explored insns * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet */ -static bool compare_ptrs_to_packet(struct bpf_reg_state *old, +static bool compare_ptrs_to_packet(struct bpf_verifier_env *env, + struct bpf_reg_state *old, struct bpf_reg_state *cur) { if (old->id != cur->id) @@ -2739,7 +2737,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old, * 'if (R4 > data_end)' and all further insn were already good with r=20, * so they will be good with r=30 and we can prune the search. 
*/ - if (old->off <= cur->off && + if (!env->strict_alignment && old->off <= cur->off && old->off >= old->range && cur->off >= cur->range) return true; @@ -2810,7 +2808,7 @@ static bool states_equal(struct bpf_verifier_env *env, continue; if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && - compare_ptrs_to_packet(rold, rcur)) + compare_ptrs_to_packet(env, rold, rcur)) continue; return false; @@ -3588,10 +3586,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) } else { log_level = 0; } - if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT) + + env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - else - env->strict_alignment = false; ret = replace_map_fd_with_map_ptr(env); if (ret < 0) @@ -3697,7 +3695,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, mutex_lock(&bpf_verifier_lock); log_level = 0; + env->strict_alignment = false; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + env->strict_alignment = true; env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), From a9789ef9afcb4fb0193f8dd94f2665ba3ad71e79 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:06 +0200 Subject: [PATCH 137/153] bpf: properly reset caller saved regs after helper call and ld_abs/ind Currently, after performing helper calls, we clear all caller saved registers, that is r0 - r5 and fill r0 depending on struct bpf_func_proto specification. The way we reset these regs can affect pruning decisions in later paths, since we only reset register's imm to 0 and type to NOT_INIT. However, we leave out clearing of other variables such as id, min_value, max_value, etc, which can later on lead to pruning mismatches due to stale data. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e37e06b1229d..339c8a1371de 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -463,19 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; +static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + + memset(&regs[regno], 0, sizeof(regs[regno])); + regs[regno].type = NOT_INIT; + regs[regno].min_value = BPF_REGISTER_MIN_RANGE; + regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +} + static void init_reg_state(struct bpf_reg_state *regs) { int i; - for (i = 0; i < MAX_BPF_REG; i++) { - regs[i].type = NOT_INIT; - regs[i].imm = 0; - regs[i].min_value = BPF_REGISTER_MIN_RANGE; - regs[i].max_value = BPF_REGISTER_MAX_RANGE; - regs[i].min_align = 0; - regs[i].aux_off = 0; - regs[i].aux_off_align = 0; - } + for (i = 0; i < MAX_BPF_REG; i++) + mark_reg_not_init(regs, i); /* frame pointer */ regs[BPF_REG_FP].type = FRAME_PTR; @@ -1346,7 +1349,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs = state->regs; - struct bpf_reg_state *reg; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1413,11 +1415,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } /* reset caller saved regs */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* update return register */ if (fn->ret_type == RET_INTEGER) { @@ -2445,7 +2444,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct
bpf_reg_state *regs = env->cur_state.regs; u8 mode = BPF_MODE(insn->code); - struct bpf_reg_state *reg; int i, err; if (!may_access_skb(env->prog->type)) { @@ -2478,11 +2476,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* reset caller saved regs to unreadable */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* mark destination R0 register as readable, since it contains * the value fetched from the packet From 41703a731066fde79c3e5ccf3391cf77a98aeda5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:07 +0200 Subject: [PATCH 138/153] bpf: add bpf_clone_redirect to bpf_helper_changes_pkt_data The bpf_clone_redirect() still needs to be listed in bpf_helper_changes_pkt_data() since we call into bpf_try_make_head_writable() from there, thus we need to invalidate prior pkt regs as well. Fixes: 36bbef52c7eb ("bpf: direct packet write and access for helpers for clsact progs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/filter.c b/net/core/filter.c index a253a6197e6b..a6bb95fa87b2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2281,6 +2281,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_change_head || func == bpf_skb_change_tail || func == bpf_skb_pull_data || + func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head) From a316338cb71a3260201490e615f2f6d5c0d8fb2c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:08 +0200 Subject: [PATCH 139/153] bpf: fix wrong exposure of map_flags into fdinfo for lpm trie_alloc() always needs to have BPF_F_NO_PREALLOC passed in via attr->map_flags, since it does not support preallocation yet. We check the flag, but we never copy the flag into trie->map.map_flags, which is later on exposed into fdinfo and used by loaders such as iproute2. Latter uses this in bpf_map_selfcheck_pinned() to test whether a pinned map has the same spec as the one from the BPF obj file and if not, bails out, which is currently the case for lpm since it exposes always 0 as flags. Also copy over flags in array_map_alloc() and stack_map_alloc(). They always have to be 0 right now, but we should make sure to not miss to copy them over at a later point in time when we add actual flags for them to use. Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Reported-by: Jarno Rajahalme Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/arraymap.c | 1 + kernel/bpf/lpm_trie.c | 1 + kernel/bpf/stackmap.c | 1 + 3 files changed, 3 insertions(+) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5e00b2333c26..172dc8ee0e3b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -86,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; + array->map.map_flags = attr->map_flags; array->elem_size = elem_size; if (!percpu) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 39cfafd895b8..b09185f0f17d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -432,6 +432,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) trie->map.key_size = attr->key_size; trie->map.value_size = attr->value_size; trie->map.max_entries = attr->max_entries; + trie->map.map_flags = attr->map_flags; trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4dfd6f2ec2f9..31147d730abf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->map.key_size = attr->key_size; smap->map.value_size = value_size; smap->map.max_entries = attr->max_entries; + smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; From 614d0d77b49a9b131e58b77473698ab5b2c525b7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:09 +0200 Subject: [PATCH 140/153] bpf: add various verifier test cases This patch adds various verifier test cases: 1) A test case for the pruning issue when tracking alignment is used. 2) Various PTR_TO_MAP_VALUE_OR_NULL tests to make sure pointer arithmetic turns such register into UNKNOWN_VALUE type. 
3) Test cases for the special treatment of LD_ABS/LD_IND to make sure verifier doesn't break calling convention here. Latter is needed, since f.e. arm64 JIT uses r1 - r5 for storing temporary data, so they really must be marked as NOT_INIT. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 10 + tools/include/linux/filter.h | 10 + tools/testing/selftests/bpf/test_verifier.c | 239 +++++++++++++++++++- 3 files changed, 255 insertions(+), 4 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 56197f82af45..62d948f80730 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -272,6 +272,16 @@ struct bpf_prog_aux; .off = OFF, \ .imm = IMM }) +/* Unconditional jumps, goto pc + off16 */ + +#define BPF_JMP_A(OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_JA, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = 0 }) + /* Function call */ #define BPF_EMIT_CALL(FUNC) \ diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index 390d7c9685fd..4ce25d43e8e3 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -208,6 +208,16 @@ .off = OFF, \ .imm = IMM }) +/* Unconditional jumps, goto pc + off16 */ + +#define BPF_JMP_A(OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_JA, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = 0 }) + /* Function call */ #define BPF_EMIT_CALL(FUNC) \ diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 3773562056da..cabb19b1e371 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -49,6 +49,7 @@ #define MAX_NR_MAPS 4 #define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0) +#define F_LOAD_WITH_STRICT_ALIGNMENT (1 << 1) struct bpf_test { const char *descr; @@ -2614,6 +2615,30 @@ static struct bpf_test tests[] = { .result = REJECT, .prog_type = 
BPF_PROG_TYPE_SCHED_CLS, }, + { + "direct packet access: test17 (pruning, alignment)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 14), + BPF_JMP_IMM(BPF_JGT, BPF_REG_7, 1, 4), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, -4), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), + BPF_JMP_A(-6), + }, + .errstr = "misaligned packet access off 2+15+-4 size 4", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, + }, { "helper access to packet: test1, valid packet_ptr range", .insns = { @@ -3340,6 +3365,70 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, + { + "alu ops on ptr_to_map_value_or_null, 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 2), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .errstr = "R4 invalid mem access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, + { + "alu ops on ptr_to_map_value_or_null, 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 
0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_4, -1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .errstr = "R4 invalid mem access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, + { + "alu ops on ptr_to_map_value_or_null, 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .errstr = "R4 invalid mem access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, { "invalid memory access with multiple map_lookup_elem calls", .insns = { @@ -4937,7 +5026,149 @@ static struct bpf_test tests[] = { .fixup_map_in_map = { 3 }, .errstr = "R1 type=map_value_or_null expected=map_ptr", .result = REJECT, - } + }, + { + "ld_abs: check calling conv, r1", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .errstr = "R1 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r2", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .errstr = "R2 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r3", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_EXIT_INSN(), + }, + .errstr = "R3 
!read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r4", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_4), + BPF_EXIT_INSN(), + }, + .errstr = "R4 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r5", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + BPF_EXIT_INSN(), + }, + .errstr = "R5 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r7", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_7, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_7), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "ld_ind: check calling conv, r1", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_LD_IND(BPF_W, BPF_REG_1, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .errstr = "R1 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r2", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_LD_IND(BPF_W, BPF_REG_2, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .errstr = "R2 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r3", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_LD_IND(BPF_W, BPF_REG_3, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_EXIT_INSN(), + }, + .errstr = "R3 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r4", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_4, 1), + BPF_LD_IND(BPF_W, BPF_REG_4, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_4), + BPF_EXIT_INSN(), + }, + .errstr = "R4 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r5", + .insns = { + 
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_5, 1), + BPF_LD_IND(BPF_W, BPF_REG_5, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + BPF_EXIT_INSN(), + }, + .errstr = "R5 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r7", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_7, 1), + BPF_LD_IND(BPF_W, BPF_REG_7, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_7), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) @@ -5059,9 +5290,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv, do_test_fixup(test, prog, map_fds); - fd_prog = bpf_load_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER, - prog, prog_len, "GPL", 0, bpf_vlog, - sizeof(bpf_vlog)); + fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER, + prog, prog_len, test->flags & F_LOAD_WITH_STRICT_ALIGNMENT, + "GPL", 0, bpf_vlog, sizeof(bpf_vlog)); expected_ret = unpriv && test->result_unpriv != UNDEF ? test->result_unpriv : test->result; From 791caeb084c57e3a4d648cf1ee799d1f70c0ef4e Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 24 May 2017 16:35:49 -0700 Subject: [PATCH 141/153] test_bpf: Add a couple of tests for BPF_JSGE. Some JITs can optimize comparisons with zero. Add a couple of BPF_JSGE tests against immediate zero. Signed-off-by: David Daney Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- lib/test_bpf.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 889bc31785be..be88cbaadde3 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -4504,6 +4504,44 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + { + "JMP_JSGE_K: Signed jump: value walk 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -3), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 6), + BPF_ALU64_IMM(BPF_ADD, R1, 1), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 4), + BPF_ALU64_IMM(BPF_ADD, R1, 1), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 2), + BPF_ALU64_IMM(BPF_ADD, R1, 1), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 1), + BPF_EXIT_INSN(), /* bad exit */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* good exit */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSGE_K: Signed jump: value walk 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -3), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 4), + BPF_ALU64_IMM(BPF_ADD, R1, 2), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 2), + BPF_ALU64_IMM(BPF_ADD, R1, 2), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 1), + BPF_EXIT_INSN(), /* bad exit */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* good exit */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGT | BPF_K */ { "JMP_JGT_K: if (3 > 2) return 1", From 797a93647a48d6cb8a20641a86a71713a947f786 Mon Sep 17 00:00:00 2001 From: Nithin Sujir Date: Wed, 24 May 2017 19:45:17 -0700 Subject: [PATCH 142/153] bonding: Don't update slave->link until ready to commit In the loadbalance arp monitoring scheme, when a slave link change is detected, the slave->link is immediately updated and slave_state_changed is set. Later down the function, the rtnl_lock is acquired and the changes are committed, updating the bond link state. However, the acquisition of the rtnl_lock can fail. The next time the monitor runs, since slave->link is already updated, it determines that link is unchanged. 
This results in the bond link state permanently out of sync with the slave link. This patch modifies bond_loadbalance_arp_mon() to handle link changes identical to bond_ab_arp_{inspect/commit}(). The new link state is maintained in slave->new_link until we're ready to commit at which point it's copied into slave->link. NOTE: miimon_{inspect/commit}() has a more complex state machine requiring the use of the bond_{propose,commit}_link_state() functions which maintains the intermediate state in slave->link_new_state. The arp monitors don't require that. Testing: This bug is very easy to reproduce with the following steps. 1. In a loop, toggle a slave link of a bond slave interface. 2. In a separate loop, do ifconfig up/down of an unrelated interface to create contention for rtnl_lock. Within a few iterations, the bond link goes out of sync with the slave link. Signed-off-by: Nithin Nayak Sujir Cc: Mahesh Bandewar Cc: Jay Vosburgh Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 73313318399c..2359478b977f 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2612,11 +2612,13 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) bond_for_each_slave_rcu(bond, slave, iter) { unsigned long trans_start = dev_trans_start(slave->dev); + slave->new_link = BOND_LINK_NOCHANGE; + if (slave->link != BOND_LINK_UP) { if (bond_time_in_interval(bond, trans_start, 1) && bond_time_in_interval(bond, slave->last_rx, 1)) { - slave->link = BOND_LINK_UP; + slave->new_link = BOND_LINK_UP; slave_state_changed = 1; /* primary_slave has no meaning in round-robin @@ -2643,7 +2645,7 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) if (!bond_time_in_interval(bond, trans_start, 2) || !bond_time_in_interval(bond, slave->last_rx, 2)) { - slave->link = 
BOND_LINK_DOWN; + slave->new_link = BOND_LINK_DOWN; slave_state_changed = 1; if (slave->link_failure_count < UINT_MAX) @@ -2674,6 +2676,11 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) if (!rtnl_trylock()) goto re_arm; + bond_for_each_slave(bond, slave, iter) { + if (slave->new_link != BOND_LINK_NOCHANGE) + slave->link = slave->new_link; + } + if (slave_state_changed) { bond_slave_state_change(bond); if (BOND_MODE(bond) == BOND_MODE_XOR) From 9bdcfb10f221e796c9619fe48655e0f1272f1d92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 May 2017 15:14:43 +0200 Subject: [PATCH 143/153] nvme-pci: consistencly use ctrl->device for logging This is what most of the code already does and gives much more useful prefixes than the device embedded in the pci_dev. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/pci.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4c2ff2bb26bc..bf8bec39c017 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -263,7 +263,7 @@ static void nvme_dbbuf_set(struct nvme_dev *dev) c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr); if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) { - dev_warn(dev->dev, "unable to set dbbuf\n"); + dev_warn(dev->ctrl.device, "unable to set dbbuf\n"); /* Free memory and continue on */ nvme_dbbuf_dma_free(dev); } @@ -1394,11 +1394,11 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, &pci_status); if (result == PCIBIOS_SUCCESSFUL) - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", csts, pci_status); else - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", csts, result); } @@ -1740,8 +1740,8 @@ static int nvme_pci_enable(struct nvme_dev *dev) 
*/ if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) { dev->q_depth = 2; - dev_warn(dev->dev, "detected Apple NVMe controller, set " - "queue depth=%u to work around controller resets\n", + dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " + "set queue depth=%u to work around controller resets\n", dev->q_depth); } @@ -1759,7 +1759,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) if (dev->cmbsz) { if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, &dev_attr_cmb.attr, NULL)) - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "failed to add sysfs attribute for CMB\n"); } } From d3d5b87ddde09bade512526f6df90e8c06c28230 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 May 2017 15:14:44 +0200 Subject: [PATCH 144/153] nvme: replace is_flags field in nvme_ctrl_ops with a flags field So that we can have more flags for transport-specific behavior. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/nvme.h | 3 ++- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/target/loop.c | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 04e115834702..228f7c73e2f1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1605,7 +1605,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); - if (ctrl->ops->is_fabrics) { + if (ctrl->ops->flags & NVME_F_FABRICS) { ctrl->icdoff = le16_to_cpu(id->icdoff); ctrl->ioccsz = le32_to_cpu(id->ioccsz); ctrl->iorcsz = le32_to_cpu(id->iorcsz); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 14a009e43aa5..5b14cbefb724 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2647,7 +2647,7 @@ nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl) static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .name = "fc", .module = THIS_MODULE, - .is_fabrics = true, + 
.flags = NVME_F_FABRICS, .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 29c708ca9621..7c4b0f6636c5 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -208,7 +208,8 @@ struct nvme_ns { struct nvme_ctrl_ops { const char *name; struct module *module; - bool is_fabrics; + unsigned int flags; +#define NVME_F_FABRICS (1 << 0) int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index e2c18f3d9dcf..28bd255c144d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1792,7 +1792,7 @@ static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl) static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .name = "rdma", .module = THIS_MODULE, - .is_fabrics = true, + .flags = NVME_F_FABRICS, .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index feb497134aee..e503cfff0337 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -558,7 +558,7 @@ static int nvme_loop_reset_ctrl(struct nvme_ctrl *nctrl) static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .name = "loop", .module = THIS_MODULE, - .is_fabrics = true, + .flags = NVME_F_FABRICS, .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, From c81bfba9983fc44210d3eb5971e0faac597bf50d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 May 2017 15:14:45 +0200 Subject: [PATCH 145/153] nvme: only setup block integrity if supported by the driver Currently only the PCIe driver supports metadata, so we should not claim integrity support for the other drivers. 
This prevents nasty crashes with targets that advertise metadata support on fabrics. Also use the opportunity to factor out some code into a separate helper that isn't even compiled if CONFIG_BLK_DEV_INTEGRITY is disabled. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/core.c | 50 +++++++++++++++++++++++++--------------- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 1 + 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 228f7c73e2f1..a60926410438 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -925,6 +925,29 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) } #ifdef CONFIG_BLK_DEV_INTEGRITY +static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, + u16 bs) +{ + struct nvme_ns *ns = disk->private_data; + u16 old_ms = ns->ms; + u8 pi_type = 0; + + ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + + /* PI implementation requires metadata equal t10 pi tuple size */ + if (ns->ms == sizeof(struct t10_pi_tuple)) + pi_type = id->dps & NVME_NS_DPS_PI_MASK; + + if (blk_get_integrity(disk) && + (ns->pi_type != pi_type || ns->ms != old_ms || + bs != queue_logical_block_size(disk->queue) || + (ns->ms && ns->ext))) + blk_integrity_unregister(disk); + + ns->pi_type = pi_type; +} + static void nvme_init_integrity(struct nvme_ns *ns) { struct blk_integrity integrity; @@ -951,6 +974,10 @@ static void nvme_init_integrity(struct nvme_ns *ns) blk_queue_max_integrity_segments(ns->queue, 1); } #else +static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, + u16 bs) +{ +} static void nvme_init_integrity(struct nvme_ns *ns) { } @@ -997,37 +1024,22 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { 
struct nvme_ns *ns = disk->private_data; - u8 lbaf, pi_type; - u16 old_ms; - unsigned short bs; - - old_ms = ns->ms; - lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; - ns->lba_shift = id->lbaf[lbaf].ds; - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); - ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + u16 bs; /* * If identify namespace failed, use default 512 byte block size so * block layer can use before failing read/write for 0 capacity. */ + ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; if (ns->lba_shift == 0) ns->lba_shift = 9; bs = 1 << ns->lba_shift; - /* XXX: PI implementation requires metadata equal t10 pi tuple size */ - pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? - id->dps & NVME_NS_DPS_PI_MASK : 0; blk_mq_freeze_queue(disk->queue); - if (blk_get_integrity(disk) && (ns->pi_type != pi_type || - ns->ms != old_ms || - bs != queue_logical_block_size(disk->queue) || - (ns->ms && ns->ext))) - blk_integrity_unregister(disk); - ns->pi_type = pi_type; + if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) + nvme_prep_integrity(disk, id, bs); blk_queue_logical_block_size(ns->queue, bs); - if (ns->ms && !blk_get_integrity(disk) && !ns->ext) nvme_init_integrity(ns); if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7c4b0f6636c5..9d6a070d4391 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -210,6 +210,7 @@ struct nvme_ctrl_ops { struct module *module; unsigned int flags; #define NVME_F_FABRICS (1 << 0) +#define NVME_F_METADATA_SUPPORTED (1 << 1) int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bf8bec39c017..6103b178e43a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2047,6 +2047,7 @@ static int 
nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .name = "pcie", .module = THIS_MODULE, + .flags = NVME_F_METADATA_SUPPORTED, .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, From 50af47d04ca530544b27affffb0722f158e2bb9c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 24 May 2017 15:06:31 -0700 Subject: [PATCH 146/153] nvme: Quirk APST on Intel 600P/P3100 devices They have known firmware bugs. A fix is apparently in the works -- once fixed firmware is available, someone from Intel (Hi, Keith!) can adjust the quirk accordingly. Cc: stable@vger.kernel.org # v4.11 Cc: Kai-Heng Feng Cc: Mario Limonciello Signed-off-by: Andy Lutomirski Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6103b178e43a..d52701df7245 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2294,6 +2294,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0x0a54), .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, + { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ From a8ecdd7117ee68fe27009acc8021423870c1dcd7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 25 May 2017 16:38:06 -0700 Subject: [PATCH 147/153] blk-mq: Only register debugfs attributes for blk-mq queues The code in blk-mq-debugfs.c assumes that it is working on a blk-mq queue and is not intended to work on a blk-sq queue. Hence only register blk-mq debugfs attributes for blk-mq queues. 
Fixes: commit 9c1051aacde8 ("blk-mq: untangle debugfs and sysfs") Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Ming Lei Reviewed-by: Omar Sandoval Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 504fee940052..712b018e9f54 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -887,10 +887,10 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } - if (q->mq_ops) + if (q->mq_ops) { __blk_mq_register_dev(dev, q); - - blk_mq_debugfs_register(q); + blk_mq_debugfs_register(q); + } kobject_uevent(&q->kobj, KOBJ_ADD); From 83b4605b0c16cde5b00c8cf192408d51eab75402 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 May 2017 18:59:54 +0200 Subject: [PATCH 148/153] PCI/msi: fix the pci_alloc_irq_vectors_affinity stub We need to return an error for any call that asks for MSI / MSI-X vectors only, so that non-trivial fallback logic can work properly. Also validate dev->irq and use the "correct" errno value based on feedback from Linus.
Signed-off-by: Christoph Hellwig Reported-by: Steven Rostedt Fixes: aff17164 ("PCI: Provide sensible IRQ vector alloc/free routines") Signed-off-by: Linus Torvalds --- include/linux/pci.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/pci.h b/include/linux/pci.h index 33c2b0b77429..fc2e832d7b9c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1342,9 +1342,9 @@ pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, unsigned int max_vecs, unsigned int flags, const struct irq_affinity *aff_desc) { - if (min_vecs > 1) - return -EINVAL; - return 1; + if ((flags & PCI_IRQ_LEGACY) && min_vecs == 1 && dev->irq) + return 1; + return -ENOSPC; } static inline void pci_free_irq_vectors(struct pci_dev *dev) From 0908cf4dfef35fc6ac12329007052ebe93ff1081 Mon Sep 17 00:00:00 2001 From: linzhang Date: Thu, 25 May 2017 14:07:18 +0800 Subject: [PATCH 149/153] net: llc: add lock_sock in llc_ui_bind to avoid a race condition There is a race condition in llc_ui_bind if two or more processes/threads try to bind the same socket. If more than one of them succeeds in binding the same socket, that leads to two problems: first, this is not the expected behavior; second, it can leave the kernel in an unstable state or cause an oops (in my simple test case, llc2.ko could no longer be unloaded). The current code tests the SOCK_ZAPPED bit to prevent a process from binding the same socket twice, but that cannot prevent several processes/threads from binding the same socket at the same time. So, add lock_sock in llc_ui_bind like others, such as llc_ui_connect. Signed-off-by: Lin Zhang Signed-off-by: David S.
Miller --- net/llc/af_llc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 8364fe5b59e4..c38d16f22d2a 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -311,6 +311,8 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) int rc = -EINVAL; dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); + + lock_sock(sk); if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; @@ -382,6 +384,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) out_put: llc_sap_put(sap); out: + release_sock(sk); return rc; } From 804ec7ebe8ea003999ca8d1bfc499edc6a9e07df Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 25 May 2017 19:14:56 +0200 Subject: [PATCH 150/153] sctp: fix ICMP processing if skb is non-linear sometimes ICMP replies to INIT chunks are ignored by the client, even if the encapsulated SCTP headers match an open socket. This happens when the ICMP packet is carried by a paged skb: use skb_header_pointer() to read packet contents beyond the SCTP header, so that chunk header and initiate tag are validated correctly. v2: - don't use skb_header_pointer() to read the transport header, since icmp_socket_deliver() already puts these 8 bytes in the linear area. - change commit message to make specific reference to INIT chunks. Signed-off-by: Davide Caratti Acked-by: Marcelo Ricardo Leitner Acked-by: Vlad Yasevich Reviewed-by: Xin Long Signed-off-by: David S. 
Miller --- net/sctp/input.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/sctp/input.c b/net/sctp/input.c index 0e06a278d2a9..ba9ad32fc447 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -473,15 +473,14 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, struct sctp_association **app, struct sctp_transport **tpp) { + struct sctp_init_chunk *chunkhdr, _chunkhdr; union sctp_addr saddr; union sctp_addr daddr; struct sctp_af *af; struct sock *sk = NULL; struct sctp_association *asoc; struct sctp_transport *transport = NULL; - struct sctp_init_chunk *chunkhdr; __u32 vtag = ntohl(sctphdr->vtag); - int len = skb->len - ((void *)sctphdr - (void *)skb->data); *app = NULL; *tpp = NULL; @@ -516,13 +515,16 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, * discard the packet. */ if (vtag == 0) { - chunkhdr = (void *)sctphdr + sizeof(struct sctphdr); - if (len < sizeof(struct sctphdr) + sizeof(sctp_chunkhdr_t) - + sizeof(__be32) || + /* chunk header + first 4 octects of init header */ + chunkhdr = skb_header_pointer(skb, skb_transport_offset(skb) + + sizeof(struct sctphdr), + sizeof(struct sctp_chunkhdr) + + sizeof(__be32), &_chunkhdr); + if (!chunkhdr || chunkhdr->chunk_hdr.type != SCTP_CID_INIT || - ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) { + ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) goto out; - } + } else if (vtag != asoc->c.peer_vtag) { goto out; } From 0e9a709560dbcfbace8bf4019dc5298619235891 Mon Sep 17 00:00:00 2001 From: Peter Dawson Date: Fri, 26 May 2017 06:35:18 +1000 Subject: [PATCH 151/153] ip6_tunnel, ip6_gre: fix setting of DSCP on encapsulated packets This fix addresses two problems in the way the DSCP field is formulated on the encapsulating header of IPv6 tunnels. 
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=195661 1) The IPv6 tunneling code was manipulating the DSCP field of the encapsulating packet using the 32b flowlabel. Since the flowlabel is only the lower 20b it was incorrect to assume that the upper 12b containing the DSCP and ECN fields would remain intact when formulating the encapsulating header. This fix handles the 'inherit' and 'fixed-value' DSCP cases explicitly using the extant dsfield u8 variable. 2) The use of INET_ECN_encapsulate(0, dsfield) in ip6_tnl_xmit was incorrect and resulted in the DSCP value always being set to 0. Commit 90427ef5d2a4 ("ipv6: fix flow labels when the traffic class is non-0") caused the regression by masking out the flowlabel which exposed the incorrect handling of the DSCP portion of the flowlabel in ip6_tunnel and ip6_gre. Fixes: 90427ef5d2a4 ("ipv6: fix flow labels when the traffic class is non-0") Signed-off-by: Peter Dawson Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 13 +++++++------ net/ipv6/ip6_tunnel.c | 21 +++++++++++++-------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 8d128ba79b66..0c5b4caa1949 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -537,11 +537,10 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - dsfield = ipv4_get_dsfield(iph); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) - & IPV6_TCLASS_MASK; + dsfield = ipv4_get_dsfield(iph); + else + dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else @@ -598,9 +597,11 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= (*(__be32 *) ipv6h & 
IPV6_TCLASS_MASK); + dsfield = ipv6_get_dsfield(ipv6h); + else + dsfield = ip6_tclass(t->parms.flowinfo); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 6eb2ae507500..7ae6c503f1ca 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1196,7 +1196,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_flow_hdr(ipv6h, dsfield, ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; @@ -1231,8 +1231,6 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (tproto != IPPROTO_IPIP && tproto != 0) return -1; - dsfield = ipv4_get_dsfield(iph); - if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; @@ -1246,6 +1244,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowi6_proto = IPPROTO_IPIP; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; + dsfield = ip6_tclass(key->label); } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; @@ -1254,8 +1253,9 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowi6_proto = IPPROTO_IPIP; if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) - & IPV6_TCLASS_MASK; + dsfield = ipv4_get_dsfield(iph); + else + dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else @@ -1267,6 +1267,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; + dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph)); + 
skb_set_inner_ipproto(skb, IPPROTO_IPIP); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, @@ -1300,8 +1302,6 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) ip6_tnl_addr_conflict(t, ipv6h)) return -1; - dsfield = ipv6_get_dsfield(ipv6h); - if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; @@ -1315,6 +1315,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowi6_proto = IPPROTO_IPV6; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; + dsfield = ip6_tclass(key->label); } else { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ @@ -1337,7 +1338,9 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowi6_proto = IPPROTO_IPV6; if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK); + dsfield = ipv6_get_dsfield(ipv6h); + else + dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) @@ -1351,6 +1354,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; + dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h)); + skb_set_inner_ipproto(skb, IPPROTO_IPV6); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, From 82533ad9a1ce3a7a6863849a552c2cc041b55e0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 25 May 2017 22:54:53 +0200 Subject: [PATCH 152/153] net: ethernet: ax88796: don't call free_irq without request_irq first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function ax_init_dev (which is called only from the driver's .probe function) calls free_irq in the error path without having requested the irq in the first place. 
So drop the free_irq call in the error path. Fixes: 825a2ff1896e ("AX88796 network driver") Signed-off-by: Uwe Kleine-König Signed-off-by: David S. Miller --- drivers/net/ethernet/8390/ax88796.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c index b0a3b85fc6f8..db02bc2fb4b2 100644 --- a/drivers/net/ethernet/8390/ax88796.c +++ b/drivers/net/ethernet/8390/ax88796.c @@ -748,13 +748,13 @@ static int ax_init_dev(struct net_device *dev) ret = ax_mii_init(dev); if (ret) - goto out_irq; + goto err_out; ax_NS8390_init(dev, 0); ret = register_netdev(dev); if (ret) - goto out_irq; + goto err_out; netdev_info(dev, "%dbit, irq %d, %lx, MAC: %pM\n", ei_local->word16 ? 16 : 8, dev->irq, dev->base_addr, @@ -762,9 +762,6 @@ static int ax_init_dev(struct net_device *dev) return 0; - out_irq: - /* cleanup irq */ - free_irq(dev->irq, dev); err_out: return ret; } From 3fb07daff8e99243366a081e5129560734de4ada Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 May 2017 14:27:35 -0700 Subject: [PATCH 153/153] ipv4: add reference counting to metrics Andrey Konovalov reported crashes in ipv4_mtu() I could reproduce the issue with KASAN kernels, between 10.246.7.151 and 10.246.7.152 : 1) 20 concurrent netperf -t TCP_RR -H 10.246.7.152 -l 1000 & 2) At the same time run following loop : while : do ip ro add 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 ip ro del 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 done Cong Wang attempted to add back rt->fi in commit 82486aa6f1b9 ("ipv4: restore rt->fi for reference counting") but this proved to add some issues that were complex to solve. Instead, I suggested to add a refcount to the metrics themselves, being a standalone object (in particular, no reference to other objects) I tried to make this patch as small as possible to ease its backport, instead of being super clean. 
Note that we believe that only ipv4 dst need to take care of the metric refcount. But if this is wrong, this patch adds the basic infrastructure to extend this to other families. Many thanks to Julian Anastasov for reviewing this patch, and Cong Wang for his efforts on this problem. Fixes: 2860583fe840 ("ipv4: Kill rt->fi") Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Reviewed-by: Julian Anastasov Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/net/dst.h | 8 +++++++- include/net/ip_fib.h | 10 +++++----- net/core/dst.c | 23 ++++++++++++++--------- net/ipv4/fib_semantics.c | 17 ++++++++++------- net/ipv4/route.c | 10 +++++++++- 5 files changed, 45 insertions(+), 23 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 049af33da3b6..cfc043784166 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -107,10 +107,16 @@ struct dst_entry { }; }; +struct dst_metrics { + u32 metrics[RTAX_MAX]; + atomic_t refcnt; +}; +extern const struct dst_metrics dst_default_metrics; + u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); -extern const u32 dst_default_metrics[]; #define DST_METRICS_READ_ONLY 0x1UL +#define DST_METRICS_REFCOUNTED 0x2UL #define DST_METRICS_FLAGS 0x3UL #define __DST_METRICS_PTR(Y) \ ((u32 *)((Y) & ~DST_METRICS_FLAGS)) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 6692c5758b33..f7f6aa789c61 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -114,11 +114,11 @@ struct fib_info { __be32 fib_prefsrc; u32 fib_tb_id; u32 fib_priority; - u32 *fib_metrics; -#define fib_mtu fib_metrics[RTAX_MTU-1] -#define fib_window fib_metrics[RTAX_WINDOW-1] -#define fib_rtt fib_metrics[RTAX_RTT-1] -#define fib_advmss fib_metrics[RTAX_ADVMSS-1] + struct dst_metrics *fib_metrics; +#define fib_mtu fib_metrics->metrics[RTAX_MTU-1] +#define fib_window fib_metrics->metrics[RTAX_WINDOW-1] +#define fib_rtt fib_metrics->metrics[RTAX_RTT-1] +#define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] 
int fib_nhs; #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_weight; diff --git a/net/core/dst.c b/net/core/dst.c index 960e503b5a52..6192f11beec9 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -151,13 +151,13 @@ int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dst_discard_out); -const u32 dst_default_metrics[RTAX_MAX + 1] = { +const struct dst_metrics dst_default_metrics = { /* This initializer is needed to force linker to place this variable * into const section. Otherwise it might end into bss section. * We really want to avoid false sharing on this variable, and catch * any writes on it. */ - [RTAX_MAX] = 0xdeadbeef, + .refcnt = ATOMIC_INIT(1), }; void dst_init(struct dst_entry *dst, struct dst_ops *ops, @@ -169,7 +169,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, if (dev) dev_hold(dev); dst->ops = ops; - dst_init_metrics(dst, dst_default_metrics, true); + dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; dst->path = dst; dst->from = NULL; @@ -314,25 +314,30 @@ EXPORT_SYMBOL(dst_release); u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { - u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); + struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); if (p) { - u32 *old_p = __DST_METRICS_PTR(old); + struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + atomic_set(&p->refcnt, 1); + memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { kfree(p); - p = __DST_METRICS_PTR(prev); + p = (struct dst_metrics *)__DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; + } else if (prev & DST_METRICS_REFCOUNTED) { + if (atomic_dec_and_test(&old_p->refcnt)) + kfree(old_p); } } - return p; + BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); + return (u32 *)p; } 
EXPORT_SYMBOL(dst_cow_metrics_generic); @@ -341,7 +346,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) { unsigned long prev, new; - new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; + new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; prev = cmpxchg(&dst->_metrics, old, new); if (prev == old) kfree(__DST_METRICS_PTR(old)); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index da449ddb8cc1..ad9ad4aab5da 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -203,6 +203,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); + struct dst_metrics *m; change_nexthops(fi) { if (nexthop_nh->nh_dev) @@ -213,8 +214,9 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - if (fi->fib_metrics != (u32 *) dst_default_metrics) - kfree(fi->fib_metrics); + m = fi->fib_metrics; + if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) + kfree(m); kfree(fi); } @@ -971,11 +973,11 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) return -EINVAL; - fi->fib_metrics[type - 1] = val; + fi->fib_metrics->metrics[type - 1] = val; } if (ecn_ca) - fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; return 0; } @@ -1033,11 +1035,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; fib_info_cnt++; if (cfg->fc_mx) { - fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); if (!fi->fib_metrics) goto failure; + atomic_set(&fi->fib_metrics->refcnt, 1); } else - fi->fib_metrics = (u32 *) dst_default_metrics; + 
fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; @@ -1238,7 +1241,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) goto nla_put_failure; - if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) + if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) goto nla_put_failure; if (fi->fib_prefsrc && diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 655d9eebe43e..6883b3d4ba8f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1385,8 +1385,12 @@ static void rt_add_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *) dst; + if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt)) + kfree(p); + if (!list_empty(&rt->rt_uncached)) { struct uncached_list *ul = rt->rt_uncached_list; @@ -1438,7 +1442,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; rt->rt_uses_gateway = 1; } - dst_init_metrics(&rt->dst, fi->fib_metrics, true); + dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); + if (fi->fib_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + atomic_inc(&fi->fib_metrics->refcnt); + } #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif