From 07a63cbe8bcb6ba72fb989dcab1ec55ec6c36c7e Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Tue, 2 May 2017 13:36:00 +0200
Subject: [PATCH 01/74] s390/cputime: fix incorrect system time

git commit c5328901aa1db134 "[S390] entry[64].S improvements" removed
the update of the exit_timer lowcore field from the critical section
cleanup of the .Lsysc_restore/.Lsysc_done and .Lio_restore/.Lio_done
blocks. If the PSW is updated by the critical section cleanup to point
to user space again, the interrupt entry code will do a vtime
calculation after the cleanup completed with an exit_timer value which
has *not* been updated. As a result, incorrect system time deltas are
calculated.

If an interrupt occurred with an old PSW between
.Lsysc_restore/.Lsysc_done or .Lio_restore/.Lio_done, update
__LC_EXIT_TIMER with the system entry time of the interrupt.

Cc: stable@vger.kernel.org # 3.3+
Tested-by: Christian Borntraeger
Signed-off-by: Martin Schwidefsky
---
 arch/s390/kernel/entry.S | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index a5f5d3bb3dbc..e408d9cc5b96 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -312,6 +312,7 @@ ENTRY(system_call)
 	lg	%r14,__LC_VDSO_PER_CPU
 	lmg	%r0,%r10,__PT_R0(%r11)
 	mvc	__LC_RETURN_PSW(16),__PT_PSW(%r11)
+.Lsysc_exit_timer:
 	stpt	__LC_EXIT_TIMER
 	mvc	__VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
 	lmg	%r11,%r15,__PT_R11(%r11)
@@ -623,6 +624,7 @@ ENTRY(io_int_handler)
 	lg	%r14,__LC_VDSO_PER_CPU
 	lmg	%r0,%r10,__PT_R0(%r11)
 	mvc	__LC_RETURN_PSW(16),__PT_PSW(%r11)
+.Lio_exit_timer:
 	stpt	__LC_EXIT_TIMER
 	mvc	__VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
 	lmg	%r11,%r15,__PT_R11(%r11)
@@ -1174,15 +1176,23 @@ cleanup_critical:
 	br	%r14

 .Lcleanup_sysc_restore:
+	# check if stpt has been executed
 	clg	%r9,BASED(.Lcleanup_sysc_restore_insn)
+	jh	0f
+	mvc	__LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
+	cghi	%r11,__LC_SAVE_AREA_ASYNC
 	je	0f
+	mvc	__LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER
+0:	clg	%r9,BASED(.Lcleanup_sysc_restore_insn+8)
+	je	1f
 	lg	%r9,24(%r11)		# get saved pointer to pt_regs
 	mvc	__LC_RETURN_PSW(16),__PT_PSW(%r9)
 	mvc	0(64,%r11),__PT_R8(%r9)
 	lmg	%r0,%r7,__PT_R0(%r9)
-0:	lmg	%r8,%r9,__LC_RETURN_PSW
+1:	lmg	%r8,%r9,__LC_RETURN_PSW
 	br	%r14
 .Lcleanup_sysc_restore_insn:
+	.quad	.Lsysc_exit_timer
 	.quad	.Lsysc_done - 4

 .Lcleanup_io_tif:
@@ -1190,15 +1200,20 @@ cleanup_critical:
 	br	%r14

 .Lcleanup_io_restore:
+	# check if stpt has been executed
 	clg	%r9,BASED(.Lcleanup_io_restore_insn)
-	je	0f
+	jh	0f
+	mvc	__LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER
+0:	clg	%r9,BASED(.Lcleanup_io_restore_insn+8)
+	je	1f
 	lg	%r9,24(%r11)		# get saved r11 pointer to pt_regs
 	mvc	__LC_RETURN_PSW(16),__PT_PSW(%r9)
 	mvc	0(64,%r11),__PT_R8(%r9)
 	lmg	%r0,%r7,__PT_R0(%r9)
-0:	lmg	%r8,%r9,__LC_RETURN_PSW
+1:	lmg	%r8,%r9,__LC_RETURN_PSW
 	br	%r14
 .Lcleanup_io_restore_insn:
+	.quad	.Lio_exit_timer
 	.quad	.Lio_done - 4

 .Lcleanup_idle:

From 085b6ba0f7971d18fc3078b25e8309e9e75659cb Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Tue, 2 May 2017 12:38:57 +0200
Subject: [PATCH 02/74] s390/ftrace: fix compile for !MODULES

Fix this compile error if CONFIG_MODULES is disabled:

arch/s390/built-in.o: In function `ftrace_plt_init':
arch/s390/kernel/ftrace.o:(.init.text+0x34cc): undefined reference to `module_alloc'

Reported-by: Rob Landley
Signed-off-by: Heiko Carstens
Signed-off-by: Martin Schwidefsky
---
 arch/s390/kernel/ftrace.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/s390/kernel/ftrace.c
b/arch/s390/kernel/ftrace.c index 60a8a4e207ed..68f2b8a15eab 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -172,6 +172,8 @@ int __init ftrace_dyn_arch_init(void) return 0; } +#ifdef CONFIG_MODULES + static int __init ftrace_plt_init(void) { unsigned int *ip; @@ -190,6 +192,8 @@ static int __init ftrace_plt_init(void) } device_initcall(ftrace_plt_init); +#endif /* CONFIG_MODULES */ + #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* * Hook the return address and push it in the stack of return addresses From db55947dd2d09cd3e6f722d1205934fec793ee63 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 2 May 2017 13:20:11 +0200 Subject: [PATCH 03/74] s390/uprobes: fix compile for !KPROBES Fix the following compile error(s) if CONFIG_KPROBES is disabled: arch/s390/kernel/uprobes.c:79:14: error: implicit declaration of function 'probe_get_fixup_type' arch/s390/kernel/uprobes.c:87:14: error: 'FIXUP_PSW_NORMAL' undeclared (first use in this function) Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/dis.h | 2 ++ arch/s390/include/asm/kprobes.h | 20 ++++++++++---------- arch/s390/lib/probes.c | 1 + 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/dis.h b/arch/s390/include/asm/dis.h index 60323c21938b..37f617dfbede 100644 --- a/arch/s390/include/asm/dis.h +++ b/arch/s390/include/asm/dis.h @@ -40,6 +40,8 @@ static inline int insn_length(unsigned char code) return ((((int) code + 64) >> 7) + 1) << 1; } +struct pt_regs; + void show_code(struct pt_regs *regs); void print_fn_code(unsigned char *code, unsigned long len); int insn_to_mnemonic(unsigned char *instruction, char *buf, unsigned int len); diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index 1293c4066cfc..28792ef82c83 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -27,12 +27,21 @@ * 2005-Dec Used as a template for s390 by Mike Grundy * */ +#include #include #define BREAKPOINT_INSTRUCTION 0x0002 +#define FIXUP_PSW_NORMAL 0x08 +#define FIXUP_BRANCH_NOT_TAKEN 0x04 +#define FIXUP_RETURN_REGISTER 0x02 +#define FIXUP_NOT_REQUIRED 0x01 + +int probe_is_prohibited_opcode(u16 *insn); +int probe_get_fixup_type(u16 *insn); +int probe_is_insn_relative_long(u16 *insn); + #ifdef CONFIG_KPROBES -#include #include #include #include @@ -56,11 +65,6 @@ typedef u16 kprobe_opcode_t; #define KPROBE_SWAP_INST 0x10 -#define FIXUP_PSW_NORMAL 0x08 -#define FIXUP_BRANCH_NOT_TAKEN 0x04 -#define FIXUP_RETURN_REGISTER 0x02 -#define FIXUP_NOT_REQUIRED 0x01 - /* Architecture specific copy of original instruction */ struct arch_specific_insn { /* copy of original instruction */ @@ -90,10 +94,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr); int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); -int probe_is_prohibited_opcode(u16 *insn); -int probe_get_fixup_type(u16 *insn); -int probe_is_insn_relative_long(u16 *insn); - #define flush_insn_slot(p) do { } while (0) #endif /* CONFIG_KPROBES */ diff --git a/arch/s390/lib/probes.c b/arch/s390/lib/probes.c index ae90e1ae3607..1963ddbf4ab3 100644 --- a/arch/s390/lib/probes.c +++ b/arch/s390/lib/probes.c @@ -4,6 +4,7 @@ * Copyright IBM Corp. 
2014 */ +#include #include #include From eb77e6b80f3bed262c7773236f0fb84649fd3091 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 27 Apr 2017 12:11:54 -0500 Subject: [PATCH 04/74] EDAC, amd64: Fix reporting of Chip Select sizes on Fam17h The wrong index into the csbases/csmasks arrays was being passed to the function to compute the chip select sizes, which resulted in the wrong size being computed. Address that so that the correct values are computed and printed. Also, redo how we calculate the number of pages in a CS row. Reported-by: Benjamin Bennett Signed-off-by: Yazen Ghannam Cc: # 4.10.x Cc: linux-edac Link: http://lkml.kernel.org/r/1493313114-11260-1-git-send-email-Yazen.Ghannam@amd.com [ Remove unneeded integer math comment, minor cleanups. ] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 40 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 82dab1692264..3aea55698165 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -782,24 +782,26 @@ static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan) static void debug_display_dimm_sizes_df(struct amd64_pvt *pvt, u8 ctrl) { - u32 *dcsb = ctrl ? pvt->csels[1].csbases : pvt->csels[0].csbases; - int dimm, size0, size1; + int dimm, size0, size1, cs0, cs1; edac_printk(KERN_DEBUG, EDAC_MC, "UMC%d chip selects:\n", ctrl); for (dimm = 0; dimm < 4; dimm++) { size0 = 0; + cs0 = dimm * 2; - if (dcsb[dimm*2] & DCSB_CS_ENABLE) - size0 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, dimm); + if (csrow_enabled(cs0, ctrl, pvt)) + size0 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, cs0); size1 = 0; - if (dcsb[dimm*2 + 1] & DCSB_CS_ENABLE) - size1 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, dimm); + cs1 = dimm * 2 + 1; + + if (csrow_enabled(cs1, ctrl, pvt)) + size1 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, cs1); amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n", - dimm * 2, size0, - dimm * 2 + 1, size1); + cs0, size0, + cs1, size1); } } @@ -2756,26 +2758,22 @@ static void read_mc_regs(struct amd64_pvt *pvt) * encompasses * */ -static u32 get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) +static u32 get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr_orig) { - u32 cs_mode, nr_pages; u32 dbam = dct ? pvt->dbam1 : pvt->dbam0; + int csrow_nr = csrow_nr_orig; + u32 cs_mode, nr_pages; + if (!pvt->umc) + csrow_nr >>= 1; - /* - * The math on this doesn't look right on the surface because x/2*4 can - * be simplified to x*2 but this expression makes use of the fact that - * it is integral math where 1/2=0. This intermediate value becomes the - * number of bits to shift the DBAM register to extract the proper CSROW - * field. - */ - cs_mode = DBAM_DIMM(csrow_nr / 2, dbam); + cs_mode = DBAM_DIMM(csrow_nr, dbam); - nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode, (csrow_nr / 2)) - << (20 - PAGE_SHIFT); + nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode, csrow_nr); + nr_pages <<= 20 - PAGE_SHIFT; edac_dbg(0, "csrow: %d, channel: %d, DBAM idx: %d\n", - csrow_nr, dct, cs_mode); + csrow_nr_orig, dct, cs_mode); edac_dbg(0, "nr_pages/channel: %u\n", nr_pages); return nr_pages; From 3d05f3aed5d721c2c77d20288c29ab26c6193ed5 Mon Sep 17 00:00:00 2001 From: Julia Cartwright Date: Fri, 28 Apr 2017 12:41:02 -0500 Subject: [PATCH 05/74] md/raid5: make use of spin_lock_irq over local_irq_disable + spin_lock On mainline, there is no functional difference, just less code, and symmetric lock/unlock paths. 
On PREEMPT_RT builds, this fixes the following warning, seen by Alexander GQ Gerasiov, due to the sleeping nature of spinlocks. BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:993 in_atomic(): 0, irqs_disabled(): 1, pid: 58, name: kworker/u12:1 CPU: 5 PID: 58 Comm: kworker/u12:1 Tainted: G W 4.9.20-rt16-stand6-686 #1 Hardware name: Supermicro SYS-5027R-WRF/X9SRW-F, BIOS 3.2a 10/28/2015 Workqueue: writeback wb_workfn (flush-253:0) Call Trace: dump_stack+0x47/0x68 ? migrate_enable+0x4a/0xf0 ___might_sleep+0x101/0x180 rt_spin_lock+0x17/0x40 add_stripe_bio+0x4e3/0x6c0 [raid456] ? preempt_count_add+0x42/0xb0 raid5_make_request+0x737/0xdd0 [raid456] Reported-by: Alexander GQ Gerasiov Tested-by: Alexander GQ Gerasiov Signed-off-by: Julia Cartwright Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2e38cfac5b1d..3809a2192132 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -103,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) { int i; - local_irq_disable(); - spin_lock(conf->hash_locks); + spin_lock_irq(conf->hash_locks); for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); spin_lock(&conf->device_lock); @@ -114,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) { int i; spin_unlock(&conf->device_lock); - for (i = NR_STRIPE_HASH_LOCKS; i; i--) - spin_unlock(conf->hash_locks + i - 1); - local_irq_enable(); + for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--) + spin_unlock(conf->hash_locks + i); + spin_unlock_irq(conf->hash_locks); } /* Find first data disk in a raid6 stripe */ @@ -714,12 +713,11 @@ static bool is_full_stripe_write(struct stripe_head *sh) static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) { - local_irq_disable(); if (sh1 > sh2) { - spin_lock(&sh2->stripe_lock); + spin_lock_irq(&sh2->stripe_lock); spin_lock_nested(&sh1->stripe_lock, 1); } else { - spin_lock(&sh1->stripe_lock); + spin_lock_irq(&sh1->stripe_lock); spin_lock_nested(&sh2->stripe_lock, 1); } } @@ -727,8 +725,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) { spin_unlock(&sh1->stripe_lock); - spin_unlock(&sh2->stripe_lock); - local_irq_enable(); + spin_unlock_irq(&sh2->stripe_lock); } /* Only freshly new full stripe normal write stripe can be added to a batch list */ From 2214c260c72b0bd94e6c1c19bf451686212025d3 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Mon, 8 May 2017 11:56:55 +0200 Subject: [PATCH 06/74] md: don't return -EAGAIN in md_allow_write for external metadata arrays This essentially reverts commit b5470dc5fc18 ("md: resolve external metadata handling deadlock in md_allow_write") with some adjustments. Since commit 6791875e2e53 ("md: make reconfig_mutex optional for writes to md sysfs files.") changing array_state to 'active' does not use mddev_lock() and will not cause a deadlock with md_allow_write(). This revert simplifies userspace tools that write to sysfs attributes like "stripe_cache_size" or "consistency_policy" because it removes the need for special handling for external metadata arrays, checking for EAGAIN and retrying the write. 
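[ Note: a minimal user-space sketch of the semantic shift described above,
with pthreads standing in for the kernel's wait_event() and a plain flag
standing in for MD_SB_CHANGE_PENDING. All names here are illustrative,
not from the patch. ]

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t sb_wait = PTHREAD_COND_INITIALIZER;
    static bool change_pending = true;  /* MD_SB_CHANGE_PENDING analog */

    /* Old behavior: report -EAGAIN and make every caller retry. */
    static int allow_write_old(void)
    {
            return change_pending ? -11 /* -EAGAIN */ : 0;
    }

    /* New behavior: block until another thread clears the flag and
     * signals the waiters, the way wait_event(mddev->sb_wait, ...)
     * does in the diff below. */
    static void allow_write_new(void)
    {
            pthread_mutex_lock(&lock);
            while (change_pending)
                    pthread_cond_wait(&sb_wait, &lock);
            pthread_mutex_unlock(&lock);
    }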
Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/md.c | 20 ++++++++------------ drivers/md/md.h | 2 +- drivers/md/raid1.c | 9 +++------ drivers/md/raid5.c | 12 +++--------- 4 files changed, 15 insertions(+), 28 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 82f798be964f..10367ffe92e3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8022,18 +8022,15 @@ EXPORT_SYMBOL(md_write_end); * may proceed without blocking. It is important to call this before * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. - * - * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock - * is dropped, so return -EAGAIN after notifying userspace. */ -int md_allow_write(struct mddev *mddev) +void md_allow_write(struct mddev *mddev) { if (!mddev->pers) - return 0; + return; if (mddev->ro) - return 0; + return; if (!mddev->pers->sync_request) - return 0; + return; spin_lock(&mddev->lock); if (mddev->in_sync) { @@ -8046,13 +8043,12 @@ int md_allow_write(struct mddev *mddev) spin_unlock(&mddev->lock); md_update_sb(mddev, 0); sysfs_notify_dirent_safe(mddev->sysfs_state); + /* wait for the dirty state to be recorded in the metadata */ + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) && + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else spin_unlock(&mddev->lock); - - if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) - return -EAGAIN; - else - return 0; } EXPORT_SYMBOL_GPL(md_allow_write); diff --git a/drivers/md/md.h b/drivers/md/md.h index 4e75d121bfcc..11f15146ce51 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -665,7 +665,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, bool metadata_op); extern void md_do_sync(struct md_thread *thread); extern void md_new_event(struct mddev *mddev); -extern int md_allow_write(struct mddev *mddev); +extern void md_allow_write(struct mddev *mddev); extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7ed59351fe97..7c1f73398800 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3197,7 +3197,7 @@ static int raid1_reshape(struct mddev *mddev) struct r1conf *conf = mddev->private; int cnt, raid_disks; unsigned long flags; - int d, d2, err; + int d, d2; /* Cannot change chunk_size, layout, or level */ if (mddev->chunk_sectors != mddev->new_chunk_sectors || @@ -3209,11 +3209,8 @@ static int raid1_reshape(struct mddev *mddev) return -EINVAL; } - if (!mddev_is_clustered(mddev)) { - err = md_allow_write(mddev); - if (err) - return err; - } + if (!mddev_is_clustered(mddev)) + md_allow_write(mddev); raid_disks = mddev->raid_disks + mddev->delta_disks; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3809a2192132..f8055a7abb4b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2309,14 +2309,12 @@ static int resize_stripes(struct r5conf *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - int err; + int err = 0; struct kmem_cache *sc; int i; int hash, cnt; - err = md_allow_write(conf->mddev); - if (err) - return err; + md_allow_write(conf->mddev); /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], @@ -6310,7 +6308,6 @@ int raid5_set_cache_size(struct mddev *mddev, int size) { 
 	struct r5conf *conf = mddev->private;
-	int err;

 	if (size <= 16 || size > 32768)
 		return -EINVAL;
@@ -6322,10 +6319,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 		;
 	mutex_unlock(&conf->cache_size_mutex);
-
-	err = md_allow_write(mddev);
-	if (err)
-		return err;
+	md_allow_write(mddev);

 	mutex_lock(&conf->cache_size_mutex);
 	while (size > conf->max_nr_stripes)

From 29efc390b9462582ae95eb9a0b8cd17ab956afc0 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Sun, 7 May 2017 17:36:24 -0700
Subject: [PATCH 07/74] md/md0: optimize raid0 discard handling

There are complaints that raid0 discard handling is slow. Currently we
divide a discard request into chunks and dispatch them to the underlying
disks. The block layer then merges them back into big requests. This
causes a lot of request splitting/merging and uses significant CPU time.

A simple idea is to calculate the range on each raid disk for an IO
request and send one discard request per raid disk, which avoids the
split/merge completely. Previously Coly tried the approach, but the
implementation was too complex because of raid0 zones. This patch always
splits the bio at the zone boundary and handles the bio within one zone.
That simplifies the implementation a lot.

Reviewed-by: NeilBrown
Acked-by: Coly Li
Signed-off-by: Shaohua Li
---
 drivers/md/raid0.c | 116 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 102 insertions(+), 14 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 84e58596594d..d6c0bc76e837 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
 		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
-		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+		blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);

 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
 		blk_queue_io_opt(mddev->queue,
@@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
 	}
 }

+static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+	struct r0conf *conf = mddev->private;
+	struct strip_zone *zone;
+	sector_t start = bio->bi_iter.bi_sector;
+	sector_t end;
+	unsigned int stripe_size;
+	sector_t first_stripe_index, last_stripe_index;
+	sector_t start_disk_offset;
+	unsigned int start_disk_index;
+	sector_t end_disk_offset;
+	unsigned int end_disk_index;
+	unsigned int disk;
+
+	zone = find_zone(conf, &start);
+
+	if (bio_end_sector(bio) > zone->zone_end) {
+		struct bio *split = bio_split(bio,
+			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
+			mddev->bio_set);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		end = zone->zone_end;
+	} else
+		end = bio_end_sector(bio);
+
+	if (zone != conf->strip_zone)
+		end = end - zone[-1].zone_end;
+
+	/* Now start and end is the offset in zone */
+	stripe_size = zone->nb_dev * mddev->chunk_sectors;
+
+	first_stripe_index = start;
+	sector_div(first_stripe_index, stripe_size);
+	last_stripe_index = end;
+	sector_div(last_stripe_index, stripe_size);
+
+	start_disk_index = (int)(start - first_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		first_stripe_index * mddev->chunk_sectors;
+	end_disk_index = (int)(end - last_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	end_disk_offset = ((int)(end -
last_stripe_index * stripe_size) % + mddev->chunk_sectors) + + last_stripe_index * mddev->chunk_sectors; + + for (disk = 0; disk < zone->nb_dev; disk++) { + sector_t dev_start, dev_end; + struct bio *discard_bio = NULL; + struct md_rdev *rdev; + + if (disk < start_disk_index) + dev_start = (first_stripe_index + 1) * + mddev->chunk_sectors; + else if (disk > start_disk_index) + dev_start = first_stripe_index * mddev->chunk_sectors; + else + dev_start = start_disk_offset; + + if (disk < end_disk_index) + dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > end_disk_index) + dev_end = last_stripe_index * mddev->chunk_sectors; + else + dev_end = end_disk_offset; + + if (dev_end <= dev_start) + continue; + + rdev = conf->devlist[(zone - conf->strip_zone) * + conf->strip_zone[0].nb_dev + disk]; + if (__blkdev_issue_discard(rdev->bdev, + dev_start + zone->dev_start + rdev->data_offset, + dev_end - dev_start, GFP_NOIO, 0, &discard_bio) || + !discard_bio) + continue; + bio_chain(discard_bio, bio); + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(rdev->bdev), + discard_bio, disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); + generic_make_request(discard_bio); + } + bio_endio(bio); +} + static void raid0_make_request(struct mddev *mddev, struct bio *bio) { struct strip_zone *zone; @@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) return; } + if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { + raid0_handle_discard(mddev, bio); + return; + } + bio_sector = bio->bi_iter.bi_sector; sector = bio_sector; chunk_sects = mddev->chunk_sectors; @@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) bio->bi_iter.bi_sector = sector + zone->dev_start + tmp_dev->data_offset; - if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { - /* Just ignore it */ - bio_endio(bio); - } else { - if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), - bio, disk_devt(mddev->gendisk), - bio_sector); - mddev_check_writesame(mddev, bio); - mddev_check_write_zeroes(mddev, bio); - generic_make_request(bio); - } + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), + bio, disk_devt(mddev->gendisk), + bio_sector); + mddev_check_writesame(mddev, bio); + mddev_check_write_zeroes(mddev, bio); + generic_make_request(bio); } static void raid0_status(struct seq_file *seq, struct mddev *mddev) From f5c8b9601036869e162cb278aaafbf003dc4e5a0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 3 May 2017 09:15:07 +0200 Subject: [PATCH 08/74] s390/uaccess: use sane length for __strncpy_from_user() The average string that is copied from user space to kernel space is rather short. E.g. booting a system involves about 50.000 strncpy_from_user() calls where the NULL terminated string has an average size of 27 bytes. By default our s390 specific strncpy_from_user() implementation however copies up to 4096 bytes, which is a waste of cpu cycles and cache lines. Therefore reduce the default length to L1_CACHE_BYTES (256 bytes), which also reduces the average execution time of strncpy_from_user() by 30-40%. Alternatively we could have switched to the generic strncpy_from_user() implementation, however it turned out that that variant would be slower than the now optimized s390 variant. 
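[ Note: a runnable user-space sketch of the new chunking arithmetic.
The L1_CACHE_BYTES value and the offset/len computation mirror the two
changed lines; the address, the size and the rest of the scaffolding are
made up for illustration. ]

    #include <stdint.h>
    #include <stdio.h>

    #define L1_CACHE_BYTES 256UL    /* s390 cache line size */

    int main(void)
    {
            uintptr_t src = 0x1000f0;           /* hypothetical address */
            unsigned long size = 600, done = 0;

            while (done < size) {
                    unsigned long offset = (src + done) & (L1_CACHE_BYTES - 1);
                    unsigned long len = size - done;

                    if (len > L1_CACHE_BYTES - offset)
                            len = L1_CACHE_BYTES - offset;
                    /* the kernel copies len bytes here and stops early
                     * at the terminating NUL */
                    printf("chunk: offset-in-line=%lu len=%lu\n", offset, len);
                    done += len;
            }
            return 0;
    }

The first chunk only runs to the end of the current cache line, so a
typical 27-byte string touches a single line instead of a whole page.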
Reported-by: Al Viro Reported-by: Linus Torvalds Signed-off-by: Heiko Carstens Reviewed-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/lib/uaccess.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 1e5bb2b86c42..b3bd3f23b8e8 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -337,8 +337,8 @@ long __strncpy_from_user(char *dst, const char __user *src, long size) return 0; done = 0; do { - offset = (size_t)src & ~PAGE_MASK; - len = min(size - done, PAGE_SIZE - offset); + offset = (size_t)src & (L1_CACHE_BYTES - 1); + len = min(size - done, L1_CACHE_BYTES - offset); if (copy_from_user(dst, src, len)) return -EFAULT; len_str = strnlen(dst, len); From 80ba38469aa28bbcfc7a31e5b41adfc42120465e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 4 May 2017 13:06:58 +0200 Subject: [PATCH 09/74] s390/topology: let topology_mnest_limit() return unsigned char Fixes a couple of compile warnings with gcc 7.1.0 : arch/s390/kernel/sysinfo.c:578:20: note: directive argument in the range [-2147483648, 4] sprintf(link_to, "15_1_%d", topology_mnest_limit()); Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/sysinfo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index 73bff45ced55..e784bed6ed7f 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -146,7 +146,7 @@ extern int topology_max_mnest; * Returns the maximum nesting level supported by the cpu topology code. * The current maximum level is 4 which is the drawer level. */ -static inline int topology_mnest_limit(void) +static inline unsigned char topology_mnest_limit(void) { return min(topology_max_mnest, 4); } From 2685df6776b2c69967eaead48f694869f7dc91ca Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 4 May 2017 13:35:33 +0200 Subject: [PATCH 10/74] s390/ccwgroup: increase string buffer size Avoid false positive warnings like this with gcc 7.1: drivers/s390/cio/ccwgroup.c:41:21: warning: '%d' directive writing between 1 and 10 bytes into a region of size 4 sprintf(str, "cdev%d", i); and simply increase the size of the string buffer. Reviewed-by: Sebastian Ott Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/ccwgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index e443b0d0b236..34b9ad6b3143 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -35,7 +35,7 @@ static struct bus_type ccwgroup_bus_type; static void __ccwgroup_remove_symlinks(struct ccwgroup_device *gdev) { int i; - char str[8]; + char str[16]; for (i = 0; i < gdev->count; i++) { sprintf(str, "cdev%d", i); @@ -238,7 +238,7 @@ static void ccwgroup_release(struct device *dev) static int __ccwgroup_create_symlinks(struct ccwgroup_device *gdev) { - char str[8]; + char str[16]; int i, rc; for (i = 0; i < gdev->count; i++) { From 6423ef691c50568e56bb7e33a35bf9f8d6142d23 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 4 May 2017 13:39:18 +0200 Subject: [PATCH 11/74] s390/qdio: increase string buffer size Avoid false positive warnings like this with gcc 7.1: drivers/s390/cio/qdio_debug.h:63:4: note: 'snprintf' output between 8 and 17 bytes into a destination of size 16 snprintf(debug_buffer, QDIO_DBF_LEN, text); and simply increase the size of the string buffer. 
Reviewed-by: Sebastian Ott
Signed-off-by: Heiko Carstens
Signed-off-by: Martin Schwidefsky
---
 drivers/s390/cio/qdio_debug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/cio/qdio_debug.h b/drivers/s390/cio/qdio_debug.h
index f33ce8577619..1d595d17bf11 100644
--- a/drivers/s390/cio/qdio_debug.h
+++ b/drivers/s390/cio/qdio_debug.h
@@ -11,7 +11,7 @@
 #include "qdio.h"

 /* that gives us 15 characters in the text event views */
-#define QDIO_DBF_LEN	16
+#define QDIO_DBF_LEN	32

 extern debug_info_t *qdio_dbf_setup;
 extern debug_info_t *qdio_dbf_error;

From d04a4c76f71dd5335f8e499b59617382d84e2b8d Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Thu, 4 May 2017 09:42:22 +0200
Subject: [PATCH 12/74] s390: move _text symbol to address higher than zero

The perf tool assumes that kernel symbols are never present at address
zero. In fact, it assumes that if a function which maps symbols to
addresses returns zero, the symbol was not found. Given that s390's
_text symbol historically is located at address zero, this yields at
least a couple of false errors and warnings about supposedly missing
symbols in one of perf's test cases ("perf test 1").

To fix this, simply move the _text symbol to address 0x200, just behind
the initial psw and channel program located at the beginning of the
kernel image. This is now hard coded within the linker script.

I tried a nicer solution which moves the initial psw and channel program
into a section of their own. However, that would move the symbols within
the "real" head.text section to different addresses, since the ".org"
statements within head.S are relative to the head.text section. If there
is a new section in front, everything else will be moved. Alternatively
I could have adjusted all ".org" statements. But this current solution
seems to be the easiest one, since nobody really cares where the _text
symbol is actually located.

Reported-by: Zvonko Kosic
Signed-off-by: Heiko Carstens
Signed-off-by: Martin Schwidefsky
---
 arch/s390/kernel/vmlinux.lds.S | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 72307f108c40..6e2c42bd1c3b 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -31,8 +31,14 @@ SECTIONS
 {
 	. = 0x00000000;
 	.text : {
-	_text = .;	/* Text and read-only data */
+	/* Text and read-only data */
 	HEAD_TEXT
+	/*
+	 * E.g. perf doesn't like symbols starting at address zero,
+	 * therefore skip the initial PSW and channel program located
+	 * at address zero and let _text start at 0x200.
+	 */
+	_text = 0x200;
 	TEXT_TEXT
 	SCHED_TEXT
 	CPUIDLE_TEXT

From bb3338d3474e0329918fda9dae2c52751731eb58 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Mon, 8 May 2017 17:39:24 -0700
Subject: [PATCH 13/74] md/raid5-cache: in r5l_do_submit_io(), submit io->split_bio first

In r5l_do_submit_io(), it is necessary to check io->split_bio before
submitting io->current_bio. This is because the endio of current_bio may
free the whole IO unit, and thus change io->split_bio.
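[ Note: the bug class here is a read of io->split_bio after a call that
can free 'io'. A compilable sketch of the ordering rule, with the types
stubbed out; only the submission order matters. ]

    struct bio { int opf; };
    struct io_unit { struct bio *current_bio; struct bio *split_bio; };

    static void submit(struct bio *bio) { (void)bio; }

    static void do_submit(struct io_unit *io)
    {
            /* Submit split_bio first: once current_bio is submitted,
             * its completion handler may free the whole io_unit, and
             * a later read of io->split_bio would be a use-after-free. */
            if (io->split_bio)
                    submit(io->split_bio);
            submit(io->current_bio);
            /* 'io' must not be dereferenced past this point */
    }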
Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 26ba09282e7c..a6a62e212cd3 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -622,20 +622,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) __r5l_set_io_unit_state(io, IO_UNIT_IO_START); spin_unlock_irqrestore(&log->io_list_lock, flags); + /* + * In case of journal device failures, submit_bio will get error + * and calls endio, then active stripes will continue write + * process. Therefore, it is not necessary to check Faulty bit + * of journal device here. + * + * We can't check split_bio after current_bio is submitted. If + * io->split_bio is null, after current_bio is submitted, current_bio + * might already be completed and the io_unit is freed. We submit + * split_bio first to avoid the issue. + */ + if (io->split_bio) { + if (io->has_flush) + io->split_bio->bi_opf |= REQ_PREFLUSH; + if (io->has_fua) + io->split_bio->bi_opf |= REQ_FUA; + submit_bio(io->split_bio); + } + if (io->has_flush) io->current_bio->bi_opf |= REQ_PREFLUSH; if (io->has_fua) io->current_bio->bi_opf |= REQ_FUA; submit_bio(io->current_bio); - - if (!io->split_bio) - return; - - if (io->has_flush) - io->split_bio->bi_opf |= REQ_PREFLUSH; - if (io->has_fua) - io->split_bio->bi_opf |= REQ_FUA; - submit_bio(io->split_bio); } /* deferred io_unit will be dispatched here */ From efc0c21c9ea786d6f019d7df7b4e3932f3578d90 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Thu, 2 Mar 2017 12:23:45 +0100 Subject: [PATCH 14/74] s390: convert debug_info.ref_count from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. 
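[ Note: a simplified user-space model of what refcount_t buys over a
bare atomic_t: increments saturate instead of wrapping, so an overflowed
counter can never come back around to zero and trigger a premature free.
This sketches the semantics only, not the kernel's actual
implementation. ]

    #include <limits.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    typedef struct { atomic_uint refs; } refcount_t;

    static void refcount_set(refcount_t *r, unsigned int n)
    {
            atomic_store(&r->refs, n);
    }

    static void refcount_inc(refcount_t *r)
    {
            unsigned int old = atomic_fetch_add(&r->refs, 1);

            /* saturate rather than wrap or resurrect a dead object */
            if (old == 0 || old == UINT_MAX)
                    atomic_store(&r->refs, UINT_MAX);
    }

    static bool refcount_dec_and_test(refcount_t *r)
    {
            return atomic_fetch_sub(&r->refs, 1) == 1;
    }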
Signed-off-by: Elena Reshetova
Signed-off-by: Hans Liljestrand
Signed-off-by: Kees Cook
Signed-off-by: David Windsor
Signed-off-by: Heiko Carstens
Signed-off-by: Martin Schwidefsky
---
 arch/s390/include/asm/debug.h | 3 ++-
 arch/s390/kernel/debug.c | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h
index 0206c8052328..df7b54ea956d 100644
--- a/arch/s390/include/asm/debug.h
+++ b/arch/s390/include/asm/debug.h
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 #include

 #define DEBUG_MAX_LEVEL 6	/* debug levels range from 0 to 6 */
@@ -31,7 +32,7 @@ struct debug_view;
 typedef struct debug_info {
 	struct debug_info* next;
 	struct debug_info* prev;
-	atomic_t ref_count;
+	refcount_t ref_count;
 	spinlock_t lock;
 	int level;
 	int nr_areas;
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 530226b6cb19..86b3e74f569e 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -277,7 +277,7 @@ debug_info_alloc(const char *name, int pages_per_area, int nr_areas,
 	memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *));
 	memset(rc->debugfs_entries, 0 ,DEBUG_MAX_VIEWS * sizeof(struct dentry*));
-	atomic_set(&(rc->ref_count), 0);
+	refcount_set(&(rc->ref_count), 0);

 	return rc;

@@ -361,7 +361,7 @@ debug_info_create(const char *name, int pages_per_area, int nr_areas,
 		debug_area_last = rc;
 	rc->next = NULL;

-	debug_info_get(rc);
+	refcount_set(&rc->ref_count, 1);
 out:
 	return rc;
 }
@@ -416,7 +416,7 @@ static void
 debug_info_get(debug_info_t * db_info)
 {
 	if (db_info)
-		atomic_inc(&db_info->ref_count);
+		refcount_inc(&db_info->ref_count);
 }

 /*
@@ -431,7 +431,7 @@ debug_info_put(debug_info_t *db_info)
 	if (!db_info)
 		return;
-	if (atomic_dec_and_test(&db_info->ref_count)) {
+	if (refcount_dec_and_test(&db_info->ref_count)) {
 		for (i = 0; i < DEBUG_MAX_VIEWS; i++) {
 			if (!db_info->views[i])
 				continue;

From 23b245c04d0ef408087430dd4d1b214a5da1eb78 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Wed, 10 May 2017 08:47:11 -0700
Subject: [PATCH 15/74] md/raid1/10: avoid unnecessary locking

If we add bios to the block plugging list, locking is unnecessary,
since the block unplug is guaranteed not to run at that time.
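[ Note: the pattern being exploited, in user-space terms: the plug list
is private to the submitting task until it is unplugged, so it can be
appended to without the device lock; only the shared pending list still
needs it. Names are illustrative. ]

    #include <pthread.h>
    #include <stddef.h>

    struct bio { struct bio *next; };
    struct plug { struct bio *pending; };   /* per-task, private */

    static struct bio *shared_pending;      /* global, lock-protected */
    static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;

    static void queue_bio(struct plug *plug, struct bio *bio)
    {
            if (plug) {             /* private list: no locking needed */
                    bio->next = plug->pending;
                    plug->pending = bio;
            } else {                /* shared list: take the lock */
                    pthread_mutex_lock(&device_lock);
                    bio->next = shared_pending;
                    shared_pending = bio;
                    pthread_mutex_unlock(&device_lock);
                    /* the md_wakeup_thread() call goes here */
            }
    }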
Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 7 +++---- drivers/md/raid10.c | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7c1f73398800..a17ed6218d51 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1529,17 +1529,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, plug = container_of(cb, struct raid1_plug_cb, cb); else plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); if (plug) { bio_list_add(&plug->pending, mbio); plug->pending_cnt++; } else { + spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; - } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) + spin_unlock_irqrestore(&conf->device_lock, flags); md_wakeup_thread(mddev->thread); + } } r1_bio_write_done(r1_bio); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6b86a0032cf8..4343d7ff9916 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1282,17 +1282,16 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, plug = container_of(cb, struct raid10_plug_cb, cb); else plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); if (plug) { bio_list_add(&plug->pending, mbio); plug->pending_cnt++; } else { + spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; - } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) + spin_unlock_irqrestore(&conf->device_lock, flags); md_wakeup_thread(mddev->thread); + } } static void raid10_write_request(struct mddev *mddev, struct bio *bio, From 70d466f760b351fe30b5f8c956354ddf29aa676b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 11 May 2017 15:28:28 -0700 Subject: [PATCH 16/74] md/r5cache: gracefully handle journal device errors for writeback mode For the raid456 with writeback cache, when journal device failed during normal operation, it is still possible to persist all data, as all pending data is still in stripe cache. However, it is necessary to handle journal failure gracefully. During journal failures, the following logic handles the graceful shutdown of journal: 1. raid5_error() marks the device as Faulty and schedules async work log->disable_writeback_work; 2. In disable_writeback_work (r5c_disable_writeback_async), the mddev is suspended, set to write through, and then resumed. mddev_suspend() flushes all cached stripes; 3. All cached stripes need to be flushed carefully to the RAID array. This patch fixes issues within the process above: 1. In r5c_update_on_rdev_error() schedule disable_writeback_work for journal failures; 2. In r5c_disable_writeback_async(), wait for MD_SB_CHANGE_PENDING, since raid5_error() updates superblock. 3. In handle_stripe(), allow stripes with data in journal (s.injournal > 0) to make progress during log_failed; 4. In delay_towrite(), if log failed only process data in the cache (skip new writes in dev->towrite); 5. In __get_priority_stripe(), process loprio_list during journal device failures. 6. In raid5_remove_disk(), wait for all cached stripes are flushed before calling log_exit(). 
Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 11 +++++++++-- drivers/md/raid5-log.h | 3 ++- drivers/md/raid5.c | 29 +++++++++++++++++++++++------ 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index a6a62e212cd3..cc3f8442f11f 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -24,6 +24,7 @@ #include "md.h" #include "raid5.h" #include "bitmap.h" +#include "raid5-log.h" /* * metadata/data stored in disk with 4k size unit (a block) regardless @@ -680,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work) return; pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", mdname(mddev)); + + /* wait superblock change before suspend */ + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + mddev_suspend(mddev); log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; mddev_resume(mddev); @@ -2983,7 +2989,7 @@ static int r5l_load_log(struct r5l_log *log) return ret; } -void r5c_update_on_rdev_error(struct mddev *mddev) +void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) { struct r5conf *conf = mddev->private; struct r5l_log *log = conf->log; @@ -2991,7 +2997,8 @@ void r5c_update_on_rdev_error(struct mddev *mddev) if (!log) return; - if (raid5_calc_degraded(conf) > 0 && + if ((raid5_calc_degraded(conf) > 0 || + test_bit(Journal, &rdev->flags)) && conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) schedule_work(&log->disable_writeback_work); } diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 27097101ccca..328d67aedda4 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -28,7 +28,8 @@ extern void r5c_flush_cache(struct r5conf *conf, int num); extern void r5c_check_stripe_cache_usage(struct r5conf *conf); extern void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; -extern void r5c_update_on_rdev_error(struct mddev *mddev); +extern void r5c_update_on_rdev_error(struct mddev *mddev, + struct md_rdev *rdev); extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); extern struct dma_async_tx_descriptor * diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f8055a7abb4b..0ac57a925606 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2689,7 +2689,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); - r5c_update_on_rdev_error(mddev); + r5c_update_on_rdev_error(mddev, rdev); } /* @@ -3050,6 +3050,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) * When LOG_CRITICAL, stripes with injournal == 0 will be sent to * no_space_stripes list. * + * 3. during journal failure + * In journal failure, we try to flush all cached data to raid disks + * based on data in stripe cache. The array is read-only to upper + * layers, so we would skip all pending writes. 
+ *
  */
 static inline bool delay_towrite(struct r5conf *conf,
 				 struct r5dev *dev,
@@ -3063,6 +3068,9 @@ static inline bool delay_towrite(struct r5conf *conf,
 	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
 	    s->injournal > 0)
 		return true;
+	/* case 3 above */
+	if (s->log_failed && s->injournal)
+		return true;
 	return false;
 }
@@ -4696,10 +4704,15 @@ static void handle_stripe(struct stripe_head *sh)
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
 	       s.failed_num[0], s.failed_num[1]);
-	/* check if the array has lost more than max_degraded devices and,
+	/*
+	 * check if the array has lost more than max_degraded devices and,
 	 * if so, some requests might need to be failed.
+	 *
+	 * When the journal device failed (log_failed), we will only
+	 * process the stripe if there is data that needs to be written
+	 * to the raid disks
 	 */
-	if (s.failed > conf->max_degraded || s.log_failed) {
+	if (s.failed > conf->max_degraded ||
+	    (s.log_failed && s.injournal == 0)) {
 		sh->check_state = 0;
 		sh->reconstruct_state = 0;
 		break_stripe_batch_list(sh, 0);
@@ -5272,8 +5285,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 	struct stripe_head *sh, *tmp;
 	struct list_head *handle_list = NULL;
 	struct r5worker_group *wg;
-	bool second_try = !r5c_is_writeback(conf->log);
-	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
+	bool second_try = !r5c_is_writeback(conf->log) &&
+		!r5l_log_disk_error(conf);
+	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
+		r5l_log_disk_error(conf);

 again:
 	wg = NULL;
@@ -7521,7 +7536,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * neilb: there is no locking about new writes here,
 		 * so this cannot be safe.
 		 */
-		if (atomic_read(&conf->active_stripes)) {
+		if (atomic_read(&conf->active_stripes) ||
+		    atomic_read(&conf->r5c_cached_full_stripes) ||
+		    atomic_read(&conf->r5c_cached_partial_stripes)) {
 			return -EBUSY;
 		}
 		log_exit(conf);

From 5ddf0440a1a28f00f69ed2e093476bab3b60c2c3 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Thu, 11 May 2017 17:03:44 -0700
Subject: [PATCH 17/74] md/r5cache: handle sync with data in write back cache

Currently, sync of a raid456 array cannot make progress when hitting
data in the writeback r5cache. This patch fixes the issue by flushing
cached data of the stripe before processing the sync request. This is
achieved by:

1. In handle_stripe(), do not set STRIPE_SYNCING if the stripe is in
   write back cache;
2. In r5c_try_caching_write(), handle the stripe in sync with write
   through;
3. In do_release_stripe(), make the stripe in sync write out and send
   it to the state machine.

Shaohua: explicitly set STRIPE_HANDLE after write out completed

Signed-off-by: Song Liu
Signed-off-by: Shaohua Li
---
 drivers/md/raid5-cache.c | 8 +++++++-
 drivers/md/raid5.c | 21 +++++++++++++------
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index cc3f8442f11f..4c00bc248287 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2637,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf,
 	 * When run in degraded mode, array is set to write-through mode.
 	 * This check helps drain pending write safely in the transition to
 	 * write-through mode.
+	 *
+	 * When a stripe is syncing, the write is also handled in write
+	 * through mode.
 	 */
-	if (s->failed) {
+	if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
 		r5c_make_stripe_write_out(sh);
 		return -EAGAIN;
 	}
@@ -2841,6 +2844,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 	}

 	r5l_append_flush_payload(log, sh->sector);
+	/* stripe is flushed to raid disks, we can do resync now */
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+		set_bit(STRIPE_HANDLE, &sh->state);
 }

 int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0ac57a925606..9c4f7659f8b1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -233,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 		if (test_bit(R5_InJournal, &sh->dev[i].flags))
 			injournal++;
 	/*
-	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
-	 * data in journal, so they are not released to cached lists
+	 * In the following cases, the stripe cannot be released to cached
+	 * lists. Therefore, we make the stripe write out and set
+	 * STRIPE_HANDLE:
+	 * 1. when quiesce in r5c write back;
+	 * 2. when resync is requested for the stripe.
 	 */
-	if (conf->quiesce && r5c_is_writeback(conf->log) &&
-	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
+	    (conf->quiesce && r5c_is_writeback(conf->log) &&
+	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
 		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 			r5c_make_stripe_write_out(sh);
 		set_bit(STRIPE_HANDLE, &sh->state);
@@ -4656,8 +4660,13 @@ static void handle_stripe(struct stripe_head *sh)

 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
 		spin_lock(&sh->stripe_lock);
-		/* Cannot process 'sync' concurrently with 'discard' */
-		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+		/*
+		 * Cannot process 'sync' concurrently with 'discard'.
+		 * Flush data in r5cache before 'sync'.
+		 */
+		if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+		    !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
+		    !test_bit(STRIPE_DISCARD, &sh->state) &&
 		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 			set_bit(STRIPE_SYNCING, &sh->state);
 			clear_bit(STRIPE_INSYNC, &sh->state);

From d82dd0e34d0347be201fd274dc84cd645dccc064 Mon Sep 17 00:00:00 2001
From: Tomasz Majchrzak
Date: Fri, 12 May 2017 14:26:10 +0200
Subject: [PATCH 18/74] raid1: prefer disk without bad blocks

If an array consists of two drives and the first drive has a bad block,
a read request to the region overlapping the bad block chooses the same
disk (with the bad block) as the device to read from over and over, and
the request gets stuck. If the first disk only partially overlaps with
the bad block, it becomes a candidate ("best disk") for a shorter range
of sectors. The second disk is capable of reading the entire requested
range and is updated accordingly, however it is not recorded as the best
device for the request. In the end the request is sent to the first disk
to read the entire range of sectors. It fails and is re-tried in a
moment, but with the same outcome.

Actually it is quite a likely scenario, but it had little exposure in my
test until commit 715d40b93b10 ("md/raid1: add failfast handling for
reads.") removed the preference for the idle disk. Such a scenario had
been passing as the second disk was always chosen when idle.

Reset the candidate ("best disk") to read from if a disk can read the
entire range. Do it only if another disk has already been chosen as a
candidate for a smaller range.
The head position / disk type logic will select the best disk to read
from - this is fine, as the disk with the bad block won't be considered
for it.

Signed-off-by: Tomasz Majchrzak
Signed-off-by: Shaohua Li
---
 drivers/md/raid1.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a17ed6218d51..af5056d56878 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -666,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 				break;
 			}
 			continue;
-		} else
+		} else {
+			if ((sectors > best_good_sectors) && (best_disk >= 0))
+				best_disk = -1;
 			best_good_sectors = sectors;
+		}

 		if (best_disk >= 0)
 			/* At least two disks to choose from so failfast is OK */

From b9a985db98961ae1ba0be169f19df1c567e4ffe0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Thu, 11 May 2017 18:21:01 -0500
Subject: [PATCH 19/74] pid_ns: Sleep in TASK_INTERRUPTIBLE in zap_pid_ns_processes

The code can potentially sleep for an indefinite amount of time in
zap_pid_ns_processes, triggering the hung task timeout and increasing
the system load average. This is undesirable.

Sleep with a task state of TASK_INTERRUPTIBLE instead of
TASK_UNINTERRUPTIBLE to remove these undesirable side effects.

Apparently under heavy load this has been allowing Chrome to trigger the
hung task timeout error and cause ChromeOS to reboot.

Reported-by: Vovo Yang
Reported-by: Guenter Roeck
Tested-by: Guenter Roeck
Fixes: 6347e9009104 ("pidns: guarantee that the pidns init will be the last pidns process reaped")
Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman"
---
 kernel/pid_namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index d1f3e9f558b8..74a5a7255b4d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -277,7 +277,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 * if reparented.
 	 */
 	for (;;) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE);
 		if (pid_ns->nr_hashed == init_pids)
 			break;
 		schedule();

From 3fd37226216620c1a468afa999739d5016fbc349 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai
Date: Fri, 12 May 2017 19:11:31 +0300
Subject: [PATCH 20/74] pid_ns: Fix race between setns'ed fork() and zap_pid_ns_processes()

Imagine we have a pid namespace and a task from its parent's pid_ns,
which made setns() to the pid namespace. The task is doing fork(), while
the pid namespace's child reaper is dying. We have a race between them:

Task from parent pid_ns              Child reaper
copy_process()                       ..
  alloc_pid()                        ..
  ..                                 zap_pid_ns_processes()
  ..                                   disable_pid_allocation()
  ..                                   read_lock(&tasklist_lock)
  ..                                   iterate over pids in pid_ns
  ..                                     kill tasks linked to pids
  ..                                   read_unlock(&tasklist_lock)
  write_lock_irq(&tasklist_lock);    ..
  attach_pid(p, PIDTYPE_PID);        ..
  ..                                 ..

So the just-created task p won't receive a SIGKILL signal, and the pid
namespace will be in a contradictory state. Only a manual kill will help
there, but does userspace care about this? I suppose most users just
inject a task into a pid namespace and wait for a SIGCHLD from it.

The patch fixes the problem. It simply checks for
(pid_ns->nr_hashed & PIDNS_HASH_ADDING) in copy_process(). We do it
under the tasklist_lock, and can't skip PIDNS_HASH_ADDING as noted by
Oleg:

"zap_pid_ns_processes() does disable_pid_allocation() and then takes
tasklist_lock to kill the whole namespace. Given that copy_process()
checks PIDNS_HASH_ADDING under write_lock(tasklist) they can't race; if
copy_process() takes this lock first, the new child will be killed,
otherwise copy_process() can't miss the change in ->nr_hashed."

If allocation is disabled, we just return -ENOMEM, as is done for such
cases in alloc_pid().

v2: Do not move disable_pid_allocation(), do not introduce a new
variable in copy_process() and simplify the patch as suggested by Oleg
Nesterov. Take into account the double irq enabling problem found by
Eric W. Biederman.

Fixes: c876ad768215 ("pidns: Stop pid allocation when init dies")
Signed-off-by: Kirill Tkhai
CC: Andrew Morton
CC: Ingo Molnar
CC: Peter Zijlstra
CC: Oleg Nesterov
CC: Mike Rapoport
CC: Michal Hocko
CC: Andy Lutomirski
CC: "Eric W. Biederman"
CC: Andrei Vagin
CC: Cyrill Gorcunov
CC: Serge Hallyn
Cc: stable@vger.kernel.org
Acked-by: Oleg Nesterov
Signed-off-by: Eric W. Biederman
---
 kernel/fork.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 06d759ab4c62..aa1076c5e4a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1845,11 +1845,13 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 	recalc_sigpending();
 	if (signal_pending(current)) {
-		spin_unlock(&current->sighand->siglock);
-		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
 		goto bad_fork_cancel_cgroup;
 	}
+	if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+		retval = -ENOMEM;
+		goto bad_fork_cancel_cgroup;
+	}
 	if (likely(p->pid)) {
 		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -1907,6 +1909,8 @@ static __latent_entropy struct task_struct *copy_process(
 	return p;

 bad_fork_cancel_cgroup:
+	spin_unlock(&current->sighand->siglock);
+	write_unlock_irq(&tasklist_lock);
 	cgroup_cancel_fork(p);
 bad_fork_free_pid:
 	cgroup_threadgroup_change_end(current);

From 90b4f30b6d15222a509dacf47f29efef2b22571e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 10 May 2017 16:30:12 +0200
Subject: [PATCH 21/74] hwmon: (coretemp) Handle frozen hotplug state correctly

The recent conversion to the hotplug state machine missed that the
original hotplug notifiers did not execute in the frozen state, which is
used during suspend and resume. This does not matter on single socket
machines, but on multi socket systems this breaks when the device for a
non-boot socket is removed when the last CPU of that socket is brought
offline. The device removal locks up the machine hard without any debug
output.

Prevent executing the hotplug callbacks when cpuhp_tasks_frozen is true.

Thanks to Tommi for providing debug information patiently while I failed
to spot the obvious.

Fixes: e00ca5df37ad ("hwmon: (coretemp) Convert to hotplug state machine")
Reported-by: Tommi Rantala
Tested-by: Tommi Rantala
Signed-off-by: Thomas Gleixner
Signed-off-by: Guenter Roeck
---
 drivers/hwmon/coretemp.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 3ac4c03ba77b..c13a4fd86b3c 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -604,6 +604,13 @@ static int coretemp_cpu_online(unsigned int cpu)
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	struct platform_data *pdata;

+	/*
+	 * Don't execute this on resume as the offline callback did
+	 * not get executed on suspend.
+	 */
+	if (cpuhp_tasks_frozen)
+		return 0;
+
 	/*
 	 * CPUID.06H.EAX[0] indicates whether the CPU has thermal
 	 * sensors. We check this bit only, all the early CPUs
@@ -654,6 +661,13 @@ static int coretemp_cpu_offline(unsigned int cpu)
 	struct temp_data *tdata;
 	int indx, target;

+	/*
+	 * Don't execute this on suspend as the device remove locks
+	 * up the machine.
+	 */
+	if (cpuhp_tasks_frozen)
+		return 0;
+
 	/* If the physical CPU device does not exist, just return */
 	if (!pdev)
 		return 0;

From 072792dcdfc8d5f91a26050e5665285f50afebf5 Mon Sep 17 00:00:00 2001
From: Joe Thornber
Date: Thu, 11 May 2017 06:14:16 -0400
Subject: [PATCH 22/74] dm cache: fix incorrect 'idle_time' reset in IO tracker

Some bios have no payload (e.g. a FLUSH); don't reset the idle_time
when these come in.

Signed-off-by: Joe Thornber
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-cache-target.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1db375f50a13..0760ba409c21 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -94,6 +94,9 @@ static void iot_io_begin(struct io_tracker *iot, sector_t len)

 static void __iot_io_end(struct io_tracker *iot, sector_t len)
 {
+	if (!len)
+		return;
+
 	iot->in_flight -= len;
 	if (!iot->in_flight)
 		iot->idle_time = jiffies;

From a8cd1eba6135e086109e2b94bf96deb17456ede8 Mon Sep 17 00:00:00 2001
From: Joe Thornber
Date: Thu, 11 May 2017 05:07:34 -0400
Subject: [PATCH 23/74] dm cache policy smq: only demote entries in bottom half of the clean multiqueue

Heavy IO load may mean there are very few clean blocks in the cache, and
we risk demoting entries that get hit a lot.

Signed-off-by: Joe Thornber
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-cache-policy-smq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 72479bd61e11..a177559f2049 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1190,7 +1190,7 @@ static void queue_demotion(struct smq_policy *mq)
 	if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
 		return;

-	e = q_peek(&mq->clean, mq->clean.nr_levels, true);
+	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
 	if (!e) {
 		if (!clean_target_met(mq, false))
 			queue_writeback(mq);

From 78c45607b909fb384c47c134d89b39285a6a8b45 Mon Sep 17 00:00:00 2001
From: Joe Thornber
Date: Thu, 11 May 2017 05:09:38 -0400
Subject: [PATCH 24/74] dm cache policy smq: be more aggressive about triggering a writeback

If there are no clean entries to demote we really want to write back
immediately.

Signed-off-by: Joe Thornber
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-cache-policy-smq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index a177559f2049..5aa8f43856c5 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1192,7 +1192,7 @@ static void queue_demotion(struct smq_policy *mq)

 	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
 	if (!e) {
-		if (!clean_target_met(mq, false))
+		if (!clean_target_met(mq, true))
 			queue_writeback(mq);
 		return;
 	}

From 4d44ec5ab751be63c5d348f13294304d87baa8c3 Mon Sep 17 00:00:00 2001
From: Joe Thornber
Date: Thu, 11 May 2017 05:11:06 -0400
Subject: [PATCH 25/74] dm cache policy smq: put newly promoted entries at the top of the multiqueue

This stops entries bouncing in and out of the cache quickly.
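[ Note: the one-line change below seeds a freshly promoted entry at the
highest multiqueue level. A sketch of the effect; the level count is
assumed for illustration, only the e->level assignment mirrors the
patch. ]

    #define NR_CACHE_LEVELS 64u   /* assumed level count */

    struct entry {
            unsigned level;   /* 0 = demoted next, top = safest */
    };

    static void complete_promotion(struct entry *e)
    {
            /* Start at the top so the entry has to age through every
             * level before it can be demoted again, instead of
             * entering near the bottom and bouncing straight back
             * out of the cache. */
            e->level = NR_CACHE_LEVELS - 1;
            /* push(mq, e) requeues it here */
    }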
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-smq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 5aa8f43856c5..54421a846a0c 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1452,6 +1452,7 @@ static void __complete_background_work(struct smq_policy *mq, clear_pending(mq, e); if (success) { e->oblock = work->oblock; + e->level = NR_CACHE_LEVELS - 1; push(mq, e); // h, q, a } else { From 6cf4cc8f8b3b7bc9e3c04a7eab44b985d50029fc Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 11 May 2017 07:48:18 -0400 Subject: [PATCH 26/74] dm cache policy smq: stop preemptively demoting blocks It causes a lot of churn if the working set's size is close to the fast device's size. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-smq.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 54421a846a0c..758480a1893d 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1134,13 +1134,10 @@ static bool clean_target_met(struct smq_policy *mq, bool idle) percent_to_target(mq, CLEAN_TARGET); } -static bool free_target_met(struct smq_policy *mq, bool idle) +static bool free_target_met(struct smq_policy *mq) { unsigned nr_free; - if (!idle) - return true; - nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated; return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >= percent_to_target(mq, FREE_TARGET); @@ -1220,7 +1217,7 @@ static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, * We always claim to be 'idle' to ensure some demotions happen * with continuous loads. */ - if (!free_target_met(mq, true)) + if (!free_target_met(mq)) queue_demotion(mq); return; } @@ -1421,14 +1418,10 @@ static int smq_get_background_work(struct dm_cache_policy *p, bool idle, spin_lock_irqsave(&mq->lock, flags); r = btracker_issue(mq->bg_work, result); if (r == -ENODATA) { - /* find some writeback work to do */ - if (mq->migrations_allowed && !free_target_met(mq, idle)) - queue_demotion(mq); - - else if (!clean_target_met(mq, idle)) + if (!clean_target_met(mq, idle)) { queue_writeback(mq); - - r = btracker_issue(mq->bg_work, result); + r = btracker_issue(mq->bg_work, result); + } } spin_unlock_irqrestore(&mq->lock, flags); From 701e03e4e180f0cd97d4139a32e2b2d879d12da2 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 11 May 2017 08:22:31 -0400 Subject: [PATCH 27/74] dm cache: track all IO to the cache rather than just the origin device's IO IO tracking is used to throttle writebacks when the origin device is busy. Even if all the IO is going to the fast device, writebacks can significantly degrade performance. So track all IO to gauge whether the cache is busy or not. Otherwise, synthetic IO tests (e.g. fio) that might send all IO to the fast device wouldn't cause writebacks to get throttled.
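For reference, roughly what the busy test built on this tracker amounts to, given that __iot_io_end() (shown in the IO tracker fix earlier in this series) stamps idle_time whenever in_flight drains to zero. This is a self-contained model, not the driver's code; the real iot_idle_for() also takes the tracker's spinlock:

#include <stdbool.h>

struct io_tracker_model {
	unsigned long in_flight;	/* accounted sectors still in flight */
	unsigned long idle_time;	/* tick count when it last drained */
};

/* Idle means: nothing accounted is in flight, and the tracker last
 * drained at least 'period' ticks ago.
 */
static bool iot_idle_for_model(const struct io_tracker_model *iot,
			       unsigned long now, unsigned long period)
{
	return !iot->in_flight && (now - iot->idle_time >= period);
}

With the tracker now accounting cache-device IO as well, a fio run against the fast device keeps in_flight non-zero, so the cache reports busy and writebacks stay throttled.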
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 0760ba409c21..232078e48167 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -477,7 +477,7 @@ struct cache { spinlock_t invalidation_lock; struct list_head invalidation_requests; - struct io_tracker origin_tracker; + struct io_tracker tracker; struct work_struct commit_ws; struct batcher committer; @@ -904,8 +904,7 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) static bool accountable_bio(struct cache *cache, struct bio *bio) { - return ((bio->bi_bdev == cache->origin_dev->bdev) && - bio_op(bio) != REQ_OP_DISCARD); + return bio_op(bio) != REQ_OP_DISCARD; } static void accounted_begin(struct cache *cache, struct bio *bio) @@ -915,7 +914,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio) if (accountable_bio(cache, bio)) { pb->len = bio_sectors(bio); - iot_io_begin(&cache->origin_tracker, pb->len); + iot_io_begin(&cache->tracker, pb->len); } } @@ -924,7 +923,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio) size_t pb_data_size = get_per_bio_data_size(cache); struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - iot_io_end(&cache->origin_tracker, pb->len); + iot_io_end(&cache->tracker, pb->len); } static void accounted_request(struct cache *cache, struct bio *bio) @@ -1725,7 +1724,7 @@ enum busy { static enum busy spare_migration_bandwidth(struct cache *cache) { - bool idle = iot_idle_for(&cache->origin_tracker, HZ); + bool idle = iot_idle_for(&cache->tracker, HZ); sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * cache->sectors_per_block; @@ -2720,7 +2719,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) batcher_init(&cache->committer, commit_op, cache, issue_op, cache, cache->wq); - iot_init(&cache->origin_tracker); + iot_init(&cache->tracker); init_rwsem(&cache->background_work_lock); prevent_background_work(cache); @@ -2944,7 +2943,7 @@ static void cache_postsuspend(struct dm_target *ti) cancel_delayed_work(&cache->waker); flush_workqueue(cache->wq); - WARN_ON(cache->origin_tracker.in_flight); + WARN_ON(cache->tracker.in_flight); /* * If it's a flush suspend there won't be any deferred bios, so this From 49b7f768900f4084a65c3689d955b2fceac39e53 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 11 May 2017 09:07:16 -0400 Subject: [PATCH 28/74] dm cache: simplify the IDLE vs BUSY state calculation Drop the MODERATE state since it wasn't buying us much. Also, in check_migrations(), prepare for the next commit ("dm cache policy smq: don't do any writebacks unless IDLE") by deferring to the policy to make the final decision on whether writebacks can be serviced. 
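The resulting decision is a pure two-way split; spelled out as a sketch (spare_bw_model is a hypothetical stand-in for the reworked spare_migration_bandwidth()):

enum busy_model { MODEL_IDLE, MODEL_BUSY };

/* Only a quiet tracker with migration volume under the threshold
 * reports IDLE; every other combination is BUSY, and the policy then
 * decides which background work is still safe to hand out.
 */
static enum busy_model spare_bw_model(int tracker_idle,
				      unsigned long volume,
				      unsigned long threshold)
{
	return (tracker_idle && volume <= threshold) ? MODEL_IDLE
						     : MODEL_BUSY;
}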
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 232078e48167..d682a0511381 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1718,7 +1718,6 @@ static int invalidate_start(struct cache *cache, dm_cblock_t cblock, enum busy { IDLE, - MODERATE, BUSY }; @@ -1728,10 +1727,10 @@ static enum busy spare_migration_bandwidth(struct cache *cache) sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * cache->sectors_per_block; - if (current_volume <= cache->migration_threshold) - return idle ? IDLE : MODERATE; + if (idle && current_volume <= cache->migration_threshold) + return IDLE; else - return idle ? MODERATE : BUSY; + return BUSY; } static void inc_hit_counter(struct cache *cache, struct bio *bio) @@ -2047,8 +2046,6 @@ static void check_migrations(struct work_struct *ws) for (;;) { b = spare_migration_bandwidth(cache); - if (b == BUSY) - break; r = policy_get_background_work(cache->policy, b == IDLE, &op); if (r == -ENODATA) From 2e63309507c818e8b631a03f02c363031c007fb7 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 11 May 2017 09:09:04 -0400 Subject: [PATCH 29/74] dm cache policy smq: don't do any writebacks unless IDLE If there are no clean blocks to be demoted, the writeback will be triggered at that point. Preemptively writing back can hurt high IO load scenarios. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-smq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 758480a1893d..e5eb9c9b4bc8 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1120,8 +1120,6 @@ static bool clean_target_met(struct smq_policy *mq, bool idle) * Cache entries may not be populated. So we cannot rely on the * size of the clean queue. */ - unsigned nr_clean; - if (idle) { /* * We'd like to clean everything. */ return q_size(&mq->dirty) == 0u; } - nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); - return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >= - percent_to_target(mq, CLEAN_TARGET); + /* + * If we're busy we don't worry about cleaning at all. + */ + return true; } static bool free_target_met(struct smq_policy *mq) From fb317002ab4419ae7e068bee6897f2d5745aa3b9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 May 2017 12:50:53 +0200 Subject: [PATCH 30/74] s390/virtio: change virtio_feature_desc:features type to __le32 The feature member of virtio_feature_desc contains little endian values, given that its contents will be converted with le32_to_cpu(). The "wrong" __u32 type leads to the sparse warnings below. In order to avoid them, use the correct __le32 type instead.
drivers/s390/virtio/virtio_ccw.c:749:14: warning: cast to restricted __le32 drivers/s390/virtio/virtio_ccw.c:762:28: warning: cast to restricted __le32 Acked-by: Halil Pasic Acked-by: Cornelia Huck Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/virtio/virtio_ccw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 0ed209f3d8b0..c7d4ef7b4b22 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -87,7 +87,7 @@ struct vq_info_block { } __packed; struct virtio_feature_desc { - __u32 features; + __le32 features; __u8 index; } __packed; From 68118e0e73aa3a6291c8b9eb1ee708e05f110cea Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Sun, 7 May 2017 07:16:30 +0200 Subject: [PATCH 31/74] i2c: mux: reg: put away the parent i2c adapter on probe failure It is only prudent to let go of resources that are not used. Fixes: b3fdd32799d8 ("i2c: mux: Add register-based mux i2c-mux-reg") Signed-off-by: Peter Rosin --- drivers/i2c/muxes/i2c-mux-reg.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-reg.c b/drivers/i2c/muxes/i2c-mux-reg.c index 406d5059072c..11974e3cd1e5 100644 --- a/drivers/i2c/muxes/i2c-mux-reg.c +++ b/drivers/i2c/muxes/i2c-mux-reg.c @@ -196,20 +196,25 @@ static int i2c_mux_reg_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); mux->data.reg_size = resource_size(res); mux->data.reg = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(mux->data.reg)) - return PTR_ERR(mux->data.reg); + if (IS_ERR(mux->data.reg)) { + ret = PTR_ERR(mux->data.reg); + goto err_put_parent; + } } if (mux->data.reg_size != 4 && mux->data.reg_size != 2 && mux->data.reg_size != 1) { dev_err(&pdev->dev, "Invalid register size\n"); - return -EINVAL; + ret = -EINVAL; + goto err_put_parent; } muxc = i2c_mux_alloc(parent, &pdev->dev, mux->data.n_values, 0, 0, i2c_mux_reg_select, NULL); - if (!muxc) - return -ENOMEM; + if (!muxc) { + ret = -ENOMEM; + goto err_put_parent; + } muxc->priv = mux; platform_set_drvdata(pdev, muxc); @@ -233,6 +238,8 @@ static int i2c_mux_reg_probe(struct platform_device *pdev) add_adapter_failed: i2c_mux_del_adapters(muxc); +err_put_parent: + i2c_put_adapter(parent); return ret; } From a36d4637e4a06be067b8e327a0b1118bb2a73cb8 Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Mon, 15 May 2017 18:48:55 +0200 Subject: [PATCH 32/74] i2c: mux: reg: rename label to indicate what it does That maintains sanity if it is ever called from some other spot, and also makes the label names coherent. 
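Both of these i2c-mux-reg patches are about the standard probe unwind idiom: acquire resources in order, release them in reverse on failure, one label per live resource, with each label named for what it undoes. A self-contained model of that shape (the step names are hypothetical, not the driver's):

#include <stdio.h>

static void put_parent(void)   { puts("put parent adapter"); }
static void del_adapters(void) { puts("delete already-added mux adapters"); }

static int probe_model(int fail_step)
{
	int ret = 0;

	if (fail_step == 1)			/* getting the parent failed: */
		return -1;			/* nothing to unwind yet */
	if (fail_step == 2) {			/* ioremap/alloc failed: */
		ret = -1;
		goto err_put_parent;		/* undo only the parent get */
	}
	if (fail_step == 3) {			/* adding adapters failed: */
		ret = -1;
		goto err_del_mux_adapters;	/* undo partial add, then parent */
	}
	return 0;

err_del_mux_adapters:
	del_adapters();
err_put_parent:
	put_parent();
	return ret;
}

int main(void)
{
	return probe_model(3) == -1 ? 0 : 1;
}

The label rename in the second patch makes the real function read exactly like this model: the fall-through order of the labels mirrors the reverse of the acquisition order.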
Signed-off-by: Peter Rosin --- drivers/i2c/muxes/i2c-mux-reg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-reg.c b/drivers/i2c/muxes/i2c-mux-reg.c index 11974e3cd1e5..d97031804de8 100644 --- a/drivers/i2c/muxes/i2c-mux-reg.c +++ b/drivers/i2c/muxes/i2c-mux-reg.c @@ -228,7 +228,7 @@ static int i2c_mux_reg_probe(struct platform_device *pdev) ret = i2c_mux_add_adapter(muxc, nr, mux->data.values[i], class); if (ret) - goto add_adapter_failed; + goto err_del_mux_adapters; } dev_dbg(&pdev->dev, "%d port mux on %s adapter\n", @@ -236,7 +236,7 @@ static int i2c_mux_reg_probe(struct platform_device *pdev) return 0; -add_adapter_failed: +err_del_mux_adapters: i2c_mux_del_adapters(muxc); err_put_parent: i2c_put_adapter(parent); From 9fce894d03a98ec8e8e8106a964644633d2772ee Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Mon, 15 May 2017 09:03:50 +0200 Subject: [PATCH 33/74] i2c: mux: only print failure message on error As is, a failure message is printed unconditionally, which is confusing. And noisy. Fixes: 8d4d159f25a7 ("i2c: mux: provide more info on failure in i2c_mux_add_adapter") Signed-off-by: Peter Rosin --- drivers/i2c/i2c-mux.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c index 26f7237558ba..9669ca4937b8 100644 --- a/drivers/i2c/i2c-mux.c +++ b/drivers/i2c/i2c-mux.c @@ -395,18 +395,20 @@ int i2c_mux_add_adapter(struct i2c_mux_core *muxc, if (force_nr) { priv->adap.nr = force_nr; ret = i2c_add_numbered_adapter(&priv->adap); - dev_err(&parent->dev, - "failed to add mux-adapter %u as bus %u (error=%d)\n", - chan_id, force_nr, ret); + if (ret < 0) { + dev_err(&parent->dev, + "failed to add mux-adapter %u as bus %u (error=%d)\n", + chan_id, force_nr, ret); + goto err_free_priv; + } } else { ret = i2c_add_adapter(&priv->adap); - dev_err(&parent->dev, - "failed to add mux-adapter %u (error=%d)\n", - chan_id, ret); - } - if (ret < 0) { - kfree(priv); - return ret; + if (ret < 0) { + dev_err(&parent->dev, + "failed to add mux-adapter %u (error=%d)\n", + chan_id, ret); + goto err_free_priv; + } } WARN(sysfs_create_link(&priv->adap.dev.kobj, &muxc->dev->kobj, @@ -422,6 +424,10 @@ int i2c_mux_add_adapter(struct i2c_mux_core *muxc, muxc->adapter[muxc->num_adapters++] = &priv->adap; return 0; + +err_free_priv: + kfree(priv); + return ret; } EXPORT_SYMBOL_GPL(i2c_mux_add_adapter); From 91bcdb92d39711d1adb40c26b653b7978d93eb98 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 15 May 2017 09:43:05 -0400 Subject: [PATCH 34/74] dm thin metadata: call precommit before saving the roots These calls were the wrong way round in __write_initial_superblock. 
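The constraint, restated: the space-map roots are only stable after the transaction manager's pre-commit has flushed pending space-map changes, so they must be captured in that order before the superblock is written. A fragment-style sketch of the corrected sequence, with error handling as in the patch; commit_superblock() is an illustrative name for the remainder of the real function, not a call taken from it:

	r = dm_tm_pre_commit(pmd->tm);	/* 1: flush space maps to disk */
	if (r < 0)
		return r;

	r = save_sm_roots(pmd);		/* 2: snapshot the now-stable roots */
	if (r < 0)
		return r;

	return commit_superblock(pmd);	/* 3: persist (hypothetical final step) */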
Cc: stable@vger.kernel.org Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 0f0251d0d337..d31d18d9727c 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -484,11 +484,11 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd) if (r < 0) return r; - r = save_sm_roots(pmd); + r = dm_tm_pre_commit(pmd->tm); if (r < 0) return r; - r = dm_tm_pre_commit(pmd->tm); + r = save_sm_roots(pmd); if (r < 0) return r; From 0377a07c7a035e0d033cd8b29f0cb15244c0916a Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 15 May 2017 09:45:40 -0400 Subject: [PATCH 35/74] dm space map disk: fix some book keeping in the disk space map When decrementing the reference count for a block, the free count wasn't being updated if the reference count went to zero. Cc: stable@vger.kernel.org Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-space-map-disk.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index ebb280a14325..32adf6b4a9c7 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -142,10 +142,23 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) { + int r; + uint32_t old_count; enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - return sm_ll_dec(&smd->ll, b, &ev); + r = sm_ll_dec(&smd->ll, b, &ev); + if (!r && (ev == SM_FREE)) { + /* + * It's only free if it's also free in the last + * transaction. + */ + r = sm_ll_lookup(&smd->old_ll, b, &old_count); + if (!r && !old_count) + smd->nr_allocated_this_transaction--; + } + + return r; } static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) From ece0728037b15f4d31198f12b359104bcb5db4c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 May 2017 17:28:36 +0200 Subject: [PATCH 36/74] dm rq: add a missing break to map_request We don't want to hit the BUG() when receiving a DM_MAPIO_KILL value. Fixes: 412445ac ("dm: introduce a new DM_MAPIO_KILL return value") Signed-off-by: Christoph Hellwig Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 2af27026aa2e..b639fa7246ee 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -507,6 +507,7 @@ static int map_request(struct dm_rq_target_io *tio) case DM_MAPIO_KILL: /* The target wants to complete the I/O */ dm_kill_unmapped_request(rq, -EIO); + break; default: DMWARN("unimplemented target map return value: %d", r); BUG(); From 18a482f5245cc875755090853e84283512b3e6bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 May 2017 17:28:37 +0200 Subject: [PATCH 37/74] dm mpath: don't return -EIO from dm_report_EIO Instead just turn the macro into a helper for the warning message. This removes an unnecessary assignment and will allow the next commit to fix a place where -EIO is the wrong return value.
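The C detail behind this change: the old dm_report_EIO() was a GNU statement expression, which yields a value, so call sites could write "return dm_report_EIO(m);". The do-while(0) form deliberately yields nothing, so each caller must now pick its return value explicitly. A minimal self-contained illustration of the two macro shapes (warn_eio is a stand-in for the real logging):

#include <stdio.h>

#define EIO 5
static void warn_eio(int id) { printf("device %d: returning EIO\n", id); }

/* old shape: a GNU statement expression, usable as a value */
#define report_eio_old(id) ({ warn_eio(id); -EIO; })

/* new shape: a plain statement, not usable as a value */
#define report_eio_new(id) do { warn_eio(id); } while (0)

static int old_style(int id)
{
	return report_eio_old(id);	/* value implied by the macro */
}

static int new_style(int id)
{
	report_eio_new(id);
	return -EIO;			/* value chosen at the call site */
}

int main(void)
{
	return old_style(0) == new_style(0) ? 0 : 1;
}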
Signed-off-by: Christoph Hellwig Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 926a6bcb32c8..d55454f98b59 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -447,7 +447,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) * it has been invoked. */ #define dm_report_EIO(m) \ -({ \ +do { \ struct mapped_device *md = dm_table_get_md((m)->ti->table); \ \ pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \ @@ -455,8 +455,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \ test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \ dm_noflush_suspending((m)->ti)); \ - -EIO; \ -}) +} while (0) /* * Map cloned requests (request-based multipath) @@ -481,7 +480,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, if (!pgpath) { if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) return DM_MAPIO_DELAY_REQUEUE; - return dm_report_EIO(m); /* Failed */ + dm_report_EIO(m); /* Failed */ + return -EIO; } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { if (pg_init_all_paths(m)) @@ -558,7 +558,8 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m if (!pgpath) { if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) return DM_MAPIO_REQUEUE; - return dm_report_EIO(m); + dm_report_EIO(m); + return -EIO; } mpio->pgpath = pgpath; @@ -1493,7 +1494,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (atomic_read(&m->nr_valid_paths) == 0 && !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { if (error == -EIO) - error = dm_report_EIO(m); + dm_report_EIO(m); /* complete with the original error */ r = DM_ENDIO_DONE; } @@ -1524,8 +1525,10 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone, fail_path(mpio->pgpath); if (atomic_read(&m->nr_valid_paths) == 0 && - !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) - return dm_report_EIO(m); + !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + dm_report_EIO(m); + return -EIO; + } /* Queue for the daemon to resubmit */ dm_bio_restore(get_bio_details_from_bio(clone), clone); From f98e0eb68008aff9824d1c4dad7276c8bab83ca5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 May 2017 17:28:38 +0200 Subject: [PATCH 38/74] dm mpath: multipath_clone_and_map must not return -EIO Since 412445ac ("dm: introduce a new DM_MAPIO_KILL return value"), the clone_and_map_rq methods must not return errno values, so fix it up to properly return DM_MAPIO_KILL, instead of the -EIO value that snuck in due to a conflict between two patches. 
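Why a raw errno is wrong here: the request-based core switches on the DM_MAPIO_* constants (see the map_request() hunk in the dm-rq patch above), so a -EIO return would match none of the cases and fall through to the default BUG() path. A collapsed sketch of that dispatch; the real code handles each case on its own path:

	switch (r) {
	case DM_MAPIO_SUBMITTED:
	case DM_MAPIO_REMAPPED:
	case DM_MAPIO_REQUEUE:
	case DM_MAPIO_DELAY_REQUEUE:
		/* each handled separately in the real function */
		break;
	case DM_MAPIO_KILL:
		/* the target wants the I/O completed with an error */
		dm_kill_unmapped_request(rq, -EIO);
		break;
	default:
		/* a raw -EIO from clone_and_map would land here */
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}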
Signed-off-by: Christoph Hellwig Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d55454f98b59..3df056b73b66 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -481,7 +481,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) return DM_MAPIO_DELAY_REQUEUE; dm_report_EIO(m); /* Failed */ - return -EIO; + return DM_MAPIO_KILL; } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { if (pg_init_all_paths(m)) From bafbb9c73241760023d8981191ddd30bb1c6dbac Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Mon, 15 May 2017 17:05:47 -0400 Subject: [PATCH 39/74] tcp: eliminate negative reordering in tcp_clean_rtx_queue tcp_ack() can call tcp_fragment() which may deduct the value tp->fackets_out when MSS changes. When prior_fackets is larger than tp->fackets_out, tcp_clean_rtx_queue() can invoke tcp_update_reordering() with negative values. This results in absurd tp->reordering values higher than sysctl_tcp_max_reordering. Note that tcp_update_reordering indeed sets tp->reordering to min(sysctl_tcp_max_reordering, metric), but because the comparison is signed, a negative metric always wins. Fixes: c7caf8d3ed7a ("[TCP]: Fix reord detection due to snd_una covered holes") Reported-by: Rebecca Isaacs Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 06e2dbc2b4a2..174d4376baa5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3190,7 +3190,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? That's reordering */ - if (reord < prior_fackets) + if (reord < prior_fackets && reord <= tp->fackets_out) tcp_update_reordering(sk, tp->fackets_out - reord, 0); delta = tcp_is_fack(tp) ? pkts_acked : From bcfc7d33110b0f33069d74138eeb7ca9acbb3c85 Mon Sep 17 00:00:00 2001 From: Thomas Winter Date: Tue, 16 May 2017 10:14:44 +1200 Subject: [PATCH 40/74] ipmr: vrf: Find VIFs using the actual device The skb->dev that is passed into ip_mr_input is the loX device for VRFs. When we look up a vif for this dev, none is found as we do not create vifs for loopbacks. Instead, look up a vif for the actual device that the packet was received on, e.g. the vlan. Signed-off-by: Thomas Winter cc: David Ahern cc: Nikolay Aleksandrov cc: roopa Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3a02d52ed50e..551de4d023a8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1980,6 +1980,20 @@ int ip_mr_input(struct sk_buff *skb) struct net *net = dev_net(skb->dev); int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; struct mr_table *mrt; + struct net_device *dev; + + /* skb->dev passed in is the loX master dev for vrfs. + * As there are no vifs associated with loopback devices, + * get the proper interface that does have a vif associated with it.
+ */ + dev = skb->dev; + if (netif_is_l3_master(skb->dev)) { + dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); + if (!dev) { + kfree_skb(skb); + return -ENODEV; + } + } /* Packet is looped back after forward, it should not be * forwarded second time, but still can be delivered locally. @@ -2017,7 +2031,7 @@ int ip_mr_input(struct sk_buff *skb) /* already under rcu_read_lock() */ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); if (!cache) { - int vif = ipmr_find_vif(mrt, skb->dev); + int vif = ipmr_find_vif(mrt, dev); if (vif >= 0) cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, @@ -2037,7 +2051,7 @@ int ip_mr_input(struct sk_buff *skb) } read_lock(&mrt_lock); - vif = ipmr_find_vif(mrt, skb->dev); + vif = ipmr_find_vif(mrt, dev); if (vif >= 0) { int err2 = ipmr_cache_unresolved(mrt, vif, skb); read_unlock(&mrt_lock); From 6f61dd3aa35179043f1fcdb0965c5d56278ab724 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 12 May 2017 14:52:34 -0700 Subject: [PATCH 41/74] efi-pstore: Fix read iter after pstore API refactor During the internal pstore API refactoring, the EFI vars read entry was accidentally made to update a stack variable instead of the pstore private data pointer. This corrects the problem (and removes the now needless argument). Fixes: 125cc42baf8a ("pstore: Replace arguments for read() API") Signed-off-by: Kees Cook --- drivers/firmware/efi/efi-pstore.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index 93d8cdbe7ef4..44148fd4c9f2 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -155,19 +155,14 @@ static int efi_pstore_scan_sysfs_exit(struct efivar_entry *pos, * efi_pstore_sysfs_entry_iter * * @record: pstore record to pass to callback - * @pos: entry to begin iterating from * * You MUST call efivar_enter_iter_begin() before this function, and * efivar_entry_iter_end() afterwards. * - * It is possible to begin iteration from an arbitrary entry within - * the list by passing @pos. @pos is updated on return to point to - * the next entry of the last one passed to efi_pstore_read_func(). - * To begin iterating from the beginning of the list @pos must be %NULL. */ -static int efi_pstore_sysfs_entry_iter(struct pstore_record *record, - struct efivar_entry **pos) +static int efi_pstore_sysfs_entry_iter(struct pstore_record *record) { + struct efivar_entry **pos = (struct efivar_entry **)&record->psi->data; struct efivar_entry *entry, *n; struct list_head *head = &efivar_sysfs_list; int size = 0; @@ -218,7 +213,6 @@ static int efi_pstore_sysfs_entry_iter(struct pstore_record *record, */ static ssize_t efi_pstore_read(struct pstore_record *record) { - struct efivar_entry *entry = (struct efivar_entry *)record->psi->data; ssize_t size; record->buf = kzalloc(EFIVARS_DATA_SIZE_MAX, GFP_KERNEL); @@ -229,7 +223,7 @@ static ssize_t efi_pstore_read(struct pstore_record *record) size = -EINTR; goto out; } - size = efi_pstore_sysfs_entry_iter(record, &entry); + size = efi_pstore_sysfs_entry_iter(record); efivar_entry_iter_end(); out: From 263eec9b2a82e8697d064709414914b5b10ac538 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 15 May 2017 17:33:37 +0200 Subject: [PATCH 42/74] smc: switch to usage of IB_PD_UNSAFE_GLOBAL_RKEY Currently, SMC enables remote access to physical memory when a user has successfully configured and established an SMC-connection until ten minutes after the last SMC connection is closed. 
Because this is considered a security risk, drivers are supposed to use IB_PD_UNSAFE_GLOBAL_RKEY in such a case. This patch changes the current SMC code to use IB_PD_UNSAFE_GLOBAL_RKEY. This improves user awareness, but does not remove the security risk itself. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 4 ++-- net/smc/smc_core.c | 16 +++------------- net/smc/smc_core.h | 2 +- net/smc/smc_ib.c | 21 ++------------------- net/smc/smc_ib.h | 2 -- 5 files changed, 8 insertions(+), 37 deletions(-) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index e41f594a1e1d..03ec058d18df 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -204,7 +204,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]); cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); @@ -256,7 +256,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]); aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 65020e93ff21..3ac09a629ea1 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -613,19 +613,8 @@ int smc_rmb_create(struct smc_sock *smc) rmb_desc = NULL; continue; /* if mapping failed, try smaller one */ } - rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, - &rmb_desc->mr_rx[SMC_SINGLE_LINK]); - if (rc) { - smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - tmp_bufsize, rmb_desc, - DMA_FROM_DEVICE); - kfree(rmb_desc->cpu_addr); - kfree(rmb_desc); - rmb_desc = NULL; - continue; - } + rmb_desc->rkey[SMC_SINGLE_LINK] = + lgr->lnk[SMC_SINGLE_LINK].roce_pd->unsafe_global_rkey; rmb_desc->used = 1; write_lock_bh(&lgr->rmbs_lock); list_add(&rmb_desc->list, @@ -668,6 +657,7 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && + (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && test_bit(i, lgr->rtokens_used_mask)) { conn->rtoken_idx = i; return 0; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 27eb38056a27..b013cb43a327 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -93,7 +93,7 @@ struct smc_buf_desc { u64 dma_addr[SMC_LINKS_PER_LGR_MAX]; /* mapped address of buffer */ void *cpu_addr; /* virtual address of buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + u32 rkey[SMC_LINKS_PER_LGR_MAX]; /* for rmb only: * rkey provided to peer */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index cb69ab977cd7..b31715505a35 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -37,24 +37,6 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system * identifier */ -int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct ib_mr **mr) -{ - int rc; - - if (*mr) - return 0; /* already done */ - - /* obtain unique key - - * 
next invocation of get_dma_mr returns a different key! - */ - *mr = pd->device->get_dma_mr(pd, access_flags); - rc = PTR_ERR_OR_ZERO(*mr); - if (IS_ERR(*mr)) - *mr = NULL; - return rc; -} - static int smc_ib_modify_qp_init(struct smc_link *lnk) { struct ib_qp_attr qp_attr; @@ -210,7 +192,8 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) { int rc; - lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0); + lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, + IB_PD_UNSAFE_GLOBAL_RKEY); rc = PTR_ERR_OR_ZERO(lnk->roce_pd); if (IS_ERR(lnk->roce_pd)) lnk->roce_pd = NULL; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 7e1f0e24d177..b567152a526d 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -61,8 +61,6 @@ void smc_ib_dealloc_protection_domain(struct smc_link *lnk); int smc_ib_create_protection_domain(struct smc_link *lnk); void smc_ib_destroy_queue_pair(struct smc_link *lnk); int smc_ib_create_queue_pair(struct smc_link *lnk); -int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct ib_mr **mr); int smc_ib_ready_link(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); From 19a0f7e37c0761a0a1cbf550705a6063c9675223 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 May 2017 09:51:38 +0300 Subject: [PATCH 43/74] net/smc: Add warning about remote memory exposure The driver explicitly bypasses APIs to register all memory once a connection is made, and thus allows remote access to memory. Signed-off-by: Christoph Hellwig Signed-off-by: Leon Romanovsky Acked-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/smc/Kconfig b/net/smc/Kconfig index c717ef0896aa..33954852f3f8 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -8,6 +8,10 @@ config SMC The Linux implementation of the SMC-R solution is designed as a separate socket family SMC. + Warning: SMC will expose all memory for remote reads and writes + once a connection is established. Don't enable this option except + for tightly controlled lab environment. + Select this option if you want to run SMC socket applications config SMC_DIAG From f6c5775ff0bfa62b072face6bf1d40f659f194b2 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 15 May 2017 23:19:17 -0700 Subject: [PATCH 44/74] net: Improve handling of failures on link and route dumps In general, rtnetlink dumps do not anticipate failure to dump a single object (e.g., link or route) on a single pass. As both route and link objects have grown via more attributes, that is no longer a given. netlink dumps can handle a failure if the dump function returns an error; specifically, netlink_dump adds the return code to the response if it is <= 0 so userspace is notified of the failure. The missing piece is the rtnetlink dump functions returning the error. Fix route and link dump functions to return the errors if no object is added to an skb (detected by skb->len == 0). IPv6 route dumps (rt6_dump_route) already return the error; this patch updates IPv4 and link dumps. Other dump functions may need to be adjusted as well. Reported-by: Jan Moskyto Matejka Signed-off-by: David Ahern Signed-off-by: David S.
Miller --- net/core/rtnetlink.c | 36 ++++++++++++++++++++++++------------ net/ipv4/fib_frontend.c | 15 +++++++++++---- net/ipv4/fib_trie.c | 26 ++++++++++++++------------ 3 files changed, 49 insertions(+), 28 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d7f82c3450b1..49a279a7cc15 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1627,13 +1627,13 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, 0, flags, ext_filter_mask); - /* If we ran out of room on the first message, - * we're in trouble - */ - WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); - if (err < 0) - goto out; + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: @@ -1641,10 +1641,12 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) } } out: + err = skb->len; +out_err: cb->args[1] = idx; cb->args[0] = h; - return skb->len; + return err; } int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, @@ -3453,8 +3455,12 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) err = br_dev->netdev_ops->ndo_bridge_getlink( skb, portid, seq, dev, filter_mask, NLM_F_MULTI); - if (err < 0 && err != -EOPNOTSUPP) - break; + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } } idx++; } @@ -3465,16 +3471,22 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) seq, dev, filter_mask, NLM_F_MULTI); - if (err < 0 && err != -EOPNOTSUPP) - break; + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } } idx++; } } + err = skb->len; +out_err: rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static inline size_t bridge_nlmsg_size(void) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 39bd1edee676..83e3ed258467 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -763,7 +763,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; - int dumped = 0; + int dumped = 0, err; if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) @@ -783,20 +783,27 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); - if (fib_table_dump(tb, skb, cb) < 0) - goto out; + err = fib_table_dump(tb, skb, cb); + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } dumped = 1; next: e++; } } out: + err = skb->len; +out_err: rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; - return skb->len; + return err; } /* Prepare and feed intra-kernel routing request. 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1201409ba1dc..51182ff2b441 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1983,6 +1983,8 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + int err; + if (i < s_i) { i++; continue; } @@ -1993,17 +1995,14 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, continue; } - if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, - fa->fa_type, - xkey, - KEYLENGTH - fa->fa_slen, - fa->fa_tos, - fa->fa_info, NLM_F_MULTI) < 0) { + err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, + tb->tb_id, fa->fa_type, + xkey, KEYLENGTH - fa->fa_slen, + fa->fa_tos, fa->fa_info, NLM_F_MULTI); + if (err < 0) { cb->args[4] = i; - return -1; + return err; } i++; } @@ -2025,10 +2024,13 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, t_key key = cb->args[3]; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { + int err; + + err = fn_trie_dump_leaf(l, tb, skb, cb); + if (err < 0) { cb->args[3] = key; cb->args[2] = count; - return -1; + return err; } ++count; From 13840d38016203f0095cd547b90352812d24b787 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 30 Apr 2017 17:32:28 -0400 Subject: [PATCH 45/74] dm bufio: make the parameter "retain_bytes" unsigned long Change the type of the parameter "retain_bytes" from unsigned to unsigned long, so that on 64-bit machines the user can set more than 4GiB of data to be retained. Also, change the type of the variable "count" in the function "__evict_old_buffers" to unsigned long. The assignment "count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];" could overflow when the unsigned long sum is truncated to unsigned, and that could result in buffers not being freed when they should be. While at it, avoid division in get_retain_buffers(). Division is slow; we can change it to a shift because we have precalculated the log2 of the block size.
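Both points can be demonstrated in user space. A sketch assuming a 64-bit unsigned long and 4KiB blocks, i.e. sectors_per_block_bits == 3 with 512-byte sectors:

#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
	unsigned long retain_bytes = 8UL << 30;		/* 8 GiB requested */
	unsigned truncated = (unsigned)retain_bytes;	/* old 32-bit field */
	unsigned sectors_per_block_bits = 3;		/* 4 KiB blocks */

	/* Division replaced by a shift, valid because the block size is
	 * a power of two:
	 * retain_bytes / block_size == retain_bytes >> log2(block_size)
	 */
	unsigned long retain_buffers =
		retain_bytes >> (sectors_per_block_bits + SECTOR_SHIFT);

	printf("unsigned long keeps %lu, unsigned truncates to %u\n",
	       retain_bytes, truncated);
	printf("retained buffers: %lu\n", retain_buffers);
	return 0;
}

On a 64-bit build this prints a truncated value of 0 for the old field, which is exactly the "more than 4GiB cannot be expressed" problem the patch fixes.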
Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 5db11a405129..cd8139593ccd 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -218,7 +218,7 @@ static DEFINE_SPINLOCK(param_spinlock); * Buffers are freed after this timeout */ static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; -static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; +static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; static unsigned long dm_bufio_peak_allocated; static unsigned long dm_bufio_allocated_kmem_cache; @@ -1558,10 +1558,10 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) return true; } -static unsigned get_retain_buffers(struct dm_bufio_client *c) +static unsigned long get_retain_buffers(struct dm_bufio_client *c) { - unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); - return retain_bytes / c->block_size; + unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); + return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT); } static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, @@ -1571,7 +1571,7 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, struct dm_buffer *b, *tmp; unsigned long freed = 0; unsigned long count = nr_to_scan; - unsigned retain_target = get_retain_buffers(c); + unsigned long retain_target = get_retain_buffers(c); for (l = 0; l < LIST_SIZE; l++) { list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { @@ -1794,8 +1794,8 @@ static bool older_than(struct dm_buffer *b, unsigned long age_hz) static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) { struct dm_buffer *b, *tmp; - unsigned retain_target = get_retain_buffers(c); - unsigned count; + unsigned long retain_target = get_retain_buffers(c); + unsigned long count; LIST_HEAD(write_list); dm_bufio_lock(c); @@ -1955,7 +1955,7 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); -module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR); +module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory"); module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); From 2432a3fb5cff026005aaad24e42226402f7fd0aa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 May 2017 13:27:49 +0200 Subject: [PATCH 46/74] mlx5e: add CONFIG_INET dependency We now reference the arp_tbl, which requires IPv4 support to be enabled in the kernel, otherwise we get a link error: drivers/net/built-in.o: In function `mlx5e_tc_update_neigh_used_value': (.text+0x16afec): undefined reference to `arp_tbl' drivers/net/built-in.o: In function `mlx5e_rep_neigh_init': en_rep.c:(.text+0x16c16d): undefined reference to `arp_tbl' drivers/net/built-in.o: In function `mlx5e_rep_netevent_event': en_rep.c:(.text+0x16cbb5): undefined reference to `arp_tbl' This adds a Kconfig dependency for it. Fixes: 232c001398ae ("net/mlx5e: Add support to neighbour update flow") Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index fc52d742b7f7..27251a78075c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -13,7 +13,7 @@ config MLX5_CORE config MLX5_CORE_EN bool "Mellanox Technologies ConnectX-4 Ethernet support" - depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE + depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE depends on IPV6=y || IPV6=n || MLX5_CORE=m imply PTP_1588_CLOCK default n From 88ad60c23a394b2f8bf1e570c756f415435d1d35 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Tue, 16 May 2017 14:07:24 +0200 Subject: [PATCH 47/74] i2c: mv64xxx: don't override deferred probing when getting irq There is no reason to use platform_get_irq() for non-DT probing and irq_of_parse_and_map() for DT probing. Indeed, platform_get_irq() works fine for both. In addition, using platform_get_irq() properly returns -EPROBE_DEFER when the interrupt controller is not yet available, so instead of inventing our own error code (-ENXIO), return the one provided by platform_get_irq(). Signed-off-by: Thomas Petazzoni Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mv64xxx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c index cf737ec8563b..5c4db65c5019 100644 --- a/drivers/i2c/busses/i2c-mv64xxx.c +++ b/drivers/i2c/busses/i2c-mv64xxx.c @@ -819,7 +819,6 @@ mv64xxx_of_config(struct mv64xxx_i2c_data *drv_data, rc = -EINVAL; goto out; } - drv_data->irq = irq_of_parse_and_map(np, 0); drv_data->rstc = devm_reset_control_get_optional(dev, NULL); if (IS_ERR(drv_data->rstc)) { @@ -902,10 +901,11 @@ mv64xxx_i2c_probe(struct platform_device *pd) if (!IS_ERR(drv_data->clk)) clk_prepare_enable(drv_data->clk); + drv_data->irq = platform_get_irq(pd, 0); + if (pdata) { drv_data->freq_m = pdata->freq_m; drv_data->freq_n = pdata->freq_n; - drv_data->irq = platform_get_irq(pd, 0); drv_data->adapter.timeout = msecs_to_jiffies(pdata->timeout); drv_data->offload_enabled = false; memcpy(&drv_data->reg_offsets, &mv64xxx_i2c_regs_mv64xxx, sizeof(drv_data->reg_offsets)); @@ -915,7 +915,7 @@ mv64xxx_i2c_probe(struct platform_device *pd) goto exit_clk; } if (drv_data->irq < 0) { - rc = -ENXIO; + rc = drv_data->irq; goto exit_reset; } From 83345d51a49a4b3f3b4a08a5db644dae438b0189 Mon Sep 17 00:00:00 2001 From: Tin Huynh Date: Wed, 17 May 2017 11:25:34 +0700 Subject: [PATCH 48/74] i2c: xgene: Set ACPI_COMPANION_I2C With ACPI, i2c-core requires the ACPI companion to be set in order for it to create slave devices. This patch sets the ACPI companion accordingly.
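The underlying pattern, for reference: a child device registered by a driver does not inherit the platform device's ACPI companion, so without explicit propagation the i2c core has no ACPI node from which to enumerate slave devices. A fragment-style sketch of that propagation, following the shape of the probe function in the diff below (surrounding code abridged):

	/* Hand the platform device's ACPI companion down to the adapter
	 * so the i2c core can enumerate slave devices described under
	 * the same ACPI node when the adapter is registered.
	 */
	adapter->dev.parent = &pdev->dev;
	adapter->dev.of_node = pdev->dev.of_node;	/* the DT equivalent */
	ACPI_COMPANION_SET(&adapter->dev, ACPI_COMPANION(&pdev->dev));
	rc = i2c_add_adapter(adapter);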
Signed-off-by: Tin Huynh Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-xgene-slimpro.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-xgene-slimpro.c b/drivers/i2c/busses/i2c-xgene-slimpro.c index dbe7e44c9321..6ba6c83ca8f1 100644 --- a/drivers/i2c/busses/i2c-xgene-slimpro.c +++ b/drivers/i2c/busses/i2c-xgene-slimpro.c @@ -416,6 +416,7 @@ static int xgene_slimpro_i2c_probe(struct platform_device *pdev) adapter->class = I2C_CLASS_HWMON; adapter->dev.parent = &pdev->dev; adapter->dev.of_node = pdev->dev.of_node; + ACPI_COMPANION_SET(&adapter->dev, ACPI_COMPANION(&pdev->dev)); i2c_set_adapdata(adapter, ctx); rc = i2c_add_adapter(adapter); if (rc) { From 7e1b9521f5a8356553f5e58b07952bf346632ea4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 11 Mar 2017 19:09:45 +0000 Subject: [PATCH 49/74] dm cache: handle kmalloc failure allocating background_tracker struct Currently there is no kmalloc failure check on the allocation of the background_tracker struct in btracker_create(), and so a NULL return will lead to a NULL pointer dereference. Add a NULL check. Detected by CoverityScan, CID#1416587 ("Dereference null return value") Fixes: b29d4986d ("dm cache: significant rework to leverage dm-bio-prison-v2") Signed-off-by: Colin Ian King Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-background-tracker.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c index 9b1afdfb13f0..707233891291 100644 --- a/drivers/md/dm-cache-background-tracker.c +++ b/drivers/md/dm-cache-background-tracker.c @@ -33,6 +33,11 @@ struct background_tracker *btracker_create(unsigned max_work) { struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL); + if (!b) { + DMERR("couldn't create background_tracker"); + return NULL; + } + b->max_work = max_work; atomic_set(&b->pending_promotes, 0); atomic_set(&b->pending_writebacks, 0); From 23d268eb240954e6e78f7cfab04f2b1e79f84489 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 16 May 2017 07:53:43 -0700 Subject: [PATCH 50/74] arp: honour gratuitous ARP _replies_ When arp_accept is 1, gratuitous ARPs are supposed to override matching entries irrespective of whether they arrive during locktime. This was implemented in commit 56022a8fdd87 ("ipv4: arp: update neighbour address when a gratuitous arp is received and arp_accept is set") There is a glitch in the patch though. RFC 2002, section 4.6, "ARP, Proxy ARP, and Gratuitous ARP", defines gratuitous ARPs so that they can be either of Request or Reply type. Those Reply gratuitous ARPs can be triggered with standard tooling, for example, the arping -A option does just that. This patch fixes the glitch, making both Request and Reply flavours of gratuitous ARPs behave identically. As per RFC, if gratuitous ARPs are of Reply type, their Target Hardware Address field should also be set to the link-layer address to which this cache entry should be updated. The field is present in ARP over Ethernet but not in IEEE 1394. In this patch, I don't consider any broadcasted ARP replies as gratuitous if the field is not present, to conform to the standard. It's not clear whether there is such a thing for IEEE 1394 as a gratuitous ARP reply; until it's cleared up, we will ignore such broadcasts. Note that they will still update existing ARP cache entries, assuming they arrive outside the locktime interval. Signed-off-by: Ihar Hrachyshka Signed-off-by: David S.
Miller --- net/ipv4/arp.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 0937b34c27ca..d54345a06f72 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -653,6 +653,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha; + unsigned char *tha = NULL; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; @@ -724,6 +725,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) break; #endif default: + tha = arp_ptr; arp_ptr += dev->addr_len; } memcpy(&tip, arp_ptr, 4); @@ -842,8 +844,18 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) It is possible, that this option should be enabled for some devices (strip is candidate) */ - is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && - addr_type == RTN_UNICAST; + is_garp = tip == sip && addr_type == RTN_UNICAST; + + /* Unsolicited ARP _replies_ also require target hwaddr to be + * the same as source. + */ + if (is_garp && arp->ar_op == htons(ARPOP_REPLY)) + is_garp = + /* IPv4 over IEEE 1394 doesn't provide target + * hardware address field in its ARP payload. + */ + tha && + !memcmp(tha, sha, dev->addr_len); if (!n && ((arp->ar_op == htons(ARPOP_REPLY) && From 77d7123342dcf6442341b67816321d71da8b2b16 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 16 May 2017 08:44:24 -0700 Subject: [PATCH 51/74] neighbour: update neigh timestamps iff update is effective It's a common practice to send gratuitous ARPs after moving an IP address to another device to speed up healing of a service. To fulfill service availability constraints, the timing of network peers updating their caches to point to a new location of an IP address can be particularly important. Sometimes neigh_update calls won't touch either lladdr or state, for example if an update arrives in the locktime interval. The neigh->updated value is tested by the protocol specific neigh code, which in turn will influence whether NEIGH_UPDATE_F_OVERRIDE gets set in the call to neigh_update() or not. As a result, we may effectively ignore the update request, bailing out of touching the neigh entry, except that we still bump its timestamps inside neigh_update. This may be a problem for updates arriving in quick succession. For example, consider the following scenario: A service is moved to another device with its IP address. The new device sends three gratuitous ARP requests into the network with a ~1 second interval between them. Just before the first request arrives at one of the network peer nodes, its neigh entry for the IP address transitions from STALE to DELAY. This transition, among other things, updates neigh->updated. Once the kernel receives the first gratuitous ARP, it ignores it because its arrival time is inside the locktime interval. The kernel still bumps neigh->updated. Then the second gratuitous ARP request arrives, and it's also ignored because it's still in the (new) locktime interval. The same happens for the third request. The node eventually heals itself (after delay_first_probe_time seconds since the initial transition to DELAY state), but it just wasted some time and required a new ARP request/reply round trip. This unfortunate behaviour both puts more load on the network and reduces service availability.
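The window-sliding effect described above can be modelled in a few lines of user-space C (a sketch; the tick values and the locktime are illustrative, not kernel defaults):

#include <stdio.h>

int main(void)
{
	long locktime = 100;		/* ignore updates this soon after the last */
	long updated = 0;		/* models neigh->updated */
	long garp[] = { 50, 120, 190 };	/* three GARPs, each < locktime apart */
	int honoured = 0;
	unsigned i;

	for (i = 0; i < 3; i++) {
		if (garp[i] - updated < locktime) {
			/* the bug: the timestamp is bumped even for a
			 * no-op, so the next GARP lands inside the
			 * locktime window again
			 */
			updated = garp[i];
			printf("t=%ld: ignored\n", garp[i]);
		} else {
			updated = garp[i];
			honoured++;
			printf("t=%ld: honoured\n", garp[i]);
		}
	}
	printf("honoured %d of 3\n", honoured);
	return 0;
}

As written, all three requests are ignored. Remove the timestamp bump from the ignored branch and the second request (t=120) already falls outside the window and is honoured, which is the behaviour the patch below restores.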
This patch changes neigh_update so that it bumps neigh->updated (as well as neigh->confirmed) only once we are sure that either lladdr or entry state will change. In the scenario described above, it means that the second gratuitous ARP request will actually update the entry lladdr. Ideally, we would update the neigh entry on the very first gratuitous ARP request. The locktime mechanism is designed to ignore ARP updates in a short timeframe after a previous ARP update was honoured by the kernel layer. This would require tracking timestamps for state transitions separately from timestamps when actual updates are received. This would probably involve changes in neighbour struct. Therefore, the patch doesn't tackle the issue of the first gratuitous ARP being ignored, leaving it for a follow-up. Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. Miller --- net/core/neighbour.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 58b0bcc125b5..d274f81fcc2c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1132,10 +1132,6 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, lladdr = neigh->ha; } - if (new & NUD_CONNECTED) - neigh->confirmed = jiffies; - neigh->updated = jiffies; - /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ @@ -1157,6 +1153,16 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } } + /* Update timestamps only once we know we will make a change to the + * neighbour entry. Otherwise we risk to move the locktime window with + * noop updates and ignore relevant ARP updates. + */ + if (new != old || lladdr != neigh->ha) { + if (new & NUD_CONNECTED) + neigh->confirmed = jiffies; + neigh->updated = jiffies; + } + if (new != old) { neigh_del_timer(neigh); if (new & NUD_PROBE) From 05d8cba4a1e8c7e2d1f91a24a2f3d26852938a04 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 16 May 2017 14:15:03 +0900 Subject: [PATCH 52/74] kbuild: skip install/check of headers right under uapi directories Since commit 61562f981e92 ("uapi: export all arch specifics directories"), "make INSTALL_HDR_PATH=$root/usr headers_install" deletes standard glibc headers and others in $(root)/usr/include. The cause of the issue is that headers_install now starts descending from arch/$(hdr-arch)/include/uapi with $(root)/usr/include for its destination when installing asm headers. So, headers already there are assumed to be unwanted. When headers_install starts descending from include/uapi with $(root)/usr/include for its destination, it works around the problem by creating a dummy destination $(root)/usr/include/uapi, but this is tricky. The clean way to fix the problem is to skip the headers install/check in include/uapi and arch/$(hdr-arch)/include/uapi, because we know there are only sub-directories in uapi directories. A good side effect is that the empty destination $(root)/usr/include/uapi will go away. I am also removing the trailing slash in the headers_check target to skip checking in arch/$(hdr-arch)/include/uapi.
Fixes: 61562f981e92 ("uapi: export all arch specifics directories") Reported-by: Dan Williams Signed-off-by: Masahiro Yamada Tested-by: Dan Williams Acked-by: Nicolas Dichtel --- Makefile | 2 +- scripts/Makefile.headersinst | 43 ++++++++++++++++++++++-------------- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index b400c0604fac..b1ee4a49efa2 100644 --- a/Makefile +++ b/Makefile @@ -1172,7 +1172,7 @@ headers_check_all: headers_install_all PHONY += headers_check headers_check: headers_install $(Q)$(MAKE) $(hdr-inst)=include/uapi HDRCHECK=1 - $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/uapi/ $(hdr-dst) HDRCHECK=1 + $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/uapi $(hdr-dst) HDRCHECK=1 # --------------------------------------------------------------------------- # Kernel selftest diff --git a/scripts/Makefile.headersinst b/scripts/Makefile.headersinst index 6ba97a1f9c5a..ce753a408c56 100644 --- a/scripts/Makefile.headersinst +++ b/scripts/Makefile.headersinst @@ -8,6 +8,29 @@ # # ========================================================================== +PHONY := __headers +__headers: + +include scripts/Kbuild.include + +srcdir := $(srctree)/$(obj) +subdirs := $(patsubst $(srcdir)/%/.,%,$(wildcard $(srcdir)/*/.)) +# caller may set destination dir (when installing to asm/) +_dst := $(if $(dst),$(dst),$(obj)) + +# Recursion +__headers: $(subdirs) + +.PHONY: $(subdirs) +$(subdirs): + $(Q)$(MAKE) $(hdr-inst)=$(obj)/$@ dst=$(_dst)/$@ + +# Skip header install/check for include/uapi and arch/$(hdr-arch)/include/uapi. +# We have only sub-directories there. +skip-inst := $(if $(filter %/uapi,$(obj)),1) + +ifeq ($(skip-inst),) + # generated header directory gen := $(if $(gen),$(gen),$(subst include/,include/generated/,$(obj))) @@ -15,21 +38,14 @@ gen := $(if $(gen),$(gen),$(subst include/,include/generated/,$(obj))) kbuild-file := $(srctree)/$(obj)/Kbuild -include $(kbuild-file) -# called may set destination dir (when installing to asm/) -_dst := $(if $(dst),$(dst),$(obj)) - old-kbuild-file := $(srctree)/$(subst uapi/,,$(obj))/Kbuild ifneq ($(wildcard $(old-kbuild-file)),) include $(old-kbuild-file) endif -include scripts/Kbuild.include - installdir := $(INSTALL_HDR_PATH)/$(subst uapi/,,$(_dst)) -srcdir := $(srctree)/$(obj) gendir := $(objtree)/$(gen) -subdirs := $(patsubst $(srcdir)/%/.,%,$(wildcard $(srcdir)/*/.)) header-files := $(notdir $(wildcard $(srcdir)/*.h)) header-files += $(notdir $(wildcard $(srcdir)/*.agh)) header-files := $(filter-out $(no-export-headers), $(header-files)) @@ -88,11 +104,9 @@ quiet_cmd_check = CHECK $(printdir) ($(words $(all-files)) files) $(PERL) $< $(INSTALL_HDR_PATH)/include $(SRCARCH); \ touch $@ -PHONY += __headersinst __headerscheck - ifndef HDRCHECK # Rules for installing headers -__headersinst: $(subdirs) $(install-file) +__headers: $(install-file) @: targets += $(install-file) @@ -104,7 +118,7 @@ $(install-file): scripts/headers_install.sh \ $(call if_changed,install) else -__headerscheck: $(subdirs) $(check-file) +__headers: $(check-file) @: targets += $(check-file) @@ -113,11 +127,6 @@ $(check-file): scripts/headers_check.pl $(output-files) FORCE endif -# Recursion -.PHONY: $(subdirs) -$(subdirs): - $(Q)$(MAKE) $(hdr-inst)=$(obj)/$@ dst=$(_dst)/$@ - targets := $(wildcard $(sort $(targets))) cmd_files := $(wildcard \ $(foreach f,$(targets),$(dir $(f)).$(notdir $(f)).cmd)) @@ -126,6 +135,8 @@ ifneq ($(cmd_files),) include $(cmd_files) endif +endif # skip-inst + .PHONY: $(PHONY) PHONY += FORCE FORCE: ; From 
2423496af35d94a87156b063ea5cedffc10a70a1 Mon Sep 17 00:00:00 2001
From: Craig Gallek
Date: Tue, 16 May 2017 14:36:23 -0400
Subject: [PATCH 53/74] ipv6: Prevent overrun when parsing v6 header options

The KASAN warning reported below was discovered with a syzkaller program.
The reproducer is basically:

  int s = socket(AF_INET6, SOCK_RAW, NEXTHDR_HOP);
  send(s, &one_byte_of_data, 1, MSG_MORE);
  send(s, &more_than_mtu_bytes_data, 2000, 0);

The socket() call sets the nexthdr field of the v6 header to NEXTHDR_HOP,
the first send call primes the payload with a non-zero byte of data, and
the second send call triggers the fragmentation path.

The fragmentation code tries to parse the header options in order to
figure out where to insert the fragment option. Since nexthdr points to
an invalid option, the calculation of the size of the network header can
be made much larger than the linear section of the skb, and data is read
outside of it.

This fix makes ip6_find_1stfragopt() return an error if it detects
running out-of-bounds.

[ 42.361487] ==================================================================
[ 42.364412] BUG: KASAN: slab-out-of-bounds in ip6_fragment+0x11c8/0x3730
[ 42.365471] Read of size 840 at addr ffff88000969e798 by task ip6_fragment-oo/3789
[ 42.366469]
[ 42.366696] CPU: 1 PID: 3789 Comm: ip6_fragment-oo Not tainted 4.11.0+ #41
[ 42.367628] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1ubuntu1 04/01/2014
[ 42.368824] Call Trace:
[ 42.369183] dump_stack+0xb3/0x10b
[ 42.369664] print_address_description+0x73/0x290
[ 42.370325] kasan_report+0x252/0x370
[ 42.370839] ? ip6_fragment+0x11c8/0x3730
[ 42.371396] check_memory_region+0x13c/0x1a0
[ 42.371978] memcpy+0x23/0x50
[ 42.372395] ip6_fragment+0x11c8/0x3730
[ 42.372920] ? nf_ct_expect_unregister_notifier+0x110/0x110
[ 42.373681] ? ip6_copy_metadata+0x7f0/0x7f0
[ 42.374263] ? ip6_forward+0x2e30/0x2e30
[ 42.374803] ip6_finish_output+0x584/0x990
[ 42.375350] ip6_output+0x1b7/0x690
[ 42.375836] ? ip6_finish_output+0x990/0x990
[ 42.376411] ? ip6_fragment+0x3730/0x3730
[ 42.376968] ip6_local_out+0x95/0x160
[ 42.377471] ip6_send_skb+0xa1/0x330
[ 42.377969] ip6_push_pending_frames+0xb3/0xe0
[ 42.378589] rawv6_sendmsg+0x2051/0x2db0
[ 42.379129] ? rawv6_bind+0x8b0/0x8b0
[ 42.379633] ? _copy_from_user+0x84/0xe0
[ 42.380193] ? debug_check_no_locks_freed+0x290/0x290
[ 42.380878] ? ___sys_sendmsg+0x162/0x930
[ 42.381427] ? rcu_read_lock_sched_held+0xa3/0x120
[ 42.382074] ? sock_has_perm+0x1f6/0x290
[ 42.382614] ? ___sys_sendmsg+0x167/0x930
[ 42.383173] ? lock_downgrade+0x660/0x660
[ 42.383727] inet_sendmsg+0x123/0x500
[ 42.384226] ? inet_sendmsg+0x123/0x500
[ 42.384748] ? inet_recvmsg+0x540/0x540
[ 42.385263] sock_sendmsg+0xca/0x110
[ 42.385758] SYSC_sendto+0x217/0x380
[ 42.386249] ? SYSC_connect+0x310/0x310
[ 42.386783] ? __might_fault+0x110/0x1d0
[ 42.387324] ? lock_downgrade+0x660/0x660
[ 42.387880] ? __fget_light+0xa1/0x1f0
[ 42.388403] ? __fdget+0x18/0x20
[ 42.388851] ? sock_common_setsockopt+0x95/0xd0
[ 42.389472] ? SyS_setsockopt+0x17f/0x260
[ 42.390021] ?
entry_SYSCALL_64_fastpath+0x5/0xbe [ 42.390650] SyS_sendto+0x40/0x50 [ 42.391103] entry_SYSCALL_64_fastpath+0x1f/0xbe [ 42.391731] RIP: 0033:0x7fbbb711e383 [ 42.392217] RSP: 002b:00007ffff4d34f28 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 42.393235] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fbbb711e383 [ 42.394195] RDX: 0000000000001000 RSI: 00007ffff4d34f60 RDI: 0000000000000003 [ 42.395145] RBP: 0000000000000046 R08: 00007ffff4d34f40 R09: 0000000000000018 [ 42.396056] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000400aad [ 42.396598] R13: 0000000000000066 R14: 00007ffff4d34ee0 R15: 00007fbbb717af00 [ 42.397257] [ 42.397411] Allocated by task 3789: [ 42.397702] save_stack_trace+0x16/0x20 [ 42.398005] save_stack+0x46/0xd0 [ 42.398267] kasan_kmalloc+0xad/0xe0 [ 42.398548] kasan_slab_alloc+0x12/0x20 [ 42.398848] __kmalloc_node_track_caller+0xcb/0x380 [ 42.399224] __kmalloc_reserve.isra.32+0x41/0xe0 [ 42.399654] __alloc_skb+0xf8/0x580 [ 42.400003] sock_wmalloc+0xab/0xf0 [ 42.400346] __ip6_append_data.isra.41+0x2472/0x33d0 [ 42.400813] ip6_append_data+0x1a8/0x2f0 [ 42.401122] rawv6_sendmsg+0x11ee/0x2db0 [ 42.401505] inet_sendmsg+0x123/0x500 [ 42.401860] sock_sendmsg+0xca/0x110 [ 42.402209] ___sys_sendmsg+0x7cb/0x930 [ 42.402582] __sys_sendmsg+0xd9/0x190 [ 42.402941] SyS_sendmsg+0x2d/0x50 [ 42.403273] entry_SYSCALL_64_fastpath+0x1f/0xbe [ 42.403718] [ 42.403871] Freed by task 1794: [ 42.404146] save_stack_trace+0x16/0x20 [ 42.404515] save_stack+0x46/0xd0 [ 42.404827] kasan_slab_free+0x72/0xc0 [ 42.405167] kfree+0xe8/0x2b0 [ 42.405462] skb_free_head+0x74/0xb0 [ 42.405806] skb_release_data+0x30e/0x3a0 [ 42.406198] skb_release_all+0x4a/0x60 [ 42.406563] consume_skb+0x113/0x2e0 [ 42.406910] skb_free_datagram+0x1a/0xe0 [ 42.407288] netlink_recvmsg+0x60d/0xe40 [ 42.407667] sock_recvmsg+0xd7/0x110 [ 42.408022] ___sys_recvmsg+0x25c/0x580 [ 42.408395] __sys_recvmsg+0xd6/0x190 [ 42.408753] SyS_recvmsg+0x2d/0x50 [ 42.409086] entry_SYSCALL_64_fastpath+0x1f/0xbe [ 42.409513] [ 42.409665] The buggy address belongs to the object at ffff88000969e780 [ 42.409665] which belongs to the cache kmalloc-512 of size 512 [ 42.410846] The buggy address is located 24 bytes inside of [ 42.410846] 512-byte region [ffff88000969e780, ffff88000969e980) [ 42.411941] The buggy address belongs to the page: [ 42.412405] page:ffffea000025a780 count:1 mapcount:0 mapping: (null) index:0x0 compound_mapcount: 0 [ 42.413298] flags: 0x100000000008100(slab|head) [ 42.413729] raw: 0100000000008100 0000000000000000 0000000000000000 00000001800c000c [ 42.414387] raw: ffffea00002a9500 0000000900000007 ffff88000c401280 0000000000000000 [ 42.415074] page dumped because: kasan: bad access detected [ 42.415604] [ 42.415757] Memory state around the buggy address: [ 42.416222] ffff88000969e880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 42.416904] ffff88000969e900: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 42.417591] >ffff88000969e980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 42.418273] ^ [ 42.418588] ffff88000969ea00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 42.419273] ffff88000969ea80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 42.419882] ================================================================== Reported-by: Andrey Konovalov Signed-off-by: Craig Gallek Signed-off-by: David S. 
Miller --- net/ipv6/ip6_offload.c | 2 ++ net/ipv6/ip6_output.c | 4 ++++ net/ipv6/output_core.c | 14 ++++++++------ net/ipv6/udp_offload.c | 2 ++ 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 93e58a5e1837..eab36abc9f22 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -117,6 +117,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (udpfrag) { unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + if (unfrag_ip6hlen < 0) + return ERR_PTR(unfrag_ip6hlen); fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen); fptr->frag_off = htons(offset); if (skb->next) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 58f6288e9ba5..01deecda2f84 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -598,6 +598,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, u8 *prevhdr, nexthdr = 0; hlen = ip6_find_1stfragopt(skb, &prevhdr); + if (hlen < 0) { + err = hlen; + goto fail; + } nexthdr = *prevhdr; mtu = ip6_skb_dst_mtu(skb); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index cd4252346a32..e9065b8d3af8 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -79,14 +79,13 @@ EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = - (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); unsigned int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; - while (offset + 1 <= packet_len) { + while (offset <= packet_len) { + struct ipv6_opt_hdr *exthdr; switch (**nexthdr) { @@ -107,13 +106,16 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) return offset; } - offset += ipv6_optlen(exthdr); - *nexthdr = &exthdr->nexthdr; + if (offset + sizeof(struct ipv6_opt_hdr) > packet_len) + return -EINVAL; + exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; } - return offset; + return -EINVAL; } EXPORT_SYMBOL(ip6_find_1stfragopt); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index ac858c480f2f..b348cff47395 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -91,6 +91,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, * bytes to insert fragment header. */ unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + if (unfrag_ip6hlen < 0) + return ERR_PTR(unfrag_ip6hlen); nexthdr = *prevhdr; *prevhdr = NEXTHDR_FRAGMENT; unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + From b6c41cb050d5debc7e4eaa0a81cbdbad72588891 Mon Sep 17 00:00:00 2001 From: Nitin Gupta Date: Mon, 15 May 2017 16:28:17 -0700 Subject: [PATCH 54/74] sparc64: Fix mapping of 64k pages with MAP_FIXED An incorrect huge page alignment check caused mmap failure for 64K pages when MAP_FIXED is used with address not aligned to HPAGE_SIZE. Orabug: 25885991 Fixes: dcd1912d21a0 ("sparc64: Add 64K page size support") Signed-off-by: Nitin Gupta Signed-off-by: David S. 
Miller --- arch/sparc/include/asm/hugetlb.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index dcbf985ab243..d1f837dc77a4 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -24,9 +24,11 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { - if (len & ~HPAGE_MASK) + struct hstate *h = hstate_file(file); + + if (len & ~huge_page_mask(h)) return -EINVAL; - if (addr & ~HPAGE_MASK) + if (addr & ~huge_page_mask(h)) return -EINVAL; return 0; } From deba804c90642c8ed0f15ac1083663976d578f54 Mon Sep 17 00:00:00 2001 From: Orlando Arias Date: Tue, 16 May 2017 15:34:00 -0400 Subject: [PATCH 55/74] sparc: Fix -Wstringop-overflow warning Greetings, GCC 7 introduced the -Wstringop-overflow flag to detect buffer overflows in calls to string handling functions [1][2]. Due to the way ``empty_zero_page'' is declared in arch/sparc/include/setup.h, this causes a warning to trigger at compile time in the function mem_init(), which is subsequently converted to an error. The ensuing patch fixes this issue and aligns the declaration of empty_zero_page to that of other architectures. Thank you. Cheers, Orlando. [1] https://gcc.gnu.org/ml/gcc-patches/2016-10/msg02308.html [2] https://gcc.gnu.org/gcc-7/changes.html Signed-off-by: Orlando Arias -------------------------------------------------------------------------------- Signed-off-by: David S. Miller --- arch/sparc/include/asm/pgtable_32.h | 4 ++-- arch/sparc/include/asm/setup.h | 2 +- arch/sparc/mm/init_32.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index ce6f56980aef..cf190728360b 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -91,9 +91,9 @@ extern unsigned long pfn_base; * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern unsigned long empty_zero_page; +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(&empty_zero_page)) +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) /* * In general all page table modifications should use the V8 atomic diff --git a/arch/sparc/include/asm/setup.h b/arch/sparc/include/asm/setup.h index 478bf6bb4598..3fae200dd251 100644 --- a/arch/sparc/include/asm/setup.h +++ b/arch/sparc/include/asm/setup.h @@ -16,7 +16,7 @@ extern char reboot_command[]; */ extern unsigned char boot_cpu_id; -extern unsigned long empty_zero_page; +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; extern int serial_console; static inline int con_is_present(void) diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index c6afe98de4d9..3bd0d513bddb 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -290,7 +290,7 @@ void __init mem_init(void) /* Saves us work later. */ - memset((void *)&empty_zero_page, 0, PAGE_SIZE); + memset((void *)empty_zero_page, 0, PAGE_SIZE); i = last_valid_pfn >> ((20 - PAGE_SHIFT) + 5); i += 1; From 48078d2dac0a26f84f5f3ec704f24f7c832cce14 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Wed, 17 May 2017 11:47:00 -0400 Subject: [PATCH 56/74] sparc/ftrace: Fix ftrace graph time measurement The ftrace function_graph time measurements of a given function is not accurate according to those recorded by ftrace using the function filters. This change pulls the x86_64 fix from 'commit 722b3c746953 ("ftrace/graph: Trace function entry before updating index")' into the sparc specific prepare_ftrace_return which stops ftrace from counting interrupted tasks in the time measurement. Example measurements for select_task_rq_fair running "hackbench 100 process 1000": | tracing/trace_stat/function0 | function_graph Before patch | 2.802 us | 4.255 us After patch | 2.749 us | 3.094 us Signed-off-by: Liam R. Howlett Signed-off-by: David S. Miller --- arch/sparc/kernel/ftrace.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c index 6bcff698069b..cec54dc4ab81 100644 --- a/arch/sparc/kernel/ftrace.c +++ b/arch/sparc/kernel/ftrace.c @@ -130,18 +130,17 @@ unsigned long prepare_ftrace_return(unsigned long parent, if (unlikely(atomic_read(¤t->tracing_graph_pause))) return parent + 8UL; + trace.func = self_addr; + trace.depth = current->curr_ret_stack + 1; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) + return parent + 8UL; + if (ftrace_push_return_trace(parent, self_addr, &trace.depth, frame_pointer, NULL) == -EBUSY) return parent + 8UL; - trace.func = self_addr; - - /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) { - current->curr_ret_stack--; - return parent + 8UL; - } - return return_hooker; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ From 9142e9007f2d7ab58a587a1e1d921b0064a339aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 13:27:53 -0700 Subject: [PATCH 57/74] net: fix compile error in skb_orphan_partial() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_INET is not set, net/core/sock.c can not compile : net/core/sock.c: In function ‘skb_orphan_partial’: net/core/sock.c:1810:2: error: implicit declaration of function ‘skb_is_tcp_pure_ack’ [-Werror=implicit-function-declaration] if (skb_is_tcp_pure_ack(skb)) ^ Fix this by always including Fixes: f6ba8d33cfbb ("netem: fix skb_orphan_partial()") Signed-off-by: Eric Dumazet Reported-by: Paul Gortmaker Reported-by: Randy Dunlap Reported-by: Stephen Rothwell Signed-off-by: David S. Miller --- net/core/sock.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index e43e71d7856b..727f924b7f91 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -139,10 +139,7 @@ #include -#ifdef CONFIG_INET #include -#endif - #include static DEFINE_MUTEX(proto_list_mutex); From 87fe603274aa9889c05cca3c3e45675e1997cb13 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 16 May 2017 16:39:43 -0400 Subject: [PATCH 58/74] bnxt_en: Call bnxt_dcb_init() after getting firmware DCBX configuration. In the current code, bnxt_dcb_init() is called too early before we determine if the firmware DCBX agent is running or not. As a result, we are not setting the DCB_CAP_DCBX_HOST and DCB_CAP_DCBX_LLD_MANAGED flags properly to report to DCBNL. Signed-off-by: Michael Chan Signed-off-by: David S. 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index b56c54d68d5e..03f55daecb20 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7630,8 +7630,6 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	dev->min_mtu = ETH_ZLEN;
 	dev->max_mtu = BNXT_MAX_MTU;

-	bnxt_dcb_init(bp);
-
 #ifdef CONFIG_BNXT_SRIOV
 	init_waitqueue_head(&bp->sriov_cfg_wait);
 #endif
@@ -7669,6 +7667,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	bnxt_hwrm_func_qcfg(bp);
 	bnxt_hwrm_port_led_qcaps(bp);
 	bnxt_ethtool_init(bp);
+	bnxt_dcb_init(bp);

 	bnxt_set_rx_skb_mode(bp, false);
 	bnxt_set_tpa_flags(bp);

From f667724b99ad1afc91f16064d8fb293d2805bd57 Mon Sep 17 00:00:00 2001
From: Michael Chan
Date: Tue, 16 May 2017 16:39:44 -0400
Subject: [PATCH 59/74] bnxt_en: Check status of firmware DCBX agent before setting DCB_CAP_DCBX_HOST.

Otherwise, all the host-based DCBX settings from lldpad will fail if the
firmware DCBX agent is running.

Signed-off-by: Michael Chan
Signed-off-by: David S. Miller
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
index 46de2f8ff024..5c6dd0ce209f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
@@ -553,8 +553,10 @@ static u8 bnxt_dcbnl_setdcbx(struct net_device *dev, u8 mode)
 	if ((mode & DCB_CAP_DCBX_VER_CEE) || !(mode & DCB_CAP_DCBX_VER_IEEE))
 		return 1;

-	if ((mode & DCB_CAP_DCBX_HOST) && BNXT_VF(bp))
-		return 1;
+	if (mode & DCB_CAP_DCBX_HOST) {
+		if (BNXT_VF(bp) || (bp->flags & BNXT_FLAG_FW_LLDP_AGENT))
+			return 1;
+	}

 	if (mode == bp->dcbx_cap)
 		return 0;

From 579f1d926c66cc0bd3bd87b1fe2e091084b07430 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Wed, 17 May 2017 15:18:05 -0700
Subject: [PATCH 60/74] selftests/bpf: fix broken build due to types.h

Commit 0a5539f66133 ("bpf: Provide a linux/types.h override for bpf
selftests.") caused a build failure for tools/testing/selftests/bpf
because of some missing types:

  $ make -C tools/testing/selftests/bpf/
  ...
  In file included from /home/yhs/work/net-next/tools/testing/selftests/bpf/test_pkt_access.c:8:
  ../../../include/uapi/linux/bpf.h:170:3: error: unknown type name '__aligned_u64'
  __aligned_u64 key;
  ...
  /usr/include/linux/swab.h:160:8: error: unknown type name '__always_inline'
  static __always_inline __u16 __swab16p(const __u16 *p)
  ...

The type __aligned_u64 is defined in linux:include/uapi/linux/types.h.
The fix is to copy the missing type definitions into
tools/testing/selftests/bpf/include/uapi/linux/types.h. Adding an
additional include of "string.h" resolves the __always_inline issue.

Fixes: 0a5539f66133 ("bpf: Provide a linux/types.h override for bpf selftests.")
Signed-off-by: Yonghong Song
Signed-off-by: David S.
Miller --- .../selftests/bpf/include/uapi/linux/types.h | 16 ++++++++++++++++ tools/testing/selftests/bpf/test_pkt_access.c | 1 + 2 files changed, 17 insertions(+) diff --git a/tools/testing/selftests/bpf/include/uapi/linux/types.h b/tools/testing/selftests/bpf/include/uapi/linux/types.h index fbd16a7554af..51841848fbfe 100644 --- a/tools/testing/selftests/bpf/include/uapi/linux/types.h +++ b/tools/testing/selftests/bpf/include/uapi/linux/types.h @@ -3,4 +3,20 @@ #include +/* copied from linux:include/uapi/linux/types.h */ +#define __bitwise +typedef __u16 __bitwise __le16; +typedef __u16 __bitwise __be16; +typedef __u32 __bitwise __le32; +typedef __u32 __bitwise __be32; +typedef __u64 __bitwise __le64; +typedef __u64 __bitwise __be64; + +typedef __u16 __bitwise __sum16; +typedef __u32 __bitwise __wsum; + +#define __aligned_u64 __u64 __attribute__((aligned(8))) +#define __aligned_be64 __be64 __attribute__((aligned(8))) +#define __aligned_le64 __le64 __attribute__((aligned(8))) + #endif /* _UAPI_LINUX_TYPES_H */ diff --git a/tools/testing/selftests/bpf/test_pkt_access.c b/tools/testing/selftests/bpf/test_pkt_access.c index 39387bb7e08c..6e11ba11709e 100644 --- a/tools/testing/selftests/bpf/test_pkt_access.c +++ b/tools/testing/selftests/bpf/test_pkt_access.c @@ -5,6 +5,7 @@ * License as published by the Free Software Foundation. */ #include +#include #include #include #include From 7dd7eb9513bd02184d45f000ab69d78cb1fa1531 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 17 May 2017 22:54:11 -0400 Subject: [PATCH 61/74] ipv6: Check ip6_find_1stfragopt() return value properly. Do not use unsigned variables to see if it returns a negative error or not. Fixes: 2423496af35d ("ipv6: Prevent overrun when parsing v6 header options") Reported-by: Julia Lawall Signed-off-by: David S. 
Miller
---
 net/ipv6/ip6_offload.c | 9 ++++-----
 net/ipv6/ip6_output.c  | 7 +++----
 net/ipv6/udp_offload.c | 8 +++++---
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index eab36abc9f22..280268f1dd7b 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -63,7 +63,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	const struct net_offload *ops;
 	int proto;
 	struct frag_hdr *fptr;
-	unsigned int unfrag_ip6hlen;
 	unsigned int payload_len;
 	u8 *prevhdr;
 	int offset = 0;
@@ -116,10 +115,10 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		skb->network_header = (u8 *)ipv6h - skb->head;

 	if (udpfrag) {
-		unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
-		if (unfrag_ip6hlen < 0)
-			return ERR_PTR(unfrag_ip6hlen);
-		fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen);
+		int err = ip6_find_1stfragopt(skb, &prevhdr);
+		if (err < 0)
+			return ERR_PTR(err);
+		fptr = (struct frag_hdr *)((u8 *)ipv6h + err);
 		fptr->frag_off = htons(offset);
 		if (skb->next)
 			fptr->frag_off |= htons(IP6_MF);

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 01deecda2f84..d4a31becbd25 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -597,11 +597,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	int ptr, offset = 0, err = 0;
 	u8 *prevhdr, nexthdr = 0;

-	hlen = ip6_find_1stfragopt(skb, &prevhdr);
-	if (hlen < 0) {
-		err = hlen;
+	err = ip6_find_1stfragopt(skb, &prevhdr);
+	if (err < 0)
 		goto fail;
-	}
+	hlen = err;
 	nexthdr = *prevhdr;

 	mtu = ip6_skb_dst_mtu(skb);

diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index b348cff47395..a2267f80febb 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -29,6 +29,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 	u8 frag_hdr_sz = sizeof(struct frag_hdr);
 	__wsum csum;
 	int tnl_hlen;
+	int err;

 	mss = skb_shinfo(skb)->gso_size;
 	if (unlikely(skb->len <= mss))
@@ -90,9 +91,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 	/* Find the unfragmentable header and shift it left by frag_hdr_sz
 	 * bytes to insert fragment header.
 	 */
-	unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
-	if (unfrag_ip6hlen < 0)
-		return ERR_PTR(unfrag_ip6hlen);
+	err = ip6_find_1stfragopt(skb, &prevhdr);
+	if (err < 0)
+		return ERR_PTR(err);
+	unfrag_ip6hlen = err;
 	nexthdr = *prevhdr;
 	*prevhdr = NEXTHDR_FRAGMENT;
 	unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +

From 3c2ce60bdd3d57051bf85615deec04a694473840 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Thu, 18 May 2017 03:00:06 +0200
Subject: [PATCH 62/74] bpf: adjust verifier heuristics

Current limits with regard to processing program paths do not really
reflect today's needs anymore, due to programs becoming more complex and
the verifier smarter: it keeps track of more data such as const ALU
operations, alignment tracking, and spilling of PTR_TO_MAP_VALUE_ADJ
registers, and other features allow for smarter matching of what LLVM
generates. A side-effect is that there are fewer opportunities to prune
search states, so we often need to do more work to prove safety than in
the past, due to differing register states and stack layouts at the
points where we mismatch.
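To make the state explosion concrete, consider a contrived restricted-C
sketch of the branch pattern described above (not taken from any real
program; the field tests and thresholds are invented). Each independent
conditional can double the number of paths the verifier walks whenever
the resulting states fail to match at a pruning point:

	#include <linux/bpf.h>

	/* Three independent conditionals give up to 2^3 = 8 distinct paths.
	 * If register or stack state differs between paths (say, a slot
	 * written on only one side of a branch), previously verified states
	 * cannot be matched and the shared tail is re-explored per path.
	 */
	int classify(struct __sk_buff *skb)
	{
		int score = 0;

		if (skb->len > 1500)	/* branch 1: 2 states */
			score += 1;
		if (skb->mark & 0x1)	/* branch 2: up to 4 states */
			score += 2;
		if (skb->priority > 7)	/* branch 3: up to 8 states */
			score += 4;

		return score;
	}

(To actually load such a program it would still need a section annotation
and license declaration via the usual clang/libbpf toolchain.)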
Generally, it is quite hard to determine what caused a sudden increase in
complexity. It can be something as trivial as a single branch somewhere
at the beginning of the program, where LLVM assigned a stack slot that is
marked differently throughout other branches and thus causes a mismatch,
after which the verifier needs to prove safety for the whole rest of the
program. As a consequence, even programs with less than half the insn
size limit can get rejected.

We noticed that while some programs load fine under pre-4.11 kernels,
they get rejected due to hitting limits on more recent kernels. We saw
that in the vast majority of cases (90+%) pruning failed due to register
mismatches. In case of stack mismatches, the majority of cases failed due
to different stack slot types (invalid, spill, misc) rather than
differences in spilled registers.

This patch makes pruning more aggressive by also adding markers that sit
at conditional jumps. Currently, we only mark jump targets for pruning;
in direct packet access, for example, these are usually error paths where
we bail out. We found that adding these markers can reduce the number of
processed insns by up to 30%. Another option is to ignore reg->id when
probing PTR_TO_MAP_VALUE_OR_NULL registers, which as a stand-alone change
was observed to reduce complexity by up to 7%. Meaning, if a previous
path with register type PTR_TO_MAP_VALUE_OR_NULL for map X was found to
be safe, then in the current state a PTR_TO_MAP_VALUE_OR_NULL register
for the same map X must be safe as well. Last but not least, the patch
also adds a scheduling point and bumps the current limit for instructions
to be processed to a more adequate value.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 39f2dcbc4cbc..1eddb713b815 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -140,7 +140,7 @@ struct bpf_verifier_stack_elem {
 	struct bpf_verifier_stack_elem *next;
 };

-#define BPF_COMPLEXITY_LIMIT_INSNS	65536
+#define BPF_COMPLEXITY_LIMIT_INSNS	98304
 #define BPF_COMPLEXITY_LIMIT_STACK	1024

 #define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
@@ -2640,6 +2640,7 @@ static int check_cfg(struct bpf_verifier_env *env)
 				env->explored_states[t + 1] = STATE_LIST_MARK;
 		} else {
 			/* conditional jump with two edges */
+			env->explored_states[t] = STATE_LIST_MARK;
 			ret = push_insn(t, t + 1, FALLTHROUGH, env);
 			if (ret == 1)
 				goto peek_stack;
@@ -2798,6 +2799,12 @@ static bool states_equal(struct bpf_verifier_env *env,
 			    rcur->type != NOT_INIT))
 			continue;

+		/* Don't care about the reg->id in this case. */
+		if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
+		    rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
+		    rold->map_ptr == rcur->map_ptr)
+			continue;
+
 		if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
 		    compare_ptrs_to_packet(rold, rcur))
 			continue;
@@ -2932,6 +2939,9 @@ static int do_check(struct bpf_verifier_env *env)
 			goto process_bpf_exit;
 		}

+		if (need_resched())
+			cond_resched();
+
 		if (log_level > 1 || (log_level && do_print_state)) {
 			if (log_level > 1)
 				verbose("%d:", insn_idx);

From 64df6d525fcff1630098db9238bfd2b3e092d5c1 Mon Sep 17 00:00:00 2001
From: linzhang
Date: Wed, 17 May 2017 12:05:07 +0800
Subject: [PATCH 63/74] net: x25: fix one potential use-after-free issue

The function x25_init does not properly unregister related resources on
its error paths. This can result in a kernel oops if x25_init fails, so
add the proper unregister calls to the error handler. Also adjust the
coding style and make x25_register_sysctl properly return failures.

Signed-off-by: linzhang
Signed-off-by: David S. Miller
---
 include/net/x25.h        |  4 ++--
 net/x25/af_x25.c         | 24 ++++++++++++++++--------
 net/x25/sysctl_net_x25.c |  5 ++++-
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/include/net/x25.h b/include/net/x25.h
index c383aa4edbf0..6d30a01d281d 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -298,10 +298,10 @@ void x25_check_rbuf(struct sock *);

 /* sysctl_net_x25.c */
 #ifdef CONFIG_SYSCTL
-void x25_register_sysctl(void);
+int x25_register_sysctl(void);
 void x25_unregister_sysctl(void);
 #else
-static inline void x25_register_sysctl(void) {};
+static inline int x25_register_sysctl(void) { return 0; };
 static inline void x25_unregister_sysctl(void) {};
 #endif /* CONFIG_SYSCTL */

diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 8b911c29860e..5a1a98df3499 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1791,32 +1791,40 @@ void x25_kill_by_neigh(struct x25_neigh *nb)

 static int __init x25_init(void)
 {
-	int rc = proto_register(&x25_proto, 0);
+	int rc;

-	if (rc != 0)
+	rc = proto_register(&x25_proto, 0);
+	if (rc)
 		goto out;

 	rc = sock_register(&x25_family_ops);
-	if (rc != 0)
+	if (rc)
 		goto out_proto;

 	dev_add_pack(&x25_packet_type);

 	rc = register_netdevice_notifier(&x25_dev_notifier);
-	if (rc != 0)
+	if (rc)
 		goto out_sock;

+	rc = x25_register_sysctl();
+	if (rc)
+		goto out_dev;
+
+	rc = x25_proc_init();
+	if (rc)
+		goto out_sysctl;
+
 	pr_info("Linux Version 0.2\n");

-	x25_register_sysctl();
-	rc = x25_proc_init();
-	if (rc != 0)
-		goto out_dev;
 out:
 	return rc;
+out_sysctl:
+	x25_unregister_sysctl();
 out_dev:
 	unregister_netdevice_notifier(&x25_dev_notifier);
 out_sock:
+	dev_remove_pack(&x25_packet_type);
 	sock_unregister(AF_X25);
 out_proto:
 	proto_unregister(&x25_proto);

diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c
index a06dfe143c67..ba078c85f0a1 100644
--- a/net/x25/sysctl_net_x25.c
+++ b/net/x25/sysctl_net_x25.c
@@ -73,9 +73,12 @@ static struct ctl_table x25_table[] = {
 	{ },
 };

-void __init x25_register_sysctl(void)
+int __init x25_register_sysctl(void)
 {
 	x25_table_header = register_net_sysctl(&init_net, "net/x25", x25_table);
+	if (!x25_table_header)
+		return -ENOMEM;
+	return 0;
 }

 void x25_unregister_sysctl(void)

From 47ab37a19a8112670e63474016d5fbf86bc8737f Mon Sep 17 00:00:00 2001
From: Greentime Hu
Date: Wed, 17 May 2017 15:28:19 +0800
Subject: [PATCH 64/74] net: ethernet: faraday: add device tree support

Add device tree support to the ftmac100 driver.

Signed-off-by: Greentime Hu
Signed-off-by: David S. Miller
---
 drivers/net/ethernet/faraday/ftmac100.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/faraday/ftmac100.c b/drivers/net/ethernet/faraday/ftmac100.c
index 6ac336b546e6..1536356e2ea8 100644
--- a/drivers/net/ethernet/faraday/ftmac100.c
+++ b/drivers/net/ethernet/faraday/ftmac100.c
@@ -1174,11 +1174,17 @@ static int ftmac100_remove(struct platform_device *pdev)
 	return 0;
 }

+static const struct of_device_id ftmac100_of_ids[] = {
+	{ .compatible = "andestech,atmac100" },
+	{ }
+};
+
 static struct platform_driver ftmac100_driver = {
 	.probe		= ftmac100_probe,
 	.remove		= ftmac100_remove,
 	.driver		= {
 		.name	= DRV_NAME,
+		.of_match_table = ftmac100_of_ids
 	},
 };

@@ -1202,3 +1208,4 @@ module_exit(ftmac100_exit);
 MODULE_AUTHOR("Po-Yu Chuang ");
 MODULE_DESCRIPTION("FTMAC100 driver");
 MODULE_LICENSE("GPL");
+MODULE_DEVICE_TABLE(of, ftmac100_of_ids);

From a285860211bf257b0e6d522dac6006794be348af Mon Sep 17 00:00:00 2001
From: Tobias Jungel
Date: Wed, 17 May 2017 09:29:12 +0200
Subject: [PATCH 65/74] bridge: netlink: check vlan_default_pvid range

Currently it is allowed to set the default pvid of a bridge to a value
above VLAN_VID_MASK (0xfff). This patch adds a check to br_validate and
returns -EINVAL in case the pvid is out of bounds.

Reproduce by calling:

  [root@test ~]# ip l a type bridge
  [root@test ~]# ip l a type dummy
  [root@test ~]# ip l s bridge0 type bridge vlan_filtering 1
  [root@test ~]# ip l s bridge0 type bridge vlan_default_pvid 9999
  [root@test ~]# ip l s dummy0 master bridge0
  [root@test ~]# bridge vlan
  port    vlan ids
  bridge0 9999 PVID Egress Untagged
  dummy0  9999 PVID Egress Untagged

Fixes: 0f963b7592ef ("bridge: netlink: add support for default_pvid")
Acked-by: Nikolay Aleksandrov
Signed-off-by: Tobias Jungel
Acked-by: Sabrina Dubroca
Signed-off-by: David S. Miller
---
 net/bridge/br_netlink.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index c5ce7745b230..574f78824d8a 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -835,6 +835,13 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[])
 				return -EPROTONOSUPPORT;
 		}
 	}
+
+	if (data[IFLA_BR_VLAN_DEFAULT_PVID]) {
+		__u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]);
+
+		if (defpvid >= VLAN_VID_MASK)
+			return -EINVAL;
+	}
 #endif

 	return 0;

From a3f96c47c8d4c38be71fdbb440a99968c764ba62 Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Wed, 17 May 2017 14:52:16 +0200
Subject: [PATCH 66/74] udp: make *udp*_queue_rcv_skb() functions static

Since the udp memory accounting refactor, we no longer need to export
the *udp*_queue_rcv_skb() functions. Make them static and fix a couple
of sparse warnings:

  net/ipv4/udp.c:1615:5: warning: symbol 'udp_queue_rcv_skb' was not
  declared. Should it be static?
  net/ipv6/udp.c:572:5: warning: symbol 'udpv6_queue_rcv_skb' was not
  declared. Should it be static?

Fixes: 850cbaddb52d ("udp: use it's own memory accounting schema")
Fixes: c915fe13cbaa ("udplite: fix NULL pointer dereference")
Signed-off-by: Paolo Abeni
Signed-off-by: David S.
Miller --- net/ipv4/udp.c | 4 ++-- net/ipv4/udp_impl.h | 1 - net/ipv6/udp.c | 4 ++-- net/ipv6/udp_impl.h | 1 - 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ea6e4cff9faf..1d6219bf2d6b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1612,7 +1612,7 @@ static void udp_v4_rehash(struct sock *sk) udp_lib_rehash(sk, new_hash); } -int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -1657,7 +1657,7 @@ EXPORT_SYMBOL(udp_encap_enable); * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ -int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index feb50a16398d..a8cf8c6fb60c 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -25,7 +25,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); -int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 04862abfe4ec..06ec39b79609 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -526,7 +526,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } -int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -569,7 +569,7 @@ void udpv6_encap_enable(void) } EXPORT_SYMBOL(udpv6_encap_enable); -int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index e78bdc76dcc3..f180b3d85e31 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -26,7 +26,6 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); -int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udpv6_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS From fdcee2cbb8438702ea1b328fb6e0ac5e9a40c7f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 May 2017 07:16:40 -0700 Subject: [PATCH 67/74] sctp: do not inherit ipv6_{mc|ac|fl}_list from parent SCTP needs fixes similar to 83eaddab4378 ("ipv6/dccp: do not inherit ipv6_mc_list from parent"), otherwise bad things can happen. Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: David S. 
Miller --- net/sctp/ipv6.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 142b70e959af..f5b45b8b8b16 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -677,6 +677,9 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, newnp = inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; rcu_read_lock(); opt = rcu_dereference(np->opt); From 486181bcb3248e2f1977f4e69387a898234a4e1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 17 May 2017 16:31:41 +0200 Subject: [PATCH 68/74] qmi_wwan: add another Lenovo EM74xx device ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In their infinite wisdom, and never ending quest for end user frustration, Lenovo has decided to use a new USB device ID for the wwan modules in their 2017 laptops. The actual hardware is still the Sierra Wireless EM7455 or EM7430, depending on region. Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index d7165767ca9d..8f923a147fa9 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1196,6 +1196,8 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x1199, 0x9071, 10)}, /* Sierra Wireless MC74xx */ {QMI_FIXED_INTF(0x1199, 0x9079, 8)}, /* Sierra Wireless EM74xx */ {QMI_FIXED_INTF(0x1199, 0x9079, 10)}, /* Sierra Wireless EM74xx */ + {QMI_FIXED_INTF(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */ + {QMI_FIXED_INTF(0x1199, 0x907b, 10)}, /* Sierra Wireless EM74xx */ {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)}, /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */ {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */ {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ From 1ac91bff9c65d4a5e0da244495ea6275fd514c5a Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Thu, 18 May 2017 00:08:16 +0530 Subject: [PATCH 69/74] cxgb4: update latest firmware version supported Change t4fw_version.h to update latest firmware version number to 1.16.43.0. Signed-off-by: Ganesh Goudar Signed-off-by: David S. 
Miller --- drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h index fa376444e57c..3549d3876278 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h @@ -37,7 +37,7 @@ #define T4FW_VERSION_MAJOR 0x01 #define T4FW_VERSION_MINOR 0x10 -#define T4FW_VERSION_MICRO 0x21 +#define T4FW_VERSION_MICRO 0x2B #define T4FW_VERSION_BUILD 0x00 #define T4FW_MIN_VERSION_MAJOR 0x01 @@ -46,7 +46,7 @@ #define T5FW_VERSION_MAJOR 0x01 #define T5FW_VERSION_MINOR 0x10 -#define T5FW_VERSION_MICRO 0x21 +#define T5FW_VERSION_MICRO 0x2B #define T5FW_VERSION_BUILD 0x00 #define T5FW_MIN_VERSION_MAJOR 0x00 @@ -55,7 +55,7 @@ #define T6FW_VERSION_MAJOR 0x01 #define T6FW_VERSION_MINOR 0x10 -#define T6FW_VERSION_MICRO 0x21 +#define T6FW_VERSION_MICRO 0x2B #define T6FW_VERSION_BUILD 0x00 #define T6FW_MIN_VERSION_MAJOR 0x00 From 6dd4aba36f2dca00c3b6976a0d59c5cee16ad545 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 18 May 2017 09:18:52 +0200 Subject: [PATCH 70/74] mlxsw: spectrum_dpipe: Fix incorrect entry index In case of disabled counters the entry index will be incorrect. Fix this by moving the entry index set before the counter status check. Fixes: 2ba5999f009d ("mlxsw: spectrum: Add Support for erif table entries access") Signed-off-by: Arkadi Sharshevsky Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c index ea56f6ade6b4..5f0a7bc692a4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c @@ -199,10 +199,11 @@ static int mlxsw_sp_erif_entry_get(struct mlxsw_sp *mlxsw_sp, entry->counter_valid = false; entry->counter = 0; + entry->index = mlxsw_sp_rif_index(rif); + if (!counters_enabled) return 0; - entry->index = mlxsw_sp_rif_index(rif); err = mlxsw_sp_rif_counter_value_get(mlxsw_sp, rif, MLXSW_SP_RIF_COUNTER_EGRESS, &cnt); From 6b1206bbbce6092b2ec412125300889e6e551bc2 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 18 May 2017 09:18:53 +0200 Subject: [PATCH 71/74] mlxsw: spectrum_router: Fix rif counter freeing routine During rif counter freeing the counter index can be invalid. Add check of validity before freeing the counter. Fixes: e0c0afd8aa4e ("mlxsw: spectrum: Support for counters on router interfaces") Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 33cec1cc1642..9f89c4137d21 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -206,6 +206,9 @@ void mlxsw_sp_rif_counter_free(struct mlxsw_sp *mlxsw_sp, { unsigned int *p_counter_index; + if (!mlxsw_sp_rif_counter_valid_get(rif, dir)) + return; + p_counter_index = mlxsw_sp_rif_p_counter_get(rif, dir); if (WARN_ON(!p_counter_index)) return; From 5f5c5449acad0cd3322e53e1ac68c044483b0aa5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 18 May 2017 15:01:34 +0200 Subject: [PATCH 72/74] sh_eth: Use platform device for printing before register_netdev() The MDIO initialization failure message is printed using the network device, before it has been registered, leading to: (null): failed to initialise MDIO Use the platform device instead to fix this: sh-eth ee700000.ethernet: failed to initialise MDIO Fixes: daacf03f0bbfefee ("sh_eth: Register MDIO bus before registering the network device") Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index f68c4db656ed..c85222b02754 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -3220,7 +3220,7 @@ static int sh_eth_drv_probe(struct platform_device *pdev) /* MDIO bus init */ ret = sh_mdio_init(mdp, pd); if (ret) { - dev_err(&ndev->dev, "failed to initialise MDIO\n"); + dev_err(&pdev->dev, "failed to initialise MDIO\n"); goto out_release; } From b7ce520e9f71ff65d0aa0ad86223f94ae4095fae Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 18 May 2017 15:01:35 +0200 Subject: [PATCH 73/74] sh_eth: Do not print an error message for probe deferral EPROBE_DEFER is not an error, hence printing an error message like sh-eth ee700000.ethernet: failed to initialise MDIO may confuse the user. To fix this, suppress the error message in case of probe deferral. While at it, shorten the message, and add the actual error code. Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index c85222b02754..2d686ccf971b 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -3220,7 +3220,8 @@ static int sh_eth_drv_probe(struct platform_device *pdev) /* MDIO bus init */ ret = sh_mdio_init(mdp, pd); if (ret) { - dev_err(&pdev->dev, "failed to initialise MDIO\n"); + if (ret != -EPROBE_DEFER) + dev_err(&pdev->dev, "MDIO init failed: %d\n", ret); goto out_release; } From c0e01eac7ada785fdeaea1ae5476ec1cf3b00374 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 18 May 2017 13:03:52 +0200 Subject: [PATCH 74/74] mlxsw: spectrum: Avoid possible NULL pointer dereference In case we got an FDB notification for a port that doesn't exist we execute an FDB entry delete to prevent it from re-appearing the next time we poll for notifications. 
If the operation failed we would trigger a NULL pointer dereference as 'mlxsw_sp_port' is NULL. Fix it by reporting the error using the underlying bus device instead. Fixes: 12f1501e7511 ("mlxsw: spectrum: remove FDB entry in case we get unknown object notification") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 0d8411f1f954..f4bb0c0b7c1d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1497,8 +1497,7 @@ static void mlxsw_sp_fdb_notify_mac_process(struct mlxsw_sp *mlxsw_sp, err = mlxsw_sp_port_fdb_uc_op(mlxsw_sp, local_port, mac, fid, adding, true); if (err) { - if (net_ratelimit()) - netdev_err(mlxsw_sp_port->dev, "Failed to set FDB entry\n"); + dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to set FDB entry\n"); return; } @@ -1558,8 +1557,7 @@ static void mlxsw_sp_fdb_notify_mac_lag_process(struct mlxsw_sp *mlxsw_sp, err = mlxsw_sp_port_fdb_uc_lag_op(mlxsw_sp, lag_id, mac, fid, lag_vid, adding, true); if (err) { - if (net_ratelimit()) - netdev_err(mlxsw_sp_port->dev, "Failed to set FDB entry\n"); + dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to set FDB entry\n"); return; }
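For reference, here is the reporting pattern that this last fix adopts,
shown in isolation. This is a minimal sketch rather than the driver code
itself: the helper name and the simplified context are hypothetical, and
only the dev_err_ratelimited() call and the bus-device indirection mirror
the actual patch:

	#include <linux/device.h>

	/* Hypothetical helper distilling the fix: the per-port netdev may
	 * not exist at this point (the notification raced with port
	 * removal), so report through the bus device, which outlives any
	 * single port.
	 */
	static void fdb_report_failure(struct device *bus_dev, int err)
	{
		if (!err)
			return;
		/* never dereference a possibly-NULL port->dev here */
		dev_err_ratelimited(bus_dev, "Failed to set FDB entry: %d\n", err);
	}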