From d1b22e36e3188f47dbf8aaef54d9e79df8e91f4c Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 14 Oct 2020 00:49:26 +0000 Subject: [PATCH 1/4] Documentation/x86: Rename resctrl_ui.rst and add two errata to the file Intel Memory Bandwidth Monitoring (MBM) counters may report system memory bandwidth incorrectly on some Intel processors. This is reported in documented in erratum SKX99, erratum BDF102 and in the RDT reference manual, see Documentation/x86/index.rst. To work around the errata, MBM total and local readings are corrected using a correction factor table. Since the correction factor table is not publicly documented anywhere, document the table and the errata in Documentation/x86/resctrl.rst for future reference. [ bp: Move web links to the doc, massage. ] Suggested-by: Borislav Petkov Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20201014004927.1839452-2-fenghua.yu@intel.com --- Documentation/x86/index.rst | 2 +- .../x86/{resctrl_ui.rst => resctrl.rst} | 93 +++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) rename Documentation/x86/{resctrl_ui.rst => resctrl.rst} (90%) diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index b224d12c880b..9c6ebf355f81 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -27,7 +27,7 @@ x86-specific Documentation pti mds microcode - resctrl_ui + resctrl tsx_async_abort usb-legacy-support i386/index diff --git a/Documentation/x86/resctrl_ui.rst b/Documentation/x86/resctrl.rst similarity index 90% rename from Documentation/x86/resctrl_ui.rst rename to Documentation/x86/resctrl.rst index e59b7b93a9b4..71a531061e4e 100644 --- a/Documentation/x86/resctrl_ui.rst +++ b/Documentation/x86/resctrl.rst @@ -1209,3 +1209,96 @@ View the llc occupancy snapshot:: # cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy 11234000 + +Intel RDT Errata +================ + +Intel MBM Counters May Report System Memory Bandwidth Incorrectly +----------------------------------------------------------------- + +Errata SKX99 for Skylake server and BDF102 for Broadwell server. + +Problem: Intel Memory Bandwidth Monitoring (MBM) counters track metrics +according to the assigned Resource Monitor ID (RMID) for that logical +core. The IA32_QM_CTR register (MSR 0xC8E), used to report these +metrics, may report incorrect system bandwidth for certain RMID values. + +Implication: Due to the errata, system memory bandwidth may not match +what is reported. + +Workaround: MBM total and local readings are corrected according to the +following correction factor table: + ++---------------+---------------+---------------+-----------------+ +|core count |rmid count |rmid threshold |correction factor| ++---------------+---------------+---------------+-----------------+ +|1 |8 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|2 |16 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|3 |24 |15 |0.969650 | ++---------------+---------------+---------------+-----------------+ +|4 |32 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|6 |48 |31 |0.969650 | ++---------------+---------------+---------------+-----------------+ +|7 |56 |47 |1.142857 | ++---------------+---------------+---------------+-----------------+ +|8 |64 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|9 |72 |63 |1.185115 | ++---------------+---------------+---------------+-----------------+ +|10 |80 |63 |1.066553 | ++---------------+---------------+---------------+-----------------+ +|11 |88 |79 |1.454545 | ++---------------+---------------+---------------+-----------------+ +|12 |96 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|13 |104 |95 |1.230769 | ++---------------+---------------+---------------+-----------------+ +|14 |112 |95 |1.142857 | ++---------------+---------------+---------------+-----------------+ +|15 |120 |95 |1.066667 | ++---------------+---------------+---------------+-----------------+ +|16 |128 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|17 |136 |127 |1.254863 | ++---------------+---------------+---------------+-----------------+ +|18 |144 |127 |1.185255 | ++---------------+---------------+---------------+-----------------+ +|19 |152 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|20 |160 |127 |1.066667 | ++---------------+---------------+---------------+-----------------+ +|21 |168 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|22 |176 |159 |1.454334 | ++---------------+---------------+---------------+-----------------+ +|23 |184 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|24 |192 |127 |0.969744 | ++---------------+---------------+---------------+-----------------+ +|25 |200 |191 |1.280246 | ++---------------+---------------+---------------+-----------------+ +|26 |208 |191 |1.230921 | ++---------------+---------------+---------------+-----------------+ +|27 |216 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|28 |224 |191 |1.143118 | ++---------------+---------------+---------------+-----------------+ + +If rmid > rmid threshold, MBM total and local values should be multiplied +by the correction factor. + +See: + +1. Erratum SKX99 in Intel Xeon Processor Scalable Family Specification Update: +http://web.archive.org/web/20200716124958/https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-spec-update.html + +2. Erratum BDF102 in Intel Xeon E5-2600 v4 Processor Product Family Specification Update: +http://web.archive.org/web/20191125200531/https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/xeon-e5-v4-spec-update.pdf + +3. The errata in Intel Resource Director Technology (Intel RDT) on 2nd Generation Intel Xeon Scalable Processors Reference Manual: +https://software.intel.com/content/www/us/en/develop/articles/intel-resource-director-technology-rdt-reference-manual.html + +for further information. From 4868a61d498af3627c3b571aa48107a845001e51 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 14 Oct 2020 00:49:27 +0000 Subject: [PATCH 2/4] x86/resctrl: Correct MBM total and local values Intel Memory Bandwidth Monitoring (MBM) counters may report system memory bandwidth incorrectly on some Intel processors. The errata SKX99 for Skylake server, BDF102 for Broadwell server, and the correction factor table are documented in Documentation/x86/resctrl.rst. Intel MBM counters track metrics according to the assigned Resource Monitor ID (RMID) for that logical core. The IA32_QM_CTR register (MSR 0xC8E) used to report these metrics, may report incorrect system bandwidth for certain RMID values. Due to the errata, system memory bandwidth may not match what is reported. To work around the errata, correct MBM total and local readings using a correction factor table. If rmid > rmid threshold, MBM total and local values should be multiplied by the correction factor. [ bp: Mark mbm_cf_table[] __initdata. ] Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20201014004927.1839452-3-fenghua.yu@intel.com --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++ arch/x86/kernel/cpu/resctrl/internal.h | 1 + arch/x86/kernel/cpu/resctrl/monitor.c | 82 +++++++++++++++++++++++++- 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index e5f4ee8f4c3b..1a681fde4c51 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -893,6 +893,10 @@ static __init void __check_quirks_intel(void) set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); else set_rdt_options("!l3cat"); + fallthrough; + case INTEL_FAM6_BROADWELL_X: + intel_rdt_mbm_apply_quirk(); + break; } } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 80fa997fae60..6cb068fcf501 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -616,6 +616,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); +void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); u32 delay_bw_map(unsigned long bw, struct rdt_resource *r); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 54dffe574e67..622073ffa715 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -64,6 +64,69 @@ unsigned int rdt_mon_features; */ unsigned int resctrl_cqm_threshold; +#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) + +/* + * The correction factor table is documented in Documentation/x86/resctrl.rst. + * If rmid > rmid threshold, MBM total and local values should be multiplied + * by the correction factor. + * + * The original table is modified for better code: + * + * 1. The threshold 0 is changed to rmid count - 1 so don't do correction + * for the case. + * 2. MBM total and local correction table indexed by core counter which is + * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27. + * 3. The correction factor is normalized to 2^20 (1048576) so it's faster + * to calculate corrected value by shifting: + * corrected_value = (original_value * correction_factor) >> 20 + */ +static const struct mbm_correction_factor_table { + u32 rmidthreshold; + u64 cf; +} mbm_cf_table[] __initdata = { + {7, CF(1.000000)}, + {15, CF(1.000000)}, + {15, CF(0.969650)}, + {31, CF(1.000000)}, + {31, CF(1.066667)}, + {31, CF(0.969650)}, + {47, CF(1.142857)}, + {63, CF(1.000000)}, + {63, CF(1.185115)}, + {63, CF(1.066553)}, + {79, CF(1.454545)}, + {95, CF(1.000000)}, + {95, CF(1.230769)}, + {95, CF(1.142857)}, + {95, CF(1.066667)}, + {127, CF(1.000000)}, + {127, CF(1.254863)}, + {127, CF(1.185255)}, + {151, CF(1.000000)}, + {127, CF(1.066667)}, + {167, CF(1.000000)}, + {159, CF(1.454334)}, + {183, CF(1.000000)}, + {127, CF(0.969744)}, + {191, CF(1.280246)}, + {191, CF(1.230921)}, + {215, CF(1.000000)}, + {191, CF(1.143118)}, +}; + +static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; +static u64 mbm_cf __read_mostly; + +static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) +{ + /* Correct MBM value. */ + if (rmid > mbm_cf_rmidthreshold) + val = (val * mbm_cf) >> 20; + + return val; +} + static inline struct rmid_entry *__rmid_entry(u32 rmid) { struct rmid_entry *entry; @@ -260,7 +323,8 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) m->chunks += chunks; m->prev_msr = tval; - rr->val += m->chunks; + rr->val += get_corrected_mbm_count(rmid, m->chunks); + return 0; } @@ -280,7 +344,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); m->chunks += chunks; - cur_bw = (chunks * r->mon_scale) >> 20; + cur_bw = (get_corrected_mbm_count(rmid, chunks) * r->mon_scale) >> 20; if (m->delta_comp) m->delta_bw = abs(cur_bw - m->prev_bw); @@ -644,3 +708,17 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } + +void __init intel_rdt_mbm_apply_quirk(void) +{ + int cf_index; + + cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1; + if (cf_index >= ARRAY_SIZE(mbm_cf_table)) { + pr_info("No MBM correction factor available\n"); + return; + } + + mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; + mbm_cf = mbm_cf_table[cf_index].cf; +} From 2002d2951398317d0f46e64ae6d8dd58ed541c6d Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Wed, 11 Nov 2020 00:02:28 +0100 Subject: [PATCH 3/4] x86/resctrl: Constify kernfs_ops The only usage of the kf_ops field in the rftype struct is to pass it as argument to __kernfs_create_file(), which accepts a pointer to const. Make it a pointer to const. This makes it possible to make rdtgroup_kf_single_ops and kf_mondata_ops const, which allows the compiler to put them in read-only memory. Signed-off-by: Rikard Falkeborn Signed-off-by: Borislav Petkov Acked-by: Reinette Chatre Link: https://lkml.kernel.org/r/20201110230228.801785-1-rikard.falkeborn@gmail.com --- arch/x86/kernel/cpu/resctrl/internal.h | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 6cb068fcf501..5540b025880c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -264,7 +264,7 @@ void __exit rdtgroup_exit(void); struct rftype { char *name; umode_t mode; - struct kernfs_ops *kf_ops; + const struct kernfs_ops *kf_ops; unsigned long flags; unsigned long fflags; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index af323e2e3100..78dcbb8e0172 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -240,13 +240,13 @@ static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, return -EINVAL; } -static struct kernfs_ops rdtgroup_kf_single_ops = { +static const struct kernfs_ops rdtgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .write = rdtgroup_file_write, .seq_show = rdtgroup_seqfile_show, }; -static struct kernfs_ops kf_mondata_ops = { +static const struct kernfs_ops kf_mondata_ops = { .atomic_write_len = PAGE_SIZE, .seq_show = rdtgroup_mondata_show, }; From 19eb86a72df50adcf554f234469bb5b7209b7640 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Tue, 1 Dec 2020 02:06:58 +0800 Subject: [PATCH 4/4] x86/resctrl: Clean up unused function parameter in rmdir path Commit fd8d9db3559a ("x86/resctrl: Remove superfluous kernfs_get() calls to prevent refcount leak") removed superfluous kernfs_get() calls in rdtgroup_ctrl_remove() and rdtgroup_rmdir_ctrl(). That change resulted in an unused function parameter to these two functions. Clean up the unused function parameter in rdtgroup_ctrl_remove(), rdtgroup_rmdir_mon() and their callers rdtgroup_rmdir_ctrl() and rdtgroup_rmdir(). Signed-off-by: Xiaochen Shen Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/1606759618-13181-1-git-send-email-xiaochen.shen@intel.com --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 05a026d4420b..bcbec8533b74 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3018,8 +3018,7 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, return -EPERM; } -static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, - cpumask_var_t tmpmask) +static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { struct rdtgroup *prdtgrp = rdtgrp->mon.parent; int cpu; @@ -3051,8 +3050,7 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, return 0; } -static int rdtgroup_ctrl_remove(struct kernfs_node *kn, - struct rdtgroup *rdtgrp) +static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) { rdtgrp->flags = RDT_DELETED; list_del(&rdtgrp->rdtgroup_list); @@ -3061,8 +3059,7 @@ static int rdtgroup_ctrl_remove(struct kernfs_node *kn, return 0; } -static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, - cpumask_var_t tmpmask) +static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { int cpu; @@ -3089,7 +3086,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, closid_free(rdtgrp->closid); free_rmid(rdtgrp->mon.rmid); - rdtgroup_ctrl_remove(kn, rdtgrp); + rdtgroup_ctrl_remove(rdtgrp); /* * Free all the child monitor group rmids. @@ -3126,13 +3123,13 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) rdtgrp != &rdtgroup_default) { if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = rdtgroup_ctrl_remove(kn, rdtgrp); + ret = rdtgroup_ctrl_remove(rdtgrp); } else { - ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); } } else if (rdtgrp->type == RDTMON_GROUP && is_mon_groups(parent_kn, kn->name)) { - ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); } else { ret = -EPERM; }