From ef3a62d272f033989e83eb1f26505f93f93e3e69 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 18 Jun 2008 15:39:48 -0700 Subject: [PATCH 01/65] mac80211: detect driver tx bugs When a driver rejects a frame in it's ->tx() callback, it must also stop queues, otherwise mac80211 can go into a loop here. Detect this situation and abort the loop after five retries, warning about the driver bug. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/mac80211/tx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 28d8bd53bd3a..c80d5899f279 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1132,7 +1132,7 @@ static int ieee80211_tx(struct net_device *dev, struct sk_buff *skb, ieee80211_tx_handler *handler; struct ieee80211_tx_data tx; ieee80211_tx_result res = TX_DROP, res_prepare; - int ret, i; + int ret, i, retries = 0; WARN_ON(__ieee80211_queue_pending(local, control->queue)); @@ -1216,6 +1216,13 @@ static int ieee80211_tx(struct net_device *dev, struct sk_buff *skb, if (!__ieee80211_queue_stopped(local, control->queue)) { clear_bit(IEEE80211_LINK_STATE_PENDING, &local->state[control->queue]); + retries++; + /* + * Driver bug, it's rejecting packets but + * not stopping queues. + */ + if (WARN_ON_ONCE(retries > 5)) + goto drop; goto retry; } memcpy(&store->control, control, From d38b149794e7444a55e741446717147e7f0467f8 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 3 Apr 2008 10:40:39 +0200 Subject: [PATCH 02/65] hwmon: (lm85) Fix function RANGE_TO_REG() Function RANGE_TO_REG() is broken. For a requested range of 2000 (2 degrees C), it will return an index value of 15, i.e. 80.0 degrees C, instead of the expected index value of 0. All other values are handled properly, just 2000 isn't. The bug was introduced back in November 2004 by this patch: http://git.kernel.org/?p=linux/kernel/git/tglx/history.git;a=commit;h=1c28d80f1992240373099d863e4996cdd5d646d0 While this can be fixed easily with the current code, I'd rather rewrite the whole function in a way which is more obviously correct. Signed-off-by: Jean Delvare Cc: Justin Thiessen Signed-off-by: Mark M. Hoffman --- drivers/hwmon/lm85.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/hwmon/lm85.c b/drivers/hwmon/lm85.c index 182fe6a5605f..ee5eca1c1921 100644 --- a/drivers/hwmon/lm85.c +++ b/drivers/hwmon/lm85.c @@ -192,23 +192,20 @@ static int RANGE_TO_REG( int range ) { int i; - if ( range < lm85_range_map[0] ) { - return 0 ; - } else if ( range > lm85_range_map[15] ) { + if (range >= lm85_range_map[15]) return 15 ; - } else { /* find closest match */ - for ( i = 14 ; i >= 0 ; --i ) { - if ( range > lm85_range_map[i] ) { /* range bracketed */ - if ((lm85_range_map[i+1] - range) < - (range - lm85_range_map[i])) { - i++; - break; - } - break; - } + + /* Find the closest match */ + for (i = 14; i >= 0; --i) { + if (range >= lm85_range_map[i]) { + if ((lm85_range_map[i + 1] - range) < + (range - lm85_range_map[i])) + return i + 1; + return i; } } - return( i & 0x0f ); + + return 0; } #define RANGE_FROM_REG(val) (lm85_range_map[(val)&0x0f]) From ed4ec814e45ae8b1596aea0a29b92f6c3614acaa Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Sat, 26 Apr 2008 16:34:26 +0200 Subject: [PATCH 03/65] hwmon: (adt7473) Initialize max_duty_at_overheat before use data->max_duty_at_overheat is not updated in adt7473_update_device, so it might be used before it is initialized (if the user reads from sysfs file max_duty_at_crit before writing to it.) Signed-off-by: Jean Delvare Acked-by: Darrick J. Wong Signed-off-by: Mark M. Hoffman --- drivers/hwmon/adt7473.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hwmon/adt7473.c b/drivers/hwmon/adt7473.c index c1009d6f9796..93dbf5e7ff8a 100644 --- a/drivers/hwmon/adt7473.c +++ b/drivers/hwmon/adt7473.c @@ -309,6 +309,9 @@ static struct adt7473_data *adt7473_update_device(struct device *dev) ADT7473_REG_PWM_BHVR(i)); } + i = i2c_smbus_read_byte_data(client, ADT7473_REG_CFG4); + data->max_duty_at_overheat = !!(i & ADT7473_CFG4_MAX_DUTY_AT_OVT); + data->limits_last_updated = local_jiffies; data->limits_valid = 1; From 125ff8087fca28e922e7ad6e082efcf04fe2f0f4 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Sat, 23 Feb 2008 10:57:53 +0100 Subject: [PATCH 04/65] hwmon: Update the sysfs interface documentation * Document the characteristics of libsensors 3.0.0 and 3.0.1. * The sysfs interface is no longer subject to changes. Signed-off-by: Jean Delvare Acked-by: Juerg Haefliger Signed-off-by: Mark M. Hoffman --- Documentation/hwmon/sysfs-interface | 33 ++++++++++++----------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface index f4a8ebc1ef1a..2d845730d4e0 100644 --- a/Documentation/hwmon/sysfs-interface +++ b/Documentation/hwmon/sysfs-interface @@ -2,17 +2,12 @@ Naming and data format standards for sysfs files ------------------------------------------------ The libsensors library offers an interface to the raw sensors data -through the sysfs interface. See libsensors documentation and source for -further information. As of writing this document, libsensors -(from lm_sensors 2.8.3) is heavily chip-dependent. Adding or updating -support for any given chip requires modifying the library's code. -This is because libsensors was written for the procfs interface -older kernel modules were using, which wasn't standardized enough. -Recent versions of libsensors (from lm_sensors 2.8.2 and later) have -support for the sysfs interface, though. - -The new sysfs interface was designed to be as chip-independent as -possible. +through the sysfs interface. Since lm-sensors 3.0.0, libsensors is +completely chip-independent. It assumes that all the kernel drivers +implement the standard sysfs interface described in this document. +This makes adding or updating support for any given chip very easy, as +libsensors, and applications using it, do not need to be modified. +This is a major improvement compared to lm-sensors 2. Note that motherboards vary widely in the connections to sensor chips. There is no standard that ensures, for example, that the second @@ -35,19 +30,17 @@ access this data in a simple and consistent way. That said, such programs will have to implement conversion, labeling and hiding of inputs. For this reason, it is still not recommended to bypass the library. -If you are developing a userspace application please send us feedback on -this standard. - -Note that this standard isn't completely established yet, so it is subject -to changes. If you are writing a new hardware monitoring driver those -features can't seem to fit in this interface, please contact us with your -extension proposal. Keep in mind that backward compatibility must be -preserved. - Each chip gets its own directory in the sysfs /sys/devices tree. To find all sensor chips, it is easier to follow the device symlinks from /sys/class/hwmon/hwmon*. +Up to lm-sensors 3.0.0, libsensors looks for hardware monitoring attributes +in the "physical" device directory. Since lm-sensors 3.0.1, attributes found +in the hwmon "class" device directory are also supported. Complex drivers +(e.g. drivers for multifunction chips) may want to use this possibility to +avoid namespace pollution. The only drawback will be that older versions of +libsensors won't support the driver in question. + All sysfs values are fixed point numbers. There is only one value per file, unlike the older /proc specification. From 1604e78b7d6e6087ae9bde6e7a6b41cda80d6557 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 26 Feb 2008 19:34:48 +0100 Subject: [PATCH 05/65] hwmon: (abituguru3) Identify Abit AW8D board as such This patch identifies the Abit AW8D board as such, and adds support for its aux5 fan connector Signed-off-by: Hans de Goede Acked-by: Jean Delvare Signed-off-by: Mark M. Hoffman --- drivers/hwmon/abituguru3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/abituguru3.c b/drivers/hwmon/abituguru3.c index ed33fddc4dee..f2086e99a9cb 100644 --- a/drivers/hwmon/abituguru3.c +++ b/drivers/hwmon/abituguru3.c @@ -323,7 +323,7 @@ static const struct abituguru3_motherboard_info abituguru3_motherboards[] = { { "AUX1 Fan", 36, 2, 60, 1, 0 }, { NULL, 0, 0, 0, 0, 0 } } }, - { 0x0013, "unknown", { + { 0x0013, "Abit AW8D", { { "CPU Core", 0, 0, 10, 1, 0 }, { "DDR", 1, 0, 10, 1, 0 }, { "DDR VTT", 2, 0, 10, 1, 0 }, @@ -349,6 +349,7 @@ static const struct abituguru3_motherboard_info abituguru3_motherboards[] = { { "AUX2 Fan", 36, 2, 60, 1, 0 }, { "AUX3 Fan", 37, 2, 60, 1, 0 }, { "AUX4 Fan", 38, 2, 60, 1, 0 }, + { "AUX5 Fan", 39, 2, 60, 1, 0 }, { NULL, 0, 0, 0, 0, 0 } } }, { 0x0014, "Abit AB9 Pro", { From 25845c22647fad2a0852cf6bf277d84e8a7a6b4a Mon Sep 17 00:00:00 2001 From: Marc Hulsman Date: Sun, 8 Jun 2008 10:59:41 -0400 Subject: [PATCH 06/65] hwmon: (w83791d) new maintainer Signed-off-by: Charles Spirakis Acked-by: Jean Delvare Signed-off-by: Mark M. Hoffman --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 9d4304266043..fa30364d8d08 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4428,10 +4428,10 @@ M: johnpol@2ka.mipt.ru S: Maintained W83791D HARDWARE MONITORING DRIVER -P: Charles Spirakis -M: bezaur@gmail.com +P: Marc Hulsman +M: m.hulsman@tudelft.nl L: lm-sensors@lm-sensors.org -S: Odd Fixes +S: Maintained W83793 HARDWARE MONITORING DRIVER P: Rudolf Marek From b3aeab0cdbd0fe5339a3a5918b59eebf148cbcd1 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 23 May 2008 16:10:41 +0200 Subject: [PATCH 07/65] hwmon: (abituguru3) update driver detection It has been reported that the abituguru3 driver fails to load after a BIOS update. This patch fixes this by loosening the detection routine so that it will work after the BIOS update too. To compensate for the now very loose detection an additional check is added on the DMI Base Board vendor string to make sure we only load on Abit motherboards, this is the same as the check in the abituguru (1 / 2) driver. Signed-of-by: Hans de Goede Signed-off-by: Alistair John Strachan Signed-off-by: Mark M. Hoffman --- drivers/hwmon/abituguru3.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/abituguru3.c b/drivers/hwmon/abituguru3.c index f2086e99a9cb..f00f497b9ca9 100644 --- a/drivers/hwmon/abituguru3.c +++ b/drivers/hwmon/abituguru3.c @@ -30,6 +30,7 @@ #include #include #include +#include #include /* uGuru3 bank addresses */ @@ -1112,11 +1113,12 @@ static int __init abituguru3_detect(void) { /* See if there is an uguru3 there. An idle uGuru3 will hold 0x00 or 0x08 at DATA and 0xAC at CMD. Sometimes the uGuru3 will hold 0x05 - at CMD instead, why is unknown. So we test for 0x05 too. */ + or 0x55 at CMD instead, why is unknown. */ u8 data_val = inb_p(ABIT_UGURU3_BASE + ABIT_UGURU3_DATA); u8 cmd_val = inb_p(ABIT_UGURU3_BASE + ABIT_UGURU3_CMD); if (((data_val == 0x00) || (data_val == 0x08)) && - ((cmd_val == 0xAC) || (cmd_val == 0x05))) + ((cmd_val == 0xAC) || (cmd_val == 0x05) || + (cmd_val == 0x55))) return ABIT_UGURU3_BASE; ABIT_UGURU3_DEBUG("no Abit uGuru3 found, data = 0x%02X, cmd = " @@ -1139,6 +1141,15 @@ static int __init abituguru3_init(void) int address, err; struct resource res = { .flags = IORESOURCE_IO }; +#ifdef CONFIG_DMI + const char *board_vendor = dmi_get_system_info(DMI_BOARD_VENDOR); + + /* safety check, refuse to load on non Abit motherboards */ + if (!force && (!board_vendor || + strcmp(board_vendor, "http://www.abit.com.tw/"))) + return -ENODEV; +#endif + address = abituguru3_detect(); if (address < 0) return address; From bcccc3a28e9cbb44549cde326852c26203a53a56 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Sat, 3 May 2008 19:19:16 -0700 Subject: [PATCH 08/65] hwmon: (lm75) sensor reading bugfix LM75 sensor reading bugfix: never save error status as valid sensor output. This could be improved, but at least this prevents certain rude failure modes. Signed-off-by: David Brownell Acked-by: Jean Delvare Signed-off-by: Mark M. Hoffman --- drivers/hwmon/lm75.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/hwmon/lm75.c b/drivers/hwmon/lm75.c index fa7696905154..de698dc73020 100644 --- a/drivers/hwmon/lm75.c +++ b/drivers/hwmon/lm75.c @@ -251,10 +251,13 @@ static int lm75_detach_client(struct i2c_client *client) the SMBus standard. */ static int lm75_read_value(struct i2c_client *client, u8 reg) { + int value; + if (reg == LM75_REG_CONF) return i2c_smbus_read_byte_data(client, reg); - else - return swab16(i2c_smbus_read_word_data(client, reg)); + + value = i2c_smbus_read_word_data(client, reg); + return (value < 0) ? value : swab16(value); } static int lm75_write_value(struct i2c_client *client, u8 reg, u16 value) @@ -287,9 +290,16 @@ static struct lm75_data *lm75_update_device(struct device *dev) int i; dev_dbg(&client->dev, "Starting lm75 update\n"); - for (i = 0; i < ARRAY_SIZE(data->temp); i++) - data->temp[i] = lm75_read_value(client, - LM75_REG_TEMP[i]); + for (i = 0; i < ARRAY_SIZE(data->temp); i++) { + int status; + + status = lm75_read_value(client, LM75_REG_TEMP[i]); + if (status < 0) + dev_dbg(&client->dev, "reg %d, err %d\n", + LM75_REG_TEMP[i], status); + else + data->temp[i] = status; + } data->last_updated = jiffies; data->valid = 1; } From aea7427f70cce5fa8f99ce447b213e9e3b49f24c Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Thu, 19 Jun 2008 16:29:39 -0700 Subject: [PATCH 09/65] ipv6: Remove options header when setsockopt's optlen is 0 Remove the sticky Hop-by-Hop options header by calling setsockopt() for IPV6_HOPOPTS with a zero option length, per RFC3542. Routing header and Destination options header does the same as Hop-by-Hop options header. Signed-off-by: Shan Wei Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index c042ce19bd14..86e28a75267f 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -345,18 +345,21 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; + + /* remove any sticky options header with a zero option + * length, per RFC3542. + */ if (optlen == 0) optval = NULL; + else if (optlen < sizeof(struct ipv6_opt_hdr) || + optlen & 0x7 || optlen > 8 * 255) + goto e_inval; /* hop-by-hop / destination options are privileged option */ retv = -EPERM; if (optname != IPV6_RTHDR && !capable(CAP_NET_RAW)) break; - if (optlen < sizeof(struct ipv6_opt_hdr) || - optlen & 0x7 || optlen > 8 * 255) - goto e_inval; - opt = ipv6_renew_options(sk, np->opt, optname, (struct ipv6_opt_hdr __user *)optval, optlen); From f630e43a215a3129d0c1173cae0bce6ea4855cf7 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Thu, 19 Jun 2008 16:33:57 -0700 Subject: [PATCH 10/65] ipv6: Drop packets for loopback address from outside of the box. [ Based upon original report and patch by Karsten Keil. Karsten has verified that this fixes the TAHI test case "ICMPv6 test v6LC.5.1.2 Part F". -DaveM ] Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 6 ++++++ net/ipv6/ip6_input.c | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index e0a612bc9c4e..f422f7218e1c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -367,6 +367,12 @@ static inline int ipv6_addr_any(const struct in6_addr *a) a->s6_addr32[2] | a->s6_addr32[3] ) == 0); } +static inline int ipv6_addr_loopback(const struct in6_addr *a) +{ + return ((a->s6_addr32[0] | a->s6_addr32[1] | + a->s6_addr32[2] | (a->s6_addr32[3] ^ htonl(1))) == 0); +} + static inline int ipv6_addr_v4mapped(const struct in6_addr *a) { return ((a->s6_addr32[0] | a->s6_addr32[1] | diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 4e5c8615832c..17eb48b8e329 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -102,6 +102,15 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; + /* + * RFC4291 2.5.3 + * A packet received on an interface with a destination address + * of loopback must be dropped. + */ + if (!(dev->flags & IFF_LOOPBACK) && + ipv6_addr_loopback(&hdr->daddr)) + goto err; + skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); From 8a8cde163ea724baf74e7752a31a69d3121a240e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:28 +0200 Subject: [PATCH 11/65] sched: rt: dont stop the period timer when there are tasks wanting to run So if the group ever gets throttled, it will never wake up again. Reported-by: "Daniel K." Signed-off-by: Peter Zijlstra Tested-by: Daniel K. Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1dad5bbb59b6..0f3c19197fa4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -250,7 +250,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; spin_unlock(&rt_rq->rt_runtime_lock); - } + } else if (rt_rq->rt_nr_running) + idle = 0; if (enqueue) sched_rt_rq_enqueue(rt_rq); From bb10ed0994927d433f6dbdf274fdb26cfcf516b7 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 19 Jun 2008 15:04:07 -0700 Subject: [PATCH 12/65] sched: fix wait_for_completion_timeout() spurious failure under heavy load It seems that the current implementaton of wait_for_completion_timeout() has a small problem under very high load for the common pattern: if (!wait_for_completion_timeout(&done, timeout)) /* handle failure */ because the implementation very roughly does (lots of code deleted to show the basic flow): static inline long __sched do_wait_for_common(struct completion *x, long timeout, int state) { if (x->done) return timeout; do { timeout = schedule_timeout(timeout); if (!timeout) return timeout; } while (!x->done); return timeout; } so if the system is very busy and x->done is not set when do_wait_for_common() is entered, it is possible that the first call to schedule_timeout() returns 0 because the task doing wait_for_completion doesn't get rescheduled for a long time, even if it is woken up early enough. In this case, wait_for_completion_timeout() returns 0 without even checking x->done again, and the code above falls into its failure case purely for scheduler reasons, even if the hardware event or whatever was being waited for happened early enough. It would make sense to add an extra test to do_wait_for() in the timeout case and return 1 if x->done is actually set. A quick audit (not exhaustive) of wait_for_completion_timeout() callers seems to indicate that no one actually cares about the return value in the success case -- they just test for 0 (timed out) versus non-zero (wait succeeded). Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 4a3cb0614158..577f160131bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4405,6 +4405,16 @@ do_wait_for_common(struct completion *x, long timeout, int state) spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); + + /* + * If the completion has arrived meanwhile + * then return 1 jiffy time left: + */ + if (x->done && !timeout) { + timeout = 1; + break; + } + if (!timeout) { __remove_wait_queue(&x->wait, &wait); return timeout; From 54481cf88bc59923ea30f2ca345a73c60155e901 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 19 Jun 2008 09:41:22 -0700 Subject: [PATCH 13/65] x86: fix NULL pointer deref in __switch_to MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I am able to reproduce the oops reported by Simon in __switch_to() with lguest. My debug showed that there is at least one lguest specific issue (which should be present in 2.6.25 and before aswell) and it got exposed with a kernel oops with the recent fpu dynamic allocation patches. In addition to the previous possible scenario (with fpu_counter), in the presence of lguest, it is possible that the cpu's TS bit it still set and the lguest launcher task's thread_info has TS_USEDFPU still set. This is because of the way the lguest launcher handling the guest's TS bit. (look at lguest_set_ts() in lguest_arch_run_guest()). This can result in a DNA fault while doing unlazy_fpu() in __switch_to(). This will end up causing a DNA fault in the context of new process thats getting context switched in (as opossed to handling DNA fault in the context of lguest launcher/helper process). This is wrong in both pre and post 2.6.25 kernels. In the recent 2.6.26-rc series, this is showing up as NULL pointer dereferences or sleeping function called from atomic context(__switch_to()), as we free and dynamically allocate the FPU context for the newly created threads. Older kernels might show some FPU corruption for processes running inside of lguest. With the appended patch, my test system is running for more than 50 mins now. So atleast some of your oops (hopefully all!) should get fixed. Please give it a try. I will spend more time with this fix tomorrow. Reported-by: Simon Holm Thøgersen Reported-by: Patrick McHardy Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- drivers/lguest/x86/core.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 5126d5d9ea0e..2e554a4ab337 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -176,7 +176,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * we set it now, so we can trap and pass that trap to the Guest if it * uses the FPU. */ if (cpu->ts) - lguest_set_ts(); + unlazy_fpu(current); /* SYSENTER is an optimized way of doing system calls. We can't allow * it because it always jumps to privilege level 0. A normal Guest @@ -196,6 +196,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * trap made the switcher code come back, and an error code which some * traps set. */ + /* Restore SYSENTER if it's supposed to be on. */ + if (boot_cpu_has(X86_FEATURE_SEP)) + wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); + /* If the Guest page faulted, then the cr2 register will tell us the * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite @@ -203,13 +207,12 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) if (cpu->regs->trapnum == 14) cpu->arch.last_pagefault = read_cr2(); /* Similarly, if we took a trap because the Guest used the FPU, - * we have to restore the FPU it expects to see. */ + * we have to restore the FPU it expects to see. + * math_state_restore() may sleep and we may even move off to + * a different CPU. So all the critical stuff should be done + * before this. */ else if (cpu->regs->trapnum == 7) math_state_restore(); - - /* Restore SYSENTER if it's supposed to be on. */ - if (boot_cpu_has(X86_FEATURE_SEP)) - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); } /*H:130 Now we've examined the hypercall code; our Guest can make requests. From 46539383791a0e59a4af7412056dfbfc5240af0a Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Mon, 16 Jun 2008 14:58:13 -0700 Subject: [PATCH 14/65] xen: Use wmb instead of rmb in xen_evtchn_do_upcall(). This patch is ported one from 534:77db69c38249 of linux-2.6.18-xen.hg. Use wmb instead of rmb to enforce ordering between evtchn_upcall_pending and evtchn_pending_sel stores in xen_evtchn_do_upcall(). Cc: Samuel Thibault Signed-off-by: Isaku Yamahata Cc: Nick Piggin Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar --- drivers/xen/events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 4f0f22b020ea..76e5b7386af9 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -529,7 +529,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ /* Clear master flag /before/ clearing selector flag. */ - rmb(); + wmb(); #endif pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); while (pending_words != 0) { From 05345b0f006ac226d0d25d48fcb2d792ac44a071 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 15:01:53 -0700 Subject: [PATCH 15/65] xen: mask unwanted pte bits in __supported_pte_mask [ Stable: this isn't a bugfix in itself, but it's a pre-requiste for "xen: don't drop NX bit" ] Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 5 +++++ arch/x86/xen/mmu.c | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e457d61..c048de34d6a1 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1228,6 +1228,11 @@ asmlinkage void __init xen_start_kernel(void) if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!is_initial_xendomain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + /* set the limit of our address space */ xen_reserve_top(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3525ef523a74..3f2a67fe6ad6 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -199,10 +199,8 @@ pgdval_t xen_pgd_val(pgd_t pgd) pte_t xen_make_pte(pteval_t pte) { - if (pte & _PAGE_PRESENT) { + if (pte & _PAGE_PRESENT) pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } return (pte_t){ .pte = pte }; } From ebb9cfe20fe167f29960a5e913193a684fac50bf Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 15:01:56 -0700 Subject: [PATCH 16/65] xen: don't drop NX bit Because NX is now enforced properly, we must put the hypercall page into the .text segment so that it is executable. Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 56 +++++++++++++++++++++++------------------ arch/x86/xen/xen-head.S | 2 +- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3f2a67fe6ad6..265601d5a6ae 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -179,46 +179,54 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, preempt_enable(); } +/* Assume pteval_t is equivalent to all the other *val_t types. */ +static pteval_t pte_mfn_to_pfn(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_MASK; + val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; + } + + return val; +} + +static pteval_t pte_pfn_to_mfn(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_MASK; + val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + } + + return val; +} + pteval_t xen_pte_val(pte_t pte) { - pteval_t ret = pte.pte; - - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - - return ret; + return pte_mfn_to_pfn(pte.pte); } pgdval_t xen_pgd_val(pgd_t pgd) { - pgdval_t ret = pgd.pgd; - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; + return pte_mfn_to_pfn(pgd.pgd); } pte_t xen_make_pte(pteval_t pte) { - if (pte & _PAGE_PRESENT) - pte = phys_to_machine(XPADDR(pte)).maddr; - - return (pte_t){ .pte = pte }; + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); } pgd_t xen_make_pgd(pgdval_t pgd) { - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; + pgd = pte_pfn_to_mfn(pgd); + return native_make_pgd(pgd); } pmdval_t xen_pmd_val(pmd_t pmd) { - pmdval_t ret = native_pmd_val(pmd); - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; + return pte_mfn_to_pfn(pmd.pmd); } #ifdef CONFIG_X86_PAE void xen_set_pud(pud_t *ptr, pud_t val) @@ -265,9 +273,7 @@ void xen_pmd_clear(pmd_t *pmdp) pmd_t xen_make_pmd(pmdval_t pmd) { - if (pmd & _PAGE_PRESENT) - pmd = phys_to_machine(XPADDR(pmd)).maddr; - + pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); } #else /* !PAE */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 288d587ce73c..3175e973fd0d 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -17,7 +17,7 @@ ENTRY(startup_xen) __FINIT -.pushsection .bss.page_aligned +.pushsection .text .align PAGE_SIZE_asm ENTRY(hypercall_page) .skip 0x1000 From ea71a546706dfdad72462624394e1e472c6bf34f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 20 Jun 2008 18:32:20 +0400 Subject: [PATCH 17/65] sched: refactor wait_for_completion_timeout() Simplify the code and fix the boundary condition of wait_for_completion_timeout(,0). We can kill the first __remove_wait_queue() as well. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra --- kernel/sched.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 577f160131bd..bebf9788f45e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4398,32 +4398,20 @@ do_wait_for_common(struct completion *x, long timeout, int state) signal_pending(current)) || (state == TASK_KILLABLE && fatal_signal_pending(current))) { - __remove_wait_queue(&x->wait, &wait); - return -ERESTARTSYS; + timeout = -ERESTARTSYS; + break; } __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); - - /* - * If the completion has arrived meanwhile - * then return 1 jiffy time left: - */ - if (x->done && !timeout) { - timeout = 1; - break; - } - - if (!timeout) { - __remove_wait_queue(&x->wait, &wait); - return timeout; - } - } while (!x->done); + } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; } x->done--; - return timeout; + return timeout ?: 1; } static long __sched From 2856922c158605514ec5974a03097eaec91f4c0d Mon Sep 17 00:00:00 2001 From: Frederic Bohe Date: Fri, 20 Jun 2008 11:48:48 -0400 Subject: [PATCH 18/65] Ext4: Fix online resize block group descriptor corruption This is the patch for the group descriptor table corruption during online resize pointed out by Theodore Tso. The problem was caused by the fact that the ext4 group descriptor can be either 32 or 64 bytes long. Only the 64 bytes structure was taken into account. Signed-off-by: Frederic Bohe Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9ecb92f68543..9ff7b1c04239 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -855,7 +855,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) */ /* Update group descriptor block for new group */ - gdp = (struct ext4_group_desc *)primary->b_data + gdb_off; + gdp = (struct ext4_group_desc *)((char *)primary->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ From e0c6d97c65e0784aade7e97b9411f245a6c543e7 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 20 Jun 2008 12:02:00 -0700 Subject: [PATCH 19/65] [IA64] SN2: security hole in sn2_ptc_proc_write Security hole in sn2_ptc_proc_write It is possible to overrun a buffer with a write to this /proc file. Signed-off-by: Cliff Wickman Signed-off-by: Tony Luck --- arch/ia64/sn/kernel/sn2/sn2_smp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index 49d3120415eb..6dd886c5d860 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -512,6 +512,8 @@ static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, si int cpu; char optstr[64]; + if (count > sizeof(optstr)) + return -EINVAL; if (copy_from_user(optstr, user, count)) return -EFAULT; optstr[count - 1] = '\0'; From 9267b4b3880d00dc2dab90f1d817c856939114f7 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Sat, 21 Jun 2008 03:25:39 +0400 Subject: [PATCH 20/65] alpha: fix module load failures on smp (bug #10926) To calculate addresses of locally defined variables, GCC uses 32-bit displacement from the GP. Which doesn't work for per cpu variables in modules, as an offset to the kernel per cpu area is way above 4G. The workaround is to force allocation of a GOT entry for per cpu variable using ldq instruction with a 'literal' relocation. I had to use custom asm/percpu.h, as a required argument magic doesn't work with asm-generic/percpu.h macros. Signed-off-by: Ivan Kokshaysky Signed-off-by: Linus Torvalds --- include/asm-alpha/percpu.h | 72 +++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/include/asm-alpha/percpu.h b/include/asm-alpha/percpu.h index 48348fe34c19..82e8a94b4b2f 100644 --- a/include/asm-alpha/percpu.h +++ b/include/asm-alpha/percpu.h @@ -1,6 +1,76 @@ #ifndef __ALPHA_PERCPU_H #define __ALPHA_PERCPU_H +#include +#include -#include +/* + * Determine the real variable name from the name visible in the + * kernel sources. + */ +#define per_cpu_var(var) per_cpu__##var + +#ifdef CONFIG_SMP + +/* + * per_cpu_offset() is the offset that has to be added to a + * percpu variable to get to the instance for a certain processor. + */ +extern unsigned long __per_cpu_offset[NR_CPUS]; + +#define per_cpu_offset(x) (__per_cpu_offset[x]) + +#define __my_cpu_offset per_cpu_offset(raw_smp_processor_id()) +#ifdef CONFIG_DEBUG_PREEMPT +#define my_cpu_offset per_cpu_offset(smp_processor_id()) +#else +#define my_cpu_offset __my_cpu_offset +#endif + +#ifndef MODULE +#define SHIFT_PERCPU_PTR(var, offset) RELOC_HIDE(&per_cpu_var(var), (offset)) +#define PER_CPU_ATTRIBUTES +#else +/* + * To calculate addresses of locally defined variables, GCC uses 32-bit + * displacement from the GP. Which doesn't work for per cpu variables in + * modules, as an offset to the kernel per cpu area is way above 4G. + * + * This forces allocation of a GOT entry for per cpu variable using + * ldq instruction with a 'literal' relocation. + */ +#define SHIFT_PERCPU_PTR(var, offset) ({ \ + extern int simple_identifier_##var(void); \ + unsigned long __ptr, tmp_gp; \ + asm ( "br %1, 1f \n\ + 1: ldgp %1, 0(%1) \n\ + ldq %0, per_cpu__" #var"(%1)\t!literal" \ + : "=&r"(__ptr), "=&r"(tmp_gp)); \ + (typeof(&per_cpu_var(var)))(__ptr + (offset)); }) + +#define PER_CPU_ATTRIBUTES __used + +#endif /* MODULE */ + +/* + * A percpu variable may point to a discarded regions. The following are + * established ways to produce a usable pointer from the percpu variable + * offset. + */ +#define per_cpu(var, cpu) \ + (*SHIFT_PERCPU_PTR(var, per_cpu_offset(cpu))) +#define __get_cpu_var(var) \ + (*SHIFT_PERCPU_PTR(var, my_cpu_offset)) +#define __raw_get_cpu_var(var) \ + (*SHIFT_PERCPU_PTR(var, __my_cpu_offset)) + +#else /* ! SMP */ + +#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var))) +#define __get_cpu_var(var) per_cpu_var(var) +#define __raw_get_cpu_var(var) per_cpu_var(var) + +#endif /* SMP */ + +#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name) #endif /* __ALPHA_PERCPU_H */ From ede426923b25414f5ec9c00fefe6727d9721dd13 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Sat, 21 Jun 2008 03:26:21 +0400 Subject: [PATCH 21/65] alpha: link failure fix With built-in scsi disk driver, the final link fails with a following error: `.exit.text' referenced in section `.rodata' of drivers/built-in.o: defined in discarded section `.exit.text' of drivers/built-in.o This happens with -Os (CONFIG_CC_OPTIMIZE_FOR_SIZE=y) with all gcc-4 versions, and also with -O2 and gcc-4.3. The problem is in sd.c:sd_major() being inlined into __exit function exit_sd(), and the compiler generating a jump table in .rodata section for the 'switch' statement in sd_major(). So we have references to discarded section. Fixed with a big hammer in the form of -fno-jump-tables. Note that jump tables vs. discarded sections is a generic problem, other architectures are just lucky not to suffer from it. But with a slightly more complex switch/case statement it can be reproduced on x86 as well. So maybe at some point we should consider -fno-jump-tables as a generic compile option... Signed-off-by: Ivan Kokshaysky Signed-off-by: Linus Torvalds --- arch/alpha/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/alpha/Makefile b/arch/alpha/Makefile index 4e1a8e2c4541..4759fe751aa1 100644 --- a/arch/alpha/Makefile +++ b/arch/alpha/Makefile @@ -13,6 +13,7 @@ NM := $(NM) -B LDFLAGS_vmlinux := -static -N #-relax CHECKFLAGS += -D__alpha__ -m64 cflags-y := -pipe -mno-fp-regs -ffixed-8 -msmall-data +cflags-y += $(call cc-option, -fno-jump-tables) cpuflags-$(CONFIG_ALPHA_EV4) := -mcpu=ev4 cpuflags-$(CONFIG_ALPHA_EV5) := -mcpu=ev5 From d559d4a24a3fed75bd890abcc1f95cd8d8dad6e1 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Sat, 21 Jun 2008 03:28:31 +0400 Subject: [PATCH 22/65] alpha: fix compile failures with gcc-4.3 (bug #10438) Vast majority of these build failures are gcc-4.3 warnings about static functions and objects being referenced from non-static (read: "extern inline") functions, in conjunction with our -Werror. We cannot just convert "extern inline" to "static inline", as people keep suggesting all the time, because "extern inline" logic is crucial for generic kernel build. So - just make sure that all callees of critical "extern inline" functions are also "extern inline"; - use "static inline", wherever it's possible. traps.c: work around gcc-4.3 being too smart about array bounds-checking. TODO: add "gnu_inline" attribute to all our "extern inline" functions to ensure desired behaviour with future compilers. Signed-off-by: Ivan Kokshaysky Signed-off-by: Linus Torvalds --- arch/alpha/kernel/core_t2.c | 2 ++ arch/alpha/kernel/traps.c | 3 ++- include/asm-alpha/core_mcpcia.h | 2 +- include/asm-alpha/core_t2.h | 14 +++++++------- include/asm-alpha/io.h | 6 +++--- include/asm-alpha/mmu_context.h | 6 +++--- include/asm-alpha/system.h | 10 +++++----- include/asm-alpha/vga.h | 6 +++--- 8 files changed, 26 insertions(+), 23 deletions(-) diff --git a/arch/alpha/kernel/core_t2.c b/arch/alpha/kernel/core_t2.c index c0750291b44a..d9980d47ab81 100644 --- a/arch/alpha/kernel/core_t2.c +++ b/arch/alpha/kernel/core_t2.c @@ -74,6 +74,8 @@ # define DBG(args) #endif +DEFINE_SPINLOCK(t2_hae_lock); + static volatile unsigned int t2_mcheck_any_expected; static volatile unsigned int t2_mcheck_last_taken; diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index dc57790250d2..c778779007fc 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -447,7 +447,7 @@ struct unaligned_stat { /* Macro for exception fixup code to access integer registers. */ -#define una_reg(r) (regs->regs[(r) >= 16 && (r) <= 18 ? (r)+19 : (r)]) +#define una_reg(r) (_regs[(r) >= 16 && (r) <= 18 ? (r)+19 : (r)]) asmlinkage void @@ -456,6 +456,7 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg, { long error, tmp1, tmp2, tmp3, tmp4; unsigned long pc = regs->pc - 4; + unsigned long *_regs = regs->regs; const struct exception_table_entry *fixup; unaligned[0].count++; diff --git a/include/asm-alpha/core_mcpcia.h b/include/asm-alpha/core_mcpcia.h index 525b4f6a7ace..acf55b483472 100644 --- a/include/asm-alpha/core_mcpcia.h +++ b/include/asm-alpha/core_mcpcia.h @@ -261,7 +261,7 @@ struct el_MCPCIA_uncorrected_frame_mcheck { } #endif -static inline int __mcpcia_is_mmio(unsigned long addr) +extern inline int __mcpcia_is_mmio(unsigned long addr) { return (addr & 0x80000000UL) == 0; } diff --git a/include/asm-alpha/core_t2.h b/include/asm-alpha/core_t2.h index 90e6b5d6c214..46bfff58f670 100644 --- a/include/asm-alpha/core_t2.h +++ b/include/asm-alpha/core_t2.h @@ -356,13 +356,13 @@ struct el_t2_frame_corrected { #define vip volatile int * #define vuip volatile unsigned int * -static inline u8 t2_inb(unsigned long addr) +extern inline u8 t2_inb(unsigned long addr) { long result = *(vip) ((addr << 5) + T2_IO + 0x00); return __kernel_extbl(result, addr & 3); } -static inline void t2_outb(u8 b, unsigned long addr) +extern inline void t2_outb(u8 b, unsigned long addr) { unsigned long w; @@ -371,13 +371,13 @@ static inline void t2_outb(u8 b, unsigned long addr) mb(); } -static inline u16 t2_inw(unsigned long addr) +extern inline u16 t2_inw(unsigned long addr) { long result = *(vip) ((addr << 5) + T2_IO + 0x08); return __kernel_extwl(result, addr & 3); } -static inline void t2_outw(u16 b, unsigned long addr) +extern inline void t2_outw(u16 b, unsigned long addr) { unsigned long w; @@ -386,12 +386,12 @@ static inline void t2_outw(u16 b, unsigned long addr) mb(); } -static inline u32 t2_inl(unsigned long addr) +extern inline u32 t2_inl(unsigned long addr) { return *(vuip) ((addr << 5) + T2_IO + 0x18); } -static inline void t2_outl(u32 b, unsigned long addr) +extern inline void t2_outl(u32 b, unsigned long addr) { *(vuip) ((addr << 5) + T2_IO + 0x18) = b; mb(); @@ -435,7 +435,7 @@ static inline void t2_outl(u32 b, unsigned long addr) set_hae(msb); \ } -static DEFINE_SPINLOCK(t2_hae_lock); +extern spinlock_t t2_hae_lock; /* * NOTE: take T2_DENSE_MEM off in each readX/writeX routine, since diff --git a/include/asm-alpha/io.h b/include/asm-alpha/io.h index 38f18cf18c9d..e971ab000f95 100644 --- a/include/asm-alpha/io.h +++ b/include/asm-alpha/io.h @@ -35,7 +35,7 @@ * register not being up-to-date with respect to the hardware * value. */ -static inline void __set_hae(unsigned long new_hae) +extern inline void __set_hae(unsigned long new_hae) { unsigned long flags; local_irq_save(flags); @@ -49,7 +49,7 @@ static inline void __set_hae(unsigned long new_hae) local_irq_restore(flags); } -static inline void set_hae(unsigned long new_hae) +extern inline void set_hae(unsigned long new_hae) { if (new_hae != alpha_mv.hae_cache) __set_hae(new_hae); @@ -176,7 +176,7 @@ REMAP2(u64, writeq, volatile) #undef REMAP1 #undef REMAP2 -static inline void __iomem *generic_ioportmap(unsigned long a) +extern inline void __iomem *generic_ioportmap(unsigned long a) { return alpha_mv.mv_ioportmap(a); } diff --git a/include/asm-alpha/mmu_context.h b/include/asm-alpha/mmu_context.h index 6a5be1f7debf..86c08a02d239 100644 --- a/include/asm-alpha/mmu_context.h +++ b/include/asm-alpha/mmu_context.h @@ -23,7 +23,7 @@ #endif -extern inline unsigned long +static inline unsigned long __reload_thread(struct pcb_struct *pcb) { register unsigned long a0 __asm__("$16"); @@ -114,7 +114,7 @@ extern unsigned long last_asn; #define __MMU_EXTERN_INLINE #endif -static inline unsigned long +extern inline unsigned long __get_new_mm_context(struct mm_struct *mm, long cpu) { unsigned long asn = cpu_last_asn(cpu); @@ -226,7 +226,7 @@ ev4_activate_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm) # endif #endif -extern inline int +static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { int i; diff --git a/include/asm-alpha/system.h b/include/asm-alpha/system.h index ed221d6408fc..afe20fa58c99 100644 --- a/include/asm-alpha/system.h +++ b/include/asm-alpha/system.h @@ -184,7 +184,7 @@ enum amask_enum { __amask; }) #define __CALL_PAL_R0(NAME, TYPE) \ -static inline TYPE NAME(void) \ +extern inline TYPE NAME(void) \ { \ register TYPE __r0 __asm__("$0"); \ __asm__ __volatile__( \ @@ -196,7 +196,7 @@ static inline TYPE NAME(void) \ } #define __CALL_PAL_W1(NAME, TYPE0) \ -static inline void NAME(TYPE0 arg0) \ +extern inline void NAME(TYPE0 arg0) \ { \ register TYPE0 __r16 __asm__("$16") = arg0; \ __asm__ __volatile__( \ @@ -207,7 +207,7 @@ static inline void NAME(TYPE0 arg0) \ } #define __CALL_PAL_W2(NAME, TYPE0, TYPE1) \ -static inline void NAME(TYPE0 arg0, TYPE1 arg1) \ +extern inline void NAME(TYPE0 arg0, TYPE1 arg1) \ { \ register TYPE0 __r16 __asm__("$16") = arg0; \ register TYPE1 __r17 __asm__("$17") = arg1; \ @@ -219,7 +219,7 @@ static inline void NAME(TYPE0 arg0, TYPE1 arg1) \ } #define __CALL_PAL_RW1(NAME, RTYPE, TYPE0) \ -static inline RTYPE NAME(TYPE0 arg0) \ +extern inline RTYPE NAME(TYPE0 arg0) \ { \ register RTYPE __r0 __asm__("$0"); \ register TYPE0 __r16 __asm__("$16") = arg0; \ @@ -232,7 +232,7 @@ static inline RTYPE NAME(TYPE0 arg0) \ } #define __CALL_PAL_RW2(NAME, RTYPE, TYPE0, TYPE1) \ -static inline RTYPE NAME(TYPE0 arg0, TYPE1 arg1) \ +extern inline RTYPE NAME(TYPE0 arg0, TYPE1 arg1) \ { \ register RTYPE __r0 __asm__("$0"); \ register TYPE0 __r16 __asm__("$16") = arg0; \ diff --git a/include/asm-alpha/vga.h b/include/asm-alpha/vga.h index e8df1e7aae6b..c00106bac521 100644 --- a/include/asm-alpha/vga.h +++ b/include/asm-alpha/vga.h @@ -13,7 +13,7 @@ #define VT_BUF_HAVE_MEMSETW #define VT_BUF_HAVE_MEMCPYW -extern inline void scr_writew(u16 val, volatile u16 *addr) +static inline void scr_writew(u16 val, volatile u16 *addr) { if (__is_ioaddr(addr)) __raw_writew(val, (volatile u16 __iomem *) addr); @@ -21,7 +21,7 @@ extern inline void scr_writew(u16 val, volatile u16 *addr) *addr = val; } -extern inline u16 scr_readw(volatile const u16 *addr) +static inline u16 scr_readw(volatile const u16 *addr) { if (__is_ioaddr(addr)) return __raw_readw((volatile const u16 __iomem *) addr); @@ -29,7 +29,7 @@ extern inline u16 scr_readw(volatile const u16 *addr) return *addr; } -extern inline void scr_memsetw(u16 *s, u16 c, unsigned int count) +static inline void scr_memsetw(u16 *s, u16 c, unsigned int count) { if (__is_ioaddr(s)) memsetw_io((u16 __iomem *) s, c, count); From a744e0160ac5804b763449aa34d3991dc21af0be Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Sat, 21 Jun 2008 03:28:54 +0400 Subject: [PATCH 23/65] alpha: resurrect Cypress IDE quirk Which was removed in the hope that generic legacy IDE quirk in drivers/pci/probe.c is sufficient for Cypress IDE. It isn't, as this controller has non-standard BAR layout: secondary channel registers are in the BAR0-1 of the second PCI function - not in the BAR2-3 of the same function, as the generic quirk routine assumes. Signed-off-by: Ivan Kokshaysky Signed-off-by: Linus Torvalds --- arch/alpha/kernel/pci.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index 36ab22a7ea12..5cf45fc51343 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -71,6 +71,23 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82378, quirk_i static void __init quirk_cypress(struct pci_dev *dev) { + /* The Notorious Cy82C693 chip. */ + + /* The generic legacy mode IDE fixup in drivers/pci/probe.c + doesn't work correctly with the Cypress IDE controller as + it has non-standard register layout. Fix that. */ + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE) { + dev->resource[2].start = dev->resource[3].start = 0; + dev->resource[2].end = dev->resource[3].end = 0; + dev->resource[2].flags = dev->resource[3].flags = 0; + if (PCI_FUNC(dev->devfn) == 2) { + dev->resource[0].start = 0x170; + dev->resource[0].end = 0x177; + dev->resource[1].start = 0x376; + dev->resource[1].end = 0x376; + } + } + /* The Cypress bridge responds on the PCI bus in the address range 0xffff0000-0xffffffff (conventional x86 BIOS ROM). There is no way to turn this off. The bridge also supports several extended From 2645a3c3761ac25498db2e627271016c849c68e1 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 20 Jun 2008 21:58:02 -0700 Subject: [PATCH 24/65] pppoe: warning fix Fix warning: drivers/net/pppoe.c: In function 'pppoe_recvmsg': drivers/net/pppoe.c:945: warning: comparison of distinct pointer types lacks a cast because skb->len is unsigned int and total_len is size_t Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/pppoe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c index bafb69b6f7cb..fc6f4b8c64b3 100644 --- a/drivers/net/pppoe.c +++ b/drivers/net/pppoe.c @@ -942,7 +942,7 @@ static int pppoe_recvmsg(struct kiocb *iocb, struct socket *sock, m->msg_namelen = 0; if (skb) { - total_len = min(total_len, skb->len); + total_len = min_t(size_t, total_len, skb->len); error = skb_copy_datagram_iovec(skb, 0, m->msg_iov, total_len); if (error == 0) error = total_len; From 735ce972fbc8a65fb17788debd7bbe7b4383cc62 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 20 Jun 2008 22:04:34 -0700 Subject: [PATCH 25/65] sctp: Make sure N * sizeof(union sctp_addr) does not overflow. As noticed by Gabriel Campana, the kmalloc() length arg passed in by sctp_getsockopt_local_addrs_old() can overflow if ->addr_num is large enough. Therefore, enforce an appropriate limit. Signed-off-by: David S. Miller --- net/sctp/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e7e3baf7009e..0dbcde6758ea 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4401,7 +4401,9 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len, if (copy_from_user(&getaddrs, optval, len)) return -EFAULT; - if (getaddrs.addr_num <= 0) return -EINVAL; + if (getaddrs.addr_num <= 0 || + getaddrs.addr_num >= (INT_MAX / sizeof(union sctp_addr))) + return -EINVAL; /* * For UDP-style sockets, id specifies the association to query. * If the id field is set to the value '0' then the locally bound From b9f75f45a6b46a0ab4eb0857d437a0845871f314 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Jun 2008 22:16:51 -0700 Subject: [PATCH 26/65] netns: Don't receive new packets in a dead network namespace. Alexey Dobriyan writes: > Subject: ICMP sockets destruction vs ICMP packets oops > After icmp_sk_exit() nuked ICMP sockets, we get an interrupt. > icmp_reply() wants ICMP socket. > > Steps to reproduce: > > launch shell in new netns > move real NIC to netns > setup routing > ping -i 0 > exit from shell > > BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 > IP: [] icmp_sk+0x17/0x30 > PGD 17f3cd067 PUD 17f3ce067 PMD 0 > Oops: 0000 [1] PREEMPT SMP DEBUG_PAGEALLOC > CPU 0 > Modules linked in: usblp usbcore > Pid: 0, comm: swapper Not tainted 2.6.26-rc6-netns-ct #4 > RIP: 0010:[] [] icmp_sk+0x17/0x30 > RSP: 0018:ffffffff8057fc30 EFLAGS: 00010286 > RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff81017c7db900 > RDX: 0000000000000034 RSI: ffff81017c7db900 RDI: ffff81017dc41800 > RBP: ffffffff8057fc40 R08: 0000000000000001 R09: 000000000000a815 > R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff8057fd28 > R13: ffffffff8057fd00 R14: ffff81017c7db938 R15: ffff81017dc41800 > FS: 0000000000000000(0000) GS:ffffffff80525000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b > CR2: 0000000000000000 CR3: 000000017fcda000 CR4: 00000000000006e0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > Process swapper (pid: 0, threadinfo ffffffff8053a000, task ffffffff804fa4a0) > Stack: 0000000000000000 ffff81017c7db900 ffffffff8057fcf0 ffffffff803fcfe4 > ffffffff804faa38 0000000000000246 0000000000005a40 0000000000000246 > 000000000001ffff ffff81017dd68dc0 0000000000005a40 0000000055342436 > Call Trace: > [] icmp_reply+0x44/0x1e0 > [] ? ip_route_input+0x23a/0x1360 > [] icmp_echo+0x65/0x70 > [] icmp_rcv+0x180/0x1b0 > [] ip_local_deliver+0xf4/0x1f0 > [] ip_rcv+0x33b/0x650 > [] netif_receive_skb+0x27a/0x340 > [] process_backlog+0x9d/0x100 > [] net_rx_action+0x18d/0x250 > [] __do_softirq+0x75/0x100 > [] call_softirq+0x1c/0x30 > [] do_softirq+0x65/0xa0 > [] irq_exit+0x97/0xa0 > [] do_IRQ+0xa8/0x130 > [] ? mwait_idle+0x0/0x60 > [] ret_from_intr+0x0/0xf > [] ? mwait_idle+0x4c/0x60 > [] ? mwait_idle+0x43/0x60 > [] ? cpu_idle+0x57/0xa0 > [] ? rest_init+0x70/0x80 > Code: 10 5b 41 5c 41 5d 41 5e c9 c3 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 53 > 48 83 ec 08 48 8b 9f 78 01 00 00 e8 2b c7 f1 ff 89 c0 <48> 8b 04 c3 48 83 c4 08 > 5b c9 c3 66 66 66 66 66 2e 0f 1f 84 00 > RIP [] icmp_sk+0x17/0x30 > RSP > CR2: 0000000000000000 > ---[ end trace ea161157b76b33e8 ]--- > Kernel panic - not syncing: Aiee, killing interrupt handler! Receiving packets while we are cleaning up a network namespace is a racy proposition. It is possible when the packet arrives that we have removed some but not all of the state we need to fully process it. We have the choice of either playing wack-a-mole with the cleanup routines or simply dropping packets when we don't have a network namespace to handle them. Since the check looks inexpensive in netif_receive_skb let's just drop the incoming packets. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/net/net_namespace.h | 11 +++++++++++ net/core/dev.c | 4 ++++ net/core/net_namespace.c | 3 +++ 3 files changed, 18 insertions(+) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index aa540e6be502..d9dd0f707296 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -95,6 +95,11 @@ extern struct list_head net_namespace_list; #ifdef CONFIG_NET_NS extern void __put_net(struct net *net); +static inline int net_alive(struct net *net) +{ + return net && atomic_read(&net->count); +} + static inline struct net *get_net(struct net *net) { atomic_inc(&net->count); @@ -125,6 +130,12 @@ int net_eq(const struct net *net1, const struct net *net2) return net1 == net2; } #else + +static inline int net_alive(struct net *net) +{ + return 1; +} + static inline struct net *get_net(struct net *net) { return net; diff --git a/net/core/dev.c b/net/core/dev.c index 68d8df0992ab..c421a1f8f0b9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2077,6 +2077,10 @@ int netif_receive_skb(struct sk_buff *skb) rcu_read_lock(); + /* Don't receive packets in an exiting network namespace */ + if (!net_alive(dev_net(skb->dev))) + goto out; + #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 72b4c184dd84..7c52fe277b62 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -140,6 +140,9 @@ static void cleanup_net(struct work_struct *work) struct pernet_operations *ops; struct net *net; + /* Be very certain incoming network packets will not find us */ + rcu_barrier(); + net = container_of(work, struct net, work); mutex_lock(&net_mutex); From 71c2742f5e6348d76ee62085cf0a13e5eff0f00e Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Sat, 21 Jun 2008 19:01:02 +0200 Subject: [PATCH 27/65] Add return value to reserve_bootmem_node() This patch changes the function reserve_bootmem_node() from void to int, returning -ENOMEM if the allocation fails. This fixes a build problem on x86 with CONFIG_KEXEC=y and CONFIG_NEED_MULTIPLE_NODES=y Signed-off-by: Bernhard Walle Reported-by: Adrian Bunk Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 2 +- mm/bootmem.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 6a5dbdc8a7dc..686895bacd9d 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -94,7 +94,7 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn); -extern void reserve_bootmem_node(pg_data_t *pgdat, +extern int reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags); diff --git a/mm/bootmem.c b/mm/bootmem.c index e8fb927392b9..8d9f60e06f62 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -442,15 +442,17 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); } -void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, +int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { int ret; ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); if (ret < 0) - return; + return -ENOMEM; reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); + + return 0; } void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, From 481c5346d0981940ee63037eb53e4e37b0735c10 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 21 Jun 2008 16:46:35 -0700 Subject: [PATCH 28/65] Slab: Fix memory leak in fallback_alloc() The zonelist patches caused the loop that checks for available objects in permitted zones to not terminate immediately. One object per zone per allocation may be allocated and then abandoned. Break the loop when we have successfully allocated one object. Signed-off-by: Christoph Lameter Signed-off-by: Linus Torvalds --- mm/slab.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/slab.c b/mm/slab.c index 06236e4ddc1b..046607f05f3e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3263,9 +3263,12 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) if (cpuset_zone_allowed_hardwall(zone, flags) && cache->nodelists[nid] && - cache->nodelists[nid]->free_objects) + cache->nodelists[nid]->free_objects) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); + if (obj) + break; + } } if (!obj) { From 55d8538498f62ec72b5ba67aa386c7726f630475 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 Jun 2008 12:23:15 -0700 Subject: [PATCH 29/65] Fix performance regression on lmbench select benchmark Christian Borntraeger reported that reinstating cond_resched() with CONFIG_PREEMPT caused a performance regression on lmbench: For example select file 500: 23 microseconds 32 microseconds and that's really because we totally unnecessarily do the cond_resched() in the innermost loop of select(), which is just silly. This moves it out from the innermost loop (which only ever loops ove the bits in a single "unsigned long" anyway), which makes the performance regression go away. Reported-and-tested-by: Christian Borntraeger Signed-off-by: Linus Torvalds --- fs/select.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/select.c b/fs/select.c index 8dda969614a9..da0e88201c3a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -249,7 +249,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) retval++; } } - cond_resched(); } if (res_in) *rinp = res_in; @@ -257,6 +256,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) *routp = res_out; if (res_ex) *rexp = res_ex; + cond_resched(); } wait = NULL; if (retval || !*timeout || signal_pending(current)) From 44e051773da465f8c92127914bc784770e0e2a28 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 23 Jun 2008 11:54:05 +0200 Subject: [PATCH 30/65] ALSA: aw2 - Fix Oops at initialization The irq handler may be called before the proper initialization of hardware. Call snd_aw2_saa7146_setup() before the irq handler registration. Signed-off-by: Takashi Iwai --- sound/pci/aw2/aw2-alsa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/aw2/aw2-alsa.c b/sound/pci/aw2/aw2-alsa.c index 56f87cd33c19..3f00ddf450f8 100644 --- a/sound/pci/aw2/aw2-alsa.c +++ b/sound/pci/aw2/aw2-alsa.c @@ -316,6 +316,8 @@ static int __devinit snd_aw2_create(struct snd_card *card, return -ENOMEM; } + /* (2) initialization of the chip hardware */ + snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt); if (request_irq(pci->irq, snd_aw2_saa7146_interrupt, IRQF_SHARED, "Audiowerk2", chip)) { @@ -329,8 +331,6 @@ static int __devinit snd_aw2_create(struct snd_card *card, } chip->irq = pci->irq; - /* (2) initialization of the chip hardware */ - snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt); err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops); if (err < 0) { free_irq(chip->irq, (void *)chip); From 3e14b50dd4a3178f4f635267a2706b5d4f8c61ee Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 23 Jun 2008 11:58:06 +0200 Subject: [PATCH 31/65] ALSA: sb - Fix wrong assertions snd_assert() in save_mixer() and restore_mixer() in sb_mixer.c is just wrong. The debug code wasn't tested at all, obviously... Signed-off-by: Takashi Iwai --- sound/isa/sb/sb_mixer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/isa/sb/sb_mixer.c b/sound/isa/sb/sb_mixer.c index 91d14224f6b3..73d4572d136b 100644 --- a/sound/isa/sb/sb_mixer.c +++ b/sound/isa/sb/sb_mixer.c @@ -925,7 +925,7 @@ static unsigned char als4000_saved_regs[] = { static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) { unsigned char *val = chip->saved_regs; - snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return); + snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return); for (; num_regs; num_regs--) *val++ = snd_sbmixer_read(chip, *regs++); } @@ -933,7 +933,7 @@ static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) static void restore_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) { unsigned char *val = chip->saved_regs; - snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return); + snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return); for (; num_regs; num_regs--) snd_sbmixer_write(chip, *regs++, *val++); } From 1b7558e457ed0de61023cfc913d2c342c7c3d9f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 23 Jun 2008 11:21:58 +0200 Subject: [PATCH 32/65] futexes: fix fault handling in futex_lock_pi This patch addresses a very sporadic pi-futex related failure in highly threaded java apps on large SMP systems. David Holmes reported that the pi_state consistency check in lookup_pi_state triggered with his test application. This means that the kernel internal pi_state and the user space futex variable are out of sync. First we assumed that this is a user space data corruption, but deeper investigation revieled that the problem happend because the pi-futex code is not handling a fault in the futex_lock_pi path when the user space variable needs to be fixed up. The fault happens when a fork mapped the anon memory which contains the futex readonly for COW or the page got swapped out exactly between the unlock of the futex and the return of either the new futex owner or the task which was the expected owner but failed to acquire the kernel internal rtmutex. The current futex_lock_pi() code drops out with an inconsistent in case it faults and returns -EFAULT to user space. User space has no way to fixup that state. When we wrote this code we thought that we could not drop the hash bucket lock at this point to handle the fault. After analysing the code again it turned out to be wrong because there are only two tasks involved which might modify the pi_state and the user space variable: - the task which acquired the rtmutex - the pending owner of the pi_state which did not get the rtmutex Both tasks drop into the fixup_pi_state() function before returning to user space. The first task which acquired the hash bucket lock faults in the fixup of the user space variable, drops the spinlock and calls futex_handle_fault() to fault in the page. Now the second task could acquire the hash bucket lock and tries to fixup the user space variable as well. It either faults as well or it succeeds because the first task already faulted the page in. One caveat is to avoid a double fixup. After returning from the fault handling we reacquire the hash bucket lock and check whether the pi_state owner has been modified already. Reported-by: David Holmes Signed-off-by: Thomas Gleixner Cc: Andrew Morton Cc: David Holmes Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Signed-off-by: Ingo Molnar kernel/futex.c | 93 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 20 deletions(-) --- kernel/futex.c | 93 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 20 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 449def8074fe..7d1136e97c14 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1096,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *newowner, + struct rw_semaphore *fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; + struct task_struct *oldowner = pi_state->owner; u32 uval, curval, newval; - int ret; + int ret, attempt = 0; /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + + /* + * We are here either because we stole the rtmutex from the + * pending owner or we are the pending owner which failed to + * get the rtmutex. We have to replace the pending owner TID + * in the user space variable. This must be atomic as we have + * to preserve the owner died bit here. + * + * Note: We write the user space value _before_ changing the + * pi_state because we can fault here. Imagine swapped out + * pages or a fork, which was running right before we acquired + * mmap_sem, that marked all the anonymous memory readonly for + * cow. + * + * Modifying pi_state _before_ the user space value would + * leave the pi_state in an inconsistent state when we fault + * here, because we need to drop the hash bucket lock to + * handle the fault. This might be observed in the PID check + * in lookup_pi_state. + */ +retry: + if (get_futex_value_locked(&uval, uaddr)) + goto handle_fault; + + while (1) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (curval == -EFAULT) + goto handle_fault; + if (curval == uval) + break; + uval = curval; + } + + /* + * We fixed up user space. Now we need to fix the pi_state + * itself. + */ if (pi_state->owner != NULL) { spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); spin_unlock_irq(&pi_state->owner->pi_lock); - } else - newtid |= FUTEX_OWNER_DIED; + } pi_state->owner = newowner; @@ -1118,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); spin_unlock_irq(&newowner->pi_lock); + return 0; /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. + * To handle the page fault we need to drop the hash bucket + * lock here. That gives the other task (either the pending + * owner itself or the task which stole the rtmutex) the + * chance to try the fixup of the pi_state. So once we are + * back from handling the fault we need to check the pi_state + * after reacquiring the hash bucket lock and before trying to + * do another fixup. When the fixup has been done already we + * simply return. */ - ret = get_futex_value_locked(&uval, uaddr); +handle_fault: + spin_unlock(q->lock_ptr); - while (!ret) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; + ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + spin_lock(q->lock_ptr); - if (curval == -EFAULT) - ret = -EFAULT; - if (curval == uval) - break; - uval = curval; - } - return ret; + /* + * Check if someone else fixed it for us: + */ + if (pi_state->owner != oldowner) + return 0; + + if (ret) + return ret; + + goto retry; } /* @@ -1507,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * that case: */ if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr); + ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); } else { /* * Catch the rare case, where the lock was released @@ -1539,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, int res; owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner); + res = fixup_pi_state_owner(uaddr, &q, owner, + fshared); /* propagate -EFAULT, if the fixup failed */ if (res) From 87afd448b186c885d67a08b7417cd46253b6a9d6 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 23 Jun 2008 09:29:58 -0700 Subject: [PATCH 33/65] IB/mthca: Clear ICM pages before handing to FW Current memfree FW has a bug which in some cases, assumes that ICM pages passed to it are cleared. This patch uses __GFP_ZERO to allocate all ICM pages passed to the FW. Once firmware with a fix is released, we can make the workaround conditional on firmware version. This fixes the bug reported by Arthur Kepner here: http://lists.openfabrics.org/pipermail/general/2008-May/050026.html Cc: Signed-off-by: Eli Cohen [ Rewritten to be a one-liner using __GFP_ZERO instead of vmap()ing ICM memory and memset()ing it to 0. - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_memfree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index b224079d4e1f..d5862e5d99a0 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -109,7 +109,11 @@ static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_m { struct page *page; - page = alloc_pages(gfp_mask, order); + /* + * Use __GFP_ZERO because buggy firmware assumes ICM pages are + * cleared, and subtle failures are seen if they aren't. + */ + page = alloc_pages(gfp_mask | __GFP_ZERO, order); if (!page) return -ENOMEM; From 36c7343b4ecac2432430f5393314f1bdc2c219a5 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 23 Jun 2008 12:06:52 +0100 Subject: [PATCH 34/65] tty_driver: Update required method documentation Some of the requirement rules are now more relaxed. Also correct a contradiction in the previous update Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- include/linux/tty_driver.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 59f1c0bd8f9c..d2a003586761 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -27,8 +27,7 @@ * This routine is called by the kernel to write a series of * characters to the tty device. The characters may come from * user space or kernel space. This routine will return the - * number of characters actually accepted for writing. This - * routine is mandatory. + * number of characters actually accepted for writing. * * Optional: Required for writable devices. * @@ -134,7 +133,7 @@ * This routine notifies the tty driver that it should hangup the * tty device. * - * Required: + * Optional: * * void (*break_ctl)(struct tty_stuct *tty, int state); * From 96a331b1d6426726c37242ddbe939ee14b255790 Mon Sep 17 00:00:00 2001 From: Gustavo Fernando Padovan Date: Mon, 23 Jun 2008 12:07:06 +0100 Subject: [PATCH 35/65] removed unused var real_tty on n_tty_ioctl() I noted that the 'struct tty_struct *real_tty' is not used in this function, so I removed the code about 'real_tty'. Signed-off-by: Gustavo Fernando Padovan Acked-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/char/tty_ioctl.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c index b1a757a5ee27..8f81139d6194 100644 --- a/drivers/char/tty_ioctl.c +++ b/drivers/char/tty_ioctl.c @@ -981,16 +981,9 @@ EXPORT_SYMBOL_GPL(tty_perform_flush); int n_tty_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg) { - struct tty_struct *real_tty; unsigned long flags; int retval; - if (tty->driver->type == TTY_DRIVER_TYPE_PTY && - tty->driver->subtype == PTY_TYPE_MASTER) - real_tty = tty->link; - else - real_tty = tty; - switch (cmd) { case TCXONC: retval = tty_check_change(tty); From 672ca28e300c17bf8d792a2a7a8631193e580c74 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 23 Jun 2008 11:21:37 -0700 Subject: [PATCH 36/65] Fix ZERO_PAGE breakage with vmware Commit 89f5b7da2a6bad2e84670422ab8192382a5aeb9f ("Reinstate ZERO_PAGE optimization in 'get_user_pages()' and fix XIP") broke vmware, as reported by Jeff Chua: "This broke vmware 6.0.4. Jun 22 14:53:03.845: vmx| NOT_IMPLEMENTED /build/mts/release/bora-93057/bora/vmx/main/vmmonPosix.c:774" and the reason seems to be that there's an old bug in how we handle do FOLL_ANON on VM_SHARED areas in get_user_pages(), but since it only triggered if the whole page table was missing, nobody had apparently hit it before. The recent changes to 'follow_page()' made the FOLL_ANON logic trigger not just for whole missing page tables, but for individual pages as well, and exposed this problem. This fixes it by making the test for when FOLL_ANON is used more careful, and also makes the code easier to read and understand by moving the logic to a separate inline function. Reported-and-tested-by: Jeff Chua Signed-off-by: Linus Torvalds --- mm/memory.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 9aefaae46858..423e0e7c2f73 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1045,6 +1045,26 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, return page; } +/* Can we do the FOLL_ANON optimization? */ +static inline int use_zero_page(struct vm_area_struct *vma) +{ + /* + * We don't want to optimize FOLL_ANON for make_pages_present() + * when it tries to page in a VM_LOCKED region. As to VM_SHARED, + * we want to get the page from the page tables to make sure + * that we serialize and update with any other user of that + * mapping. + */ + if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) + return 0; + /* + * And if we have a fault or a nopfn routine, it's not an + * anonymous region. + */ + return !vma->vm_ops || + (!vma->vm_ops->fault && !vma->vm_ops->nopfn); +} + int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) @@ -1119,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, foll_flags = FOLL_TOUCH; if (pages) foll_flags |= FOLL_GET; - if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || !vma->vm_ops->fault)) + if (!write && use_zero_page(vma)) foll_flags |= FOLL_ANON; do { From 945754a1754f9d4c2974a8241ad4f92fad7f3a6a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 23 Jun 2008 14:30:30 +0200 Subject: [PATCH 37/65] mm: fix race in COW logic There is a race in the COW logic. It contains a shortcut to avoid the COW and reuse the page if we have the sole reference on the page, however it is possible to have two racing do_wp_page()ers with one causing the other to mistakenly believe it is safe to take the shortcut when it is not. This could lead to data corruption. Process 1 and process2 each have a wp pte of the same anon page (ie. one forked the other). The page's mapcount is 2. Then they both attempt to write to it around the same time... proc1 proc2 thr1 proc2 thr2 CPU0 CPU1 CPU3 do_wp_page() do_wp_page() trylock_page() can_share_swap_page() load page mapcount (==2) reuse = 0 pte unlock copy page to new_page pte lock page_remove_rmap(page); trylock_page() can_share_swap_page() load page mapcount (==1) reuse = 1 ptep_set_access_flags (allow W) write private key into page read from page ptep_clear_flush() set_pte_at(pte of new_page) Fix this by moving the page_remove_rmap of the old page after the pte clear and flush. Potentially the entire branch could be moved down here, but in order to stay consistent, I won't (should probably move all the *_mm_counter stuff with one patch). Signed-off-by: Nick Piggin Acked-by: Hugh Dickins Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 423e0e7c2f73..d14b251a25a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1785,7 +1785,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { - page_remove_rmap(old_page, vma); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); @@ -1807,6 +1806,32 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); + if (old_page) { + /* + * Only after switching the pte to the new page may + * we remove the mapcount here. Otherwise another + * process may come and find the rmap count decremented + * before the pte is switched to the new page, and + * "reuse" the old page writing into it while our pte + * here still points into it and can be read by other + * threads. + * + * The critical issue is to order this + * page_remove_rmap with the ptp_clear_flush above. + * Those stores are ordered by (if nothing else,) + * the barrier present in the atomic_add_negative + * in page_remove_rmap. + * + * Then the TLB flush in ptep_clear_flush ensures that + * no process can access the old page before the + * decremented mapcount is visible. And the old page + * cannot be reused until after the decremented + * mapcount is visible. So transitively, TLBs to + * old page will be flushed before it can be reused. + */ + page_remove_rmap(old_page, vma); + } + /* Free the old page.. */ new_page = old_page; ret |= VM_FAULT_WRITE; From 33852a1f2bb014e4047a844556c0d76a2f790c37 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 19 Jun 2008 14:20:11 -0400 Subject: [PATCH 38/65] NFS: Reduce the NFS mount code stack usage. This appears to fix the Oops reported in http://bugzilla.kernel.org/show_bug.cgi?id=10826 Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 68 +++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2a4a024a4e7b..dac663dc5611 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1216,8 +1216,6 @@ static int nfs_validate_mount_data(void *options, { struct nfs_mount_data *data = (struct nfs_mount_data *)options; - memset(args, 0, sizeof(*args)); - if (data == NULL) goto out_no_data; @@ -1585,24 +1583,29 @@ static int nfs_get_sb(struct file_system_type *fs_type, { struct nfs_server *server = NULL; struct super_block *s; - struct nfs_fh mntfh; - struct nfs_parsed_mount_data data; + struct nfs_parsed_mount_data *data; + struct nfs_fh *mntfh; struct dentry *mntroot; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, }; - int error; + int error = -ENOMEM; - security_init_mnt_opts(&data.lsm_opts); + data = kzalloc(sizeof(*data), GFP_KERNEL); + mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); + if (data == NULL || mntfh == NULL) + goto out_free_fh; + + security_init_mnt_opts(&data->lsm_opts); /* Validate the mount data */ - error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); + error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); if (error < 0) goto out; /* Get a volume representation */ - server = nfs_create_server(&data, &mntfh); + server = nfs_create_server(data, mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out; @@ -1630,16 +1633,16 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ - nfs_fill_super(s, &data); + nfs_fill_super(s, data); } - mntroot = nfs_get_root(s, &mntfh); + mntroot = nfs_get_root(s, mntfh); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; } - error = security_sb_set_mnt_opts(s, &data.lsm_opts); + error = security_sb_set_mnt_opts(s, &data->lsm_opts); if (error) goto error_splat_root; @@ -1649,9 +1652,12 @@ static int nfs_get_sb(struct file_system_type *fs_type, error = 0; out: - kfree(data.nfs_server.hostname); - kfree(data.mount_server.hostname); - security_free_mnt_opts(&data.lsm_opts); + kfree(data->nfs_server.hostname); + kfree(data->mount_server.hostname); + security_free_mnt_opts(&data->lsm_opts); +out_free_fh: + kfree(mntfh); + kfree(data); return error; out_err_nosb: @@ -1800,8 +1806,6 @@ static int nfs4_validate_mount_data(void *options, struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; char *c; - memset(args, 0, sizeof(*args)); - if (data == NULL) goto out_no_data; @@ -1959,26 +1963,31 @@ static int nfs4_validate_mount_data(void *options, static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { - struct nfs_parsed_mount_data data; + struct nfs_parsed_mount_data *data; struct super_block *s; struct nfs_server *server; - struct nfs_fh mntfh; + struct nfs_fh *mntfh; struct dentry *mntroot; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, }; - int error; + int error = -ENOMEM; - security_init_mnt_opts(&data.lsm_opts); + data = kzalloc(sizeof(*data), GFP_KERNEL); + mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); + if (data == NULL || mntfh == NULL) + goto out_free_fh; + + security_init_mnt_opts(&data->lsm_opts); /* Validate the mount data */ - error = nfs4_validate_mount_data(raw_data, &data, dev_name); + error = nfs4_validate_mount_data(raw_data, data, dev_name); if (error < 0) goto out; /* Get a volume representation */ - server = nfs4_create_server(&data, &mntfh); + server = nfs4_create_server(data, mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out; @@ -2009,13 +2018,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, nfs4_fill_super(s); } - mntroot = nfs4_get_root(s, &mntfh); + mntroot = nfs4_get_root(s, mntfh); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; } - error = security_sb_set_mnt_opts(s, &data.lsm_opts); + error = security_sb_set_mnt_opts(s, &data->lsm_opts); if (error) goto error_splat_root; @@ -2025,10 +2034,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, error = 0; out: - kfree(data.client_address); - kfree(data.nfs_server.export_path); - kfree(data.nfs_server.hostname); - security_free_mnt_opts(&data.lsm_opts); + kfree(data->client_address); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + security_free_mnt_opts(&data->lsm_opts); +out_free_fh: + kfree(mntfh); + kfree(data); return error; out_free: From b7e2445737ff69cef892b6fd9cd71cae2c9e9515 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 19 Jun 2008 15:21:11 -0400 Subject: [PATCH 39/65] NFS: Fix filehandle size comparisons in the mount code Fix a sign issue in xdr_decode_fhstatus3() Fix incorrect comparison in nfs_validate_mount_data() Signed-off-by: Trond Myklebust --- fs/nfs/mount_clnt.c | 5 +++-- fs/nfs/super.c | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 49c7cd0502cc..779d2eb649c5 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -130,10 +130,11 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res) { struct nfs_fh *fh = res->fh; + unsigned size; if ((res->status = ntohl(*p++)) == 0) { - int size = ntohl(*p++); - if (size <= NFS3_FHSIZE) { + size = ntohl(*p++); + if (size <= NFS3_FHSIZE && size != 0) { fh->size = size; memcpy(fh->data, p, size); } else diff --git a/fs/nfs/super.c b/fs/nfs/super.c index dac663dc5611..614efeed5437 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1249,13 +1249,13 @@ static int nfs_validate_mount_data(void *options, case 5: memset(data->context, 0, sizeof(data->context)); case 6: - if (data->flags & NFS_MOUNT_VER3) + if (data->flags & NFS_MOUNT_VER3) { + if (data->root.size > NFS3_FHSIZE || data->root.size == 0) + goto out_invalid_fh; mntfh->size = data->root.size; - else + } else mntfh->size = NFS2_FHSIZE; - if (mntfh->size > sizeof(mntfh->data)) - goto out_invalid_fh; memcpy(mntfh->data, data->root.data, mntfh->size); if (mntfh->size < sizeof(mntfh->data)) From 03fa9e84e5dc10aeacb0e4eb2f708cd9fc36a5b8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 5 Jun 2008 16:02:35 -0400 Subject: [PATCH 40/65] NFS: nfs_updatepage(): don't mark page as dirty if an error occurred Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6d8ace3e3259..f333848fd3be 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -739,12 +739,13 @@ int nfs_updatepage(struct file *file, struct page *page, } status = nfs_writepage_setup(ctx, page, offset, count); - __set_page_dirty_nobuffers(page); + if (status < 0) + nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", status, (long long)i_size_read(inode)); - if (status < 0) - nfs_set_pageerror(page); return status; } From 72c6e251ed84b3a9cdfde6711191155c47bb2b9c Mon Sep 17 00:00:00 2001 From: Thorsten Kranzkowski Date: Mon, 23 Jun 2008 20:57:22 +0000 Subject: [PATCH 41/65] alpha: fix compile error in arch/alpha/mm/init.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 9267b4b3880d00dc2dab90f1d817c856939114f7 ("alpha: fix module load failures on smp (bug #10926)") causes a regression for my ev4 uniprocessor build: CC arch/alpha/mm/init.o /export/data/repositories/linux-2.6/arch/alpha/mm/init.c:34: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘typeof’ make[2]: *** [arch/alpha/mm/init.o] Error 1 make[1]: *** [arch/alpha/mm] Error 2 make: *** [sub-make] Error 2 This fixes it for me (compile and boot tested): Signed-off-by: Thorsten Kranzkowski Acked-by: Ivan Kokshaysky Signed-off-by: Linus Torvalds --- include/asm-alpha/percpu.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-alpha/percpu.h b/include/asm-alpha/percpu.h index 82e8a94b4b2f..3495e8e00d70 100644 --- a/include/asm-alpha/percpu.h +++ b/include/asm-alpha/percpu.h @@ -69,6 +69,8 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #define __get_cpu_var(var) per_cpu_var(var) #define __raw_get_cpu_var(var) per_cpu_var(var) +#define PER_CPU_ATTRIBUTES + #endif /* SMP */ #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name) From d4acf7e7abe45457e751525a2a4d5b693dfdd597 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 6 Jun 2008 16:37:35 -0300 Subject: [PATCH 42/65] KVM: Fix race between timer migration and vcpu migration A guest vcpu instance can be scheduled to a different physical CPU between the test for KVM_REQ_MIGRATE_TIMER and local_irq_disable(). If that happens, the timer will only be migrated to the current pCPU on the next exit, meaning that guest LAPIC timer event can be delayed until a host interrupt is triggered. Fix it by cancelling guest entry if any vcpu request is pending. This has the side effect of nicely consolidating vcpu->requests checks. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00acf1301a15..b90744a1dc3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2759,6 +2759,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests)) { kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; @@ -2781,21 +2783,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) local_irq_disable(); - if (need_resched()) { + if (vcpu->requests || need_resched()) { local_irq_enable(); preempt_enable(); r = 1; goto out; } - if (vcpu->requests) - if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) { - local_irq_enable(); - preempt_enable(); - r = 1; - goto out; - } - if (signal_pending(current)) { local_irq_enable(); preempt_enable(); @@ -2825,9 +2819,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_guest_enter(); - if (vcpu->requests) - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) - kvm_x86_ops->tlb_flush(vcpu); KVMTRACE_0D(VMENTRY, vcpu, entryexit); kvm_x86_ops->run(vcpu, kvm_run); From 06e05645661211b9eaadaf6344c335d2e80f0ba2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 6 Jun 2008 16:37:36 -0300 Subject: [PATCH 43/65] KVM: close timer injection race window in __vcpu_run If a timer fires after kvm_inject_pending_timer_irqs() but before local_irq_disable() the code will enter guest mode and only inject such timer interrupt the next time an unrelated event causes an exit. It would be simpler if the timer->pending irq conversion could be done with IRQ's disabled, so that the above problem cannot happen. For now introduce a new vcpu requests bit to cancel guest entry. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 9 ++++++--- arch/x86/kvm/lapic.c | 1 + arch/x86/kvm/x86.c | 1 + include/linux/kvm_host.h | 1 + 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index f2f5d260874e..3829aa7b663f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps) atomic_inc(&pt->pending); smp_mb__after_atomic_inc(); - if (vcpu0 && waitqueue_active(&vcpu0->wq)) { - vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; - wake_up_interruptible(&vcpu0->wq); + if (vcpu0) { + set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); + if (waitqueue_active(&vcpu0->wq)) { + vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; + wake_up_interruptible(&vcpu0->wq); + } } pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c297c50eba63..ebc03f5ae162 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic) wait_queue_head_t *q = &apic->vcpu->wq; atomic_inc(&apic->timer.pending); + set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); if (waitqueue_active(q)) { apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; wake_up_interruptible(q); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b90744a1dc3a..b08812d6b34c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2774,6 +2774,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); kvm_inject_pending_timer_irqs(vcpu); preempt_disable(); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 092b1b25291d..de9d1df4bba2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -33,6 +33,7 @@ #define KVM_REQ_REPORT_TPR_ACCESS 2 #define KVM_REQ_MMU_RELOAD 3 #define KVM_REQ_TRIPLE_FAULT 4 +#define KVM_REQ_PENDING_TIMER 5 struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; From 6597ca09e6c0e5aec7ffd2b8ab48c671d3c28414 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sun, 8 Jun 2008 01:48:53 -0300 Subject: [PATCH 44/65] KVM: MMU: Fix rmap_write_protect() hugepage iteration bug rmap_next() does not work correctly after rmap_remove(), as it expects the rmap chains not to change during iteration. Fix (for now) by restarting iteration from the beginning. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ee3f53098f0c..9628091c574d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) rmap_remove(kvm, spte); --kvm->stat.lpages; set_shadow_pte(spte, shadow_trap_nonpresent_pte); + spte = NULL; write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); From 3094538739415a9225afd2a6c78cb0fe1c1f641b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 11 Jun 2008 20:32:40 -0300 Subject: [PATCH 45/65] KVM: MMU: large page update_pte issue with non-PAE 32-bit guests (resend) kvm_mmu_pte_write() does not handle 32-bit non-PAE large page backed guests properly. It will instantiate two 2MB sptes pointing to the same physical 2MB page when a guest large pte update is trapped. Instead of duplicating code to handle this, disallow directory level updates to happen through kvm_mmu_pte_write(), so the two 2MB sptes emulating one guest 4MB pte can be correctly created by the page fault handling path. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9628091c574d..baa6503894d3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1581,11 +1581,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, u64 *spte, const void *new) { - if ((sp->role.level != PT_PAGE_TABLE_LEVEL) - && !vcpu->arch.update_pte.largepage) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } + if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + if (!vcpu->arch.update_pte.largepage || + sp->role.glevels == PT32_ROOT_LEVEL) { + ++vcpu->kvm->stat.mmu_pde_zapped; + return; + } + } ++vcpu->kvm->stat.mmu_pte_updated; if (sp->role.glevels == PT32_ROOT_LEVEL) From 6bf6a9532fd03ad719f0c86654f16ef777b78fc6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 12 Jun 2008 16:54:41 +0300 Subject: [PATCH 46/65] KVM: MMU: Fix oops on guest userspace access to guest pagetable KVM has a heuristic to unshadow guest pagetables when userspace accesses them, on the assumption that most guests do not allow userspace to access pagetables directly. Unfortunately, in addition to unshadowing the pagetables, it also oopses. This never triggers on ordinary guests since sane OSes will clear the pagetables before assigning them to userspace, which will trigger the flood heuristic, unshadowing the pagetables before the first userspace access. One particular guest, though (Xenner) will run the kernel in userspace, triggering the oops. Since the heuristic is incorrect in this case, we can simply remove it. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index baa6503894d3..7e7c3969f7a2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1083,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, struct kvm_mmu_page *shadow; spte |= PT_WRITABLE_MASK; - if (user_fault) { - mmu_unshadow(vcpu->kvm, gfn); - goto unshadowed; - } shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); if (shadow || @@ -1103,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } } -unshadowed: - if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); From 4fa6b9c5dc4134bdeac341d731a87783cc11ca10 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 17 Jun 2008 15:36:36 -0700 Subject: [PATCH 47/65] KVM: ioapic: fix lost interrupt when changing a device's irq The ioapic acknowledge path translates interrupt vectors to irqs. It currently uses a first match algorithm, stopping when it finds the first redirection table entry containing the vector. That fails however if the guest changes the irq to a different line, leaving the old redirection table entry in place (though masked). Result is interrupts not making it to the guest. Fix by always scanning the entire redirection table. Signed-off-by: Avi Kivity --- virt/kvm/ioapic.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 98778cb69c6e..1dcf9f3d1107 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -269,28 +269,9 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) } } -static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) +static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi) { - int i; - - for (i = 0; i < IOAPIC_NUM_PINS; i++) - if (ioapic->redirtbl[i].fields.vector == vector) - return i; - return -1; -} - -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) -{ - struct kvm_ioapic *ioapic = kvm->arch.vioapic; union ioapic_redir_entry *ent; - int gsi; - - gsi = get_eoi_gsi(ioapic, vector); - if (gsi == -1) { - printk(KERN_WARNING "Can't find redir item for %d EOI\n", - vector); - return; - } ent = &ioapic->redirtbl[gsi]; ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); @@ -300,6 +281,16 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) ioapic_deliver(ioapic, gsi); } +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) + if (ioapic->redirtbl[i].fields.vector == vector) + __kvm_ioapic_update_eoi(ioapic, i); +} + static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr) { struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; From a9b21b622958afc3f3bc5a23d266dd9ed1171fd3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 24 Jun 2008 11:48:49 +0300 Subject: [PATCH 48/65] KVM: VMX: Fix host msr corruption with preemption enabled Switching msrs can occur either synchronously as a result of calls to the msr management functions (usually in response to the guest touching virtualized msrs), or asynchronously when preempting a kvm thread that has guest state loaded. If we're unlucky enough to have the two at the same time, host msrs are corrupted and the machine goes kaput on the next syscall. Most easily triggered by Windows Server 2008, as it does a lot of msr switching during bootup. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 02efbe75f317..540e95179074 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) load_transition_efer(vmx); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +static void __vmx_load_host_state(struct vcpu_vmx *vmx) { unsigned long flags; @@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) reload_host_efer(vmx); } +static void vmx_load_host_state(struct vcpu_vmx *vmx) +{ + preempt_disable(); + __vmx_load_host_state(vmx); + preempt_enable(); +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { - vmx_load_host_state(to_vmx(vcpu)); + __vmx_load_host_state(to_vmx(vcpu)); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) switch (msr_index) { #ifdef CONFIG_X86_64 case MSR_EFER: + vmx_load_host_state(vmx); ret = kvm_set_msr_common(vcpu, msr_index, data); - if (vmx->host_state.loaded) { - reload_host_efer(vmx); - load_transition_efer(vmx); - } break; case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); @@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) guest_write_tsc(data); break; default: + vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; - if (vmx->host_state.loaded) - load_msrs(vmx->guest_msrs, vmx->save_nmsrs); break; } ret = kvm_set_msr_common(vcpu, msr_index, data); From 63842cccb285259345f52025ef57bdfd79657a2d Mon Sep 17 00:00:00 2001 From: Wim Van Sebroeck Date: Tue, 24 Jun 2008 13:09:26 +0000 Subject: [PATCH 49/65] Revert "[WATCHDOG] hpwdt: Add CFLAGS to get driver working" After Linus fixed the inline assembly, the CFLAGS option is not needed anymore. Signed-off-by: Thomas Mingarelli Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 8662a6b7a30b..25b352b664d9 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -68,7 +68,6 @@ obj-$(CONFIG_WAFER_WDT) += wafer5823wdt.o obj-$(CONFIG_I6300ESB_WDT) += i6300esb.o obj-$(CONFIG_ITCO_WDT) += iTCO_wdt.o iTCO_vendor_support.o obj-$(CONFIG_IT8712F_WDT) += it8712f_wdt.o -CFLAGS_hpwdt.o += -O obj-$(CONFIG_HP_WATCHDOG) += hpwdt.o obj-$(CONFIG_SC1200_WDT) += sc1200wdt.o obj-$(CONFIG_SCx200_WDT) += scx200_wdt.o From 17c15da00c0e7289375ad57e8fea0c7892b74aa0 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 18 Jun 2008 11:30:40 -0500 Subject: [PATCH 50/65] [GFS2] BUG: unable to handle kernel paging request at ffff81002690e000 This patch fixes bugzilla bug bz448866: gfs2: BUG: unable to handle kernel paging request at ffff81002690e000. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6387523a3153..3401628d742b 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -195,7 +195,7 @@ static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal, depending on architecture. I've experimented with several ways of writing this section such as using an else before the goto but this one seems to be the fastest. */ - while ((unsigned char *)plong < end - 1) { + while ((unsigned char *)plong < end - sizeof(unsigned long)) { prefetch(plong + 1); if (((*plong) & LBITMASK) != lskipval) break; From 28499143933f19b28008a556ed59255d6009391a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 9 May 2008 12:05:57 +0100 Subject: [PATCH 51/65] xen: remove support for non-PAE 32-bit Non-PAE operation has been deprecated in Xen for a while, and is rarely tested or used. xen-unstable has now officially dropped non-PAE support. Since Xen/pvops' non-PAE support has also been broken for a while, we may as well completely drop it altogether. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 2 +- arch/x86/xen/enlighten.c | 51 ++++++++++++++------------------------ arch/x86/xen/mmu.c | 19 ++------------ arch/x86/xen/mmu.h | 24 +++++------------- arch/x86/xen/xen-head.S | 4 --- include/asm-x86/xen/page.h | 4 --- 6 files changed, 27 insertions(+), 77 deletions(-) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 2e641be2737e..525b108411bd 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,7 +6,7 @@ config XEN bool "Xen guest support" select PARAVIRT depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) + depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c048de34d6a1..f09c1c69c37a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) static __init void xen_pagetable_setup_start(pgd_t *base) { pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + int i; /* special set_pte for pagetable initialization */ pv_mmu_ops.set_pte = xen_set_pte_init; init_mm.pgd = base; /* - * copy top-level of Xen-supplied pagetable into place. For - * !PAE we can use this as-is, but for PAE it is a stand-in - * while we copy the pmd pages. + * copy top-level of Xen-supplied pagetable into place. This + * is a stand-in while we copy the pmd pages. */ memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - if (PTRS_PER_PMD > 1) { - int i; - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + /* + * For PAE, need to allocate new pmds, rather than + * share Xen's, since Xen doesn't like pmd's being + * shared between address spaces. + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); - make_lowmem_page_readonly(pmd); + make_lowmem_page_readonly(pmd); - set_pgd(&base[i], __pgd(1 + __pa(pmd))); - } else - pgd_clear(&base[i]); - } + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); } /* make sure zero_page is mapped RO so we can use it in pagetables */ @@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ - { - unsigned level; - -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - - pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); - } + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); } /* This is called once we have the cpu_possible_map */ @@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pte = xen_make_pte, .make_pgd = xen_make_pgd, -#ifdef CONFIG_X86_PAE .set_pte_atomic = xen_set_pte_atomic, .set_pte_present = xen_set_pte_at, .set_pud = xen_set_pud, @@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pmd = xen_make_pmd, .pmd_val = xen_pmd_val, -#endif /* PAE */ .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 265601d5a6ae..df40bf74ea75 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -228,7 +228,7 @@ pmdval_t xen_pmd_val(pmd_t pmd) { return pte_mfn_to_pfn(pmd.pmd); } -#ifdef CONFIG_X86_PAE + void xen_set_pud(pud_t *ptr, pud_t val) { struct multicall_space mcs; @@ -276,12 +276,6 @@ pmd_t xen_make_pmd(pmdval_t pmd) pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); } -#else /* !PAE */ -void xen_set_pte(pte_t *ptep, pte_t pte) -{ - *ptep = pte; -} -#endif /* CONFIG_X86_PAE */ /* (Yet another) pagetable walker. This one is intended for pinning a @@ -434,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level) read-only, and can be pinned. */ void xen_pgd_pin(pgd_t *pgd) { - unsigned level; - xen_mc_batch(); if (pgd_walk(pgd, pin_page, TASK_SIZE)) { @@ -445,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_batch(); } -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - - xen_do_pin(level, PFN_DOWN(__pa(pgd))); - + xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); xen_mc_issue(0); } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index b5e189b1519d..5fe961caffd4 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm); void xen_pgd_pin(pgd_t *pgd); //void xen_pgd_unpin(pgd_t *pgd); -#ifdef CONFIG_X86_PAE -unsigned long long xen_pte_val(pte_t); -unsigned long long xen_pmd_val(pmd_t); -unsigned long long xen_pgd_val(pgd_t); +pteval_t xen_pte_val(pte_t); +pmdval_t xen_pmd_val(pmd_t); +pgdval_t xen_pgd_val(pgd_t); -pte_t xen_make_pte(unsigned long long); -pmd_t xen_make_pmd(unsigned long long); -pgd_t xen_make_pgd(unsigned long long); +pte_t xen_make_pte(pteval_t); +pmd_t xen_make_pmd(pmdval_t); +pgd_t xen_make_pgd(pgdval_t); void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); @@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val); void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_pmd_clear(pmd_t *pmdp); - -#else -unsigned long xen_pte_val(pte_t); -unsigned long xen_pmd_val(pmd_t); -unsigned long xen_pgd_val(pgd_t); - -pte_t xen_make_pte(unsigned long); -pmd_t xen_make_pmd(unsigned long); -pgd_t xen_make_pgd(unsigned long); -#endif - #endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 3175e973fd0d..6ec3b4f7719b 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -30,11 +30,7 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") -#ifdef CONFIG_X86_PAE ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") -#else - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") -#endif ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") #endif /*CONFIG_XEN */ diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h index baf3a4dce28c..e11f24038b1d 100644 --- a/include/asm-x86/xen/page.h +++ b/include/asm-x86/xen/page.h @@ -150,13 +150,9 @@ static inline pte_t __pte_ma(pteval_t x) return (pte_t) { .pte = x }; } -#ifdef CONFIG_X86_PAE #define pmd_val_ma(v) ((v).pmd) #define pud_val_ma(v) ((v).pgd.pgd) #define __pmd_ma(x) ((pmd_t) { (x) } ) -#else /* !X86_PAE */ -#define pmd_val_ma(v) ((v).pud.pgd.pgd) -#endif /* CONFIG_X86_PAE */ #define pgd_val_ma(x) ((x).pgd) From a606b5e24b7e2937604f4c85023d8d9c5ab0a28b Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 24 Jun 2008 10:52:55 -0500 Subject: [PATCH 52/65] kgdb: documentation update - remove kgdboe kgdboe is not presently included kgdb, and there should be no references to it. Also fix the tcp port terminal connection example. Signed-off-by: Jason Wessel --- Documentation/DocBook/kgdb.tmpl | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl index 028a8444d95e..e8acd1f03456 100644 --- a/Documentation/DocBook/kgdb.tmpl +++ b/Documentation/DocBook/kgdb.tmpl @@ -84,10 +84,9 @@ runs an instance of gdb against the vmlinux file which contains the symbols (not boot image such as bzImage, zImage, uImage...). In gdb the developer specifies the connection parameters and - connects to kgdb. Depending on which kgdb I/O modules exist in - the kernel for a given architecture, it may be possible to debug - the test machine's kernel with the development machine using a - rs232 or ethernet connection. + connects to kgdb. The type of connection a developer makes with + gdb depends on the availability of kgdb I/O modules compiled as + builtin's or kernel modules in the test machine's kernel. @@ -223,7 +222,7 @@ IMPORTANT NOTE: Using this option with kgdb over the console - (kgdboc) or kgdb over ethernet (kgdboe) is not supported. + (kgdboc) is not supported. @@ -249,18 +248,11 @@ (gdb) target remote /dev/ttyS0 - Example (kgdb to a terminal server): + Example (kgdb to a terminal server on tcp port 2012): % gdb ./vmlinux - (gdb) target remote udp:192.168.2.2:6443 - - - Example (kgdb over ethernet): - - - % gdb ./vmlinux - (gdb) target remote udp:192.168.2.2:6443 + (gdb) target remote 192.168.2.2:2012 Once connected, you can debug a kernel the way you would debug an From aabdc3b8c3b3d081f1532454e344208338478e29 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 24 Jun 2008 10:52:55 -0500 Subject: [PATCH 53/65] kgdb: sparse fix - Fix warning reported by sparse kernel/kgdb.c:1502:6: warning: symbol 'kgdb_console_write' was not declared. Should it be static? Signed-off-by: Jason Wessel --- kernel/kgdb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 79e3c90113c2..3ec23c3ec97f 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1499,7 +1499,8 @@ int kgdb_nmicallback(int cpu, void *regs) return 1; } -void kgdb_console_write(struct console *co, const char *s, unsigned count) +static void kgdb_console_write(struct console *co, const char *s, + unsigned count) { unsigned long flags; From 2826f8c0f4c97b7db33e2a680f184d828eb7a785 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Tue, 24 Jun 2008 11:30:09 -0400 Subject: [PATCH 54/65] [IA64] Fix boot failure on ia64/sn2 Call check_sal_cache_flush() after platform_setup() as check_sal_cache_flush() now relies on being able to call platform vector code. Problem was introduced by: 3463a93def55c309f3c0d0a8aaf216be3be42d64 "Update check_sal_cache_flush to use platform_send_ipi()" Signed-off-by: Jes Sorensen Tested-by: Alex Chiang: Signed-off-by: Tony Luck --- arch/ia64/kernel/setup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index f48a809c686d..4ae15c8c2488 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -578,8 +578,6 @@ setup_arch (char **cmdline_p) cpu_init(); /* initialize the bootstrap CPU */ mmu_context_init(); /* initialize context_id bitmap */ - check_sal_cache_flush(); - #ifdef CONFIG_ACPI acpi_boot_init(); #endif @@ -607,6 +605,7 @@ setup_arch (char **cmdline_p) ia64_mca_init(); platform_setup(cmdline_p); + check_sal_cache_flush(); paging_init(); } From 8097110d179b874d91c6495330c2b96c991e8c6e Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Tue, 24 Jun 2008 10:20:06 -0700 Subject: [PATCH 55/65] [IA64] Handle count==0 in sn2_ptc_proc_write() The fix applied in e0c6d97c65e0784aade7e97b9411f245a6c543e7 "security hole in sn2_ptc_proc_write" didn't take into account the case where count==0 (which results in a buffer underrun when adding the trailing '\0'). Thanks to Andi Kleen for pointing this out. Signed-off-by: Cliff Wickman Signed-off-by: Tony Luck --- arch/ia64/sn/kernel/sn2/sn2_smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index 6dd886c5d860..e585f9a2afb9 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -512,7 +512,7 @@ static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, si int cpu; char optstr[64]; - if (count > sizeof(optstr)) + if (count == 0 || count > sizeof(optstr)) return -EINVAL; if (copy_from_user(optstr, user, count)) return -EFAULT; From e2569b7e572c0e6782380b3fdda901deb175d75a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 24 Jun 2008 10:22:05 +0200 Subject: [PATCH 56/65] [IA64] Eliminate NULL test after alloc_bootmem in iosapic_alloc_rte() As noted by Akinobu Mita alloc_bootmem and related functions never return NULL and always return a zeroed region of memory. Thus a NULL test or memset after calls to these functions is unnecessary. Signed-off-by: Julia Lawall Signed-off-by: Tony Luck --- arch/ia64/kernel/iosapic.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 082c31dcfd99..39752cdef6ff 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -558,8 +558,6 @@ static struct iosapic_rte_info * __init_refok iosapic_alloc_rte (void) if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) { rte = alloc_bootmem(sizeof(struct iosapic_rte_info) * NR_PREALLOCATE_RTE_ENTRIES); - if (!rte) - return NULL; for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++) list_add(&rte->rte_list, &free_rte_list); } From 5af4e7a0bea715f2dd7190859a43eb2258b1f388 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 24 Jun 2008 12:53:38 -0500 Subject: [PATCH 57/65] [GFS2] fix gfs2 block allocation (cleaned up) This patch fixes bz 450641. This patch changes the computation for zero_metapath_length(), which it renames to metapath_branch_start(). When you are extending the metadata tree, The indirect blocks that point to the new data block must either diverge from the existing tree either at the inode, or at the first indirect block. They can diverge at the first indirect block because the inode has room for 483 pointers while the indirect blocks have room for 509 pointers, so when the tree is grown, there is some free space in the first indirect block. What metapath_branch_start() now computes is the height where the first indirect block for the new data block is located. It can either be 1 (if the indirect block diverges from the inode) or 2 (if it diverges from the first indirect block). Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index c19184f2e70e..bec76b1c2bb0 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -246,15 +246,11 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block, } -static inline unsigned int zero_metapath_length(const struct metapath *mp, - unsigned height) +static inline unsigned int metapath_branch_start(const struct metapath *mp) { - unsigned int i; - for (i = 0; i < height - 1; i++) { - if (mp->mp_list[i] != 0) - return i; - } - return height; + if (mp->mp_list[0] == 0) + return 2; + return 1; } /** @@ -436,7 +432,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh = mp->mp_bh[0]; u64 bn, dblock = 0; - unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0; + unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; unsigned dblks = 0; unsigned ptrs_per_blk; const unsigned end_of_metadata = height - 1; @@ -471,9 +467,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, /* Building up tree height */ state = ALLOC_GROW_HEIGHT; iblks = height - ip->i_height; - zmpl = zero_metapath_length(mp, height); - iblks -= zmpl; - iblks += height; + branch_start = metapath_branch_start(mp); + iblks += (height - branch_start); } } @@ -509,13 +504,13 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, sizeof(struct gfs2_meta_header)); *ptr = zero_bn; state = ALLOC_GROW_DEPTH; - for(i = zmpl; i < height; i++) { + for(i = branch_start; i < height; i++) { if (mp->mp_bh[i] == NULL) break; brelse(mp->mp_bh[i]); mp->mp_bh[i] = NULL; } - i = zmpl; + i = branch_start; } if (n == 0) break; From 7af192c954017499ec163bc9dbaaee2e593d7ef2 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:29 +0200 Subject: [PATCH 58/65] x86: Add structs and functions for paravirt clocksource This patch adds structs for the paravirt clocksource ABI used by both xen and kvm (pvclock-abi.h). It also adds some helper functions to read system time and wall clock time from a paravirtual clocksource (pvclock.[ch]). They are based on the xen code. They are enabled using CONFIG_PARAVIRT_CLOCK. Subsequent patches of this series will put the code in use. Signed-off-by: Gerd Hoffmann Acked-by: Jeremy Fitzhardinge Signed-off-by: Avi Kivity --- arch/x86/Kconfig | 4 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/pvclock.c | 141 ++++++++++++++++++++++++++++++++++ include/asm-x86/pvclock-abi.h | 42 ++++++++++ include/asm-x86/pvclock.h | 13 ++++ 5 files changed, 201 insertions(+) create mode 100644 arch/x86/kernel/pvclock.c create mode 100644 include/asm-x86/pvclock-abi.h create mode 100644 include/asm-x86/pvclock.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 52e18e6d2ba0..f94bca6ff47f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -410,6 +410,10 @@ config PARAVIRT over full virtualization. However, when run without a hypervisor the kernel is theoretically slower and slightly larger. +config PARAVIRT_CLOCK + bool + default n + endif config MEMTEST_BOOTPARAM diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b4720..77807d4769c9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 000000000000..05fbe9a0325a --- /dev/null +++ b/arch/x86/kernel/pvclock.c @@ -0,0 +1,141 @@ +/* paravirtual clock -- common code used by kvm/xen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include +#include +#include + +/* + * These are perodically updated + * xen: magic shared_info page + * kvm: gpa registered via msr + * and then copied here. + */ +struct pvclock_shadow_time { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + +static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) +{ + u64 delta = native_read_tsc() - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +/* + * Reads a consistent set of time-base values from hypervisor, + * into a shadow data area. + */ +static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, + struct pvclock_vcpu_time_info *src) +{ + do { + dst->version = src->version; + rmb(); /* fetch version before data */ + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); /* test version after fetching data */ + } while ((src->version & 1) || (dst->version != src->version)); + + return dst->version; +} + +cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ + struct pvclock_shadow_time shadow; + unsigned version; + cycle_t ret, offset; + + do { + version = pvclock_get_time_values(&shadow, src); + barrier(); + offset = pvclock_get_nsec_offset(&shadow); + ret = shadow.system_timestamp + offset; + barrier(); + } while (version != src->version); + + return ret; +} + +void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, + struct pvclock_vcpu_time_info *vcpu_time, + struct timespec *ts) +{ + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = wall_clock->version; + rmb(); /* fetch version before time */ + now.tv_sec = wall_clock->sec; + now.tv_nsec = wall_clock->nsec; + rmb(); /* fetch time before checking version */ + } while ((wall_clock->version & 1) || (version != wall_clock->version)); + + delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} diff --git a/include/asm-x86/pvclock-abi.h b/include/asm-x86/pvclock-abi.h new file mode 100644 index 000000000000..6857f840b243 --- /dev/null +++ b/include/asm-x86/pvclock-abi.h @@ -0,0 +1,42 @@ +#ifndef _ASM_X86_PVCLOCK_ABI_H_ +#define _ASM_X86_PVCLOCK_ABI_H_ +#ifndef __ASSEMBLY__ + +/* + * These structs MUST NOT be changed. + * They are the ABI between hypervisor and guest OS. + * Both Xen and KVM are using this. + * + * pvclock_vcpu_time_info holds the system time and the tsc timestamp + * of the last update. So the guest can use the tsc delta to get a + * more precise system time. There is one per virtual cpu. + * + * pvclock_wall_clock references the point in time when the system + * time was zero (usually boot time), thus the guest calculates the + * current wall clock by adding the system time. + * + * Protocol for the "version" fields is: hypervisor raises it (making + * it uneven) before it starts updating the fields and raises it again + * (making it even) when it is done. Thus the guest can make sure the + * time values it got are consistent by checking the version before + * and after reading them. + */ + +struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 pad[3]; +} __attribute__((__packed__)); /* 32 bytes */ + +struct pvclock_wall_clock { + u32 version; + u32 sec; + u32 nsec; +} __attribute__((__packed__)); + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_PVCLOCK_ABI_H_ */ diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h new file mode 100644 index 000000000000..85b1bba8e0a3 --- /dev/null +++ b/include/asm-x86/pvclock.h @@ -0,0 +1,13 @@ +#ifndef _ASM_X86_PVCLOCK_H_ +#define _ASM_X86_PVCLOCK_H_ + +#include +#include + +/* some helper functions for xen and kvm pv clock sources */ +cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +void pvclock_read_wallclock(struct pvclock_wall_clock *wall, + struct pvclock_vcpu_time_info *vcpu, + struct timespec *ts); + +#endif /* _ASM_X86_PVCLOCK_H_ */ From 1c7b67f7576c4ca2a344379a4a29eec8fe8e7935 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:30 +0200 Subject: [PATCH 59/65] x86: Make xen use the paravirt clocksource structs and functions This patch updates the xen guest to use the pvclock structs and helper functions. Signed-off-by: Gerd Hoffmann Acked-by: Jeremy Fitzhardinge Signed-off-by: Avi Kivity --- arch/x86/xen/Kconfig | 1 + arch/x86/xen/time.c | 132 ++++-------------------------------- include/xen/interface/xen.h | 7 +- 3 files changed, 16 insertions(+), 124 deletions(-) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 2e641be2737e..3a4f16aea4bf 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -5,6 +5,7 @@ config XEN bool "Xen guest support" select PARAVIRT + select PARAVIRT_CLOCK depends on X86_32 depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) help diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 52b2e3856980..41e217503c96 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -31,17 +32,6 @@ static cycle_t xen_clocksource_read(void); -/* These are perodically updated in shared_info, and then copied here. */ -struct shadow_time_info { - u64 tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_timestamp; /* Time, in nanosecs, since boot. */ - u32 tsc_to_nsec_mul; - int tsc_shift; - u32 version; -}; - -static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); - /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); @@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void) unsigned long xen_cpu_khz(void) { u64 xen_khz = 1000000ULL << 32; - const struct vcpu_time_info *info = + const struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; do_div(xen_khz, info->tsc_to_system_mul); @@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void) return xen_khz; } -/* - * Reads a consistent set of time-base values from Xen, into a shadow data - * area. - */ -static unsigned get_time_values_from_xen(void) -{ - struct vcpu_time_info *src; - struct shadow_time_info *dst; - - /* src is shared memory with the hypervisor, so we need to - make sure we get a consistent snapshot, even in the face of - being preempted. */ - src = &__get_cpu_var(xen_vcpu)->time; - dst = &__get_cpu_var(shadow_time); - - do { - dst->version = src->version; - rmb(); /* fetch version before data */ - dst->tsc_timestamp = src->tsc_timestamp; - dst->system_timestamp = src->system_time; - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; - dst->tsc_shift = src->tsc_shift; - rmb(); /* test version after fetching data */ - } while ((src->version & 1) | (dst->version ^ src->version)); - - return dst->version; -} - -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. - */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif - - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif __x86_64__ - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif - - return product; -} - -static u64 get_nsec_offset(struct shadow_time_info *shadow) -{ - u64 now, delta; - now = native_read_tsc(); - delta = now - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); -} - static cycle_t xen_clocksource_read(void) { - struct shadow_time_info *shadow = &get_cpu_var(shadow_time); + struct pvclock_vcpu_time_info *src; cycle_t ret; - unsigned version; - - do { - version = get_time_values_from_xen(); - barrier(); - ret = shadow->system_timestamp + get_nsec_offset(shadow); - barrier(); - } while (version != __get_cpu_var(xen_vcpu)->time.version); - - put_cpu_var(shadow_time); + src = &get_cpu_var(xen_vcpu)->time; + ret = pvclock_clocksource_read(src); + put_cpu_var(xen_vcpu); return ret; } static void xen_read_wallclock(struct timespec *ts) { - const struct shared_info *s = HYPERVISOR_shared_info; - u32 version; - u64 delta; - struct timespec now; + struct shared_info *s = HYPERVISOR_shared_info; + struct pvclock_wall_clock *wall_clock = &(s->wc); + struct pvclock_vcpu_time_info *vcpu_time; - /* get wallclock at system boot */ - do { - version = s->wc_version; - rmb(); /* fetch version before time */ - now.tv_sec = s->wc_sec; - now.tv_nsec = s->wc_nsec; - rmb(); /* fetch time before checking version */ - } while ((s->wc_version & 1) | (version ^ s->wc_version)); - - delta = xen_clocksource_read(); /* time since system boot */ - delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; - - now.tv_nsec = do_div(delta, NSEC_PER_SEC); - now.tv_sec = delta; - - set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); + vcpu_time = &get_cpu_var(xen_vcpu)->time; + pvclock_read_wallclock(wall_clock, vcpu_time, ts); + put_cpu_var(xen_vcpu); } unsigned long xen_get_wallclock(void) @@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void) struct timespec ts; xen_read_wallclock(&ts); - return ts.tv_sec; } @@ -569,8 +463,6 @@ __init void xen_time_init(void) { int cpu = smp_processor_id(); - get_time_values_from_xen(); - clocksource_register(&xen_clocksource); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 9b018da48cf3..819a0331cda9 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -10,6 +10,7 @@ #define __XEN_PUBLIC_XEN_H__ #include +#include /* * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS). @@ -336,7 +337,7 @@ struct vcpu_info { uint8_t evtchn_upcall_mask; unsigned long evtchn_pending_sel; struct arch_vcpu_info arch; - struct vcpu_time_info time; + struct pvclock_vcpu_time_info time; }; /* 64 bytes (x86) */ /* @@ -384,9 +385,7 @@ struct shared_info { * Wallclock time: updated only by control software. Guests should base * their gettimeofday() syscall on this wallclock-base value. */ - uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */ - uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ - uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */ + struct pvclock_wall_clock wc; struct arch_shared_info arch; From 50d0a0f987b83a8dadb1134d834e35ec410392b5 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:31 +0200 Subject: [PATCH 60/65] KVM: Make kvm host use the paravirt clocksource structs This patch updates the kvm host code to use the pvclock structs. It also makes the paravirt clock compatible with Xen. Signed-off-by: Gerd Hoffmann Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 75 +++++++++++++++++++++++++++++++------- include/asm-x86/kvm_host.h | 4 +- 2 files changed, 65 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b08812d6b34c..63a77caa59f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { static int version; - struct kvm_wall_clock wc; - struct timespec wc_ts; + struct pvclock_wall_clock wc; + struct timespec now, sys, boot; if (!wall_clock) return; @@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); - wc_ts = current_kernel_time(); - wc.wc_sec = wc_ts.tv_sec; - wc.wc_nsec = wc_ts.tv_nsec; - wc.wc_version = version; + /* + * The guest calculates current wall clock time by adding + * system time (updated by kvm_write_guest_time below) to the + * wall clock specified here. guest system time equals host + * system time for us, thus we must fill in host boot time here. + */ + now = current_kernel_time(); + ktime_get_ts(&sys); + boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); + + wc.sec = boot.tv_sec; + wc.nsec = boot.tv_nsec; + wc.version = version; kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); @@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); } +static uint32_t div_frac(uint32_t dividend, uint32_t divisor) +{ + uint32_t quotient, remainder; + + /* Don't try to replace with do_div(), this one calculates + * "(dividend << 32) / divisor" */ + __asm__ ( "divl %4" + : "=a" (quotient), "=d" (remainder) + : "0" (0), "1" (dividend), "r" (divisor) ); + return quotient; +} + +static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) +{ + uint64_t nsecs = 1000000000LL; + int32_t shift = 0; + uint64_t tps64; + uint32_t tps32; + + tps64 = tsc_khz * 1000LL; + while (tps64 > nsecs*2) { + tps64 >>= 1; + shift--; + } + + tps32 = (uint32_t)tps64; + while (tps32 <= (uint32_t)nsecs) { + tps32 <<= 1; + shift++; + } + + hv_clock->tsc_shift = shift; + hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + + pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", + __FUNCTION__, tsc_khz, hv_clock->tsc_shift, + hv_clock->tsc_to_system_mul); +} + static void kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; @@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) if ((!vcpu->time_page)) return; + if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { + kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = tsc_khz; + } + /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, @@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate - * state, we just write "2" at the end + * state, we just increase by 2 at the end. */ - vcpu->hv_clock.version = 2; + vcpu->hv_clock.version += 2; shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + sizeof(vcpu->hv_clock)); kunmap_atomic(shared_kaddr, KM_USER0); @@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - vcpu->arch.hv_clock.tsc_to_system_mul = - clocksource_khz2mult(tsc_khz, 22); - vcpu->arch.hv_clock.tsc_shift = 22; - down_read(¤t->mm->mmap_sem); vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 1d8cd01fa514..844f2a89afbc 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -18,6 +18,7 @@ #include #include +#include #include #define KVM_MAX_VCPUS 16 @@ -282,7 +283,8 @@ struct kvm_vcpu_arch { struct x86_emulate_ctxt emulate_ctxt; gpa_t time; - struct kvm_vcpu_time_info hv_clock; + struct pvclock_vcpu_time_info hv_clock; + unsigned int hv_clock_tsc_khz; unsigned int time_offset; struct page *time_page; }; From f6e16d5ad463d15f285666f588cfe49495c692d9 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:32 +0200 Subject: [PATCH 61/65] x86: KVM guest: Use the paravirt clocksource structs and functions This patch updates the kvm host code to use the pvclock structs and functions, thereby making it compatible with Xen. The patch also fixes an initialization bug: on SMP systems the per-cpu has two different locations early at boot and after CPU bringup. kvmclock must take that in account when registering the physical address within the host. Signed-off-by: Gerd Hoffmann Signed-off-by: Avi Kivity --- arch/x86/Kconfig | 1 + arch/x86/kernel/kvmclock.c | 89 ++++++++++++++------------------------ 2 files changed, 34 insertions(+), 56 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f94bca6ff47f..e0edaaa6920a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -383,6 +383,7 @@ config VMI config KVM_CLOCK bool "KVM paravirtualized clock" select PARAVIRT + select PARAVIRT_CLOCK depends on !(X86_VISWS || X86_VOYAGER) help Turning on this option will allow you to run a paravirtualized clock diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 08a30986d472..87edf1ceb1df 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg) early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); -#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field +static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); +static struct pvclock_wall_clock wall_clock; -static inline u64 kvm_get_delta(u64 last_tsc) -{ - int cpu = smp_processor_id(); - u64 delta = native_read_tsc() - last_tsc; - return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; -} - -static struct kvm_wall_clock wall_clock; -static cycle_t kvm_clock_read(void); /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data. So we try to account for @@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void); */ static unsigned long kvm_get_wallclock(void) { - u32 wc_sec, wc_nsec; - u64 delta; + struct pvclock_vcpu_time_info *vcpu_time; struct timespec ts; - int version, nsec; int low, high; low = (int)__pa(&wall_clock); high = ((u64)__pa(&wall_clock) >> 32); - - delta = kvm_clock_read(); - native_write_msr(MSR_KVM_WALL_CLOCK, low, high); - do { - version = wall_clock.wc_version; - rmb(); - wc_sec = wall_clock.wc_sec; - wc_nsec = wall_clock.wc_nsec; - rmb(); - } while ((wall_clock.wc_version != version) || (version & 1)); - delta = kvm_clock_read() - delta; - delta += wc_nsec; - nsec = do_div(delta, NSEC_PER_SEC); - set_normalized_timespec(&ts, wc_sec + delta, nsec); - /* - * Of all mechanisms of time adjustment I've tested, this one - * was the champion! - */ - return ts.tv_sec + 1; + vcpu_time = &get_cpu_var(hv_clock); + pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); + put_cpu_var(hv_clock); + + return ts.tv_sec; } static int kvm_set_wallclock(unsigned long now) { - return 0; + return -1; } -/* - * This is our read_clock function. The host puts an tsc timestamp each time - * it updates a new time. Without the tsc adjustment, we can have a situation - * in which a vcpu starts to run earlier (smaller system_time), but probes - * time later (compared to another vcpu), leading to backwards time - */ static cycle_t kvm_clock_read(void) { - u64 last_tsc, now; - int cpu; + struct pvclock_vcpu_time_info *src; + cycle_t ret; - preempt_disable(); - cpu = smp_processor_id(); - - last_tsc = get_clock(cpu, tsc_timestamp); - now = get_clock(cpu, system_time); - - now += kvm_get_delta(last_tsc); - preempt_enable(); - - return now; + src = &get_cpu_var(hv_clock); + ret = pvclock_clocksource_read(src); + put_cpu_var(hv_clock); + return ret; } + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_read, @@ -123,13 +88,14 @@ static struct clocksource kvm_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static int kvm_register_clock(void) +static int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high; low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); - + printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", + cpu, high, low, txt); return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); } @@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void) * Now that the first cpu already had this clocksource initialized, * we shouldn't fail. */ - WARN_ON(kvm_register_clock()); + WARN_ON(kvm_register_clock("secondary cpu clock")); /* ok, done with our trickery, call native */ setup_secondary_APIC_clock(); } #endif +#ifdef CONFIG_SMP +void __init kvm_smp_prepare_boot_cpu(void) +{ + WARN_ON(kvm_register_clock("primary cpu clock")); + native_smp_prepare_boot_cpu(); +} +#endif + /* * After the clock is registered, the host will keep writing to the * registered memory location. If the guest happens to shutdown, this memory @@ -174,13 +148,16 @@ void __init kvmclock_init(void) return; if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { - if (kvm_register_clock()) + if (kvm_register_clock("boot clock")) return; pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; #ifdef CONFIG_X86_LOCAL_APIC pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; +#endif +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; #endif machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC From 6b1ed9086592fd4b066daae222751bb6757ca5eb Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:33 +0200 Subject: [PATCH 62/65] KVM: Remove now unused structs from kvm_para.h The kvm_* structs are obsoleted by the pvclock_* ones. Now all users have been switched over and the old structs can be dropped. Signed-off-by: Gerd Hoffmann Signed-off-by: Avi Kivity --- include/asm-x86/kvm_para.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h index 509845942070..bfd9900742bf 100644 --- a/include/asm-x86/kvm_para.h +++ b/include/asm-x86/kvm_para.h @@ -48,24 +48,6 @@ struct kvm_mmu_op_release_pt { #ifdef __KERNEL__ #include -/* xen binary-compatible interface. See xen headers for details */ -struct kvm_vcpu_time_info { - uint32_t version; - uint32_t pad0; - uint64_t tsc_timestamp; - uint64_t system_time; - uint32_t tsc_to_system_mul; - int8_t tsc_shift; - int8_t pad[3]; -} __attribute__((__packed__)); /* 32 bytes */ - -struct kvm_wall_clock { - uint32_t wc_version; - uint32_t wc_sec; - uint32_t wc_nsec; -} __attribute__((__packed__)); - - extern void kvmclock_init(void); From ea7b44c8e6baa1a4507f05ba2c0009ac21c3fe0b Mon Sep 17 00:00:00 2001 From: Jie Luo Date: Tue, 24 Jun 2008 10:38:31 -0700 Subject: [PATCH 63/65] enable bus mastering on i915 at resume time On 9xx chips, bus mastering needs to be enabled at resume time for much of the chip to function. With this patch, vblank interrupts will work as expected on resume, along with other chip functions. Fixes kernel bugzilla #10844. Signed-off-by: Jie Luo Signed-off-by: Jesse Barnes Signed-off-by: Linus Torvalds --- drivers/char/drm/i915_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/drm/i915_drv.c b/drivers/char/drm/i915_drv.c index e8f3d682e3b1..93aed1c38bd2 100644 --- a/drivers/char/drm/i915_drv.c +++ b/drivers/char/drm/i915_drv.c @@ -389,6 +389,7 @@ static int i915_resume(struct drm_device *dev) pci_restore_state(dev->pdev); if (pci_enable_device(dev->pdev)) return -1; + pci_set_master(dev->pdev); pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB); From 543cf4cb3fe6f6cae3651ba918b9c56200b257d0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 24 Jun 2008 18:58:20 -0700 Subject: [PATCH 64/65] Linux 2.6.26-rc8 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2b4977c9844e..6aff5f47c21d 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 26 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc8 NAME = Rotary Wombat # *DOCUMENTATION* From 4ed47896935573c8423d05bddda3f269d6e6c613 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 3 Jul 2008 16:11:06 +0100 Subject: [PATCH 65/65] [ARM] mach-types update Update mach-types. Remove invalid or incorrect entries. Signed-off-by: Russell King --- arch/arm/tools/mach-types | 126 +++++++++++++++++++++++++++++++++----- 1 file changed, 109 insertions(+), 17 deletions(-) diff --git a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types index 207a8b5a0c4a..0be5630ff568 100644 --- a/arch/arm/tools/mach-types +++ b/arch/arm/tools/mach-types @@ -12,7 +12,7 @@ # # http://www.arm.linux.org.uk/developer/machines/?action=new # -# Last update: Sat Apr 19 11:23:38 2008 +# Last update: Mon Jul 7 16:25:39 2008 # # machine_is_xxx CONFIG_xxxx MACH_TYPE_xxx number # @@ -560,7 +560,6 @@ husky MACH_HUSKY HUSKY 543 boxer MACH_BOXER BOXER 544 shepherd MACH_SHEPHERD SHEPHERD 545 aml42800aa MACH_AML42800AA AML42800AA 546 -ml674001 MACH_MACH_TYPE_ML674001 MACH_TYPE_ML674001 547 lpc2294 MACH_LPC2294 LPC2294 548 switchgrass MACH_SWITCHGRASS SWITCHGRASS 549 ens_cmu MACH_ENS_CMU ENS_CMU 550 @@ -748,7 +747,6 @@ anubis MACH_ANUBIS ANUBIS 734 ite8152 MACH_ITE8152 ITE8152 735 lpc3xxx MACH_LPC3XXX LPC3XXX 736 puppeteer MACH_PUPPETEER PUPPETEER 737 -vt001 MACH_MACH_VADATECH MACH_VADATECH 738 e570 MACH_E570 E570 739 x50 MACH_X50 X50 740 recon MACH_RECON RECON 741 @@ -839,7 +837,7 @@ ccxp270 MACH_CCXP CCXP 825 omap_gsample MACH_OMAP_GSAMPLE OMAP_GSAMPLE 826 realview_eb MACH_REALVIEW_EB REALVIEW_EB 827 samoa MACH_SAMOA SAMOA 828 -t3xscale MACH_T3XSCALE T3XSCALE 829 +palmt3 MACH_PALMT3 PALMT3 829 i878 MACH_I878 I878 830 borzoi MACH_BORZOI BORZOI 831 gecko MACH_GECKO GECKO 832 @@ -895,7 +893,7 @@ mio8390 MACH_MIO8390 MIO8390 881 omi_board MACH_OMI_BOARD OMI_BOARD 882 mx21civ MACH_MX21CIV MX21CIV 883 mahi_cdac MACH_MAHI_CDAC MAHI_CDAC 884 -xscale_palmtx MACH_XSCALE_PALMTX XSCALE_PALMTX 885 +palmtx MACH_PALMTX PALMTX 885 s3c2413 MACH_S3C2413 S3C2413 887 samsys_ep0 MACH_SAMSYS_EP0 SAMSYS_EP0 888 wg302v1 MACH_WG302V1 WG302V1 889 @@ -918,7 +916,7 @@ nxdb500 MACH_NXDB500 NXDB500 905 apf9328 MACH_APF9328 APF9328 906 omap_wipoq MACH_OMAP_WIPOQ OMAP_WIPOQ 907 omap_twip MACH_OMAP_TWIP OMAP_TWIP 908 -xscale_treo650 MACH_XSCALE_PALMTREO650 XSCALE_PALMTREO650 909 +palmtreo650 MACH_PALMTREO650 PALMTREO650 909 acumen MACH_ACUMEN ACUMEN 910 xp100 MACH_XP100 XP100 911 fs2410 MACH_FS2410 FS2410 912 @@ -926,8 +924,8 @@ pxa270_cerf MACH_PXA270_CERF PXA270_CERF 913 sq2ftlpalm MACH_SQ2FTLPALM SQ2FTLPALM 914 bsemserver MACH_BSEMSERVER BSEMSERVER 915 netclient MACH_NETCLIENT NETCLIENT 916 -xscale_palmtt5 MACH_XSCALE_PALMTT5 XSCALE_PALMTT5 917 -xscale_palmtc MACH_OMAP_PALMTC OMAP_PALMTC 918 +palmt5 MACH_PALMT5 PALMT5 917 +palmtc MACH_PALMTC PALMTC 918 omap_apollon MACH_OMAP_APOLLON OMAP_APOLLON 919 mxc30030evb MACH_MXC30030EVB MXC30030EVB 920 rea_2d MACH_REA_2D REA_2D 921 @@ -1220,7 +1218,6 @@ empca400 MACH_EMPCA400 EMPCA400 1211 em7210 MACH_EM7210 EM7210 1212 htchermes MACH_HTCHERMES HTCHERMES 1213 eti_c1 MACH_ETI_C1 ETI_C1 1214 -mach_dep2410 MACH_MACH_DEP2410 MACH_DEP2410 1215 ac100 MACH_AC100 AC100 1216 sneetch MACH_SNEETCH SNEETCH 1217 studentmate MACH_STUDENTMATE STUDENTMATE 1218 @@ -1421,10 +1418,10 @@ looxc550 MACH_LOOXC550 LOOXC550 1417 cnty_titan MACH_CNTY_TITAN CNTY_TITAN 1418 app3xx MACH_APP3XX APP3XX 1419 sideoatsgrama MACH_SIDEOATSGRAMA SIDEOATSGRAMA 1420 -xscale_palmt700p MACH_XSCALE_PALMT700P XSCALE_PALMT700P 1421 -xscale_palmt700w MACH_XSCALE_PALMT700W XSCALE_PALMT700W 1422 -xscale_palmt750 MACH_XSCALE_PALMT750 XSCALE_PALMT750 1423 -xscale_palmt755p MACH_XSCALE_PALMT755P XSCALE_PALMT755P 1424 +palmtreo700p MACH_PALMTREO700P PALMTREO700P 1421 +palmtreo700w MACH_PALMTREO700W PALMTREO700W 1422 +palmtreo750 MACH_PALMTREO750 PALMTREO750 1423 +palmtreo755p MACH_PALMTREO755P PALMTREO755P 1424 ezreganut9200 MACH_EZREGANUT9200 EZREGANUT9200 1425 sarge MACH_SARGE SARGE 1426 a696 MACH_A696 A696 1427 @@ -1463,7 +1460,7 @@ artemis MACH_ARTEMIS ARTEMIS 1462 htctitan MACH_HTCTITAN HTCTITAN 1463 qranium MACH_QRANIUM QRANIUM 1464 adx_wsc2 MACH_ADX_WSC2 ADX_WSC2 1465 -adx_medcom MACH_ADX_MEDINET ADX_MEDINET 1466 +adx_medcom MACH_ADX_MEDCOM ADX_MEDCOM 1466 bboard MACH_BBOARD BBOARD 1467 cambria MACH_CAMBRIA CAMBRIA 1468 mt7xxx MACH_MT7XXX MT7XXX 1469 @@ -1519,7 +1516,7 @@ wp188 MACH_WP188 WP188 1518 corsica MACH_CORSICA CORSICA 1519 bigeye MACH_BIGEYE BIGEYE 1520 tll5000 MACH_TLL5000 TLL5000 1522 -hni270 MACH_HNI_X270 HNI_X270 1523 +bebot MACH_BEBOT BEBOT 1523 qong MACH_QONG QONG 1524 tcompact MACH_TCOMPACT TCOMPACT 1525 puma5 MACH_PUMA5 PUMA5 1526 @@ -1636,7 +1633,6 @@ awlug4lcu MACH_AWLUG4LCU AWLUG4LCU 1637 palermoc MACH_PALERMOC PALERMOC 1638 omap_ldp MACH_OMAP_LDP OMAP_LDP 1639 ip500 MACH_IP500 IP500 1640 -mx35ads MACH_MACH_MX35ADS MACH_MX35ADS 1641 ase2 MACH_ASE2 ASE2 1642 mx35evb MACH_MX35EVB MX35EVB 1643 aml_m8050 MACH_AML_M8050 AML_M8050 1644 @@ -1647,7 +1643,7 @@ badger MACH_BADGER BADGER 1648 trizeps4wl MACH_TRIZEPS4WL TRIZEPS4WL 1649 trizeps5 MACH_TRIZEPS5 TRIZEPS5 1650 marlin MACH_MARLIN MARLIN 1651 -ts7800 MACH_TS7800 TS7800 1652 +ts78xx MACH_TS78XX TS78XX 1652 hpipaq214 MACH_HPIPAQ214 HPIPAQ214 1653 at572d940dcm MACH_AT572D940DCM AT572D940DCM 1654 ne1board MACH_NE1BOARD NE1BOARD 1655 @@ -1720,3 +1716,99 @@ htc_kaiser MACH_HTC_KAISER HTC_KAISER 1724 lg_ks20 MACH_LG_KS20 LG_KS20 1725 hhgps MACH_HHGPS HHGPS 1726 nokia_n810_wimax MACH_NOKIA_N810_WIMAX NOKIA_N810_WIMAX 1727 +insight MACH_INSIGHT INSIGHT 1728 +sapphire MACH_SAPPHIRE SAPPHIRE 1729 +csb637xo MACH_CSB637XO CSB637XO 1730 +evisiong MACH_EVISIONG EVISIONG 1731 +stmp37xx MACH_STMP37XX STMP37XX 1732 +stmp378x MACH_STMP38XX STMP38XX 1733 +tnt MACH_TNT TNT 1734 +tbxt MACH_TBXT TBXT 1735 +playmate MACH_PLAYMATE PLAYMATE 1736 +pns10 MACH_PNS10 PNS10 1737 +eznavi MACH_EZNAVI EZNAVI 1738 +ps4000 MACH_PS4000 PS4000 1739 +ezx_a780 MACH_EZX_A780 EZX_A780 1740 +ezx_e680 MACH_EZX_E680 EZX_E680 1741 +ezx_a1200 MACH_EZX_A1200 EZX_A1200 1742 +ezx_e6 MACH_EZX_E6 EZX_E6 1743 +ezx_e2 MACH_EZX_E2 EZX_E2 1744 +ezx_a910 MACH_EZX_A910 EZX_A910 1745 +cwmx31 MACH_CWMX31 CWMX31 1746 +sl2312 MACH_SL2312 SL2312 1747 +blenny MACH_BLENNY BLENNY 1748 +ds107 MACH_DS107 DS107 1749 +dsx07 MACH_DSX07 DSX07 1750 +picocom1 MACH_PICOCOM1 PICOCOM1 1751 +lynx_wolverine MACH_LYNX_WOLVERINE LYNX_WOLVERINE 1752 +ubisys_p9_sc19 MACH_UBISYS_P9_SC19 UBISYS_P9_SC19 1753 +kratos_low MACH_KRATOS_LOW KRATOS_LOW 1754 +m700 MACH_M700 M700 1755 +edmini_v2 MACH_EDMINI_V2 EDMINI_V2 1756 +zipit2 MACH_ZIPIT2 ZIPIT2 1757 +hslfemtocell MACH_HSLFEMTOCELL HSLFEMTOCELL 1758 +daintree_at91 MACH_DAINTREE_AT91 DAINTREE_AT91 1759 +sg560usb MACH_SG560USB SG560USB 1760 +omap3_pandora MACH_OMAP3_PANDORA OMAP3_PANDORA 1761 +usr8200 MACH_USR8200 USR8200 1762 +s1s65k MACH_S1S65K S1S65K 1763 +s2s65a MACH_S2S65A S2S65A 1764 +icore MACH_ICORE ICORE 1765 +mss2 MACH_MSS2 MSS2 1766 +belmont MACH_BELMONT BELMONT 1767 +asusp525 MACH_ASUSP525 ASUSP525 1768 +lb88rc8480 MACH_LB88RC8480 LB88RC8480 1769 +hipxa MACH_HIPXA HIPXA 1770 +mx25_3ds MACH_MX25_3DS MX25_3DS 1771 +m800 MACH_M800 M800 1772 +omap3530_lv_som MACH_OMAP3530_LV_SOM OMAP3530_LV_SOM 1773 +prima_evb MACH_PRIMA_EVB PRIMA_EVB 1774 +mx31bt1 MACH_MX31BT1 MX31BT1 1775 +atlas4_evb MACH_ATLAS4_EVB ATLAS4_EVB 1776 +mx31cicada MACH_MX31CICADA MX31CICADA 1777 +mi424wr MACH_MI424WR MI424WR 1778 +axs_ultrax MACH_AXS_ULTRAX AXS_ULTRAX 1779 +at572d940deb MACH_AT572D940DEB AT572D940DEB 1780 +davinci_da8xx_evm MACH_DAVINCI_DA8XX_EVM DAVINCI_DA8XX_EVM 1781 +ep9302 MACH_EP9302 EP9302 1782 +at572d940hfeb MACH_AT572D940HFEB AT572D940HFEB 1783 +cybook3 MACH_CYBOOK3 CYBOOK3 1784 +wdg002 MACH_WDG002 WDG002 1785 +sg560adsl MACH_SG560ADSL SG560ADSL 1786 +nextio_n2800_ica MACH_NEXTIO_N2800_ICA NEXTIO_N2800_ICA 1787 +marvell_newdb MACH_MARVELL_NEWDB MARVELL_NEWDB 1789 +vandihud MACH_VANDIHUD VANDIHUD 1790 +magx_e8 MACH_MAGX_E8 MAGX_E8 1791 +magx_z6 MACH_MAGX_Z6 MAGX_Z6 1792 +magx_v8 MACH_MAGX_V8 MAGX_V8 1793 +magx_u9 MACH_MAGX_U9 MAGX_U9 1794 +toughcf08 MACH_TOUGHCF08 TOUGHCF08 1795 +zw4400 MACH_ZW4400 ZW4400 1796 +marat91 MACH_MARAT91 MARAT91 1797 +overo MACH_OVERO OVERO 1798 +at2440evb MACH_AT2440EVB AT2440EVB 1799 +neocore926 MACH_NEOCORE926 NEOCORE926 1800 +wnr854t MACH_WNR854T WNR854T 1801 +imx27 MACH_IMX27 IMX27 1802 +moose_db MACH_MOOSE_DB MOOSE_DB 1803 +fab4 MACH_FAB4 FAB4 1804 +htcdiamond MACH_HTCDIAMOND HTCDIAMOND 1805 +fiona MACH_FIONA FIONA 1806 +mxc30030_x MACH_MXC30030_X MXC30030_X 1807 +bmp1000 MACH_BMP1000 BMP1000 1808 +logi9200 MACH_LOGI9200 LOGI9200 1809 +tqma31 MACH_TQMA31 TQMA31 1810 +ccw9p9215js MACH_CCW9P9215JS CCW9P9215JS 1811 +rd88f5181l_ge MACH_RD88F5181L_GE RD88F5181L_GE 1812 +sifmain MACH_SIFMAIN SIFMAIN 1813 +sam9_l9261 MACH_SAM9_L9261 SAM9_L9261 1814 +cc9m2443js MACH_CC9M2443JS CC9M2443JS 1815 +xaria300 MACH_XARIA300 XARIA300 1816 +it9200 MACH_IT9200 IT9200 1817 +rd88f5181l_fxo MACH_RD88F5181L_FXO RD88F5181L_FXO 1818 +kriss_sensor MACH_KRISS_SENSOR KRISS_SENSOR 1819 +pilz_pmi5 MACH_PILZ_PMI5 PILZ_PMI5 1820 +jade MACH_JADE JADE 1821 +ks8695_softplc MACH_KS8695_SOFTPLC KS8695_SOFTPLC 1822 +gprisc4 MACH_GPRISC4 GPRISC4 1823 +stamp9260 MACH_STAMP9260 STAMP9260 1824