diff --git a/.cocciconfig b/.cocciconfig new file mode 100644 index 000000000000..43967c6b2015 --- /dev/null +++ b/.cocciconfig @@ -0,0 +1,3 @@ +[spatch] + options = --timeout 200 + options = --use-gitgrep diff --git a/.gitignore b/.gitignore index 0c320bf02586..c2ed4ecb0acd 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,7 @@ modules.builtin Module.symvers *.dwo *.su +*.c.[012]*.* # # Top-level generic files @@ -66,6 +67,7 @@ Module.symvers # !.gitignore !.mailmap +!.cocciconfig # # Generated include files diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 1179850f453c..c55df2911136 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -78,422 +78,111 @@ CONFIG_PCI_MSI option. 4.2 Using MSI -Most of the hard work is done for the driver in the PCI layer. It simply -has to request that the PCI layer set up the MSI capability for this +Most of the hard work is done for the driver in the PCI layer. The driver +simply has to request that the PCI layer set up the MSI capability for this device. -4.2.1 pci_enable_msi +To automatically use MSI or MSI-X interrupt vectors, use the following +function: -int pci_enable_msi(struct pci_dev *dev) + int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags); -A successful call allocates ONE interrupt to the device, regardless -of how many MSIs the device supports. The device is switched from -pin-based interrupt mode to MSI mode. The dev->irq number is changed -to a new number which represents the message signaled interrupt; -consequently, this function should be called before the driver calls -request_irq(), because an MSI is delivered via a vector that is -different from the vector of a pin-based interrupt. +which allocates up to max_vecs interrupt vectors for a PCI device. It +returns the number of vectors allocated or a negative error. If the device +has a requirements for a minimum number of vectors the driver can pass a +min_vecs argument set to this limit, and the PCI core will return -ENOSPC +if it can't meet the minimum number of vectors. -4.2.2 pci_enable_msi_range +The flags argument should normally be set to 0, but can be used to pass the +PCI_IRQ_NOMSI and PCI_IRQ_NOMSIX flag in case a device claims to support +MSI or MSI-X, but the support is broken, or to pass PCI_IRQ_NOLEGACY in +case the device does not support legacy interrupt lines. -int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) +By default this function will spread the interrupts around the available +CPUs, but this feature can be disabled by passing the PCI_IRQ_NOAFFINITY +flag. -This function allows a device driver to request any number of MSI -interrupts within specified range from 'minvec' to 'maxvec'. +To get the Linux IRQ numbers passed to request_irq() and free_irq() and the +vectors, use the following function: -If this function returns a positive number it indicates the number of -MSI interrupts that have been successfully allocated. In this case -the device is switched from pin-based interrupt mode to MSI mode and -updates dev->irq to be the lowest of the new interrupts assigned to it. -The other interrupts assigned to the device are in the range dev->irq -to dev->irq + returned value - 1. Device driver can use the returned -number of successfully allocated MSI interrupts to further allocate -and initialize device resources. + int pci_irq_vector(struct pci_dev *dev, unsigned int nr); -If this function returns a negative number, it indicates an error and -the driver should not attempt to request any more MSI interrupts for -this device. +Any allocated resources should be freed before removing the device using +the following function: -This function should be called before the driver calls request_irq(), -because MSI interrupts are delivered via vectors that are different -from the vector of a pin-based interrupt. + void pci_free_irq_vectors(struct pci_dev *dev); -It is ideal if drivers can cope with a variable number of MSI interrupts; -there are many reasons why the platform may not be able to provide the -exact number that a driver asks for. +If a device supports both MSI-X and MSI capabilities, this API will use the +MSI-X facilities in preference to the MSI facilities. MSI-X supports any +number of interrupts between 1 and 2048. In contrast, MSI is restricted to +a maximum of 32 interrupts (and must be a power of two). In addition, the +MSI interrupt vectors must be allocated consecutively, so the system might +not be able to allocate as many vectors for MSI as it could for MSI-X. On +some platforms, MSI interrupts must all be targeted at the same set of CPUs +whereas MSI-X interrupts can all be targeted at different CPUs. -There could be devices that can not operate with just any number of MSI -interrupts within a range. See chapter 4.3.1.3 to get the idea how to -handle such devices for MSI-X - the same logic applies to MSI. +If a device supports neither MSI-X or MSI it will fall back to a single +legacy IRQ vector. -4.2.1.1 Maximum possible number of MSI interrupts +The typical usage of MSI or MSI-X interrupts is to allocate as many vectors +as possible, likely up to the limit supported by the device. If nvec is +larger than the number supported by the device it will automatically be +capped to the supported limit, so there is no need to query the number of +vectors supported beforehand: -The typical usage of MSI interrupts is to allocate as many vectors as -possible, likely up to the limit returned by pci_msi_vec_count() function: - -static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) -{ - return pci_enable_msi_range(pdev, 1, nvec); -} - -Note the value of 'minvec' parameter is 1. As 'minvec' is inclusive, -the value of 0 would be meaningless and could result in error. - -Some devices have a minimal limit on number of MSI interrupts. -In this case the function could look like this: - -static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) -{ - return pci_enable_msi_range(pdev, FOO_DRIVER_MINIMUM_NVEC, nvec); -} - -4.2.1.2 Exact number of MSI interrupts + nvec = pci_alloc_irq_vectors(pdev, 1, nvec, 0); + if (nvec < 0) + goto out_err; If a driver is unable or unwilling to deal with a variable number of MSI -interrupts it could request a particular number of interrupts by passing -that number to pci_enable_msi_range() function as both 'minvec' and 'maxvec' -parameters: +interrupts it can request a particular number of interrupts by passing that +number to pci_alloc_irq_vectors() function as both 'min_vecs' and +'max_vecs' parameters: -static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) -{ - return pci_enable_msi_range(pdev, nvec, nvec); -} + ret = pci_alloc_irq_vectors(pdev, nvec, nvec, 0); + if (ret < 0) + goto out_err; -Note, unlike pci_enable_msi_exact() function, which could be also used to -enable a particular number of MSI-X interrupts, pci_enable_msi_range() -returns either a negative errno or 'nvec' (not negative errno or 0 - as -pci_enable_msi_exact() does). +The most notorious example of the request type described above is enabling +the single MSI mode for a device. It could be done by passing two 1s as +'min_vecs' and 'max_vecs': -4.2.1.3 Single MSI mode + ret = pci_alloc_irq_vectors(pdev, 1, 1, 0); + if (ret < 0) + goto out_err; -The most notorious example of the request type described above is -enabling the single MSI mode for a device. It could be done by passing -two 1s as 'minvec' and 'maxvec': +Some devices might not support using legacy line interrupts, in which case +the PCI_IRQ_NOLEGACY flag can be used to fail the request if the platform +can't provide MSI or MSI-X interrupts: -static int foo_driver_enable_single_msi(struct pci_dev *pdev) -{ - return pci_enable_msi_range(pdev, 1, 1); -} + nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_NOLEGACY); + if (nvec < 0) + goto out_err; -Note, unlike pci_enable_msi() function, which could be also used to -enable the single MSI mode, pci_enable_msi_range() returns either a -negative errno or 1 (not negative errno or 0 - as pci_enable_msi() -does). +4.3 Legacy APIs -4.2.3 pci_enable_msi_exact +The following old APIs to enable and disable MSI or MSI-X interrupts should +not be used in new code: -int pci_enable_msi_exact(struct pci_dev *dev, int nvec) + pci_enable_msi() /* deprecated */ + pci_enable_msi_range() /* deprecated */ + pci_enable_msi_exact() /* deprecated */ + pci_disable_msi() /* deprecated */ + pci_enable_msix_range() /* deprecated */ + pci_enable_msix_exact() /* deprecated */ + pci_disable_msix() /* deprecated */ -This variation on pci_enable_msi_range() call allows a device driver to -request exactly 'nvec' MSIs. +Additionally there are APIs to provide the number of supported MSI or MSI-X +vectors: pci_msi_vec_count() and pci_msix_vec_count(). In general these +should be avoided in favor of letting pci_alloc_irq_vectors() cap the +number of vectors. If you have a legitimate special use case for the count +of vectors we might have to revisit that decision and add a +pci_nr_irq_vectors() helper that handles MSI and MSI-X transparently. -If this function returns a negative number, it indicates an error and -the driver should not attempt to request any more MSI interrupts for -this device. +4.4 Considerations when using MSIs -By contrast with pci_enable_msi_range() function, pci_enable_msi_exact() -returns zero in case of success, which indicates MSI interrupts have been -successfully allocated. - -4.2.4 pci_disable_msi - -void pci_disable_msi(struct pci_dev *dev) - -This function should be used to undo the effect of pci_enable_msi_range(). -Calling it restores dev->irq to the pin-based interrupt number and frees -the previously allocated MSIs. The interrupts may subsequently be assigned -to another device, so drivers should not cache the value of dev->irq. - -Before calling this function, a device driver must always call free_irq() -on any interrupt for which it previously called request_irq(). -Failure to do so results in a BUG_ON(), leaving the device with -MSI enabled and thus leaking its vector. - -4.2.4 pci_msi_vec_count - -int pci_msi_vec_count(struct pci_dev *dev) - -This function could be used to retrieve the number of MSI vectors the -device requested (via the Multiple Message Capable register). The MSI -specification only allows the returned value to be a power of two, -up to a maximum of 2^5 (32). - -If this function returns a negative number, it indicates the device is -not capable of sending MSIs. - -If this function returns a positive number, it indicates the maximum -number of MSI interrupt vectors that could be allocated. - -4.3 Using MSI-X - -The MSI-X capability is much more flexible than the MSI capability. -It supports up to 2048 interrupts, each of which can be controlled -independently. To support this flexibility, drivers must use an array of -`struct msix_entry': - -struct msix_entry { - u16 vector; /* kernel uses to write alloc vector */ - u16 entry; /* driver uses to specify entry */ -}; - -This allows for the device to use these interrupts in a sparse fashion; -for example, it could use interrupts 3 and 1027 and yet allocate only a -two-element array. The driver is expected to fill in the 'entry' value -in each element of the array to indicate for which entries the kernel -should assign interrupts; it is invalid to fill in two entries with the -same number. - -4.3.1 pci_enable_msix_range - -int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, - int minvec, int maxvec) - -Calling this function asks the PCI subsystem to allocate any number of -MSI-X interrupts within specified range from 'minvec' to 'maxvec'. -The 'entries' argument is a pointer to an array of msix_entry structs -which should be at least 'maxvec' entries in size. - -On success, the device is switched into MSI-X mode and the function -returns the number of MSI-X interrupts that have been successfully -allocated. In this case the 'vector' member in entries numbered from -0 to the returned value - 1 is populated with the interrupt number; -the driver should then call request_irq() for each 'vector' that it -decides to use. The device driver is responsible for keeping track of the -interrupts assigned to the MSI-X vectors so it can free them again later. -Device driver can use the returned number of successfully allocated MSI-X -interrupts to further allocate and initialize device resources. - -If this function returns a negative number, it indicates an error and -the driver should not attempt to allocate any more MSI-X interrupts for -this device. - -This function, in contrast with pci_enable_msi_range(), does not adjust -dev->irq. The device will not generate interrupts for this interrupt -number once MSI-X is enabled. - -Device drivers should normally call this function once per device -during the initialization phase. - -It is ideal if drivers can cope with a variable number of MSI-X interrupts; -there are many reasons why the platform may not be able to provide the -exact number that a driver asks for. - -There could be devices that can not operate with just any number of MSI-X -interrupts within a range. E.g., an network adapter might need let's say -four vectors per each queue it provides. Therefore, a number of MSI-X -interrupts allocated should be a multiple of four. In this case interface -pci_enable_msix_range() can not be used alone to request MSI-X interrupts -(since it can allocate any number within the range, without any notion of -the multiple of four) and the device driver should master a custom logic -to request the required number of MSI-X interrupts. - -4.3.1.1 Maximum possible number of MSI-X interrupts - -The typical usage of MSI-X interrupts is to allocate as many vectors as -possible, likely up to the limit returned by pci_msix_vec_count() function: - -static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) -{ - return pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - 1, nvec); -} - -Note the value of 'minvec' parameter is 1. As 'minvec' is inclusive, -the value of 0 would be meaningless and could result in error. - -Some devices have a minimal limit on number of MSI-X interrupts. -In this case the function could look like this: - -static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) -{ - return pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - FOO_DRIVER_MINIMUM_NVEC, nvec); -} - -4.3.1.2 Exact number of MSI-X interrupts - -If a driver is unable or unwilling to deal with a variable number of MSI-X -interrupts it could request a particular number of interrupts by passing -that number to pci_enable_msix_range() function as both 'minvec' and 'maxvec' -parameters: - -static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) -{ - return pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - nvec, nvec); -} - -Note, unlike pci_enable_msix_exact() function, which could be also used to -enable a particular number of MSI-X interrupts, pci_enable_msix_range() -returns either a negative errno or 'nvec' (not negative errno or 0 - as -pci_enable_msix_exact() does). - -4.3.1.3 Specific requirements to the number of MSI-X interrupts - -As noted above, there could be devices that can not operate with just any -number of MSI-X interrupts within a range. E.g., let's assume a device that -is only capable sending the number of MSI-X interrupts which is a power of -two. A routine that enables MSI-X mode for such device might look like this: - -/* - * Assume 'minvec' and 'maxvec' are non-zero - */ -static int foo_driver_enable_msix(struct foo_adapter *adapter, - int minvec, int maxvec) -{ - int rc; - - minvec = roundup_pow_of_two(minvec); - maxvec = rounddown_pow_of_two(maxvec); - - if (minvec > maxvec) - return -ERANGE; - -retry: - rc = pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - maxvec, maxvec); - /* - * -ENOSPC is the only error code allowed to be analyzed - */ - if (rc == -ENOSPC) { - if (maxvec == 1) - return -ENOSPC; - - maxvec /= 2; - - if (minvec > maxvec) - return -ENOSPC; - - goto retry; - } - - return rc; -} - -Note how pci_enable_msix_range() return value is analyzed for a fallback - -any error code other than -ENOSPC indicates a fatal error and should not -be retried. - -4.3.2 pci_enable_msix_exact - -int pci_enable_msix_exact(struct pci_dev *dev, - struct msix_entry *entries, int nvec) - -This variation on pci_enable_msix_range() call allows a device driver to -request exactly 'nvec' MSI-Xs. - -If this function returns a negative number, it indicates an error and -the driver should not attempt to allocate any more MSI-X interrupts for -this device. - -By contrast with pci_enable_msix_range() function, pci_enable_msix_exact() -returns zero in case of success, which indicates MSI-X interrupts have been -successfully allocated. - -Another version of a routine that enables MSI-X mode for a device with -specific requirements described in chapter 4.3.1.3 might look like this: - -/* - * Assume 'minvec' and 'maxvec' are non-zero - */ -static int foo_driver_enable_msix(struct foo_adapter *adapter, - int minvec, int maxvec) -{ - int rc; - - minvec = roundup_pow_of_two(minvec); - maxvec = rounddown_pow_of_two(maxvec); - - if (minvec > maxvec) - return -ERANGE; - -retry: - rc = pci_enable_msix_exact(adapter->pdev, - adapter->msix_entries, maxvec); - - /* - * -ENOSPC is the only error code allowed to be analyzed - */ - if (rc == -ENOSPC) { - if (maxvec == 1) - return -ENOSPC; - - maxvec /= 2; - - if (minvec > maxvec) - return -ENOSPC; - - goto retry; - } else if (rc < 0) { - return rc; - } - - return maxvec; -} - -4.3.3 pci_disable_msix - -void pci_disable_msix(struct pci_dev *dev) - -This function should be used to undo the effect of pci_enable_msix_range(). -It frees the previously allocated MSI-X interrupts. The interrupts may -subsequently be assigned to another device, so drivers should not cache -the value of the 'vector' elements over a call to pci_disable_msix(). - -Before calling this function, a device driver must always call free_irq() -on any interrupt for which it previously called request_irq(). -Failure to do so results in a BUG_ON(), leaving the device with -MSI-X enabled and thus leaking its vector. - -4.3.3 The MSI-X Table - -The MSI-X capability specifies a BAR and offset within that BAR for the -MSI-X Table. This address is mapped by the PCI subsystem, and should not -be accessed directly by the device driver. If the driver wishes to -mask or unmask an interrupt, it should call disable_irq() / enable_irq(). - -4.3.4 pci_msix_vec_count - -int pci_msix_vec_count(struct pci_dev *dev) - -This function could be used to retrieve number of entries in the device -MSI-X table. - -If this function returns a negative number, it indicates the device is -not capable of sending MSI-Xs. - -If this function returns a positive number, it indicates the maximum -number of MSI-X interrupt vectors that could be allocated. - -4.4 Handling devices implementing both MSI and MSI-X capabilities - -If a device implements both MSI and MSI-X capabilities, it can -run in either MSI mode or MSI-X mode, but not both simultaneously. -This is a requirement of the PCI spec, and it is enforced by the -PCI layer. Calling pci_enable_msi_range() when MSI-X is already -enabled or pci_enable_msix_range() when MSI is already enabled -results in an error. If a device driver wishes to switch between MSI -and MSI-X at runtime, it must first quiesce the device, then switch -it back to pin-interrupt mode, before calling pci_enable_msi_range() -or pci_enable_msix_range() and resuming operation. This is not expected -to be a common operation but may be useful for debugging or testing -during development. - -4.5 Considerations when using MSIs - -4.5.1 Choosing between MSI-X and MSI - -If your device supports both MSI-X and MSI capabilities, you should use -the MSI-X facilities in preference to the MSI facilities. As mentioned -above, MSI-X supports any number of interrupts between 1 and 2048. -In contrast, MSI is restricted to a maximum of 32 interrupts (and -must be a power of two). In addition, the MSI interrupt vectors must -be allocated consecutively, so the system might not be able to allocate -as many vectors for MSI as it could for MSI-X. On some platforms, MSI -interrupts must all be targeted at the same set of CPUs whereas MSI-X -interrupts can all be targeted at different CPUs. - -4.5.2 Spinlocks +4.4.1 Spinlocks Most device drivers have a per-device spinlock which is taken in the interrupt handler. With pin-based interrupts or a single MSI, it is not @@ -505,7 +194,7 @@ acquire the spinlock. Such deadlocks can be avoided by using spin_lock_irqsave() or spin_lock_irq() which disable local interrupts and acquire the lock (see Documentation/DocBook/kernel-locking). -4.6 How to tell whether MSI/MSI-X is enabled on a device +4.5 How to tell whether MSI/MSI-X is enabled on a device Using 'lspci -v' (as root) may show some devices with "MSI", "Message Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities diff --git a/Documentation/coccinelle.txt b/Documentation/coccinelle.txt index 7f773d51fdd9..01fb1dae3163 100644 --- a/Documentation/coccinelle.txt +++ b/Documentation/coccinelle.txt @@ -38,6 +38,15 @@ as a regular user, and install it with sudo make install + Supplemental documentation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For supplemental documentation refer to the wiki: + +https://bottest.wiki.kernel.org/coccicheck + +The wiki documentation always refers to the linux-next version of the script. + Using Coccinelle on the Linux kernel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -94,11 +103,26 @@ To enable verbose messages set the V= variable, for example: make coccicheck MODE=report V=1 + Coccinelle parallelization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + By default, coccicheck tries to run as parallel as possible. To change the parallelism, set the J= variable. For example, to run across 4 CPUs: make coccicheck MODE=report J=4 +As of Coccinelle 1.0.2 Coccinelle uses Ocaml parmap for parallelization, +if support for this is detected you will benefit from parmap parallelization. + +When parmap is enabled coccicheck will enable dynamic load balancing by using +'--chunksize 1' argument, this ensures we keep feeding threads with work +one by one, so that we avoid the situation where most work gets done by only +a few threads. With dynamic load balancing, if a thread finishes early we keep +feeding it more work. + +When parmap is enabled, if an error occurs in Coccinelle, this error +value is propagated back, the return value of the 'make coccicheck' +captures this return value. Using Coccinelle with a single semantic patch ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -142,15 +166,118 @@ semantic patch as shown in the previous section. The "report" mode is the default. You can select another one with the MODE variable explained above. + Debugging Coccinelle SmPL patches +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Using coccicheck is best as it provides in the spatch command line +include options matching the options used when we compile the kernel. +You can learn what these options are by using V=1, you could then +manually run Coccinelle with debug options added. + +Alternatively you can debug running Coccinelle against SmPL patches +by asking for stderr to be redirected to stderr, by default stderr +is redirected to /dev/null, if you'd like to capture stderr you +can specify the DEBUG_FILE="file.txt" option to coccicheck. For +instance: + + rm -f cocci.err + make coccicheck COCCI=scripts/coccinelle/free/kfree.cocci MODE=report DEBUG_FILE=cocci.err + cat cocci.err + +You can use SPFLAGS to add debugging flags, for instance you may want to +add both --profile --show-trying to SPFLAGS when debugging. For instance +you may want to use: + + rm -f err.log + export COCCI=scripts/coccinelle/misc/irqf_oneshot.cocci + make coccicheck DEBUG_FILE="err.log" MODE=report SPFLAGS="--profile --show-trying" M=./drivers/mfd/arizona-irq.c + +err.log will now have the profiling information, while stdout will +provide some progress information as Coccinelle moves forward with +work. + +DEBUG_FILE support is only supported when using coccinelle >= 1.2. + + .cocciconfig support +~~~~~~~~~~~~~~~~~~~~~~ + +Coccinelle supports reading .cocciconfig for default Coccinelle options that +should be used every time spatch is spawned, the order of precedence for +variables for .cocciconfig is as follows: + + o Your current user's home directory is processed first + o Your directory from which spatch is called is processed next + o The directory provided with the --dir option is processed last, if used + +Since coccicheck runs through make, it naturally runs from the kernel +proper dir, as such the second rule above would be implied for picking up a +.cocciconfig when using 'make coccicheck'. + +'make coccicheck' also supports using M= targets.If you do not supply +any M= target, it is assumed you want to target the entire kernel. +The kernel coccicheck script has: + + if [ "$KBUILD_EXTMOD" = "" ] ; then + OPTIONS="--dir $srctree $COCCIINCLUDE" + else + OPTIONS="--dir $KBUILD_EXTMOD $COCCIINCLUDE" + fi + +KBUILD_EXTMOD is set when an explicit target with M= is used. For both cases +the spatch --dir argument is used, as such third rule applies when whether M= +is used or not, and when M= is used the target directory can have its own +.cocciconfig file. When M= is not passed as an argument to coccicheck the +target directory is the same as the directory from where spatch was called. + +If not using the kernel's coccicheck target, keep the above precedence +order logic of .cocciconfig reading. If using the kernel's coccicheck target, +override any of the kernel's .coccicheck's settings using SPFLAGS. + +We help Coccinelle when used against Linux with a set of sensible defaults +options for Linux with our own Linux .cocciconfig. This hints to coccinelle +git can be used for 'git grep' queries over coccigrep. A timeout of 200 +seconds should suffice for now. + +The options picked up by coccinelle when reading a .cocciconfig do not appear +as arguments to spatch processes running on your system, to confirm what +options will be used by Coccinelle run: + + spatch --print-options-only + +You can override with your own preferred index option by using SPFLAGS. Take +note that when there are conflicting options Coccinelle takes precedence for +the last options passed. Using .cocciconfig is possible to use idutils, however +given the order of precedence followed by Coccinelle, since the kernel now +carries its own .cocciconfig, you will need to use SPFLAGS to use idutils if +desired. See below section "Additional flags" for more details on how to use +idutils. + Additional flags ~~~~~~~~~~~~~~~~~~ Additional flags can be passed to spatch through the SPFLAGS -variable. +variable. This works as Coccinelle respects the last flags +given to it when options are in conflict. make SPFLAGS=--use-glimpse coccicheck + +Coccinelle supports idutils as well but requires coccinelle >= 1.0.6. +When no ID file is specified coccinelle assumes your ID database file +is in the file .id-utils.index on the top level of the kernel, coccinelle +carries a script scripts/idutils_index.sh which creates the database with + + mkid -i C --output .id-utils.index + +If you have another database filename you can also just symlink with this +name. + make SPFLAGS=--use-idutils coccicheck +Alternatively you can specify the database filename explicitly, for +instance: + + make SPFLAGS="--use-idutils /full-path/to/ID" coccicheck + See spatch --help to learn more about spatch options. Note that the '--use-glimpse' and '--use-idutils' options @@ -159,6 +286,25 @@ thus active by default. However, by indexing the code with one of these tools, and according to the cocci file used, spatch could proceed the entire code base more quickly. + SmPL patch specific options +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +SmPL patches can have their own requirements for options passed +to Coccinelle. SmPL patch specific options can be provided by +providing them at the top of the SmPL patch, for instance: + +// Options: --no-includes --include-headers + + SmPL patch Coccinelle requirements +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As Coccinelle features get added some more advanced SmPL patches +may require newer versions of Coccinelle. If an SmPL patch requires +at least a version of Coccinelle, this can be specified as follows, +as an example if requiring at least Coccinelle >= 1.0.5: + +// Requires: 1.0.5 + Proposing new semantic patches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt b/Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt index 21055e210234..c1359f4d48d7 100644 --- a/Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt +++ b/Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt @@ -46,6 +46,10 @@ Required properties: 0 maps to GPMC_WAIT0 pin. - gpio-cells: Must be set to 2 +Required properties when using NAND prefetch dma: + - dmas GPMC NAND prefetch dma channel + - dma-names Must be set to "rxtx" + Timing properties for child nodes. All are optional and default to 0. - gpmc,sync-clk-ps: Minimum clock period for synchronous mode, in picoseconds @@ -137,7 +141,8 @@ Example for an AM33xx board: ti,hwmods = "gpmc"; reg = <0x50000000 0x2000>; interrupts = <100>; - + dmas = <&edma 52 0>; + dma-names = "rxtx"; gpmc,num-cs = <8>; gpmc,num-waitpins = <2>; #address-cells = <2>; diff --git a/Documentation/devicetree/bindings/mtd/atmel-quadspi.txt b/Documentation/devicetree/bindings/mtd/atmel-quadspi.txt new file mode 100644 index 000000000000..489807005eda --- /dev/null +++ b/Documentation/devicetree/bindings/mtd/atmel-quadspi.txt @@ -0,0 +1,32 @@ +* Atmel Quad Serial Peripheral Interface (QSPI) + +Required properties: +- compatible: Should be "atmel,sama5d2-qspi". +- reg: Should contain the locations and lengths of the base registers + and the mapped memory. +- reg-names: Should contain the resource reg names: + - qspi_base: configuration register address space + - qspi_mmap: memory mapped address space +- interrupts: Should contain the interrupt for the device. +- clocks: The phandle of the clock needed by the QSPI controller. +- #address-cells: Should be <1>. +- #size-cells: Should be <0>. + +Example: + +spi@f0020000 { + compatible = "atmel,sama5d2-qspi"; + reg = <0xf0020000 0x100>, <0xd0000000 0x8000000>; + reg-names = "qspi_base", "qspi_mmap"; + interrupts = <52 IRQ_TYPE_LEVEL_HIGH 7>; + clocks = <&spi0_clk>; + #address-cells = <1>; + #size-cells = <0>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_spi0_default>; + status = "okay"; + + m25p80@0 { + ... + }; +}; diff --git a/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt b/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt index 7066597c9a81..b40f3a492800 100644 --- a/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt +++ b/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt @@ -27,6 +27,7 @@ Required properties: brcm,brcmnand-v6.2 brcm,brcmnand-v7.0 brcm,brcmnand-v7.1 + brcm,brcmnand-v7.2 brcm,brcmnand - reg : the register start and length for NAND register region. (optional) Flash DMA register range (if present) diff --git a/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt new file mode 100644 index 000000000000..f248056da24c --- /dev/null +++ b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt @@ -0,0 +1,56 @@ +* Cadence Quad SPI controller + +Required properties: +- compatible : Should be "cdns,qspi-nor". +- reg : Contains two entries, each of which is a tuple consisting of a + physical address and length. The first entry is the address and + length of the controller register set. The second entry is the + address and length of the QSPI Controller data area. +- interrupts : Unit interrupt specifier for the controller interrupt. +- clocks : phandle to the Quad SPI clock. +- cdns,fifo-depth : Size of the data FIFO in words. +- cdns,fifo-width : Bus width of the data FIFO in bytes. +- cdns,trigger-address : 32-bit indirect AHB trigger address. + +Optional properties: +- cdns,is-decoded-cs : Flag to indicate whether decoder is used or not. + +Optional subnodes: +Subnodes of the Cadence Quad SPI controller are spi slave nodes with additional +custom properties: +- cdns,read-delay : Delay for read capture logic, in clock cycles +- cdns,tshsl-ns : Delay in nanoseconds for the length that the master + mode chip select outputs are de-asserted between + transactions. +- cdns,tsd2d-ns : Delay in nanoseconds between one chip select being + de-activated and the activation of another. +- cdns,tchsh-ns : Delay in nanoseconds between last bit of current + transaction and deasserting the device chip select + (qspi_n_ss_out). +- cdns,tslch-ns : Delay in nanoseconds between setting qspi_n_ss_out low + and first bit transfer. + +Example: + + qspi: spi@ff705000 { + compatible = "cdns,qspi-nor"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0xff705000 0x1000>, + <0xffa00000 0x1000>; + interrupts = <0 151 4>; + clocks = <&qspi_clk>; + cdns,is-decoded-cs; + cdns,fifo-depth = <128>; + cdns,fifo-width = <4>; + cdns,trigger-address = <0x00000000>; + + flash0: n25q00@0 { + ... + cdns,read-delay = <4>; + cdns,tshsl-ns = <50>; + cdns,tsd2d-ns = <50>; + cdns,tchsh-ns = <4>; + cdns,tslch-ns = <4>; + }; + }; diff --git a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt index 3ee7e202657c..174f68c26c1b 100644 --- a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt +++ b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt @@ -39,7 +39,7 @@ Optional properties: "prefetch-polled" Prefetch polled mode (default) "polled" Polled mode, without prefetch - "prefetch-dma" Prefetch enabled sDMA mode + "prefetch-dma" Prefetch enabled DMA mode "prefetch-irq" Prefetch enabled irq mode - elm_id: use "ti,elm-id" instead diff --git a/Documentation/devicetree/bindings/mtd/hisilicon,fmc-spi-nor.txt b/Documentation/devicetree/bindings/mtd/hisilicon,fmc-spi-nor.txt new file mode 100644 index 000000000000..74981520d6dd --- /dev/null +++ b/Documentation/devicetree/bindings/mtd/hisilicon,fmc-spi-nor.txt @@ -0,0 +1,24 @@ +HiSilicon SPI-NOR Flash Controller + +Required properties: +- compatible : Should be "hisilicon,fmc-spi-nor" and one of the following strings: + "hisilicon,hi3519-spi-nor" +- address-cells : Should be 1. +- size-cells : Should be 0. +- reg : Offset and length of the register set for the controller device. +- reg-names : Must include the following two entries: "control", "memory". +- clocks : handle to spi-nor flash controller clock. + +Example: +spi-nor-controller@10000000 { + compatible = "hisilicon,hi3519-spi-nor", "hisilicon,fmc-spi-nor"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x10000000 0x1000>, <0x14000000 0x1000000>; + reg-names = "control", "memory"; + clocks = <&clock HI3519_FMC_CLK>; + spi-nor@0 { + compatible = "jedec,spi-nor"; + reg = <0>; + }; +}; diff --git a/Documentation/devicetree/bindings/mtd/mtk-nand.txt b/Documentation/devicetree/bindings/mtd/mtk-nand.txt new file mode 100644 index 000000000000..069c192ed5c2 --- /dev/null +++ b/Documentation/devicetree/bindings/mtd/mtk-nand.txt @@ -0,0 +1,160 @@ +MTK SoCs NAND FLASH controller (NFC) DT binding + +This file documents the device tree bindings for MTK SoCs NAND controllers. +The functional split of the controller requires two drivers to operate: +the nand controller interface driver and the ECC engine driver. + +The hardware description for both devices must be captured as device +tree nodes. + +1) NFC NAND Controller Interface (NFI): +======================================= + +The first part of NFC is NAND Controller Interface (NFI) HW. +Required NFI properties: +- compatible: Should be "mediatek,mtxxxx-nfc". +- reg: Base physical address and size of NFI. +- interrupts: Interrupts of NFI. +- clocks: NFI required clocks. +- clock-names: NFI clocks internal name. +- status: Disabled default. Then set "okay" by platform. +- ecc-engine: Required ECC Engine node. +- #address-cells: NAND chip index, should be 1. +- #size-cells: Should be 0. + +Example: + + nandc: nfi@1100d000 { + compatible = "mediatek,mt2701-nfc"; + reg = <0 0x1100d000 0 0x1000>; + interrupts = ; + clocks = <&pericfg CLK_PERI_NFI>, + <&pericfg CLK_PERI_NFI_PAD>; + clock-names = "nfi_clk", "pad_clk"; + status = "disabled"; + ecc-engine = <&bch>; + #address-cells = <1>; + #size-cells = <0>; + }; + +Platform related properties, should be set in {platform_name}.dts: +- children nodes: NAND chips. + +Children nodes properties: +- reg: Chip Select Signal, default 0. + Set as reg = <0>, <1> when need 2 CS. +Optional: +- nand-on-flash-bbt: Store BBT on NAND Flash. +- nand-ecc-mode: the NAND ecc mode (check driver for supported modes) +- nand-ecc-step-size: Number of data bytes covered by a single ECC step. + valid values: 512 and 1024. + 1024 is recommended for large page NANDs. +- nand-ecc-strength: Number of bits to correct per ECC step. + The valid values that the controller supports are: 4, 6, + 8, 10, 12, 14, 16, 18, 20, 22, 24, 28, 32, 36, 40, 44, + 48, 52, 56, 60. + The strength should be calculated as follows: + E = (S - F) * 8 / 14 + S = O / (P / Q) + E : nand-ecc-strength. + S : spare size per sector. + F : FDM size, should be in the range [1,8]. + It is used to store free oob data. + O : oob size. + P : page size. + Q : nand-ecc-step-size. + If the result does not match any one of the listed + choices above, please select the smaller valid value from + the list. + (otherwise the driver will do the adjustment at runtime) +- pinctrl-names: Default NAND pin GPIO setting name. +- pinctrl-0: GPIO setting node. + +Example: + &pio { + nand_pins_default: nanddefault { + pins_dat { + pinmux = , + , + , + , + , + , + , + , + ; + input-enable; + drive-strength = ; + bias-pull-up; + }; + + pins_we { + pinmux = ; + drive-strength = ; + bias-pull-up = ; + }; + + pins_ale { + pinmux = ; + drive-strength = ; + bias-pull-down = ; + }; + }; + }; + + &nandc { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&nand_pins_default>; + nand@0 { + reg = <0>; + nand-on-flash-bbt; + nand-ecc-mode = "hw"; + nand-ecc-strength = <24>; + nand-ecc-step-size = <1024>; + }; + }; + +NAND chip optional subnodes: +- Partitions, see Documentation/devicetree/bindings/mtd/partition.txt + +Example: + nand@0 { + partitions { + compatible = "fixed-partitions"; + #address-cells = <1>; + #size-cells = <1>; + + preloader@0 { + label = "pl"; + read-only; + reg = <0x00000000 0x00400000>; + }; + android@0x00400000 { + label = "android"; + reg = <0x00400000 0x12c00000>; + }; + }; + }; + +2) ECC Engine: +============== + +Required BCH properties: +- compatible: Should be "mediatek,mtxxxx-ecc". +- reg: Base physical address and size of ECC. +- interrupts: Interrupts of ECC. +- clocks: ECC required clocks. +- clock-names: ECC clocks internal name. +- status: Disabled default. Then set "okay" by platform. + +Example: + + bch: ecc@1100e000 { + compatible = "mediatek,mt2701-ecc"; + reg = <0 0x1100e000 0 0x1000>; + interrupts = ; + clocks = <&pericfg CLK_PERI_NFI_ECC>; + clock-names = "nfiecc_clk"; + status = "disabled"; + }; diff --git a/Documentation/devicetree/bindings/mtd/sunxi-nand.txt b/Documentation/devicetree/bindings/mtd/sunxi-nand.txt index 086d6f44c4b9..f322f56aef74 100644 --- a/Documentation/devicetree/bindings/mtd/sunxi-nand.txt +++ b/Documentation/devicetree/bindings/mtd/sunxi-nand.txt @@ -11,10 +11,16 @@ Required properties: * "ahb" : AHB gating clock * "mod" : nand controller clock +Optional properties: +- dmas : shall reference DMA channel associated to the NAND controller. +- dma-names : shall be "rxtx". + Optional children nodes: Children nodes represent the available nand chips. Optional properties: +- reset : phandle + reset specifier pair +- reset-names : must contain "ahb" - allwinner,rb : shall contain the native Ready/Busy ids. or - rb-gpios : shall contain the gpios used as R/B pins. diff --git a/Documentation/devicetree/bindings/pci/aardvark-pci.txt b/Documentation/devicetree/bindings/pci/aardvark-pci.txt new file mode 100644 index 000000000000..bbcd9f4c501f --- /dev/null +++ b/Documentation/devicetree/bindings/pci/aardvark-pci.txt @@ -0,0 +1,56 @@ +Aardvark PCIe controller + +This PCIe controller is used on the Marvell Armada 3700 ARM64 SoC. + +The Device Tree node describing an Aardvark PCIe controller must +contain the following properties: + + - compatible: Should be "marvell,armada-3700-pcie" + - reg: range of registers for the PCIe controller + - interrupts: the interrupt line of the PCIe controller + - #address-cells: set to <3> + - #size-cells: set to <2> + - device_type: set to "pci" + - ranges: ranges for the PCI memory and I/O regions + - #interrupt-cells: set to <1> + - msi-controller: indicates that the PCIe controller can itself + handle MSI interrupts + - msi-parent: pointer to the MSI controller to be used + - interrupt-map-mask and interrupt-map: standard PCI properties to + define the mapping of the PCIe interface to interrupt numbers. + - bus-range: PCI bus numbers covered + +In addition, the Device Tree describing an Aardvark PCIe controller +must include a sub-node that describes the legacy interrupt controller +built into the PCIe controller. This sub-node must have the following +properties: + + - interrupt-controller + - #interrupt-cells: set to <1> + +Example: + + pcie0: pcie@d0070000 { + compatible = "marvell,armada-3700-pcie"; + device_type = "pci"; + status = "disabled"; + reg = <0 0xd0070000 0 0x20000>; + #address-cells = <3>; + #size-cells = <2>; + bus-range = <0x00 0xff>; + interrupts = ; + #interrupt-cells = <1>; + msi-controller; + msi-parent = <&pcie0>; + ranges = <0x82000000 0 0xe8000000 0 0xe8000000 0 0x1000000 /* Port 0 MEM */ + 0x81000000 0 0xe9000000 0 0xe9000000 0 0x10000>; /* Port 0 IO*/ + interrupt-map-mask = <0 0 0 7>; + interrupt-map = <0 0 0 1 &pcie_intc 0>, + <0 0 0 2 &pcie_intc 1>, + <0 0 0 3 &pcie_intc 2>, + <0 0 0 4 &pcie_intc 3>; + pcie_intc: interrupt-controller { + interrupt-controller; + #interrupt-cells = <1>; + }; + }; diff --git a/Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt b/Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt new file mode 100644 index 000000000000..330a45b5f0b5 --- /dev/null +++ b/Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt @@ -0,0 +1,46 @@ +* Axis ARTPEC-6 PCIe interface + +This PCIe host controller is based on the Synopsys DesignWare PCIe IP +and thus inherits all the common properties defined in designware-pcie.txt. + +Required properties: +- compatible: "axis,artpec6-pcie", "snps,dw-pcie" +- reg: base addresses and lengths of the PCIe controller (DBI), + the phy controller, and configuration address space. +- reg-names: Must include the following entries: + - "dbi" + - "phy" + - "config" +- interrupts: A list of interrupt outputs of the controller. Must contain an + entry for each entry in the interrupt-names property. +- interrupt-names: Must include the following entries: + - "msi": The interrupt that is asserted when an MSI is received +- axis,syscon-pcie: A phandle pointing to the ARTPEC-6 system controller, + used to enable and control the Synopsys IP. + +Example: + + pcie@f8050000 { + compatible = "axis,artpec6-pcie", "snps,dw-pcie"; + reg = <0xf8050000 0x2000 + 0xf8040000 0x1000 + 0xc0000000 0x1000>; + reg-names = "dbi", "phy", "config"; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + /* downstream I/O */ + ranges = <0x81000000 0 0x00010000 0xc0010000 0 0x00010000 + /* non-prefetchable memory */ + 0x82000000 0 0xc0020000 0xc0020000 0 0x1ffe0000>; + num-lanes = <2>; + interrupts = ; + interrupt-names = "msi"; + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0x7>; + interrupt-map = <0 0 0 1 &intc GIC_SPI 144 IRQ_TYPE_LEVEL_HIGH>, + <0 0 0 2 &intc GIC_SPI 145 IRQ_TYPE_LEVEL_HIGH>, + <0 0 0 3 &intc GIC_SPI 146 IRQ_TYPE_LEVEL_HIGH>, + <0 0 0 4 &intc GIC_SPI 147 IRQ_TYPE_LEVEL_HIGH>; + axis,syscon-pcie = <&syscon>; + }; diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 8ea834f6b289..5385cba941d2 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -3,6 +3,7 @@ *.bc *.bin *.bz2 +*.c.[012]*.* *.cis *.cpio *.csp diff --git a/Documentation/gcc-plugins.txt b/Documentation/gcc-plugins.txt new file mode 100644 index 000000000000..891c69464434 --- /dev/null +++ b/Documentation/gcc-plugins.txt @@ -0,0 +1,87 @@ +GCC plugin infrastructure +========================= + + +1. Introduction +=============== + +GCC plugins are loadable modules that provide extra features to the +compiler [1]. They are useful for runtime instrumentation and static analysis. +We can analyse, change and add further code during compilation via +callbacks [2], GIMPLE [3], IPA [4] and RTL passes [5]. + +The GCC plugin infrastructure of the kernel supports all gcc versions from +4.5 to 6.0, building out-of-tree modules, cross-compilation and building in a +separate directory. +Plugin source files have to be compilable by both a C and a C++ compiler as well +because gcc versions 4.5 and 4.6 are compiled by a C compiler, +gcc-4.7 can be compiled by a C or a C++ compiler, +and versions 4.8+ can only be compiled by a C++ compiler. + +Currently the GCC plugin infrastructure supports only the x86, arm and arm64 +architectures. + +This infrastructure was ported from grsecurity [6] and PaX [7]. + +-- +[1] https://gcc.gnu.org/onlinedocs/gccint/Plugins.html +[2] https://gcc.gnu.org/onlinedocs/gccint/Plugin-API.html#Plugin-API +[3] https://gcc.gnu.org/onlinedocs/gccint/GIMPLE.html +[4] https://gcc.gnu.org/onlinedocs/gccint/IPA.html +[5] https://gcc.gnu.org/onlinedocs/gccint/RTL.html +[6] https://grsecurity.net/ +[7] https://pax.grsecurity.net/ + + +2. Files +======== + +$(src)/scripts/gcc-plugins + This is the directory of the GCC plugins. + +$(src)/scripts/gcc-plugins/gcc-common.h + This is a compatibility header for GCC plugins. + It should be always included instead of individual gcc headers. + +$(src)/scripts/gcc-plugin.sh + This script checks the availability of the included headers in + gcc-common.h and chooses the proper host compiler to build the plugins + (gcc-4.7 can be built by either gcc or g++). + +$(src)/scripts/gcc-plugins/gcc-generate-gimple-pass.h +$(src)/scripts/gcc-plugins/gcc-generate-ipa-pass.h +$(src)/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h +$(src)/scripts/gcc-plugins/gcc-generate-rtl-pass.h + These headers automatically generate the registration structures for + GIMPLE, SIMPLE_IPA, IPA and RTL passes. They support all gcc versions + from 4.5 to 6.0. + They should be preferred to creating the structures by hand. + + +3. Usage +======== + +You must install the gcc plugin headers for your gcc version, +e.g., on Ubuntu for gcc-4.9: + + apt-get install gcc-4.9-plugin-dev + +Enable a GCC plugin based feature in the kernel config: + + CONFIG_GCC_PLUGIN_CYC_COMPLEXITY = y + +To compile only the plugin(s): + + make gcc-plugins + +or just run the kernel make and compile the whole kernel with +the cyclomatic complexity GCC plugin. + + +4. How to add a new GCC plugin +============================== + +The GCC plugins are in $(src)/scripts/gcc-plugins/. You can use a file or a directory +here. It must be added to $(src)/scripts/gcc-plugins/Makefile, +$(src)/scripts/Makefile.gcc-plugins and $(src)/arch/Kconfig. +See the cyc_complexity_plugin.c (CONFIG_GCC_PLUGIN_CYC_COMPLEXITY) GCC plugin. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e24aa11e8f8a..642029012059 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3021,6 +3021,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. resource_alignment= Format: [@][:]:.[; ...] + [@]pci::\ + [::][; ...] Specifies alignment and device to reassign aligned memory resources. If is not specified, @@ -3039,6 +3041,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. hpmemsize=nn[KMG] The fixed amount of bus space which is reserved for hotplug bridge's memory window. Default size is 2 megabytes. + hpbussize=nn The minimum amount of additional bus numbers + reserved for buses below a hotplug bridge. + Default is 1. realloc= Enable/disable reallocating PCI bridge resources if allocations done by BIOS are too small to accommodate resources required by all child @@ -3070,6 +3075,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. compat Treat PCIe ports as PCI-to-PCI bridges, disable the PCIe ports driver. + pcie_port_pm= [PCIE] PCIe port power management handling: + off Disable power management of all PCIe ports + force Forcibly enable power management of all PCIe ports + pcie_pme= [PCIE,PM] Native PCIe PME signaling options: nomsi Do not use MSI for native PCIe PME signaling (this makes all PCIe root ports use INTx for all services). diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a4482cce4bae..5237e1b2fd66 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1482,6 +1482,11 @@ struct kvm_irq_routing_msi { __u32 pad; }; +On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS +feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, +address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of +address_hi must be zero. + struct kvm_irq_routing_s390_adapter { __u64 ind_addr; __u64 summary_addr; @@ -1583,6 +1588,17 @@ struct kvm_lapic_state { Reads the Local APIC registers and copies them into the input argument. The data format and layout are the same as documented in the architecture manual. +If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is +enabled, then the format of APIC_ID register depends on the APIC mode +(reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores APIC ID in +the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC ID +which is stored in bits 31-24 of the APIC register, or equivalently in +byte 35 of struct kvm_lapic_state's regs field. KVM_GET_LAPIC must then +be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR. + +If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state +always uses xAPIC format. + 4.58 KVM_SET_LAPIC @@ -1600,6 +1616,10 @@ struct kvm_lapic_state { Copies the input argument into the Local APIC registers. The data format and layout are the same as documented in the architecture manual. +The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's +regs field) depends on the state of the KVM_CAP_X2APIC_API capability. +See the note in KVM_GET_LAPIC. + 4.59 KVM_IOEVENTFD @@ -2032,6 +2052,12 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_CONFIG5 | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG7 | 32 MIPS | KVM_REG_MIPS_CP0_ERROREPC | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH1 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH2 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH3 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH4 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH5 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH6 | 64 MIPS | KVM_REG_MIPS_COUNT_CTL | 64 MIPS | KVM_REG_MIPS_COUNT_RESUME | 64 MIPS | KVM_REG_MIPS_COUNT_HZ | 64 @@ -2156,7 +2182,7 @@ after pausing the vcpu, but before it is resumed. 4.71 KVM_SIGNAL_MSI Capability: KVM_CAP_SIGNAL_MSI -Architectures: x86 +Architectures: x86 arm64 Type: vm ioctl Parameters: struct kvm_msi (in) Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error @@ -2169,10 +2195,22 @@ struct kvm_msi { __u32 address_hi; __u32 data; __u32 flags; - __u8 pad[16]; + __u32 devid; + __u8 pad[12]; }; -No flags are defined so far. The corresponding field must be 0. +flags: KVM_MSI_VALID_DEVID: devid contains a valid value +devid: If KVM_MSI_VALID_DEVID is set, contains a unique device identifier + for the device that wrote the MSI message. + For PCI, this is usually a BFD identifier in the lower 16 bits. + +The per-VM KVM_CAP_MSI_DEVID capability advertises the need to provide +the device ID. If this capability is not set, userland cannot rely on +the kernel to allow the KVM_MSI_VALID_DEVID flag being set. + +On x86, address_hi is ignored unless the KVM_CAP_X2APIC_API capability is +enabled. If it is enabled, address_hi bits 31-8 provide bits 31-8 of the +destination id. Bits 7-0 of address_hi must be zero. 4.71 KVM_CREATE_PIT2 @@ -2520,6 +2558,7 @@ Parameters: struct kvm_device_attr Returns: 0 on success, -1 on error Errors: ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. EPERM: The attribute cannot (currently) be accessed this way (e.g. read-only attribute, or attribute that only makes sense when the device is in a different state) @@ -2547,6 +2586,7 @@ Parameters: struct kvm_device_attr Returns: 0 on success, -1 on error Errors: ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. Tests whether a device supports a particular attribute. A successful return indicates the attribute is implemented. It does not necessarily @@ -3803,6 +3843,42 @@ Allows use of runtime-instrumentation introduced with zEC12 processor. Will return -EINVAL if the machine does not support runtime-instrumentation. Will return -EBUSY if a VCPU has already been created. +7.7 KVM_CAP_X2APIC_API + +Architectures: x86 +Parameters: args[0] - features that should be enabled +Returns: 0 on success, -EINVAL when args[0] contains invalid features + +Valid feature flags in args[0] are + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of +KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, +allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their +respective sections. + +KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work +in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff +as a broadcast even in x2APIC mode in order to support physical x2APIC +without interrupt remapping. This is undesirable in logical mode, +where 0xff represents CPUs 0-7 in cluster 0. + +7.8 KVM_CAP_S390_USER_INSTR0 + +Architectures: s390 +Parameters: none + +With this capability enabled, all illegal instructions 0x0000 (2 bytes) will +be intercepted and forwarded to user space. User space can use this +mechanism e.g. to realize 2-byte software breakpoints. The kernel will +not inject an operating exception for these instructions, user space has +to take care of that. + +This capability can be enabled dynamically even if VCPUs were already +created and are running. + 8. Other capabilities. ---------------------- diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index 59541d49e15c..89182f80cc7f 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -4,16 +4,22 @@ ARM Virtual Generic Interrupt Controller (VGIC) Device types supported: KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0 KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0 + KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller -Only one VGIC instance may be instantiated through either this API or the -legacy KVM_CREATE_IRQCHIP api. The created VGIC will act as the VM interrupt -controller, requiring emulated user-space devices to inject interrupts to the -VGIC instead of directly to CPUs. +Only one VGIC instance of the V2/V3 types above may be instantiated through +either this API or the legacy KVM_CREATE_IRQCHIP api. The created VGIC will +act as the VM interrupt controller, requiring emulated user-space devices to +inject interrupts to the VGIC instead of directly to CPUs. Creating a guest GICv3 device requires a host GICv3 as well. GICv3 implementations with hardware compatibility support allow a guest GICv2 as well. +Creating a virtual ITS controller requires a host GICv3 (but does not depend +on having physical ITS controllers). +There can be multiple ITS controllers per guest, each of them has to have +a separate, non-overlapping MMIO region. + Groups: KVM_DEV_ARM_VGIC_GRP_ADDR Attributes: @@ -39,6 +45,13 @@ Groups: Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. This address needs to be 64K aligned. + KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit) + Base address in the guest physical address space of the GICv3 ITS + control register frame. The ITS allows MSI(-X) interrupts to be + injected into guests. This extension is optional. If the kernel + does not support the ITS, the call returns -ENODEV. + Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS. + This address needs to be 64K aligned and the region covers 128K. KVM_DEV_ARM_VGIC_GRP_DIST_REGS Attributes: @@ -109,8 +122,8 @@ Groups: KVM_DEV_ARM_VGIC_GRP_CTRL Attributes: KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the VGIC, no additional parameter in - kvm_device_attr.addr. + request the initialization of the VGIC or ITS, no additional parameter + in kvm_device_attr.addr. Errors: -ENXIO: VGIC not properly configured as required prior to calling this attribute diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt index a9ea8774a45f..b6cda49f2ba4 100644 --- a/Documentation/virtual/kvm/devices/vm.txt +++ b/Documentation/virtual/kvm/devices/vm.txt @@ -20,7 +20,8 @@ Enables Collaborative Memory Management Assist (CMMA) for the virtual machine. 1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA Parameters: none -Returns: 0 +Returns: -EINVAL if CMMA was not enabled + 0 otherwise Clear the CMMA status for all guest pages, so any pages the guest marked as unused are again used any may not be reclaimed by the host. @@ -85,6 +86,90 @@ Returns: -EBUSY in case 1 or more vcpus are already activated (only in write -ENOMEM if not enough memory is available to process the ioctl 0 in case of success +2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o) + +Allows user space to retrieve available cpu features. A feature is available if +provided by the hardware and supported by kvm. In theory, cpu features could +even be completely emulated by kvm. + +struct kvm_s390_vm_cpu_feat { + __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering +}; + +Parameters: address of a buffer to load the feature list from. +Returns: -EFAULT if the given address is not accessible from kernel space. + 0 in case of success. + +2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w) + +Allows user space to retrieve or change enabled cpu features for all VCPUs of a +VM. Features that are not available cannot be enabled. + +See 2.3. for a description of the parameter struct. + +Parameters: address of a buffer to store/load the feature list from. +Returns: -EFAULT if the given address is not accessible from kernel space. + -EINVAL if a cpu feature that is not available is to be enabled. + -EBUSY if at least one VCPU has already been defined. + 0 in case of success. + +2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o) + +Allows user space to retrieve available cpu subfunctions without any filtering +done by a set IBC. These subfunctions are indicated to the guest VCPU via +query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff. + +A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the +STFL(E) bit introducing the affected instruction. If the affected instruction +indicates subfunctions via a "query subfunction", the response block is +contained in the returned struct. If the affected instruction +indicates subfunctions via a "test bit" mechanism, the subfunction codes are +contained in the returned struct in MSB 0 bit numbering. + +struct kvm_s390_vm_cpu_subfunc { + u8 plo[32]; # always valid (ESA/390 feature) + u8 ptff[16]; # valid with TOD-clock steering + u8 kmac[16]; # valid with Message-Security-Assist + u8 kmc[16]; # valid with Message-Security-Assist + u8 km[16]; # valid with Message-Security-Assist + u8 kimd[16]; # valid with Message-Security-Assist + u8 klmd[16]; # valid with Message-Security-Assist + u8 pckmo[16]; # valid with Message-Security-Assist-Extension 3 + u8 kmctr[16]; # valid with Message-Security-Assist-Extension 4 + u8 kmf[16]; # valid with Message-Security-Assist-Extension 4 + u8 kmo[16]; # valid with Message-Security-Assist-Extension 4 + u8 pcc[16]; # valid with Message-Security-Assist-Extension 4 + u8 ppno[16]; # valid with Message-Security-Assist-Extension 5 + u8 reserved[1824]; # reserved for future instructions +}; + +Parameters: address of a buffer to load the subfunction blocks from. +Returns: -EFAULT if the given address is not accessible from kernel space. + 0 in case of success. + +2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w) + +Allows user space to retrieve or change cpu subfunctions to be indicated for +all VCPUs of a VM. This attribute will only be available if kernel and +hardware support are in place. + +The kernel uses the configured subfunction blocks for indication to +the guest. A subfunction block will only be used if the associated STFL(E) bit +has not been disabled by user space (so the instruction to be queried is +actually available for the guest). + +As long as no data has been written, a read will fail. The IBC will be used +to determine available subfunctions in this case, this will guarantee backward +compatibility. + +See 2.5. for a description of the parameter struct. + +Parameters: address of a buffer to store/load the subfunction blocks from. +Returns: -EFAULT if the given address is not accessible from kernel space. + -EINVAL when reading, if there was no write yet. + -EBUSY if at least one VCPU has already been defined. + 0 in case of success. + 3. GROUP: KVM_S390_VM_TOD Architectures: s390 diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index 19f94a6b9bb0..f2491a8c68b4 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -89,7 +89,7 @@ In mmu_spte_clear_track_bits(): old_spte = *spte; /* 'if' condition is satisfied. */ - if (old_spte.Accssed == 1 && + if (old_spte.Accessed == 1 && old_spte.W == 0) spte = 0ull; on fast page fault path: @@ -102,7 +102,7 @@ In mmu_spte_clear_track_bits(): old_spte = xchg(spte, 0ull) - if (old_spte.Accssed == 1) + if (old_spte.Accessed == 1) kvm_set_pfn_accessed(spte.pfn); if (old_spte.Dirty == 1) kvm_set_pfn_dirty(spte.pfn); diff --git a/MAINTAINERS b/MAINTAINERS index 25f43204014d..10074ff03c57 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5094,6 +5094,15 @@ L: linux-scsi@vger.kernel.org S: Odd Fixes (e.g., new signatures) F: drivers/scsi/fdomain.* +GCC PLUGINS +M: Kees Cook +R: Emese Revfy +L: kernel-hardening@lists.openwall.com +S: Maintained +F: scripts/gcc-plugins/ +F: scripts/gcc-plugin.sh +F: Documentation/gcc-plugins.txt + GCOV BASED KERNEL PROFILING M: Peter Oberparleiter S: Maintained @@ -8874,6 +8883,7 @@ L: linux-pci@vger.kernel.org Q: http://patchwork.ozlabs.org/project/linux-pci/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git S: Supported +F: Documentation/devicetree/bindings/pci/ F: Documentation/PCI/ F: drivers/pci/ F: include/linux/pci* @@ -8937,6 +8947,13 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: drivers/pci/host/*mvebu* +PCI DRIVER FOR AARDVARK (Marvell Armada 3700) +M: Thomas Petazzoni +L: linux-pci@vger.kernel.org +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +S: Maintained +F: drivers/pci/host/pci-aardvark.c + PCI DRIVER FOR NVIDIA TEGRA M: Thierry Reding L: linux-tegra@vger.kernel.org @@ -9019,6 +9036,15 @@ S: Maintained F: Documentation/devicetree/bindings/pci/xgene-pci-msi.txt F: drivers/pci/host/pci-xgene-msi.c +PCIE DRIVER FOR AXIS ARTPEC +M: Niklas Cassel +M: Jesper Nilsson +L: linux-arm-kernel@axis.com +L: linux-pci@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/pci/axis,artpec* +F: drivers/pci/host/*artpec* + PCIE DRIVER FOR HISILICON M: Zhou Wang M: Gabriele Paoloni diff --git a/Makefile b/Makefile index 393b6159ae92..d6d401bf9d95 100644 --- a/Makefile +++ b/Makefile @@ -371,26 +371,27 @@ CFLAGS_KERNEL = AFLAGS_KERNEL = LDFLAGS_vmlinux = CFLAGS_GCOV = -fprofile-arcs -ftest-coverage -fno-tree-loop-im -CFLAGS_KCOV = -fsanitize-coverage=trace-pc +CFLAGS_KCOV := $(call cc-option,-fsanitize-coverage=trace-pc,) # Use USERINCLUDE when you must reference the UAPI directories only. USERINCLUDE := \ -I$(srctree)/arch/$(hdr-arch)/include/uapi \ - -Iarch/$(hdr-arch)/include/generated/uapi \ + -I$(objtree)/arch/$(hdr-arch)/include/generated/uapi \ -I$(srctree)/include/uapi \ - -Iinclude/generated/uapi \ + -I$(objtree)/include/generated/uapi \ -include $(srctree)/include/linux/kconfig.h # Use LINUXINCLUDE when you must reference the include/ directory. # Needed to be compatible with the O= option LINUXINCLUDE := \ -I$(srctree)/arch/$(hdr-arch)/include \ - -Iarch/$(hdr-arch)/include/generated/uapi \ - -Iarch/$(hdr-arch)/include/generated \ + -I$(objtree)/arch/$(hdr-arch)/include/generated/uapi \ + -I$(objtree)/arch/$(hdr-arch)/include/generated \ $(if $(KBUILD_SRC), -I$(srctree)/include) \ - -Iinclude \ - $(USERINCLUDE) + -I$(objtree)/include + +LINUXINCLUDE += $(filter-out $(LINUXINCLUDE),$(USERINCLUDE)) KBUILD_CPPFLAGS := -D__KERNEL__ @@ -554,7 +555,7 @@ ifeq ($(KBUILD_EXTMOD),) # in parallel PHONY += scripts scripts: scripts_basic include/config/auto.conf include/config/tristate.conf \ - asm-generic + asm-generic gcc-plugins $(Q)$(MAKE) $(build)=$(@) # Objects we will link into vmlinux / subdirs we need to visit @@ -635,6 +636,15 @@ endif # Tell gcc to never replace conditional load with a non-conditional one KBUILD_CFLAGS += $(call cc-option,--param=allow-store-data-races=0) +PHONY += gcc-plugins +gcc-plugins: scripts_basic +ifdef CONFIG_GCC_PLUGINS + $(Q)$(MAKE) $(build)=scripts/gcc-plugins +endif + @: + +include scripts/Makefile.gcc-plugins + ifdef CONFIG_READABLE_ASM # Disable optimizations that make assembler listings hard to read. # reorder blocks reorders the control in the function @@ -666,21 +676,11 @@ endif endif # Find arch-specific stack protector compiler sanity-checking script. ifdef CONFIG_CC_STACKPROTECTOR - stackp-path := $(srctree)/scripts/gcc-$(ARCH)_$(BITS)-has-stack-protector.sh - ifneq ($(wildcard $(stackp-path)),) - stackp-check := $(stackp-path) - endif + stackp-path := $(srctree)/scripts/gcc-$(SRCARCH)_$(BITS)-has-stack-protector.sh + stackp-check := $(wildcard $(stackp-path)) endif KBUILD_CFLAGS += $(stackp-flag) -ifdef CONFIG_KCOV - ifeq ($(call cc-option, $(CFLAGS_KCOV)),) - $(warning Cannot use CONFIG_KCOV: \ - -fsanitize-coverage=trace-pc is not supported by compiler) - CFLAGS_KCOV = - endif -endif - ifeq ($(cc-name),clang) KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,) @@ -1019,7 +1019,7 @@ prepare1: prepare2 $(version_h) include/generated/utsrelease.h \ archprepare: archheaders archscripts prepare1 scripts_basic -prepare0: archprepare +prepare0: archprepare gcc-plugins $(Q)$(MAKE) $(build)=. # All the preparing.. @@ -1531,6 +1531,7 @@ clean: $(clean-dirs) -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ -o -name '*.symtypes' -o -name 'modules.order' \ -o -name modules.builtin -o -name '.tmp_*.o.*' \ + -o -name '*.c.[012]*.*' \ -o -name '*.gcno' \) -type f -print | xargs rm -f # Generate tags for editors @@ -1641,7 +1642,7 @@ endif $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \ $(build)=$(build-dir) # Make sure the latest headers are built for Documentation -Documentation/: headers_install +Documentation/ samples/: headers_install %/: prepare scripts FORCE $(cmd_crmodverdir) $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \ diff --git a/arch/Kconfig b/arch/Kconfig index 15996290fed4..bd8056b5b246 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -357,6 +357,43 @@ config SECCOMP_FILTER See Documentation/prctl/seccomp_filter.txt for details. +config HAVE_GCC_PLUGINS + bool + help + An arch should select this symbol if it supports building with + GCC plugins. + +menuconfig GCC_PLUGINS + bool "GCC plugins" + depends on HAVE_GCC_PLUGINS + depends on !COMPILE_TEST + help + GCC plugins are loadable modules that provide extra features to the + compiler. They are useful for runtime instrumentation and static analysis. + + See Documentation/gcc-plugins.txt for details. + +config GCC_PLUGIN_CYC_COMPLEXITY + bool "Compute the cyclomatic complexity of a function" + depends on GCC_PLUGINS + help + The complexity M of a function's control flow graph is defined as: + M = E - N + 2P + where + + E = the number of edges + N = the number of nodes + P = the number of connected components (exit nodes). + +config GCC_PLUGIN_SANCOV + bool + depends on GCC_PLUGINS + help + This plugin inserts a __sanitizer_cov_trace_pc() call at the start of + basic blocks. It supports all gcc versions with plugin support (from + gcc-4.5 on). It is based on the commit "Add fuzzing coverage support" + by Dmitry Vyukov . + config HAVE_CC_STACKPROTECTOR bool help diff --git a/arch/alpha/boot/Makefile b/arch/alpha/boot/Makefile index 8399bd0e68e8..0cbe4c59d3ce 100644 --- a/arch/alpha/boot/Makefile +++ b/arch/alpha/boot/Makefile @@ -15,7 +15,7 @@ targets := vmlinux.gz vmlinux \ OBJSTRIP := $(obj)/tools/objstrip HOSTCFLAGS := -Wall -I$(objtree)/usr/include -BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj) +BOOTCFLAGS += -I$(objtree)/$(obj) -I$(srctree)/$(obj) # SRM bootable image. Copy to offset 512 of a partition. $(obj)/bootimage: $(addprefix $(obj)/tools/,mkbb lxboot bootlx) $(obj)/vmlinux.nh diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 14b4cf75ceec..2d601d769a1c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -54,6 +54,7 @@ config ARM select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL) select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) select HAVE_FUNCTION_TRACER if (!XIP_KERNEL) + select HAVE_GCC_PLUGINS select HAVE_GENERIC_DMA_COHERENT select HAVE_HW_BREAKPOINT if (PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7)) select HAVE_IDE if PCI || ISA || PCMCIA @@ -699,7 +700,7 @@ config ARCH_VIRT depends on ARCH_MULTI_V7 select ARM_AMBA select ARM_GIC - select ARM_GIC_V2M if PCI_MSI + select ARM_GIC_V2M if PCI select ARM_GIC_V3 select ARM_PSCI select HAVE_ARM_ARCH_TIMER diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 3d5a5cd071bd..58faff5f1eb2 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -66,6 +66,8 @@ extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern void __init_stage2_translation(void); + +extern void __kvm_hyp_reset(unsigned long); #endif #endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 96387d477e91..de338d93d11b 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -241,8 +241,7 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index); -static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, - phys_addr_t pgd_ptr, +static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, unsigned long hyp_stack_ptr, unsigned long vector_ptr) { @@ -251,18 +250,13 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, * code. The init code doesn't need to preserve these * registers as r0-r3 are already callee saved according to * the AAPCS. - * Note that we slightly misuse the prototype by casing the + * Note that we slightly misuse the prototype by casting the * stack pointer to a void *. - * - * We don't have enough registers to perform the full init in - * one go. Install the boot PGD first, and then install the - * runtime PGD, stack pointer and vectors. The PGDs are always - * passed as the third argument, in order to be passed into - * r2-r3 to the init code (yes, this is compliant with the - * PCS!). - */ - kvm_call_hyp(NULL, 0, boot_pgd_ptr); + * The PGDs are always passed as the third argument, in order + * to be passed into r2-r3 to the init code (yes, this is + * compliant with the PCS!). + */ kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); } @@ -272,16 +266,13 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } -static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr, +static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr, phys_addr_t phys_idmap_start) { - /* - * TODO - * kvm_call_reset(boot_pgd_ptr, phys_idmap_start); - */ + kvm_call_hyp((void *)virt_to_idmap(__kvm_hyp_reset), vector_ptr); } -static inline int kvm_arch_dev_ioctl_check_extension(long ext) +static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) { return 0; } diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index f0e860761380..6eaff28f2ff3 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -25,9 +25,6 @@ #define __hyp_text __section(.hyp.text) notrace -#define kern_hyp_va(v) (v) -#define hyp_kern_va(v) (v) - #define __ACCESS_CP15(CRn, Op1, CRm, Op2) \ "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32 #define __ACCESS_CP15_64(Op1, CRm) \ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index f9a65061130b..3bb803d6814b 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -26,16 +26,7 @@ * We directly use the kernel VA for the HYP, as we can directly share * the mapping (HTTBR "covers" TTBR1). */ -#define HYP_PAGE_OFFSET_MASK UL(~0) -#define HYP_PAGE_OFFSET PAGE_OFFSET -#define KERN_TO_HYP(kva) (kva) - -/* - * Our virtual mapping for the boot-time MMU-enable code. Must be - * shared across all the page-tables. Conveniently, we use the vectors - * page, where no kernel data will ever be shared with HYP. - */ -#define TRAMPOLINE_VA UL(CONFIG_VECTORS_BASE) +#define kern_hyp_va(kva) (kva) /* * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels. @@ -49,9 +40,8 @@ #include #include -int create_hyp_mappings(void *from, void *to); +int create_hyp_mappings(void *from, void *to, pgprot_t prot); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); -void free_boot_hyp_pgd(void); void free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); @@ -65,7 +55,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); -phys_addr_t kvm_mmu_get_boot_httbr(void); phys_addr_t kvm_get_idmap_vector(void); phys_addr_t kvm_get_idmap_start(void); int kvm_mmu_init(void); diff --git a/arch/arm/include/asm/mach/pci.h b/arch/arm/include/asm/mach/pci.h index 0070e8520cd4..2d88af5be45f 100644 --- a/arch/arm/include/asm/mach/pci.h +++ b/arch/arm/include/asm/mach/pci.h @@ -22,6 +22,7 @@ struct hw_pci { struct msi_controller *msi_ctrl; struct pci_ops *ops; int nr_controllers; + unsigned int io_optional:1; void **private_data; int (*setup)(int nr, struct pci_sys_data *); struct pci_bus *(*scan)(int nr, struct pci_sys_data *); diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index d62204060cbe..a8d656d9aec7 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -97,7 +97,9 @@ extern pgprot_t pgprot_s2_device; #define PAGE_READONLY_EXEC _MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_RDONLY) #define PAGE_KERNEL _MOD_PROT(pgprot_kernel, L_PTE_XN) #define PAGE_KERNEL_EXEC pgprot_kernel -#define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_HYP) +#define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_XN) +#define PAGE_HYP_EXEC _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY) +#define PAGE_HYP_RO _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN) #define PAGE_HYP_DEVICE _MOD_PROT(pgprot_hyp_device, L_PTE_HYP) #define PAGE_S2 _MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY) #define PAGE_S2_DEVICE _MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY) diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h index d4ceaf5f299b..a2e75b84e2ae 100644 --- a/arch/arm/include/asm/virt.h +++ b/arch/arm/include/asm/virt.h @@ -80,6 +80,10 @@ static inline bool is_kernel_in_hyp_mode(void) return false; } +/* The section containing the hypervisor idmap text */ +extern char __hyp_idmap_text_start[]; +extern char __hyp_idmap_text_end[]; + /* The section containing the hypervisor text */ extern char __hyp_text_start[]; extern char __hyp_text_end[]; diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c index 05e61a2eeabe..2f0e07735d1d 100644 --- a/arch/arm/kernel/bios32.c +++ b/arch/arm/kernel/bios32.c @@ -410,7 +410,8 @@ static int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) return irq; } -static int pcibios_init_resources(int busnr, struct pci_sys_data *sys) +static int pcibios_init_resource(int busnr, struct pci_sys_data *sys, + int io_optional) { int ret; struct resource_entry *window; @@ -420,6 +421,14 @@ static int pcibios_init_resources(int busnr, struct pci_sys_data *sys) &iomem_resource, sys->mem_offset); } + /* + * If a platform says I/O port support is optional, we don't add + * the default I/O space. The platform is responsible for adding + * any I/O space it needs. + */ + if (io_optional) + return 0; + resource_list_for_each_entry(window, &sys->resources) if (resource_type(window->res) == IORESOURCE_IO) return 0; @@ -466,7 +475,7 @@ static void pcibios_init_hw(struct device *parent, struct hw_pci *hw, if (ret > 0) { struct pci_host_bridge *host_bridge; - ret = pcibios_init_resources(nr, sys); + ret = pcibios_init_resource(nr, sys, hw->io_optional); if (ret) { kfree(sys); break; @@ -515,25 +524,23 @@ void pci_common_init_dev(struct device *parent, struct hw_pci *hw) list_for_each_entry(sys, &head, node) { struct pci_bus *bus = sys->bus; - if (!pci_has_flag(PCI_PROBE_ONLY)) { + /* + * We insert PCI resources into the iomem_resource and + * ioport_resource trees in either pci_bus_claim_resources() + * or pci_bus_assign_resources(). + */ + if (pci_has_flag(PCI_PROBE_ONLY)) { + pci_bus_claim_resources(bus); + } else { struct pci_bus *child; - /* - * Size the bridge windows. - */ pci_bus_size_bridges(bus); - - /* - * Assign resources. - */ pci_bus_assign_resources(bus); list_for_each_entry(child, &bus->children, node) pcie_bus_configure_settings(child); } - /* - * Tell drivers about devices found. - */ + pci_bus_add_devices(bus); } } @@ -590,18 +597,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, return start; } -/** - * pcibios_enable_device - Enable I/O and memory. - * @dev: PCI device to be enabled - */ -int pcibios_enable_device(struct pci_dev *dev, int mask) -{ - if (pci_has_flag(PCI_PROBE_ONLY)) - return 0; - - return pci_enable_resources(dev, mask); -} - int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 02abfff68ee5..95a000515e43 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -46,13 +46,6 @@ config KVM_ARM_HOST ---help--- Provides host support for ARM processors. -config KVM_NEW_VGIC - bool "New VGIC implementation" - depends on KVM - default y - ---help--- - uses the new VGIC implementation - source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index a596b58f6d37..5e28df80dca7 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -22,7 +22,6 @@ obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o -ifeq ($(CONFIG_KVM_NEW_VGIC),y) obj-y += $(KVM)/arm/vgic/vgic.o obj-y += $(KVM)/arm/vgic/vgic-init.o obj-y += $(KVM)/arm/vgic/vgic-irqfd.o @@ -30,9 +29,4 @@ obj-y += $(KVM)/arm/vgic/vgic-v2.o obj-y += $(KVM)/arm/vgic/vgic-mmio.o obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o -else -obj-y += $(KVM)/arm/vgic.o -obj-y += $(KVM)/arm/vgic-v2.o -obj-y += $(KVM)/arm/vgic-v2-emul.o -endif obj-y += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index f1bde7c4e736..d94bb9093ead 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -122,7 +123,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto out_fail_alloc; - ret = create_hyp_mappings(kvm, kvm + 1); + ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP); if (ret) goto out_free_stage2_pgd; @@ -201,7 +202,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; break; default: - r = kvm_arch_dev_ioctl_check_extension(ext); + r = kvm_arch_dev_ioctl_check_extension(kvm, ext); break; } return r; @@ -239,7 +240,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) if (err) goto free_vcpu; - err = create_hyp_mappings(vcpu, vcpu + 1); + err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); if (err) goto vcpu_uninit; @@ -377,7 +378,7 @@ void force_vm_exit(const cpumask_t *mask) /** * need_new_vmid_gen - check that the VMID is still valid - * @kvm: The VM's VMID to checkt + * @kvm: The VM's VMID to check * * return true if there is a new generation of VMIDs being used * @@ -616,7 +617,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * Enter the guest */ trace_kvm_entry(*vcpu_pc(vcpu)); - __kvm_guest_enter(); + guest_enter_irqoff(); vcpu->mode = IN_GUEST_MODE; ret = kvm_call_hyp(__kvm_vcpu_run, vcpu); @@ -642,14 +643,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) local_irq_enable(); /* - * We do local_irq_enable() before calling kvm_guest_exit() so + * We do local_irq_enable() before calling guest_exit() so * that if a timer interrupt hits while running the guest we * account that tick as being spent in the guest. We enable - * preemption after calling kvm_guest_exit() so that if we get + * preemption after calling guest_exit() so that if we get * preempted we make sure ticks after that is not counted as * guest time. */ - kvm_guest_exit(); + guest_exit(); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* @@ -1039,7 +1040,6 @@ long kvm_arch_vm_ioctl(struct file *filp, static void cpu_init_hyp_mode(void *dummy) { - phys_addr_t boot_pgd_ptr; phys_addr_t pgd_ptr; unsigned long hyp_stack_ptr; unsigned long stack_page; @@ -1048,13 +1048,12 @@ static void cpu_init_hyp_mode(void *dummy) /* Switch from the HYP stub to our own HYP init vector */ __hyp_set_vectors(kvm_get_idmap_vector()); - boot_pgd_ptr = kvm_mmu_get_boot_httbr(); pgd_ptr = kvm_mmu_get_httbr(); stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); hyp_stack_ptr = stack_page + PAGE_SIZE; vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector); - __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr); + __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); __cpu_init_stage2(); kvm_arm_init_debug(); @@ -1076,15 +1075,9 @@ static void cpu_hyp_reinit(void) static void cpu_hyp_reset(void) { - phys_addr_t boot_pgd_ptr; - phys_addr_t phys_idmap_start; - - if (!is_kernel_in_hyp_mode()) { - boot_pgd_ptr = kvm_mmu_get_boot_httbr(); - phys_idmap_start = kvm_get_idmap_start(); - - __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start); - } + if (!is_kernel_in_hyp_mode()) + __cpu_reset_hyp_mode(hyp_default_vectors, + kvm_get_idmap_start()); } static void _kvm_arch_hardware_enable(void *discard) @@ -1294,14 +1287,14 @@ static int init_hyp_mode(void) * Map the Hyp-code called directly from the host */ err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), - kvm_ksym_ref(__hyp_text_end)); + kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC); if (err) { kvm_err("Cannot map world-switch code\n"); goto out_err; } err = create_hyp_mappings(kvm_ksym_ref(__start_rodata), - kvm_ksym_ref(__end_rodata)); + kvm_ksym_ref(__end_rodata), PAGE_HYP_RO); if (err) { kvm_err("Cannot map rodata section\n"); goto out_err; @@ -1312,7 +1305,8 @@ static int init_hyp_mode(void) */ for_each_possible_cpu(cpu) { char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); - err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE); + err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE, + PAGE_HYP); if (err) { kvm_err("Cannot map hyp stack\n"); @@ -1324,7 +1318,7 @@ static int init_hyp_mode(void) kvm_cpu_context_t *cpu_ctxt; cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu); - err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1); + err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP); if (err) { kvm_err("Cannot map host CPU state: %d\n", err); @@ -1332,10 +1326,6 @@ static int init_hyp_mode(void) } } -#ifndef CONFIG_HOTPLUG_CPU - free_boot_hyp_pgd(); -#endif - /* set size of VMID supported by CPU */ kvm_vmid_bits = kvm_get_vmid_bits(); kvm_info("%d-bit VMID\n", kvm_vmid_bits); diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c index a494def3f195..af93e3ffc9f3 100644 --- a/arch/arm/kvm/emulate.c +++ b/arch/arm/kvm/emulate.c @@ -210,7 +210,7 @@ bool kvm_condition_valid(struct kvm_vcpu *vcpu) * @vcpu: The VCPU pointer * * When exceptions occur while instructions are executed in Thumb IF-THEN - * blocks, the ITSTATE field of the CPSR is not advanved (updated), so we have + * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have * to do this little bit of work manually. The fields map like this: * * IT[7:0] -> CPSR[26:25],CPSR[15:10] diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 9093ed0f8b2a..9aca92074f85 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -182,7 +182,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) /** * kvm_arm_copy_reg_indices - get indices of all registers. * - * We do core registers right here, then we apppend coproc regs. + * We do core registers right here, then we append coproc regs. */ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index 1f9ae17476f9..bf89c919efc1 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -32,23 +32,13 @@ * r2,r3 = Hypervisor pgd pointer * * The init scenario is: - * - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd, - * runtime stack, runtime vectors - * - Enable the MMU with the boot pgd - * - Jump to a target into the trampoline page (remember, this is the same - * physical page!) - * - Now switch to the runtime pgd (same VA, and still the same physical - * page!) + * - We jump in HYP with 3 parameters: runtime HYP pgd, runtime stack, + * runtime vectors * - Invalidate TLBs * - Set stack and vectors + * - Setup the page tables + * - Enable the MMU * - Profit! (or eret, if you only care about the code). - * - * As we only have four registers available to pass parameters (and we - * need six), we split the init in two phases: - * - Phase 1: r0 = 0, r1 = 0, r2,r3 contain the boot PGD. - * Provides the basic HYP init, and enable the MMU. - * - Phase 2: r0 = ToS, r1 = vectors, r2,r3 contain the runtime PGD. - * Switches to the runtime PGD, set stack and vectors. */ .text @@ -68,8 +58,11 @@ __kvm_hyp_init: W(b) . __do_hyp_init: - cmp r0, #0 @ We have a SP? - bne phase2 @ Yes, second stage init + @ Set stack pointer + mov sp, r0 + + @ Set HVBAR to point to the HYP vectors + mcr p15, 4, r1, c12, c0, 0 @ HVBAR @ Set the HTTBR to point to the hypervisor PGD pointer passed mcrr p15, 4, rr_lo_hi(r2, r3), c2 @@ -114,34 +107,25 @@ __do_hyp_init: THUMB( ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) ) orr r1, r1, r2 orr r0, r0, r1 - isb mcr p15, 4, r0, c1, c0, 0 @ HSCR - - @ End of init phase-1 - eret - -phase2: - @ Set stack pointer - mov sp, r0 - - @ Set HVBAR to point to the HYP vectors - mcr p15, 4, r1, c12, c0, 0 @ HVBAR - - @ Jump to the trampoline page - ldr r0, =TRAMPOLINE_VA - adr r1, target - bfi r0, r1, #0, #PAGE_SHIFT - ret r0 - -target: @ We're now in the trampoline code, switch page tables - mcrr p15, 4, rr_lo_hi(r2, r3), c2 isb - @ Invalidate the old TLBs - mcr p15, 4, r0, c8, c7, 0 @ TLBIALLH - dsb ish + eret + + @ r0 : stub vectors address +ENTRY(__kvm_hyp_reset) + /* We're now in idmap, disable MMU */ + mrc p15, 4, r1, c1, c0, 0 @ HSCTLR + ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I) + bic r1, r1, r2 + mcr p15, 4, r1, c1, c0, 0 @ HSCTLR + + /* Install stub vectors */ + mcr p15, 4, r0, c12, c0, 0 @ HVBAR + isb eret +ENDPROC(__kvm_hyp_reset) .ltorg diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 45c43aecb8f2..bda27b6b1aa2 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -32,8 +32,6 @@ #include "trace.h" -extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; - static pgd_t *boot_hyp_pgd; static pgd_t *hyp_pgd; static pgd_t *merged_hyp_pgd; @@ -483,28 +481,6 @@ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) } while (pgd++, addr = next, addr != end); } -/** - * free_boot_hyp_pgd - free HYP boot page tables - * - * Free the HYP boot page tables. The bounce page is also freed. - */ -void free_boot_hyp_pgd(void) -{ - mutex_lock(&kvm_hyp_pgd_mutex); - - if (boot_hyp_pgd) { - unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); - unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); - free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); - boot_hyp_pgd = NULL; - } - - if (hyp_pgd) - unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); - - mutex_unlock(&kvm_hyp_pgd_mutex); -} - /** * free_hyp_pgds - free Hyp-mode page tables * @@ -519,15 +495,20 @@ void free_hyp_pgds(void) { unsigned long addr; - free_boot_hyp_pgd(); - mutex_lock(&kvm_hyp_pgd_mutex); + if (boot_hyp_pgd) { + unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); + free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); + boot_hyp_pgd = NULL; + } + if (hyp_pgd) { + unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE); for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) - unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); + unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE); for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) - unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); + unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE); free_pages((unsigned long)hyp_pgd, hyp_pgd_order); hyp_pgd = NULL; @@ -679,17 +660,18 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr) * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode * @from: The virtual kernel start address of the range * @to: The virtual kernel end address of the range (exclusive) + * @prot: The protection to be applied to this range * * The same virtual address as the kernel virtual address is also used * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying * physical pages. */ -int create_hyp_mappings(void *from, void *to) +int create_hyp_mappings(void *from, void *to, pgprot_t prot) { phys_addr_t phys_addr; unsigned long virt_addr; - unsigned long start = KERN_TO_HYP((unsigned long)from); - unsigned long end = KERN_TO_HYP((unsigned long)to); + unsigned long start = kern_hyp_va((unsigned long)from); + unsigned long end = kern_hyp_va((unsigned long)to); if (is_kernel_in_hyp_mode()) return 0; @@ -704,7 +686,7 @@ int create_hyp_mappings(void *from, void *to) err = __create_hyp_mappings(hyp_pgd, virt_addr, virt_addr + PAGE_SIZE, __phys_to_pfn(phys_addr), - PAGE_HYP); + prot); if (err) return err; } @@ -723,8 +705,8 @@ int create_hyp_mappings(void *from, void *to) */ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) { - unsigned long start = KERN_TO_HYP((unsigned long)from); - unsigned long end = KERN_TO_HYP((unsigned long)to); + unsigned long start = kern_hyp_va((unsigned long)from); + unsigned long end = kern_hyp_va((unsigned long)to); if (is_kernel_in_hyp_mode()) return 0; @@ -1687,14 +1669,6 @@ phys_addr_t kvm_mmu_get_httbr(void) return virt_to_phys(hyp_pgd); } -phys_addr_t kvm_mmu_get_boot_httbr(void) -{ - if (__kvm_cpu_uses_extended_idmap()) - return virt_to_phys(merged_hyp_pgd); - else - return virt_to_phys(boot_hyp_pgd); -} - phys_addr_t kvm_get_idmap_vector(void) { return hyp_idmap_vector; @@ -1705,6 +1679,22 @@ phys_addr_t kvm_get_idmap_start(void) return hyp_idmap_start; } +static int kvm_map_idmap_text(pgd_t *pgd) +{ + int err; + + /* Create the idmap in the boot page tables */ + err = __create_hyp_mappings(pgd, + hyp_idmap_start, hyp_idmap_end, + __phys_to_pfn(hyp_idmap_start), + PAGE_HYP_EXEC); + if (err) + kvm_err("Failed to idmap %lx-%lx\n", + hyp_idmap_start, hyp_idmap_end); + + return err; +} + int kvm_mmu_init(void) { int err; @@ -1719,28 +1709,41 @@ int kvm_mmu_init(void) */ BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); - hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); - boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); + kvm_info("IDMAP page: %lx\n", hyp_idmap_start); + kvm_info("HYP VA range: %lx:%lx\n", + kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL)); - if (!hyp_pgd || !boot_hyp_pgd) { + if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && + hyp_idmap_start < kern_hyp_va(~0UL)) { + /* + * The idmap page is intersecting with the VA space, + * it is not safe to continue further. + */ + kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); + err = -EINVAL; + goto out; + } + + hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); + if (!hyp_pgd) { kvm_err("Hyp mode PGD not allocated\n"); err = -ENOMEM; goto out; } - /* Create the idmap in the boot page tables */ - err = __create_hyp_mappings(boot_hyp_pgd, - hyp_idmap_start, hyp_idmap_end, - __phys_to_pfn(hyp_idmap_start), - PAGE_HYP); - - if (err) { - kvm_err("Failed to idmap %lx-%lx\n", - hyp_idmap_start, hyp_idmap_end); - goto out; - } - if (__kvm_cpu_uses_extended_idmap()) { + boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + hyp_pgd_order); + if (!boot_hyp_pgd) { + kvm_err("Hyp boot PGD not allocated\n"); + err = -ENOMEM; + goto out; + } + + err = kvm_map_idmap_text(boot_hyp_pgd); + if (err) + goto out; + merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); if (!merged_hyp_pgd) { kvm_err("Failed to allocate extra HYP pgd\n"); @@ -1748,29 +1751,10 @@ int kvm_mmu_init(void) } __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, hyp_idmap_start); - return 0; - } - - /* Map the very same page at the trampoline VA */ - err = __create_hyp_mappings(boot_hyp_pgd, - TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE, - __phys_to_pfn(hyp_idmap_start), - PAGE_HYP); - if (err) { - kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n", - TRAMPOLINE_VA); - goto out; - } - - /* Map the same page again into the runtime page tables */ - err = __create_hyp_mappings(hyp_pgd, - TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE, - __phys_to_pfn(hyp_idmap_start), - PAGE_HYP); - if (err) { - kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n", - TRAMPOLINE_VA); - goto out; + } else { + err = kvm_map_idmap_text(hyp_pgd); + if (err) + goto out; } return 0; diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c index 0048b5a62a50..4b5e802e57d1 100644 --- a/arch/arm/kvm/reset.c +++ b/arch/arm/kvm/reset.c @@ -52,7 +52,7 @@ static const struct kvm_irq_level cortexa_vtimer_irq = { * @vcpu: The VCPU pointer * * This function finds the right table above and sets the registers on the - * virtual CPU struct to their architectually defined reset values. + * virtual CPU struct to their architecturally defined reset values. */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9f8b99e20557..69c8787bec7d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -3,6 +3,7 @@ config ARM64 select ACPI_CCA_REQUIRED if ACPI select ACPI_GENERIC_GSI if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI + select ACPI_MCFG if ACPI select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE @@ -22,9 +23,9 @@ config ARM64 select ARM_ARCH_TIMER select ARM_GIC select AUDIT_ARCH_COMPAT_GENERIC - select ARM_GIC_V2M if PCI_MSI + select ARM_GIC_V2M if PCI select ARM_GIC_V3 - select ARM_GIC_V3_ITS if PCI_MSI + select ARM_GIC_V3_ITS if PCI select ARM_PSCI_FW select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS @@ -78,6 +79,7 @@ config ARM64 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_GCC_PLUGINS select HAVE_GENERIC_DMA_COHERENT select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING @@ -101,6 +103,7 @@ config ARM64 select OF_EARLY_FLATTREE select OF_NUMA if NUMA && OF select OF_RESERVED_MEM + select PCI_ECAM if ACPI select PERF_USE_VMALLOC select POWER_RESET select POWER_SUPPLY diff --git a/arch/arm64/boot/dts/marvell/armada-3720-db.dts b/arch/arm64/boot/dts/marvell/armada-3720-db.dts index 86110a6ae330..1372e9a6aaa4 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-db.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-db.dts @@ -76,3 +76,8 @@ &uart0 { &usb3 { status = "okay"; }; + +/* CON17 (PCIe) / CON12 (mini-PCIe) */ +&pcie0 { + status = "okay"; +}; diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi index eb29280962d7..c4762538ec01 100644 --- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi @@ -176,5 +176,30 @@ gic: interrupt-controller@1d00000 { <0x1d40000 0x40000>; /* GICR */ }; }; + + pcie0: pcie@d0070000 { + compatible = "marvell,armada-3700-pcie"; + device_type = "pci"; + status = "disabled"; + reg = <0 0xd0070000 0 0x20000>; + #address-cells = <3>; + #size-cells = <2>; + bus-range = <0x00 0xff>; + interrupts = ; + #interrupt-cells = <1>; + msi-parent = <&pcie0>; + msi-controller; + ranges = <0x82000000 0 0xe8000000 0 0xe8000000 0 0x1000000 /* Port 0 MEM */ + 0x81000000 0 0xe9000000 0 0xe9000000 0 0x10000>; /* Port 0 IO*/ + interrupt-map-mask = <0 0 0 7>; + interrupt-map = <0 0 0 1 &pcie_intc 0>, + <0 0 0 2 &pcie_intc 1>, + <0 0 0 3 &pcie_intc 2>, + <0 0 0 4 &pcie_intc 3>; + pcie_intc: interrupt-controller { + interrupt-controller; + #interrupt-cells = <1>; + }; + }; }; }; diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 49dd1bd3ea50..7099f26e3702 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -36,8 +36,9 @@ #define ARM64_HAS_VIRT_HOST_EXTN 11 #define ARM64_WORKAROUND_CAVIUM_27456 12 #define ARM64_HAS_32BIT_EL0 13 +#define ARM64_HYP_OFFSET_LOW 14 -#define ARM64_NCAPS 14 +#define ARM64_NCAPS 15 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 2cdb6b551ac6..4b5c977af465 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -178,7 +178,7 @@ /* Hyp System Trap Register */ #define HSTR_EL2_T(x) (1 << x) -/* Hyp Coproccessor Trap Register Shifts */ +/* Hyp Coprocessor Trap Register Shifts */ #define CPTR_EL2_TFP_SHIFT 10 /* Hyp Coprocessor Trap Register */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 49095fc4b482..3eda975837d0 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -47,8 +47,7 @@ int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); -int kvm_arch_dev_ioctl_check_extension(long ext); -unsigned long kvm_hyp_reset_entry(void); +int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); struct kvm_arch { @@ -348,8 +347,7 @@ int kvm_perf_teardown(void); struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); -static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, - phys_addr_t pgd_ptr, +static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, unsigned long hyp_stack_ptr, unsigned long vector_ptr) { @@ -357,19 +355,14 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, * Call initialization code, and switch to the full blown * HYP code. */ - __kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr, - hyp_stack_ptr, vector_ptr); + __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr); } -static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr, +void __kvm_hyp_teardown(void); +static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr, phys_addr_t phys_idmap_start) { - /* - * Call reset code, and switch back to stub hyp vectors. - * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation. - */ - __kvm_call_hyp((void *)kvm_hyp_reset_entry(), - boot_pgd_ptr, phys_idmap_start); + kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start); } static inline void kvm_arch_hardware_unsetup(void) {} diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 44eaff70da6a..cff510574fae 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -25,29 +25,6 @@ #define __hyp_text __section(.hyp.text) notrace -static inline unsigned long __kern_hyp_va(unsigned long v) -{ - asm volatile(ALTERNATIVE("and %0, %0, %1", - "nop", - ARM64_HAS_VIRT_HOST_EXTN) - : "+r" (v) : "i" (HYP_PAGE_OFFSET_MASK)); - return v; -} - -#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v))) - -static inline unsigned long __hyp_kern_va(unsigned long v) -{ - u64 offset = PAGE_OFFSET - HYP_PAGE_OFFSET; - asm volatile(ALTERNATIVE("add %0, %0, %1", - "nop", - ARM64_HAS_VIRT_HOST_EXTN) - : "+r" (v) : "r" (offset)); - return v; -} - -#define hyp_kern_va(v) (typeof(v))(__hyp_kern_va((unsigned long)(v))) - #define read_sysreg_elx(r,nvh,vh) \ ({ \ u64 reg; \ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index f05ac27d033e..b6bb83400cd8 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -29,21 +29,48 @@ * * Instead, give the HYP mode its own VA region at a fixed offset from * the kernel by just masking the top bits (which are all ones for a - * kernel address). + * kernel address). We need to find out how many bits to mask. * - * ARMv8.1 (using VHE) does have a TTBR1_EL2, and doesn't use these - * macros (the entire kernel runs at EL2). + * We want to build a set of page tables that cover both parts of the + * idmap (the trampoline page used to initialize EL2), and our normal + * runtime VA space, at the same time. + * + * Given that the kernel uses VA_BITS for its entire address space, + * and that half of that space (VA_BITS - 1) is used for the linear + * mapping, we can also limit the EL2 space to (VA_BITS - 1). + * + * The main question is "Within the VA_BITS space, does EL2 use the + * top or the bottom half of that space to shadow the kernel's linear + * mapping?". As we need to idmap the trampoline page, this is + * determined by the range in which this page lives. + * + * If the page is in the bottom half, we have to use the top half. If + * the page is in the top half, we have to use the bottom half: + * + * T = __virt_to_phys(__hyp_idmap_text_start) + * if (T & BIT(VA_BITS - 1)) + * HYP_VA_MIN = 0 //idmap in upper half + * else + * HYP_VA_MIN = 1 << (VA_BITS - 1) + * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1 + * + * This of course assumes that the trampoline page exists within the + * VA_BITS range. If it doesn't, then it means we're in the odd case + * where the kernel idmap (as well as HYP) uses more levels than the + * kernel runtime page tables (as seen when the kernel is configured + * for 4k pages, 39bits VA, and yet memory lives just above that + * limit, forcing the idmap to use 4 levels of page tables while the + * kernel itself only uses 3). In this particular case, it doesn't + * matter which side of VA_BITS we use, as we're guaranteed not to + * conflict with anything. + * + * When using VHE, there are no separate hyp mappings and all KVM + * functionality is already mapped as part of the main kernel + * mappings, and none of this applies in that case. */ -#define HYP_PAGE_OFFSET_SHIFT VA_BITS -#define HYP_PAGE_OFFSET_MASK ((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1) -#define HYP_PAGE_OFFSET (PAGE_OFFSET & HYP_PAGE_OFFSET_MASK) -/* - * Our virtual mapping for the idmap-ed MMU-enable code. Must be - * shared across all the page-tables. Conveniently, we use the last - * possible page, where no kernel mapping will ever exist. - */ -#define TRAMPOLINE_VA (HYP_PAGE_OFFSET_MASK & PAGE_MASK) +#define HYP_PAGE_OFFSET_HIGH_MASK ((UL(1) << VA_BITS) - 1) +#define HYP_PAGE_OFFSET_LOW_MASK ((UL(1) << (VA_BITS - 1)) - 1) #ifdef __ASSEMBLY__ @@ -53,13 +80,33 @@ /* * Convert a kernel VA into a HYP VA. * reg: VA to be converted. + * + * This generates the following sequences: + * - High mask: + * and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK + * nop + * - Low mask: + * and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK + * and x0, x0, #HYP_PAGE_OFFSET_LOW_MASK + * - VHE: + * nop + * nop + * + * The "low mask" version works because the mask is a strict subset of + * the "high mask", hence performing the first mask for nothing. + * Should be completely invisible on any viable CPU. */ .macro kern_hyp_va reg -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - and \reg, \reg, #HYP_PAGE_OFFSET_MASK +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN + and \reg, \reg, #HYP_PAGE_OFFSET_HIGH_MASK alternative_else nop alternative_endif +alternative_if_not ARM64_HYP_OFFSET_LOW + nop +alternative_else + and \reg, \reg, #HYP_PAGE_OFFSET_LOW_MASK +alternative_endif .endm #else @@ -70,7 +117,22 @@ alternative_endif #include #include -#define KERN_TO_HYP(kva) ((unsigned long)kva - PAGE_OFFSET + HYP_PAGE_OFFSET) +static inline unsigned long __kern_hyp_va(unsigned long v) +{ + asm volatile(ALTERNATIVE("and %0, %0, %1", + "nop", + ARM64_HAS_VIRT_HOST_EXTN) + : "+r" (v) + : "i" (HYP_PAGE_OFFSET_HIGH_MASK)); + asm volatile(ALTERNATIVE("nop", + "and %0, %0, %1", + ARM64_HYP_OFFSET_LOW) + : "+r" (v) + : "i" (HYP_PAGE_OFFSET_LOW_MASK)); + return v; +} + +#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v))) /* * We currently only support a 40bit IPA. @@ -81,9 +143,8 @@ alternative_endif #include -int create_hyp_mappings(void *from, void *to); +int create_hyp_mappings(void *from, void *to, pgprot_t prot); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); -void free_boot_hyp_pgd(void); void free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); @@ -97,7 +158,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); -phys_addr_t kvm_mmu_get_boot_httbr(void); phys_addr_t kvm_get_idmap_vector(void); phys_addr_t kvm_get_idmap_start(void); int kvm_mmu_init(void); diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 2813748e2f24..c3ae239db3ee 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -164,6 +164,7 @@ #define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */ #define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */ #define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */ +#define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */ /* * AttrIndx[2:0] encoding (mapping attributes defined in the MAIR* registers). diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 29fcb33ab401..39f5252673f7 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -55,7 +55,9 @@ #define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE) #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT) -#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP) +#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) +#define PAGE_HYP_EXEC __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) +#define PAGE_HYP_RO __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) #define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index bbc6a8cf83f1..1788545f25bc 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -87,6 +87,10 @@ extern void verify_cpu_run_el(void); static inline void verify_cpu_run_el(void) {} #endif +/* The section containing the hypervisor idmap text */ +extern char __hyp_idmap_text_start[]; +extern char __hyp_idmap_text_end[]; + /* The section containing the hypervisor text */ extern char __hyp_text_start[]; extern char __hyp_text_end[]; diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index f209ea151dca..3051f86a9b5f 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -87,9 +87,11 @@ struct kvm_regs { /* Supported VGICv3 address types */ #define KVM_VGIC_V3_ADDR_TYPE_DIST 2 #define KVM_VGIC_V3_ADDR_TYPE_REDIST 3 +#define KVM_VGIC_ITS_ADDR_TYPE 4 #define KVM_VGIC_V3_DIST_SIZE SZ_64K #define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K) +#define KVM_VGIC_V3_ITS_SIZE (2 * SZ_64K) #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ #define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 916d27ad79c1..62272eac1352 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -726,6 +726,19 @@ static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused return is_kernel_in_hyp_mode(); } +static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry, + int __unused) +{ + phys_addr_t idmap_addr = virt_to_phys(__hyp_idmap_text_start); + + /* + * Activate the lower HYP offset only if: + * - the idmap doesn't clash with it, + * - the kernel is not running at EL2. + */ + return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode(); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -803,6 +816,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .field_pos = ID_AA64PFR0_EL0_SHIFT, .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, }, + { + .desc = "Reduced HYP mapping offset", + .capability = ARM64_HYP_OFFSET_LOW, + .def_scope = SCOPE_SYSTEM, + .matches = hyp_offset_low, + }, {}, }; diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index 3c4e308b40a0..acf38722457b 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include #include /* @@ -36,25 +39,17 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, return res->start; } -/** - * pcibios_enable_device - Enable I/O and memory. - * @dev: PCI device to be enabled - * @mask: bitmask of BARs to enable - */ -int pcibios_enable_device(struct pci_dev *dev, int mask) -{ - if (pci_has_flag(PCI_PROBE_ONLY)) - return 0; - - return pci_enable_resources(dev, mask); -} - /* - * Try to assign the IRQ number from DT when adding a new device + * Try to assign the IRQ number when probing a new device */ -int pcibios_add_device(struct pci_dev *dev) +int pcibios_alloc_irq(struct pci_dev *dev) { - dev->irq = of_irq_parse_and_map_pci(dev, 0, 0); + if (acpi_disabled) + dev->irq = of_irq_parse_and_map_pci(dev, 0, 0); +#ifdef CONFIG_ACPI + else + return acpi_pci_irq_enable(dev); +#endif return 0; } @@ -65,13 +60,21 @@ int pcibios_add_device(struct pci_dev *dev) int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val) { - return -ENXIO; + struct pci_bus *b = pci_find_bus(domain, bus); + + if (!b) + return PCIBIOS_DEVICE_NOT_FOUND; + return b->ops->read(b, devfn, reg, len, val); } int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 val) { - return -ENXIO; + struct pci_bus *b = pci_find_bus(domain, bus); + + if (!b) + return PCIBIOS_DEVICE_NOT_FOUND; + return b->ops->write(b, devfn, reg, len, val); } #ifdef CONFIG_NUMA @@ -85,10 +88,124 @@ EXPORT_SYMBOL(pcibus_to_node); #endif #ifdef CONFIG_ACPI -/* Root bridge scanning */ + +struct acpi_pci_generic_root_info { + struct acpi_pci_root_info common; + struct pci_config_window *cfg; /* config space mapping */ +}; + +int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) +{ + struct pci_config_window *cfg = bus->sysdata; + struct acpi_device *adev = to_acpi_device(cfg->parent); + struct acpi_pci_root *root = acpi_driver_data(adev); + + return root->segment; +} + +int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) +{ + if (!acpi_disabled) { + struct pci_config_window *cfg = bridge->bus->sysdata; + struct acpi_device *adev = to_acpi_device(cfg->parent); + ACPI_COMPANION_SET(&bridge->dev, adev); + } + + return 0; +} + +/* + * Lookup the bus range for the domain in MCFG, and set up config space + * mapping. + */ +static struct pci_config_window * +pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root) +{ + struct resource *bus_res = &root->secondary; + u16 seg = root->segment; + struct pci_config_window *cfg; + struct resource cfgres; + unsigned int bsz; + + /* Use address from _CBA if present, otherwise lookup MCFG */ + if (!root->mcfg_addr) + root->mcfg_addr = pci_mcfg_lookup(seg, bus_res); + + if (!root->mcfg_addr) { + dev_err(&root->device->dev, "%04x:%pR ECAM region not found\n", + seg, bus_res); + return NULL; + } + + bsz = 1 << pci_generic_ecam_ops.bus_shift; + cfgres.start = root->mcfg_addr + bus_res->start * bsz; + cfgres.end = cfgres.start + resource_size(bus_res) * bsz - 1; + cfgres.flags = IORESOURCE_MEM; + cfg = pci_ecam_create(&root->device->dev, &cfgres, bus_res, + &pci_generic_ecam_ops); + if (IS_ERR(cfg)) { + dev_err(&root->device->dev, "%04x:%pR error %ld mapping ECAM\n", + seg, bus_res, PTR_ERR(cfg)); + return NULL; + } + + return cfg; +} + +/* release_info: free resources allocated by init_info */ +static void pci_acpi_generic_release_info(struct acpi_pci_root_info *ci) +{ + struct acpi_pci_generic_root_info *ri; + + ri = container_of(ci, struct acpi_pci_generic_root_info, common); + pci_ecam_free(ri->cfg); + kfree(ri); +} + +static struct acpi_pci_root_ops acpi_pci_root_ops = { + .release_info = pci_acpi_generic_release_info, +}; + +/* Interface called from ACPI code to setup PCI host controller */ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) { - /* TODO: Should be revisited when implementing PCI on ACPI */ - return NULL; + int node = acpi_get_node(root->device->handle); + struct acpi_pci_generic_root_info *ri; + struct pci_bus *bus, *child; + + ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node); + if (!ri) + return NULL; + + ri->cfg = pci_acpi_setup_ecam_mapping(root); + if (!ri->cfg) { + kfree(ri); + return NULL; + } + + acpi_pci_root_ops.pci_ops = &ri->cfg->ops->pci_ops; + bus = acpi_pci_root_create(root, &acpi_pci_root_ops, &ri->common, + ri->cfg); + if (!bus) + return NULL; + + pci_bus_size_bridges(bus); + pci_bus_assign_resources(bus); + + list_for_each_entry(child, &bus->children, node) + pcie_bus_configure_settings(child); + + return bus; } + +void pcibios_add_bus(struct pci_bus *bus) +{ + acpi_pci_add_bus(bus); +} + +void pcibios_remove_bus(struct pci_bus *bus) +{ + acpi_pci_remove_bus(bus); +} + #endif diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index c4f26ef91e77..9d2eff0b3ad3 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -36,6 +36,7 @@ config KVM select HAVE_KVM_IRQFD select KVM_ARM_VGIC_V3 select KVM_ARM_PMU if HW_PERF_EVENTS + select HAVE_KVM_MSI ---help--- Support hosting virtualized guest machines. We don't support KVM with 16K page tables yet, due to the multiple @@ -54,13 +55,6 @@ config KVM_ARM_PMU Adds support for a virtual Performance Monitoring Unit (PMU) in virtual machines. -config KVM_NEW_VGIC - bool "New VGIC implementation" - depends on KVM - default y - ---help--- - uses the new VGIC implementation - source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index a7a958ca29d5..a5b96642a9cb 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -20,7 +20,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o -ifeq ($(CONFIG_KVM_NEW_VGIC),y) kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o @@ -30,12 +29,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o -else -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o -endif +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 32fad75bb9ff..3f9e15722473 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -211,7 +211,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) /** * kvm_arm_copy_reg_indices - get indices of all registers. * - * We do core registers right here, then we apppend system regs. + * We do core registers right here, then we append system regs. */ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index a873a6d8be90..6b29d3d9e1f2 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -53,10 +53,9 @@ __invalid: b . /* - * x0: HYP boot pgd - * x1: HYP pgd - * x2: HYP stack - * x3: HYP vectors + * x0: HYP pgd + * x1: HYP stack + * x2: HYP vectors */ __do_hyp_init: @@ -110,71 +109,27 @@ __do_hyp_init: msr sctlr_el2, x4 isb - /* Skip the trampoline dance if we merged the boot and runtime PGDs */ - cmp x0, x1 - b.eq merged - - /* MMU is now enabled. Get ready for the trampoline dance */ - ldr x4, =TRAMPOLINE_VA - adr x5, target - bfi x4, x5, #0, #PAGE_SHIFT - br x4 - -target: /* We're now in the trampoline code, switch page tables */ - msr ttbr0_el2, x1 - isb - - /* Invalidate the old TLBs */ - tlbi alle2 - dsb sy - -merged: /* Set the stack and new vectors */ + kern_hyp_va x1 + mov sp, x1 kern_hyp_va x2 - mov sp, x2 - kern_hyp_va x3 - msr vbar_el2, x3 + msr vbar_el2, x2 /* Hello, World! */ eret ENDPROC(__kvm_hyp_init) /* - * Reset kvm back to the hyp stub. This is the trampoline dance in - * reverse. If kvm used an extended idmap, __extended_idmap_trampoline - * calls this code directly in the idmap. In this case switching to the - * boot tables is a no-op. - * - * x0: HYP boot pgd - * x1: HYP phys_idmap_start + * Reset kvm back to the hyp stub. */ ENTRY(__kvm_hyp_reset) - /* We're in trampoline code in VA, switch back to boot page tables */ - msr ttbr0_el2, x0 - isb - - /* Ensure the PA branch doesn't find a stale tlb entry or stale code. */ - ic iallu - tlbi alle2 - dsb sy - isb - - /* Branch into PA space */ - adr x0, 1f - bfi x1, x0, #0, #PAGE_SHIFT - br x1 - /* We're now in idmap, disable MMU */ -1: mrs x0, sctlr_el2 + mrs x0, sctlr_el2 ldr x1, =SCTLR_ELx_FLAGS bic x0, x0, x1 // Clear SCTL_M and etc msr sctlr_el2, x0 isb - /* Invalidate the old TLBs */ - tlbi alle2 - dsb sy - /* Install stub vectors */ adr_l x0, __hyp_stub_vectors msr vbar_el2, x0 diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 70254a65bd5b..ce9e5e5f28cf 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -164,22 +164,3 @@ alternative_endif eret ENDPROC(__fpsimd_guest_restore) - -/* - * When using the extended idmap, we don't have a trampoline page we can use - * while we switch pages tables during __kvm_hyp_reset. Accessing the idmap - * directly would be ideal, but if we're using the extended idmap then the - * idmap is located above HYP_PAGE_OFFSET, and the address will be masked by - * kvm_call_hyp using kern_hyp_va. - * - * x0: HYP boot pgd - * x1: HYP phys_idmap_start - */ -ENTRY(__extended_idmap_trampoline) - mov x4, x1 - adr_l x3, __kvm_hyp_reset - - /* insert __kvm_hyp_reset()s offset into phys_idmap_start */ - bfi x4, x3, #0, #PAGE_SHIFT - br x4 -ENDPROC(__extended_idmap_trampoline) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 2d87f36d5cb4..f6d9694ae3b1 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -62,6 +62,21 @@ ENTRY(__vhe_hyp_call) isb ret ENDPROC(__vhe_hyp_call) + +/* + * Compute the idmap address of __kvm_hyp_reset based on the idmap + * start passed as a parameter, and jump there. + * + * x0: HYP phys_idmap_start + */ +ENTRY(__kvm_hyp_teardown) + mov x4, x0 + adr_l x3, __kvm_hyp_reset + + /* insert __kvm_hyp_reset()s offset into phys_idmap_start */ + bfi x4, x3, #0, #PAGE_SHIFT + br x4 +ENDPROC(__kvm_hyp_teardown) el1_sync: // Guest trapped into EL2 save_x0_to_x3 diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 4373997d1a70..ae7855f16ec2 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -299,9 +299,16 @@ static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:% static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par) { - unsigned long str_va = (unsigned long)__hyp_panic_string; + unsigned long str_va; - __hyp_do_panic(hyp_kern_va(str_va), + /* + * Force the panic string to be loaded from the literal pool, + * making sure it is a kernel address and not a PC-relative + * reference. + */ + asm volatile("ldr %0, =__hyp_panic_string" : "=r" (str_va)); + + __hyp_do_panic(str_va, spsr, elr, read_sysreg(esr_el2), read_sysreg_el2(far), read_sysreg(hpfar_el2), par, diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index b1ad730e1567..5bc460884639 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -65,7 +65,7 @@ static bool cpu_has_32bit_el1(void) * We currently assume that the number of HW registers is uniform * across all CPUs (see cpuinfo_sanity_check). */ -int kvm_arch_dev_ioctl_check_extension(long ext) +int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) { int r; @@ -86,6 +86,12 @@ int kvm_arch_dev_ioctl_check_extension(long ext) case KVM_CAP_VCPU_ATTRIBUTES: r = 1; break; + case KVM_CAP_MSI_DEVID: + if (!kvm) + r = -EINVAL; + else + r = kvm->arch.vgic.msis_require_devid; + break; default: r = 0; } @@ -98,7 +104,7 @@ int kvm_arch_dev_ioctl_check_extension(long ext) * @vcpu: The VCPU pointer * * This function finds the right table above and sets the registers on - * the virtual CPU struct to their architectually defined reset + * the virtual CPU struct to their architecturally defined reset * values. */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) @@ -132,31 +138,3 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) /* Reset timer */ return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq); } - -extern char __hyp_idmap_text_start[]; - -unsigned long kvm_hyp_reset_entry(void) -{ - if (!__kvm_cpu_uses_extended_idmap()) { - unsigned long offset; - - /* - * Find the address of __kvm_hyp_reset() in the trampoline page. - * This is present in the running page tables, and the boot page - * tables, so we call the code here to start the trampoline - * dance in reverse. - */ - offset = (unsigned long)__kvm_hyp_reset - - ((unsigned long)__hyp_idmap_text_start & PAGE_MASK); - - return TRAMPOLINE_VA + offset; - } else { - /* - * KVM is running with merged page tables, which don't have the - * trampoline page mapped. We know the idmap is still mapped, - * but can't be called into directly. Use - * __extended_idmap_trampoline to do the call. - */ - return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline); - } -} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a57d650f552c..b0b225ceca18 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1546,7 +1546,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu, struct sys_reg_params *params) { u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu); - int cp; + int cp = -1; switch(hsr_ec) { case ESR_ELx_EC_CP15_32: @@ -1558,7 +1558,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu, cp = 14; break; default: - WARN_ON((cp = -1)); + WARN_ON(1); } kvm_err("Unsupported guest CP%d access at: %08lx\n", diff --git a/arch/cris/arch-v10/drivers/axisflashmap.c b/arch/cris/arch-v10/drivers/axisflashmap.c index 60d57c590032..bdc25aa43468 100644 --- a/arch/cris/arch-v10/drivers/axisflashmap.c +++ b/arch/cris/arch-v10/drivers/axisflashmap.c @@ -397,7 +397,7 @@ static int __init init_axis_flash(void) if (!romfs_in_flash) { /* Create an RAM device for the root partition (romfs). */ -#if !defined(CONFIG_MTD_MTDRAM) || (CONFIG_MTDRAM_TOTAL_SIZE != 0) || (CONFIG_MTDRAM_ABS_POS != 0) +#if !defined(CONFIG_MTD_MTDRAM) || (CONFIG_MTDRAM_TOTAL_SIZE != 0) /* No use trying to boot this kernel from RAM. Panic! */ printk(KERN_EMERG "axisflashmap: Cannot create an MTD RAM " "device due to kernel (mis)configuration!\n"); diff --git a/arch/cris/arch-v32/drivers/axisflashmap.c b/arch/cris/arch-v32/drivers/axisflashmap.c index bd10d3ba0949..87656c41fec7 100644 --- a/arch/cris/arch-v32/drivers/axisflashmap.c +++ b/arch/cris/arch-v32/drivers/axisflashmap.c @@ -320,7 +320,7 @@ static int __init init_axis_flash(void) * but its size must be configured as 0 so as not to conflict * with our usage. */ -#if !defined(CONFIG_MTD_MTDRAM) || (CONFIG_MTDRAM_TOTAL_SIZE != 0) || (CONFIG_MTDRAM_ABS_POS != 0) +#if !defined(CONFIG_MTD_MTDRAM) || (CONFIG_MTDRAM_TOTAL_SIZE != 0) if (!romfs_in_flash && !nand_boot) { printk(KERN_EMERG "axisflashmap: Cannot create an MTD RAM " "device; configure CONFIG_MTD_MTDRAM with size = 0!\n"); diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h index fc3ecb55f1b2..2a120bb70e54 100644 --- a/arch/microblaze/include/asm/pci.h +++ b/arch/microblaze/include/asm/pci.h @@ -82,9 +82,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, pgprot_t prot); #define HAVE_ARCH_PCI_RESOURCE_TO_USER -extern void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, - resource_size_t *start, resource_size_t *end); extern void pcibios_setup_bus_devices(struct pci_bus *bus); extern void pcibios_setup_bus_self(struct pci_bus *bus); diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index 14cba600da7a..81556b843a8e 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -218,33 +218,6 @@ static struct resource *__pci_mmap_make_offset(struct pci_dev *dev, return NULL; } -/* - * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci - * device mapping. - */ -static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp, - pgprot_t protection, - enum pci_mmap_state mmap_state, - int write_combine) -{ - pgprot_t prot = protection; - - /* Write combine is always 0 on non-memory space mappings. On - * memory space, if the user didn't pass 1, we check for a - * "prefetchable" resource. This is a bit hackish, but we use - * this to workaround the inability of /sysfs to provide a write - * combine bit - */ - if (mmap_state != pci_mmap_mem) - write_combine = 0; - else if (write_combine == 0) { - if (rp->flags & IORESOURCE_PREFETCH) - write_combine = 1; - } - - return pgprot_noncached(prot); -} - /* * This one is used by /dev/mem and fbdev who have no clue about the * PCI device, it tries to find the PCI device first and calls the @@ -317,9 +290,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, return -EINVAL; vma->vm_pgoff = offset >> PAGE_SHIFT; - vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp, - vma->vm_page_prot, - mmap_state, write_combine); + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, vma->vm_page_prot); @@ -473,39 +444,25 @@ void pci_resource_to_user(const struct pci_dev *dev, int bar, const struct resource *rsrc, resource_size_t *start, resource_size_t *end) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - resource_size_t offset = 0; + struct pci_bus_region region; - if (hose == NULL) + if (rsrc->flags & IORESOURCE_IO) { + pcibios_resource_to_bus(dev->bus, ®ion, + (struct resource *) rsrc); + *start = region.start; + *end = region.end; return; + } - if (rsrc->flags & IORESOURCE_IO) - offset = (unsigned long)hose->io_base_virt - _IO_BASE; - - /* We pass a fully fixed up address to userland for MMIO instead of - * a BAR value because X is lame and expects to be able to use that - * to pass to /dev/mem ! + /* We pass a CPU physical address to userland for MMIO instead of a + * BAR value because X is lame and expects to be able to use that + * to pass to /dev/mem! * - * That means that we'll have potentially 64 bits values where some - * userland apps only expect 32 (like X itself since it thinks only - * Sparc has 64 bits MMIO) but if we don't do that, we break it on - * 32 bits CHRPs :-( - * - * Hopefully, the sysfs insterface is immune to that gunk. Once X - * has been fixed (and the fix spread enough), we can re-enable the - * 2 lines below and pass down a BAR value to userland. In that case - * we'll also have to re-enable the matching code in - * __pci_mmap_make_offset(). - * - * BenH. + * That means we may have 64-bit values where some apps only expect + * 32 (like X itself since it thinks only Sparc has 64-bit MMIO). */ -#if 0 - else if (rsrc->flags & IORESOURCE_MEM) - offset = hose->pci_mem_offset; -#endif - - *start = rsrc->start - offset; - *end = rsrc->end - offset; + *start = rsrc->start; + *end = rsrc->end; } /** diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ac91939b9b75..29867139851e 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1488,6 +1488,7 @@ config CPU_MIPS64_R2 select CPU_SUPPORTS_HIGHMEM select CPU_SUPPORTS_HUGEPAGES select CPU_SUPPORTS_MSA + select HAVE_KVM help Choose this option to build a kernel for release 2 or later of the MIPS64 architecture. Many modern embedded systems with a 64-bit @@ -1505,6 +1506,7 @@ config CPU_MIPS64_R6 select CPU_SUPPORTS_MSA select GENERIC_CSUM select MIPS_O32_FP64_SUPPORT if MIPS32_O32 + select HAVE_KVM help Choose this option to build a kernel for release 6 or later of the MIPS64 architecture. New MIPS processors, starting with the Warrior diff --git a/arch/mips/include/asm/addrspace.h b/arch/mips/include/asm/addrspace.h index 3b0e51d5a613..c5b04e752e97 100644 --- a/arch/mips/include/asm/addrspace.h +++ b/arch/mips/include/asm/addrspace.h @@ -45,7 +45,7 @@ /* * Returns the kernel segment base of a given address */ -#define KSEGX(a) ((_ACAST32_ (a)) & 0xe0000000) +#define KSEGX(a) ((_ACAST32_(a)) & _ACAST32_(0xe0000000)) /* * Returns the physical address of a CKSEGx / XKPHYS address diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 36a391d289aa..b54bcadd8aec 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -19,6 +19,9 @@ #include #include +#include +#include + /* MIPS KVM register ids */ #define MIPS_CP0_32(_R, _S) \ (KVM_REG_MIPS_CP0 | KVM_REG_SIZE_U32 | (8 * (_R) + (_S))) @@ -53,6 +56,12 @@ #define KVM_REG_MIPS_CP0_CONFIG7 MIPS_CP0_32(16, 7) #define KVM_REG_MIPS_CP0_XCONTEXT MIPS_CP0_64(20, 0) #define KVM_REG_MIPS_CP0_ERROREPC MIPS_CP0_64(30, 0) +#define KVM_REG_MIPS_CP0_KSCRATCH1 MIPS_CP0_64(31, 2) +#define KVM_REG_MIPS_CP0_KSCRATCH2 MIPS_CP0_64(31, 3) +#define KVM_REG_MIPS_CP0_KSCRATCH3 MIPS_CP0_64(31, 4) +#define KVM_REG_MIPS_CP0_KSCRATCH4 MIPS_CP0_64(31, 5) +#define KVM_REG_MIPS_CP0_KSCRATCH5 MIPS_CP0_64(31, 6) +#define KVM_REG_MIPS_CP0_KSCRATCH6 MIPS_CP0_64(31, 7) #define KVM_MAX_VCPUS 1 @@ -65,8 +74,14 @@ -/* Special address that contains the comm page, used for reducing # of traps */ -#define KVM_GUEST_COMMPAGE_ADDR 0x0 +/* + * Special address that contains the comm page, used for reducing # of traps + * This needs to be within 32Kb of 0x0 (so the zero register can be used), but + * preferably not at 0x0 so that most kernel NULL pointer dereferences can be + * caught. + */ +#define KVM_GUEST_COMMPAGE_ADDR ((PAGE_SIZE > 0x8000) ? 0 : \ + (0x8000 - PAGE_SIZE)) #define KVM_GUEST_KERNEL_MODE(vcpu) ((kvm_read_c0_guest_status(vcpu->arch.cop0) & (ST0_EXL | ST0_ERL)) || \ ((kvm_read_c0_guest_status(vcpu->arch.cop0) & KSU_USER) == 0)) @@ -93,9 +108,6 @@ #define KVM_INVALID_ADDR 0xdeadbeef extern atomic_t kvm_mips_instance; -extern kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn); -extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn); -extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn); struct kvm_vm_stat { u32 remote_tlb_flush; @@ -126,28 +138,6 @@ struct kvm_vcpu_stat { u32 halt_wakeup; }; -enum kvm_mips_exit_types { - WAIT_EXITS, - CACHE_EXITS, - SIGNAL_EXITS, - INT_EXITS, - COP_UNUSABLE_EXITS, - TLBMOD_EXITS, - TLBMISS_LD_EXITS, - TLBMISS_ST_EXITS, - ADDRERR_ST_EXITS, - ADDRERR_LD_EXITS, - SYSCALL_EXITS, - RESVD_INST_EXITS, - BREAK_INST_EXITS, - TRAP_INST_EXITS, - MSA_FPE_EXITS, - FPE_EXITS, - MSA_DISABLED_EXITS, - FLUSH_DCACHE_EXITS, - MAX_KVM_MIPS_EXIT_TYPES -}; - struct kvm_arch_memory_slot { }; @@ -215,73 +205,6 @@ struct mips_coproc { #define MIPS_CP0_CONFIG4_SEL 4 #define MIPS_CP0_CONFIG5_SEL 5 -/* Config0 register bits */ -#define CP0C0_M 31 -#define CP0C0_K23 28 -#define CP0C0_KU 25 -#define CP0C0_MDU 20 -#define CP0C0_MM 17 -#define CP0C0_BM 16 -#define CP0C0_BE 15 -#define CP0C0_AT 13 -#define CP0C0_AR 10 -#define CP0C0_MT 7 -#define CP0C0_VI 3 -#define CP0C0_K0 0 - -/* Config1 register bits */ -#define CP0C1_M 31 -#define CP0C1_MMU 25 -#define CP0C1_IS 22 -#define CP0C1_IL 19 -#define CP0C1_IA 16 -#define CP0C1_DS 13 -#define CP0C1_DL 10 -#define CP0C1_DA 7 -#define CP0C1_C2 6 -#define CP0C1_MD 5 -#define CP0C1_PC 4 -#define CP0C1_WR 3 -#define CP0C1_CA 2 -#define CP0C1_EP 1 -#define CP0C1_FP 0 - -/* Config2 Register bits */ -#define CP0C2_M 31 -#define CP0C2_TU 28 -#define CP0C2_TS 24 -#define CP0C2_TL 20 -#define CP0C2_TA 16 -#define CP0C2_SU 12 -#define CP0C2_SS 8 -#define CP0C2_SL 4 -#define CP0C2_SA 0 - -/* Config3 Register bits */ -#define CP0C3_M 31 -#define CP0C3_ISA_ON_EXC 16 -#define CP0C3_ULRI 13 -#define CP0C3_DSPP 10 -#define CP0C3_LPA 7 -#define CP0C3_VEIC 6 -#define CP0C3_VInt 5 -#define CP0C3_SP 4 -#define CP0C3_MT 2 -#define CP0C3_SM 1 -#define CP0C3_TL 0 - -/* MMU types, the first four entries have the same layout as the - CP0C0_MT field. */ -enum mips_mmu_types { - MMU_TYPE_NONE, - MMU_TYPE_R4000, - MMU_TYPE_RESERVED, - MMU_TYPE_FMT, - MMU_TYPE_R3000, - MMU_TYPE_R6000, - MMU_TYPE_R8000 -}; - /* Resume Flags */ #define RESUME_FLAG_DR (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST (1<<1) /* Resume host? */ @@ -298,11 +221,6 @@ enum emulation_result { EMULATE_PRIV_FAIL, }; -#define MIPS3_PG_G 0x00000001 /* Global; ignore ASID if in lo0 & lo1 */ -#define MIPS3_PG_V 0x00000002 /* Valid */ -#define MIPS3_PG_NV 0x00000000 -#define MIPS3_PG_D 0x00000004 /* Dirty */ - #define mips3_paddr_to_tlbpfn(x) \ (((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME) #define mips3_tlbpfn_to_paddr(x) \ @@ -313,13 +231,11 @@ enum emulation_result { #define VPN2_MASK 0xffffe000 #define KVM_ENTRYHI_ASID MIPS_ENTRYHI_ASID -#define TLB_IS_GLOBAL(x) (((x).tlb_lo0 & MIPS3_PG_G) && \ - ((x).tlb_lo1 & MIPS3_PG_G)) +#define TLB_IS_GLOBAL(x) ((x).tlb_lo[0] & (x).tlb_lo[1] & ENTRYLO_G) #define TLB_VPN2(x) ((x).tlb_hi & VPN2_MASK) #define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID) -#define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) \ - ? ((x).tlb_lo1 & MIPS3_PG_V) \ - : ((x).tlb_lo0 & MIPS3_PG_V)) +#define TLB_LO_IDX(x, va) (((va) >> PAGE_SHIFT) & 1) +#define TLB_IS_VALID(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V) #define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \ ((y) & VPN2_MASK & ~(x).tlb_mask)) #define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \ @@ -328,26 +244,23 @@ enum emulation_result { struct kvm_mips_tlb { long tlb_mask; long tlb_hi; - long tlb_lo0; - long tlb_lo1; + long tlb_lo[2]; }; -#define KVM_MIPS_FPU_FPU 0x1 -#define KVM_MIPS_FPU_MSA 0x2 +#define KVM_MIPS_AUX_FPU 0x1 +#define KVM_MIPS_AUX_MSA 0x2 #define KVM_MIPS_GUEST_TLB_SIZE 64 struct kvm_vcpu_arch { - void *host_ebase, *guest_ebase; + void *guest_ebase; int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); unsigned long host_stack; unsigned long host_gp; /* Host CP0 registers used when handling exits from guest */ unsigned long host_cp0_badvaddr; - unsigned long host_cp0_cause; unsigned long host_cp0_epc; - unsigned long host_cp0_entryhi; - uint32_t guest_inst; + u32 host_cp0_cause; /* GPRS */ unsigned long gprs[32]; @@ -357,8 +270,8 @@ struct kvm_vcpu_arch { /* FPU State */ struct mips_fpu_struct fpu; - /* Which FPU state is loaded (KVM_MIPS_FPU_*) */ - unsigned int fpu_inuse; + /* Which auxiliary state is loaded (KVM_MIPS_AUX_*) */ + unsigned int aux_inuse; /* COP0 State */ struct mips_coproc *cop0; @@ -370,11 +283,11 @@ struct kvm_vcpu_arch { struct hrtimer comparecount_timer; /* Count timer control KVM register */ - uint32_t count_ctl; + u32 count_ctl; /* Count bias from the raw time */ - uint32_t count_bias; + u32 count_bias; /* Frequency of timer in Hz */ - uint32_t count_hz; + u32 count_hz; /* Dynamic nanosecond bias (multiple of count_period) to avoid overflow */ s64 count_dyn_bias; /* Resume time */ @@ -388,7 +301,7 @@ struct kvm_vcpu_arch { /* Bitmask of pending exceptions to be cleared */ unsigned long pending_exceptions_clr; - unsigned long pending_load_cause; + u32 pending_load_cause; /* Save/Restore the entryhi register when are are preempted/scheduled back in */ unsigned long preempt_entryhi; @@ -397,8 +310,8 @@ struct kvm_vcpu_arch { struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE]; /* Cached guest kernel/user ASIDs */ - uint32_t guest_user_asid[NR_CPUS]; - uint32_t guest_kernel_asid[NR_CPUS]; + u32 guest_user_asid[NR_CPUS]; + u32 guest_kernel_asid[NR_CPUS]; struct mm_struct guest_kernel_mm, guest_user_mm; int last_sched_cpu; @@ -408,6 +321,7 @@ struct kvm_vcpu_arch { u8 fpu_enabled; u8 msa_enabled; + u8 kscratch_enabled; }; @@ -461,6 +375,18 @@ struct kvm_vcpu_arch { #define kvm_write_c0_guest_config7(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][7] = (val)) #define kvm_read_c0_guest_errorepc(cop0) (cop0->reg[MIPS_CP0_ERROR_PC][0]) #define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val)) +#define kvm_read_c0_guest_kscratch1(cop0) (cop0->reg[MIPS_CP0_DESAVE][2]) +#define kvm_read_c0_guest_kscratch2(cop0) (cop0->reg[MIPS_CP0_DESAVE][3]) +#define kvm_read_c0_guest_kscratch3(cop0) (cop0->reg[MIPS_CP0_DESAVE][4]) +#define kvm_read_c0_guest_kscratch4(cop0) (cop0->reg[MIPS_CP0_DESAVE][5]) +#define kvm_read_c0_guest_kscratch5(cop0) (cop0->reg[MIPS_CP0_DESAVE][6]) +#define kvm_read_c0_guest_kscratch6(cop0) (cop0->reg[MIPS_CP0_DESAVE][7]) +#define kvm_write_c0_guest_kscratch1(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][2] = (val)) +#define kvm_write_c0_guest_kscratch2(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][3] = (val)) +#define kvm_write_c0_guest_kscratch3(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][4] = (val)) +#define kvm_write_c0_guest_kscratch4(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][5] = (val)) +#define kvm_write_c0_guest_kscratch5(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][6] = (val)) +#define kvm_write_c0_guest_kscratch6(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][7] = (val)) /* * Some of the guest registers may be modified asynchronously (e.g. from a @@ -474,7 +400,7 @@ static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg, unsigned long temp; do { __asm__ __volatile__( - " .set mips3 \n" + " .set "MIPS_ISA_ARCH_LEVEL" \n" " " __LL "%0, %1 \n" " or %0, %2 \n" " " __SC "%0, %1 \n" @@ -490,7 +416,7 @@ static inline void _kvm_atomic_clear_c0_guest_reg(unsigned long *reg, unsigned long temp; do { __asm__ __volatile__( - " .set mips3 \n" + " .set "MIPS_ISA_ARCH_LEVEL" \n" " " __LL "%0, %1 \n" " and %0, %2 \n" " " __SC "%0, %1 \n" @@ -507,7 +433,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg, unsigned long temp; do { __asm__ __volatile__( - " .set mips3 \n" + " .set "MIPS_ISA_ARCH_LEVEL" \n" " " __LL "%0, %1 \n" " and %0, %2 \n" " or %0, %3 \n" @@ -542,7 +468,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg, static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu) { - return (!__builtin_constant_p(cpu_has_fpu) || cpu_has_fpu) && + return (!__builtin_constant_p(raw_cpu_has_fpu) || raw_cpu_has_fpu) && vcpu->fpu_enabled; } @@ -589,9 +515,11 @@ struct kvm_mips_callbacks { void (*dequeue_io_int)(struct kvm_vcpu *vcpu, struct kvm_mips_interrupt *irq); int (*irq_deliver)(struct kvm_vcpu *vcpu, unsigned int priority, - uint32_t cause); + u32 cause); int (*irq_clear)(struct kvm_vcpu *vcpu, unsigned int priority, - uint32_t cause); + u32 cause); + unsigned long (*num_regs)(struct kvm_vcpu *vcpu); + int (*copy_reg_indices)(struct kvm_vcpu *vcpu, u64 __user *indices); int (*get_one_reg)(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, s64 *v); int (*set_one_reg)(struct kvm_vcpu *vcpu, @@ -605,8 +533,13 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); -/* Trampoline ASM routine to start running in "Guest" context */ -extern int __kvm_mips_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu); +extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu); + +/* Building of entry/exception code */ +int kvm_mips_entry_setup(void); +void *kvm_mips_build_vcpu_run(void *addr); +void *kvm_mips_build_exception(void *addr, void *handler); +void *kvm_mips_build_exit(void *addr); /* FPU/MSA context management */ void __kvm_save_fpu(struct kvm_vcpu_arch *vcpu); @@ -622,11 +555,11 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu); void kvm_lose_fpu(struct kvm_vcpu *vcpu); /* TLB handling */ -uint32_t kvm_get_kernel_asid(struct kvm_vcpu *vcpu); +u32 kvm_get_kernel_asid(struct kvm_vcpu *vcpu); -uint32_t kvm_get_user_asid(struct kvm_vcpu *vcpu); +u32 kvm_get_user_asid(struct kvm_vcpu *vcpu); -uint32_t kvm_get_commpage_asid (struct kvm_vcpu *vcpu); +u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu); extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr, struct kvm_vcpu *vcpu); @@ -635,22 +568,24 @@ extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu); extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, - struct kvm_mips_tlb *tlb, - unsigned long *hpa0, - unsigned long *hpa1); + struct kvm_mips_tlb *tlb); -extern enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvm_mips_dump_host_tlbs(void); extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu); +extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, + unsigned long entrylo0, + unsigned long entrylo1, + int flush_dcache_mask); extern void kvm_mips_flush_host_tlb(int skip_kseg0); extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi); @@ -667,90 +602,90 @@ extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu); extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu); /* Emulation */ -uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu); -enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause); +u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu); +enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); -extern enum emulation_result kvm_mips_emulate_inst(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_inst(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_syscall(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_syscall(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_handle_ri(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_handle_ri(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause, - uint32_t *opc, +extern enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run); -uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu); -void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count); -void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack); +u32 kvm_mips_read_count(struct kvm_vcpu *vcpu); +void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count); +void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack); void kvm_mips_init_count(struct kvm_vcpu *vcpu); int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl); int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume); @@ -759,27 +694,27 @@ void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu); void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu); enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_check_privilege(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_check_privilege(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_emulate_cache(uint32_t inst, - uint32_t *opc, - uint32_t cause, +enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, + u32 *opc, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, - uint32_t *opc, - uint32_t cause, +enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, + u32 *opc, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_emulate_store(uint32_t inst, - uint32_t cause, +enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_emulate_load(uint32_t inst, - uint32_t cause, +enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -789,13 +724,13 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu); unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu); /* Dynamic binary translation */ -extern int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc, - struct kvm_vcpu *vcpu); -extern int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc, +extern int kvm_mips_trans_cache_index(union mips_instruction inst, + u32 *opc, struct kvm_vcpu *vcpu); +extern int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc, struct kvm_vcpu *vcpu); -extern int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc, +extern int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc, struct kvm_vcpu *vcpu); -extern int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc, +extern int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc, struct kvm_vcpu *vcpu); /* Misc */ diff --git a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h index d68e685cde60..bd8b9bbe1771 100644 --- a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h +++ b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h @@ -55,7 +55,7 @@ #define cpu_has_mipsmt 0 #define cpu_has_vint 0 #define cpu_has_veic 0 -#define cpu_hwrena_impl_bits 0xc0000000 +#define cpu_hwrena_impl_bits (MIPS_HWRENA_IMPL1 | MIPS_HWRENA_IMPL2) #define cpu_has_wsbh 1 #define cpu_has_rixi (cpu_data[0].cputype != CPU_CAVIUM_OCTEON) diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index e1ca65c62f6a..def9d8d13f6e 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -53,7 +53,7 @@ #define CP0_SEGCTL2 $5, 4 #define CP0_WIRED $6 #define CP0_INFO $7 -#define CP0_HWRENA $7, 0 +#define CP0_HWRENA $7 #define CP0_BADVADDR $8 #define CP0_BADINSTR $8, 1 #define CP0_COUNT $9 @@ -533,6 +533,7 @@ #define TX49_CONF_CWFON (_ULCAST_(1) << 27) /* Bits specific to the MIPS32/64 PRA. */ +#define MIPS_CONF_VI (_ULCAST_(1) << 3) #define MIPS_CONF_MT (_ULCAST_(7) << 7) #define MIPS_CONF_MT_TLB (_ULCAST_(1) << 7) #define MIPS_CONF_MT_FTLB (_ULCAST_(4) << 7) @@ -853,6 +854,24 @@ #define MIPS_CDMMBASE_ADDR_SHIFT 11 #define MIPS_CDMMBASE_ADDR_START 15 +/* RDHWR register numbers */ +#define MIPS_HWR_CPUNUM 0 /* CPU number */ +#define MIPS_HWR_SYNCISTEP 1 /* SYNCI step size */ +#define MIPS_HWR_CC 2 /* Cycle counter */ +#define MIPS_HWR_CCRES 3 /* Cycle counter resolution */ +#define MIPS_HWR_ULR 29 /* UserLocal */ +#define MIPS_HWR_IMPL1 30 /* Implementation dependent */ +#define MIPS_HWR_IMPL2 31 /* Implementation dependent */ + +/* Bits in HWREna register */ +#define MIPS_HWRENA_CPUNUM (_ULCAST_(1) << MIPS_HWR_CPUNUM) +#define MIPS_HWRENA_SYNCISTEP (_ULCAST_(1) << MIPS_HWR_SYNCISTEP) +#define MIPS_HWRENA_CC (_ULCAST_(1) << MIPS_HWR_CC) +#define MIPS_HWRENA_CCRES (_ULCAST_(1) << MIPS_HWR_CCRES) +#define MIPS_HWRENA_ULR (_ULCAST_(1) << MIPS_HWR_ULR) +#define MIPS_HWRENA_IMPL1 (_ULCAST_(1) << MIPS_HWR_IMPL1) +#define MIPS_HWRENA_IMPL2 (_ULCAST_(1) << MIPS_HWR_IMPL2) + /* * Bitfields in the TX39 family CP0 Configuration Register 3 */ diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h index 86b239d9d75d..9b63cd41213d 100644 --- a/arch/mips/include/asm/pci.h +++ b/arch/mips/include/asm/pci.h @@ -80,16 +80,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, #define HAVE_ARCH_PCI_RESOURCE_TO_USER -static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, resource_size_t *start, - resource_size_t *end) -{ - phys_addr_t size = resource_size(rsrc); - - *start = fixup_bigphys_addr(rsrc->start, size); - *end = rsrc->start + size; -} - /* * Dynamic DMA mapping stuff. * MIPS has everything mapped statically. diff --git a/arch/mips/include/asm/setup.h b/arch/mips/include/asm/setup.h index d7bfdeba9e84..4f5279a8308d 100644 --- a/arch/mips/include/asm/setup.h +++ b/arch/mips/include/asm/setup.h @@ -21,6 +21,7 @@ extern void *set_vi_handler(int n, vi_handler_t addr); extern void *set_except_vector(int n, void *addr); extern unsigned long ebase; +extern unsigned int hwrena; extern void per_cpu_trap_init(bool); extern void cpu_cache_init(void); diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h index b6ecfeee4dbe..f7929f65f7ca 100644 --- a/arch/mips/include/asm/uasm.h +++ b/arch/mips/include/asm/uasm.h @@ -104,8 +104,13 @@ Ip_u1s2(_bltz); Ip_u1s2(_bltzl); Ip_u1u2s3(_bne); Ip_u2s3u1(_cache); +Ip_u1u2(_cfc1); +Ip_u2u1(_cfcmsa); +Ip_u1u2(_ctc1); +Ip_u2u1(_ctcmsa); Ip_u2u1s3(_daddiu); Ip_u3u1u2(_daddu); +Ip_u1(_di); Ip_u2u1msbu3(_dins); Ip_u2u1msbu3(_dinsm); Ip_u1u2(_divu); @@ -141,6 +146,8 @@ Ip_u1(_mfhi); Ip_u1(_mflo); Ip_u1u2u3(_mtc0); Ip_u1u2u3(_mthc0); +Ip_u1(_mthi); +Ip_u1(_mtlo); Ip_u3u1u2(_mul); Ip_u3u1u2(_or); Ip_u2u1u3(_ori); diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index 8051f9aa1379..77429d1622b3 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -21,20 +21,20 @@ enum major_op { spec_op, bcond_op, j_op, jal_op, beq_op, bne_op, blez_op, bgtz_op, - addi_op, cbcond0_op = addi_op, addiu_op, slti_op, sltiu_op, + addi_op, pop10_op = addi_op, addiu_op, slti_op, sltiu_op, andi_op, ori_op, xori_op, lui_op, cop0_op, cop1_op, cop2_op, cop1x_op, beql_op, bnel_op, blezl_op, bgtzl_op, - daddi_op, cbcond1_op = daddi_op, daddiu_op, ldl_op, ldr_op, + daddi_op, pop30_op = daddi_op, daddiu_op, ldl_op, ldr_op, spec2_op, jalx_op, mdmx_op, msa_op = mdmx_op, spec3_op, lb_op, lh_op, lwl_op, lw_op, lbu_op, lhu_op, lwr_op, lwu_op, sb_op, sh_op, swl_op, sw_op, sdl_op, sdr_op, swr_op, cache_op, ll_op, lwc1_op, lwc2_op, bc6_op = lwc2_op, pref_op, - lld_op, ldc1_op, ldc2_op, beqzcjic_op = ldc2_op, ld_op, + lld_op, ldc1_op, ldc2_op, pop66_op = ldc2_op, ld_op, sc_op, swc1_op, swc2_op, balc6_op = swc2_op, major_3b_op, - scd_op, sdc1_op, sdc2_op, bnezcjialc_op = sdc2_op, sd_op + scd_op, sdc1_op, sdc2_op, pop76_op = sdc2_op, sd_op }; /* @@ -92,6 +92,50 @@ enum spec3_op { rdhwr_op = 0x3b }; +/* + * Bits 10-6 minor opcode for r6 spec mult/div encodings + */ +enum mult_op { + mult_mult_op = 0x0, + mult_mul_op = 0x2, + mult_muh_op = 0x3, +}; +enum multu_op { + multu_multu_op = 0x0, + multu_mulu_op = 0x2, + multu_muhu_op = 0x3, +}; +enum div_op { + div_div_op = 0x0, + div_div6_op = 0x2, + div_mod_op = 0x3, +}; +enum divu_op { + divu_divu_op = 0x0, + divu_divu6_op = 0x2, + divu_modu_op = 0x3, +}; +enum dmult_op { + dmult_dmult_op = 0x0, + dmult_dmul_op = 0x2, + dmult_dmuh_op = 0x3, +}; +enum dmultu_op { + dmultu_dmultu_op = 0x0, + dmultu_dmulu_op = 0x2, + dmultu_dmuhu_op = 0x3, +}; +enum ddiv_op { + ddiv_ddiv_op = 0x0, + ddiv_ddiv6_op = 0x2, + ddiv_dmod_op = 0x3, +}; +enum ddivu_op { + ddivu_ddivu_op = 0x0, + ddivu_ddivu6_op = 0x2, + ddivu_dmodu_op = 0x3, +}; + /* * rt field of bcond opcodes. */ @@ -103,7 +147,7 @@ enum rt_op { bltzal_op, bgezal_op, bltzall_op, bgezall_op, rt_op_0x14, rt_op_0x15, rt_op_0x16, rt_op_0x17, rt_op_0x18, rt_op_0x19, rt_op_0x1a, rt_op_0x1b, - bposge32_op, rt_op_0x1d, rt_op_0x1e, rt_op_0x1f + bposge32_op, rt_op_0x1d, rt_op_0x1e, synci_op }; /* @@ -237,6 +281,21 @@ enum bshfl_func { seh_op = 0x18, }; +/* + * MSA minor opcodes. + */ +enum msa_func { + msa_elm_op = 0x19, +}; + +/* + * MSA ELM opcodes. + */ +enum msa_elm { + msa_ctc_op = 0x3e, + msa_cfc_op = 0x7e, +}; + /* * func field for MSA MI10 format. */ @@ -264,7 +323,7 @@ enum mm_major_op { mm_pool32b_op, mm_pool16b_op, mm_lhu16_op, mm_andi16_op, mm_addiu32_op, mm_lhu32_op, mm_sh32_op, mm_lh32_op, mm_pool32i_op, mm_pool16c_op, mm_lwsp16_op, mm_pool16d_op, - mm_ori32_op, mm_pool32f_op, mm_reserved1_op, mm_reserved2_op, + mm_ori32_op, mm_pool32f_op, mm_pool32s_op, mm_reserved2_op, mm_pool32c_op, mm_lwgp16_op, mm_lw16_op, mm_pool16e_op, mm_xori32_op, mm_jals32_op, mm_addiupc_op, mm_reserved3_op, mm_reserved4_op, mm_pool16f_op, mm_sb16_op, mm_beqz16_op, @@ -360,7 +419,10 @@ enum mm_32axf_minor_op { mm_mflo32_op = 0x075, mm_jalrhb_op = 0x07c, mm_tlbwi_op = 0x08d, + mm_mthi32_op = 0x0b5, mm_tlbwr_op = 0x0cd, + mm_mtlo32_op = 0x0f5, + mm_di_op = 0x11d, mm_jalrs_op = 0x13c, mm_jalrshb_op = 0x17c, mm_sync_op = 0x1ad, @@ -478,6 +540,13 @@ enum mm_32f_73_minor_op { mm_fcvts1_op = 0xed, }; +/* + * (microMIPS) POOL32S minor opcodes. + */ +enum mm_32s_minor_op { + mm_32s_elm_op = 0x16, +}; + /* * (microMIPS) POOL16C minor opcodes. */ @@ -586,6 +655,36 @@ struct r_format { /* Register format */ ;)))))) }; +struct c0r_format { /* C0 register format */ + __BITFIELD_FIELD(unsigned int opcode : 6, + __BITFIELD_FIELD(unsigned int rs : 5, + __BITFIELD_FIELD(unsigned int rt : 5, + __BITFIELD_FIELD(unsigned int rd : 5, + __BITFIELD_FIELD(unsigned int z: 8, + __BITFIELD_FIELD(unsigned int sel : 3, + ;)))))) +}; + +struct mfmc0_format { /* MFMC0 register format */ + __BITFIELD_FIELD(unsigned int opcode : 6, + __BITFIELD_FIELD(unsigned int rs : 5, + __BITFIELD_FIELD(unsigned int rt : 5, + __BITFIELD_FIELD(unsigned int rd : 5, + __BITFIELD_FIELD(unsigned int re : 5, + __BITFIELD_FIELD(unsigned int sc : 1, + __BITFIELD_FIELD(unsigned int : 2, + __BITFIELD_FIELD(unsigned int sel : 3, + ;)))))))) +}; + +struct co_format { /* C0 CO format */ + __BITFIELD_FIELD(unsigned int opcode : 6, + __BITFIELD_FIELD(unsigned int co : 1, + __BITFIELD_FIELD(unsigned int code : 19, + __BITFIELD_FIELD(unsigned int func : 6, + ;)))) +}; + struct p_format { /* Performance counter format (R10000) */ __BITFIELD_FIELD(unsigned int opcode : 6, __BITFIELD_FIELD(unsigned int rs : 5, @@ -937,6 +1036,9 @@ union mips_instruction { struct u_format u_format; struct c_format c_format; struct r_format r_format; + struct c0r_format c0r_format; + struct mfmc0_format mfmc0_format; + struct co_format co_format; struct p_format p_format; struct f_format f_format; struct ma_format ma_format; diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index 1ea973b2abb1..fae2f9447792 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -339,71 +339,9 @@ void output_pm_defines(void) } #endif -void output_cpuinfo_defines(void) -{ - COMMENT(" MIPS cpuinfo offsets. "); - DEFINE(CPUINFO_SIZE, sizeof(struct cpuinfo_mips)); -#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - OFFSET(CPUINFO_ASID_MASK, cpuinfo_mips, asid_mask); -#endif -} - void output_kvm_defines(void) { COMMENT(" KVM/MIPS Specfic offsets. "); - DEFINE(VCPU_ARCH_SIZE, sizeof(struct kvm_vcpu_arch)); - OFFSET(VCPU_RUN, kvm_vcpu, run); - OFFSET(VCPU_HOST_ARCH, kvm_vcpu, arch); - - OFFSET(VCPU_HOST_EBASE, kvm_vcpu_arch, host_ebase); - OFFSET(VCPU_GUEST_EBASE, kvm_vcpu_arch, guest_ebase); - - OFFSET(VCPU_HOST_STACK, kvm_vcpu_arch, host_stack); - OFFSET(VCPU_HOST_GP, kvm_vcpu_arch, host_gp); - - OFFSET(VCPU_HOST_CP0_BADVADDR, kvm_vcpu_arch, host_cp0_badvaddr); - OFFSET(VCPU_HOST_CP0_CAUSE, kvm_vcpu_arch, host_cp0_cause); - OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc); - OFFSET(VCPU_HOST_ENTRYHI, kvm_vcpu_arch, host_cp0_entryhi); - - OFFSET(VCPU_GUEST_INST, kvm_vcpu_arch, guest_inst); - - OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]); - OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]); - OFFSET(VCPU_R2, kvm_vcpu_arch, gprs[2]); - OFFSET(VCPU_R3, kvm_vcpu_arch, gprs[3]); - OFFSET(VCPU_R4, kvm_vcpu_arch, gprs[4]); - OFFSET(VCPU_R5, kvm_vcpu_arch, gprs[5]); - OFFSET(VCPU_R6, kvm_vcpu_arch, gprs[6]); - OFFSET(VCPU_R7, kvm_vcpu_arch, gprs[7]); - OFFSET(VCPU_R8, kvm_vcpu_arch, gprs[8]); - OFFSET(VCPU_R9, kvm_vcpu_arch, gprs[9]); - OFFSET(VCPU_R10, kvm_vcpu_arch, gprs[10]); - OFFSET(VCPU_R11, kvm_vcpu_arch, gprs[11]); - OFFSET(VCPU_R12, kvm_vcpu_arch, gprs[12]); - OFFSET(VCPU_R13, kvm_vcpu_arch, gprs[13]); - OFFSET(VCPU_R14, kvm_vcpu_arch, gprs[14]); - OFFSET(VCPU_R15, kvm_vcpu_arch, gprs[15]); - OFFSET(VCPU_R16, kvm_vcpu_arch, gprs[16]); - OFFSET(VCPU_R17, kvm_vcpu_arch, gprs[17]); - OFFSET(VCPU_R18, kvm_vcpu_arch, gprs[18]); - OFFSET(VCPU_R19, kvm_vcpu_arch, gprs[19]); - OFFSET(VCPU_R20, kvm_vcpu_arch, gprs[20]); - OFFSET(VCPU_R21, kvm_vcpu_arch, gprs[21]); - OFFSET(VCPU_R22, kvm_vcpu_arch, gprs[22]); - OFFSET(VCPU_R23, kvm_vcpu_arch, gprs[23]); - OFFSET(VCPU_R24, kvm_vcpu_arch, gprs[24]); - OFFSET(VCPU_R25, kvm_vcpu_arch, gprs[25]); - OFFSET(VCPU_R26, kvm_vcpu_arch, gprs[26]); - OFFSET(VCPU_R27, kvm_vcpu_arch, gprs[27]); - OFFSET(VCPU_R28, kvm_vcpu_arch, gprs[28]); - OFFSET(VCPU_R29, kvm_vcpu_arch, gprs[29]); - OFFSET(VCPU_R30, kvm_vcpu_arch, gprs[30]); - OFFSET(VCPU_R31, kvm_vcpu_arch, gprs[31]); - OFFSET(VCPU_LO, kvm_vcpu_arch, lo); - OFFSET(VCPU_HI, kvm_vcpu_arch, hi); - OFFSET(VCPU_PC, kvm_vcpu_arch, pc); - BLANK(); OFFSET(VCPU_FPR0, kvm_vcpu_arch, fpu.fpr[0]); OFFSET(VCPU_FPR1, kvm_vcpu_arch, fpu.fpr[1]); @@ -441,14 +379,6 @@ void output_kvm_defines(void) OFFSET(VCPU_FCR31, kvm_vcpu_arch, fpu.fcr31); OFFSET(VCPU_MSA_CSR, kvm_vcpu_arch, fpu.msacsr); BLANK(); - - OFFSET(VCPU_COP0, kvm_vcpu_arch, cop0); - OFFSET(VCPU_GUEST_KERNEL_ASID, kvm_vcpu_arch, guest_kernel_asid); - OFFSET(VCPU_GUEST_USER_ASID, kvm_vcpu_arch, guest_user_asid); - - OFFSET(COP0_TLB_HI, mips_coproc, reg[MIPS_CP0_TLB_HI][0]); - OFFSET(COP0_STATUS, mips_coproc, reg[MIPS_CP0_STATUS][0]); - BLANK(); } #ifdef CONFIG_MIPS_CPS diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c index 6dc3f1fdaccc..46c227fc98f5 100644 --- a/arch/mips/kernel/branch.c +++ b/arch/mips/kernel/branch.c @@ -790,7 +790,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, epc += 4 + (insn.i_format.simmediate << 2); regs->cp0_epc = epc; break; - case beqzcjic_op: + case pop66_op: if (!cpu_has_mips_r6) { ret = -SIGILL; break; @@ -798,7 +798,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, /* Compact branch: BEQZC || JIC */ regs->cp0_epc += 8; break; - case bnezcjialc_op: + case pop76_op: if (!cpu_has_mips_r6) { ret = -SIGILL; break; @@ -809,8 +809,8 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, regs->cp0_epc += 8; break; #endif - case cbcond0_op: - case cbcond1_op: + case pop10_op: + case pop30_op: /* Only valid for MIPS R6 */ if (!cpu_has_mips_r6) { ret = -SIGILL; diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 4a1712b5abdf..6fb4704bd156 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -619,17 +619,17 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt) perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); switch (rd) { - case 0: /* CPU number */ + case MIPS_HWR_CPUNUM: /* CPU number */ regs->regs[rt] = smp_processor_id(); return 0; - case 1: /* SYNCI length */ + case MIPS_HWR_SYNCISTEP: /* SYNCI length */ regs->regs[rt] = min(current_cpu_data.dcache.linesz, current_cpu_data.icache.linesz); return 0; - case 2: /* Read count register */ + case MIPS_HWR_CC: /* Read count register */ regs->regs[rt] = read_c0_count(); return 0; - case 3: /* Count register resolution */ + case MIPS_HWR_CCRES: /* Count register resolution */ switch (current_cpu_type()) { case CPU_20KC: case CPU_25KF: @@ -639,7 +639,7 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt) regs->regs[rt] = 2; } return 0; - case 29: + case MIPS_HWR_ULR: /* Read UserLocal register */ regs->regs[rt] = ti->tp_value; return 0; default: @@ -1859,6 +1859,7 @@ void __noreturn nmi_exception_handler(struct pt_regs *regs) #define VECTORSPACING 0x100 /* for EI/VI mode */ unsigned long ebase; +EXPORT_SYMBOL_GPL(ebase); unsigned long exception_handlers[32]; unsigned long vi_handlers[64]; @@ -2063,16 +2064,22 @@ static void configure_status(void) status_set); } +unsigned int hwrena; +EXPORT_SYMBOL_GPL(hwrena); + /* configure HWRENA register */ static void configure_hwrena(void) { - unsigned int hwrena = cpu_hwrena_impl_bits; + hwrena = cpu_hwrena_impl_bits; if (cpu_has_mips_r2_r6) - hwrena |= 0x0000000f; + hwrena |= MIPS_HWRENA_CPUNUM | + MIPS_HWRENA_SYNCISTEP | + MIPS_HWRENA_CC | + MIPS_HWRENA_CCRES; if (!noulri && cpu_has_userlocal) - hwrena |= (1 << 29); + hwrena |= MIPS_HWRENA_ULR; if (hwrena) write_c0_hwrena(hwrena); diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 2ae12825529f..7c56d6b124d1 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -17,6 +17,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM + select EXPORT_UASM select PREEMPT_NOTIFIERS select ANON_INODES select KVM_MMIO diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index 637ebbebd549..847429de780d 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -7,9 +7,10 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o -kvm-objs := $(common-objs-y) mips.o emulate.o locore.o \ +kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \ interrupt.o stats.o commpage.o \ dyntrans.o trap_emul.o fpu.o +kvm-objs += mmu.o obj-$(CONFIG_KVM) += kvm.o obj-y += callback.o tlb.o diff --git a/arch/mips/kvm/commpage.c b/arch/mips/kvm/commpage.c index 2d6e976d1add..a36b77e1705c 100644 --- a/arch/mips/kvm/commpage.c +++ b/arch/mips/kvm/commpage.c @@ -4,7 +4,7 @@ * for more details. * * commpage, currently used for Virtual COP0 registers. - * Mapped into the guest kernel @ 0x0. + * Mapped into the guest kernel @ KVM_GUEST_COMMPAGE_ADDR. * * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. * Authors: Sanjay Lal diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index f1527a465c1b..d280894915ed 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -20,125 +21,114 @@ #include "commpage.h" -#define SYNCI_TEMPLATE 0x041f0000 -#define SYNCI_BASE(x) (((x) >> 21) & 0x1f) -#define SYNCI_OFFSET ((x) & 0xffff) +/** + * kvm_mips_trans_replace() - Replace trapping instruction in guest memory. + * @vcpu: Virtual CPU. + * @opc: PC of instruction to replace. + * @replace: Instruction to write + */ +static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, + union mips_instruction replace) +{ + unsigned long paddr, flags; + void *vaddr; -#define LW_TEMPLATE 0x8c000000 -#define CLEAR_TEMPLATE 0x00000020 -#define SW_TEMPLATE 0xac000000 + if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) { + paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, + (unsigned long)opc); + vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr))); + vaddr += paddr & ~PAGE_MASK; + memcpy(vaddr, (void *)&replace, sizeof(u32)); + local_flush_icache_range((unsigned long)vaddr, + (unsigned long)vaddr + 32); + kunmap_atomic(vaddr); + } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { + local_irq_save(flags); + memcpy((void *)opc, (void *)&replace, sizeof(u32)); + local_flush_icache_range((unsigned long)opc, + (unsigned long)opc + 32); + local_irq_restore(flags); + } else { + kvm_err("%s: Invalid address: %p\n", __func__, opc); + return -EFAULT; + } -int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc, + return 0; +} + +int kvm_mips_trans_cache_index(union mips_instruction inst, u32 *opc, struct kvm_vcpu *vcpu) { - int result = 0; - unsigned long kseg0_opc; - uint32_t synci_inst = 0x0; + union mips_instruction nop_inst = { 0 }; /* Replace the CACHE instruction, with a NOP */ - kseg0_opc = - CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa - (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t)); - local_flush_icache_range(kseg0_opc, kseg0_opc + 32); - - return result; + return kvm_mips_trans_replace(vcpu, opc, nop_inst); } /* * Address based CACHE instructions are transformed into synci(s). A little * heavy for just D-cache invalidates, but avoids an expensive trap */ -int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc, +int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc, struct kvm_vcpu *vcpu) { - int result = 0; - unsigned long kseg0_opc; - uint32_t synci_inst = SYNCI_TEMPLATE, base, offset; + union mips_instruction synci_inst = { 0 }; - base = (inst >> 21) & 0x1f; - offset = inst & 0xffff; - synci_inst |= (base << 21); - synci_inst |= offset; + synci_inst.i_format.opcode = bcond_op; + synci_inst.i_format.rs = inst.i_format.rs; + synci_inst.i_format.rt = synci_op; + if (cpu_has_mips_r6) + synci_inst.i_format.simmediate = inst.spec3_format.simmediate; + else + synci_inst.i_format.simmediate = inst.i_format.simmediate; - kseg0_opc = - CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa - (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t)); - local_flush_icache_range(kseg0_opc, kseg0_opc + 32); - - return result; + return kvm_mips_trans_replace(vcpu, opc, synci_inst); } -int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu) +int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc, + struct kvm_vcpu *vcpu) { - int32_t rt, rd, sel; - uint32_t mfc0_inst; - unsigned long kseg0_opc, flags; + union mips_instruction mfc0_inst = { 0 }; + u32 rd, sel; - rt = (inst >> 16) & 0x1f; - rd = (inst >> 11) & 0x1f; - sel = inst & 0x7; + rd = inst.c0r_format.rd; + sel = inst.c0r_format.sel; - if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) { - mfc0_inst = CLEAR_TEMPLATE; - mfc0_inst |= ((rt & 0x1f) << 16); + if (rd == MIPS_CP0_ERRCTL && sel == 0) { + mfc0_inst.r_format.opcode = spec_op; + mfc0_inst.r_format.rd = inst.c0r_format.rt; + mfc0_inst.r_format.func = add_op; } else { - mfc0_inst = LW_TEMPLATE; - mfc0_inst |= ((rt & 0x1f) << 16); - mfc0_inst |= offsetof(struct kvm_mips_commpage, - cop0.reg[rd][sel]); + mfc0_inst.i_format.opcode = lw_op; + mfc0_inst.i_format.rt = inst.c0r_format.rt; + mfc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR | + offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]); +#ifdef CONFIG_CPU_BIG_ENDIAN + if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8) + mfc0_inst.i_format.simmediate |= 4; +#endif } - if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) { - kseg0_opc = - CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa - (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(uint32_t)); - local_flush_icache_range(kseg0_opc, kseg0_opc + 32); - } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { - local_irq_save(flags); - memcpy((void *)opc, (void *)&mfc0_inst, sizeof(uint32_t)); - local_flush_icache_range((unsigned long)opc, - (unsigned long)opc + 32); - local_irq_restore(flags); - } else { - kvm_err("%s: Invalid address: %p\n", __func__, opc); - return -EFAULT; - } - - return 0; + return kvm_mips_trans_replace(vcpu, opc, mfc0_inst); } -int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu) +int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc, + struct kvm_vcpu *vcpu) { - int32_t rt, rd, sel; - uint32_t mtc0_inst = SW_TEMPLATE; - unsigned long kseg0_opc, flags; + union mips_instruction mtc0_inst = { 0 }; + u32 rd, sel; - rt = (inst >> 16) & 0x1f; - rd = (inst >> 11) & 0x1f; - sel = inst & 0x7; + rd = inst.c0r_format.rd; + sel = inst.c0r_format.sel; - mtc0_inst |= ((rt & 0x1f) << 16); - mtc0_inst |= offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]); + mtc0_inst.i_format.opcode = sw_op; + mtc0_inst.i_format.rt = inst.c0r_format.rt; + mtc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR | + offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]); +#ifdef CONFIG_CPU_BIG_ENDIAN + if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8) + mtc0_inst.i_format.simmediate |= 4; +#endif - if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) { - kseg0_opc = - CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa - (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(uint32_t)); - local_flush_icache_range(kseg0_opc, kseg0_opc + 32); - } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { - local_irq_save(flags); - memcpy((void *)opc, (void *)&mtc0_inst, sizeof(uint32_t)); - local_flush_icache_range((unsigned long)opc, - (unsigned long)opc + 32); - local_irq_restore(flags); - } else { - kvm_err("%s: Invalid address: %p\n", __func__, opc); - return -EFAULT; - } - - return 0; + return kvm_mips_trans_replace(vcpu, opc, mtc0_inst); } diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 645c8a1982a7..6eb52b9c9818 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -52,7 +52,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, goto unaligned; /* Read the instruction */ - insn.word = kvm_get_inst((uint32_t *) epc, vcpu); + insn.word = kvm_get_inst((u32 *) epc, vcpu); if (insn.word == KVM_INVALID_INST) return KVM_INVALID_INST; @@ -161,9 +161,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, nextpc = epc; break; - case blez_op: /* not really i_format */ - case blezl_op: - /* rt field assumed to be zero */ + case blez_op: /* POP06 */ +#ifndef CONFIG_CPU_MIPSR6 + case blezl_op: /* removed in R6 */ +#endif + if (insn.i_format.rt != 0) + goto compact_branch; if ((long)arch->gprs[insn.i_format.rs] <= 0) epc = epc + 4 + (insn.i_format.simmediate << 2); else @@ -171,9 +174,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, nextpc = epc; break; - case bgtz_op: - case bgtzl_op: - /* rt field assumed to be zero */ + case bgtz_op: /* POP07 */ +#ifndef CONFIG_CPU_MIPSR6 + case bgtzl_op: /* removed in R6 */ +#endif + if (insn.i_format.rt != 0) + goto compact_branch; if ((long)arch->gprs[insn.i_format.rs] > 0) epc = epc + 4 + (insn.i_format.simmediate << 2); else @@ -185,6 +191,40 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, case cop1_op: kvm_err("%s: unsupported cop1_op\n", __func__); break; + +#ifdef CONFIG_CPU_MIPSR6 + /* R6 added the following compact branches with forbidden slots */ + case blezl_op: /* POP26 */ + case bgtzl_op: /* POP27 */ + /* only rt == 0 isn't compact branch */ + if (insn.i_format.rt != 0) + goto compact_branch; + break; + case pop10_op: + case pop30_op: + /* only rs == rt == 0 is reserved, rest are compact branches */ + if (insn.i_format.rs != 0 || insn.i_format.rt != 0) + goto compact_branch; + break; + case pop66_op: + case pop76_op: + /* only rs == 0 isn't compact branch */ + if (insn.i_format.rs != 0) + goto compact_branch; + break; +compact_branch: + /* + * If we've hit an exception on the forbidden slot, then + * the branch must not have been taken. + */ + epc += 8; + nextpc = epc; + break; +#else +compact_branch: + /* Compact branches not supported before R6 */ + break; +#endif } return nextpc; @@ -198,7 +238,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, return nextpc; } -enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause) +enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause) { unsigned long branch_pc; enum emulation_result er = EMULATE_DONE; @@ -243,7 +283,7 @@ static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu) * * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running). */ -static uint32_t kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now) +static u32 kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now) { s64 now_ns, periods; u64 delta; @@ -300,11 +340,11 @@ static inline ktime_t kvm_mips_count_time(struct kvm_vcpu *vcpu) * * Returns: The current value of the guest CP0_Count register. */ -static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now) +static u32 kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now) { struct mips_coproc *cop0 = vcpu->arch.cop0; ktime_t expires, threshold; - uint32_t count, compare; + u32 count, compare; int running; /* Calculate the biased and scaled guest CP0_Count */ @@ -315,7 +355,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now) * Find whether CP0_Count has reached the closest timer interrupt. If * not, we shouldn't inject it. */ - if ((int32_t)(count - compare) < 0) + if ((s32)(count - compare) < 0) return count; /* @@ -360,7 +400,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now) * * Returns: The current guest CP0_Count value. */ -uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu) +u32 kvm_mips_read_count(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -387,8 +427,7 @@ uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu) * * Returns: The ktime at the point of freeze. */ -static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, - uint32_t *count) +static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count) { ktime_t now; @@ -419,16 +458,16 @@ static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running). */ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu, - ktime_t now, uint32_t count) + ktime_t now, u32 count) { struct mips_coproc *cop0 = vcpu->arch.cop0; - uint32_t compare; + u32 compare; u64 delta; ktime_t expire; /* Calculate timeout (wrap 0 to 2^32) */ compare = kvm_read_c0_guest_compare(cop0); - delta = (u64)(uint32_t)(compare - count - 1) + 1; + delta = (u64)(u32)(compare - count - 1) + 1; delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz); expire = ktime_add_ns(now, delta); @@ -444,7 +483,7 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu, * * Sets the CP0_Count value and updates the timer accordingly. */ -void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count) +void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count) { struct mips_coproc *cop0 = vcpu->arch.cop0; ktime_t now; @@ -538,13 +577,13 @@ int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz) * If @ack, atomically acknowledge any pending timer interrupt, otherwise ensure * any pending timer interrupt is preserved. */ -void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack) +void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack) { struct mips_coproc *cop0 = vcpu->arch.cop0; int dc; u32 old_compare = kvm_read_c0_guest_compare(cop0); ktime_t now; - uint32_t count; + u32 count; /* if unchanged, must just be an ack */ if (old_compare == compare) { @@ -585,7 +624,7 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack) static ktime_t kvm_mips_count_disable(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - uint32_t count; + u32 count; ktime_t now; /* Stop hrtimer */ @@ -632,7 +671,7 @@ void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu) void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - uint32_t count; + u32 count; kvm_clear_c0_guest_cause(cop0, CAUSEF_DC); @@ -661,7 +700,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl) s64 changed = count_ctl ^ vcpu->arch.count_ctl; s64 delta; ktime_t expire, now; - uint32_t count, compare; + u32 count, compare; /* Only allow defined bits to be changed */ if (changed & ~(s64)(KVM_REG_MIPS_COUNT_CTL_DC)) @@ -687,7 +726,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl) */ count = kvm_read_c0_guest_count(cop0); compare = kvm_read_c0_guest_compare(cop0); - delta = (u64)(uint32_t)(compare - count - 1) + 1; + delta = (u64)(u32)(compare - count - 1) + 1; delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz); expire = ktime_add_ns(vcpu->arch.count_resume, delta); @@ -776,7 +815,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu) vcpu->arch.pending_exceptions); ++vcpu->stat.wait_exits; - trace_kvm_exit(vcpu, WAIT_EXITS); + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_WAIT); if (!vcpu->arch.pending_exceptions) { vcpu->arch.wait = 1; kvm_vcpu_block(vcpu); @@ -801,9 +840,9 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu) enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - uint32_t pc = vcpu->arch.pc; + unsigned long pc = vcpu->arch.pc; - kvm_err("[%#x] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0)); + kvm_err("[%#lx] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0)); return EMULATE_FAIL; } @@ -813,11 +852,11 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu) struct mips_coproc *cop0 = vcpu->arch.cop0; int index = kvm_read_c0_guest_index(cop0); struct kvm_mips_tlb *tlb = NULL; - uint32_t pc = vcpu->arch.pc; + unsigned long pc = vcpu->arch.pc; if (index < 0 || index >= KVM_MIPS_GUEST_TLB_SIZE) { kvm_debug("%s: illegal index: %d\n", __func__, index); - kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n", + kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n", pc, index, kvm_read_c0_guest_entryhi(cop0), kvm_read_c0_guest_entrylo0(cop0), kvm_read_c0_guest_entrylo1(cop0), @@ -834,10 +873,10 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu) tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0); tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0); - tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0); - tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0); + tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0); + tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0); - kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n", + kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n", pc, index, kvm_read_c0_guest_entryhi(cop0), kvm_read_c0_guest_entrylo0(cop0), kvm_read_c0_guest_entrylo1(cop0), @@ -851,7 +890,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_mips_tlb *tlb = NULL; - uint32_t pc = vcpu->arch.pc; + unsigned long pc = vcpu->arch.pc; int index; get_random_bytes(&index, sizeof(index)); @@ -867,10 +906,10 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu) tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0); tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0); - tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0); - tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0); + tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0); + tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0); - kvm_debug("[%#x] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n", + kvm_debug("[%#lx] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n", pc, index, kvm_read_c0_guest_entryhi(cop0), kvm_read_c0_guest_entrylo0(cop0), kvm_read_c0_guest_entrylo1(cop0)); @@ -882,14 +921,14 @@ enum emulation_result kvm_mips_emul_tlbp(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; long entryhi = kvm_read_c0_guest_entryhi(cop0); - uint32_t pc = vcpu->arch.pc; + unsigned long pc = vcpu->arch.pc; int index = -1; index = kvm_mips_guest_tlb_lookup(vcpu, entryhi); kvm_write_c0_guest_index(cop0, index); - kvm_debug("[%#x] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi, + kvm_debug("[%#lx] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi, index); return EMULATE_DONE; @@ -922,8 +961,8 @@ unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu) */ unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu) { - /* Config4 is optional */ - unsigned int mask = MIPS_CONF_M; + /* Config4 and ULRI are optional */ + unsigned int mask = MIPS_CONF_M | MIPS_CONF3_ULRI; /* Permit MSA to be present if MSA is supported */ if (kvm_mips_guest_can_have_msa(&vcpu->arch)) @@ -942,7 +981,12 @@ unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu) unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu) { /* Config5 is optional */ - return MIPS_CONF_M; + unsigned int mask = MIPS_CONF_M; + + /* KScrExist */ + mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16; + + return mask; } /** @@ -973,14 +1017,14 @@ unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu) return mask; } -enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, - uint32_t cause, struct kvm_run *run, +enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, + u32 *opc, u32 cause, + struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; enum emulation_result er = EMULATE_DONE; - int32_t rt, rd, copz, sel, co_bit, op; - uint32_t pc = vcpu->arch.pc; + u32 rt, rd, sel; unsigned long curr_pc; /* @@ -992,16 +1036,8 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, if (er == EMULATE_FAIL) return er; - copz = (inst >> 21) & 0x1f; - rt = (inst >> 16) & 0x1f; - rd = (inst >> 11) & 0x1f; - sel = inst & 0x7; - co_bit = (inst >> 25) & 1; - - if (co_bit) { - op = (inst) & 0xff; - - switch (op) { + if (inst.co_format.co) { + switch (inst.co_format.func) { case tlbr_op: /* Read indexed TLB entry */ er = kvm_mips_emul_tlbr(vcpu); break; @@ -1020,47 +1056,58 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, case eret_op: er = kvm_mips_emul_eret(vcpu); goto dont_update_pc; - break; case wait_op: er = kvm_mips_emul_wait(vcpu); break; } } else { - switch (copz) { + rt = inst.c0r_format.rt; + rd = inst.c0r_format.rd; + sel = inst.c0r_format.sel; + + switch (inst.c0r_format.rs) { case mfc_op: #ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS cop0->stat[rd][sel]++; #endif /* Get reg */ if ((rd == MIPS_CP0_COUNT) && (sel == 0)) { - vcpu->arch.gprs[rt] = kvm_mips_read_count(vcpu); + vcpu->arch.gprs[rt] = + (s32)kvm_mips_read_count(vcpu); } else if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) { vcpu->arch.gprs[rt] = 0x0; #ifdef CONFIG_KVM_MIPS_DYN_TRANS kvm_mips_trans_mfc0(inst, opc, vcpu); #endif } else { - vcpu->arch.gprs[rt] = cop0->reg[rd][sel]; + vcpu->arch.gprs[rt] = (s32)cop0->reg[rd][sel]; #ifdef CONFIG_KVM_MIPS_DYN_TRANS kvm_mips_trans_mfc0(inst, opc, vcpu); #endif } - kvm_debug - ("[%#x] MFCz[%d][%d], vcpu->arch.gprs[%d]: %#lx\n", - pc, rd, sel, rt, vcpu->arch.gprs[rt]); - + trace_kvm_hwr(vcpu, KVM_TRACE_MFC0, + KVM_TRACE_COP0(rd, sel), + vcpu->arch.gprs[rt]); break; case dmfc_op: vcpu->arch.gprs[rt] = cop0->reg[rd][sel]; + + trace_kvm_hwr(vcpu, KVM_TRACE_DMFC0, + KVM_TRACE_COP0(rd, sel), + vcpu->arch.gprs[rt]); break; case mtc_op: #ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS cop0->stat[rd][sel]++; #endif + trace_kvm_hwr(vcpu, KVM_TRACE_MTC0, + KVM_TRACE_COP0(rd, sel), + vcpu->arch.gprs[rt]); + if ((rd == MIPS_CP0_TLB_INDEX) && (vcpu->arch.gprs[rt] >= KVM_MIPS_GUEST_TLB_SIZE)) { @@ -1078,16 +1125,15 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n", kvm_read_c0_guest_ebase(cop0)); } else if (rd == MIPS_CP0_TLB_HI && sel == 0) { - uint32_t nasid = + u32 nasid = vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID; if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) && ((kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID) != nasid)) { - kvm_debug("MTCz, change ASID from %#lx to %#lx\n", + trace_kvm_asid_change(vcpu, kvm_read_c0_guest_entryhi(cop0) - & KVM_ENTRYHI_ASID, - vcpu->arch.gprs[rt] - & KVM_ENTRYHI_ASID); + & KVM_ENTRYHI_ASID, + nasid); /* Blow away the shadow host TLBs */ kvm_mips_flush_host_tlb(1); @@ -1100,10 +1146,6 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]); goto done; } else if ((rd == MIPS_CP0_COMPARE) && (sel == 0)) { - kvm_debug("[%#x] MTCz, COMPARE %#lx <- %#lx\n", - pc, kvm_read_c0_guest_compare(cop0), - vcpu->arch.gprs[rt]); - /* If we are writing to COMPARE */ /* Clear pending timer interrupt, if any */ kvm_mips_write_compare(vcpu, @@ -1155,7 +1197,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, * it first. */ if (change & ST0_CU1 && !(val & ST0_FR) && - vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) + vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) kvm_lose_fpu(vcpu); /* @@ -1166,7 +1208,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, * the near future. */ if (change & ST0_CU1 && - vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) + vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) change_c0_status(ST0_CU1, val); preempt_enable(); @@ -1201,7 +1243,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, * context is already loaded. */ if (change & MIPS_CONF5_FRE && - vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) + vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) change_c0_config5(MIPS_CONF5_FRE, val); /* @@ -1211,7 +1253,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, * quickly enabled again in the near future. */ if (change & MIPS_CONF5_MSAEN && - vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) + vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) change_c0_config5(MIPS_CONF5_MSAEN, val); @@ -1219,7 +1261,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, kvm_write_c0_guest_config5(cop0, val); } else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) { - uint32_t old_cause, new_cause; + u32 old_cause, new_cause; old_cause = kvm_read_c0_guest_cause(cop0); new_cause = vcpu->arch.gprs[rt]; @@ -1233,20 +1275,30 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, else kvm_mips_count_enable_cause(vcpu); } + } else if ((rd == MIPS_CP0_HWRENA) && (sel == 0)) { + u32 mask = MIPS_HWRENA_CPUNUM | + MIPS_HWRENA_SYNCISTEP | + MIPS_HWRENA_CC | + MIPS_HWRENA_CCRES; + + if (kvm_read_c0_guest_config3(cop0) & + MIPS_CONF3_ULRI) + mask |= MIPS_HWRENA_ULR; + cop0->reg[rd][sel] = vcpu->arch.gprs[rt] & mask; } else { cop0->reg[rd][sel] = vcpu->arch.gprs[rt]; #ifdef CONFIG_KVM_MIPS_DYN_TRANS kvm_mips_trans_mtc0(inst, opc, vcpu); #endif } - - kvm_debug("[%#x] MTCz, cop0->reg[%d][%d]: %#lx\n", pc, - rd, sel, cop0->reg[rd][sel]); break; case dmtc_op: kvm_err("!!!!!!![%#lx]dmtc_op: rt: %d, rd: %d, sel: %d!!!!!!\n", vcpu->arch.pc, rt, rd, sel); + trace_kvm_hwr(vcpu, KVM_TRACE_DMTC0, + KVM_TRACE_COP0(rd, sel), + vcpu->arch.gprs[rt]); er = EMULATE_FAIL; break; @@ -1258,7 +1310,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, vcpu->arch.gprs[rt] = kvm_read_c0_guest_status(cop0); /* EI */ - if (inst & 0x20) { + if (inst.mfmc0_format.sc) { kvm_debug("[%#lx] mfmc0_op: EI\n", vcpu->arch.pc); kvm_set_c0_guest_status(cop0, ST0_IE); @@ -1272,9 +1324,8 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, case wrpgpr_op: { - uint32_t css = - cop0->reg[MIPS_CP0_STATUS][2] & 0xf; - uint32_t pss = + u32 css = cop0->reg[MIPS_CP0_STATUS][2] & 0xf; + u32 pss = (cop0->reg[MIPS_CP0_STATUS][2] >> 6) & 0xf; /* * We don't support any shadow register sets, so @@ -1291,7 +1342,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, break; default: kvm_err("[%#lx]MachEmulateCP0: unsupported COP0, copz: 0x%x\n", - vcpu->arch.pc, copz); + vcpu->arch.pc, inst.c0r_format.rs); er = EMULATE_FAIL; break; } @@ -1312,13 +1363,14 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, return er; } -enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause, +enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DO_MMIO; - int32_t op, base, rt, offset; - uint32_t bytes; + u32 rt; + u32 bytes; void *data = run->mmio.data; unsigned long curr_pc; @@ -1331,12 +1383,9 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause, if (er == EMULATE_FAIL) return er; - rt = (inst >> 16) & 0x1f; - base = (inst >> 21) & 0x1f; - offset = inst & 0xffff; - op = (inst >> 26) & 0x3f; + rt = inst.i_format.rt; - switch (op) { + switch (inst.i_format.opcode) { case sb_op: bytes = 1; if (bytes > sizeof(run->mmio.data)) { @@ -1357,7 +1406,7 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause, *(u8 *) data = vcpu->arch.gprs[rt]; kvm_debug("OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n", vcpu->arch.host_cp0_badvaddr, vcpu->arch.gprs[rt], - *(uint8_t *) data); + *(u8 *) data); break; @@ -1379,11 +1428,11 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause, run->mmio.is_write = 1; vcpu->mmio_needed = 1; vcpu->mmio_is_write = 1; - *(uint32_t *) data = vcpu->arch.gprs[rt]; + *(u32 *) data = vcpu->arch.gprs[rt]; kvm_debug("[%#lx] OP_SW: eaddr: %#lx, gpr: %#lx, data: %#x\n", vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr, - vcpu->arch.gprs[rt], *(uint32_t *) data); + vcpu->arch.gprs[rt], *(u32 *) data); break; case sh_op: @@ -1404,15 +1453,16 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause, run->mmio.is_write = 1; vcpu->mmio_needed = 1; vcpu->mmio_is_write = 1; - *(uint16_t *) data = vcpu->arch.gprs[rt]; + *(u16 *) data = vcpu->arch.gprs[rt]; kvm_debug("[%#lx] OP_SH: eaddr: %#lx, gpr: %#lx, data: %#x\n", vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr, - vcpu->arch.gprs[rt], *(uint32_t *) data); + vcpu->arch.gprs[rt], *(u32 *) data); break; default: - kvm_err("Store not yet supported"); + kvm_err("Store not yet supported (inst=0x%08x)\n", + inst.word); er = EMULATE_FAIL; break; } @@ -1424,18 +1474,16 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause, return er; } -enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause, - struct kvm_run *run, +enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DO_MMIO; - int32_t op, base, rt, offset; - uint32_t bytes; + u32 op, rt; + u32 bytes; - rt = (inst >> 16) & 0x1f; - base = (inst >> 21) & 0x1f; - offset = inst & 0xffff; - op = (inst >> 26) & 0x3f; + rt = inst.i_format.rt; + op = inst.i_format.opcode; vcpu->arch.pending_load_cause = cause; vcpu->arch.io_gpr = rt; @@ -1521,7 +1569,8 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause, break; default: - kvm_err("Load not yet supported"); + kvm_err("Load not yet supported (inst=0x%08x)\n", + inst.word); er = EMULATE_FAIL; break; } @@ -1529,40 +1578,15 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause, return er; } -int kvm_mips_sync_icache(unsigned long va, struct kvm_vcpu *vcpu) -{ - unsigned long offset = (va & ~PAGE_MASK); - struct kvm *kvm = vcpu->kvm; - unsigned long pa; - gfn_t gfn; - kvm_pfn_t pfn; - - gfn = va >> PAGE_SHIFT; - - if (gfn >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx\n", __func__, gfn); - kvm_mips_dump_host_tlbs(); - kvm_arch_vcpu_dump_regs(vcpu); - return -1; - } - pfn = kvm->arch.guest_pmap[gfn]; - pa = (pfn << PAGE_SHIFT) | offset; - - kvm_debug("%s: va: %#lx, unmapped: %#x\n", __func__, va, - CKSEG0ADDR(pa)); - - local_flush_icache_range(CKSEG0ADDR(pa), 32); - return 0; -} - -enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, - uint32_t cause, +enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, + u32 *opc, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; enum emulation_result er = EMULATE_DONE; - int32_t offset, cache, op_inst, op, base; + u32 cache, op_inst, op, base; + s16 offset; struct kvm_vcpu_arch *arch = &vcpu->arch; unsigned long va; unsigned long curr_pc; @@ -1576,9 +1600,12 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, if (er == EMULATE_FAIL) return er; - base = (inst >> 21) & 0x1f; - op_inst = (inst >> 16) & 0x1f; - offset = (int16_t)inst; + base = inst.i_format.rs; + op_inst = inst.i_format.rt; + if (cpu_has_mips_r6) + offset = inst.spec3_format.simmediate; + else + offset = inst.i_format.simmediate; cache = op_inst & CacheOp_Cache; op = op_inst & CacheOp_Op; @@ -1634,7 +1661,6 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, (cop0) & KVM_ENTRYHI_ASID)); if (index < 0) { - vcpu->arch.host_cp0_entryhi = (va & VPN2_MASK); vcpu->arch.host_cp0_badvaddr = va; vcpu->arch.pc = curr_pc; er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, @@ -1659,9 +1685,7 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, * We fault an entry from the guest tlb to the * shadow host TLB */ - kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, - NULL, - NULL); + kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb); } } } else { @@ -1714,20 +1738,20 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, return er; } -enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc, +enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { + union mips_instruction inst; enum emulation_result er = EMULATE_DONE; - uint32_t inst; /* Fetch the instruction. */ if (cause & CAUSEF_BD) opc += 1; - inst = kvm_get_inst(opc, vcpu); + inst.word = kvm_get_inst(opc, vcpu); - switch (((union mips_instruction)inst).r_format.opcode) { + switch (inst.r_format.opcode) { case cop0_op: er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu); break; @@ -1744,15 +1768,31 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc, er = kvm_mips_emulate_load(inst, cause, run, vcpu); break; +#ifndef CONFIG_CPU_MIPSR6 case cache_op: ++vcpu->stat.cache_exits; - trace_kvm_exit(vcpu, CACHE_EXITS); + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu); break; +#else + case spec3_op: + switch (inst.spec3_format.func) { + case cache6_op: + ++vcpu->stat.cache_exits; + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); + er = kvm_mips_emulate_cache(inst, opc, cause, run, + vcpu); + break; + default: + goto unknown; + }; + break; +unknown: +#endif default: kvm_err("Instruction emulation not supported (%p/%#x)\n", opc, - inst); + inst.word); kvm_arch_vcpu_dump_regs(vcpu); er = EMULATE_FAIL; break; @@ -1761,8 +1801,8 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc, return er; } -enum emulation_result kvm_mips_emulate_syscall(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_syscall(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1796,8 +1836,8 @@ enum emulation_result kvm_mips_emulate_syscall(unsigned long cause, return er; } -enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1842,8 +1882,8 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause, return EMULATE_DONE; } -enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1888,8 +1928,8 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause, return EMULATE_DONE; } -enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1932,8 +1972,8 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause, return EMULATE_DONE; } -enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1977,7 +2017,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause, } /* TLBMOD: store into address matching TLB with Dirty bit off */ -enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc, +enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2005,8 +2045,8 @@ enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc, return er; } -enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2048,8 +2088,8 @@ enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause, return EMULATE_DONE; } -enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2077,8 +2117,8 @@ enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause, return EMULATE_DONE; } -enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2112,8 +2152,8 @@ enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause, return er; } -enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2147,8 +2187,8 @@ enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause, return er; } -enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2182,8 +2222,8 @@ enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause, return er; } -enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2217,8 +2257,8 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause, return er; } -enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2252,8 +2292,8 @@ enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause, return er; } -enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2287,22 +2327,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause, return er; } -/* ll/sc, rdhwr, sync emulation */ - -#define OPCODE 0xfc000000 -#define BASE 0x03e00000 -#define RT 0x001f0000 -#define OFFSET 0x0000ffff -#define LL 0xc0000000 -#define SC 0xe0000000 -#define SPEC0 0x00000000 -#define SPEC3 0x7c000000 -#define RD 0x0000f800 -#define FUNC 0x0000003f -#define SYNC 0x0000000f -#define RDHWR 0x0000003b - -enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc, +enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2310,7 +2335,7 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc, struct kvm_vcpu_arch *arch = &vcpu->arch; enum emulation_result er = EMULATE_DONE; unsigned long curr_pc; - uint32_t inst; + union mips_instruction inst; /* * Update PC and hold onto current PC in case there is @@ -2325,17 +2350,22 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc, if (cause & CAUSEF_BD) opc += 1; - inst = kvm_get_inst(opc, vcpu); + inst.word = kvm_get_inst(opc, vcpu); - if (inst == KVM_INVALID_INST) { + if (inst.word == KVM_INVALID_INST) { kvm_err("%s: Cannot get inst @ %p\n", __func__, opc); return EMULATE_FAIL; } - if ((inst & OPCODE) == SPEC3 && (inst & FUNC) == RDHWR) { + if (inst.r_format.opcode == spec3_op && + inst.r_format.func == rdhwr_op && + inst.r_format.rs == 0 && + (inst.r_format.re >> 3) == 0) { int usermode = !KVM_GUEST_KERNEL_MODE(vcpu); - int rd = (inst & RD) >> 11; - int rt = (inst & RT) >> 16; + int rd = inst.r_format.rd; + int rt = inst.r_format.rt; + int sel = inst.r_format.re & 0x7; + /* If usermode, check RDHWR rd is allowed by guest HWREna */ if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) { kvm_debug("RDHWR %#x disallowed by HWREna @ %p\n", @@ -2343,17 +2373,17 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc, goto emulate_ri; } switch (rd) { - case 0: /* CPU number */ - arch->gprs[rt] = 0; + case MIPS_HWR_CPUNUM: /* CPU number */ + arch->gprs[rt] = vcpu->vcpu_id; break; - case 1: /* SYNCI length */ + case MIPS_HWR_SYNCISTEP: /* SYNCI length */ arch->gprs[rt] = min(current_cpu_data.dcache.linesz, current_cpu_data.icache.linesz); break; - case 2: /* Read count register */ - arch->gprs[rt] = kvm_mips_read_count(vcpu); + case MIPS_HWR_CC: /* Read count register */ + arch->gprs[rt] = (s32)kvm_mips_read_count(vcpu); break; - case 3: /* Count register resolution */ + case MIPS_HWR_CCRES: /* Count register resolution */ switch (current_cpu_data.cputype) { case CPU_20KC: case CPU_25KF: @@ -2363,7 +2393,7 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc, arch->gprs[rt] = 2; } break; - case 29: + case MIPS_HWR_ULR: /* Read UserLocal register */ arch->gprs[rt] = kvm_read_c0_guest_userlocal(cop0); break; @@ -2371,8 +2401,12 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc, kvm_debug("RDHWR %#x not supported @ %p\n", rd, opc); goto emulate_ri; } + + trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, KVM_TRACE_HWR(rd, sel), + vcpu->arch.gprs[rt]); } else { - kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst); + kvm_debug("Emulate RI not supported @ %p: %#x\n", + opc, inst.word); goto emulate_ri; } @@ -2405,19 +2439,19 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, switch (run->mmio.len) { case 4: - *gpr = *(int32_t *) run->mmio.data; + *gpr = *(s32 *) run->mmio.data; break; case 2: if (vcpu->mmio_needed == 2) - *gpr = *(int16_t *) run->mmio.data; + *gpr = *(s16 *) run->mmio.data; else - *gpr = *(uint16_t *)run->mmio.data; + *gpr = *(u16 *)run->mmio.data; break; case 1: if (vcpu->mmio_needed == 2) - *gpr = *(int8_t *) run->mmio.data; + *gpr = *(s8 *) run->mmio.data; else *gpr = *(u8 *) run->mmio.data; break; @@ -2432,12 +2466,12 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, return er; } -static enum emulation_result kvm_mips_emulate_exc(unsigned long cause, - uint32_t *opc, +static enum emulation_result kvm_mips_emulate_exc(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { - uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; + u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_vcpu_arch *arch = &vcpu->arch; enum emulation_result er = EMULATE_DONE; @@ -2470,13 +2504,13 @@ static enum emulation_result kvm_mips_emulate_exc(unsigned long cause, return er; } -enum emulation_result kvm_mips_check_privilege(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_check_privilege(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DONE; - uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; + u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; int usermode = !KVM_GUEST_KERNEL_MODE(vcpu); @@ -2566,18 +2600,18 @@ enum emulation_result kvm_mips_check_privilege(unsigned long cause, * (2) TLB entry is present in the Guest TLB but not in the shadow, in this * case we inject the TLB from the Guest TLB into the shadow host TLB */ -enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause, - uint32_t *opc, +enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DONE; - uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; + u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; unsigned long va = vcpu->arch.host_cp0_badvaddr; int index; - kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx, entryhi: %#lx\n", - vcpu->arch.host_cp0_badvaddr, vcpu->arch.host_cp0_entryhi); + kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx\n", + vcpu->arch.host_cp0_badvaddr); /* * KVM would not have got the exception if this entry was valid in the @@ -2620,13 +2654,12 @@ enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause, } } else { kvm_debug("Injecting hi: %#lx, lo0: %#lx, lo1: %#lx into shadow host TLB\n", - tlb->tlb_hi, tlb->tlb_lo0, tlb->tlb_lo1); + tlb->tlb_hi, tlb->tlb_lo[0], tlb->tlb_lo[1]); /* * OK we have a Guest TLB entry, now inject it into the * shadow host TLB */ - kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, NULL, - NULL); + kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb); } } diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c new file mode 100644 index 000000000000..6a02b3a3fa65 --- /dev/null +++ b/arch/mips/kvm/entry.c @@ -0,0 +1,701 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Generation of main entry point for the guest, exception handling. + * + * Copyright (C) 2012 MIPS Technologies, Inc. + * Authors: Sanjay Lal + * + * Copyright (C) 2016 Imagination Technologies Ltd. + */ + +#include +#include +#include +#include + +/* Register names */ +#define ZERO 0 +#define AT 1 +#define V0 2 +#define V1 3 +#define A0 4 +#define A1 5 + +#if _MIPS_SIM == _MIPS_SIM_ABI32 +#define T0 8 +#define T1 9 +#define T2 10 +#define T3 11 +#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ + +#if _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 +#define T0 12 +#define T1 13 +#define T2 14 +#define T3 15 +#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 */ + +#define S0 16 +#define S1 17 +#define T9 25 +#define K0 26 +#define K1 27 +#define GP 28 +#define SP 29 +#define RA 31 + +/* Some CP0 registers */ +#define C0_HWRENA 7, 0 +#define C0_BADVADDR 8, 0 +#define C0_ENTRYHI 10, 0 +#define C0_STATUS 12, 0 +#define C0_CAUSE 13, 0 +#define C0_EPC 14, 0 +#define C0_EBASE 15, 1 +#define C0_CONFIG5 16, 5 +#define C0_DDATA_LO 28, 3 +#define C0_ERROREPC 30, 0 + +#define CALLFRAME_SIZ 32 + +#ifdef CONFIG_64BIT +#define ST0_KX_IF_64 ST0_KX +#else +#define ST0_KX_IF_64 0 +#endif + +static unsigned int scratch_vcpu[2] = { C0_DDATA_LO }; +static unsigned int scratch_tmp[2] = { C0_ERROREPC }; + +enum label_id { + label_fpu_1 = 1, + label_msa_1, + label_return_to_host, + label_kernel_asid, + label_exit_common, +}; + +UASM_L_LA(_fpu_1) +UASM_L_LA(_msa_1) +UASM_L_LA(_return_to_host) +UASM_L_LA(_kernel_asid) +UASM_L_LA(_exit_common) + +static void *kvm_mips_build_enter_guest(void *addr); +static void *kvm_mips_build_ret_from_exit(void *addr); +static void *kvm_mips_build_ret_to_guest(void *addr); +static void *kvm_mips_build_ret_to_host(void *addr); + +/** + * kvm_mips_entry_setup() - Perform global setup for entry code. + * + * Perform global setup for entry code, such as choosing a scratch register. + * + * Returns: 0 on success. + * -errno on failure. + */ +int kvm_mips_entry_setup(void) +{ + /* + * We prefer to use KScratchN registers if they are available over the + * defaults above, which may not work on all cores. + */ + unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc; + + /* Pick a scratch register for storing VCPU */ + if (kscratch_mask) { + scratch_vcpu[0] = 31; + scratch_vcpu[1] = ffs(kscratch_mask) - 1; + kscratch_mask &= ~BIT(scratch_vcpu[1]); + } + + /* Pick a scratch register to use as a temp for saving state */ + if (kscratch_mask) { + scratch_tmp[0] = 31; + scratch_tmp[1] = ffs(kscratch_mask) - 1; + kscratch_mask &= ~BIT(scratch_tmp[1]); + } + + return 0; +} + +static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp, + unsigned int frame) +{ + /* Save the VCPU scratch register value in cp0_epc of the stack frame */ + UASM_i_MFC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); + + /* Save the temp scratch register value in cp0_cause of stack frame */ + if (scratch_tmp[0] == 31) { + UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]); + UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); + } +} + +static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp, + unsigned int frame) +{ + /* + * Restore host scratch register values saved by + * kvm_mips_build_save_scratch(). + */ + UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); + UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); + + if (scratch_tmp[0] == 31) { + UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); + UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]); + } +} + +/** + * build_set_exc_base() - Assemble code to write exception base address. + * @p: Code buffer pointer. + * @reg: Source register (generated code may set WG bit in @reg). + * + * Assemble code to modify the exception base address in the EBase register, + * using the appropriately sized access and setting the WG bit if necessary. + */ +static inline void build_set_exc_base(u32 **p, unsigned int reg) +{ + if (cpu_has_ebase_wg) { + /* Set WG so that all the bits get written */ + uasm_i_ori(p, reg, reg, MIPS_EBASE_WG); + UASM_i_MTC0(p, reg, C0_EBASE); + } else { + uasm_i_mtc0(p, reg, C0_EBASE); + } +} + +/** + * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU. + * @addr: Address to start writing code. + * + * Assemble the start of the vcpu_run function to run a guest VCPU. The function + * conforms to the following prototype: + * + * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu); + * + * The exit from the guest and return to the caller is handled by the code + * generated by kvm_mips_build_ret_to_host(). + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_vcpu_run(void *addr) +{ + u32 *p = addr; + unsigned int i; + + /* + * A0: run + * A1: vcpu + */ + + /* k0/k1 not being used in host kernel context */ + UASM_i_ADDIU(&p, K1, SP, -(int)sizeof(struct pt_regs)); + for (i = 16; i < 32; ++i) { + if (i == 24) + i = 28; + UASM_i_SW(&p, i, offsetof(struct pt_regs, regs[i]), K1); + } + + /* Save host status */ + uasm_i_mfc0(&p, V0, C0_STATUS); + UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1); + + /* Save scratch registers, will be used to store pointer to vcpu etc */ + kvm_mips_build_save_scratch(&p, V1, K1); + + /* VCPU scratch register has pointer to vcpu */ + UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); + + /* Offset into vcpu->arch */ + UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch)); + + /* + * Save the host stack to VCPU, used for exception processing + * when we exit from the Guest + */ + UASM_i_SW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1); + + /* Save the kernel gp as well */ + UASM_i_SW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1); + + /* + * Setup status register for running the guest in UM, interrupts + * are disabled + */ + UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV | ST0_KX_IF_64); + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + + /* load up the new EBASE */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); + build_set_exc_base(&p, K0); + + /* + * Now that the new EBASE has been loaded, unset BEV, set + * interrupt mask as it was but make sure that timer interrupts + * are enabled + */ + uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE | ST0_KX_IF_64); + uasm_i_andi(&p, V0, V0, ST0_IM); + uasm_i_or(&p, K0, K0, V0); + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + + p = kvm_mips_build_enter_guest(p); + + return p; +} + +/** + * kvm_mips_build_enter_guest() - Assemble code to resume guest execution. + * @addr: Address to start writing code. + * + * Assemble the code to resume guest execution. This code is common between the + * initial entry into the guest from the host, and returning from the exit + * handler back to the guest. + * + * Returns: Next address after end of written function. + */ +static void *kvm_mips_build_enter_guest(void *addr) +{ + u32 *p = addr; + unsigned int i; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Set Guest EPC */ + UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1); + UASM_i_MTC0(&p, T0, C0_EPC); + + /* Set the ASID for the Guest Kernel */ + UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1); + UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]), + T0); + uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL); + uasm_i_xori(&p, T0, T0, KSU_USER); + uasm_il_bnez(&p, &r, T0, label_kernel_asid); + UASM_i_ADDIU(&p, T1, K1, + offsetof(struct kvm_vcpu_arch, guest_kernel_asid)); + /* else user */ + UASM_i_ADDIU(&p, T1, K1, + offsetof(struct kvm_vcpu_arch, guest_user_asid)); + uasm_l_kernel_asid(&l, p); + + /* t1: contains the base of the ASID array, need to get the cpu id */ + /* smp_processor_id */ + uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP); + /* x4 */ + uasm_i_sll(&p, T2, T2, 2); + UASM_i_ADDU(&p, T3, T1, T2); + uasm_i_lw(&p, K0, 0, T3); +#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE + /* x sizeof(struct cpuinfo_mips)/4 */ + uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4); + uasm_i_mul(&p, T2, T2, T3); + + UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask); + UASM_i_ADDU(&p, AT, AT, T2); + UASM_i_LW(&p, T2, uasm_rel_lo((long)&cpu_data[0].asid_mask), AT); + uasm_i_and(&p, K0, K0, T2); +#else + uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID); +#endif + uasm_i_mtc0(&p, K0, C0_ENTRYHI); + uasm_i_ehb(&p); + + /* Disable RDHWR access */ + uasm_i_mtc0(&p, ZERO, C0_HWRENA); + + /* load the guest context from VCPU and return */ + for (i = 1; i < 32; ++i) { + /* Guest k0/k1 loaded later */ + if (i == K0 || i == K1) + continue; + UASM_i_LW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1); + } + +#ifndef CONFIG_CPU_MIPSR6 + /* Restore hi/lo */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, hi), K1); + uasm_i_mthi(&p, K0); + + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, lo), K1); + uasm_i_mtlo(&p, K0); +#endif + + /* Restore the guest's k0/k1 registers */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); + UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1); + + /* Jump to guest */ + uasm_i_eret(&p); + + uasm_resolve_relocs(relocs, labels); + + return p; +} + +/** + * kvm_mips_build_exception() - Assemble first level guest exception handler. + * @addr: Address to start writing code. + * @handler: Address of common handler (within range of @addr). + * + * Assemble exception vector code for guest execution. The generated vector will + * branch to the common exception handler generated by kvm_mips_build_exit(). + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_exception(void *addr, void *handler) +{ + u32 *p = addr; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Save guest k1 into scratch register */ + UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]); + + /* Get the VCPU pointer from the VCPU scratch register */ + UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); + + /* Save guest k0 into VCPU structure */ + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); + + /* Branch to the common handler */ + uasm_il_b(&p, &r, label_exit_common); + uasm_i_nop(&p); + + uasm_l_exit_common(&l, handler); + uasm_resolve_relocs(relocs, labels); + + return p; +} + +/** + * kvm_mips_build_exit() - Assemble common guest exit handler. + * @addr: Address to start writing code. + * + * Assemble the generic guest exit handling code. This is called by the + * exception vectors (generated by kvm_mips_build_exception()), and calls + * kvm_mips_handle_exit(), then either resumes the guest or returns to the host + * depending on the return value. + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_exit(void *addr) +{ + u32 *p = addr; + unsigned int i; + struct uasm_label labels[3]; + struct uasm_reloc relocs[3]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* + * Generic Guest exception handler. We end up here when the guest + * does something that causes a trap to kernel mode. + * + * Both k0/k1 registers will have already been saved (k0 into the vcpu + * structure, and k1 into the scratch_tmp register). + * + * The k1 register will already contain the kvm_vcpu_arch pointer. + */ + + /* Start saving Guest context to VCPU */ + for (i = 0; i < 32; ++i) { + /* Guest k0/k1 saved later */ + if (i == K0 || i == K1) + continue; + UASM_i_SW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1); + } + +#ifndef CONFIG_CPU_MIPSR6 + /* We need to save hi/lo and restore them on the way out */ + uasm_i_mfhi(&p, T0); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, hi), K1); + + uasm_i_mflo(&p, T0); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1); +#endif + + /* Finally save guest k1 to VCPU */ + uasm_i_ehb(&p); + UASM_i_MFC0(&p, T0, scratch_tmp[0], scratch_tmp[1]); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1); + + /* Now that context has been saved, we can use other registers */ + + /* Restore vcpu */ + UASM_i_MFC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); + uasm_i_move(&p, S1, A1); + + /* Restore run (vcpu->run) */ + UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1); + /* Save pointer to run in s0, will be saved by the compiler */ + uasm_i_move(&p, S0, A0); + + /* + * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process + * the exception + */ + UASM_i_MFC0(&p, K0, C0_EPC); + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, pc), K1); + + UASM_i_MFC0(&p, K0, C0_BADVADDR); + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_badvaddr), + K1); + + uasm_i_mfc0(&p, K0, C0_CAUSE); + uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1); + + /* Now restore the host state just enough to run the handlers */ + + /* Switch EBASE to the one used by Linux */ + /* load up the host EBASE */ + uasm_i_mfc0(&p, V0, C0_STATUS); + + uasm_i_lui(&p, AT, ST0_BEV >> 16); + uasm_i_or(&p, K0, V0, AT); + + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + + UASM_i_LA_mostly(&p, K0, (long)&ebase); + UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0); + build_set_exc_base(&p, K0); + + if (raw_cpu_has_fpu) { + /* + * If FPU is enabled, save FCR31 and clear it so that later + * ctc1's don't trigger FPE for pending exceptions. + */ + uasm_i_lui(&p, AT, ST0_CU1 >> 16); + uasm_i_and(&p, V1, V0, AT); + uasm_il_beqz(&p, &r, V1, label_fpu_1); + uasm_i_nop(&p); + uasm_i_cfc1(&p, T0, 31); + uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31), + K1); + uasm_i_ctc1(&p, ZERO, 31); + uasm_l_fpu_1(&l, p); + } + + if (cpu_has_msa) { + /* + * If MSA is enabled, save MSACSR and clear it so that later + * instructions don't trigger MSAFPE for pending exceptions. + */ + uasm_i_mfc0(&p, T0, C0_CONFIG5); + uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */ + uasm_il_beqz(&p, &r, T0, label_msa_1); + uasm_i_nop(&p); + uasm_i_cfcmsa(&p, T0, MSA_CSR); + uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr), + K1); + uasm_i_ctcmsa(&p, MSA_CSR, ZERO); + uasm_l_msa_1(&l, p); + } + + /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ + uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE)); + uasm_i_and(&p, V0, V0, AT); + uasm_i_lui(&p, AT, ST0_CU0 >> 16); + uasm_i_or(&p, V0, V0, AT); + uasm_i_mtc0(&p, V0, C0_STATUS); + uasm_i_ehb(&p); + + /* Load up host GP */ + UASM_i_LW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1); + + /* Need a stack before we can jump to "C" */ + UASM_i_LW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1); + + /* Saved host state */ + UASM_i_ADDIU(&p, SP, SP, -(int)sizeof(struct pt_regs)); + + /* + * XXXKYMA do we need to load the host ASID, maybe not because the + * kernel entries are marked GLOBAL, need to verify + */ + + /* Restore host scratch registers, as we'll have clobbered them */ + kvm_mips_build_restore_scratch(&p, K0, SP); + + /* Restore RDHWR access */ + UASM_i_LA_mostly(&p, K0, (long)&hwrena); + uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0); + uasm_i_mtc0(&p, K0, C0_HWRENA); + + /* Jump to handler */ + /* + * XXXKYMA: not sure if this is safe, how large is the stack?? + * Now jump to the kvm_mips_handle_exit() to see if we can deal + * with this in the kernel + */ + UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit); + uasm_i_jalr(&p, RA, T9); + UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ); + + uasm_resolve_relocs(relocs, labels); + + p = kvm_mips_build_ret_from_exit(p); + + return p; +} + +/** + * kvm_mips_build_ret_from_exit() - Assemble guest exit return handler. + * @addr: Address to start writing code. + * + * Assemble the code to handle the return from kvm_mips_handle_exit(), either + * resuming the guest or returning to the host depending on the return value. + * + * Returns: Next address after end of written function. + */ +static void *kvm_mips_build_ret_from_exit(void *addr) +{ + u32 *p = addr; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Return from handler Make sure interrupts are disabled */ + uasm_i_di(&p, ZERO); + uasm_i_ehb(&p); + + /* + * XXXKYMA: k0/k1 could have been blown away if we processed + * an exception while we were handling the exception from the + * guest, reload k1 + */ + + uasm_i_move(&p, K1, S1); + UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); + + /* + * Check return value, should tell us if we are returning to the + * host (handle I/O etc)or resuming the guest + */ + uasm_i_andi(&p, T0, V0, RESUME_HOST); + uasm_il_bnez(&p, &r, T0, label_return_to_host); + uasm_i_nop(&p); + + p = kvm_mips_build_ret_to_guest(p); + + uasm_l_return_to_host(&l, p); + p = kvm_mips_build_ret_to_host(p); + + uasm_resolve_relocs(relocs, labels); + + return p; +} + +/** + * kvm_mips_build_ret_to_guest() - Assemble code to return to the guest. + * @addr: Address to start writing code. + * + * Assemble the code to handle return from the guest exit handler + * (kvm_mips_handle_exit()) back to the guest. + * + * Returns: Next address after end of written function. + */ +static void *kvm_mips_build_ret_to_guest(void *addr) +{ + u32 *p = addr; + + /* Put the saved pointer to vcpu (s1) back into the scratch register */ + UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); + + /* Load up the Guest EBASE to minimize the window where BEV is set */ + UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); + + /* Switch EBASE back to the one used by KVM */ + uasm_i_mfc0(&p, V1, C0_STATUS); + uasm_i_lui(&p, AT, ST0_BEV >> 16); + uasm_i_or(&p, K0, V1, AT); + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + build_set_exc_base(&p, T0); + + /* Setup status register for running guest in UM */ + uasm_i_ori(&p, V1, V1, ST0_EXL | KSU_USER | ST0_IE); + UASM_i_LA(&p, AT, ~(ST0_CU0 | ST0_MX)); + uasm_i_and(&p, V1, V1, AT); + uasm_i_mtc0(&p, V1, C0_STATUS); + uasm_i_ehb(&p); + + p = kvm_mips_build_enter_guest(p); + + return p; +} + +/** + * kvm_mips_build_ret_to_host() - Assemble code to return to the host. + * @addr: Address to start writing code. + * + * Assemble the code to handle return from the guest exit handler + * (kvm_mips_handle_exit()) back to the host, i.e. to the caller of the vcpu_run + * function generated by kvm_mips_build_vcpu_run(). + * + * Returns: Next address after end of written function. + */ +static void *kvm_mips_build_ret_to_host(void *addr) +{ + u32 *p = addr; + unsigned int i; + + /* EBASE is already pointing to Linux */ + UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1); + UASM_i_ADDIU(&p, K1, K1, -(int)sizeof(struct pt_regs)); + + /* + * r2/v0 is the return code, shift it down by 2 (arithmetic) + * to recover the err code + */ + uasm_i_sra(&p, K0, V0, 2); + uasm_i_move(&p, V0, K0); + + /* Load context saved on the host stack */ + for (i = 16; i < 31; ++i) { + if (i == 24) + i = 28; + UASM_i_LW(&p, i, offsetof(struct pt_regs, regs[i]), K1); + } + + /* Restore RDHWR access */ + UASM_i_LA_mostly(&p, K0, (long)&hwrena); + uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0); + uasm_i_mtc0(&p, K0, C0_HWRENA); + + /* Restore RA, which is the address we will return to */ + UASM_i_LW(&p, RA, offsetof(struct pt_regs, regs[RA]), K1); + uasm_i_jr(&p, RA); + uasm_i_nop(&p); + + return p; +} + diff --git a/arch/mips/kvm/fpu.S b/arch/mips/kvm/fpu.S index 531fbf5131c0..16f17c6390dd 100644 --- a/arch/mips/kvm/fpu.S +++ b/arch/mips/kvm/fpu.S @@ -14,13 +14,16 @@ #include #include +/* preprocessor replaces the fp in ".set fp=64" with $30 otherwise */ +#undef fp + .set noreorder .set noat LEAF(__kvm_save_fpu) .set push - .set mips64r2 SET_HARDFLOAT + .set fp=64 mfc0 t0, CP0_STATUS sll t0, t0, 5 # is Status.FR set? bgez t0, 1f # no: skip odd doubles @@ -63,8 +66,8 @@ LEAF(__kvm_save_fpu) LEAF(__kvm_restore_fpu) .set push - .set mips64r2 SET_HARDFLOAT + .set fp=64 mfc0 t0, CP0_STATUS sll t0, t0, 5 # is Status.FR set? bgez t0, 1f # no: skip odd doubles diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c index 95f790663b0c..ad28dac6b7e9 100644 --- a/arch/mips/kvm/interrupt.c +++ b/arch/mips/kvm/interrupt.c @@ -22,12 +22,12 @@ #include "interrupt.h" -void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority) +void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority) { set_bit(priority, &vcpu->arch.pending_exceptions); } -void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority) +void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority) { clear_bit(priority, &vcpu->arch.pending_exceptions); } @@ -114,10 +114,10 @@ void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu, /* Deliver the interrupt of the corresponding priority, if possible. */ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority, - uint32_t cause) + u32 cause) { int allowed = 0; - uint32_t exccode; + u32 exccode; struct kvm_vcpu_arch *arch = &vcpu->arch; struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -196,12 +196,12 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority, } int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority, - uint32_t cause) + u32 cause) { return 1; } -void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause) +void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause) { unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned long *pending_clr = &vcpu->arch.pending_exceptions_clr; diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h index 2143884709e4..fb118a2c8379 100644 --- a/arch/mips/kvm/interrupt.h +++ b/arch/mips/kvm/interrupt.h @@ -28,17 +28,13 @@ #define MIPS_EXC_MAX 12 /* XXXSL More to follow */ -extern char __kvm_mips_vcpu_run_end[]; -extern char mips32_exception[], mips32_exceptionEnd[]; -extern char mips32_GuestException[], mips32_GuestExceptionEnd[]; - #define C_TI (_ULCAST_(1) << 30) #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0) #define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE (0) -void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority); -void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority); +void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority); +void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority); int kvm_mips_pending_timer(struct kvm_vcpu *vcpu); void kvm_mips_queue_timer_int_cb(struct kvm_vcpu *vcpu); @@ -48,7 +44,7 @@ void kvm_mips_queue_io_int_cb(struct kvm_vcpu *vcpu, void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu, struct kvm_mips_interrupt *irq); int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority, - uint32_t cause); + u32 cause); int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority, - uint32_t cause); -void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause); + u32 cause); +void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause); diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S deleted file mode 100644 index 828fcfc1cd7f..000000000000 --- a/arch/mips/kvm/locore.S +++ /dev/null @@ -1,605 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Main entry point for the guest, exception handling. - * - * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. - * Authors: Sanjay Lal - */ - -#include -#include -#include -#include -#include -#include - -#define _C_LABEL(x) x -#define MIPSX(name) mips32_ ## name -#define CALLFRAME_SIZ 32 - -/* - * VECTOR - * exception vector entrypoint - */ -#define VECTOR(x, regmask) \ - .ent _C_LABEL(x),0; \ - EXPORT(x); - -#define VECTOR_END(x) \ - EXPORT(x); - -/* Overload, Danger Will Robinson!! */ -#define PT_HOST_USERLOCAL PT_EPC - -#define CP0_DDATA_LO $28,3 - -/* Resume Flags */ -#define RESUME_FLAG_HOST (1<<1) /* Resume host? */ - -#define RESUME_GUEST 0 -#define RESUME_HOST RESUME_FLAG_HOST - -/* - * __kvm_mips_vcpu_run: entry point to the guest - * a0: run - * a1: vcpu - */ - .set noreorder - -FEXPORT(__kvm_mips_vcpu_run) - /* k0/k1 not being used in host kernel context */ - INT_ADDIU k1, sp, -PT_SIZE - LONG_S $16, PT_R16(k1) - LONG_S $17, PT_R17(k1) - LONG_S $18, PT_R18(k1) - LONG_S $19, PT_R19(k1) - LONG_S $20, PT_R20(k1) - LONG_S $21, PT_R21(k1) - LONG_S $22, PT_R22(k1) - LONG_S $23, PT_R23(k1) - - LONG_S $28, PT_R28(k1) - LONG_S $29, PT_R29(k1) - LONG_S $30, PT_R30(k1) - LONG_S $31, PT_R31(k1) - - /* Save hi/lo */ - mflo v0 - LONG_S v0, PT_LO(k1) - mfhi v1 - LONG_S v1, PT_HI(k1) - - /* Save host status */ - mfc0 v0, CP0_STATUS - LONG_S v0, PT_STATUS(k1) - - /* Save DDATA_LO, will be used to store pointer to vcpu */ - mfc0 v1, CP0_DDATA_LO - LONG_S v1, PT_HOST_USERLOCAL(k1) - - /* DDATA_LO has pointer to vcpu */ - mtc0 a1, CP0_DDATA_LO - - /* Offset into vcpu->arch */ - INT_ADDIU k1, a1, VCPU_HOST_ARCH - - /* - * Save the host stack to VCPU, used for exception processing - * when we exit from the Guest - */ - LONG_S sp, VCPU_HOST_STACK(k1) - - /* Save the kernel gp as well */ - LONG_S gp, VCPU_HOST_GP(k1) - - /* - * Setup status register for running the guest in UM, interrupts - * are disabled - */ - li k0, (ST0_EXL | KSU_USER | ST0_BEV) - mtc0 k0, CP0_STATUS - ehb - - /* load up the new EBASE */ - LONG_L k0, VCPU_GUEST_EBASE(k1) - mtc0 k0, CP0_EBASE - - /* - * Now that the new EBASE has been loaded, unset BEV, set - * interrupt mask as it was but make sure that timer interrupts - * are enabled - */ - li k0, (ST0_EXL | KSU_USER | ST0_IE) - andi v0, v0, ST0_IM - or k0, k0, v0 - mtc0 k0, CP0_STATUS - ehb - - /* Set Guest EPC */ - LONG_L t0, VCPU_PC(k1) - mtc0 t0, CP0_EPC - -FEXPORT(__kvm_mips_load_asid) - /* Set the ASID for the Guest Kernel */ - PTR_L t0, VCPU_COP0(k1) - LONG_L t0, COP0_STATUS(t0) - andi t0, KSU_USER | ST0_ERL | ST0_EXL - xori t0, KSU_USER - bnez t0, 1f /* If kernel */ - INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ - INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */ -1: - /* t1: contains the base of the ASID array, need to get the cpu id */ - LONG_L t2, TI_CPU($28) /* smp_processor_id */ - INT_SLL t2, t2, 2 /* x4 */ - REG_ADDU t3, t1, t2 - LONG_L k0, (t3) -#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - li t3, CPUINFO_SIZE/4 - mul t2, t2, t3 /* x sizeof(struct cpuinfo_mips)/4 */ - LONG_L t2, (cpu_data + CPUINFO_ASID_MASK)(t2) - and k0, k0, t2 -#else - andi k0, k0, MIPS_ENTRYHI_ASID -#endif - mtc0 k0, CP0_ENTRYHI - ehb - - /* Disable RDHWR access */ - mtc0 zero, CP0_HWRENA - - .set noat - /* Now load up the Guest Context from VCPU */ - LONG_L $1, VCPU_R1(k1) - LONG_L $2, VCPU_R2(k1) - LONG_L $3, VCPU_R3(k1) - - LONG_L $4, VCPU_R4(k1) - LONG_L $5, VCPU_R5(k1) - LONG_L $6, VCPU_R6(k1) - LONG_L $7, VCPU_R7(k1) - - LONG_L $8, VCPU_R8(k1) - LONG_L $9, VCPU_R9(k1) - LONG_L $10, VCPU_R10(k1) - LONG_L $11, VCPU_R11(k1) - LONG_L $12, VCPU_R12(k1) - LONG_L $13, VCPU_R13(k1) - LONG_L $14, VCPU_R14(k1) - LONG_L $15, VCPU_R15(k1) - LONG_L $16, VCPU_R16(k1) - LONG_L $17, VCPU_R17(k1) - LONG_L $18, VCPU_R18(k1) - LONG_L $19, VCPU_R19(k1) - LONG_L $20, VCPU_R20(k1) - LONG_L $21, VCPU_R21(k1) - LONG_L $22, VCPU_R22(k1) - LONG_L $23, VCPU_R23(k1) - LONG_L $24, VCPU_R24(k1) - LONG_L $25, VCPU_R25(k1) - - /* k0/k1 loaded up later */ - - LONG_L $28, VCPU_R28(k1) - LONG_L $29, VCPU_R29(k1) - LONG_L $30, VCPU_R30(k1) - LONG_L $31, VCPU_R31(k1) - - /* Restore hi/lo */ - LONG_L k0, VCPU_LO(k1) - mtlo k0 - - LONG_L k0, VCPU_HI(k1) - mthi k0 - -FEXPORT(__kvm_mips_load_k0k1) - /* Restore the guest's k0/k1 registers */ - LONG_L k0, VCPU_R26(k1) - LONG_L k1, VCPU_R27(k1) - - /* Jump to guest */ - eret -EXPORT(__kvm_mips_vcpu_run_end) - -VECTOR(MIPSX(exception), unknown) -/* Find out what mode we came from and jump to the proper handler. */ - mtc0 k0, CP0_ERROREPC #01: Save guest k0 - ehb #02: - - mfc0 k0, CP0_EBASE #02: Get EBASE - INT_SRL k0, k0, 10 #03: Get rid of CPUNum - INT_SLL k0, k0, 10 #04 - LONG_S k1, 0x3000(k0) #05: Save k1 @ offset 0x3000 - INT_ADDIU k0, k0, 0x2000 #06: Exception handler is - # installed @ offset 0x2000 - j k0 #07: jump to the function - nop #08: branch delay slot -VECTOR_END(MIPSX(exceptionEnd)) -.end MIPSX(exception) - -/* - * Generic Guest exception handler. We end up here when the guest - * does something that causes a trap to kernel mode. - */ -NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra) - /* Get the VCPU pointer from DDTATA_LO */ - mfc0 k1, CP0_DDATA_LO - INT_ADDIU k1, k1, VCPU_HOST_ARCH - - /* Start saving Guest context to VCPU */ - LONG_S $0, VCPU_R0(k1) - LONG_S $1, VCPU_R1(k1) - LONG_S $2, VCPU_R2(k1) - LONG_S $3, VCPU_R3(k1) - LONG_S $4, VCPU_R4(k1) - LONG_S $5, VCPU_R5(k1) - LONG_S $6, VCPU_R6(k1) - LONG_S $7, VCPU_R7(k1) - LONG_S $8, VCPU_R8(k1) - LONG_S $9, VCPU_R9(k1) - LONG_S $10, VCPU_R10(k1) - LONG_S $11, VCPU_R11(k1) - LONG_S $12, VCPU_R12(k1) - LONG_S $13, VCPU_R13(k1) - LONG_S $14, VCPU_R14(k1) - LONG_S $15, VCPU_R15(k1) - LONG_S $16, VCPU_R16(k1) - LONG_S $17, VCPU_R17(k1) - LONG_S $18, VCPU_R18(k1) - LONG_S $19, VCPU_R19(k1) - LONG_S $20, VCPU_R20(k1) - LONG_S $21, VCPU_R21(k1) - LONG_S $22, VCPU_R22(k1) - LONG_S $23, VCPU_R23(k1) - LONG_S $24, VCPU_R24(k1) - LONG_S $25, VCPU_R25(k1) - - /* Guest k0/k1 saved later */ - - LONG_S $28, VCPU_R28(k1) - LONG_S $29, VCPU_R29(k1) - LONG_S $30, VCPU_R30(k1) - LONG_S $31, VCPU_R31(k1) - - .set at - - /* We need to save hi/lo and restore them on the way out */ - mfhi t0 - LONG_S t0, VCPU_HI(k1) - - mflo t0 - LONG_S t0, VCPU_LO(k1) - - /* Finally save guest k0/k1 to VCPU */ - mfc0 t0, CP0_ERROREPC - LONG_S t0, VCPU_R26(k1) - - /* Get GUEST k1 and save it in VCPU */ - PTR_LI t1, ~0x2ff - mfc0 t0, CP0_EBASE - and t0, t0, t1 - LONG_L t0, 0x3000(t0) - LONG_S t0, VCPU_R27(k1) - - /* Now that context has been saved, we can use other registers */ - - /* Restore vcpu */ - mfc0 a1, CP0_DDATA_LO - move s1, a1 - - /* Restore run (vcpu->run) */ - LONG_L a0, VCPU_RUN(a1) - /* Save pointer to run in s0, will be saved by the compiler */ - move s0, a0 - - /* - * Save Host level EPC, BadVaddr and Cause to VCPU, useful to - * process the exception - */ - mfc0 k0,CP0_EPC - LONG_S k0, VCPU_PC(k1) - - mfc0 k0, CP0_BADVADDR - LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1) - - mfc0 k0, CP0_CAUSE - LONG_S k0, VCPU_HOST_CP0_CAUSE(k1) - - mfc0 k0, CP0_ENTRYHI - LONG_S k0, VCPU_HOST_ENTRYHI(k1) - - /* Now restore the host state just enough to run the handlers */ - - /* Switch EBASE to the one used by Linux */ - /* load up the host EBASE */ - mfc0 v0, CP0_STATUS - - or k0, v0, ST0_BEV - - mtc0 k0, CP0_STATUS - ehb - - LONG_L k0, VCPU_HOST_EBASE(k1) - mtc0 k0,CP0_EBASE - - /* - * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't - * trigger FPE for pending exceptions. - */ - and v1, v0, ST0_CU1 - beqz v1, 1f - nop - .set push - SET_HARDFLOAT - cfc1 t0, fcr31 - sw t0, VCPU_FCR31(k1) - ctc1 zero,fcr31 - .set pop -1: - -#ifdef CONFIG_CPU_HAS_MSA - /* - * If MSA is enabled, save MSACSR and clear it so that later - * instructions don't trigger MSAFPE for pending exceptions. - */ - mfc0 t0, CP0_CONFIG3 - ext t0, t0, 28, 1 /* MIPS_CONF3_MSAP */ - beqz t0, 1f - nop - mfc0 t0, CP0_CONFIG5 - ext t0, t0, 27, 1 /* MIPS_CONF5_MSAEN */ - beqz t0, 1f - nop - _cfcmsa t0, MSA_CSR - sw t0, VCPU_MSA_CSR(k1) - _ctcmsa MSA_CSR, zero -1: -#endif - - /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ - and v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE) - or v0, v0, ST0_CU0 - mtc0 v0, CP0_STATUS - ehb - - /* Load up host GP */ - LONG_L gp, VCPU_HOST_GP(k1) - - /* Need a stack before we can jump to "C" */ - LONG_L sp, VCPU_HOST_STACK(k1) - - /* Saved host state */ - INT_ADDIU sp, sp, -PT_SIZE - - /* - * XXXKYMA do we need to load the host ASID, maybe not because the - * kernel entries are marked GLOBAL, need to verify - */ - - /* Restore host DDATA_LO */ - LONG_L k0, PT_HOST_USERLOCAL(sp) - mtc0 k0, CP0_DDATA_LO - - /* Restore RDHWR access */ - PTR_LI k0, 0x2000000F - mtc0 k0, CP0_HWRENA - - /* Jump to handler */ -FEXPORT(__kvm_mips_jump_to_handler) - /* - * XXXKYMA: not sure if this is safe, how large is the stack?? - * Now jump to the kvm_mips_handle_exit() to see if we can deal - * with this in the kernel - */ - PTR_LA t9, kvm_mips_handle_exit - jalr.hb t9 - INT_ADDIU sp, sp, -CALLFRAME_SIZ /* BD Slot */ - - /* Return from handler Make sure interrupts are disabled */ - di - ehb - - /* - * XXXKYMA: k0/k1 could have been blown away if we processed - * an exception while we were handling the exception from the - * guest, reload k1 - */ - - move k1, s1 - INT_ADDIU k1, k1, VCPU_HOST_ARCH - - /* - * Check return value, should tell us if we are returning to the - * host (handle I/O etc)or resuming the guest - */ - andi t0, v0, RESUME_HOST - bnez t0, __kvm_mips_return_to_host - nop - -__kvm_mips_return_to_guest: - /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */ - mtc0 s1, CP0_DDATA_LO - - /* Load up the Guest EBASE to minimize the window where BEV is set */ - LONG_L t0, VCPU_GUEST_EBASE(k1) - - /* Switch EBASE back to the one used by KVM */ - mfc0 v1, CP0_STATUS - or k0, v1, ST0_BEV - mtc0 k0, CP0_STATUS - ehb - mtc0 t0, CP0_EBASE - - /* Setup status register for running guest in UM */ - or v1, v1, (ST0_EXL | KSU_USER | ST0_IE) - and v1, v1, ~(ST0_CU0 | ST0_MX) - mtc0 v1, CP0_STATUS - ehb - - /* Set Guest EPC */ - LONG_L t0, VCPU_PC(k1) - mtc0 t0, CP0_EPC - - /* Set the ASID for the Guest Kernel */ - PTR_L t0, VCPU_COP0(k1) - LONG_L t0, COP0_STATUS(t0) - andi t0, KSU_USER | ST0_ERL | ST0_EXL - xori t0, KSU_USER - bnez t0, 1f /* If kernel */ - INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ - INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */ -1: - /* t1: contains the base of the ASID array, need to get the cpu id */ - LONG_L t2, TI_CPU($28) /* smp_processor_id */ - INT_SLL t2, t2, 2 /* x4 */ - REG_ADDU t3, t1, t2 - LONG_L k0, (t3) -#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - li t3, CPUINFO_SIZE/4 - mul t2, t2, t3 /* x sizeof(struct cpuinfo_mips)/4 */ - LONG_L t2, (cpu_data + CPUINFO_ASID_MASK)(t2) - and k0, k0, t2 -#else - andi k0, k0, MIPS_ENTRYHI_ASID -#endif - mtc0 k0, CP0_ENTRYHI - ehb - - /* Disable RDHWR access */ - mtc0 zero, CP0_HWRENA - - .set noat - /* load the guest context from VCPU and return */ - LONG_L $0, VCPU_R0(k1) - LONG_L $1, VCPU_R1(k1) - LONG_L $2, VCPU_R2(k1) - LONG_L $3, VCPU_R3(k1) - LONG_L $4, VCPU_R4(k1) - LONG_L $5, VCPU_R5(k1) - LONG_L $6, VCPU_R6(k1) - LONG_L $7, VCPU_R7(k1) - LONG_L $8, VCPU_R8(k1) - LONG_L $9, VCPU_R9(k1) - LONG_L $10, VCPU_R10(k1) - LONG_L $11, VCPU_R11(k1) - LONG_L $12, VCPU_R12(k1) - LONG_L $13, VCPU_R13(k1) - LONG_L $14, VCPU_R14(k1) - LONG_L $15, VCPU_R15(k1) - LONG_L $16, VCPU_R16(k1) - LONG_L $17, VCPU_R17(k1) - LONG_L $18, VCPU_R18(k1) - LONG_L $19, VCPU_R19(k1) - LONG_L $20, VCPU_R20(k1) - LONG_L $21, VCPU_R21(k1) - LONG_L $22, VCPU_R22(k1) - LONG_L $23, VCPU_R23(k1) - LONG_L $24, VCPU_R24(k1) - LONG_L $25, VCPU_R25(k1) - - /* $/k1 loaded later */ - LONG_L $28, VCPU_R28(k1) - LONG_L $29, VCPU_R29(k1) - LONG_L $30, VCPU_R30(k1) - LONG_L $31, VCPU_R31(k1) - -FEXPORT(__kvm_mips_skip_guest_restore) - LONG_L k0, VCPU_HI(k1) - mthi k0 - - LONG_L k0, VCPU_LO(k1) - mtlo k0 - - LONG_L k0, VCPU_R26(k1) - LONG_L k1, VCPU_R27(k1) - - eret - .set at - -__kvm_mips_return_to_host: - /* EBASE is already pointing to Linux */ - LONG_L k1, VCPU_HOST_STACK(k1) - INT_ADDIU k1,k1, -PT_SIZE - - /* Restore host DDATA_LO */ - LONG_L k0, PT_HOST_USERLOCAL(k1) - mtc0 k0, CP0_DDATA_LO - - /* - * r2/v0 is the return code, shift it down by 2 (arithmetic) - * to recover the err code - */ - INT_SRA k0, v0, 2 - move $2, k0 - - /* Load context saved on the host stack */ - LONG_L $16, PT_R16(k1) - LONG_L $17, PT_R17(k1) - LONG_L $18, PT_R18(k1) - LONG_L $19, PT_R19(k1) - LONG_L $20, PT_R20(k1) - LONG_L $21, PT_R21(k1) - LONG_L $22, PT_R22(k1) - LONG_L $23, PT_R23(k1) - - LONG_L $28, PT_R28(k1) - LONG_L $29, PT_R29(k1) - LONG_L $30, PT_R30(k1) - - LONG_L k0, PT_HI(k1) - mthi k0 - - LONG_L k0, PT_LO(k1) - mtlo k0 - - /* Restore RDHWR access */ - PTR_LI k0, 0x2000000F - mtc0 k0, CP0_HWRENA - - /* Restore RA, which is the address we will return to */ - LONG_L ra, PT_R31(k1) - j ra - nop - -VECTOR_END(MIPSX(GuestExceptionEnd)) -.end MIPSX(GuestException) - -MIPSX(exceptions): - #### - ##### The exception handlers. - ##### - .word _C_LABEL(MIPSX(GuestException)) # 0 - .word _C_LABEL(MIPSX(GuestException)) # 1 - .word _C_LABEL(MIPSX(GuestException)) # 2 - .word _C_LABEL(MIPSX(GuestException)) # 3 - .word _C_LABEL(MIPSX(GuestException)) # 4 - .word _C_LABEL(MIPSX(GuestException)) # 5 - .word _C_LABEL(MIPSX(GuestException)) # 6 - .word _C_LABEL(MIPSX(GuestException)) # 7 - .word _C_LABEL(MIPSX(GuestException)) # 8 - .word _C_LABEL(MIPSX(GuestException)) # 9 - .word _C_LABEL(MIPSX(GuestException)) # 10 - .word _C_LABEL(MIPSX(GuestException)) # 11 - .word _C_LABEL(MIPSX(GuestException)) # 12 - .word _C_LABEL(MIPSX(GuestException)) # 13 - .word _C_LABEL(MIPSX(GuestException)) # 14 - .word _C_LABEL(MIPSX(GuestException)) # 15 - .word _C_LABEL(MIPSX(GuestException)) # 16 - .word _C_LABEL(MIPSX(GuestException)) # 17 - .word _C_LABEL(MIPSX(GuestException)) # 18 - .word _C_LABEL(MIPSX(GuestException)) # 19 - .word _C_LABEL(MIPSX(GuestException)) # 20 - .word _C_LABEL(MIPSX(GuestException)) # 21 - .word _C_LABEL(MIPSX(GuestException)) # 22 - .word _C_LABEL(MIPSX(GuestException)) # 23 - .word _C_LABEL(MIPSX(GuestException)) # 24 - .word _C_LABEL(MIPSX(GuestException)) # 25 - .word _C_LABEL(MIPSX(GuestException)) # 26 - .word _C_LABEL(MIPSX(GuestException)) # 27 - .word _C_LABEL(MIPSX(GuestException)) # 28 - .word _C_LABEL(MIPSX(GuestException)) # 29 - .word _C_LABEL(MIPSX(GuestException)) # 30 - .word _C_LABEL(MIPSX(GuestException)) # 31 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 44da5259f390..a6ea084b4d9d 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -9,6 +9,7 @@ * Authors: Sanjay Lal */ +#include #include #include #include @@ -147,7 +148,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm) /* Put the pages we reserved for the guest pmap */ for (i = 0; i < kvm->arch.guest_pmap_npages; i++) { if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE) - kvm_mips_release_pfn_clean(kvm->arch.guest_pmap[i]); + kvm_release_pfn_clean(kvm->arch.guest_pmap[i]); } kfree(kvm->arch.guest_pmap); @@ -244,10 +245,27 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, } } +static inline void dump_handler(const char *symbol, void *start, void *end) +{ + u32 *p; + + pr_debug("LEAF(%s)\n", symbol); + + pr_debug("\t.set push\n"); + pr_debug("\t.set noreorder\n"); + + for (p = start; p < (u32 *)end; ++p) + pr_debug("\t.word\t0x%08x\t\t# %p\n", *p, p); + + pr_debug("\t.set\tpop\n"); + + pr_debug("\tEND(%s)\n", symbol); +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { - int err, size, offset; - void *gebase; + int err, size; + void *gebase, *p, *handler; int i; struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); @@ -273,9 +291,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) else size = 0x4000; - /* Save Linux EBASE */ - vcpu->arch.host_ebase = (void *)read_c0_ebase(); - gebase = kzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL); if (!gebase) { @@ -285,44 +300,53 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n", ALIGN(size, PAGE_SIZE), gebase); + /* + * Check new ebase actually fits in CP0_EBase. The lack of a write gate + * limits us to the low 512MB of physical address space. If the memory + * we allocate is out of range, just give up now. + */ + if (!cpu_has_ebase_wg && virt_to_phys(gebase) >= 0x20000000) { + kvm_err("CP0_EBase.WG required for guest exception base %pK\n", + gebase); + err = -ENOMEM; + goto out_free_gebase; + } + /* Save new ebase */ vcpu->arch.guest_ebase = gebase; - /* Copy L1 Guest Exception handler to correct offset */ + /* Build guest exception vectors dynamically in unmapped memory */ + handler = gebase + 0x2000; /* TLB Refill, EXL = 0 */ - memcpy(gebase, mips32_exception, - mips32_exceptionEnd - mips32_exception); + kvm_mips_build_exception(gebase, handler); /* General Exception Entry point */ - memcpy(gebase + 0x180, mips32_exception, - mips32_exceptionEnd - mips32_exception); + kvm_mips_build_exception(gebase + 0x180, handler); /* For vectored interrupts poke the exception code @ all offsets 0-7 */ for (i = 0; i < 8; i++) { kvm_debug("L1 Vectored handler @ %p\n", gebase + 0x200 + (i * VECTORSPACING)); - memcpy(gebase + 0x200 + (i * VECTORSPACING), mips32_exception, - mips32_exceptionEnd - mips32_exception); + kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING, + handler); } - /* General handler, relocate to unmapped space for sanity's sake */ - offset = 0x2000; - kvm_debug("Installing KVM Exception handlers @ %p, %#x bytes\n", - gebase + offset, - mips32_GuestExceptionEnd - mips32_GuestException); + /* General exit handler */ + p = handler; + p = kvm_mips_build_exit(p); - memcpy(gebase + offset, mips32_GuestException, - mips32_GuestExceptionEnd - mips32_GuestException); + /* Guest entry routine */ + vcpu->arch.vcpu_run = p; + p = kvm_mips_build_vcpu_run(p); -#ifdef MODULE - offset += mips32_GuestExceptionEnd - mips32_GuestException; - memcpy(gebase + offset, (char *)__kvm_mips_vcpu_run, - __kvm_mips_vcpu_run_end - (char *)__kvm_mips_vcpu_run); - vcpu->arch.vcpu_run = gebase + offset; -#else - vcpu->arch.vcpu_run = __kvm_mips_vcpu_run; -#endif + /* Dump the generated code */ + pr_debug("#include \n"); + pr_debug("#include \n"); + pr_debug("\n"); + dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p); + dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200); + dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run); /* Invalidate the icache for these ranges */ local_flush_icache_range((unsigned long)gebase, @@ -408,17 +432,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_mips_deliver_interrupts(vcpu, kvm_read_c0_guest_cause(vcpu->arch.cop0)); - __kvm_guest_enter(); + guest_enter_irqoff(); /* Disable hardware page table walking while in guest */ htw_stop(); + trace_kvm_enter(vcpu); r = vcpu->arch.vcpu_run(run, vcpu); + trace_kvm_out(vcpu); /* Re-enable HTW before enabling interrupts */ htw_start(); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); if (vcpu->sigset_active) @@ -507,8 +533,10 @@ static u64 kvm_mips_get_one_regs[] = { KVM_REG_MIPS_R30, KVM_REG_MIPS_R31, +#ifndef CONFIG_CPU_MIPSR6 KVM_REG_MIPS_HI, KVM_REG_MIPS_LO, +#endif KVM_REG_MIPS_PC, KVM_REG_MIPS_CP0_INDEX, @@ -539,6 +567,104 @@ static u64 kvm_mips_get_one_regs[] = { KVM_REG_MIPS_COUNT_HZ, }; +static u64 kvm_mips_get_one_regs_fpu[] = { + KVM_REG_MIPS_FCR_IR, + KVM_REG_MIPS_FCR_CSR, +}; + +static u64 kvm_mips_get_one_regs_msa[] = { + KVM_REG_MIPS_MSA_IR, + KVM_REG_MIPS_MSA_CSR, +}; + +static u64 kvm_mips_get_one_regs_kscratch[] = { + KVM_REG_MIPS_CP0_KSCRATCH1, + KVM_REG_MIPS_CP0_KSCRATCH2, + KVM_REG_MIPS_CP0_KSCRATCH3, + KVM_REG_MIPS_CP0_KSCRATCH4, + KVM_REG_MIPS_CP0_KSCRATCH5, + KVM_REG_MIPS_CP0_KSCRATCH6, +}; + +static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu) +{ + unsigned long ret; + + ret = ARRAY_SIZE(kvm_mips_get_one_regs); + if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) { + ret += ARRAY_SIZE(kvm_mips_get_one_regs_fpu) + 48; + /* odd doubles */ + if (boot_cpu_data.fpu_id & MIPS_FPIR_F64) + ret += 16; + } + if (kvm_mips_guest_can_have_msa(&vcpu->arch)) + ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32; + ret += __arch_hweight8(vcpu->arch.kscratch_enabled); + ret += kvm_mips_callbacks->num_regs(vcpu); + + return ret; +} + +static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) +{ + u64 index; + unsigned int i; + + if (copy_to_user(indices, kvm_mips_get_one_regs, + sizeof(kvm_mips_get_one_regs))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_mips_get_one_regs); + + if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) { + if (copy_to_user(indices, kvm_mips_get_one_regs_fpu, + sizeof(kvm_mips_get_one_regs_fpu))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_mips_get_one_regs_fpu); + + for (i = 0; i < 32; ++i) { + index = KVM_REG_MIPS_FPR_32(i); + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + + /* skip odd doubles if no F64 */ + if (i & 1 && !(boot_cpu_data.fpu_id & MIPS_FPIR_F64)) + continue; + + index = KVM_REG_MIPS_FPR_64(i); + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } + } + + if (kvm_mips_guest_can_have_msa(&vcpu->arch)) { + if (copy_to_user(indices, kvm_mips_get_one_regs_msa, + sizeof(kvm_mips_get_one_regs_msa))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_mips_get_one_regs_msa); + + for (i = 0; i < 32; ++i) { + index = KVM_REG_MIPS_VEC_128(i); + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } + } + + for (i = 0; i < 6; ++i) { + if (!(vcpu->arch.kscratch_enabled & BIT(i + 2))) + continue; + + if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i], + sizeof(kvm_mips_get_one_regs_kscratch[i]))) + return -EFAULT; + ++indices; + } + + return kvm_mips_callbacks->copy_reg_indices(vcpu, indices); +} + static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { @@ -554,12 +680,14 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_R0 ... KVM_REG_MIPS_R31: v = (long)vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0]; break; +#ifndef CONFIG_CPU_MIPSR6 case KVM_REG_MIPS_HI: v = (long)vcpu->arch.hi; break; case KVM_REG_MIPS_LO: v = (long)vcpu->arch.lo; break; +#endif case KVM_REG_MIPS_PC: v = (long)vcpu->arch.pc; break; @@ -688,17 +816,37 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_ERROREPC: v = (long)kvm_read_c0_guest_errorepc(cop0); break; + case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6: + idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; + if (!(vcpu->arch.kscratch_enabled & BIT(idx))) + return -EINVAL; + switch (idx) { + case 2: + v = (long)kvm_read_c0_guest_kscratch1(cop0); + break; + case 3: + v = (long)kvm_read_c0_guest_kscratch2(cop0); + break; + case 4: + v = (long)kvm_read_c0_guest_kscratch3(cop0); + break; + case 5: + v = (long)kvm_read_c0_guest_kscratch4(cop0); + break; + case 6: + v = (long)kvm_read_c0_guest_kscratch5(cop0); + break; + case 7: + v = (long)kvm_read_c0_guest_kscratch6(cop0); + break; + } + break; /* registers to be handled specially */ - case KVM_REG_MIPS_CP0_COUNT: - case KVM_REG_MIPS_COUNT_CTL: - case KVM_REG_MIPS_COUNT_RESUME: - case KVM_REG_MIPS_COUNT_HZ: + default: ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v); if (ret) return ret; break; - default: - return -EINVAL; } if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64) { u64 __user *uaddr64 = (u64 __user *)(long)reg->addr; @@ -755,12 +903,14 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_R1 ... KVM_REG_MIPS_R31: vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0] = v; break; +#ifndef CONFIG_CPU_MIPSR6 case KVM_REG_MIPS_HI: vcpu->arch.hi = v; break; case KVM_REG_MIPS_LO: vcpu->arch.lo = v; break; +#endif case KVM_REG_MIPS_PC: vcpu->arch.pc = v; break; @@ -859,22 +1009,34 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_ERROREPC: kvm_write_c0_guest_errorepc(cop0, v); break; + case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6: + idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; + if (!(vcpu->arch.kscratch_enabled & BIT(idx))) + return -EINVAL; + switch (idx) { + case 2: + kvm_write_c0_guest_kscratch1(cop0, v); + break; + case 3: + kvm_write_c0_guest_kscratch2(cop0, v); + break; + case 4: + kvm_write_c0_guest_kscratch3(cop0, v); + break; + case 5: + kvm_write_c0_guest_kscratch4(cop0, v); + break; + case 6: + kvm_write_c0_guest_kscratch5(cop0, v); + break; + case 7: + kvm_write_c0_guest_kscratch6(cop0, v); + break; + } + break; /* registers to be handled specially */ - case KVM_REG_MIPS_CP0_COUNT: - case KVM_REG_MIPS_CP0_COMPARE: - case KVM_REG_MIPS_CP0_CAUSE: - case KVM_REG_MIPS_CP0_CONFIG: - case KVM_REG_MIPS_CP0_CONFIG1: - case KVM_REG_MIPS_CP0_CONFIG2: - case KVM_REG_MIPS_CP0_CONFIG3: - case KVM_REG_MIPS_CP0_CONFIG4: - case KVM_REG_MIPS_CP0_CONFIG5: - case KVM_REG_MIPS_COUNT_CTL: - case KVM_REG_MIPS_COUNT_RESUME: - case KVM_REG_MIPS_COUNT_HZ: - return kvm_mips_callbacks->set_one_reg(vcpu, reg, v); default: - return -EINVAL; + return kvm_mips_callbacks->set_one_reg(vcpu, reg, v); } return 0; } @@ -927,23 +1089,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, } case KVM_GET_REG_LIST: { struct kvm_reg_list __user *user_list = argp; - u64 __user *reg_dest; struct kvm_reg_list reg_list; unsigned n; if (copy_from_user(®_list, user_list, sizeof(reg_list))) return -EFAULT; n = reg_list.n; - reg_list.n = ARRAY_SIZE(kvm_mips_get_one_regs); + reg_list.n = kvm_mips_num_regs(vcpu); if (copy_to_user(user_list, ®_list, sizeof(reg_list))) return -EFAULT; if (n < reg_list.n) return -E2BIG; - reg_dest = user_list->reg; - if (copy_to_user(reg_dest, kvm_mips_get_one_regs, - sizeof(kvm_mips_get_one_regs))) - return -EFAULT; - return 0; + return kvm_mips_copy_reg_indices(vcpu, user_list->reg); } case KVM_NMI: /* Treat the NMI as a CPU reset */ @@ -1222,7 +1379,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) static void kvm_mips_set_c0_status(void) { - uint32_t status = read_c0_status(); + u32 status = read_c0_status(); if (cpu_has_dsp) status |= (ST0_MX); @@ -1236,9 +1393,9 @@ static void kvm_mips_set_c0_status(void) */ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) { - uint32_t cause = vcpu->arch.host_cp0_cause; - uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; + u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -1260,6 +1417,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_debug("kvm_mips_handle_exit: cause: %#x, PC: %p, kvm_run: %p, kvm_vcpu: %p\n", cause, opc, run, vcpu); + trace_kvm_exit(vcpu, exccode); /* * Do a privilege check, if in UM most of these exit conditions end up @@ -1279,7 +1437,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_debug("[%d]EXCCODE_INT @ %p\n", vcpu->vcpu_id, opc); ++vcpu->stat.int_exits; - trace_kvm_exit(vcpu, INT_EXITS); if (need_resched()) cond_resched(); @@ -1291,7 +1448,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_debug("EXCCODE_CPU: @ PC: %p\n", opc); ++vcpu->stat.cop_unusable_exits; - trace_kvm_exit(vcpu, COP_UNUSABLE_EXITS); ret = kvm_mips_callbacks->handle_cop_unusable(vcpu); /* XXXKYMA: Might need to return to user space */ if (run->exit_reason == KVM_EXIT_IRQ_WINDOW_OPEN) @@ -1300,7 +1456,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) case EXCCODE_MOD: ++vcpu->stat.tlbmod_exits; - trace_kvm_exit(vcpu, TLBMOD_EXITS); ret = kvm_mips_callbacks->handle_tlb_mod(vcpu); break; @@ -1310,7 +1465,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) badvaddr); ++vcpu->stat.tlbmiss_st_exits; - trace_kvm_exit(vcpu, TLBMISS_ST_EXITS); ret = kvm_mips_callbacks->handle_tlb_st_miss(vcpu); break; @@ -1319,61 +1473,51 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) cause, opc, badvaddr); ++vcpu->stat.tlbmiss_ld_exits; - trace_kvm_exit(vcpu, TLBMISS_LD_EXITS); ret = kvm_mips_callbacks->handle_tlb_ld_miss(vcpu); break; case EXCCODE_ADES: ++vcpu->stat.addrerr_st_exits; - trace_kvm_exit(vcpu, ADDRERR_ST_EXITS); ret = kvm_mips_callbacks->handle_addr_err_st(vcpu); break; case EXCCODE_ADEL: ++vcpu->stat.addrerr_ld_exits; - trace_kvm_exit(vcpu, ADDRERR_LD_EXITS); ret = kvm_mips_callbacks->handle_addr_err_ld(vcpu); break; case EXCCODE_SYS: ++vcpu->stat.syscall_exits; - trace_kvm_exit(vcpu, SYSCALL_EXITS); ret = kvm_mips_callbacks->handle_syscall(vcpu); break; case EXCCODE_RI: ++vcpu->stat.resvd_inst_exits; - trace_kvm_exit(vcpu, RESVD_INST_EXITS); ret = kvm_mips_callbacks->handle_res_inst(vcpu); break; case EXCCODE_BP: ++vcpu->stat.break_inst_exits; - trace_kvm_exit(vcpu, BREAK_INST_EXITS); ret = kvm_mips_callbacks->handle_break(vcpu); break; case EXCCODE_TR: ++vcpu->stat.trap_inst_exits; - trace_kvm_exit(vcpu, TRAP_INST_EXITS); ret = kvm_mips_callbacks->handle_trap(vcpu); break; case EXCCODE_MSAFPE: ++vcpu->stat.msa_fpe_exits; - trace_kvm_exit(vcpu, MSA_FPE_EXITS); ret = kvm_mips_callbacks->handle_msa_fpe(vcpu); break; case EXCCODE_FPE: ++vcpu->stat.fpe_exits; - trace_kvm_exit(vcpu, FPE_EXITS); ret = kvm_mips_callbacks->handle_fpe(vcpu); break; case EXCCODE_MSADIS: ++vcpu->stat.msa_disabled_exits; - trace_kvm_exit(vcpu, MSA_DISABLED_EXITS); ret = kvm_mips_callbacks->handle_msa_disabled(vcpu); break; @@ -1400,11 +1544,13 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) run->exit_reason = KVM_EXIT_INTR; ret = (-EINTR << 2) | RESUME_HOST; ++vcpu->stat.signal_exits; - trace_kvm_exit(vcpu, SIGNAL_EXITS); + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_SIGNAL); } } if (ret == RESUME_GUEST) { + trace_kvm_reenter(vcpu); + /* * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context * is live), restore FCR31 / MSACSR. @@ -1450,7 +1596,7 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu) * not to clobber the status register directly via the commpage. */ if (cpu_has_msa && sr & ST0_CU1 && !(sr & ST0_FR) && - vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) + vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) kvm_lose_fpu(vcpu); /* @@ -1465,9 +1611,12 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu) enable_fpu_hazard(); /* If guest FPU state not active, restore it now */ - if (!(vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)) { + if (!(vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) { __kvm_restore_fpu(&vcpu->arch); - vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU; + vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_FPU); + } else { + trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_FPU); } preempt_enable(); @@ -1494,8 +1643,8 @@ void kvm_own_msa(struct kvm_vcpu *vcpu) * interacts with MSA state, so play it safe and save it first. */ if (!(sr & ST0_FR) && - (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU | - KVM_MIPS_FPU_MSA)) == KVM_MIPS_FPU_FPU) + (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU | + KVM_MIPS_AUX_MSA)) == KVM_MIPS_AUX_FPU) kvm_lose_fpu(vcpu); change_c0_status(ST0_CU1 | ST0_FR, sr); @@ -1509,22 +1658,26 @@ void kvm_own_msa(struct kvm_vcpu *vcpu) set_c0_config5(MIPS_CONF5_MSAEN); enable_fpu_hazard(); - switch (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA)) { - case KVM_MIPS_FPU_FPU: + switch (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA)) { + case KVM_MIPS_AUX_FPU: /* * Guest FPU state already loaded, only restore upper MSA state */ __kvm_restore_msa_upper(&vcpu->arch); - vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA; + vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_MSA); break; case 0: /* Neither FPU or MSA already active, restore full MSA state */ __kvm_restore_msa(&vcpu->arch); - vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA; + vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA; if (kvm_mips_guest_has_fpu(&vcpu->arch)) - vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU; + vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, + KVM_TRACE_AUX_FPU_MSA); break; default: + trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_MSA); break; } @@ -1536,13 +1689,15 @@ void kvm_own_msa(struct kvm_vcpu *vcpu) void kvm_drop_fpu(struct kvm_vcpu *vcpu) { preempt_disable(); - if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) { + if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) { disable_msa(); - vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_MSA; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_MSA); + vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_MSA; } - if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) { + if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) { clear_c0_status(ST0_CU1 | ST0_FR); - vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_FPU); + vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU; } preempt_enable(); } @@ -1558,25 +1713,27 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) */ preempt_disable(); - if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) { + if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) { set_c0_config5(MIPS_CONF5_MSAEN); enable_fpu_hazard(); __kvm_save_msa(&vcpu->arch); + trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU_MSA); /* Disable MSA & FPU */ disable_msa(); - if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) { + if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) { clear_c0_status(ST0_CU1 | ST0_FR); disable_fpu_hazard(); } - vcpu->arch.fpu_inuse &= ~(KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA); - } else if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) { + vcpu->arch.aux_inuse &= ~(KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA); + } else if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) { set_c0_status(ST0_CU1); enable_fpu_hazard(); __kvm_save_fpu(&vcpu->arch); - vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU; + vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU); /* Disable FPU */ clear_c0_status(ST0_CU1 | ST0_FR); @@ -1638,6 +1795,10 @@ static int __init kvm_mips_init(void) { int ret; + ret = kvm_mips_entry_setup(); + if (ret) + return ret; + ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (ret) @@ -1645,18 +1806,6 @@ static int __init kvm_mips_init(void) register_die_notifier(&kvm_mips_csr_die_notifier); - /* - * On MIPS, kernel modules are executed from "mapped space", which - * requires TLBs. The TLB handling code is statically linked with - * the rest of the kernel (tlb.c) to avoid the possibility of - * double faulting. The issue is that the TLB code references - * routines that are part of the the KVM module, which are only - * available once the module is loaded. - */ - kvm_mips_gfn_to_pfn = gfn_to_pfn; - kvm_mips_release_pfn_clean = kvm_release_pfn_clean; - kvm_mips_is_error_pfn = is_error_pfn; - return 0; } @@ -1664,10 +1813,6 @@ static void __exit kvm_mips_exit(void) { kvm_exit(); - kvm_mips_gfn_to_pfn = NULL; - kvm_mips_release_pfn_clean = NULL; - kvm_mips_is_error_pfn = NULL; - unregister_die_notifier(&kvm_mips_csr_die_notifier); } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c new file mode 100644 index 000000000000..57319ee57c4f --- /dev/null +++ b/arch/mips/kvm/mmu.c @@ -0,0 +1,375 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * KVM/MIPS MMU handling in the KVM module. + * + * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. + * Authors: Sanjay Lal + */ + +#include +#include +#include + +static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) +{ + int cpu = smp_processor_id(); + + return vcpu->arch.guest_kernel_asid[cpu] & + cpu_asid_mask(&cpu_data[cpu]); +} + +static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) +{ + int cpu = smp_processor_id(); + + return vcpu->arch.guest_user_asid[cpu] & + cpu_asid_mask(&cpu_data[cpu]); +} + +static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) +{ + int srcu_idx, err = 0; + kvm_pfn_t pfn; + + if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE) + return 0; + + srcu_idx = srcu_read_lock(&kvm->srcu); + pfn = gfn_to_pfn(kvm, gfn); + + if (is_error_pfn(pfn)) { + kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn); + err = -EFAULT; + goto out; + } + + kvm->arch.guest_pmap[gfn] = pfn; +out: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return err; +} + +/* Translate guest KSEG0 addresses to Host PA */ +unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, + unsigned long gva) +{ + gfn_t gfn; + unsigned long offset = gva & ~PAGE_MASK; + struct kvm *kvm = vcpu->kvm; + + if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) { + kvm_err("%s/%p: Invalid gva: %#lx\n", __func__, + __builtin_return_address(0), gva); + return KVM_INVALID_PAGE; + } + + gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT); + + if (gfn >= kvm->arch.guest_pmap_npages) { + kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn, + gva); + return KVM_INVALID_PAGE; + } + + if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) + return KVM_INVALID_ADDR; + + return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; +} + +/* XXXKYMA: Must be called with interrupts disabled */ +int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, + struct kvm_vcpu *vcpu) +{ + gfn_t gfn; + kvm_pfn_t pfn0, pfn1; + unsigned long vaddr = 0; + unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; + struct kvm *kvm = vcpu->kvm; + const int flush_dcache_mask = 0; + int ret; + + if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { + kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); + kvm_mips_dump_host_tlbs(); + return -1; + } + + gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT); + if (gfn >= kvm->arch.guest_pmap_npages) { + kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__, + gfn, badvaddr); + kvm_mips_dump_host_tlbs(); + return -1; + } + vaddr = badvaddr & (PAGE_MASK << 1); + + if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) + return -1; + + if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0) + return -1; + + pfn0 = kvm->arch.guest_pmap[gfn & ~0x1]; + pfn1 = kvm->arch.guest_pmap[gfn | 0x1]; + + entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | + ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | + ENTRYLO_D | ENTRYLO_V; + entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | + ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | + ENTRYLO_D | ENTRYLO_V; + + preempt_disable(); + entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu)); + ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, + flush_dcache_mask); + preempt_enable(); + + return ret; +} + +int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, + struct kvm_mips_tlb *tlb) +{ + unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; + struct kvm *kvm = vcpu->kvm; + kvm_pfn_t pfn0, pfn1; + int ret; + + if ((tlb->tlb_hi & VPN2_MASK) == 0) { + pfn0 = 0; + pfn1 = 0; + } else { + if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) + >> PAGE_SHIFT) < 0) + return -1; + + if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) + >> PAGE_SHIFT) < 0) + return -1; + + pfn0 = kvm->arch.guest_pmap[ + mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) >> PAGE_SHIFT]; + pfn1 = kvm->arch.guest_pmap[ + mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) >> PAGE_SHIFT]; + } + + /* Get attributes from the Guest TLB */ + entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | + ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | + (tlb->tlb_lo[0] & ENTRYLO_D) | + (tlb->tlb_lo[0] & ENTRYLO_V); + entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | + ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | + (tlb->tlb_lo[1] & ENTRYLO_D) | + (tlb->tlb_lo[1] & ENTRYLO_V); + + kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, + tlb->tlb_lo[0], tlb->tlb_lo[1]); + + preempt_disable(); + entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ? + kvm_mips_get_kernel_asid(vcpu) : + kvm_mips_get_user_asid(vcpu)); + ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, + tlb->tlb_mask); + preempt_enable(); + + return ret; +} + +void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, + struct kvm_vcpu *vcpu) +{ + unsigned long asid = asid_cache(cpu); + + asid += cpu_asid_inc(); + if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) { + if (cpu_has_vtag_icache) + flush_icache_all(); + + kvm_local_flush_tlb_all(); /* start new asid cycle */ + + if (!asid) /* fix version if needed */ + asid = asid_first_version(cpu); + } + + cpu_context(cpu, mm) = asid_cache(cpu) = asid; +} + +/** + * kvm_mips_migrate_count() - Migrate timer. + * @vcpu: Virtual CPU. + * + * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it + * if it was running prior to being cancelled. + * + * Must be called when the VCPU is migrated to a different CPU to ensure that + * timer expiry during guest execution interrupts the guest and causes the + * interrupt to be delivered in a timely manner. + */ +static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu) +{ + if (hrtimer_cancel(&vcpu->arch.comparecount_timer)) + hrtimer_restart(&vcpu->arch.comparecount_timer); +} + +/* Restore ASID once we are scheduled back after preemption */ +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); + unsigned long flags; + int newasid = 0; + + kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu); + + /* Allocate new kernel and user ASIDs if needed */ + + local_irq_save(flags); + + if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) & + asid_version_mask(cpu)) { + kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); + vcpu->arch.guest_kernel_asid[cpu] = + vcpu->arch.guest_kernel_mm.context.asid[cpu]; + kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); + vcpu->arch.guest_user_asid[cpu] = + vcpu->arch.guest_user_mm.context.asid[cpu]; + newasid++; + + kvm_debug("[%d]: cpu_context: %#lx\n", cpu, + cpu_context(cpu, current->mm)); + kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", + cpu, vcpu->arch.guest_kernel_asid[cpu]); + kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, + vcpu->arch.guest_user_asid[cpu]); + } + + if (vcpu->arch.last_sched_cpu != cpu) { + kvm_debug("[%d->%d]KVM VCPU[%d] switch\n", + vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); + /* + * Migrate the timer interrupt to the current CPU so that it + * always interrupts the guest and synchronously triggers a + * guest timer interrupt. + */ + kvm_mips_migrate_count(vcpu); + } + + if (!newasid) { + /* + * If we preempted while the guest was executing, then reload + * the pre-empted ASID + */ + if (current->flags & PF_VCPU) { + write_c0_entryhi(vcpu->arch. + preempt_entryhi & asid_mask); + ehb(); + } + } else { + /* New ASIDs were allocated for the VM */ + + /* + * Were we in guest context? If so then the pre-empted ASID is + * no longer valid, we need to set it to what it should be based + * on the mode of the Guest (Kernel/User) + */ + if (current->flags & PF_VCPU) { + if (KVM_GUEST_KERNEL_MODE(vcpu)) + write_c0_entryhi(vcpu->arch. + guest_kernel_asid[cpu] & + asid_mask); + else + write_c0_entryhi(vcpu->arch. + guest_user_asid[cpu] & + asid_mask); + ehb(); + } + } + + /* restore guest state to registers */ + kvm_mips_callbacks->vcpu_set_regs(vcpu); + + local_irq_restore(flags); + +} + +/* ASID can change if another task is scheduled during preemption */ +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + int cpu; + + local_irq_save(flags); + + cpu = smp_processor_id(); + + vcpu->arch.preempt_entryhi = read_c0_entryhi(); + vcpu->arch.last_sched_cpu = cpu; + + /* save guest state in registers */ + kvm_mips_callbacks->vcpu_get_regs(vcpu); + + if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu))) { + kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, + cpu_context(cpu, current->mm)); + drop_mmu_context(current->mm, cpu); + } + write_c0_entryhi(cpu_asid(cpu, current->mm)); + ehb(); + + local_irq_restore(flags); +} + +u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + unsigned long paddr, flags, vpn2, asid; + unsigned long va = (unsigned long)opc; + void *vaddr; + u32 inst; + int index; + + if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 || + KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) { + local_irq_save(flags); + index = kvm_mips_host_tlb_lookup(vcpu, va); + if (index >= 0) { + inst = *(opc); + } else { + vpn2 = va & VPN2_MASK; + asid = kvm_read_c0_guest_entryhi(cop0) & + KVM_ENTRYHI_ASID; + index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid); + if (index < 0) { + kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n", + __func__, opc, vcpu, read_c0_entryhi()); + kvm_mips_dump_host_tlbs(); + kvm_mips_dump_guest_tlbs(vcpu); + local_irq_restore(flags); + return KVM_INVALID_INST; + } + kvm_mips_handle_mapped_seg_tlb_fault(vcpu, + &vcpu->arch. + guest_tlb[index]); + inst = *(opc); + } + local_irq_restore(flags); + } else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) { + paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va); + vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr))); + vaddr += paddr & ~PAGE_MASK; + inst = *(u32 *)vaddr; + kunmap_atomic(vaddr); + } else { + kvm_err("%s: illegal address: %p\n", __func__, opc); + return KVM_INVALID_INST; + } + + return inst; +} diff --git a/arch/mips/kvm/stats.c b/arch/mips/kvm/stats.c index 888bb67070ac..53f851a61554 100644 --- a/arch/mips/kvm/stats.c +++ b/arch/mips/kvm/stats.c @@ -11,27 +11,6 @@ #include -char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES] = { - "WAIT", - "CACHE", - "Signal", - "Interrupt", - "COP0/1 Unusable", - "TLB Mod", - "TLB Miss (LD)", - "TLB Miss (ST)", - "Address Err (ST)", - "Address Error (LD)", - "System Call", - "Reserved Inst", - "Break Inst", - "Trap Inst", - "MSA FPE", - "FPE", - "MSA Disabled", - "D-Cache Flushes", -}; - char *kvm_cop0_str[N_MIPS_COPROC_REGS] = { "Index", "Random", diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index ed021ae7867a..254377d8e0b9 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -24,6 +24,7 @@ #include #include #include +#include #undef CONFIG_MIPS_MT #include @@ -32,22 +33,10 @@ #define KVM_GUEST_PC_TLB 0 #define KVM_GUEST_SP_TLB 1 -#define PRIx64 "llx" - atomic_t kvm_mips_instance; EXPORT_SYMBOL_GPL(kvm_mips_instance); -/* These function pointers are initialized once the KVM module is loaded */ -kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn); -EXPORT_SYMBOL_GPL(kvm_mips_gfn_to_pfn); - -void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn); -EXPORT_SYMBOL_GPL(kvm_mips_release_pfn_clean); - -bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn); -EXPORT_SYMBOL_GPL(kvm_mips_is_error_pfn); - -uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) +static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); @@ -55,7 +44,7 @@ uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) cpu_asid_mask(&cpu_data[cpu]); } -uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) +static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); @@ -63,7 +52,7 @@ uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) cpu_asid_mask(&cpu_data[cpu]); } -inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu) +inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.commpage_tlb; } @@ -72,50 +61,15 @@ inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu) void kvm_mips_dump_host_tlbs(void) { - unsigned long old_entryhi; - unsigned long old_pagemask; - struct kvm_mips_tlb tlb; unsigned long flags; - int i; local_irq_save(flags); - old_entryhi = read_c0_entryhi(); - old_pagemask = read_c0_pagemask(); - kvm_info("HOST TLBs:\n"); - kvm_info("ASID: %#lx\n", read_c0_entryhi() & - cpu_asid_mask(¤t_cpu_data)); + dump_tlb_regs(); + pr_info("\n"); + dump_tlb_all(); - for (i = 0; i < current_cpu_data.tlbsize; i++) { - write_c0_index(i); - mtc0_tlbw_hazard(); - - tlb_read(); - tlbw_use_hazard(); - - tlb.tlb_hi = read_c0_entryhi(); - tlb.tlb_lo0 = read_c0_entrylo0(); - tlb.tlb_lo1 = read_c0_entrylo1(); - tlb.tlb_mask = read_c0_pagemask(); - - kvm_info("TLB%c%3d Hi 0x%08lx ", - (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*', - i, tlb.tlb_hi); - kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ", - (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0), - (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ', - (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ', - (tlb.tlb_lo0 >> 3) & 7); - kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n", - (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1), - (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ', - (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ', - (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask); - } - write_c0_entryhi(old_entryhi); - write_c0_pagemask(old_pagemask); - mtc0_tlbw_hazard(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kvm_mips_dump_host_tlbs); @@ -132,74 +86,24 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu) for (i = 0; i < KVM_MIPS_GUEST_TLB_SIZE; i++) { tlb = vcpu->arch.guest_tlb[i]; kvm_info("TLB%c%3d Hi 0x%08lx ", - (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*', + (tlb.tlb_lo[0] | tlb.tlb_lo[1]) & ENTRYLO_V + ? ' ' : '*', i, tlb.tlb_hi); - kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ", - (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0), - (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ', - (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ', - (tlb.tlb_lo0 >> 3) & 7); - kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n", - (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1), - (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ', - (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ', - (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask); + kvm_info("Lo0=0x%09llx %c%c attr %lx ", + (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[0]), + (tlb.tlb_lo[0] & ENTRYLO_D) ? 'D' : ' ', + (tlb.tlb_lo[0] & ENTRYLO_G) ? 'G' : ' ', + (tlb.tlb_lo[0] & ENTRYLO_C) >> ENTRYLO_C_SHIFT); + kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n", + (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[1]), + (tlb.tlb_lo[1] & ENTRYLO_D) ? 'D' : ' ', + (tlb.tlb_lo[1] & ENTRYLO_G) ? 'G' : ' ', + (tlb.tlb_lo[1] & ENTRYLO_C) >> ENTRYLO_C_SHIFT, + tlb.tlb_mask); } } EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs); -static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) -{ - int srcu_idx, err = 0; - kvm_pfn_t pfn; - - if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE) - return 0; - - srcu_idx = srcu_read_lock(&kvm->srcu); - pfn = kvm_mips_gfn_to_pfn(kvm, gfn); - - if (kvm_mips_is_error_pfn(pfn)) { - kvm_err("Couldn't get pfn for gfn %#" PRIx64 "!\n", gfn); - err = -EFAULT; - goto out; - } - - kvm->arch.guest_pmap[gfn] = pfn; -out: - srcu_read_unlock(&kvm->srcu, srcu_idx); - return err; -} - -/* Translate guest KSEG0 addresses to Host PA */ -unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, - unsigned long gva) -{ - gfn_t gfn; - uint32_t offset = gva & ~PAGE_MASK; - struct kvm *kvm = vcpu->kvm; - - if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) { - kvm_err("%s/%p: Invalid gva: %#lx\n", __func__, - __builtin_return_address(0), gva); - return KVM_INVALID_PAGE; - } - - gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT); - - if (gfn >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn, - gva); - return KVM_INVALID_PAGE; - } - - if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) - return KVM_INVALID_ADDR; - - return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; -} -EXPORT_SYMBOL_GPL(kvm_mips_translate_guest_kseg0_to_hpa); - /* XXXKYMA: Must be called with interrupts disabled */ /* set flush_dcache_mask == 0 if no dcache flush required */ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, @@ -243,12 +147,12 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, /* Flush D-cache */ if (flush_dcache_mask) { - if (entrylo0 & MIPS3_PG_V) { + if (entrylo0 & ENTRYLO_V) { ++vcpu->stat.flush_dcache_exits; flush_data_cache_page((entryhi & VPN2_MASK) & ~flush_dcache_mask); } - if (entrylo1 & MIPS3_PG_V) { + if (entrylo1 & ENTRYLO_V) { ++vcpu->stat.flush_dcache_exits; flush_data_cache_page(((entryhi & VPN2_MASK) & ~flush_dcache_mask) | @@ -259,96 +163,35 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, /* Restore old ASID */ write_c0_entryhi(old_entryhi); mtc0_tlbw_hazard(); - tlbw_use_hazard(); local_irq_restore(flags); return 0; } - -/* XXXKYMA: Must be called with interrupts disabled */ -int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, - struct kvm_vcpu *vcpu) -{ - gfn_t gfn; - kvm_pfn_t pfn0, pfn1; - unsigned long vaddr = 0; - unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; - int even; - struct kvm *kvm = vcpu->kvm; - const int flush_dcache_mask = 0; - int ret; - - if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { - kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); - kvm_mips_dump_host_tlbs(); - return -1; - } - - gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT); - if (gfn >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__, - gfn, badvaddr); - kvm_mips_dump_host_tlbs(); - return -1; - } - even = !(gfn & 0x1); - vaddr = badvaddr & (PAGE_MASK << 1); - - if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) - return -1; - - if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0) - return -1; - - if (even) { - pfn0 = kvm->arch.guest_pmap[gfn]; - pfn1 = kvm->arch.guest_pmap[gfn ^ 0x1]; - } else { - pfn0 = kvm->arch.guest_pmap[gfn ^ 0x1]; - pfn1 = kvm->arch.guest_pmap[gfn]; - } - - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | - (1 << 2) | (0x1 << 1); - entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | - (1 << 2) | (0x1 << 1); - - preempt_disable(); - entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu)); - ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, - flush_dcache_mask); - preempt_enable(); - - return ret; -} -EXPORT_SYMBOL_GPL(kvm_mips_handle_kseg0_tlb_fault); +EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write); int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu) { - kvm_pfn_t pfn0, pfn1; + kvm_pfn_t pfn; unsigned long flags, old_entryhi = 0, vaddr = 0; - unsigned long entrylo0 = 0, entrylo1 = 0; + unsigned long entrylo[2] = { 0, 0 }; + unsigned int pair_idx; - pfn0 = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT; - pfn1 = 0; - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | - (1 << 2) | (0x1 << 1); - entrylo1 = 0; + pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage)); + pair_idx = (badvaddr >> PAGE_SHIFT) & 1; + entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) | + ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | + ENTRYLO_D | ENTRYLO_V; local_irq_save(flags); old_entryhi = read_c0_entryhi(); vaddr = badvaddr & (PAGE_MASK << 1); write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu)); - mtc0_tlbw_hazard(); - write_c0_entrylo0(entrylo0); - mtc0_tlbw_hazard(); - write_c0_entrylo1(entrylo1); - mtc0_tlbw_hazard(); + write_c0_entrylo0(entrylo[0]); + write_c0_entrylo1(entrylo[1]); write_c0_index(kvm_mips_get_commpage_asid(vcpu)); mtc0_tlbw_hazard(); tlb_write_indexed(); - mtc0_tlbw_hazard(); tlbw_use_hazard(); kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n", @@ -358,68 +201,12 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, /* Restore old ASID */ write_c0_entryhi(old_entryhi); mtc0_tlbw_hazard(); - tlbw_use_hazard(); local_irq_restore(flags); return 0; } EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault); -int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, - struct kvm_mips_tlb *tlb, - unsigned long *hpa0, - unsigned long *hpa1) -{ - unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; - struct kvm *kvm = vcpu->kvm; - kvm_pfn_t pfn0, pfn1; - int ret; - - if ((tlb->tlb_hi & VPN2_MASK) == 0) { - pfn0 = 0; - pfn1 = 0; - } else { - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0) - >> PAGE_SHIFT) < 0) - return -1; - - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1) - >> PAGE_SHIFT) < 0) - return -1; - - pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0) - >> PAGE_SHIFT]; - pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1) - >> PAGE_SHIFT]; - } - - if (hpa0) - *hpa0 = pfn0 << PAGE_SHIFT; - - if (hpa1) - *hpa1 = pfn1 << PAGE_SHIFT; - - /* Get attributes from the Guest TLB */ - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | - (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V); - entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | - (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V); - - kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, - tlb->tlb_lo0, tlb->tlb_lo1); - - preempt_disable(); - entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ? - kvm_mips_get_kernel_asid(vcpu) : - kvm_mips_get_user_asid(vcpu)); - ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, - tlb->tlb_mask); - preempt_enable(); - - return ret; -} -EXPORT_SYMBOL_GPL(kvm_mips_handle_mapped_seg_tlb_fault); - int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi) { int i; @@ -435,7 +222,7 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi) } kvm_debug("%s: entryhi: %#lx, index: %d lo0: %#lx, lo1: %#lx\n", - __func__, entryhi, index, tlb[i].tlb_lo0, tlb[i].tlb_lo1); + __func__, entryhi, index, tlb[i].tlb_lo[0], tlb[i].tlb_lo[1]); return index; } @@ -467,7 +254,6 @@ int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr) /* Restore old ASID */ write_c0_entryhi(old_entryhi); mtc0_tlbw_hazard(); - tlbw_use_hazard(); local_irq_restore(flags); @@ -498,21 +284,16 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) if (idx > 0) { write_c0_entryhi(UNIQUE_ENTRYHI(idx)); - mtc0_tlbw_hazard(); - write_c0_entrylo0(0); - mtc0_tlbw_hazard(); - write_c0_entrylo1(0); mtc0_tlbw_hazard(); tlb_write_indexed(); - mtc0_tlbw_hazard(); + tlbw_use_hazard(); } write_c0_entryhi(old_entryhi); mtc0_tlbw_hazard(); - tlbw_use_hazard(); local_irq_restore(flags); @@ -540,61 +321,39 @@ void kvm_mips_flush_host_tlb(int skip_kseg0) /* Blast 'em all away. */ for (entry = 0; entry < maxentry; entry++) { write_c0_index(entry); - mtc0_tlbw_hazard(); if (skip_kseg0) { + mtc0_tlbr_hazard(); tlb_read(); - tlbw_use_hazard(); + tlb_read_hazard(); entryhi = read_c0_entryhi(); /* Don't blow away guest kernel entries */ if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0) continue; + + write_c0_pagemask(old_pagemask); } /* Make sure all entries differ. */ write_c0_entryhi(UNIQUE_ENTRYHI(entry)); - mtc0_tlbw_hazard(); write_c0_entrylo0(0); - mtc0_tlbw_hazard(); write_c0_entrylo1(0); mtc0_tlbw_hazard(); tlb_write_indexed(); - mtc0_tlbw_hazard(); + tlbw_use_hazard(); } - tlbw_use_hazard(); - write_c0_entryhi(old_entryhi); write_c0_pagemask(old_pagemask); mtc0_tlbw_hazard(); - tlbw_use_hazard(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb); -void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, - struct kvm_vcpu *vcpu) -{ - unsigned long asid = asid_cache(cpu); - - asid += cpu_asid_inc(); - if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) { - if (cpu_has_vtag_icache) - flush_icache_all(); - - kvm_local_flush_tlb_all(); /* start new asid cycle */ - - if (!asid) /* fix version if needed */ - asid = asid_first_version(cpu); - } - - cpu_context(cpu, mm) = asid_cache(cpu) = asid; -} - void kvm_local_flush_tlb_all(void) { unsigned long flags; @@ -614,185 +373,12 @@ void kvm_local_flush_tlb_all(void) write_c0_index(entry); mtc0_tlbw_hazard(); tlb_write_indexed(); + tlbw_use_hazard(); entry++; } - tlbw_use_hazard(); write_c0_entryhi(old_ctx); mtc0_tlbw_hazard(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all); - -/** - * kvm_mips_migrate_count() - Migrate timer. - * @vcpu: Virtual CPU. - * - * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it - * if it was running prior to being cancelled. - * - * Must be called when the VCPU is migrated to a different CPU to ensure that - * timer expiry during guest execution interrupts the guest and causes the - * interrupt to be delivered in a timely manner. - */ -static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu) -{ - if (hrtimer_cancel(&vcpu->arch.comparecount_timer)) - hrtimer_restart(&vcpu->arch.comparecount_timer); -} - -/* Restore ASID once we are scheduled back after preemption */ -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); - unsigned long flags; - int newasid = 0; - - kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu); - - /* Allocate new kernel and user ASIDs if needed */ - - local_irq_save(flags); - - if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) & - asid_version_mask(cpu)) { - kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); - vcpu->arch.guest_kernel_asid[cpu] = - vcpu->arch.guest_kernel_mm.context.asid[cpu]; - kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); - vcpu->arch.guest_user_asid[cpu] = - vcpu->arch.guest_user_mm.context.asid[cpu]; - newasid++; - - kvm_debug("[%d]: cpu_context: %#lx\n", cpu, - cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", - cpu, vcpu->arch.guest_kernel_asid[cpu]); - kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, - vcpu->arch.guest_user_asid[cpu]); - } - - if (vcpu->arch.last_sched_cpu != cpu) { - kvm_debug("[%d->%d]KVM VCPU[%d] switch\n", - vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); - /* - * Migrate the timer interrupt to the current CPU so that it - * always interrupts the guest and synchronously triggers a - * guest timer interrupt. - */ - kvm_mips_migrate_count(vcpu); - } - - if (!newasid) { - /* - * If we preempted while the guest was executing, then reload - * the pre-empted ASID - */ - if (current->flags & PF_VCPU) { - write_c0_entryhi(vcpu->arch. - preempt_entryhi & asid_mask); - ehb(); - } - } else { - /* New ASIDs were allocated for the VM */ - - /* - * Were we in guest context? If so then the pre-empted ASID is - * no longer valid, we need to set it to what it should be based - * on the mode of the Guest (Kernel/User) - */ - if (current->flags & PF_VCPU) { - if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi(vcpu->arch. - guest_kernel_asid[cpu] & - asid_mask); - else - write_c0_entryhi(vcpu->arch. - guest_user_asid[cpu] & - asid_mask); - ehb(); - } - } - - /* restore guest state to registers */ - kvm_mips_callbacks->vcpu_set_regs(vcpu); - - local_irq_restore(flags); - -} -EXPORT_SYMBOL_GPL(kvm_arch_vcpu_load); - -/* ASID can change if another task is scheduled during preemption */ -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -{ - unsigned long flags; - uint32_t cpu; - - local_irq_save(flags); - - cpu = smp_processor_id(); - - vcpu->arch.preempt_entryhi = read_c0_entryhi(); - vcpu->arch.last_sched_cpu = cpu; - - /* save guest state in registers */ - kvm_mips_callbacks->vcpu_get_regs(vcpu); - - if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu))) { - kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, - cpu_context(cpu, current->mm)); - drop_mmu_context(current->mm, cpu); - } - write_c0_entryhi(cpu_asid(cpu, current->mm)); - ehb(); - - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(kvm_arch_vcpu_put); - -uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu) -{ - struct mips_coproc *cop0 = vcpu->arch.cop0; - unsigned long paddr, flags, vpn2, asid; - uint32_t inst; - int index; - - if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 || - KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { - local_irq_save(flags); - index = kvm_mips_host_tlb_lookup(vcpu, (unsigned long) opc); - if (index >= 0) { - inst = *(opc); - } else { - vpn2 = (unsigned long) opc & VPN2_MASK; - asid = kvm_read_c0_guest_entryhi(cop0) & - KVM_ENTRYHI_ASID; - index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid); - if (index < 0) { - kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n", - __func__, opc, vcpu, read_c0_entryhi()); - kvm_mips_dump_host_tlbs(); - local_irq_restore(flags); - return KVM_INVALID_INST; - } - kvm_mips_handle_mapped_seg_tlb_fault(vcpu, - &vcpu->arch. - guest_tlb[index], - NULL, NULL); - inst = *(opc); - } - local_irq_restore(flags); - } else if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) { - paddr = - kvm_mips_translate_guest_kseg0_to_hpa(vcpu, - (unsigned long) opc); - inst = *(uint32_t *) CKSEG0ADDR(paddr); - } else { - kvm_err("%s: illegal address: %p\n", __func__, opc); - return KVM_INVALID_INST; - } - - return inst; -} -EXPORT_SYMBOL_GPL(kvm_get_inst); diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index bd6437f67dc0..c858cf168078 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -17,8 +17,75 @@ #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE trace -/* Tracepoints for VM eists */ -extern char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES]; +/* + * Tracepoints for VM enters + */ +DECLARE_EVENT_CLASS(kvm_transition, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + TP_STRUCT__entry( + __field(unsigned long, pc) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + ), + + TP_printk("PC: 0x%08lx", + __entry->pc) +); + +DEFINE_EVENT(kvm_transition, kvm_enter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); + +DEFINE_EVENT(kvm_transition, kvm_reenter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); + +DEFINE_EVENT(kvm_transition, kvm_out, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); + +/* The first 32 exit reasons correspond to Cause.ExcCode */ +#define KVM_TRACE_EXIT_INT 0 +#define KVM_TRACE_EXIT_TLBMOD 1 +#define KVM_TRACE_EXIT_TLBMISS_LD 2 +#define KVM_TRACE_EXIT_TLBMISS_ST 3 +#define KVM_TRACE_EXIT_ADDRERR_LD 4 +#define KVM_TRACE_EXIT_ADDRERR_ST 5 +#define KVM_TRACE_EXIT_SYSCALL 8 +#define KVM_TRACE_EXIT_BREAK_INST 9 +#define KVM_TRACE_EXIT_RESVD_INST 10 +#define KVM_TRACE_EXIT_COP_UNUSABLE 11 +#define KVM_TRACE_EXIT_TRAP_INST 13 +#define KVM_TRACE_EXIT_MSA_FPE 14 +#define KVM_TRACE_EXIT_FPE 15 +#define KVM_TRACE_EXIT_MSA_DISABLED 21 +/* Further exit reasons */ +#define KVM_TRACE_EXIT_WAIT 32 +#define KVM_TRACE_EXIT_CACHE 33 +#define KVM_TRACE_EXIT_SIGNAL 34 + +/* Tracepoints for VM exits */ +#define kvm_trace_symbol_exit_types \ + { KVM_TRACE_EXIT_INT, "Interrupt" }, \ + { KVM_TRACE_EXIT_TLBMOD, "TLB Mod" }, \ + { KVM_TRACE_EXIT_TLBMISS_LD, "TLB Miss (LD)" }, \ + { KVM_TRACE_EXIT_TLBMISS_ST, "TLB Miss (ST)" }, \ + { KVM_TRACE_EXIT_ADDRERR_LD, "Address Error (LD)" }, \ + { KVM_TRACE_EXIT_ADDRERR_ST, "Address Err (ST)" }, \ + { KVM_TRACE_EXIT_SYSCALL, "System Call" }, \ + { KVM_TRACE_EXIT_BREAK_INST, "Break Inst" }, \ + { KVM_TRACE_EXIT_RESVD_INST, "Reserved Inst" }, \ + { KVM_TRACE_EXIT_COP_UNUSABLE, "COP0/1 Unusable" }, \ + { KVM_TRACE_EXIT_TRAP_INST, "Trap Inst" }, \ + { KVM_TRACE_EXIT_MSA_FPE, "MSA FPE" }, \ + { KVM_TRACE_EXIT_FPE, "FPE" }, \ + { KVM_TRACE_EXIT_MSA_DISABLED, "MSA Disabled" }, \ + { KVM_TRACE_EXIT_WAIT, "WAIT" }, \ + { KVM_TRACE_EXIT_CACHE, "CACHE" }, \ + { KVM_TRACE_EXIT_SIGNAL, "Signal" } TRACE_EVENT(kvm_exit, TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason), @@ -34,10 +101,173 @@ TRACE_EVENT(kvm_exit, ), TP_printk("[%s]PC: 0x%08lx", - kvm_mips_exit_types_str[__entry->reason], + __print_symbolic(__entry->reason, + kvm_trace_symbol_exit_types), __entry->pc) ); +#define KVM_TRACE_MFC0 0 +#define KVM_TRACE_MTC0 1 +#define KVM_TRACE_DMFC0 2 +#define KVM_TRACE_DMTC0 3 +#define KVM_TRACE_RDHWR 4 + +#define KVM_TRACE_HWR_COP0 0 +#define KVM_TRACE_HWR_HWR 1 + +#define KVM_TRACE_COP0(REG, SEL) ((KVM_TRACE_HWR_COP0 << 8) | \ + ((REG) << 3) | (SEL)) +#define KVM_TRACE_HWR(REG, SEL) ((KVM_TRACE_HWR_HWR << 8) | \ + ((REG) << 3) | (SEL)) + +#define kvm_trace_symbol_hwr_ops \ + { KVM_TRACE_MFC0, "MFC0" }, \ + { KVM_TRACE_MTC0, "MTC0" }, \ + { KVM_TRACE_DMFC0, "DMFC0" }, \ + { KVM_TRACE_DMTC0, "DMTC0" }, \ + { KVM_TRACE_RDHWR, "RDHWR" } + +#define kvm_trace_symbol_hwr_cop \ + { KVM_TRACE_HWR_COP0, "COP0" }, \ + { KVM_TRACE_HWR_HWR, "HWR" } + +#define kvm_trace_symbol_hwr_regs \ + { KVM_TRACE_COP0( 0, 0), "Index" }, \ + { KVM_TRACE_COP0( 2, 0), "EntryLo0" }, \ + { KVM_TRACE_COP0( 3, 0), "EntryLo1" }, \ + { KVM_TRACE_COP0( 4, 0), "Context" }, \ + { KVM_TRACE_COP0( 4, 2), "UserLocal" }, \ + { KVM_TRACE_COP0( 5, 0), "PageMask" }, \ + { KVM_TRACE_COP0( 6, 0), "Wired" }, \ + { KVM_TRACE_COP0( 7, 0), "HWREna" }, \ + { KVM_TRACE_COP0( 8, 0), "BadVAddr" }, \ + { KVM_TRACE_COP0( 9, 0), "Count" }, \ + { KVM_TRACE_COP0(10, 0), "EntryHi" }, \ + { KVM_TRACE_COP0(11, 0), "Compare" }, \ + { KVM_TRACE_COP0(12, 0), "Status" }, \ + { KVM_TRACE_COP0(12, 1), "IntCtl" }, \ + { KVM_TRACE_COP0(12, 2), "SRSCtl" }, \ + { KVM_TRACE_COP0(13, 0), "Cause" }, \ + { KVM_TRACE_COP0(14, 0), "EPC" }, \ + { KVM_TRACE_COP0(15, 0), "PRId" }, \ + { KVM_TRACE_COP0(15, 1), "EBase" }, \ + { KVM_TRACE_COP0(16, 0), "Config" }, \ + { KVM_TRACE_COP0(16, 1), "Config1" }, \ + { KVM_TRACE_COP0(16, 2), "Config2" }, \ + { KVM_TRACE_COP0(16, 3), "Config3" }, \ + { KVM_TRACE_COP0(16, 4), "Config4" }, \ + { KVM_TRACE_COP0(16, 5), "Config5" }, \ + { KVM_TRACE_COP0(16, 7), "Config7" }, \ + { KVM_TRACE_COP0(26, 0), "ECC" }, \ + { KVM_TRACE_COP0(30, 0), "ErrorEPC" }, \ + { KVM_TRACE_COP0(31, 2), "KScratch1" }, \ + { KVM_TRACE_COP0(31, 3), "KScratch2" }, \ + { KVM_TRACE_COP0(31, 4), "KScratch3" }, \ + { KVM_TRACE_COP0(31, 5), "KScratch4" }, \ + { KVM_TRACE_COP0(31, 6), "KScratch5" }, \ + { KVM_TRACE_COP0(31, 7), "KScratch6" }, \ + { KVM_TRACE_HWR( 0, 0), "CPUNum" }, \ + { KVM_TRACE_HWR( 1, 0), "SYNCI_Step" }, \ + { KVM_TRACE_HWR( 2, 0), "CC" }, \ + { KVM_TRACE_HWR( 3, 0), "CCRes" }, \ + { KVM_TRACE_HWR(29, 0), "ULR" } + +TRACE_EVENT(kvm_hwr, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, unsigned int reg, + unsigned long val), + TP_ARGS(vcpu, op, reg, val), + TP_STRUCT__entry( + __field(unsigned long, val) + __field(u16, reg) + __field(u8, op) + ), + + TP_fast_assign( + __entry->val = val; + __entry->reg = reg; + __entry->op = op; + ), + + TP_printk("%s %s (%s:%u:%u) 0x%08lx", + __print_symbolic(__entry->op, + kvm_trace_symbol_hwr_ops), + __print_symbolic(__entry->reg, + kvm_trace_symbol_hwr_regs), + __print_symbolic(__entry->reg >> 8, + kvm_trace_symbol_hwr_cop), + (__entry->reg >> 3) & 0x1f, + __entry->reg & 0x7, + __entry->val) +); + +#define KVM_TRACE_AUX_RESTORE 0 +#define KVM_TRACE_AUX_SAVE 1 +#define KVM_TRACE_AUX_ENABLE 2 +#define KVM_TRACE_AUX_DISABLE 3 +#define KVM_TRACE_AUX_DISCARD 4 + +#define KVM_TRACE_AUX_FPU 1 +#define KVM_TRACE_AUX_MSA 2 +#define KVM_TRACE_AUX_FPU_MSA 3 + +#define kvm_trace_symbol_aux_op \ + { KVM_TRACE_AUX_RESTORE, "restore" }, \ + { KVM_TRACE_AUX_SAVE, "save" }, \ + { KVM_TRACE_AUX_ENABLE, "enable" }, \ + { KVM_TRACE_AUX_DISABLE, "disable" }, \ + { KVM_TRACE_AUX_DISCARD, "discard" } + +#define kvm_trace_symbol_aux_state \ + { KVM_TRACE_AUX_FPU, "FPU" }, \ + { KVM_TRACE_AUX_MSA, "MSA" }, \ + { KVM_TRACE_AUX_FPU_MSA, "FPU & MSA" } + +TRACE_EVENT(kvm_aux, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, + unsigned int state), + TP_ARGS(vcpu, op, state), + TP_STRUCT__entry( + __field(unsigned long, pc) + __field(u8, op) + __field(u8, state) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + __entry->op = op; + __entry->state = state; + ), + + TP_printk("%s %s PC: 0x%08lx", + __print_symbolic(__entry->op, + kvm_trace_symbol_aux_op), + __print_symbolic(__entry->state, + kvm_trace_symbol_aux_state), + __entry->pc) +); + +TRACE_EVENT(kvm_asid_change, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int old_asid, + unsigned int new_asid), + TP_ARGS(vcpu, old_asid, new_asid), + TP_STRUCT__entry( + __field(unsigned long, pc) + __field(u8, old_asid) + __field(u8, new_asid) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + __entry->old_asid = old_asid; + __entry->new_asid = new_asid; + ), + + TP_printk("PC: 0x%08lx old: 0x%02x new: 0x%02x", + __entry->pc, + __entry->old_asid, + __entry->new_asid) +); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 6ba0fafcecbc..091553942bcb 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -21,7 +21,7 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva) { gpa_t gpa; - uint32_t kseg = KSEGX(gva); + gva_t kseg = KSEGX(gva); if ((kseg == CKSEG0) || (kseg == CKSEG1)) gpa = CPHYSADDR(gva); @@ -40,8 +40,8 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -87,15 +87,15 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { - kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu); @@ -111,14 +111,14 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) * when we are not using HIGHMEM. Need to address this in a * HIGHMEM kernel */ - kvm_err("TLB MOD fault not handled, cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); kvm_mips_dump_host_tlbs(); kvm_arch_vcpu_dump_regs(vcpu); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } else { - kvm_err("Illegal TLB Mod fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); kvm_mips_dump_host_tlbs(); kvm_arch_vcpu_dump_regs(vcpu); @@ -128,12 +128,12 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) return ret; } -static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu) +static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -145,55 +145,8 @@ static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu) } } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { - kvm_debug("USER ADDR TLB LD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu); - if (er == EMULATE_DONE) - ret = RESUME_GUEST; - else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } - } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { - /* - * All KSEG0 faults are handled by KVM, as the guest kernel does - * not expect to ever get them - */ - if (kvm_mips_handle_kseg0_tlb_fault - (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } - } else { - kvm_err("Illegal TLB LD fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - kvm_mips_dump_host_tlbs(); - kvm_arch_vcpu_dump_regs(vcpu); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } - return ret; -} - -static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; - unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; - unsigned long cause = vcpu->arch.host_cp0_cause; - enum emulation_result er = EMULATE_DONE; - int ret = RESUME_GUEST; - - if (((badvaddr & PAGE_MASK) == KVM_GUEST_COMMPAGE_ADDR) - && KVM_GUEST_KERNEL_MODE(vcpu)) { - if (kvm_mips_handle_commpage_tlb_fault(badvaddr, vcpu) < 0) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } - } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 - || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { - kvm_debug("USER ADDR TLB ST fault: PC: %#lx, BadVaddr: %#lx\n", - vcpu->arch.pc, badvaddr); + kvm_debug("USER ADDR TLB %s fault: cause %#x, PC: %p, BadVaddr: %#lx\n", + store ? "ST" : "LD", cause, opc, badvaddr); /* * User Address (UA) fault, this could happen if @@ -213,14 +166,18 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) ret = RESUME_HOST; } } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { + /* + * All KSEG0 faults are handled by KVM, as the guest kernel does + * not expect to ever get them + */ if (kvm_mips_handle_kseg0_tlb_fault (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } } else { - kvm_err("Illegal TLB ST fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); + kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", + store ? "ST" : "LD", cause, opc, badvaddr); kvm_mips_dump_host_tlbs(); kvm_arch_vcpu_dump_regs(vcpu); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -229,12 +186,22 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) return ret; } +static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu) +{ + return kvm_trap_emul_handle_tlb_miss(vcpu, true); +} + +static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) +{ + return kvm_trap_emul_handle_tlb_miss(vcpu, false); +} + static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -251,7 +218,7 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) ret = RESUME_HOST; } } else { - kvm_err("Address Error (STORE): cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; @@ -262,9 +229,9 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -280,7 +247,7 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) ret = RESUME_HOST; } } else { - kvm_err("Address Error (LOAD): cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; @@ -292,8 +259,8 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -310,8 +277,8 @@ static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -328,8 +295,8 @@ static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -346,8 +313,8 @@ static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 __user *opc = (u32 __user *)vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -364,8 +331,8 @@ static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 __user *opc = (u32 __user *)vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -382,8 +349,8 @@ static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 __user *opc = (u32 __user *)vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -407,8 +374,8 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -451,24 +418,41 @@ static int kvm_trap_emul_vm_init(struct kvm *kvm) static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) { + vcpu->arch.kscratch_enabled = 0xfc; + return 0; } static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - uint32_t config1; + u32 config, config1; int vcpu_id = vcpu->vcpu_id; /* * Arch specific stuff, set up config registers properly so that the - * guest will come up as expected, for now we simulate a MIPS 24kc + * guest will come up as expected */ +#ifndef CONFIG_CPU_MIPSR6 + /* r2-r5, simulate a MIPS 24kc */ kvm_write_c0_guest_prid(cop0, 0x00019300); - /* Have config1, Cacheable, noncoherent, write-back, write allocate */ - kvm_write_c0_guest_config(cop0, MIPS_CONF_M | (0x3 << CP0C0_K0) | - (0x1 << CP0C0_AR) | - (MMU_TYPE_R4000 << CP0C0_MT)); +#else + /* r6+, simulate a generic QEMU machine */ + kvm_write_c0_guest_prid(cop0, 0x00010000); +#endif + /* + * Have config1, Cacheable, noncoherent, write-back, write allocate. + * Endianness, arch revision & virtually tagged icache should match + * host. + */ + config = read_c0_config() & MIPS_CONF_AR; + config |= MIPS_CONF_M | CONF_CM_CACHABLE_NONCOHERENT | MIPS_CONF_MT_TLB; +#ifdef CONFIG_CPU_BIG_ENDIAN + config |= CONF_BE; +#endif + if (cpu_has_vtag_icache) + config |= MIPS_CONF_VI; + kvm_write_c0_guest_config(cop0, config); /* Read the cache characteristics from the host Config1 Register */ config1 = (read_c0_config1() & ~0x7f); @@ -478,9 +462,8 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) config1 |= ((KVM_MIPS_GUEST_TLB_SIZE - 1) << 25); /* We unset some bits that we aren't emulating */ - config1 &= - ~((1 << CP0C1_C2) | (1 << CP0C1_MD) | (1 << CP0C1_PC) | - (1 << CP0C1_WR) | (1 << CP0C1_CA)); + config1 &= ~(MIPS_CONF1_C2 | MIPS_CONF1_MD | MIPS_CONF1_PC | + MIPS_CONF1_WR | MIPS_CONF1_CA); kvm_write_c0_guest_config1(cop0, config1); /* Have config3, no tertiary/secondary caches implemented */ @@ -511,6 +494,17 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu, + u64 __user *indices) +{ + return 0; +} + static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, s64 *v) @@ -660,6 +654,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .dequeue_io_int = kvm_mips_dequeue_io_int_cb, .irq_deliver = kvm_mips_irq_deliver_cb, .irq_clear = kvm_mips_irq_clear_cb, + .num_regs = kvm_trap_emul_num_regs, + .copy_reg_indices = kvm_trap_emul_copy_reg_indices, .get_one_reg = kvm_trap_emul_get_one_reg, .set_one_reg = kvm_trap_emul_set_one_reg, .vcpu_get_regs = kvm_trap_emul_vcpu_get_regs, diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index d96e912b9d44..6dc07fba187f 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -627,8 +627,8 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn, dec_insn.pc_inc + dec_insn.next_pc_inc; return 1; - case cbcond0_op: - case cbcond1_op: + case pop10_op: + case pop30_op: if (!cpu_has_mips_r6) break; if (insn.i_format.rt && !insn.i_format.rs) @@ -683,14 +683,14 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn, dec_insn.next_pc_inc; return 1; - case beqzcjic_op: + case pop66_op: if (!cpu_has_mips_r6) break; *contpc = regs->cp0_epc + dec_insn.pc_inc + dec_insn.next_pc_inc; return 1; - case bnezcjialc_op: + case pop76_op: if (!cpu_has_mips_r6) break; if (!insn.i_format.rs) diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index ef7f925dd1b0..7a9c345e87e5 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -1206,7 +1206,7 @@ static void probe_pcache(void) c->icache.linesz; c->icache.waybit = __ffs(icache_size/c->icache.ways); - if (config & 0x8) /* VI bit */ + if (config & MIPS_CONF_VI) c->icache.flags |= MIPS_CACHE_VTAG; /* diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c index d78178daea4b..277cf52d80e1 100644 --- a/arch/mips/mm/uasm-micromips.c +++ b/arch/mips/mm/uasm-micromips.c @@ -53,8 +53,13 @@ static struct insn insn_table_MM[] = { { insn_bltzl, 0, 0 }, { insn_bne, M(mm_bne32_op, 0, 0, 0, 0, 0), RT | RS | BIMM }, { insn_cache, M(mm_pool32b_op, 0, 0, mm_cache_func, 0, 0), RT | RS | SIMM }, + { insn_cfc1, M(mm_pool32f_op, 0, 0, 0, mm_cfc1_op, mm_32f_73_op), RT | RS }, + { insn_cfcmsa, M(mm_pool32s_op, 0, msa_cfc_op, 0, 0, mm_32s_elm_op), RD | RE }, + { insn_ctc1, M(mm_pool32f_op, 0, 0, 0, mm_ctc1_op, mm_32f_73_op), RT | RS }, + { insn_ctcmsa, M(mm_pool32s_op, 0, msa_ctc_op, 0, 0, mm_32s_elm_op), RD | RE }, { insn_daddu, 0, 0 }, { insn_daddiu, 0, 0 }, + { insn_di, M(mm_pool32a_op, 0, 0, 0, mm_di_op, mm_pool32axf_op), RS }, { insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS }, { insn_dmfc0, 0, 0 }, { insn_dmtc0, 0, 0 }, @@ -84,6 +89,8 @@ static struct insn insn_table_MM[] = { { insn_mfhi, M(mm_pool32a_op, 0, 0, 0, mm_mfhi32_op, mm_pool32axf_op), RS }, { insn_mflo, M(mm_pool32a_op, 0, 0, 0, mm_mflo32_op, mm_pool32axf_op), RS }, { insn_mtc0, M(mm_pool32a_op, 0, 0, 0, mm_mtc0_op, mm_pool32axf_op), RT | RS | RD }, + { insn_mthi, M(mm_pool32a_op, 0, 0, 0, mm_mthi32_op, mm_pool32axf_op), RS }, + { insn_mtlo, M(mm_pool32a_op, 0, 0, 0, mm_mtlo32_op, mm_pool32axf_op), RS }, { insn_mul, M(mm_pool32a_op, 0, 0, 0, 0, mm_mul_op), RT | RS | RD }, { insn_or, M(mm_pool32a_op, 0, 0, 0, 0, mm_or32_op), RT | RS | RD }, { insn_ori, M(mm_ori32_op, 0, 0, 0, 0, 0), RT | RS | UIMM }, @@ -166,13 +173,15 @@ static void build_insn(u32 **buf, enum opcode opc, ...) op = ip->match; va_start(ap, opc); if (ip->fields & RS) { - if (opc == insn_mfc0 || opc == insn_mtc0) + if (opc == insn_mfc0 || opc == insn_mtc0 || + opc == insn_cfc1 || opc == insn_ctc1) op |= build_rt(va_arg(ap, u32)); else op |= build_rs(va_arg(ap, u32)); } if (ip->fields & RT) { - if (opc == insn_mfc0 || opc == insn_mtc0) + if (opc == insn_mfc0 || opc == insn_mtc0 || + opc == insn_cfc1 || opc == insn_ctc1) op |= build_rs(va_arg(ap, u32)); else op |= build_rt(va_arg(ap, u32)); diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c index 9c2220a45189..cec524167822 100644 --- a/arch/mips/mm/uasm-mips.c +++ b/arch/mips/mm/uasm-mips.c @@ -67,9 +67,14 @@ static struct insn insn_table[] = { #else { insn_cache, M6(cache_op, 0, 0, 0, cache6_op), RS | RT | SIMM9 }, #endif + { insn_cfc1, M(cop1_op, cfc_op, 0, 0, 0, 0), RT | RD }, + { insn_cfcmsa, M(msa_op, 0, msa_cfc_op, 0, 0, msa_elm_op), RD | RE }, + { insn_ctc1, M(cop1_op, ctc_op, 0, 0, 0, 0), RT | RD }, + { insn_ctcmsa, M(msa_op, 0, msa_ctc_op, 0, 0, msa_elm_op), RD | RE }, { insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM }, { insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD }, { insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE }, + { insn_di, M(cop0_op, mfmc0_op, 0, 12, 0, 0), RT }, { insn_dins, M(spec3_op, 0, 0, 0, 0, dins_op), RS | RT | RD | RE }, { insn_divu, M(spec_op, 0, 0, 0, 0, divu_op), RS | RT }, { insn_dmfc0, M(cop0_op, dmfc_op, 0, 0, 0, 0), RT | RD | SET}, @@ -114,7 +119,13 @@ static struct insn insn_table[] = { { insn_mflo, M(spec_op, 0, 0, 0, 0, mflo_op), RD }, { insn_mtc0, M(cop0_op, mtc_op, 0, 0, 0, 0), RT | RD | SET}, { insn_mthc0, M(cop0_op, mthc0_op, 0, 0, 0, 0), RT | RD | SET}, + { insn_mthi, M(spec_op, 0, 0, 0, 0, mthi_op), RS }, + { insn_mtlo, M(spec_op, 0, 0, 0, 0, mtlo_op), RS }, +#ifndef CONFIG_CPU_MIPSR6 { insn_mul, M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD}, +#else + { insn_mul, M(spec_op, 0, 0, 0, mult_mul_op, mult_op), RS | RT | RD}, +#endif { insn_ori, M(ori_op, 0, 0, 0, 0, 0), RS | RT | UIMM }, { insn_or, M(spec_op, 0, 0, 0, 0, or_op), RS | RT | RD }, #ifndef CONFIG_CPU_MIPSR6 diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c index ad718debc35a..3e0282d301d6 100644 --- a/arch/mips/mm/uasm.c +++ b/arch/mips/mm/uasm.c @@ -49,18 +49,19 @@ enum opcode { insn_invalid, insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1, insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl, - insn_bne, insn_cache, insn_daddiu, insn_daddu, insn_dins, insn_dinsm, - insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll, + insn_bne, insn_cache, insn_cfc1, insn_cfcmsa, insn_ctc1, insn_ctcmsa, + insn_daddiu, insn_daddu, insn_di, insn_dins, insn_dinsm, insn_divu, + insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll, insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret, insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb, insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0, - insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe, - insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt, - insn_sltiu, insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu, - insn_sw, insn_sync, insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi, - insn_tlbwr, insn_wait, insn_wsbh, insn_xor, insn_xori, insn_yield, - insn_lddir, insn_ldpte, + insn_mthc0, insn_mthi, insn_mtlo, insn_mul, insn_or, insn_ori, + insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, + insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl, + insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp, + insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor, + insn_xori, insn_yield, insn_lddir, insn_ldpte, }; struct insn { @@ -268,10 +269,15 @@ I_u1s2(_bltz) I_u1s2(_bltzl) I_u1u2s3(_bne) I_u2s3u1(_cache) +I_u1u2(_cfc1) +I_u2u1(_cfcmsa) +I_u1u2(_ctc1) +I_u2u1(_ctcmsa) I_u1u2u3(_dmfc0) I_u1u2u3(_dmtc0) I_u2u1s3(_daddiu) I_u3u1u2(_daddu) +I_u1(_di); I_u1u2(_divu) I_u2u1u3(_dsll) I_u2u1u3(_dsll32) @@ -301,6 +307,8 @@ I_u1(_mfhi) I_u1(_mflo) I_u1u2u3(_mtc0) I_u1u2u3(_mthc0) +I_u1(_mthi) +I_u1(_mtlo) I_u3u1u2(_mul) I_u2u1u3(_ori) I_u3u1u2(_or) diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c index f1b11f0dea2d..b4c02f29663e 100644 --- a/arch/mips/pci/pci.c +++ b/arch/mips/pci/pci.c @@ -112,7 +112,14 @@ static void pcibios_scanbus(struct pci_controller *hose) need_domain_info = 1; } - if (!pci_has_flag(PCI_PROBE_ONLY)) { + /* + * We insert PCI resources into the iomem_resource and + * ioport_resource trees in either pci_bus_claim_resources() + * or pci_bus_assign_resources(). + */ + if (pci_has_flag(PCI_PROBE_ONLY)) { + pci_bus_claim_resources(bus); + } else { pci_bus_size_bridges(bus); pci_bus_assign_resources(bus); } @@ -319,6 +326,16 @@ void pcibios_fixup_bus(struct pci_bus *bus) EXPORT_SYMBOL(PCIBIOS_MIN_IO); EXPORT_SYMBOL(PCIBIOS_MIN_MEM); +void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, resource_size_t *start, + resource_size_t *end) +{ + phys_addr_t size = resource_size(rsrc); + + *start = fixup_bigphys_addr(rsrc->start, size); + *end = rsrc->start + size; +} + int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 4cd612a6e272..1a2a6e8dc40d 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -43,7 +43,7 @@ ifeq ($(call cc-option-yn, -fstack-protector),y) BOOTCFLAGS += -fno-stack-protector endif -BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj) +BOOTCFLAGS += -I$(objtree)/$(obj) -I$(srctree)/$(obj) DTC_FLAGS ?= -p 1024 diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h new file mode 100644 index 000000000000..88b4901ac4ee --- /dev/null +++ b/arch/powerpc/include/asm/hmi.h @@ -0,0 +1,45 @@ +/* + * Hypervisor Maintenance Interrupt header file. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * + * Copyright 2015 IBM Corporation + * Author: Mahesh Salgaonkar + */ + +#ifndef __ASM_PPC64_HMI_H__ +#define __ASM_PPC64_HMI_H__ + +#ifdef CONFIG_PPC_BOOK3S_64 + +#define CORE_TB_RESYNC_REQ_BIT 63 +#define MAX_SUBCORE_PER_CORE 4 + +/* + * sibling_subcore_state structure is used to co-ordinate all threads + * during HMI to avoid TB corruption. This structure is allocated once + * per each core and shared by all threads on that core. + */ +struct sibling_subcore_state { + unsigned long flags; + u8 in_guest[MAX_SUBCORE_PER_CORE]; +}; + +extern void wait_for_subcore_guest_exit(void); +extern void wait_for_tb_resync(void); +#else +static inline void wait_for_subcore_guest_exit(void) { } +static inline void wait_for_tb_resync(void) { } +#endif +#endif /* __ASM_PPC64_HMI_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index ad171e979ab0..148303e7771f 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -26,6 +26,7 @@ #include #endif #include +#include register struct paca_struct *local_paca asm("r13"); @@ -182,6 +183,11 @@ struct paca_struct { */ u16 in_mce; u8 hmi_event_available; /* HMI event is available */ + /* + * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for + * more details + */ + struct sibling_subcore_state *sibling_subcore_state; #endif /* Stuff for accurate time accounting */ diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index a6f3ac0d4602..e9bd6cf0212f 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -136,9 +136,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, pgprot_t prot); #define HAVE_ARCH_PCI_RESOURCE_TO_USER -extern void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, - resource_size_t *start, resource_size_t *end); extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose); extern void pcibios_setup_bus_devices(struct pci_bus *bus); diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index fe4c075bcf50..b2027a5cf508 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32) += vdso32/ obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o -obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o +obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o hmi.o obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o obj-$(CONFIG_PPC64) += vdso64/ obj-$(CONFIG_ALTIVEC) += vecemu.o diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 6200e4925d26..694def6c9d61 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -671,6 +671,8 @@ BEGIN_FTR_SECTION beq h_doorbell_common cmpwi r3,0xea0 beq h_virt_irq_common + cmpwi r3,0xe60 + beq hmi_exception_common FTR_SECTION_ELSE cmpwi r3,0xa00 beq doorbell_super_common @@ -1172,7 +1174,7 @@ fwnmi_data_area: .globl hmi_exception_early hmi_exception_early: - EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60) + EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62) mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kernel/hmi.c new file mode 100644 index 000000000000..e3f738eb1cac --- /dev/null +++ b/arch/powerpc/kernel/hmi.c @@ -0,0 +1,56 @@ +/* + * Hypervisor Maintenance Interrupt (HMI) handling. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * + * Copyright 2015 IBM Corporation + * Author: Mahesh Salgaonkar + */ + +#undef DEBUG + +#include +#include +#include +#include + +void wait_for_subcore_guest_exit(void) +{ + int i; + + /* + * NULL bitmap pointer indicates that KVM module hasn't + * been loaded yet and hence no guests are running. + * If no KVM is in use, no need to co-ordinate among threads + * as all of them will always be in host and no one is going + * to modify TB other than the opal hmi handler. + * Hence, just return from here. + */ + if (!local_paca->sibling_subcore_state) + return; + + for (i = 0; i < MAX_SUBCORE_PER_CORE; i++) + while (local_paca->sibling_subcore_state->in_guest[i]) + cpu_relax(); +} + +void wait_for_tb_resync(void) +{ + if (!local_paca->sibling_subcore_state) + return; + + while (test_bit(CORE_TB_RESYNC_REQ_BIT, + &local_paca->sibling_subcore_state->flags)) + cpu_relax(); +} diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 335eb6cedae5..8a56a51fc0cb 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -336,7 +336,9 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ ld r2,PACATOC(r13); \ ld r1,PACAR1(r13); \ std r3,ORIG_GPR3(r1); /* Save original r3 */ \ - bl opal_rm_handle_hmi; \ + li r3,0; /* NULL argument */ \ + bl hmi_exception_realmode; \ + nop; \ ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \ 20: nop; diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index f93942b4b6a6..a5c0153ede37 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -411,36 +411,6 @@ static struct resource *__pci_mmap_make_offset(struct pci_dev *dev, return NULL; } -/* - * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci - * device mapping. - */ -static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp, - pgprot_t protection, - enum pci_mmap_state mmap_state, - int write_combine) -{ - - /* Write combine is always 0 on non-memory space mappings. On - * memory space, if the user didn't pass 1, we check for a - * "prefetchable" resource. This is a bit hackish, but we use - * this to workaround the inability of /sysfs to provide a write - * combine bit - */ - if (mmap_state != pci_mmap_mem) - write_combine = 0; - else if (write_combine == 0) { - if (rp->flags & IORESOURCE_PREFETCH) - write_combine = 1; - } - - /* XXX would be nice to have a way to ask for write-through */ - if (write_combine) - return pgprot_noncached_wc(protection); - else - return pgprot_noncached(protection); -} - /* * This one is used by /dev/mem and fbdev who have no clue about the * PCI device, it tries to find the PCI device first and calls the @@ -514,9 +484,10 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, return -EINVAL; vma->vm_pgoff = offset >> PAGE_SHIFT; - vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp, - vma->vm_page_prot, - mmap_state, write_combine); + if (write_combine) + vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); + else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, vma->vm_page_prot); @@ -666,39 +637,25 @@ void pci_resource_to_user(const struct pci_dev *dev, int bar, const struct resource *rsrc, resource_size_t *start, resource_size_t *end) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - resource_size_t offset = 0; + struct pci_bus_region region; - if (hose == NULL) + if (rsrc->flags & IORESOURCE_IO) { + pcibios_resource_to_bus(dev->bus, ®ion, + (struct resource *) rsrc); + *start = region.start; + *end = region.end; return; + } - if (rsrc->flags & IORESOURCE_IO) - offset = (unsigned long)hose->io_base_virt - _IO_BASE; - - /* We pass a fully fixed up address to userland for MMIO instead of - * a BAR value because X is lame and expects to be able to use that - * to pass to /dev/mem ! + /* We pass a CPU physical address to userland for MMIO instead of a + * BAR value because X is lame and expects to be able to use that + * to pass to /dev/mem! * - * That means that we'll have potentially 64 bits values where some - * userland apps only expect 32 (like X itself since it thinks only - * Sparc has 64 bits MMIO) but if we don't do that, we break it on - * 32 bits CHRPs :-( - * - * Hopefully, the sysfs insterface is immune to that gunk. Once X - * has been fixed (and the fix spread enough), we can re-enable the - * 2 lines below and pass down a BAR value to userland. In that case - * we'll also have to re-enable the matching code in - * __pci_mmap_make_offset(). - * - * BenH. + * That means we may have 64-bit values where some apps only expect + * 32 (like X itself since it thinks only Sparc has 64-bit MMIO). */ -#if 0 - else if (rsrc->flags & IORESOURCE_MEM) - offset = hose->pci_mem_offset; -#endif - - *start = rsrc->start - offset; - *end = rsrc->end - offset; + *start = rsrc->start; + *end = rsrc->end; } /** diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index f7e2f2e318bd..2cb589264cb7 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) @@ -308,9 +309,13 @@ long hmi_exception_realmode(struct pt_regs *regs) { __this_cpu_inc(irq_stat.hmi_exceptions); + wait_for_subcore_guest_exit(); + if (ppc_md.hmi_exception_early) ppc_md.hmi_exception_early(regs); + wait_for_tb_resync(); + return 0; } diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index eba0bea6e032..1f9e5529e692 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -20,7 +20,7 @@ common-objs-y += powerpc.o emulate.o emulate_loadstore.o obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o -AFLAGS_booke_interrupts.o := -I$(obj) +AFLAGS_booke_interrupts.o := -I$(objtree)/$(obj) kvm-e500-objs := \ $(common-objs-y) \ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e20beae5ca7a..2fd5580c8f6e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -2522,7 +2523,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) spin_unlock(&pvc->lock); - kvm_guest_enter(); + guest_enter(); srcu_idx = srcu_read_lock(&vc->kvm->srcu); @@ -2570,7 +2571,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); - kvm_guest_exit(); + guest_exit(); for (sub = 0; sub < core_info.n_subcores; ++sub) list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub], @@ -3401,6 +3402,38 @@ static struct kvmppc_ops kvm_ops_hv = { .hcall_implemented = kvmppc_hcall_impl_hv, }; +static int kvm_init_subcore_bitmap(void) +{ + int i, j; + int nr_cores = cpu_nr_cores(); + struct sibling_subcore_state *sibling_subcore_state; + + for (i = 0; i < nr_cores; i++) { + int first_cpu = i * threads_per_core; + int node = cpu_to_node(first_cpu); + + /* Ignore if it is already allocated. */ + if (paca[first_cpu].sibling_subcore_state) + continue; + + sibling_subcore_state = + kmalloc_node(sizeof(struct sibling_subcore_state), + GFP_KERNEL, node); + if (!sibling_subcore_state) + return -ENOMEM; + + memset(sibling_subcore_state, 0, + sizeof(struct sibling_subcore_state)); + + for (j = 0; j < threads_per_core; j++) { + int cpu = first_cpu + j; + + paca[cpu].sibling_subcore_state = sibling_subcore_state; + } + } + return 0; +} + static int kvmppc_book3s_init_hv(void) { int r; @@ -3411,6 +3444,10 @@ static int kvmppc_book3s_init_hv(void) if (r < 0) return -ENODEV; + r = kvm_init_subcore_bitmap(); + if (r) + return r; + kvm_ops_hv.owner = THIS_MODULE; kvmppc_hv_ops = &kvm_ops_hv; diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index 93b5f5c9b445..0fa70a9618d7 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -13,6 +13,9 @@ #include #include #include +#include +#include +#include /* SRR1 bits for machine check on POWER7 */ #define SRR1_MC_LDSTERR (1ul << (63-42)) @@ -140,3 +143,176 @@ long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) { return kvmppc_realmode_mc_power7(vcpu); } + +/* Check if dynamic split is in force and return subcore size accordingly. */ +static inline int kvmppc_cur_subcore_size(void) +{ + if (local_paca->kvm_hstate.kvm_split_mode) + return local_paca->kvm_hstate.kvm_split_mode->subcore_size; + + return threads_per_subcore; +} + +void kvmppc_subcore_enter_guest(void) +{ + int thread_id, subcore_id; + + thread_id = cpu_thread_in_core(local_paca->paca_index); + subcore_id = thread_id / kvmppc_cur_subcore_size(); + + local_paca->sibling_subcore_state->in_guest[subcore_id] = 1; +} + +void kvmppc_subcore_exit_guest(void) +{ + int thread_id, subcore_id; + + thread_id = cpu_thread_in_core(local_paca->paca_index); + subcore_id = thread_id / kvmppc_cur_subcore_size(); + + local_paca->sibling_subcore_state->in_guest[subcore_id] = 0; +} + +static bool kvmppc_tb_resync_required(void) +{ + if (test_and_set_bit(CORE_TB_RESYNC_REQ_BIT, + &local_paca->sibling_subcore_state->flags)) + return false; + + return true; +} + +static void kvmppc_tb_resync_done(void) +{ + clear_bit(CORE_TB_RESYNC_REQ_BIT, + &local_paca->sibling_subcore_state->flags); +} + +/* + * kvmppc_realmode_hmi_handler() is called only by primary thread during + * guest exit path. + * + * There are multiple reasons why HMI could occur, one of them is + * Timebase (TB) error. If this HMI is due to TB error, then TB would + * have been in stopped state. The opal hmi handler Will fix it and + * restore the TB value with host timebase value. For HMI caused due + * to non-TB errors, opal hmi handler will not touch/restore TB register + * and hence there won't be any change in TB value. + * + * Since we are not sure about the cause of this HMI, we can't be sure + * about the content of TB register whether it holds guest or host timebase + * value. Hence the idea is to resync the TB on every HMI, so that we + * know about the exact state of the TB value. Resync TB call will + * restore TB to host timebase. + * + * Things to consider: + * - On TB error, HMI interrupt is reported on all the threads of the core + * that has encountered TB error irrespective of split-core mode. + * - The very first thread on the core that get chance to fix TB error + * would rsync the TB with local chipTOD value. + * - The resync TB is a core level action i.e. it will sync all the TBs + * in that core independent of split-core mode. This means if we trigger + * TB sync from a thread from one subcore, it would affect TB values of + * sibling subcores of the same core. + * + * All threads need to co-ordinate before making opal hmi handler. + * All threads will use sibling_subcore_state->in_guest[] (shared by all + * threads in the core) in paca which holds information about whether + * sibling subcores are in Guest mode or host mode. The in_guest[] array + * is of size MAX_SUBCORE_PER_CORE=4, indexed using subcore id to set/unset + * subcore status. Only primary threads from each subcore is responsible + * to set/unset its designated array element while entering/exiting the + * guset. + * + * After invoking opal hmi handler call, one of the thread (of entire core) + * will need to resync the TB. Bit 63 from subcore state bitmap flags + * (sibling_subcore_state->flags) will be used to co-ordinate between + * primary threads to decide who takes up the responsibility. + * + * This is what we do: + * - Primary thread from each subcore tries to set resync required bit[63] + * of paca->sibling_subcore_state->flags. + * - The first primary thread that is able to set the flag takes the + * responsibility of TB resync. (Let us call it as thread leader) + * - All other threads which are in host will call + * wait_for_subcore_guest_exit() and wait for in_guest[0-3] from + * paca->sibling_subcore_state to get cleared. + * - All the primary thread will clear its subcore status from subcore + * state in_guest[] array respectively. + * - Once all primary threads clear in_guest[0-3], all of them will invoke + * opal hmi handler. + * - Now all threads will wait for TB resync to complete by invoking + * wait_for_tb_resync() except the thread leader. + * - Thread leader will do a TB resync by invoking opal_resync_timebase() + * call and the it will clear the resync required bit. + * - All other threads will now come out of resync wait loop and proceed + * with individual execution. + * - On return of this function, primary thread will signal all + * secondary threads to proceed. + * - All secondary threads will eventually call opal hmi handler on + * their exit path. + */ + +long kvmppc_realmode_hmi_handler(void) +{ + int ptid = local_paca->kvm_hstate.ptid; + bool resync_req; + + /* This is only called on primary thread. */ + BUG_ON(ptid != 0); + __this_cpu_inc(irq_stat.hmi_exceptions); + + /* + * By now primary thread has already completed guest->host + * partition switch but haven't signaled secondaries yet. + * All the secondary threads on this subcore is waiting + * for primary thread to signal them to go ahead. + * + * For threads from subcore which isn't in guest, they all will + * wait until all other subcores on this core exit the guest. + * + * Now set the resync required bit. If you are the first to + * set this bit then kvmppc_tb_resync_required() function will + * return true. For rest all other subcores + * kvmppc_tb_resync_required() will return false. + * + * If resync_req == true, then this thread is responsible to + * initiate TB resync after hmi handler has completed. + * All other threads on this core will wait until this thread + * clears the resync required bit flag. + */ + resync_req = kvmppc_tb_resync_required(); + + /* Reset the subcore status to indicate it has exited guest */ + kvmppc_subcore_exit_guest(); + + /* + * Wait for other subcores on this core to exit the guest. + * All the primary threads and threads from subcore that are + * not in guest will wait here until all subcores are out + * of guest context. + */ + wait_for_subcore_guest_exit(); + + /* + * At this point we are sure that primary threads from each + * subcore on this core have completed guest->host partition + * switch. Now it is safe to call HMI handler. + */ + if (ppc_md.hmi_exception_early) + ppc_md.hmi_exception_early(NULL); + + /* + * Check if this thread is responsible to resync TB. + * All other threads will wait until this thread completes the + * TB resync. + */ + if (resync_req) { + opal_resync_timebase(); + /* Reset TB resync req bit */ + kvmppc_tb_resync_done(); + } else { + wait_for_tb_resync(); + } + return 0; +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 86f0cae37a85..975655573844 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -29,6 +29,7 @@ #include #include #include +#include #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) @@ -373,6 +374,18 @@ kvm_secondary_got_guest: lwsync std r0, HSTATE_KVM_VCORE(r13) + /* + * All secondaries exiting guest will fall through this path. + * Before proceeding, just check for HMI interrupt and + * invoke opal hmi handler. By now we are sure that the + * primary thread on this core/subcore has already made partition + * switch/TB resync and we are good to call opal hmi handler. + */ + cmpwi r12, BOOK3S_INTERRUPT_HMI + bne kvm_no_guest + + li r3,0 /* NULL argument */ + bl hmi_exception_realmode /* * At this point we have finished executing in the guest. * We need to wait for hwthread_req to become zero, since @@ -427,6 +440,22 @@ kvm_no_guest: * whole-core mode, so we need to nap. */ kvm_unsplit_nap: + /* + * When secondaries are napping in kvm_unsplit_nap() with + * hwthread_req = 1, HMI goes ignored even though subcores are + * already exited the guest. Hence HMI keeps waking up secondaries + * from nap in a loop and secondaries always go back to nap since + * no vcore is assigned to them. This makes impossible for primary + * thread to get hold of secondary threads resulting into a soft + * lockup in KVM path. + * + * Let us check if HMI is pending and handle it before we go to nap. + */ + cmpwi r12, BOOK3S_INTERRUPT_HMI + bne 55f + li r3, 0 /* NULL argument */ + bl hmi_exception_realmode +55: /* * Ensure that secondary doesn't nap when it has * its vcore pointer set. @@ -601,6 +630,11 @@ BEGIN_FTR_SECTION mtspr SPRN_DPDES, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + /* Mark the subcore state as inside guest */ + bl kvmppc_subcore_enter_guest + nop + ld r5, HSTATE_KVM_VCORE(r13) + ld r4, HSTATE_KVM_VCPU(r13) li r0,1 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ @@ -655,112 +689,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION - b skip_tm -END_FTR_SECTION_IFCLR(CPU_FTR_TM) - - /* Turn on TM/FP/VSX/VMX so we can restore them. */ - mfmsr r5 - li r6, MSR_TM >> 32 - sldi r6, r6, 32 - or r5, r5, r6 - ori r5, r5, MSR_FP - oris r5, r5, (MSR_VEC | MSR_VSX)@h - mtmsrd r5 - - /* - * The user may change these outside of a transaction, so they must - * always be context switched. - */ - ld r5, VCPU_TFHAR(r4) - ld r6, VCPU_TFIAR(r4) - ld r7, VCPU_TEXASR(r4) - mtspr SPRN_TFHAR, r5 - mtspr SPRN_TFIAR, r6 - mtspr SPRN_TEXASR, r7 - - ld r5, VCPU_MSR(r4) - rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 - beq skip_tm /* TM not active in guest */ - - /* Make sure the failure summary is set, otherwise we'll program check - * when we trechkpt. It's possible that this might have been not set - * on a kvmppc_set_one_reg() call but we shouldn't let this crash the - * host. - */ - oris r7, r7, (TEXASR_FS)@h - mtspr SPRN_TEXASR, r7 - - /* - * We need to load up the checkpointed state for the guest. - * We need to do this early as it will blow away any GPRs, VSRs and - * some SPRs. - */ - - mr r31, r4 - addi r3, r31, VCPU_FPRS_TM - bl load_fp_state - addi r3, r31, VCPU_VRS_TM - bl load_vr_state - mr r4, r31 - lwz r7, VCPU_VRSAVE_TM(r4) - mtspr SPRN_VRSAVE, r7 - - ld r5, VCPU_LR_TM(r4) - lwz r6, VCPU_CR_TM(r4) - ld r7, VCPU_CTR_TM(r4) - ld r8, VCPU_AMR_TM(r4) - ld r9, VCPU_TAR_TM(r4) - mtlr r5 - mtcr r6 - mtctr r7 - mtspr SPRN_AMR, r8 - mtspr SPRN_TAR, r9 - - /* - * Load up PPR and DSCR values but don't put them in the actual SPRs - * till the last moment to avoid running with userspace PPR and DSCR for - * too long. - */ - ld r29, VCPU_DSCR_TM(r4) - ld r30, VCPU_PPR_TM(r4) - - std r2, PACATMSCRATCH(r13) /* Save TOC */ - - /* Clear the MSR RI since r1, r13 are all going to be foobar. */ - li r5, 0 - mtmsrd r5, 1 - - /* Load GPRs r0-r28 */ - reg = 0 - .rept 29 - ld reg, VCPU_GPRS_TM(reg)(r31) - reg = reg + 1 - .endr - - mtspr SPRN_DSCR, r29 - mtspr SPRN_PPR, r30 - - /* Load final GPRs */ - ld 29, VCPU_GPRS_TM(29)(r31) - ld 30, VCPU_GPRS_TM(30)(r31) - ld 31, VCPU_GPRS_TM(31)(r31) - - /* TM checkpointed state is now setup. All GPRs are now volatile. */ - TRECHKPT - - /* Now let's get back the state we need. */ - HMT_MEDIUM - GET_PACA(r13) - ld r29, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r29 - ld r4, HSTATE_KVM_VCPU(r13) - ld r1, HSTATE_HOST_R1(r13) - ld r2, PACATMSCRATCH(r13) - - /* Set the MSR RI since we have our registers back. */ - li r5, MSR_RI - mtmsrd r5, 1 -skip_tm: + bl kvmppc_restore_tm +END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif /* Load guest PMU registers */ @@ -841,12 +771,6 @@ BEGIN_FTR_SECTION /* Skip next section on POWER7 */ b 8f END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) - /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */ - mfmsr r8 - li r0, 1 - rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG - mtmsrd r8 - /* Load up POWER8-specific registers */ ld r5, VCPU_IAMR(r4) lwz r6, VCPU_PSPB(r4) @@ -1436,106 +1360,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION - b 2f -END_FTR_SECTION_IFCLR(CPU_FTR_TM) - /* Turn on TM. */ - mfmsr r8 - li r0, 1 - rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG - mtmsrd r8 - - ld r5, VCPU_MSR(r9) - rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 - beq 1f /* TM not active in guest. */ - - li r3, TM_CAUSE_KVM_RESCHED - - /* Clear the MSR RI since r1, r13 are all going to be foobar. */ - li r5, 0 - mtmsrd r5, 1 - - /* All GPRs are volatile at this point. */ - TRECLAIM(R3) - - /* Temporarily store r13 and r9 so we have some regs to play with */ - SET_SCRATCH0(r13) - GET_PACA(r13) - std r9, PACATMSCRATCH(r13) - ld r9, HSTATE_KVM_VCPU(r13) - - /* Get a few more GPRs free. */ - std r29, VCPU_GPRS_TM(29)(r9) - std r30, VCPU_GPRS_TM(30)(r9) - std r31, VCPU_GPRS_TM(31)(r9) - - /* Save away PPR and DSCR soon so don't run with user values. */ - mfspr r31, SPRN_PPR - HMT_MEDIUM - mfspr r30, SPRN_DSCR - ld r29, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r29 - - /* Save all but r9, r13 & r29-r31 */ - reg = 0 - .rept 29 - .if (reg != 9) && (reg != 13) - std reg, VCPU_GPRS_TM(reg)(r9) - .endif - reg = reg + 1 - .endr - /* ... now save r13 */ - GET_SCRATCH0(r4) - std r4, VCPU_GPRS_TM(13)(r9) - /* ... and save r9 */ - ld r4, PACATMSCRATCH(r13) - std r4, VCPU_GPRS_TM(9)(r9) - - /* Reload stack pointer and TOC. */ - ld r1, HSTATE_HOST_R1(r13) - ld r2, PACATOC(r13) - - /* Set MSR RI now we have r1 and r13 back. */ - li r5, MSR_RI - mtmsrd r5, 1 - - /* Save away checkpinted SPRs. */ - std r31, VCPU_PPR_TM(r9) - std r30, VCPU_DSCR_TM(r9) - mflr r5 - mfcr r6 - mfctr r7 - mfspr r8, SPRN_AMR - mfspr r10, SPRN_TAR - std r5, VCPU_LR_TM(r9) - stw r6, VCPU_CR_TM(r9) - std r7, VCPU_CTR_TM(r9) - std r8, VCPU_AMR_TM(r9) - std r10, VCPU_TAR_TM(r9) - - /* Restore r12 as trap number. */ - lwz r12, VCPU_TRAP(r9) - - /* Save FP/VSX. */ - addi r3, r9, VCPU_FPRS_TM - bl store_fp_state - addi r3, r9, VCPU_VRS_TM - bl store_vr_state - mfspr r6, SPRN_VRSAVE - stw r6, VCPU_VRSAVE_TM(r9) -1: - /* - * We need to save these SPRs after the treclaim so that the software - * error code is recorded correctly in the TEXASR. Also the user may - * change these outside of a transaction, so they must always be - * context switched. - */ - mfspr r5, SPRN_TFHAR - mfspr r6, SPRN_TFIAR - mfspr r7, SPRN_TEXASR - std r5, VCPU_TFHAR(r9) - std r6, VCPU_TFIAR(r9) - std r7, VCPU_TEXASR(r9) -2: + bl kvmppc_save_tm +END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif /* Increment yield count if they have a VPA */ @@ -1683,6 +1509,23 @@ BEGIN_FTR_SECTION mtspr SPRN_DPDES, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + /* If HMI, call kvmppc_realmode_hmi_handler() */ + cmpwi r12, BOOK3S_INTERRUPT_HMI + bne 27f + bl kvmppc_realmode_hmi_handler + nop + li r12, BOOK3S_INTERRUPT_HMI + /* + * At this point kvmppc_realmode_hmi_handler would have resync-ed + * the TB. Hence it is not required to subtract guest timebase + * offset from timebase. So, skip it. + * + * Also, do not call kvmppc_subcore_exit_guest() because it has + * been invoked as part of kvmppc_realmode_hmi_handler(). + */ + b 30f + +27: /* Subtract timebase offset from timebase */ ld r8,VCORE_TB_OFFSET(r5) cmpdi r8,0 @@ -1698,8 +1541,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) addis r8,r8,0x100 /* if so, increment upper 40 bits */ mtspr SPRN_TBU40,r8 +17: bl kvmppc_subcore_exit_guest + nop +30: ld r5,HSTATE_KVM_VCORE(r13) + ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ + /* Reset PCR */ -17: ld r0, VCORE_PCR(r5) + ld r0, VCORE_PCR(r5) cmpdi r0, 0 beq 18f li r0, 0 @@ -2245,6 +2093,13 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ /* save FP state */ bl kvmppc_save_fp +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + ld r9, HSTATE_KVM_VCPU(r13) + bl kvmppc_save_tm +END_FTR_SECTION_IFSET(CPU_FTR_TM) +#endif + /* * Set DEC to the smaller of DEC and HDEC, so that we wake * no later than the end of our timeslice (HDEC interrupts @@ -2321,6 +2176,12 @@ kvm_end_cede: bl kvmhv_accumulate_time #endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + bl kvmppc_restore_tm +END_FTR_SECTION_IFSET(CPU_FTR_TM) +#endif + /* load up FP state */ bl kvmppc_load_fp @@ -2461,6 +2322,8 @@ BEGIN_FTR_SECTION cmpwi r6, 3 /* hypervisor doorbell? */ beq 3f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + cmpwi r6, 0xa /* Hypervisor maintenance ? */ + beq 4f li r3, 1 /* anything else, return 1 */ 0: blr @@ -2482,6 +2345,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r3, -1 blr + /* Woken up due to Hypervisor maintenance interrupt */ +4: li r12, BOOK3S_INTERRUPT_HMI + li r3, 1 + blr + /* * Determine what sort of external interrupt is pending (if any). * Returns: @@ -2631,6 +2499,239 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) mr r4,r31 blr +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Save transactional state and TM-related registers. + * Called with r9 pointing to the vcpu struct. + * This can modify all checkpointed registers, but + * restores r1, r2 and r9 (vcpu pointer) before exit. + */ +kvmppc_save_tm: + mflr r0 + std r0, PPC_LR_STKOFF(r1) + + /* Turn on TM. */ + mfmsr r8 + li r0, 1 + rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG + mtmsrd r8 + + ld r5, VCPU_MSR(r9) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beq 1f /* TM not active in guest. */ + + std r1, HSTATE_HOST_R1(r13) + li r3, TM_CAUSE_KVM_RESCHED + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* All GPRs are volatile at this point. */ + TRECLAIM(R3) + + /* Temporarily store r13 and r9 so we have some regs to play with */ + SET_SCRATCH0(r13) + GET_PACA(r13) + std r9, PACATMSCRATCH(r13) + ld r9, HSTATE_KVM_VCPU(r13) + + /* Get a few more GPRs free. */ + std r29, VCPU_GPRS_TM(29)(r9) + std r30, VCPU_GPRS_TM(30)(r9) + std r31, VCPU_GPRS_TM(31)(r9) + + /* Save away PPR and DSCR soon so don't run with user values. */ + mfspr r31, SPRN_PPR + HMT_MEDIUM + mfspr r30, SPRN_DSCR + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + + /* Save all but r9, r13 & r29-r31 */ + reg = 0 + .rept 29 + .if (reg != 9) && (reg != 13) + std reg, VCPU_GPRS_TM(reg)(r9) + .endif + reg = reg + 1 + .endr + /* ... now save r13 */ + GET_SCRATCH0(r4) + std r4, VCPU_GPRS_TM(13)(r9) + /* ... and save r9 */ + ld r4, PACATMSCRATCH(r13) + std r4, VCPU_GPRS_TM(9)(r9) + + /* Reload stack pointer and TOC. */ + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATOC(r13) + + /* Set MSR RI now we have r1 and r13 back. */ + li r5, MSR_RI + mtmsrd r5, 1 + + /* Save away checkpinted SPRs. */ + std r31, VCPU_PPR_TM(r9) + std r30, VCPU_DSCR_TM(r9) + mflr r5 + mfcr r6 + mfctr r7 + mfspr r8, SPRN_AMR + mfspr r10, SPRN_TAR + std r5, VCPU_LR_TM(r9) + stw r6, VCPU_CR_TM(r9) + std r7, VCPU_CTR_TM(r9) + std r8, VCPU_AMR_TM(r9) + std r10, VCPU_TAR_TM(r9) + + /* Restore r12 as trap number. */ + lwz r12, VCPU_TRAP(r9) + + /* Save FP/VSX. */ + addi r3, r9, VCPU_FPRS_TM + bl store_fp_state + addi r3, r9, VCPU_VRS_TM + bl store_vr_state + mfspr r6, SPRN_VRSAVE + stw r6, VCPU_VRSAVE_TM(r9) +1: + /* + * We need to save these SPRs after the treclaim so that the software + * error code is recorded correctly in the TEXASR. Also the user may + * change these outside of a transaction, so they must always be + * context switched. + */ + mfspr r5, SPRN_TFHAR + mfspr r6, SPRN_TFIAR + mfspr r7, SPRN_TEXASR + std r5, VCPU_TFHAR(r9) + std r6, VCPU_TFIAR(r9) + std r7, VCPU_TEXASR(r9) + + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + blr + +/* + * Restore transactional state and TM-related registers. + * Called with r4 pointing to the vcpu struct. + * This potentially modifies all checkpointed registers. + * It restores r1, r2, r4 from the PACA. + */ +kvmppc_restore_tm: + mflr r0 + std r0, PPC_LR_STKOFF(r1) + + /* Turn on TM/FP/VSX/VMX so we can restore them. */ + mfmsr r5 + li r6, MSR_TM >> 32 + sldi r6, r6, 32 + or r5, r5, r6 + ori r5, r5, MSR_FP + oris r5, r5, (MSR_VEC | MSR_VSX)@h + mtmsrd r5 + + /* + * The user may change these outside of a transaction, so they must + * always be context switched. + */ + ld r5, VCPU_TFHAR(r4) + ld r6, VCPU_TFIAR(r4) + ld r7, VCPU_TEXASR(r4) + mtspr SPRN_TFHAR, r5 + mtspr SPRN_TFIAR, r6 + mtspr SPRN_TEXASR, r7 + + ld r5, VCPU_MSR(r4) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beqlr /* TM not active in guest */ + std r1, HSTATE_HOST_R1(r13) + + /* Make sure the failure summary is set, otherwise we'll program check + * when we trechkpt. It's possible that this might have been not set + * on a kvmppc_set_one_reg() call but we shouldn't let this crash the + * host. + */ + oris r7, r7, (TEXASR_FS)@h + mtspr SPRN_TEXASR, r7 + + /* + * We need to load up the checkpointed state for the guest. + * We need to do this early as it will blow away any GPRs, VSRs and + * some SPRs. + */ + + mr r31, r4 + addi r3, r31, VCPU_FPRS_TM + bl load_fp_state + addi r3, r31, VCPU_VRS_TM + bl load_vr_state + mr r4, r31 + lwz r7, VCPU_VRSAVE_TM(r4) + mtspr SPRN_VRSAVE, r7 + + ld r5, VCPU_LR_TM(r4) + lwz r6, VCPU_CR_TM(r4) + ld r7, VCPU_CTR_TM(r4) + ld r8, VCPU_AMR_TM(r4) + ld r9, VCPU_TAR_TM(r4) + mtlr r5 + mtcr r6 + mtctr r7 + mtspr SPRN_AMR, r8 + mtspr SPRN_TAR, r9 + + /* + * Load up PPR and DSCR values but don't put them in the actual SPRs + * till the last moment to avoid running with userspace PPR and DSCR for + * too long. + */ + ld r29, VCPU_DSCR_TM(r4) + ld r30, VCPU_PPR_TM(r4) + + std r2, PACATMSCRATCH(r13) /* Save TOC */ + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* Load GPRs r0-r28 */ + reg = 0 + .rept 29 + ld reg, VCPU_GPRS_TM(reg)(r31) + reg = reg + 1 + .endr + + mtspr SPRN_DSCR, r29 + mtspr SPRN_PPR, r30 + + /* Load final GPRs */ + ld 29, VCPU_GPRS_TM(29)(r31) + ld 30, VCPU_GPRS_TM(30)(r31) + ld 31, VCPU_GPRS_TM(31)(r31) + + /* TM checkpointed state is now setup. All GPRs are now volatile. */ + TRECHKPT + + /* Now let's get back the state we need. */ + HMT_MEDIUM + GET_PACA(r13) + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + ld r4, HSTATE_KVM_VCPU(r13) + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATMSCRATCH(r13) + + /* Set the MSR RI since we have our registers back. */ + li r5, MSR_RI + mtmsrd r5, 1 + + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + blr +#endif + /* * We come here if we get any exception or interrupt while we are * executing host real mode code while in guest MMU context. diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index c4f7d6b86b9e..e76f79a45988 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -914,7 +914,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, /* We get here with MSR.EE=1 */ trace_kvm_exit(exit_nr, vcpu); - kvm_guest_exit(); + guest_exit(); switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: @@ -1049,7 +1049,17 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, int emul; program_interrupt: - flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + /* + * shadow_srr1 only contains valid flags if we came here via + * a program exception. The other exceptions (emulation assist, + * FP unavailable, etc.) do not provide flags in SRR1, so use + * an illegal-instruction exception when injecting a program + * interrupt into the guest. + */ + if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + else + flags = SRR1_PROGILL; emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); if (emul != EMULATE_DONE) { @@ -1531,7 +1541,7 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_clear_debug(vcpu); - /* No need for kvm_guest_exit. It's done in handle_exit. + /* No need for guest_exit. It's done in handle_exit. We also get here with interrupts enabled. */ /* Make sure we save the guest FPU/Altivec/VSX state */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 4afae695899a..02b4672f7347 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -776,7 +776,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ret = __kvmppc_vcpu_run(kvm_run, vcpu); - /* No need for kvm_guest_exit. It's done in handle_exit. + /* No need for guest_exit. It's done in handle_exit. We also get here with interrupts enabled. */ /* Switch back to user space debug context */ @@ -1012,7 +1012,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } trace_kvm_exit(exit_nr, vcpu); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 5cc2e7af3a7b..b379146de55b 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -302,7 +302,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) advance = 0; printk(KERN_ERR "Couldn't emulate instruction 0x%08x " "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst)); - kvmppc_core_queue_program(vcpu, 0); } } diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index 6249cdc834d1..ed38f8114118 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -1823,7 +1823,8 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return 0; } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 02416fea7653..6ce40dd6fe51 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -119,7 +119,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) continue; } - __kvm_guest_enter(); + guest_enter_irqoff(); return 1; } @@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; #endif + case KVM_CAP_PPC_HTM: + r = cpu_has_feature(CPU_FTR_TM_COMP) && + is_kvmppc_hv_enabled(kvm); + break; default: r = 0; break; diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index cf928bba4d9a..3d29d40eb0e9 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -64,7 +64,6 @@ END_FTR_SECTION(0, 1); \ OPAL_BRANCH(opal_tracepoint_entry) \ mfcr r12; \ stw r12,8(r1); \ - std r1,PACAR1(r13); \ li r11,0; \ mfmsr r12; \ ori r11,r11,MSR_EE; \ @@ -127,7 +126,6 @@ opal_tracepoint_entry: mfcr r12 std r11,16(r1) stw r12,8(r1) - std r1,PACAR1(r13) li r11,0 mfmsr r12 ori r11,r11,MSR_EE diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile index 13723c39e58e..33ba697c782d 100644 --- a/arch/s390/boot/compressed/Makefile +++ b/arch/s390/boot/compressed/Makefile @@ -33,10 +33,10 @@ quiet_cmd_sizes = GEN $@ $(obj)/sizes.h: vmlinux $(call if_changed,sizes) -AFLAGS_head.o += -I$(obj) +AFLAGS_head.o += -I$(objtree)/$(obj) $(obj)/head.o: $(obj)/sizes.h -CFLAGS_misc.o += -I$(obj) +CFLAGS_misc.o += -I$(objtree)/$(obj) $(obj)/misc.o: $(obj)/sizes.h OBJCOPYFLAGS_vmlinux.bin := -R .comment -S diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index 67d43a0eabb4..28f03ca60100 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -19,29 +19,10 @@ #include #include "hypfs.h" -#define LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */ -#define CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */ #define TMP_SIZE 64 /* size of temporary buffers */ #define DBFS_D204_HDR_VERSION 0 -/* diag 204 subcodes */ -enum diag204_sc { - SUBC_STIB4 = 4, - SUBC_RSI = 5, - SUBC_STIB6 = 6, - SUBC_STIB7 = 7 -}; - -/* The two available diag 204 data formats */ -enum diag204_format { - INFO_SIMPLE = 0, - INFO_EXT = 0x00010000 -}; - -/* bit is set in flags, when physical cpu info is included in diag 204 data */ -#define LPAR_PHYS_FLG 0x80 - static char *diag224_cpu_names; /* diag 224 name table */ static enum diag204_sc diag204_store_sc; /* used subcode for store */ static enum diag204_format diag204_info_type; /* used diag 204 data format */ @@ -53,7 +34,7 @@ static int diag204_buf_pages; /* number of pages for diag204 data */ static struct dentry *dbfs_d204_file; /* - * DIAG 204 data structures and member access functions. + * DIAG 204 member access functions. * * Since we have two different diag 204 data formats for old and new s390 * machines, we do not access the structs directly, but use getter functions for @@ -62,304 +43,173 @@ static struct dentry *dbfs_d204_file; /* Time information block */ -struct info_blk_hdr { - __u8 npar; - __u8 flags; - __u16 tslice; - __u16 phys_cpus; - __u16 this_part; - __u64 curtod; -} __attribute__ ((packed)); - -struct x_info_blk_hdr { - __u8 npar; - __u8 flags; - __u16 tslice; - __u16 phys_cpus; - __u16 this_part; - __u64 curtod1; - __u64 curtod2; - char reserved[40]; -} __attribute__ ((packed)); - static inline int info_blk_hdr__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct info_blk_hdr); - else /* INFO_EXT */ - return sizeof(struct x_info_blk_hdr); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_info_blk_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_info_blk_hdr); } static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct info_blk_hdr *)hdr)->npar; - else /* INFO_EXT */ - return ((struct x_info_blk_hdr *)hdr)->npar; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->npar; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->npar; } static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct info_blk_hdr *)hdr)->flags; - else /* INFO_EXT */ - return ((struct x_info_blk_hdr *)hdr)->flags; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->flags; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->flags; } static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct info_blk_hdr *)hdr)->phys_cpus; - else /* INFO_EXT */ - return ((struct x_info_blk_hdr *)hdr)->phys_cpus; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus; } /* Partition header */ -struct part_hdr { - __u8 pn; - __u8 cpus; - char reserved[6]; - char part_name[LPAR_NAME_LEN]; -} __attribute__ ((packed)); - -struct x_part_hdr { - __u8 pn; - __u8 cpus; - __u8 rcpus; - __u8 pflag; - __u32 mlu; - char part_name[LPAR_NAME_LEN]; - char lpc_name[8]; - char os_name[8]; - __u64 online_cs; - __u64 online_es; - __u8 upid; - char reserved1[3]; - __u32 group_mlu; - char group_name[8]; - char reserved2[32]; -} __attribute__ ((packed)); - static inline int part_hdr__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct part_hdr); - else /* INFO_EXT */ - return sizeof(struct x_part_hdr); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_part_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_part_hdr); } static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct part_hdr *)hdr)->cpus; - else /* INFO_EXT */ - return ((struct x_part_hdr *)hdr)->rcpus; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_part_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_part_hdr *)hdr)->rcpus; } static inline void part_hdr__part_name(enum diag204_format type, void *hdr, char *name) { - if (type == INFO_SIMPLE) - memcpy(name, ((struct part_hdr *)hdr)->part_name, - LPAR_NAME_LEN); - else /* INFO_EXT */ - memcpy(name, ((struct x_part_hdr *)hdr)->part_name, - LPAR_NAME_LEN); - EBCASC(name, LPAR_NAME_LEN); - name[LPAR_NAME_LEN] = 0; + if (type == DIAG204_INFO_SIMPLE) + memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + else /* DIAG204_INFO_EXT */ + memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + EBCASC(name, DIAG204_LPAR_NAME_LEN); + name[DIAG204_LPAR_NAME_LEN] = 0; strim(name); } -struct cpu_info { - __u16 cpu_addr; - char reserved1[2]; - __u8 ctidx; - __u8 cflag; - __u16 weight; - __u64 acc_time; - __u64 lp_time; -} __attribute__ ((packed)); - -struct x_cpu_info { - __u16 cpu_addr; - char reserved1[2]; - __u8 ctidx; - __u8 cflag; - __u16 weight; - __u64 acc_time; - __u64 lp_time; - __u16 min_weight; - __u16 cur_weight; - __u16 max_weight; - char reseved2[2]; - __u64 online_time; - __u64 wait_time; - __u32 pma_weight; - __u32 polar_weight; - char reserved3[40]; -} __attribute__ ((packed)); - /* CPU info block */ static inline int cpu_info__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct cpu_info); - else /* INFO_EXT */ - return sizeof(struct x_cpu_info); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_cpu_info); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_cpu_info); } static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct cpu_info *)hdr)->ctidx; - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->ctidx; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->ctidx; } static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct cpu_info *)hdr)->cpu_addr; - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->cpu_addr; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->cpu_addr; } static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct cpu_info *)hdr)->acc_time; - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->acc_time; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->acc_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->acc_time; } static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct cpu_info *)hdr)->lp_time; - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->lp_time; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->lp_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->lp_time; } static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) + if (type == DIAG204_INFO_SIMPLE) return 0; /* online_time not available in simple info */ - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->online_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->online_time; } /* Physical header */ -struct phys_hdr { - char reserved1[1]; - __u8 cpus; - char reserved2[6]; - char mgm_name[8]; -} __attribute__ ((packed)); - -struct x_phys_hdr { - char reserved1[1]; - __u8 cpus; - char reserved2[6]; - char mgm_name[8]; - char reserved3[80]; -} __attribute__ ((packed)); - static inline int phys_hdr__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct phys_hdr); - else /* INFO_EXT */ - return sizeof(struct x_phys_hdr); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_hdr); } static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct phys_hdr *)hdr)->cpus; - else /* INFO_EXT */ - return ((struct x_phys_hdr *)hdr)->cpus; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_hdr *)hdr)->cpus; } /* Physical CPU info block */ -struct phys_cpu { - __u16 cpu_addr; - char reserved1[2]; - __u8 ctidx; - char reserved2[3]; - __u64 mgm_time; - char reserved3[8]; -} __attribute__ ((packed)); - -struct x_phys_cpu { - __u16 cpu_addr; - char reserved1[2]; - __u8 ctidx; - char reserved2[3]; - __u64 mgm_time; - char reserved3[80]; -} __attribute__ ((packed)); - static inline int phys_cpu__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct phys_cpu); - else /* INFO_EXT */ - return sizeof(struct x_phys_cpu); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_cpu); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_cpu); } static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct phys_cpu *)hdr)->cpu_addr; - else /* INFO_EXT */ - return ((struct x_phys_cpu *)hdr)->cpu_addr; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr; } static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct phys_cpu *)hdr)->mgm_time; - else /* INFO_EXT */ - return ((struct x_phys_cpu *)hdr)->mgm_time; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->mgm_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->mgm_time; } static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct phys_cpu *)hdr)->ctidx; - else /* INFO_EXT */ - return ((struct x_phys_cpu *)hdr)->ctidx; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->ctidx; } /* Diagnose 204 functions */ - -static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr) -{ - register unsigned long _subcode asm("0") = *subcode; - register unsigned long _size asm("1") = size; - - asm volatile( - " diag %2,%0,0x204\n" - "0: nopr %%r7\n" - EX_TABLE(0b,0b) - : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory"); - *subcode = _subcode; - return _size; -} - -static int diag204(unsigned long subcode, unsigned long size, void *addr) -{ - diag_stat_inc(DIAG_STAT_X204); - size = __diag204(&subcode, size, addr); - if (subcode) - return -1; - return size; -} - /* * For the old diag subcode 4 with simple data format we have to use real * memory. If we use subcode 6 or 7 with extended data format, we can (and @@ -411,12 +261,12 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages) *pages = diag204_buf_pages; return diag204_buf; } - if (fmt == INFO_SIMPLE) { + if (fmt == DIAG204_INFO_SIMPLE) { *pages = 1; return diag204_alloc_rbuf(); - } else {/* INFO_EXT */ - *pages = diag204((unsigned long)SUBC_RSI | - (unsigned long)INFO_EXT, 0, NULL); + } else {/* DIAG204_INFO_EXT */ + *pages = diag204((unsigned long)DIAG204_SUBC_RSI | + (unsigned long)DIAG204_INFO_EXT, 0, NULL); if (*pages <= 0) return ERR_PTR(-ENOSYS); else @@ -443,18 +293,18 @@ static int diag204_probe(void) void *buf; int pages, rc; - buf = diag204_get_buffer(INFO_EXT, &pages); + buf = diag204_get_buffer(DIAG204_INFO_EXT, &pages); if (!IS_ERR(buf)) { - if (diag204((unsigned long)SUBC_STIB7 | - (unsigned long)INFO_EXT, pages, buf) >= 0) { - diag204_store_sc = SUBC_STIB7; - diag204_info_type = INFO_EXT; + if (diag204((unsigned long)DIAG204_SUBC_STIB7 | + (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { + diag204_store_sc = DIAG204_SUBC_STIB7; + diag204_info_type = DIAG204_INFO_EXT; goto out; } - if (diag204((unsigned long)SUBC_STIB6 | - (unsigned long)INFO_EXT, pages, buf) >= 0) { - diag204_store_sc = SUBC_STIB6; - diag204_info_type = INFO_EXT; + if (diag204((unsigned long)DIAG204_SUBC_STIB6 | + (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { + diag204_store_sc = DIAG204_SUBC_STIB6; + diag204_info_type = DIAG204_INFO_EXT; goto out; } diag204_free_buffer(); @@ -462,15 +312,15 @@ static int diag204_probe(void) /* subcodes 6 and 7 failed, now try subcode 4 */ - buf = diag204_get_buffer(INFO_SIMPLE, &pages); + buf = diag204_get_buffer(DIAG204_INFO_SIMPLE, &pages); if (IS_ERR(buf)) { rc = PTR_ERR(buf); goto fail_alloc; } - if (diag204((unsigned long)SUBC_STIB4 | - (unsigned long)INFO_SIMPLE, pages, buf) >= 0) { - diag204_store_sc = SUBC_STIB4; - diag204_info_type = INFO_SIMPLE; + if (diag204((unsigned long)DIAG204_SUBC_STIB4 | + (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) { + diag204_store_sc = DIAG204_SUBC_STIB4; + diag204_info_type = DIAG204_INFO_SIMPLE; goto out; } else { rc = -ENOSYS; @@ -510,20 +360,6 @@ static void *diag204_store(void) /* Diagnose 224 functions */ -static int diag224(void *ptr) -{ - int rc = -EOPNOTSUPP; - - diag_stat_inc(DIAG_STAT_X224); - asm volatile( - " diag %1,%2,0x224\n" - "0: lhi %0,0x0\n" - "1:\n" - EX_TABLE(0b,1b) - : "+d" (rc) :"d" (0), "d" (ptr) : "memory"); - return rc; -} - static int diag224_get_name_table(void) { /* memory must be below 2GB */ @@ -545,9 +381,9 @@ static void diag224_delete_name_table(void) static int diag224_idx2name(int index, char *name) { - memcpy(name, diag224_cpu_names + ((index + 1) * CPU_NAME_LEN), - CPU_NAME_LEN); - name[CPU_NAME_LEN] = 0; + memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN), + DIAG204_CPU_NAME_LEN); + name[DIAG204_CPU_NAME_LEN] = 0; strim(name); return 0; } @@ -603,7 +439,7 @@ __init int hypfs_diag_init(void) pr_err("The hardware system does not support hypfs\n"); return -ENODATA; } - if (diag204_info_type == INFO_EXT) { + if (diag204_info_type == DIAG204_INFO_EXT) { rc = hypfs_dbfs_create_file(&dbfs_file_d204); if (rc) return rc; @@ -651,7 +487,7 @@ static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) cpu_info__lp_time(diag204_info_type, cpu_info)); if (IS_ERR(rc)) return PTR_ERR(rc); - if (diag204_info_type == INFO_EXT) { + if (diag204_info_type == DIAG204_INFO_EXT) { rc = hypfs_create_u64(cpu_dir, "onlinetime", cpu_info__online_time(diag204_info_type, cpu_info)); @@ -667,12 +503,12 @@ static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) { struct dentry *cpus_dir; struct dentry *lpar_dir; - char lpar_name[LPAR_NAME_LEN + 1]; + char lpar_name[DIAG204_LPAR_NAME_LEN + 1]; void *cpu_info; int i; part_hdr__part_name(diag204_info_type, part_hdr, lpar_name); - lpar_name[LPAR_NAME_LEN] = 0; + lpar_name[DIAG204_LPAR_NAME_LEN] = 0; lpar_dir = hypfs_mkdir(systems_dir, lpar_name); if (IS_ERR(lpar_dir)) return lpar_dir; @@ -755,7 +591,8 @@ int hypfs_diag_create_files(struct dentry *root) goto err_out; } } - if (info_blk_hdr__flags(diag204_info_type, time_hdr) & LPAR_PHYS_FLG) { + if (info_blk_hdr__flags(diag204_info_type, time_hdr) & + DIAG204_LPAR_PHYS_FLG) { ptr = hypfs_create_phys_files(root, part_hdr); if (IS_ERR(ptr)) { rc = PTR_ERR(ptr); diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index 1a82cf26ee11..d28621de8e0b 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -20,6 +20,9 @@ #define CPACF_KMC 0xb92f /* MSA */ #define CPACF_KIMD 0xb93e /* MSA */ #define CPACF_KLMD 0xb93f /* MSA */ +#define CPACF_PCKMO 0xb928 /* MSA3 */ +#define CPACF_KMF 0xb92a /* MSA4 */ +#define CPACF_KMO 0xb92b /* MSA4 */ #define CPACF_PCC 0xb92c /* MSA4 */ #define CPACF_KMCTR 0xb92d /* MSA4 */ #define CPACF_PPNO 0xb93c /* MSA5 */ @@ -136,6 +139,7 @@ static inline void __cpacf_query(unsigned int opcode, unsigned char *status) register unsigned long r1 asm("1") = (unsigned long) status; asm volatile( + " spm 0\n" /* pckmo doesn't change the cc */ /* Parameter registers are ignored, but may not be 0 */ "0: .insn rrf,%[opc] << 16,2,2,2,0\n" " brc 1,0b\n" /* handle partial completion */ @@ -157,6 +161,12 @@ static inline int cpacf_query(unsigned int opcode, unsigned int func) if (!test_facility(17)) /* check for MSA */ return 0; break; + case CPACF_PCKMO: + if (!test_facility(76)) /* check for MSA3 */ + return 0; + break; + case CPACF_KMF: + case CPACF_KMO: case CPACF_PCC: case CPACF_KMCTR: if (!test_facility(77)) /* check for MSA4 */ diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 86cae09e076a..8acf482162ed 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -78,4 +78,153 @@ struct diag210 { extern int diag210(struct diag210 *addr); +/* bit is set in flags, when physical cpu info is included in diag 204 data */ +#define DIAG204_LPAR_PHYS_FLG 0x80 +#define DIAG204_LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */ +#define DIAG204_CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */ + +/* diag 204 subcodes */ +enum diag204_sc { + DIAG204_SUBC_STIB4 = 4, + DIAG204_SUBC_RSI = 5, + DIAG204_SUBC_STIB6 = 6, + DIAG204_SUBC_STIB7 = 7 +}; + +/* The two available diag 204 data formats */ +enum diag204_format { + DIAG204_INFO_SIMPLE = 0, + DIAG204_INFO_EXT = 0x00010000 +}; + +enum diag204_cpu_flags { + DIAG204_CPU_ONLINE = 0x20, + DIAG204_CPU_CAPPED = 0x40, +}; + +struct diag204_info_blk_hdr { + __u8 npar; + __u8 flags; + __u16 tslice; + __u16 phys_cpus; + __u16 this_part; + __u64 curtod; +} __packed; + +struct diag204_x_info_blk_hdr { + __u8 npar; + __u8 flags; + __u16 tslice; + __u16 phys_cpus; + __u16 this_part; + __u64 curtod1; + __u64 curtod2; + char reserved[40]; +} __packed; + +struct diag204_part_hdr { + __u8 pn; + __u8 cpus; + char reserved[6]; + char part_name[DIAG204_LPAR_NAME_LEN]; +} __packed; + +struct diag204_x_part_hdr { + __u8 pn; + __u8 cpus; + __u8 rcpus; + __u8 pflag; + __u32 mlu; + char part_name[DIAG204_LPAR_NAME_LEN]; + char lpc_name[8]; + char os_name[8]; + __u64 online_cs; + __u64 online_es; + __u8 upid; + __u8 reserved:3; + __u8 mtid:5; + char reserved1[2]; + __u32 group_mlu; + char group_name[8]; + char hardware_group_name[8]; + char reserved2[24]; +} __packed; + +struct diag204_cpu_info { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + __u8 cflag; + __u16 weight; + __u64 acc_time; + __u64 lp_time; +} __packed; + +struct diag204_x_cpu_info { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + __u8 cflag; + __u16 weight; + __u64 acc_time; + __u64 lp_time; + __u16 min_weight; + __u16 cur_weight; + __u16 max_weight; + char reseved2[2]; + __u64 online_time; + __u64 wait_time; + __u32 pma_weight; + __u32 polar_weight; + __u32 cpu_type_cap; + __u32 group_cpu_type_cap; + char reserved3[32]; +} __packed; + +struct diag204_phys_hdr { + char reserved1[1]; + __u8 cpus; + char reserved2[6]; + char mgm_name[8]; +} __packed; + +struct diag204_x_phys_hdr { + char reserved1[1]; + __u8 cpus; + char reserved2[6]; + char mgm_name[8]; + char reserved3[80]; +} __packed; + +struct diag204_phys_cpu { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + char reserved2[3]; + __u64 mgm_time; + char reserved3[8]; +} __packed; + +struct diag204_x_phys_cpu { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + char reserved2[1]; + __u16 weight; + __u64 mgm_time; + char reserved3[80]; +} __packed; + +struct diag204_x_part_block { + struct diag204_x_part_hdr hdr; + struct diag204_x_cpu_info cpus[]; +} __packed; + +struct diag204_x_phys_block { + struct diag204_x_phys_hdr hdr; + struct diag204_x_phys_cpu cpus[]; +} __packed; + +int diag204(unsigned long subcode, unsigned long size, void *addr); +int diag224(void *ptr); #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index d054c1b07a3c..741ddba0bf11 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -10,14 +10,25 @@ /** * struct gmap_struct - guest address space + * @list: list head for the mm->context gmap list * @crst_list: list of all crst tables used in the guest address space * @mm: pointer to the parent mm_struct * @guest_to_host: radix tree with guest to host address translation * @host_to_guest: radix tree with pointer to segment table entries * @guest_table_lock: spinlock to protect all entries in the guest page table + * @ref_count: reference counter for the gmap structure * @table: pointer to the page directory * @asce: address space control element for gmap page table * @pfault_enabled: defines if pfaults are applicable for the guest + * @host_to_rmap: radix tree with gmap_rmap lists + * @children: list of shadow gmap structures + * @pt_list: list of all page tables used in the shadow guest address space + * @shadow_lock: spinlock to protect the shadow gmap list + * @parent: pointer to the parent gmap for shadow guest address spaces + * @orig_asce: ASCE for which the shadow page table has been created + * @edat_level: edat level to be used for the shadow translation + * @removed: flag to indicate if a shadow guest address space has been removed + * @initialized: flag to indicate if a shadow guest address space can be used */ struct gmap { struct list_head list; @@ -26,26 +37,64 @@ struct gmap { struct radix_tree_root guest_to_host; struct radix_tree_root host_to_guest; spinlock_t guest_table_lock; + atomic_t ref_count; unsigned long *table; unsigned long asce; unsigned long asce_end; void *private; bool pfault_enabled; + /* Additional data for shadow guest address spaces */ + struct radix_tree_root host_to_rmap; + struct list_head children; + struct list_head pt_list; + spinlock_t shadow_lock; + struct gmap *parent; + unsigned long orig_asce; + int edat_level; + bool removed; + bool initialized; }; +/** + * struct gmap_rmap - reverse mapping for shadow page table entries + * @next: pointer to next rmap in the list + * @raddr: virtual rmap address in the shadow guest address space + */ +struct gmap_rmap { + struct gmap_rmap *next; + unsigned long raddr; +}; + +#define gmap_for_each_rmap(pos, head) \ + for (pos = (head); pos; pos = pos->next) + +#define gmap_for_each_rmap_safe(pos, n, head) \ + for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n) + /** * struct gmap_notifier - notify function block for page invalidation * @notifier_call: address of callback function */ struct gmap_notifier { struct list_head list; - void (*notifier_call)(struct gmap *gmap, unsigned long gaddr); + struct rcu_head rcu; + void (*notifier_call)(struct gmap *gmap, unsigned long start, + unsigned long end); }; -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit); -void gmap_free(struct gmap *gmap); +static inline int gmap_is_shadow(struct gmap *gmap) +{ + return !!gmap->parent; +} + +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); +void gmap_remove(struct gmap *gmap); +struct gmap *gmap_get(struct gmap *gmap); +void gmap_put(struct gmap *gmap); + void gmap_enable(struct gmap *gmap); void gmap_disable(struct gmap *gmap); +struct gmap *gmap_get_enabled(void); int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long to, unsigned long len); int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); @@ -57,8 +106,29 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to); void __gmap_zap(struct gmap *, unsigned long gaddr); void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); -void gmap_register_ipte_notifier(struct gmap_notifier *); -void gmap_unregister_ipte_notifier(struct gmap_notifier *); -int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len); +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); + +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level); +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake); +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake); +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake); +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake); +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, int *fake); +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); + +void gmap_register_pte_notifier(struct gmap_notifier *); +void gmap_unregister_pte_notifier(struct gmap_notifier *); +void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *, + unsigned long bits); + +int gmap_mprotect_notify(struct gmap *, unsigned long start, + unsigned long len, int prot); #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index ac82e8eb936d..8e5daf7a76ce 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -43,6 +43,7 @@ /* s390-specific vcpu->requests bit members */ #define KVM_REQ_ENABLE_IBS 8 #define KVM_REQ_DISABLE_IBS 9 +#define KVM_REQ_ICPT_OPEREXC 10 #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -145,7 +146,7 @@ struct kvm_s390_sie_block { __u64 cputm; /* 0x0028 */ __u64 ckc; /* 0x0030 */ __u64 epoch; /* 0x0038 */ - __u8 reserved40[4]; /* 0x0040 */ + __u32 svcc; /* 0x0040 */ #define LCTL_CR0 0x8000 #define LCTL_CR6 0x0200 #define LCTL_CR9 0x0040 @@ -154,6 +155,7 @@ struct kvm_s390_sie_block { #define LCTL_CR14 0x0002 __u16 lctl; /* 0x0044 */ __s16 icpua; /* 0x0046 */ +#define ICTL_OPEREXC 0x80000000 #define ICTL_PINT 0x20000000 #define ICTL_LPSW 0x00400000 #define ICTL_STCTL 0x00040000 @@ -166,6 +168,9 @@ struct kvm_s390_sie_block { #define ICPT_INST 0x04 #define ICPT_PROGI 0x08 #define ICPT_INSTPROGI 0x0C +#define ICPT_EXTINT 0x14 +#define ICPT_VALIDITY 0x20 +#define ICPT_STOP 0x28 #define ICPT_OPEREXC 0x2C #define ICPT_PARTEXEC 0x38 #define ICPT_IOINST 0x40 @@ -185,7 +190,9 @@ struct kvm_s390_sie_block { __u32 scaol; /* 0x0064 */ __u8 reserved68[4]; /* 0x0068 */ __u32 todpr; /* 0x006c */ - __u8 reserved70[32]; /* 0x0070 */ + __u8 reserved70[16]; /* 0x0070 */ + __u64 mso; /* 0x0080 */ + __u64 msl; /* 0x0088 */ psw_t gpsw; /* 0x0090 */ __u64 gg14; /* 0x00a0 */ __u64 gg15; /* 0x00a8 */ @@ -223,7 +230,7 @@ struct kvm_s390_sie_block { __u8 reserved1e6[2]; /* 0x01e6 */ __u64 itdba; /* 0x01e8 */ __u64 riccbd; /* 0x01f0 */ - __u8 reserved1f8[8]; /* 0x01f8 */ + __u64 gvrd; /* 0x01f8 */ } __attribute__((packed)); struct kvm_s390_itdb { @@ -256,6 +263,7 @@ struct kvm_vcpu_stat { u32 instruction_stctg; u32 exit_program_interruption; u32 exit_instr_and_program; + u32 exit_operation_exception; u32 deliver_external_call; u32 deliver_emergency_signal; u32 deliver_service_signal; @@ -278,7 +286,9 @@ struct kvm_vcpu_stat { u32 instruction_stsi; u32 instruction_stfl; u32 instruction_tprot; + u32 instruction_sie; u32 instruction_essa; + u32 instruction_sthyi; u32 instruction_sigp_sense; u32 instruction_sigp_sense_running; u32 instruction_sigp_external_call; @@ -541,12 +551,16 @@ struct kvm_guestdbg_info_arch { struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; + /* if vsie is active, currently executed shadow sie control block */ + struct kvm_s390_sie_block *vsie_block; unsigned int host_acrs[NUM_ACRS]; struct fpu host_fpregs; struct kvm_s390_local_interrupt local_int; struct hrtimer ckc_timer; struct kvm_s390_pgm_info pgm; struct gmap *gmap; + /* backup location for the currently enabled gmap when scheduled out */ + struct gmap *enabled_gmap; struct kvm_guestdbg_info_arch guestdbg; unsigned long pfault_token; unsigned long pfault_select; @@ -631,6 +645,14 @@ struct sie_page2 { u8 reserved900[0x1000 - 0x900]; /* 0x0900 */ } __packed; +struct kvm_s390_vsie { + struct mutex mutex; + struct radix_tree_root addr_to_page; + int page_count; + int next; + struct page *pages[KVM_MAX_VCPUS]; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -646,15 +668,20 @@ struct kvm_arch{ int user_cpu_state_ctrl; int user_sigp; int user_stsi; + int user_instr0; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; int ipte_lock_count; struct mutex ipte_mutex; + struct ratelimit_state sthyi_limit; spinlock_t start_stop_lock; struct sie_page2 *sie_page2; struct kvm_s390_cpu_model model; struct kvm_s390_crypto crypto; + struct kvm_s390_vsie vsie; u64 epoch; + /* subset of available cpu features enabled by user space */ + DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 18226437a832..6d39329c894b 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -8,8 +8,9 @@ typedef struct { cpumask_t cpu_attach_mask; atomic_t flush_count; unsigned int flush_mm; - spinlock_t list_lock; + spinlock_t pgtable_lock; struct list_head pgtable_list; + spinlock_t gmap_lock; struct list_head gmap_list; unsigned long asce; unsigned long asce_limit; @@ -22,9 +23,11 @@ typedef struct { unsigned int use_skey:1; } mm_context_t; -#define INIT_MM_CONTEXT(name) \ - .context.list_lock = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \ - .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ +#define INIT_MM_CONTEXT(name) \ + .context.pgtable_lock = \ + __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \ + .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ + .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \ .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), static inline int tprot(unsigned long addr) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index f77c638bf397..c6a088c91aee 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -15,8 +15,9 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { - spin_lock_init(&mm->context.list_lock); + spin_lock_init(&mm->context.pgtable_lock); INIT_LIST_HEAD(&mm->context.pgtable_list); + spin_lock_init(&mm->context.gmap_lock); INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.flush_count, 0); diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index b2146c4119b2..69b8a41fca84 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -111,13 +111,14 @@ static inline unsigned char page_get_storage_key(unsigned long addr) static inline int page_reset_referenced(unsigned long addr) { - unsigned int ipm; + int cc; asm volatile( " rrbe 0,%1\n" " ipm %0\n" - : "=d" (ipm) : "a" (addr) : "cc"); - return !!(ipm & 0x20000000); + " srl %0,28\n" + : "=d" (cc) : "a" (addr) : "cc"); + return cc; } /* Bits int the storage key */ @@ -148,6 +149,8 @@ static inline int devmem_is_allowed(unsigned long pfn) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) +#define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index da34cb6b1f3b..f4eb9843eed4 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *); void crst_table_free(struct mm_struct *, unsigned long *); unsigned long *page_table_alloc(struct mm_struct *); +struct page *page_table_alloc_pgste(struct mm_struct *mm); void page_table_free(struct mm_struct *, unsigned long *); void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); +void page_table_free_pgste(struct page *page); extern int page_table_allocate_pgste; static inline void clear_table(unsigned long *s, unsigned long val, size_t n) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 48d383af078f..72c7f60bfe83 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -277,6 +277,7 @@ static inline int is_module_addr(void *addr) /* Bits in the region table entry */ #define _REGION_ENTRY_ORIGIN ~0xfffUL/* region/segment table origin */ #define _REGION_ENTRY_PROTECT 0x200 /* region protection bit */ +#define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */ #define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ #define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */ #define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */ @@ -364,6 +365,7 @@ static inline int is_module_addr(void *addr) #define PGSTE_GC_BIT 0x0002000000000000UL #define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */ #define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */ +#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */ /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL @@ -1002,15 +1004,26 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry); void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void ptep_notify(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long bits); +int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr, + pte_t *ptep, int prot, unsigned long bit); void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_t *ptep , int reset); void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte); +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq); -unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr); +int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, unsigned char *oldkey, + bool nq, bool mr, bool mc); +int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr); +int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char *key); /* * Certain architectures need to do special things when PTEs diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 09529202ea77..03323175de30 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -112,6 +112,8 @@ struct thread_struct { unsigned long ksp; /* kernel stack pointer */ mm_segment_t mm_segment; unsigned long gmap_addr; /* address of last gmap fault. */ + unsigned int gmap_write_flag; /* gmap fault write indication */ + unsigned int gmap_int_code; /* int code of last gmap fault */ unsigned int gmap_pfault; /* signal of a pending guest pfault */ struct per_regs per_user; /* User specified PER registers */ struct per_event per_event; /* Cause of the last PER trap */ diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index e4f6f73afe2f..2ad9c204b1a2 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -32,12 +32,19 @@ struct sclp_core_entry { u8 reserved0; u8 : 4; u8 sief2 : 1; - u8 : 3; - u8 : 3; + u8 skey : 1; + u8 : 2; + u8 : 2; + u8 gpere : 1; u8 siif : 1; u8 sigpif : 1; u8 : 3; - u8 reserved2[10]; + u8 reserved2[3]; + u8 : 2; + u8 ib : 1; + u8 cei : 1; + u8 : 4; + u8 reserved3[6]; u8 type; u8 reserved1; } __attribute__((packed)); @@ -59,6 +66,15 @@ struct sclp_info { unsigned char has_hvs : 1; unsigned char has_esca : 1; unsigned char has_sief2 : 1; + unsigned char has_64bscao : 1; + unsigned char has_gpere : 1; + unsigned char has_cmma : 1; + unsigned char has_gsls : 1; + unsigned char has_ib : 1; + unsigned char has_cei : 1; + unsigned char has_pfmfi : 1; + unsigned char has_ibs : 1; + unsigned char has_skey : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; @@ -101,5 +117,6 @@ int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count); int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count); void sclp_early_detect(void); void _sclp_print_early(const char *); +void sclp_ocf_cpc_name_copy(char *dst); #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 3b8e99ef9d58..a2ffec4139ad 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -93,6 +93,47 @@ struct kvm_s390_vm_cpu_machine { __u64 fac_list[256]; }; +#define KVM_S390_VM_CPU_PROCESSOR_FEAT 2 +#define KVM_S390_VM_CPU_MACHINE_FEAT 3 + +#define KVM_S390_VM_CPU_FEAT_NR_BITS 1024 +#define KVM_S390_VM_CPU_FEAT_ESOP 0 +#define KVM_S390_VM_CPU_FEAT_SIEF2 1 +#define KVM_S390_VM_CPU_FEAT_64BSCAO 2 +#define KVM_S390_VM_CPU_FEAT_SIIF 3 +#define KVM_S390_VM_CPU_FEAT_GPERE 4 +#define KVM_S390_VM_CPU_FEAT_GSLS 5 +#define KVM_S390_VM_CPU_FEAT_IB 6 +#define KVM_S390_VM_CPU_FEAT_CEI 7 +#define KVM_S390_VM_CPU_FEAT_IBS 8 +#define KVM_S390_VM_CPU_FEAT_SKEY 9 +#define KVM_S390_VM_CPU_FEAT_CMMA 10 +#define KVM_S390_VM_CPU_FEAT_PFMFI 11 +#define KVM_S390_VM_CPU_FEAT_SIGPIF 12 +struct kvm_s390_vm_cpu_feat { + __u64 feat[16]; +}; + +#define KVM_S390_VM_CPU_PROCESSOR_SUBFUNC 4 +#define KVM_S390_VM_CPU_MACHINE_SUBFUNC 5 +/* for "test bit" instructions MSB 0 bit ordering, for "query" raw blocks */ +struct kvm_s390_vm_cpu_subfunc { + __u8 plo[32]; /* always */ + __u8 ptff[16]; /* with TOD-clock steering */ + __u8 kmac[16]; /* with MSA */ + __u8 kmc[16]; /* with MSA */ + __u8 km[16]; /* with MSA */ + __u8 kimd[16]; /* with MSA */ + __u8 klmd[16]; /* with MSA */ + __u8 pckmo[16]; /* with MSA3 */ + __u8 kmctr[16]; /* with MSA4 */ + __u8 kmf[16]; /* with MSA4 */ + __u8 kmo[16]; /* with MSA4 */ + __u8 pcc[16]; /* with MSA4 */ + __u8 ppno[16]; /* with MSA5 */ + __u8 reserved[1824]; +}; + /* kvm attributes for crypto */ #define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h index 8fb5d4a6dd25..3ac634368939 100644 --- a/arch/s390/include/uapi/asm/sie.h +++ b/arch/s390/include/uapi/asm/sie.h @@ -140,6 +140,7 @@ exit_code_ipa0(0xB2, 0x4c, "TAR"), \ exit_code_ipa0(0xB2, 0x50, "CSP"), \ exit_code_ipa0(0xB2, 0x54, "MVPG"), \ + exit_code_ipa0(0xB2, 0x56, "STHYI"), \ exit_code_ipa0(0xB2, 0x58, "BSG"), \ exit_code_ipa0(0xB2, 0x5a, "BSA"), \ exit_code_ipa0(0xB2, 0x5f, "CHSC"), \ diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index 48b37b8357e6..a97354c8c667 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -162,6 +162,30 @@ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) } EXPORT_SYMBOL(diag14); +static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr) +{ + register unsigned long _subcode asm("0") = *subcode; + register unsigned long _size asm("1") = size; + + asm volatile( + " diag %2,%0,0x204\n" + "0: nopr %%r7\n" + EX_TABLE(0b,0b) + : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory"); + *subcode = _subcode; + return _size; +} + +int diag204(unsigned long subcode, unsigned long size, void *addr) +{ + diag_stat_inc(DIAG_STAT_X204); + size = __diag204(&subcode, size, addr); + if (subcode) + return -1; + return size; +} +EXPORT_SYMBOL(diag204); + /* * Diagnose 210: Get information about a virtual device */ @@ -196,3 +220,18 @@ int diag210(struct diag210 *addr) return ccode; } EXPORT_SYMBOL(diag210); + +int diag224(void *ptr) +{ + int rc = -EOPNOTSUPP; + + diag_stat_inc(DIAG_STAT_X224); + asm volatile( + " diag %1,%2,0x224\n" + "0: lhi %0,0x0\n" + "1:\n" + EX_TABLE(0b,1b) + : "+d" (rc) :"d" (0), "d" (ptr) : "memory"); + return rc; +} +EXPORT_SYMBOL(diag224); diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index d42fa38c2429..09a9e6dfc09f 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o +kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 1ea4095b67d7..ce865bd4f81d 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -212,6 +212,11 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) return -EOPNOTSUPP; + VCPU_EVENT(vcpu, 4, "diag 0x500 schid 0x%8.8x queue 0x%x cookie 0x%llx", + (u32) vcpu->run->s.regs.gprs[2], + (u32) vcpu->run->s.regs.gprs[3], + vcpu->run->s.regs.gprs[4]); + /* * The layout is as follows: * - gpr 2 contains the subchannel id (passed as addr) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 66938d283b77..54200208bf24 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" #include @@ -476,18 +477,73 @@ enum { FSI_FETCH = 2 /* Exception was due to fetch operation */ }; +enum prot_type { + PROT_TYPE_LA = 0, + PROT_TYPE_KEYC = 1, + PROT_TYPE_ALC = 2, + PROT_TYPE_DAT = 3, +}; + +static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, + ar_t ar, enum gacc_mode mode, enum prot_type prot) +{ + struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; + struct trans_exc_code_bits *tec; + + memset(pgm, 0, sizeof(*pgm)); + pgm->code = code; + tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; + + switch (code) { + case PGM_ASCE_TYPE: + case PGM_PAGE_TRANSLATION: + case PGM_REGION_FIRST_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_THIRD_TRANS: + case PGM_SEGMENT_TRANSLATION: + /* + * op_access_id only applies to MOVE_PAGE -> set bit 61 + * exc_access_id has to be set to 0 for some instructions. Both + * cases have to be handled by the caller. We can always store + * exc_access_id, as it is undefined for non-ar cases. + */ + tec->addr = gva >> PAGE_SHIFT; + tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; + tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; + /* FALL THROUGH */ + case PGM_ALEN_TRANSLATION: + case PGM_ALE_SEQUENCE: + case PGM_ASTE_VALIDITY: + case PGM_ASTE_SEQUENCE: + case PGM_EXTENDED_AUTHORITY: + pgm->exc_access_id = ar; + break; + case PGM_PROTECTION: + switch (prot) { + case PROT_TYPE_ALC: + tec->b60 = 1; + /* FALL THROUGH */ + case PROT_TYPE_DAT: + tec->b61 = 1; + tec->addr = gva >> PAGE_SHIFT; + tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; + tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; + /* exc_access_id is undefined for most cases */ + pgm->exc_access_id = ar; + break; + default: /* LA and KEYC set b61 to 0, other params undefined */ + break; + } + break; + } + return code; +} + static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, - ar_t ar, enum gacc_mode mode) + unsigned long ga, ar_t ar, enum gacc_mode mode) { int rc; struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw); - struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; - struct trans_exc_code_bits *tec_bits; - - memset(pgm, 0, sizeof(*pgm)); - tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code; - tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; - tec_bits->as = psw.as; if (!psw.t) { asce->val = 0; @@ -510,21 +566,8 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, return 0; case PSW_AS_ACCREG: rc = ar_translation(vcpu, asce, ar, mode); - switch (rc) { - case PGM_ALEN_TRANSLATION: - case PGM_ALE_SEQUENCE: - case PGM_ASTE_VALIDITY: - case PGM_ASTE_SEQUENCE: - case PGM_EXTENDED_AUTHORITY: - vcpu->arch.pgm.exc_access_id = ar; - break; - case PGM_PROTECTION: - tec_bits->b60 = 1; - tec_bits->b61 = 1; - break; - } if (rc > 0) - pgm->code = rc; + return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC); return rc; } return 0; @@ -729,40 +772,31 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, +static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, unsigned long *pages, unsigned long nr_pages, const union asce asce, enum gacc_mode mode) { - struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; psw_t *psw = &vcpu->arch.sie_block->gpsw; - struct trans_exc_code_bits *tec_bits; - int lap_enabled, rc; + int lap_enabled, rc = 0; - tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code; lap_enabled = low_address_protection_enabled(vcpu, asce); while (nr_pages) { ga = kvm_s390_logical_to_effective(vcpu, ga); - tec_bits->addr = ga >> PAGE_SHIFT; - if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) { - pgm->code = PGM_PROTECTION; - return pgm->code; - } + if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) + return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, + PROT_TYPE_LA); ga &= PAGE_MASK; if (psw_bits(*psw).t) { rc = guest_translate(vcpu, ga, pages, asce, mode); if (rc < 0) return rc; - if (rc == PGM_PROTECTION) - tec_bits->b61 = 1; - if (rc) - pgm->code = rc; } else { *pages = kvm_s390_real_to_abs(vcpu, ga); if (kvm_is_error_gpa(vcpu->kvm, *pages)) - pgm->code = PGM_ADDRESSING; + rc = PGM_ADDRESSING; } - if (pgm->code) - return pgm->code; + if (rc) + return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT); ga += PAGE_SIZE; pages++; nr_pages--; @@ -783,7 +817,8 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, if (!len) return 0; - rc = get_vcpu_asce(vcpu, &asce, ar, mode); + ga = kvm_s390_logical_to_effective(vcpu, ga); + rc = get_vcpu_asce(vcpu, &asce, ga, ar, mode); if (rc) return rc; nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1; @@ -795,7 +830,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, need_ipte_lock = psw_bits(*psw).t && !asce.r; if (need_ipte_lock) ipte_lock(vcpu); - rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode); + rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode); for (idx = 0; idx < nr_pages && !rc; idx++) { gpa = *(pages + idx) + (ga & ~PAGE_MASK); _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len); @@ -846,37 +881,28 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, unsigned long *gpa, enum gacc_mode mode) { - struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; psw_t *psw = &vcpu->arch.sie_block->gpsw; - struct trans_exc_code_bits *tec; union asce asce; int rc; gva = kvm_s390_logical_to_effective(vcpu, gva); - tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; - rc = get_vcpu_asce(vcpu, &asce, ar, mode); - tec->addr = gva >> PAGE_SHIFT; + rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); if (rc) return rc; if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) { - if (mode == GACC_STORE) { - rc = pgm->code = PGM_PROTECTION; - return rc; - } + if (mode == GACC_STORE) + return trans_exc(vcpu, PGM_PROTECTION, gva, 0, + mode, PROT_TYPE_LA); } if (psw_bits(*psw).t && !asce.r) { /* Use DAT? */ rc = guest_translate(vcpu, gva, gpa, asce, mode); - if (rc > 0) { - if (rc == PGM_PROTECTION) - tec->b61 = 1; - pgm->code = rc; - } + if (rc > 0) + return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT); } else { - rc = 0; *gpa = kvm_s390_real_to_abs(vcpu, gva); if (kvm_is_error_gpa(vcpu->kvm, *gpa)) - rc = pgm->code = PGM_ADDRESSING; + return trans_exc(vcpu, rc, gva, PGM_ADDRESSING, mode, 0); } return rc; @@ -915,20 +941,247 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, */ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) { - struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; - psw_t *psw = &vcpu->arch.sie_block->gpsw; - struct trans_exc_code_bits *tec_bits; union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]}; if (!ctlreg0.lap || !is_low_address(gra)) return 0; - - memset(pgm, 0, sizeof(*pgm)); - tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code; - tec_bits->fsi = FSI_STORE; - tec_bits->as = psw_bits(*psw).as; - tec_bits->addr = gra >> PAGE_SHIFT; - pgm->code = PGM_PROTECTION; - - return pgm->code; + return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA); +} + +/** + * kvm_s390_shadow_tables - walk the guest page table and create shadow tables + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: pointer to the page table address result + * @fake: pgt references contiguous guest memory block, not a pgtable + */ +static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, + int *fake) +{ + struct gmap *parent; + union asce asce; + union vaddress vaddr; + unsigned long ptr; + int rc; + + *fake = 0; + *dat_protection = 0; + parent = sg->parent; + vaddr.addr = saddr; + asce.val = sg->orig_asce; + ptr = asce.origin * 4096; + if (asce.r) { + *fake = 1; + asce.dt = ASCE_TYPE_REGION1; + } + switch (asce.dt) { + case ASCE_TYPE_REGION1: + if (vaddr.rfx01 > asce.tl && !asce.r) + return PGM_REGION_FIRST_TRANS; + break; + case ASCE_TYPE_REGION2: + if (vaddr.rfx) + return PGM_ASCE_TYPE; + if (vaddr.rsx01 > asce.tl) + return PGM_REGION_SECOND_TRANS; + break; + case ASCE_TYPE_REGION3: + if (vaddr.rfx || vaddr.rsx) + return PGM_ASCE_TYPE; + if (vaddr.rtx01 > asce.tl) + return PGM_REGION_THIRD_TRANS; + break; + case ASCE_TYPE_SEGMENT: + if (vaddr.rfx || vaddr.rsx || vaddr.rtx) + return PGM_ASCE_TYPE; + if (vaddr.sx01 > asce.tl) + return PGM_SEGMENT_TRANSLATION; + break; + } + + switch (asce.dt) { + case ASCE_TYPE_REGION1: { + union region1_table_entry rfte; + + if (*fake) { + /* offset in 16EB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.rsx << 53UL); + rfte.val = ptr; + goto shadow_r2t; + } + rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); + if (rc) + return rc; + if (rfte.i) + return PGM_REGION_FIRST_TRANS; + if (rfte.tt != TABLE_TYPE_REGION1) + return PGM_TRANSLATION_SPEC; + if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) + return PGM_REGION_SECOND_TRANS; + if (sg->edat_level >= 1) + *dat_protection |= rfte.p; + ptr = rfte.rto << 12UL; +shadow_r2t: + rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); + if (rc) + return rc; + /* fallthrough */ + } + case ASCE_TYPE_REGION2: { + union region2_table_entry rste; + + if (*fake) { + /* offset in 8PB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.rtx << 42UL); + rste.val = ptr; + goto shadow_r3t; + } + rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); + if (rc) + return rc; + if (rste.i) + return PGM_REGION_SECOND_TRANS; + if (rste.tt != TABLE_TYPE_REGION2) + return PGM_TRANSLATION_SPEC; + if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) + return PGM_REGION_THIRD_TRANS; + if (sg->edat_level >= 1) + *dat_protection |= rste.p; + ptr = rste.rto << 12UL; +shadow_r3t: + rste.p |= *dat_protection; + rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); + if (rc) + return rc; + /* fallthrough */ + } + case ASCE_TYPE_REGION3: { + union region3_table_entry rtte; + + if (*fake) { + /* offset in 4TB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.sx << 31UL); + rtte.val = ptr; + goto shadow_sgt; + } + rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); + if (rc) + return rc; + if (rtte.i) + return PGM_REGION_THIRD_TRANS; + if (rtte.tt != TABLE_TYPE_REGION3) + return PGM_TRANSLATION_SPEC; + if (rtte.cr && asce.p && sg->edat_level >= 2) + return PGM_TRANSLATION_SPEC; + if (rtte.fc && sg->edat_level >= 2) { + *dat_protection |= rtte.fc0.p; + *fake = 1; + ptr = rtte.fc1.rfaa << 31UL; + rtte.val = ptr; + goto shadow_sgt; + } + if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) + return PGM_SEGMENT_TRANSLATION; + if (sg->edat_level >= 1) + *dat_protection |= rtte.fc0.p; + ptr = rtte.fc0.sto << 12UL; +shadow_sgt: + rtte.fc0.p |= *dat_protection; + rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); + if (rc) + return rc; + /* fallthrough */ + } + case ASCE_TYPE_SEGMENT: { + union segment_table_entry ste; + + if (*fake) { + /* offset in 2G guest memory block */ + ptr = ptr + ((unsigned long) vaddr.sx << 20UL); + ste.val = ptr; + goto shadow_pgt; + } + rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); + if (rc) + return rc; + if (ste.i) + return PGM_SEGMENT_TRANSLATION; + if (ste.tt != TABLE_TYPE_SEGMENT) + return PGM_TRANSLATION_SPEC; + if (ste.cs && asce.p) + return PGM_TRANSLATION_SPEC; + *dat_protection |= ste.fc0.p; + if (ste.fc && sg->edat_level >= 1) { + *fake = 1; + ptr = ste.fc1.sfaa << 20UL; + ste.val = ptr; + goto shadow_pgt; + } + ptr = ste.fc0.pto << 11UL; +shadow_pgt: + ste.fc0.p |= *dat_protection; + rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); + if (rc) + return rc; + } + } + /* Return the parent address of the page table */ + *pgt = ptr; + return 0; +} + +/** + * kvm_s390_shadow_fault - handle fault on a shadow page table + * @vcpu: virtual cpu + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * + * Returns: - 0 if the shadow fault was successfully resolved + * - > 0 (pgm exception code) on exceptions while faulting + * - -EAGAIN if the caller can retry immediately + * - -EFAULT when accessing invalid guest addresses + * - -ENOMEM if out of memory + */ +int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, + unsigned long saddr) +{ + union vaddress vaddr; + union page_table_entry pte; + unsigned long pgt; + int dat_protection, fake; + int rc; + + down_read(&sg->mm->mmap_sem); + /* + * We don't want any guest-2 tables to change - so the parent + * tables/pointers we read stay valid - unshadowing is however + * always possible - only guest_table_lock protects us. + */ + ipte_lock(vcpu); + + rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); + if (rc) + rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, + &fake); + + vaddr.addr = saddr; + if (fake) { + /* offset in 1MB guest memory block */ + pte.val = pgt + ((unsigned long) vaddr.px << 12UL); + goto shadow_page; + } + if (!rc) + rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); + if (!rc && pte.i) + rc = PGM_PAGE_TRANSLATION; + if (!rc && (pte.z || (pte.co && sg->edat_level < 1))) + rc = PGM_TRANSLATION_SPEC; +shadow_page: + pte.p |= dat_protection; + if (!rc) + rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); + ipte_unlock(vcpu); + up_read(&sg->mm->mmap_sem); + return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index df0a79dd8159..8756569ad938 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -361,4 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu); int ipte_lock_held(struct kvm_vcpu *vcpu); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); +int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, + unsigned long saddr); + #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index e8c6843b9600..31a05330d11c 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -439,6 +439,23 @@ static int debug_exit_required(struct kvm_vcpu *vcpu) #define guest_per_enabled(vcpu) \ (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) +int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) +{ + const u8 ilen = kvm_s390_get_ilen(vcpu); + struct kvm_s390_pgm_info pgm_info = { + .code = PGM_PER, + .per_code = PER_EVENT_IFETCH >> 24, + .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen), + }; + + /* + * The PSW points to the next instruction, therefore the intercepted + * instruction generated a PER i-fetch event. PER address therefore + * points at the previous PSW address (could be an EXECUTE function). + */ + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); +} + static void filter_guest_per_event(struct kvm_vcpu *vcpu) { u32 perc = vcpu->arch.sie_block->perc << 24; @@ -465,7 +482,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) guest_perc &= ~PER_EVENT_IFETCH; /* All other PER events will be given to the guest */ - /* TODO: Check alterated address/address space */ + /* TODO: Check altered address/address space */ vcpu->arch.sie_block->perc = guest_perc >> 24; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 252157181302..dfd0ca2638fa 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -351,8 +351,26 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } +static int handle_operexc(struct kvm_vcpu *vcpu) +{ + vcpu->stat.exit_operation_exception++; + trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa, + vcpu->arch.sie_block->ipb); + + if (vcpu->arch.sie_block->ipa == 0xb256 && + test_kvm_facility(vcpu->kvm, 74)) + return handle_sthyi(vcpu); + + if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) + return -EOPNOTSUPP; + + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); +} + int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { + int rc, per_rc = 0; + if (kvm_is_ucontrol(vcpu->kvm)) return -EOPNOTSUPP; @@ -361,7 +379,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) case 0x18: return handle_noop(vcpu); case 0x04: - return handle_instruction(vcpu); + rc = handle_instruction(vcpu); + break; case 0x08: return handle_prog(vcpu); case 0x14: @@ -372,9 +391,19 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) return handle_validity(vcpu); case 0x28: return handle_stop(vcpu); + case 0x2c: + rc = handle_operexc(vcpu); + break; case 0x38: - return handle_partial_execution(vcpu); + rc = handle_partial_execution(vcpu); + break; default: return -EOPNOTSUPP; } + + /* process PER, also if the instrution is processed in user space */ + if (vcpu->arch.sie_block->icptstatus & 0x02 && + (!rc || rc == -EOPNOTSUPP)) + per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu); + return per_rc ? per_rc : rc; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 5a80af740d3e..24524c0f3ef8 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -28,9 +28,6 @@ #include "gaccess.h" #include "trace-s390.h" -#define IOINT_SCHID_MASK 0x0000ffff -#define IOINT_SSID_MASK 0x00030000 -#define IOINT_CSSID_MASK 0x03fc0000 #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 #define VIRTIO_PARAM 0x0d00 @@ -821,7 +818,14 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_interrupt_info, list); if (inti) { - VCPU_EVENT(vcpu, 4, "deliver: I/O 0x%llx", inti->type); + if (inti->type & KVM_S390_INT_IO_AI_MASK) + VCPU_EVENT(vcpu, 4, "%s", "deliver: I/O (AI)"); + else + VCPU_EVENT(vcpu, 4, "deliver: I/O %x ss %x schid %04x", + inti->io.subchannel_id >> 8, + inti->io.subchannel_id >> 1 & 0x3, + inti->io.subchannel_nr); + vcpu->stat.deliver_io_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, @@ -991,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) swake_up(&vcpu->wq); vcpu->stat.halt_wakeup++; } + /* + * The VCPU might not be sleeping but is executing the VSIE. Let's + * kick it, so it leaves the SIE to process the request. + */ + kvm_s390_vsie_kick(vcpu); } enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) @@ -1415,6 +1424,13 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) } fi->counters[FIRQ_CNTR_IO] += 1; + if (inti->type & KVM_S390_INT_IO_AI_MASK) + VM_EVENT(kvm, 4, "%s", "inject: I/O (AI)"); + else + VM_EVENT(kvm, 4, "inject: I/O %x ss %x schid %04x", + inti->io.subchannel_id >> 8, + inti->io.subchannel_id >> 1 & 0x3, + inti->io.subchannel_nr); isc = int_word_to_isc(inti->io.io_int_word); list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc]; list_add_tail(&inti->list, list); @@ -1531,13 +1547,6 @@ int kvm_s390_inject_vm(struct kvm *kvm, inti->mchk.mcic = s390int->parm64; break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - if (inti->type & KVM_S390_INT_IO_AI_MASK) - VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)"); - else - VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x", - s390int->type & IOINT_CSSID_MASK, - s390int->type & IOINT_SSID_MASK, - s390int->type & IOINT_SCHID_MASK); inti->io.subchannel_id = s390int->parm >> 16; inti->io.subchannel_nr = s390int->parm & 0x0000ffffu; inti->io.io_int_parm = s390int->parm64 >> 32; @@ -2237,7 +2246,8 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, return ret; } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int ret; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 6f5c344cd785..3f3ae4865d57 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -21,11 +21,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -35,6 +37,8 @@ #include #include #include +#include +#include #include "kvm-s390.h" #include "gaccess.h" @@ -64,6 +68,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "exit_pei", VCPU_STAT(exit_pei) }, { "exit_program_interruption", VCPU_STAT(exit_program_interruption) }, { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, + { "exit_operation_exception", VCPU_STAT(exit_operation_exception) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, @@ -94,6 +99,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "instruction_stsi", VCPU_STAT(instruction_stsi) }, { "instruction_stfl", VCPU_STAT(instruction_stfl) }, { "instruction_tprot", VCPU_STAT(instruction_tprot) }, + { "instruction_sthyi", VCPU_STAT(instruction_sthyi) }, + { "instruction_sie", VCPU_STAT(instruction_sie) }, { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) }, { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) }, @@ -119,6 +126,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +/* allow nested virtualization in KVM (if enabled by user space) */ +static int nested; +module_param(nested, int, S_IRUGO); +MODULE_PARM_DESC(nested, "Nested virtualization support"); + /* upper facilities limit for kvm */ unsigned long kvm_s390_fac_list_mask[16] = { 0xffe6000000000000UL, @@ -131,7 +143,13 @@ unsigned long kvm_s390_fac_list_mask_size(void) return ARRAY_SIZE(kvm_s390_fac_list_mask); } +/* available cpu features supported by kvm */ +static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); +/* available subfunctions indicated via query / "test bit" */ +static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; + static struct gmap_notifier gmap_notifier; +static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; /* Section: not file related */ @@ -141,7 +159,8 @@ int kvm_arch_hardware_enable(void) return 0; } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address); +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end); /* * This callback is executed during stop_machine(). All CPUs are therefore @@ -163,6 +182,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, vcpu->arch.sie_block->epoch -= *delta; if (vcpu->arch.cputm_enabled) vcpu->arch.cputm_start += *delta; + if (vcpu->arch.vsie_block) + vcpu->arch.vsie_block->epoch -= *delta; } } return NOTIFY_OK; @@ -175,7 +196,9 @@ static struct notifier_block kvm_clock_notifier = { int kvm_arch_hardware_setup(void) { gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_ipte_notifier(&gmap_notifier); + gmap_register_pte_notifier(&gmap_notifier); + vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; + gmap_register_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_register(&s390_epoch_delta_notifier, &kvm_clock_notifier); return 0; @@ -183,11 +206,109 @@ int kvm_arch_hardware_setup(void) void kvm_arch_hardware_unsetup(void) { - gmap_unregister_ipte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, &kvm_clock_notifier); } +static void allow_cpu_feat(unsigned long nr) +{ + set_bit_inv(nr, kvm_s390_available_cpu_feat); +} + +static inline int plo_test_bit(unsigned char nr) +{ + register unsigned long r0 asm("0") = (unsigned long) nr | 0x100; + int cc = 3; /* subfunction not available */ + + asm volatile( + /* Parameter registers are ignored for "test bit" */ + " plo 0,0,0,0(0)\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (cc) + : "d" (r0) + : "cc"); + return cc == 0; +} + +static void kvm_s390_cpu_feat_init(void) +{ + int i; + + for (i = 0; i < 256; ++i) { + if (plo_test_bit(i)) + kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7); + } + + if (test_facility(28)) /* TOD-clock steering */ + ptff(kvm_s390_available_subfunc.ptff, + sizeof(kvm_s390_available_subfunc.ptff), + PTFF_QAF); + + if (test_facility(17)) { /* MSA */ + __cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac); + __cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc); + __cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km); + __cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd); + __cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd); + } + if (test_facility(76)) /* MSA3 */ + __cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo); + if (test_facility(77)) { /* MSA4 */ + __cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr); + __cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf); + __cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo); + __cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc); + } + if (test_facility(57)) /* MSA5 */ + __cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno); + + if (MACHINE_HAS_ESOP) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); + /* + * We need SIE support, ESOP (PROT_READ protection for gmap_shadow), + * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing). + */ + if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao || + !test_facility(3) || !nested) + return; + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2); + if (sclp.has_64bscao) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO); + if (sclp.has_siif) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF); + if (sclp.has_gpere) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE); + if (sclp.has_gsls) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS); + if (sclp.has_ib) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB); + if (sclp.has_cei) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI); + if (sclp.has_ibs) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS); + /* + * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make + * all skey handling functions read/set the skey from the PGSTE + * instead of the real storage key. + * + * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make + * pages being detected as preserved although they are resident. + * + * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will + * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY. + * + * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and + * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be + * correctly shadowed. We can do that for the PGSTE but not for PTE.I. + * + * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We + * cannot easily shadow the SCA because of the ipte lock. + */ +} + int kvm_arch_init(void *opaque) { kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long)); @@ -199,6 +320,8 @@ int kvm_arch_init(void *opaque) return -ENOMEM; } + kvm_s390_cpu_feat_init(); + /* Register floating interrupt controller interface. */ return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); } @@ -244,6 +367,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_USER_STSI: case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: + case KVM_CAP_S390_USER_INSTR0: r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -251,8 +375,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: - r = sclp.has_esca ? KVM_S390_ESCA_CPU_SLOTS - : KVM_S390_BSCA_CPU_SLOTS; + r = KVM_S390_BSCA_CPU_SLOTS; + if (sclp.has_esca && sclp.has_64bscao) + r = KVM_S390_ESCA_CPU_SLOTS; break; case KVM_CAP_NR_MEMSLOTS: r = KVM_USER_MEM_SLOTS; @@ -335,6 +460,16 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, return r; } +static void icpt_operexc_on_all_vcpus(struct kvm *kvm) +{ + unsigned int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu); + } +} + static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; @@ -355,7 +490,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) break; case KVM_CAP_S390_VECTOR_REGISTERS: mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus)) { + if (kvm->created_vcpus) { r = -EBUSY; } else if (MACHINE_HAS_VX) { set_kvm_facility(kvm->arch.model.fac_mask, 129); @@ -370,7 +505,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) case KVM_CAP_S390_RI: r = -EINVAL; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus)) { + if (kvm->created_vcpus) { r = -EBUSY; } else if (test_facility(64)) { set_kvm_facility(kvm->arch.model.fac_mask, 64); @@ -386,6 +521,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) kvm->arch.user_stsi = 1; r = 0; break; + case KVM_CAP_S390_USER_INSTR0: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0"); + kvm->arch.user_instr0 = 1; + icpt_operexc_on_all_vcpus(kvm); + r = 0; + break; default: r = -EINVAL; break; @@ -418,21 +559,23 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att unsigned int idx; switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: - /* enable CMMA only for z10 and later (EDAT_1) */ - ret = -EINVAL; - if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1) + ret = -ENXIO; + if (!sclp.has_cmma) break; ret = -EBUSY; VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support"); mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus) == 0) { + if (!kvm->created_vcpus) { kvm->arch.use_cmma = 1; ret = 0; } mutex_unlock(&kvm->lock); break; case KVM_S390_VM_MEM_CLR_CMMA: + ret = -ENXIO; + if (!sclp.has_cmma) + break; ret = -EINVAL; if (!kvm->arch.use_cmma) break; @@ -461,20 +604,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - /* gmap_alloc takes last usable address */ + /* gmap_create takes last usable address */ if (new_limit != KVM_S390_NO_MEM_LIMIT) new_limit -= 1; ret = -EBUSY; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus) == 0) { - /* gmap_alloc will round the limit up */ - struct gmap *new = gmap_alloc(current->mm, new_limit); + if (!kvm->created_vcpus) { + /* gmap_create will round the limit up */ + struct gmap *new = gmap_create(current->mm, new_limit); if (!new) { ret = -ENOMEM; } else { - gmap_free(kvm->arch.gmap); + gmap_remove(kvm->arch.gmap); new->private = kvm; kvm->arch.gmap = new; ret = 0; @@ -644,7 +787,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) int ret = 0; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus)) { + if (kvm->created_vcpus) { ret = -EBUSY; goto out; } @@ -676,6 +819,39 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } +static int kvm_s390_set_processor_feat(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_feat data; + int ret = -EBUSY; + + if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data))) + return -EFAULT; + if (!bitmap_subset((unsigned long *) data.feat, + kvm_s390_available_cpu_feat, + KVM_S390_VM_CPU_FEAT_NR_BITS)) + return -EINVAL; + + mutex_lock(&kvm->lock); + if (!atomic_read(&kvm->online_vcpus)) { + bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat, + KVM_S390_VM_CPU_FEAT_NR_BITS); + ret = 0; + } + mutex_unlock(&kvm->lock); + return ret; +} + +static int kvm_s390_set_processor_subfunc(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + /* + * Once supported by kernel + hw, we have to store the subfunctions + * in kvm->arch and remember that user space configured them. + */ + return -ENXIO; +} + static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -684,6 +860,12 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_PROCESSOR: ret = kvm_s390_set_processor(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_FEAT: + ret = kvm_s390_set_processor_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + ret = kvm_s390_set_processor_subfunc(kvm, attr); + break; } return ret; } @@ -732,6 +914,50 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } +static int kvm_s390_get_processor_feat(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_feat data; + + bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat, + KVM_S390_VM_CPU_FEAT_NR_BITS); + if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) + return -EFAULT; + return 0; +} + +static int kvm_s390_get_machine_feat(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_feat data; + + bitmap_copy((unsigned long *) data.feat, + kvm_s390_available_cpu_feat, + KVM_S390_VM_CPU_FEAT_NR_BITS); + if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) + return -EFAULT; + return 0; +} + +static int kvm_s390_get_processor_subfunc(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + /* + * Once we can actually configure subfunctions (kernel + hw support), + * we have to check if they were already set by user space, if so copy + * them from kvm->arch. + */ + return -ENXIO; +} + +static int kvm_s390_get_machine_subfunc(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc, + sizeof(struct kvm_s390_vm_cpu_subfunc))) + return -EFAULT; + return 0; +} static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -743,6 +969,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE: ret = kvm_s390_get_machine(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_FEAT: + ret = kvm_s390_get_processor_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_FEAT: + ret = kvm_s390_get_machine_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + ret = kvm_s390_get_processor_subfunc(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_SUBFUNC: + ret = kvm_s390_get_machine_subfunc(kvm, attr); + break; } return ret; } @@ -803,6 +1041,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: case KVM_S390_VM_MEM_CLR_CMMA: + ret = sclp.has_cmma ? 0 : -ENXIO; + break; case KVM_S390_VM_MEM_LIMIT_SIZE: ret = 0; break; @@ -826,8 +1066,13 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) switch (attr->attr) { case KVM_S390_VM_CPU_PROCESSOR: case KVM_S390_VM_CPU_MACHINE: + case KVM_S390_VM_CPU_PROCESSOR_FEAT: + case KVM_S390_VM_CPU_MACHINE_FEAT: + case KVM_S390_VM_CPU_MACHINE_SUBFUNC: ret = 0; break; + /* configuring subfunctions is not supported yet */ + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: default: ret = -ENXIO; break; @@ -858,7 +1103,6 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; - unsigned long curkey; int i, r = 0; if (args->flags != 0) @@ -879,26 +1123,27 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (!keys) return -ENOMEM; + down_read(¤t->mm->mmap_sem); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); if (kvm_is_error_hva(hva)) { r = -EFAULT; - goto out; + break; } - curkey = get_guest_storage_key(current->mm, hva); - if (IS_ERR_VALUE(curkey)) { - r = curkey; - goto out; - } - keys[i] = curkey; + r = get_guest_storage_key(current->mm, hva, &keys[i]); + if (r) + break; + } + up_read(¤t->mm->mmap_sem); + + if (!r) { + r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, + sizeof(uint8_t) * args->count); + if (r) + r = -EFAULT; } - r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, - sizeof(uint8_t) * args->count); - if (r) - r = -EFAULT; -out: kvfree(keys); return r; } @@ -935,24 +1180,25 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (r) goto out; + down_read(¤t->mm->mmap_sem); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); if (kvm_is_error_hva(hva)) { r = -EFAULT; - goto out; + break; } /* Lowest order bit is reserved */ if (keys[i] & 0x01) { r = -EINVAL; - goto out; + break; } - r = set_guest_storage_key(current->mm, hva, - (unsigned long)keys[i], 0); + r = set_guest_storage_key(current->mm, hva, keys[i], 0); if (r) - goto out; + break; } + up_read(¤t->mm->mmap_sem); out: kvfree(keys); return r; @@ -1129,6 +1375,7 @@ static void sca_dispose(struct kvm *kvm) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + gfp_t alloc_flags = GFP_KERNEL; int i, rc; char debug_name[16]; static unsigned long sca_offset; @@ -1150,9 +1397,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) rc = -ENOMEM; + ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500); + kvm->arch.use_esca = 0; /* start with basic SCA */ + if (!sclp.has_64bscao) + alloc_flags |= GFP_DMA; rwlock_init(&kvm->arch.sca_lock); - kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL); + kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); if (!kvm->arch.sca) goto out_err; spin_lock(&kvm_lock); @@ -1189,6 +1440,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask, S390_ARCH_FAC_LIST_SIZE_BYTE); + set_kvm_facility(kvm->arch.model.fac_mask, 74); + set_kvm_facility(kvm->arch.model.fac_list, 74); + kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); kvm->arch.model.ibc = sclp.ibc & 0x0fff; @@ -1212,7 +1466,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) else kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE, sclp.hamax + 1); - kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1); + kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1); if (!kvm->arch.gmap) goto out_err; kvm->arch.gmap->private = kvm; @@ -1224,6 +1478,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.epoch = 0; spin_lock_init(&kvm->arch.start_stop_lock); + kvm_s390_vsie_init(kvm); KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -1245,7 +1500,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) sca_del_vcpu(vcpu); if (kvm_is_ucontrol(vcpu->kvm)) - gmap_free(vcpu->arch.gmap); + gmap_remove(vcpu->arch.gmap); if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); @@ -1278,16 +1533,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm) debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) - gmap_free(kvm->arch.gmap); + gmap_remove(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); + kvm_s390_vsie_destroy(kvm); KVM_EVENT(3, "vm 0x%pK destroyed", kvm); } /* Section: vcpu related */ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) { - vcpu->arch.gmap = gmap_alloc(current->mm, -1UL); + vcpu->arch.gmap = gmap_create(current->mm, -1UL); if (!vcpu->arch.gmap) return -ENOMEM; vcpu->arch.gmap->private = vcpu->kvm; @@ -1396,7 +1652,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) if (id < KVM_S390_BSCA_CPU_SLOTS) return true; - if (!sclp.has_esca) + if (!sclp.has_esca || !sclp.has_64bscao) return false; mutex_lock(&kvm->lock); @@ -1537,7 +1793,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); - gmap_enable(vcpu->arch.gmap); + gmap_enable(vcpu->arch.enabled_gmap); atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __start_cpu_timer_accounting(vcpu); @@ -1550,7 +1806,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __stop_cpu_timer_accounting(vcpu); atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); - gmap_disable(vcpu->arch.gmap); + vcpu->arch.enabled_gmap = gmap_get_enabled(); + gmap_disable(vcpu->arch.enabled_gmap); /* Save guest register state */ save_fpu_regs(); @@ -1599,7 +1856,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.gmap = vcpu->kvm->arch.gmap; sca_add_vcpu(vcpu); } - + if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0) + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; + /* make vcpu_load load the right gmap on the first trigger */ + vcpu->arch.enabled_gmap = vcpu->arch.gmap; } static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) @@ -1658,15 +1918,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_setup_model(vcpu); - vcpu->arch.sie_block->ecb = 0x02; + /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */ + if (MACHINE_HAS_ESOP) + vcpu->arch.sie_block->ecb |= 0x02; if (test_kvm_facility(vcpu->kvm, 9)) vcpu->arch.sie_block->ecb |= 0x04; - if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73)) + if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= 0x10; - if (test_kvm_facility(vcpu->kvm, 8)) + if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi) vcpu->arch.sie_block->ecb2 |= 0x08; - vcpu->arch.sie_block->eca = 0xC1002000U; + vcpu->arch.sie_block->eca = 0x1002000U; + if (sclp.has_cei) + vcpu->arch.sie_block->eca |= 0x80000000U; + if (sclp.has_ib) + vcpu->arch.sie_block->eca |= 0x40000000U; if (sclp.has_siif) vcpu->arch.sie_block->eca |= 1; if (sclp.has_sigpif) @@ -1716,6 +1982,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; + /* the real guest size will always be smaller than msl */ + vcpu->arch.sie_block->mso = 0; + vcpu->arch.sie_block->msl = sclp.hamax; + vcpu->arch.sie_block->icpua = id; spin_lock_init(&vcpu->arch.local_int.lock); vcpu->arch.local_int.float_int = &kvm->arch.float_int; @@ -1784,16 +2054,25 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) kvm_s390_vcpu_request(vcpu); } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address) +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) { - int i; struct kvm *kvm = gmap->private; struct kvm_vcpu *vcpu; + unsigned long prefix; + int i; + if (gmap_is_shadow(gmap)) + return; + if (start >= 1UL << 31) + /* We are only interested in prefix pages */ + return; kvm_for_each_vcpu(i, vcpu, kvm) { /* match against both prefix pages */ - if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) { - VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address); + prefix = kvm_s390_get_prefix(vcpu); + if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { + VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", + start, end); kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu); } } @@ -2002,6 +2281,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (dbg->control & ~VALID_GUESTDBG_FLAGS) return -EINVAL; + if (!sclp.has_gpere) + return -EINVAL; if (dbg->control & KVM_GUESTDBG_ENABLE) { vcpu->guest_debug = dbg->control; @@ -2070,16 +2351,16 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) return 0; /* * We use MMU_RELOAD just to re-arm the ipte notifier for the - * guest prefix page. gmap_ipte_notify will wait on the ptl lock. + * guest prefix page. gmap_mprotect_notify will wait on the ptl lock. * This ensures that the ipte instruction for this request has * already finished. We might race against a second unmapper that * wants to set the blocking bit. Lets just retry the request loop. */ if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { int rc; - rc = gmap_ipte_notify(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu), - PAGE_SIZE * 2); + rc = gmap_mprotect_notify(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu), + PAGE_SIZE * 2, PROT_WRITE); if (rc) return rc; goto retry; @@ -2108,6 +2389,11 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) goto retry; } + if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) { + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; + goto retry; + } + /* nothing to do, just clear the request */ clear_bit(KVM_REQ_UNHALT, &vcpu->requests); @@ -2362,14 +2648,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) * guest_enter and guest_exit should be no uaccess. */ local_irq_disable(); - __kvm_guest_enter(); + guest_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); local_irq_enable(); exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); local_irq_disable(); __enable_cpu_timer_accounting(vcpu); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); @@ -2598,6 +2884,8 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm) static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) { + if (!sclp.has_ibs) + return; kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu); kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu); } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 8621ab00ec8e..b8432862a817 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT; + return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) @@ -175,6 +175,12 @@ static inline int set_kvm_facility(u64 *fac_list, unsigned long nr) return 0; } +static inline int test_kvm_cpu_feat(struct kvm *kvm, unsigned long nr) +{ + WARN_ON_ONCE(nr >= KVM_S390_VM_CPU_FEAT_NR_BITS); + return test_bit_inv(nr, kvm->arch.cpu_feat); +} + /* are cpu states controlled by user space */ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) { @@ -232,6 +238,8 @@ static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen) } static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu) { + /* don't inject PER events if we re-execute the instruction */ + vcpu->arch.sie_block->icptstatus &= ~0x02; kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu)); } @@ -246,10 +254,21 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); +/* implemented in vsie.c */ +int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); +void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu); +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end); +void kvm_s390_vsie_init(struct kvm *kvm); +void kvm_s390_vsie_destroy(struct kvm *kvm); + /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); +/* implemented in sthyi.c */ +int handle_sthyi(struct kvm_vcpu *vcpu); + /* implemented in kvm-s390.c */ void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); @@ -360,6 +379,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu); void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); +int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); /* support for Basic/Extended SCA handling */ diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 95916fa7c670..46160388e996 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "gaccess.h" #include "kvm-s390.h" #include "trace.h" @@ -152,30 +153,166 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) static int __skey_check_enable(struct kvm_vcpu *vcpu) { int rc = 0; + + trace_kvm_s390_skey_related_inst(vcpu); if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE))) return rc; rc = s390_enable_skey(); - VCPU_EVENT(vcpu, 3, "%s", "enabling storage keys for guest"); - trace_kvm_s390_skey_related_inst(vcpu); - vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); + VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); + if (!rc) + vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); return rc; } - -static int handle_skey(struct kvm_vcpu *vcpu) +static int try_handle_skey(struct kvm_vcpu *vcpu) { - int rc = __skey_check_enable(vcpu); + int rc; + vcpu->stat.instruction_storage_key++; + rc = __skey_check_enable(vcpu); if (rc) return rc; - vcpu->stat.instruction_storage_key++; - + if (sclp.has_skey) { + /* with storage-key facility, SIE interprets it for us */ + kvm_s390_retry_instr(vcpu); + VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation"); + return -EAGAIN; + } if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + return 0; +} - kvm_s390_retry_instr(vcpu); - VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation"); +static int handle_iske(struct kvm_vcpu *vcpu) +{ + unsigned long addr; + unsigned char key; + int reg1, reg2; + int rc; + + rc = try_handle_skey(vcpu); + if (rc) + return rc != -EAGAIN ? rc : 0; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + addr = kvm_s390_logical_to_effective(vcpu, addr); + addr = kvm_s390_real_to_abs(vcpu, addr); + addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr)); + if (kvm_is_error_hva(addr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + down_read(¤t->mm->mmap_sem); + rc = get_guest_storage_key(current->mm, addr, &key); + up_read(¤t->mm->mmap_sem); + if (rc) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + vcpu->run->s.regs.gprs[reg1] &= ~0xff; + vcpu->run->s.regs.gprs[reg1] |= key; + return 0; +} + +static int handle_rrbe(struct kvm_vcpu *vcpu) +{ + unsigned long addr; + int reg1, reg2; + int rc; + + rc = try_handle_skey(vcpu); + if (rc) + return rc != -EAGAIN ? rc : 0; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + addr = kvm_s390_logical_to_effective(vcpu, addr); + addr = kvm_s390_real_to_abs(vcpu, addr); + addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr)); + if (kvm_is_error_hva(addr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + down_read(¤t->mm->mmap_sem); + rc = reset_guest_reference_bit(current->mm, addr); + up_read(¤t->mm->mmap_sem); + if (rc < 0) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + kvm_s390_set_psw_cc(vcpu, rc); + return 0; +} + +#define SSKE_NQ 0x8 +#define SSKE_MR 0x4 +#define SSKE_MC 0x2 +#define SSKE_MB 0x1 +static int handle_sske(struct kvm_vcpu *vcpu) +{ + unsigned char m3 = vcpu->arch.sie_block->ipb >> 28; + unsigned long start, end; + unsigned char key, oldkey; + int reg1, reg2; + int rc; + + rc = try_handle_skey(vcpu); + if (rc) + return rc != -EAGAIN ? rc : 0; + + if (!test_kvm_facility(vcpu->kvm, 8)) + m3 &= ~SSKE_MB; + if (!test_kvm_facility(vcpu->kvm, 10)) + m3 &= ~(SSKE_MC | SSKE_MR); + if (!test_kvm_facility(vcpu->kvm, 14)) + m3 &= ~SSKE_NQ; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + key = vcpu->run->s.regs.gprs[reg1] & 0xfe; + start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + start = kvm_s390_logical_to_effective(vcpu, start); + if (m3 & SSKE_MB) { + /* start already designates an absolute address */ + end = (start + (1UL << 20)) & ~((1UL << 20) - 1); + } else { + start = kvm_s390_real_to_abs(vcpu, start); + end = start + PAGE_SIZE; + } + + while (start != end) { + unsigned long addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); + + if (kvm_is_error_hva(addr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + down_read(¤t->mm->mmap_sem); + rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey, + m3 & SSKE_NQ, m3 & SSKE_MR, + m3 & SSKE_MC); + up_read(¤t->mm->mmap_sem); + if (rc < 0) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + start += PAGE_SIZE; + }; + + if (m3 & (SSKE_MC | SSKE_MR)) { + if (m3 & SSKE_MB) { + /* skey in reg1 is unpredictable */ + kvm_s390_set_psw_cc(vcpu, 3); + } else { + kvm_s390_set_psw_cc(vcpu, rc); + vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL; + vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8; + } + } + if (m3 & SSKE_MB) { + if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) + vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK; + else + vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL; + end = kvm_s390_logical_to_effective(vcpu, end); + vcpu->run->s.regs.gprs[reg2] |= end; + } return 0; } @@ -582,10 +719,11 @@ static const intercept_handler_t b2_handlers[256] = { [0x10] = handle_set_prefix, [0x11] = handle_store_prefix, [0x12] = handle_store_cpu_address, + [0x14] = kvm_s390_handle_vsie, [0x21] = handle_ipte_interlock, - [0x29] = handle_skey, - [0x2a] = handle_skey, - [0x2b] = handle_skey, + [0x29] = handle_iske, + [0x2a] = handle_rrbe, + [0x2b] = handle_sske, [0x2c] = handle_test_block, [0x30] = handle_io_inst, [0x31] = handle_io_inst, @@ -654,8 +792,10 @@ static int handle_epsw(struct kvm_vcpu *vcpu) static int handle_pfmf(struct kvm_vcpu *vcpu) { + bool mr = false, mc = false, nq; int reg1, reg2; unsigned long start, end; + unsigned char key; vcpu->stat.instruction_pfmf++; @@ -675,15 +815,27 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) !test_kvm_facility(vcpu->kvm, 14)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - /* No support for conditional-SSKE */ - if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC)) - return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + /* Only provide conditional-SSKE support if enabled for the guest */ + if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK && + test_kvm_facility(vcpu->kvm, 10)) { + mr = vcpu->run->s.regs.gprs[reg1] & PFMF_MR; + mc = vcpu->run->s.regs.gprs[reg1] & PFMF_MC; + } + nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ; + key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); + if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { + if (kvm_s390_check_low_addr_prot_real(vcpu, start)) + return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); + } + switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { case 0x00000000: + /* only 4k frames specify a real address */ + start = kvm_s390_real_to_abs(vcpu, start); end = (start + (1UL << 12)) & ~((1UL << 12) - 1); break; case 0x00001000: @@ -701,20 +853,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); } - if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { - if (kvm_s390_check_low_addr_prot_real(vcpu, start)) - return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); - } - - while (start < end) { - unsigned long useraddr, abs_addr; + while (start != end) { + unsigned long useraddr; /* Translate guest address to host address */ - if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0) - abs_addr = kvm_s390_real_to_abs(vcpu, start); - else - abs_addr = start; - useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr)); + useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); if (kvm_is_error_hva(useraddr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); @@ -728,16 +871,25 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (rc) return rc; - if (set_guest_storage_key(current->mm, useraddr, - vcpu->run->s.regs.gprs[reg1] & PFMF_KEY, - vcpu->run->s.regs.gprs[reg1] & PFMF_NQ)) + down_read(¤t->mm->mmap_sem); + rc = cond_set_guest_storage_key(current->mm, useraddr, + key, NULL, nq, mr, mc); + up_read(¤t->mm->mmap_sem); + if (rc < 0) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); } start += PAGE_SIZE; } - if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) - vcpu->run->s.regs.gprs[reg2] = end; + if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { + if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) { + vcpu->run->s.regs.gprs[reg2] = end; + } else { + vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL; + end = kvm_s390_logical_to_effective(vcpu, end); + vcpu->run->s.regs.gprs[reg2] |= end; + } + } return 0; } @@ -1033,7 +1185,15 @@ static int handle_sckpf(struct kvm_vcpu *vcpu) return 0; } +static int handle_ptff(struct kvm_vcpu *vcpu) +{ + /* we don't emulate any control instructions yet */ + kvm_s390_set_psw_cc(vcpu, 3); + return 0; +} + static const intercept_handler_t x01_handlers[256] = { + [0x04] = handle_ptff, [0x07] = handle_sckpf, }; diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 28ea0cab1f1b..1a252f537081 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT; u16 p_asn, s_asn; psw_t *psw; - u32 flags; + bool idle; - flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags); + idle = is_vcpu_idle(vcpu); psw = &dst_vcpu->arch.sie_block->gpsw; p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff; /* Primary ASN */ s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff; /* Secondary ASN */ /* Inject the emergency signal? */ - if (!(flags & CPUSTAT_STOPPED) + if (!is_vcpu_stopped(vcpu) || (psw->mask & psw_int_mask) != psw_int_mask - || ((flags & CPUSTAT_WAIT) && psw->addr != 0) - || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) { + || (idle && psw->addr != 0) + || (!idle && (asn == p_asn || asn == s_asn))) { return __inject_sigp_emergency(vcpu, dst_vcpu); } else { *reg &= 0xffffffff00000000UL; diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c new file mode 100644 index 000000000000..bd98b7d25200 --- /dev/null +++ b/arch/s390/kvm/sthyi.c @@ -0,0 +1,471 @@ +/* + * store hypervisor information instruction emulation functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + * Copyright IBM Corp. 2016 + * Author(s): Janosch Frank + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "kvm-s390.h" +#include "gaccess.h" +#include "trace.h" + +#define DED_WEIGHT 0xffff +/* + * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string + * as they are justified with spaces. + */ +#define CP 0xc3d7404040404040UL +#define IFL 0xc9c6d34040404040UL + +enum hdr_flags { + HDR_NOT_LPAR = 0x10, + HDR_STACK_INCM = 0x20, + HDR_STSI_UNAV = 0x40, + HDR_PERF_UNAV = 0x80, +}; + +enum mac_validity { + MAC_NAME_VLD = 0x20, + MAC_ID_VLD = 0x40, + MAC_CNT_VLD = 0x80, +}; + +enum par_flag { + PAR_MT_EN = 0x80, +}; + +enum par_validity { + PAR_GRP_VLD = 0x08, + PAR_ID_VLD = 0x10, + PAR_ABS_VLD = 0x20, + PAR_WGHT_VLD = 0x40, + PAR_PCNT_VLD = 0x80, +}; + +struct hdr_sctn { + u8 infhflg1; + u8 infhflg2; /* reserved */ + u8 infhval1; /* reserved */ + u8 infhval2; /* reserved */ + u8 reserved[3]; + u8 infhygct; + u16 infhtotl; + u16 infhdln; + u16 infmoff; + u16 infmlen; + u16 infpoff; + u16 infplen; + u16 infhoff1; + u16 infhlen1; + u16 infgoff1; + u16 infglen1; + u16 infhoff2; + u16 infhlen2; + u16 infgoff2; + u16 infglen2; + u16 infhoff3; + u16 infhlen3; + u16 infgoff3; + u16 infglen3; + u8 reserved2[4]; +} __packed; + +struct mac_sctn { + u8 infmflg1; /* reserved */ + u8 infmflg2; /* reserved */ + u8 infmval1; + u8 infmval2; /* reserved */ + u16 infmscps; + u16 infmdcps; + u16 infmsifl; + u16 infmdifl; + char infmname[8]; + char infmtype[4]; + char infmmanu[16]; + char infmseq[16]; + char infmpman[4]; + u8 reserved[4]; +} __packed; + +struct par_sctn { + u8 infpflg1; + u8 infpflg2; /* reserved */ + u8 infpval1; + u8 infpval2; /* reserved */ + u16 infppnum; + u16 infpscps; + u16 infpdcps; + u16 infpsifl; + u16 infpdifl; + u16 reserved; + char infppnam[8]; + u32 infpwbcp; + u32 infpabcp; + u32 infpwbif; + u32 infpabif; + char infplgnm[8]; + u32 infplgcp; + u32 infplgif; +} __packed; + +struct sthyi_sctns { + struct hdr_sctn hdr; + struct mac_sctn mac; + struct par_sctn par; +} __packed; + +struct cpu_inf { + u64 lpar_cap; + u64 lpar_grp_cap; + u64 lpar_weight; + u64 all_weight; + int cpu_num_ded; + int cpu_num_shd; +}; + +struct lpar_cpu_inf { + struct cpu_inf cp; + struct cpu_inf ifl; +}; + +static inline u64 cpu_id(u8 ctidx, void *diag224_buf) +{ + return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN)); +} + +/* + * Scales the cpu capping from the lpar range to the one expected in + * sthyi data. + * + * diag204 reports a cap in hundredths of processor units. + * z/VM's range for one core is 0 - 0x10000. + */ +static u32 scale_cap(u32 in) +{ + return (0x10000 * in) / 100; +} + +static void fill_hdr(struct sthyi_sctns *sctns) +{ + sctns->hdr.infhdln = sizeof(sctns->hdr); + sctns->hdr.infmoff = sizeof(sctns->hdr); + sctns->hdr.infmlen = sizeof(sctns->mac); + sctns->hdr.infplen = sizeof(sctns->par); + sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen; + sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen; +} + +static void fill_stsi_mac(struct sthyi_sctns *sctns, + struct sysinfo_1_1_1 *sysinfo) +{ + if (stsi(sysinfo, 1, 1, 1)) + return; + + sclp_ocf_cpc_name_copy(sctns->mac.infmname); + + memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype)); + memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu)); + memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman)); + memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq)); + + sctns->mac.infmval1 |= MAC_ID_VLD | MAC_NAME_VLD; +} + +static void fill_stsi_par(struct sthyi_sctns *sctns, + struct sysinfo_2_2_2 *sysinfo) +{ + if (stsi(sysinfo, 2, 2, 2)) + return; + + sctns->par.infppnum = sysinfo->lpar_number; + memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam)); + + sctns->par.infpval1 |= PAR_ID_VLD; +} + +static void fill_stsi(struct sthyi_sctns *sctns) +{ + void *sysinfo; + + /* Errors are handled through the validity bits in the response. */ + sysinfo = (void *)__get_free_page(GFP_KERNEL); + if (!sysinfo) + return; + + fill_stsi_mac(sctns, sysinfo); + fill_stsi_par(sctns, sysinfo); + + free_pages((unsigned long)sysinfo, 0); +} + +static void fill_diag_mac(struct sthyi_sctns *sctns, + struct diag204_x_phys_block *block, + void *diag224_buf) +{ + int i; + + for (i = 0; i < block->hdr.cpus; i++) { + switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { + case CP: + if (block->cpus[i].weight == DED_WEIGHT) + sctns->mac.infmdcps++; + else + sctns->mac.infmscps++; + break; + case IFL: + if (block->cpus[i].weight == DED_WEIGHT) + sctns->mac.infmdifl++; + else + sctns->mac.infmsifl++; + break; + } + } + sctns->mac.infmval1 |= MAC_CNT_VLD; +} + +/* Returns a pointer to the the next partition block. */ +static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, + bool this_lpar, + void *diag224_buf, + struct diag204_x_part_block *block) +{ + int i, capped = 0, weight_cp = 0, weight_ifl = 0; + struct cpu_inf *cpu_inf; + + for (i = 0; i < block->hdr.rcpus; i++) { + if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE)) + continue; + + switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { + case CP: + cpu_inf = &part_inf->cp; + if (block->cpus[i].cur_weight < DED_WEIGHT) + weight_cp |= block->cpus[i].cur_weight; + break; + case IFL: + cpu_inf = &part_inf->ifl; + if (block->cpus[i].cur_weight < DED_WEIGHT) + weight_ifl |= block->cpus[i].cur_weight; + break; + default: + continue; + } + + if (!this_lpar) + continue; + + capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED; + cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap; + cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap; + + if (block->cpus[i].weight == DED_WEIGHT) + cpu_inf->cpu_num_ded += 1; + else + cpu_inf->cpu_num_shd += 1; + } + + if (this_lpar && capped) { + part_inf->cp.lpar_weight = weight_cp; + part_inf->ifl.lpar_weight = weight_ifl; + } + part_inf->cp.all_weight += weight_cp; + part_inf->ifl.all_weight += weight_ifl; + return (struct diag204_x_part_block *)&block->cpus[i]; +} + +static void fill_diag(struct sthyi_sctns *sctns) +{ + int i, r, pages; + bool this_lpar; + void *diag204_buf; + void *diag224_buf = NULL; + struct diag204_x_info_blk_hdr *ti_hdr; + struct diag204_x_part_block *part_block; + struct diag204_x_phys_block *phys_block; + struct lpar_cpu_inf lpar_inf = {}; + + /* Errors are handled through the validity bits in the response. */ + pages = diag204((unsigned long)DIAG204_SUBC_RSI | + (unsigned long)DIAG204_INFO_EXT, 0, NULL); + if (pages <= 0) + return; + + diag204_buf = vmalloc(PAGE_SIZE * pages); + if (!diag204_buf) + return; + + r = diag204((unsigned long)DIAG204_SUBC_STIB7 | + (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf); + if (r < 0) + goto out; + + diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA); + if (!diag224_buf || diag224(diag224_buf)) + goto out; + + ti_hdr = diag204_buf; + part_block = diag204_buf + sizeof(*ti_hdr); + + for (i = 0; i < ti_hdr->npar; i++) { + /* + * For the calling lpar we also need to get the cpu + * caps and weights. The time information block header + * specifies the offset to the partition block of the + * caller lpar, so we know when we process its data. + */ + this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part; + part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf, + part_block); + } + + phys_block = (struct diag204_x_phys_block *)part_block; + part_block = diag204_buf + ti_hdr->this_part; + if (part_block->hdr.mtid) + sctns->par.infpflg1 = PAR_MT_EN; + + sctns->par.infpval1 |= PAR_GRP_VLD; + sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap); + sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap); + memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name, + sizeof(sctns->par.infplgnm)); + + sctns->par.infpscps = lpar_inf.cp.cpu_num_shd; + sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded; + sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd; + sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded; + sctns->par.infpval1 |= PAR_PCNT_VLD; + + sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap); + sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap); + sctns->par.infpval1 |= PAR_ABS_VLD; + + /* + * Everything below needs global performance data to be + * meaningful. + */ + if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) { + sctns->hdr.infhflg1 |= HDR_PERF_UNAV; + goto out; + } + + fill_diag_mac(sctns, phys_block, diag224_buf); + + if (lpar_inf.cp.lpar_weight) { + sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 * + lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight; + } + + if (lpar_inf.ifl.lpar_weight) { + sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 * + lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight; + } + sctns->par.infpval1 |= PAR_WGHT_VLD; + +out: + kfree(diag224_buf); + vfree(diag204_buf); +} + +static int sthyi(u64 vaddr) +{ + register u64 code asm("0") = 0; + register u64 addr asm("2") = vaddr; + int cc; + + asm volatile( + ".insn rre,0xB2560000,%[code],%[addr]\n" + "ipm %[cc]\n" + "srl %[cc],28\n" + : [cc] "=d" (cc) + : [code] "d" (code), [addr] "a" (addr) + : "memory", "cc"); + return cc; +} + +int handle_sthyi(struct kvm_vcpu *vcpu) +{ + int reg1, reg2, r = 0; + u64 code, addr, cc = 0; + struct sthyi_sctns *sctns = NULL; + + /* + * STHYI requires extensive locking in the higher hypervisors + * and is very computational/memory expensive. Therefore we + * ratelimit the executions per VM. + */ + if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) { + kvm_s390_retry_instr(vcpu); + return 0; + } + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + code = vcpu->run->s.regs.gprs[reg1]; + addr = vcpu->run->s.regs.gprs[reg2]; + + vcpu->stat.instruction_sthyi++; + VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr); + trace_kvm_s390_handle_sthyi(vcpu, code, addr); + + if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (code & 0xffff) { + cc = 3; + goto out; + } + + /* + * If the page has not yet been faulted in, we want to do that + * now and not after all the expensive calculations. + */ + r = write_guest(vcpu, addr, reg2, &cc, 1); + if (r) + return kvm_s390_inject_prog_cond(vcpu, r); + + sctns = (void *)get_zeroed_page(GFP_KERNEL); + if (!sctns) + return -ENOMEM; + + /* + * If we are a guest, we don't want to emulate an emulated + * instruction. We ask the hypervisor to provide the data. + */ + if (test_facility(74)) { + cc = sthyi((u64)sctns); + goto out; + } + + fill_hdr(sctns); + fill_stsi(sctns); + fill_diag(sctns); + +out: + if (!cc) { + r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); + if (r) { + free_page((unsigned long)sctns); + return kvm_s390_inject_prog_cond(vcpu, r); + } + } + + free_page((unsigned long)sctns); + vcpu->run->s.regs.gprs[reg2 + 1] = cc ? 4 : 0; + kvm_s390_set_psw_cc(vcpu, cc); + return r; +} diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h index 916834d7a73a..4fc9d4e5be89 100644 --- a/arch/s390/kvm/trace.h +++ b/arch/s390/kvm/trace.h @@ -41,7 +41,7 @@ TRACE_EVENT(kvm_s390_skey_related_inst, TP_fast_assign( VCPU_ASSIGN_COMMON ), - VCPU_TP_PRINTK("%s", "first instruction related to skeys on vcpu") + VCPU_TP_PRINTK("%s", "storage key related instruction") ); TRACE_EVENT(kvm_s390_major_guest_pfault, @@ -185,8 +185,10 @@ TRACE_EVENT(kvm_s390_intercept_prog, __entry->code = code; ), - VCPU_TP_PRINTK("intercepted program interruption %04x", - __entry->code) + VCPU_TP_PRINTK("intercepted program interruption %04x (%s)", + __entry->code, + __print_symbolic(__entry->code, + icpt_prog_codes)) ); /* @@ -412,6 +414,47 @@ TRACE_EVENT(kvm_s390_handle_stsi, __entry->addr) ); +TRACE_EVENT(kvm_s390_handle_operexc, + TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb), + TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u64, instruction) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->instruction = ((__u64)ipa << 48) | + ((__u64)ipb << 16); + ), + + VCPU_TP_PRINTK("operation exception on instruction %016llx (%s)", + __entry->instruction, + __print_symbolic(icpt_insn_decoder(__entry->instruction), + icpt_insn_codes)) + ); + +TRACE_EVENT(kvm_s390_handle_sthyi, + TP_PROTO(VCPU_PROTO_COMMON, u64 code, u64 addr), + TP_ARGS(VCPU_ARGS_COMMON, code, addr), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(u64, code) + __field(u64, addr) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->code = code; + __entry->addr = addr; + ), + + VCPU_TP_PRINTK("STHYI fc: %llu addr: %016llx", + __entry->code, __entry->addr) + ); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c new file mode 100644 index 000000000000..c106488b4137 --- /dev/null +++ b/arch/s390/kvm/vsie.c @@ -0,0 +1,1091 @@ +/* + * kvm nested virtualization support for s390x + * + * Copyright IBM Corp. 2016 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + * Author(s): David Hildenbrand + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kvm-s390.h" +#include "gaccess.h" + +struct vsie_page { + struct kvm_s390_sie_block scb_s; /* 0x0000 */ + /* the pinned originial scb */ + struct kvm_s390_sie_block *scb_o; /* 0x0200 */ + /* the shadow gmap in use by the vsie_page */ + struct gmap *gmap; /* 0x0208 */ + /* address of the last reported fault to guest2 */ + unsigned long fault_addr; /* 0x0210 */ + __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */ + struct kvm_s390_crypto_cb crycb; /* 0x0700 */ + __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ +} __packed; + +/* trigger a validity icpt for the given scb */ +static int set_validity_icpt(struct kvm_s390_sie_block *scb, + __u16 reason_code) +{ + scb->ipa = 0x1000; + scb->ipb = ((__u32) reason_code) << 16; + scb->icptcode = ICPT_VALIDITY; + return 1; +} + +/* mark the prefix as unmapped, this will block the VSIE */ +static void prefix_unmapped(struct vsie_page *vsie_page) +{ + atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20); +} + +/* mark the prefix as unmapped and wait until the VSIE has been left */ +static void prefix_unmapped_sync(struct vsie_page *vsie_page) +{ + prefix_unmapped(vsie_page); + if (vsie_page->scb_s.prog0c & PROG_IN_SIE) + atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags); + while (vsie_page->scb_s.prog0c & PROG_IN_SIE) + cpu_relax(); +} + +/* mark the prefix as mapped, this will allow the VSIE to run */ +static void prefix_mapped(struct vsie_page *vsie_page) +{ + atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20); +} + +/* test if the prefix is mapped into the gmap shadow */ +static int prefix_is_mapped(struct vsie_page *vsie_page) +{ + return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST); +} + +/* copy the updated intervention request bits into the shadow scb */ +static void update_intervention_requests(struct vsie_page *vsie_page) +{ + const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT; + int cpuflags; + + cpuflags = atomic_read(&vsie_page->scb_o->cpuflags); + atomic_andnot(bits, &vsie_page->scb_s.cpuflags); + atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags); +} + +/* shadow (filter and validate) the cpuflags */ +static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int newflags, cpuflags = atomic_read(&scb_o->cpuflags); + + /* we don't allow ESA/390 guests */ + if (!(cpuflags & CPUSTAT_ZARCH)) + return set_validity_icpt(scb_s, 0x0001U); + + if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS)) + return set_validity_icpt(scb_s, 0x0001U); + else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR)) + return set_validity_icpt(scb_s, 0x0007U); + + /* intervention requests will be set later */ + newflags = CPUSTAT_ZARCH; + if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8)) + newflags |= CPUSTAT_GED; + if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) { + if (cpuflags & CPUSTAT_GED) + return set_validity_icpt(scb_s, 0x0001U); + newflags |= CPUSTAT_GED2; + } + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE)) + newflags |= cpuflags & CPUSTAT_P; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS)) + newflags |= cpuflags & CPUSTAT_SM; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS)) + newflags |= cpuflags & CPUSTAT_IBS; + + atomic_set(&scb_s->cpuflags, newflags); + return 0; +} + +/* + * Create a shadow copy of the crycb block and setup key wrapping, if + * requested for guest 3 and enabled for guest 2. + * + * We only accept format-1 (no AP in g2), but convert it into format-2 + * There is nothing to do for format-0. + * + * Returns: - 0 if shadowed or nothing to do + * - > 0 if control has to be given to guest 2 + */ +static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U; + unsigned long *b1, *b2; + u8 ecb3_flags; + + scb_s->crycbd = 0; + if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1)) + return 0; + /* format-1 is supported with message-security-assist extension 3 */ + if (!test_kvm_facility(vcpu->kvm, 76)) + return 0; + /* we may only allow it if enabled for guest 2 */ + ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 & + (ECB3_AES | ECB3_DEA); + if (!ecb3_flags) + return 0; + + if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK)) + return set_validity_icpt(scb_s, 0x003CU); + else if (!crycb_addr) + return set_validity_icpt(scb_s, 0x0039U); + + /* copy only the wrapping keys */ + if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56)) + return set_validity_icpt(scb_s, 0x0035U); + + scb_s->ecb3 |= ecb3_flags; + scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 | + CRYCB_FORMAT2; + + /* xor both blocks in one run */ + b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask; + b2 = (unsigned long *) + vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask; + /* as 56%8 == 0, bitmap_xor won't overwrite any data */ + bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56); + return 0; +} + +/* shadow (round up/down) the ibc to avoid validity icpt */ +static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU; + + scb_s->ibc = 0; + /* ibc installed in g2 and requested for g3 */ + if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) { + scb_s->ibc = scb_o->ibc & 0x0fffU; + /* takte care of the minimum ibc level of the machine */ + if (scb_s->ibc < min_ibc) + scb_s->ibc = min_ibc; + /* take care of the maximum ibc level set for the guest */ + if (scb_s->ibc > vcpu->kvm->arch.model.ibc) + scb_s->ibc = vcpu->kvm->arch.model.ibc; + } +} + +/* unshadow the scb, copying parameters back to the real scb */ +static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + + /* interception */ + scb_o->icptcode = scb_s->icptcode; + scb_o->icptstatus = scb_s->icptstatus; + scb_o->ipa = scb_s->ipa; + scb_o->ipb = scb_s->ipb; + scb_o->gbea = scb_s->gbea; + + /* timer */ + scb_o->cputm = scb_s->cputm; + scb_o->ckc = scb_s->ckc; + scb_o->todpr = scb_s->todpr; + + /* guest state */ + scb_o->gpsw = scb_s->gpsw; + scb_o->gg14 = scb_s->gg14; + scb_o->gg15 = scb_s->gg15; + memcpy(scb_o->gcr, scb_s->gcr, 128); + scb_o->pp = scb_s->pp; + + /* interrupt intercept */ + switch (scb_s->icptcode) { + case ICPT_PROGI: + case ICPT_INSTPROGI: + case ICPT_EXTINT: + memcpy((void *)((u64)scb_o + 0xc0), + (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0); + break; + case ICPT_PARTEXEC: + /* MVPG only */ + memcpy((void *)((u64)scb_o + 0xc0), + (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0); + break; + } + + if (scb_s->ihcpu != 0xffffU) + scb_o->ihcpu = scb_s->ihcpu; +} + +/* + * Setup the shadow scb by copying and checking the relevant parts of the g2 + * provided scb. + * + * Returns: - 0 if the scb has been shadowed + * - > 0 if control has to be given to guest 2 + */ +static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + bool had_tx = scb_s->ecb & 0x10U; + unsigned long new_mso = 0; + int rc; + + /* make sure we don't have any leftovers when reusing the scb */ + scb_s->icptcode = 0; + scb_s->eca = 0; + scb_s->ecb = 0; + scb_s->ecb2 = 0; + scb_s->ecb3 = 0; + scb_s->ecd = 0; + scb_s->fac = 0; + + rc = prepare_cpuflags(vcpu, vsie_page); + if (rc) + goto out; + + /* timer */ + scb_s->cputm = scb_o->cputm; + scb_s->ckc = scb_o->ckc; + scb_s->todpr = scb_o->todpr; + scb_s->epoch = scb_o->epoch; + + /* guest state */ + scb_s->gpsw = scb_o->gpsw; + scb_s->gg14 = scb_o->gg14; + scb_s->gg15 = scb_o->gg15; + memcpy(scb_s->gcr, scb_o->gcr, 128); + scb_s->pp = scb_o->pp; + + /* interception / execution handling */ + scb_s->gbea = scb_o->gbea; + scb_s->lctl = scb_o->lctl; + scb_s->svcc = scb_o->svcc; + scb_s->ictl = scb_o->ictl; + /* + * SKEY handling functions can't deal with false setting of PTE invalid + * bits. Therefore we cannot provide interpretation and would later + * have to provide own emulation handlers. + */ + scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + scb_s->icpua = scb_o->icpua; + + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM)) + new_mso = scb_o->mso & 0xfffffffffff00000UL; + /* if the hva of the prefix changes, we have to remap the prefix */ + if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix) + prefix_unmapped(vsie_page); + /* SIE will do mso/msl validity and exception checks for us */ + scb_s->msl = scb_o->msl & 0xfffffffffff00000UL; + scb_s->mso = new_mso; + scb_s->prefix = scb_o->prefix; + + /* We have to definetly flush the tlb if this scb never ran */ + if (scb_s->ihcpu != 0xffffU) + scb_s->ihcpu = scb_o->ihcpu; + + /* MVPG and Protection Exception Interpretation are always available */ + scb_s->eca |= scb_o->eca & 0x01002000U; + /* Host-protection-interruption introduced with ESOP */ + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) + scb_s->ecb |= scb_o->ecb & 0x02U; + /* transactional execution */ + if (test_kvm_facility(vcpu->kvm, 73)) { + /* remap the prefix is tx is toggled on */ + if ((scb_o->ecb & 0x10U) && !had_tx) + prefix_unmapped(vsie_page); + scb_s->ecb |= scb_o->ecb & 0x10U; + } + /* SIMD */ + if (test_kvm_facility(vcpu->kvm, 129)) { + scb_s->eca |= scb_o->eca & 0x00020000U; + scb_s->ecd |= scb_o->ecd & 0x20000000U; + } + /* Run-time-Instrumentation */ + if (test_kvm_facility(vcpu->kvm, 64)) + scb_s->ecb3 |= scb_o->ecb3 & 0x01U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) + scb_s->eca |= scb_o->eca & 0x00000001U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) + scb_s->eca |= scb_o->eca & 0x40000000U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) + scb_s->eca |= scb_o->eca & 0x80000000U; + + prepare_ibc(vcpu, vsie_page); + rc = shadow_crycb(vcpu, vsie_page); +out: + if (rc) + unshadow_scb(vcpu, vsie_page); + return rc; +} + +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct kvm *kvm = gmap->private; + struct vsie_page *cur; + unsigned long prefix; + struct page *page; + int i; + + if (!gmap_is_shadow(gmap)) + return; + if (start >= 1UL << 31) + /* We are only interested in prefix pages */ + return; + + /* + * Only new shadow blocks are added to the list during runtime, + * therefore we can safely reference them all the time. + */ + for (i = 0; i < kvm->arch.vsie.page_count; i++) { + page = READ_ONCE(kvm->arch.vsie.pages[i]); + if (!page) + continue; + cur = page_to_virt(page); + if (READ_ONCE(cur->gmap) != gmap) + continue; + prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; + /* with mso/msl, the prefix lies at an offset */ + prefix += cur->scb_s.mso; + if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1) + prefix_unmapped_sync(cur); + } +} + +/* + * Map the first prefix page and if tx is enabled also the second prefix page. + * + * The prefix will be protected, a gmap notifier will inform about unmaps. + * The shadow scb must not be executed until the prefix is remapped, this is + * guaranteed by properly handling PROG_REQUEST. + * + * Returns: - 0 on if successfully mapped or already mapped + * - > 0 if control has to be given to guest 2 + * - -EAGAIN if the caller can retry immediately + * - -ENOMEM if out of memory + */ +static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; + int rc; + + if (prefix_is_mapped(vsie_page)) + return 0; + + /* mark it as mapped so we can catch any concurrent unmappers */ + prefix_mapped(vsie_page); + + /* with mso/msl, the prefix lies at offset *mso* */ + prefix += scb_s->mso; + + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); + if (!rc && (scb_s->ecb & 0x10U)) + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + prefix + PAGE_SIZE); + /* + * We don't have to mprotect, we will be called for all unshadows. + * SIE will detect if protection applies and trigger a validity. + */ + if (rc) + prefix_unmapped(vsie_page); + if (rc > 0 || rc == -EFAULT) + rc = set_validity_icpt(scb_s, 0x0037U); + return rc; +} + +/* + * Pin the guest page given by gpa and set hpa to the pinned host address. + * Will always be pinned writable. + * + * Returns: - 0 on success + * - -EINVAL if the gpa is not valid guest storage + * - -ENOMEM if out of memory + */ +static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) +{ + struct page *page; + hva_t hva; + int rc; + + hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) + return -EINVAL; + rc = get_user_pages_fast(hva, 1, 1, &page); + if (rc < 0) + return rc; + else if (rc != 1) + return -ENOMEM; + *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); + return 0; +} + +/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */ +static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) +{ + struct page *page; + + page = virt_to_page(hpa); + set_page_dirty_lock(page); + put_page(page); + /* mark the page always as dirty for migration */ + mark_page_dirty(kvm, gpa_to_gfn(gpa)); +} + +/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */ +static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + hpa_t hpa; + gpa_t gpa; + + hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol; + if (hpa) { + gpa = scb_o->scaol & ~0xfUL; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) + gpa |= (u64) scb_o->scaoh << 32; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->scaol = 0; + scb_s->scaoh = 0; + } + + hpa = scb_s->itdba; + if (hpa) { + gpa = scb_o->itdba & ~0xffUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->itdba = 0; + } + + hpa = scb_s->gvrd; + if (hpa) { + gpa = scb_o->gvrd & ~0x1ffUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->gvrd = 0; + } + + hpa = scb_s->riccbd; + if (hpa) { + gpa = scb_o->riccbd & ~0x3fUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->riccbd = 0; + } +} + +/* + * Instead of shadowing some blocks, we can simply forward them because the + * addresses in the scb are 64 bit long. + * + * This works as long as the data lies in one page. If blocks ever exceed one + * page, we have to fall back to shadowing. + * + * As we reuse the sca, the vcpu pointers contained in it are invalid. We must + * therefore not enable any facilities that access these pointers (e.g. SIGPIF). + * + * Returns: - 0 if all blocks were pinned. + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + hpa_t hpa; + gpa_t gpa; + int rc = 0; + + gpa = scb_o->scaol & ~0xfUL; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) + gpa |= (u64) scb_o->scaoh << 32; + if (gpa) { + if (!(gpa & ~0x1fffUL)) + rc = set_validity_icpt(scb_s, 0x0038U); + else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu)) + rc = set_validity_icpt(scb_s, 0x0011U); + else if ((gpa & PAGE_MASK) != + ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK)) + rc = set_validity_icpt(scb_s, 0x003bU); + if (!rc) { + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0034U); + } + if (rc) + goto unpin; + scb_s->scaoh = (u32)((u64)hpa >> 32); + scb_s->scaol = (u32)(u64)hpa; + } + + gpa = scb_o->itdba & ~0xffUL; + if (gpa && (scb_s->ecb & 0x10U)) { + if (!(gpa & ~0x1fffU)) { + rc = set_validity_icpt(scb_s, 0x0080U); + goto unpin; + } + /* 256 bytes cannot cross page boundaries */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0080U); + if (rc) + goto unpin; + scb_s->itdba = hpa; + } + + gpa = scb_o->gvrd & ~0x1ffUL; + if (gpa && (scb_s->eca & 0x00020000U) && + !(scb_s->ecd & 0x20000000U)) { + if (!(gpa & ~0x1fffUL)) { + rc = set_validity_icpt(scb_s, 0x1310U); + goto unpin; + } + /* + * 512 bytes vector registers cannot cross page boundaries + * if this block gets bigger, we have to shadow it. + */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x1310U); + if (rc) + goto unpin; + scb_s->gvrd = hpa; + } + + gpa = scb_o->riccbd & ~0x3fUL; + if (gpa && (scb_s->ecb3 & 0x01U)) { + if (!(gpa & ~0x1fffUL)) { + rc = set_validity_icpt(scb_s, 0x0043U); + goto unpin; + } + /* 64 bytes cannot cross page boundaries */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0043U); + /* Validity 0x0044 will be checked by SIE */ + if (rc) + goto unpin; + scb_s->gvrd = hpa; + } + return 0; +unpin: + unpin_blocks(vcpu, vsie_page); + return rc; +} + +/* unpin the scb provided by guest 2, marking it as dirty */ +static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, + gpa_t gpa) +{ + hpa_t hpa = (hpa_t) vsie_page->scb_o; + + if (hpa) + unpin_guest_page(vcpu->kvm, gpa, hpa); + vsie_page->scb_o = NULL; +} + +/* + * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o. + * + * Returns: - 0 if the scb was pinned. + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, + gpa_t gpa) +{ + hpa_t hpa; + int rc; + + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) { + rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (!rc) + rc = 1; + } + if (!rc) + vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + return rc; +} + +/* + * Inject a fault into guest 2. + * + * Returns: - > 0 if control has to be given to guest 2 + * < 0 if an error occurred during injection. + */ +static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, + bool write_flag) +{ + struct kvm_s390_pgm_info pgm = { + .code = code, + .trans_exc_code = + /* 0-51: virtual address */ + (vaddr & 0xfffffffffffff000UL) | + /* 52-53: store / fetch */ + (((unsigned int) !write_flag) + 1) << 10, + /* 62-63: asce id (alway primary == 0) */ + .exc_access_id = 0, /* always primary */ + .op_access_id = 0, /* not MVPG */ + }; + int rc; + + if (code == PGM_PROTECTION) + pgm.trans_exc_code |= 0x4UL; + + rc = kvm_s390_inject_prog_irq(vcpu, &pgm); + return rc ? rc : 1; +} + +/* + * Handle a fault during vsie execution on a gmap shadow. + * + * Returns: - 0 if the fault was resolved + * - > 0 if control has to be given to guest 2 + * - < 0 if an error occurred + */ +static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + int rc; + + if (current->thread.gmap_int_code == PGM_PROTECTION) + /* we can directly forward all protection exceptions */ + return inject_fault(vcpu, PGM_PROTECTION, + current->thread.gmap_addr, 1); + + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + current->thread.gmap_addr); + if (rc > 0) { + rc = inject_fault(vcpu, rc, + current->thread.gmap_addr, + current->thread.gmap_write_flag); + if (rc >= 0) + vsie_page->fault_addr = current->thread.gmap_addr; + } + return rc; +} + +/* + * Retry the previous fault that required guest 2 intervention. This avoids + * one superfluous SIE re-entry and direct exit. + * + * Will ignore any errors. The next SIE fault will do proper fault handling. + */ +static void handle_last_fault(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + if (vsie_page->fault_addr) + kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + vsie_page->fault_addr); + vsie_page->fault_addr = 0; +} + +static inline void clear_vsie_icpt(struct vsie_page *vsie_page) +{ + vsie_page->scb_s.icptcode = 0; +} + +/* rewind the psw and clear the vsie icpt, so we can retry execution */ +static void retry_vsie_icpt(struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int ilen = insn_length(scb_s->ipa >> 8); + + /* take care of EXECUTE instructions */ + if (scb_s->icptstatus & 1) { + ilen = (scb_s->icptstatus >> 4) & 0x6; + if (!ilen) + ilen = 4; + } + scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen); + clear_vsie_icpt(vsie_page); +} + +/* + * Try to shadow + enable the guest 2 provided facility list. + * Retry instruction execution if enabled for and provided by guest 2. + * + * Returns: - 0 if handled (retry or guest 2 icpt) + * - > 0 if control has to be given to guest 2 + */ +static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U; + + if (fac && test_kvm_facility(vcpu->kvm, 7)) { + retry_vsie_icpt(vsie_page); + if (read_guest_real(vcpu, fac, &vsie_page->fac, + sizeof(vsie_page->fac))) + return set_validity_icpt(scb_s, 0x1090U); + scb_s->fac = (__u32)(__u64) &vsie_page->fac; + } + return 0; +} + +/* + * Run the vsie on a shadow scb and a shadow gmap, without any further + * sanity checks, handling SIE faults. + * + * Returns: - 0 everything went fine + * - > 0 if control has to be given to guest 2 + * - < 0 if an error occurred + */ +static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int rc; + + handle_last_fault(vcpu, vsie_page); + + if (need_resched()) + schedule(); + if (test_cpu_flag(CIF_MCCK_PENDING)) + s390_handle_mcck(); + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + local_irq_disable(); + guest_enter_irqoff(); + local_irq_enable(); + + rc = sie64a(scb_s, vcpu->run->s.regs.gprs); + + local_irq_disable(); + guest_exit_irqoff(); + local_irq_enable(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + if (rc > 0) + rc = 0; /* we could still have an icpt */ + else if (rc == -EFAULT) + return handle_fault(vcpu, vsie_page); + + switch (scb_s->icptcode) { + case ICPT_INST: + if (scb_s->ipa == 0xb2b0) + rc = handle_stfle(vcpu, vsie_page); + break; + case ICPT_STOP: + /* stop not requested by g2 - must have been a kick */ + if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT)) + clear_vsie_icpt(vsie_page); + break; + case ICPT_VALIDITY: + if ((scb_s->ipa & 0xf000) != 0xf000) + scb_s->ipa += 0x1000; + break; + } + return rc; +} + +static void release_gmap_shadow(struct vsie_page *vsie_page) +{ + if (vsie_page->gmap) + gmap_put(vsie_page->gmap); + WRITE_ONCE(vsie_page->gmap, NULL); + prefix_unmapped(vsie_page); +} + +static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + unsigned long asce; + union ctlreg0 cr0; + struct gmap *gmap; + int edat; + + asce = vcpu->arch.sie_block->gcr[1]; + cr0.val = vcpu->arch.sie_block->gcr[0]; + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); + edat += edat && test_kvm_facility(vcpu->kvm, 78); + + /* + * ASCE or EDAT could have changed since last icpt, or the gmap + * we're holding has been unshadowed. If the gmap is still valid, + * we can safely reuse it. + */ + if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) + return 0; + + /* release the old shadow - if any, and mark the prefix as unmapped */ + release_gmap_shadow(vsie_page); + gmap = gmap_shadow(vcpu->arch.gmap, asce, edat); + if (IS_ERR(gmap)) + return PTR_ERR(gmap); + gmap->private = vcpu->kvm; + WRITE_ONCE(vsie_page->gmap, gmap); + return 0; +} + +/* + * Register the shadow scb at the VCPU, e.g. for kicking out of vsie. + */ +static void register_shadow_scb(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + + WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s); + /* + * External calls have to lead to a kick of the vcpu and + * therefore the vsie -> Simulate Wait state. + */ + atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + /* + * We have to adjust the g3 epoch by the g2 epoch. The epoch will + * automatically be adjusted on tod clock changes via kvm_sync_clock. + */ + preempt_disable(); + scb_s->epoch += vcpu->kvm->arch.epoch; + preempt_enable(); +} + +/* + * Unregister a shadow scb from a VCPU. + */ +static void unregister_shadow_scb(struct kvm_vcpu *vcpu) +{ + atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + WRITE_ONCE(vcpu->arch.vsie_block, NULL); +} + +/* + * Run the vsie on a shadowed scb, managing the gmap shadow, handling + * prefix pages and faults. + * + * Returns: - 0 if no errors occurred + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int rc = 0; + + while (1) { + rc = acquire_gmap_shadow(vcpu, vsie_page); + if (!rc) + rc = map_prefix(vcpu, vsie_page); + if (!rc) { + gmap_enable(vsie_page->gmap); + update_intervention_requests(vsie_page); + rc = do_vsie_run(vcpu, vsie_page); + gmap_enable(vcpu->arch.gmap); + } + atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20); + + if (rc == -EAGAIN) + rc = 0; + if (rc || scb_s->icptcode || signal_pending(current) || + kvm_s390_vcpu_has_irq(vcpu, 0)) + break; + }; + + if (rc == -EFAULT) { + /* + * Addressing exceptions are always presentes as intercepts. + * As addressing exceptions are suppressing and our guest 3 PSW + * points at the responsible instruction, we have to + * forward the PSW and set the ilc. If we can't read guest 3 + * instruction, we can use an arbitrary ilc. Let's always use + * ilen = 4 for now, so we can avoid reading in guest 3 virtual + * memory. (we could also fake the shadow so the hardware + * handles it). + */ + scb_s->icptcode = ICPT_PROGI; + scb_s->iprcc = PGM_ADDRESSING; + scb_s->pgmilc = 4; + scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4); + } + return rc; +} + +/* + * Get or create a vsie page for a scb address. + * + * Returns: - address of a vsie page (cached or new one) + * - NULL if the same scb address is already used by another VCPU + * - ERR_PTR(-ENOMEM) if out of memory + */ +static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) +{ + struct vsie_page *vsie_page; + struct page *page; + int nr_vcpus; + + rcu_read_lock(); + page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); + rcu_read_unlock(); + if (page) { + if (page_ref_inc_return(page) == 2) + return page_to_virt(page); + page_ref_dec(page); + } + + /* + * We want at least #online_vcpus shadows, so every VCPU can execute + * the VSIE in parallel. + */ + nr_vcpus = atomic_read(&kvm->online_vcpus); + + mutex_lock(&kvm->arch.vsie.mutex); + if (kvm->arch.vsie.page_count < nr_vcpus) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA); + if (!page) { + mutex_unlock(&kvm->arch.vsie.mutex); + return ERR_PTR(-ENOMEM); + } + page_ref_inc(page); + kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page; + kvm->arch.vsie.page_count++; + } else { + /* reuse an existing entry that belongs to nobody */ + while (true) { + page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; + if (page_ref_inc_return(page) == 2) + break; + page_ref_dec(page); + kvm->arch.vsie.next++; + kvm->arch.vsie.next %= nr_vcpus; + } + radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + } + page->index = addr; + /* double use of the same address */ + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { + page_ref_dec(page); + mutex_unlock(&kvm->arch.vsie.mutex); + return NULL; + } + mutex_unlock(&kvm->arch.vsie.mutex); + + vsie_page = page_to_virt(page); + memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); + release_gmap_shadow(vsie_page); + vsie_page->fault_addr = 0; + vsie_page->scb_s.ihcpu = 0xffffU; + return vsie_page; +} + +/* put a vsie page acquired via get_vsie_page */ +static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page) +{ + struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT); + + page_ref_dec(page); +} + +int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) +{ + struct vsie_page *vsie_page; + unsigned long scb_addr; + int rc; + + vcpu->stat.instruction_sie++; + if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2)) + return -EOPNOTSUPP; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + BUILD_BUG_ON(sizeof(struct vsie_page) != 4096); + scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL); + + /* 512 byte alignment */ + if (unlikely(scb_addr & 0x1ffUL)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0)) + return 0; + + vsie_page = get_vsie_page(vcpu->kvm, scb_addr); + if (IS_ERR(vsie_page)) + return PTR_ERR(vsie_page); + else if (!vsie_page) + /* double use of sie control block - simply do nothing */ + return 0; + + rc = pin_scb(vcpu, vsie_page, scb_addr); + if (rc) + goto out_put; + rc = shadow_scb(vcpu, vsie_page); + if (rc) + goto out_unpin_scb; + rc = pin_blocks(vcpu, vsie_page); + if (rc) + goto out_unshadow; + register_shadow_scb(vcpu, vsie_page); + rc = vsie_run(vcpu, vsie_page); + unregister_shadow_scb(vcpu); + unpin_blocks(vcpu, vsie_page); +out_unshadow: + unshadow_scb(vcpu, vsie_page); +out_unpin_scb: + unpin_scb(vcpu, vsie_page, scb_addr); +out_put: + put_vsie_page(vcpu->kvm, vsie_page); + + return rc < 0 ? rc : 0; +} + +/* Init the vsie data structures. To be called when a vm is initialized. */ +void kvm_s390_vsie_init(struct kvm *kvm) +{ + mutex_init(&kvm->arch.vsie.mutex); + INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL); +} + +/* Destroy the vsie data structures. To be called when a vm is destroyed. */ +void kvm_s390_vsie_destroy(struct kvm *kvm) +{ + struct vsie_page *vsie_page; + struct page *page; + int i; + + mutex_lock(&kvm->arch.vsie.mutex); + for (i = 0; i < kvm->arch.vsie.page_count; i++) { + page = kvm->arch.vsie.pages[i]; + kvm->arch.vsie.pages[i] = NULL; + vsie_page = page_to_virt(page); + release_gmap_shadow(vsie_page); + /* free the radix tree entry */ + radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + __free_page(page); + } + kvm->arch.vsie.page_count = 0; + mutex_unlock(&kvm->arch.vsie.mutex); +} + +void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block); + + /* + * Even if the VCPU lets go of the shadow sie block reference, it is + * still valid in the cache. So we can safely kick it. + */ + if (scb) { + atomic_or(PROG_BLOCK_SIE, &scb->prog20); + if (scb->prog0c & PROG_IN_SIE) + atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags); + } +} diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 25783dc3c813..a58bca62a93b 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -418,6 +418,8 @@ static inline int do_exception(struct pt_regs *regs, int access) (struct gmap *) S390_lowcore.gmap : NULL; if (gmap) { current->thread.gmap_addr = address; + current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); + current->thread.gmap_int_code = regs->int_code & 0xffff; address = __gmap_translate(gmap, address); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 063c721ec0dc..2ce6bb3bab32 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -20,14 +20,16 @@ #include #include +#define GMAP_SHADOW_FAKE_TABLE 1ULL + /** - * gmap_alloc - allocate a guest address space + * gmap_alloc - allocate and initialize a guest address space * @mm: pointer to the parent mm_struct * @limit: maximum address of the gmap address space * * Returns a guest address space structure. */ -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) +static struct gmap *gmap_alloc(unsigned long limit) { struct gmap *gmap; struct page *page; @@ -55,10 +57,14 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) if (!gmap) goto out; INIT_LIST_HEAD(&gmap->crst_list); + INIT_LIST_HEAD(&gmap->children); + INIT_LIST_HEAD(&gmap->pt_list); INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC); spin_lock_init(&gmap->guest_table_lock); - gmap->mm = mm; + spin_lock_init(&gmap->shadow_lock); + atomic_set(&gmap->ref_count, 1); page = alloc_pages(GFP_KERNEL, 2); if (!page) goto out_free; @@ -70,9 +76,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) gmap->asce = atype | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | __pa(table); gmap->asce_end = limit; - down_write(&mm->mmap_sem); - list_add(&gmap->list, &mm->context.gmap_list); - up_write(&mm->mmap_sem); return gmap; out_free: @@ -80,7 +83,28 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) out: return NULL; } -EXPORT_SYMBOL_GPL(gmap_alloc); + +/** + * gmap_create - create a guest address space + * @mm: pointer to the parent mm_struct + * @limit: maximum size of the gmap address space + * + * Returns a guest address space structure. + */ +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) +{ + struct gmap *gmap; + + gmap = gmap_alloc(limit); + if (!gmap) + return NULL; + gmap->mm = mm; + spin_lock(&mm->context.gmap_lock); + list_add_rcu(&gmap->list, &mm->context.gmap_list); + spin_unlock(&mm->context.gmap_lock); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_create); static void gmap_flush_tlb(struct gmap *gmap) { @@ -114,31 +138,117 @@ static void gmap_radix_tree_free(struct radix_tree_root *root) } while (nr > 0); } +static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) +{ + struct gmap_rmap *rmap, *rnext, *head; + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + head = radix_tree_delete(root, index); + gmap_for_each_rmap_safe(rmap, rnext, head) + kfree(rmap); + } + } while (nr > 0); +} + /** * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure + * + * No locks required. There are no references to this gmap anymore. */ -void gmap_free(struct gmap *gmap) +static void gmap_free(struct gmap *gmap) { struct page *page, *next; - /* Flush tlb. */ - if (MACHINE_HAS_IDTE) - __tlb_flush_idte(gmap->asce); - else - __tlb_flush_global(); - + /* Flush tlb of all gmaps (if not already done for shadows) */ + if (!(gmap_is_shadow(gmap) && gmap->removed)) + gmap_flush_tlb(gmap); /* Free all segment & region tables. */ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) __free_pages(page, 2); gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); - down_write(&gmap->mm->mmap_sem); - list_del(&gmap->list); - up_write(&gmap->mm->mmap_sem); + + /* Free additional data for a shadow gmap */ + if (gmap_is_shadow(gmap)) { + /* Free all page tables. */ + list_for_each_entry_safe(page, next, &gmap->pt_list, lru) + page_table_free_pgste(page); + gmap_rmap_radix_tree_free(&gmap->host_to_rmap); + /* Release reference to the parent */ + gmap_put(gmap->parent); + } + kfree(gmap); } -EXPORT_SYMBOL_GPL(gmap_free); + +/** + * gmap_get - increase reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * Returns the gmap pointer + */ +struct gmap *gmap_get(struct gmap *gmap) +{ + atomic_inc(&gmap->ref_count); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_get); + +/** + * gmap_put - decrease reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * If the reference counter reaches zero the guest address space is freed. + */ +void gmap_put(struct gmap *gmap) +{ + if (atomic_dec_return(&gmap->ref_count) == 0) + gmap_free(gmap); +} +EXPORT_SYMBOL_GPL(gmap_put); + +/** + * gmap_remove - remove a guest address space but do not free it yet + * @gmap: pointer to the guest address space structure + */ +void gmap_remove(struct gmap *gmap) +{ + struct gmap *sg, *next; + + /* Remove all shadow gmaps linked to this gmap */ + if (!list_empty(&gmap->children)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, &gmap->children, list) { + list_del(&sg->list); + gmap_put(sg); + } + spin_unlock(&gmap->shadow_lock); + } + /* Remove gmap from the pre-mm list */ + spin_lock(&gmap->mm->context.gmap_lock); + list_del_rcu(&gmap->list); + spin_unlock(&gmap->mm->context.gmap_lock); + synchronize_rcu(); + /* Put reference */ + gmap_put(gmap); +} +EXPORT_SYMBOL_GPL(gmap_remove); /** * gmap_enable - switch primary space to the guest address space @@ -160,6 +270,17 @@ void gmap_disable(struct gmap *gmap) } EXPORT_SYMBOL_GPL(gmap_disable); +/** + * gmap_get_enabled - get a pointer to the currently enabled gmap + * + * Returns a pointer to the currently enabled gmap. 0 if none is enabled. + */ +struct gmap *gmap_get_enabled(void) +{ + return (struct gmap *) S390_lowcore.gmap; +} +EXPORT_SYMBOL_GPL(gmap_get_enabled); + /* * gmap_alloc_table is assumed to be called with mmap_sem held */ @@ -175,7 +296,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, return -ENOMEM; new = (unsigned long *) page_to_phys(page); crst_table_init(new, init); - spin_lock(&gmap->mm->page_table_lock); + spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { list_add(&page->lru, &gmap->crst_list); *table = (unsigned long) new | _REGION_ENTRY_LENGTH | @@ -183,7 +304,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, page->index = gaddr; page = NULL; } - spin_unlock(&gmap->mm->page_table_lock); + spin_unlock(&gmap->guest_table_lock); if (page) __free_pages(page, 2); return 0; @@ -219,6 +340,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) unsigned long *entry; int flush = 0; + BUG_ON(gmap_is_shadow(gmap)); spin_lock(&gmap->guest_table_lock); entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); if (entry) { @@ -258,6 +380,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) unsigned long off; int flush; + BUG_ON(gmap_is_shadow(gmap)); if ((to | len) & (PMD_SIZE - 1)) return -EINVAL; if (len == 0 || to + len < to) @@ -289,6 +412,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long off; int flush; + BUG_ON(gmap_is_shadow(gmap)); if ((from | to | len) & (PMD_SIZE - 1)) return -EINVAL; if (len == 0 || from + len < from || to + len < to || @@ -326,6 +450,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment); * This function does not establish potentially missing page table entries. * The mmap_sem of the mm that belongs to the address space must be held * when this function gets called. + * + * Note: Can also be called for shadow gmaps. */ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) { @@ -333,6 +459,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); + /* Note: guest_to_host is empty for a shadow gmap */ return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; } EXPORT_SYMBOL_GPL(__gmap_translate); @@ -369,11 +496,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table, struct gmap *gmap; int flush; - list_for_each_entry(gmap, &mm->context.gmap_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); if (flush) gmap_flush_tlb(gmap); } + rcu_read_unlock(); } /** @@ -397,6 +526,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pmd_t *pmd; int rc; + BUG_ON(gmap_is_shadow(gmap)); /* Create higher level tables in the gmap page table */ table = gmap->table; if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { @@ -552,116 +682,1412 @@ static LIST_HEAD(gmap_notifier_list); static DEFINE_SPINLOCK(gmap_notifier_lock); /** - * gmap_register_ipte_notifier - register a pte invalidation callback + * gmap_register_pte_notifier - register a pte invalidation callback * @nb: pointer to the gmap notifier block */ -void gmap_register_ipte_notifier(struct gmap_notifier *nb) +void gmap_register_pte_notifier(struct gmap_notifier *nb) { spin_lock(&gmap_notifier_lock); - list_add(&nb->list, &gmap_notifier_list); + list_add_rcu(&nb->list, &gmap_notifier_list); spin_unlock(&gmap_notifier_lock); } -EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); +EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); /** - * gmap_unregister_ipte_notifier - remove a pte invalidation callback + * gmap_unregister_pte_notifier - remove a pte invalidation callback * @nb: pointer to the gmap notifier block */ -void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) +void gmap_unregister_pte_notifier(struct gmap_notifier *nb) { spin_lock(&gmap_notifier_lock); - list_del_init(&nb->list); + list_del_rcu(&nb->list); spin_unlock(&gmap_notifier_lock); + synchronize_rcu(); } -EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); +EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); /** - * gmap_ipte_notify - mark a range of ptes for invalidation notification + * gmap_call_notifier - call all registered invalidation callbacks + * @gmap: pointer to guest mapping meta data structure + * @start: start virtual address in the guest address space + * @end: end virtual address in the guest address space + */ +static void gmap_call_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct gmap_notifier *nb; + + list_for_each_entry(nb, &gmap_notifier_list, list) + nb->notifier_call(gmap, start, end); +} + +/** + * gmap_table_walk - walk the gmap page tables + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @level: page table level to stop at + * + * Returns a table entry pointer for the given guest address and @level + * @level=0 : returns a pointer to a page table table entry (or NULL) + * @level=1 : returns a pointer to a segment table entry (or NULL) + * @level=2 : returns a pointer to a region-3 table entry (or NULL) + * @level=3 : returns a pointer to a region-2 table entry (or NULL) + * @level=4 : returns a pointer to a region-1 table entry (or NULL) + * + * Returns NULL if the gmap page tables could not be walked to the + * requested level. + * + * Note: Can also be called for shadow gmaps. + */ +static inline unsigned long *gmap_table_walk(struct gmap *gmap, + unsigned long gaddr, int level) +{ + unsigned long *table; + + if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4)) + return NULL; + if (gmap_is_shadow(gmap) && gmap->removed) + return NULL; + if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11))) + return NULL; + table = gmap->table; + switch (gmap->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + table += (gaddr >> 53) & 0x7ff; + if (level == 4) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_REGION2: + table += (gaddr >> 42) & 0x7ff; + if (level == 3) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_REGION3: + table += (gaddr >> 31) & 0x7ff; + if (level == 2) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_SEGMENT: + table += (gaddr >> 20) & 0x7ff; + if (level == 1) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr >> 12) & 0xff; + } + return table; +} + +/** + * gmap_pte_op_walk - walk the gmap page table, get the page table lock + * and return the pte pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @ptl: pointer to the spinlock pointer + * + * Returns a pointer to the locked pte for a guest address, or NULL + * + * Note: Can also be called for shadow gmaps. + */ +static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, + spinlock_t **ptl) +{ + unsigned long *table; + + if (gmap_is_shadow(gmap)) + spin_lock(&gmap->guest_table_lock); + /* Walk the gmap page table, lock and get pte pointer */ + table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ + if (!table || *table & _SEGMENT_ENTRY_INVALID) { + if (gmap_is_shadow(gmap)) + spin_unlock(&gmap->guest_table_lock); + return NULL; + } + if (gmap_is_shadow(gmap)) { + *ptl = &gmap->guest_table_lock; + return pte_offset_map((pmd_t *) table, gaddr); + } + return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); +} + +/** + * gmap_pte_op_fixup - force a page in and connect the gmap page table + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @vmaddr: address in the host process address space + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if the caller can retry __gmap_translate (might fail again), + * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing + * up or connecting the gmap page table. + */ +static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, + unsigned long vmaddr, int prot) +{ + struct mm_struct *mm = gmap->mm; + unsigned int fault_flags; + bool unlocked = false; + + BUG_ON(gmap_is_shadow(gmap)); + fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; + if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) + return -EFAULT; + if (unlocked) + /* lost mmap_sem, caller has to retry __gmap_translate */ + return 0; + /* Connect the page tables */ + return __gmap_link(gmap, gaddr, vmaddr); +} + +/** + * gmap_pte_op_end - release the page table lock + * @ptl: pointer to the spinlock pointer + */ +static void gmap_pte_op_end(spinlock_t *ptl) +{ + spin_unlock(ptl); +} + +/* + * gmap_protect_range - remove access rights to memory and set pgste bits * @gmap: pointer to guest mapping meta data structure * @gaddr: virtual address in the guest address space * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set * - * Returns 0 if for each page in the given range a gmap mapping exists and - * the invalidation notification could be set. If the gmap mapping is missing - * for one or more pages -EFAULT is returned. If no memory could be allocated - * -ENOMEM is returned. This function establishes missing page table entries. + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EFAULT if gaddr is invalid (or mapping for shadows is missing). + * + * Called with sg->mm->mmap_sem in read. + * + * Note: Can also be called for shadow gmaps. */ -int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) +static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot, unsigned long bits) { - unsigned long addr; + unsigned long vmaddr; spinlock_t *ptl; pte_t *ptep; - bool unlocked; - int rc = 0; + int rc; - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) + while (len) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); + gmap_pte_op_end(ptl); + } + if (rc) { + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); + if (rc) + return rc; + continue; + } + gaddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} + +/** + * gmap_mprotect_notify - change access rights for a range of ptes and + * call the notifier if any pte changes again + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if for each page in the given range a gmap mapping exists, + * the new access rights could be set and the notifier could be armed. + * If the gmap mapping is missing for one or more pages -EFAULT is + * returned. If no memory could be allocated -ENOMEM is returned. + * This function establishes missing page table entries. + */ +int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot) +{ + int rc; + + if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) + return -EINVAL; + if (!MACHINE_HAS_ESOP && prot == PROT_READ) return -EINVAL; down_read(&gmap->mm->mmap_sem); - while (len) { - unlocked = false; - /* Convert gmap address and connect the page tables */ - addr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(addr)) { - rc = addr; - break; - } - /* Get the page mapped */ - if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE, - &unlocked)) { - rc = -EFAULT; - break; - } - /* While trying to map mmap_sem got unlocked. Let us retry */ - if (unlocked) - continue; - rc = __gmap_link(gmap, gaddr, addr); - if (rc) - break; - /* Walk the process page table, lock and get pte pointer */ - ptep = get_locked_pte(gmap->mm, addr, &ptl); - VM_BUG_ON(!ptep); - /* Set notification bit in the pgste of the pte */ - if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) { - ptep_set_notify(gmap->mm, addr, ptep); - gaddr += PAGE_SIZE; - len -= PAGE_SIZE; - } - pte_unmap_unlock(ptep, ptl); - } + rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT); up_read(&gmap->mm->mmap_sem); return rc; } -EXPORT_SYMBOL_GPL(gmap_ipte_notify); +EXPORT_SYMBOL_GPL(gmap_mprotect_notify); + +/** + * gmap_read_table - get an unsigned long value from a guest page table using + * absolute addressing, without marking the page referenced. + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @val: pointer to the unsigned long value to return + * + * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT + * if reading using the virtual address failed. + * + * Called with gmap->mm->mmap_sem in read. + */ +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) +{ + unsigned long address, vmaddr; + spinlock_t *ptl; + pte_t *ptep, pte; + int rc; + + while (1) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + pte = *ptep; + if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { + address = pte_val(pte) & PAGE_MASK; + address += gaddr & ~PAGE_MASK; + *val = *(unsigned long *) address; + pte_val(*ptep) |= _PAGE_YOUNG; + /* Do *NOT* clear the _PAGE_INVALID bit! */ + rc = 0; + } + gmap_pte_op_end(ptl); + } + if (!rc) + break; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); + if (rc) + break; + } + return rc; +} +EXPORT_SYMBOL_GPL(gmap_read_table); + +/** + * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree + * @sg: pointer to the shadow guest address space structure + * @vmaddr: vm address associated with the rmap + * @rmap: pointer to the rmap structure + * + * Called with the sg->guest_table_lock + */ +static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, + struct gmap_rmap *rmap) +{ + void **slot; + + BUG_ON(!gmap_is_shadow(sg)); + slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, + &sg->guest_table_lock); + radix_tree_replace_slot(slot, rmap); + } else { + rmap->next = NULL; + radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, + rmap); + } +} + +/** + * gmap_protect_rmap - modify access rights to memory and create an rmap + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow gmap + * @paddr: address in the parent guest address space + * @len: length of the memory area to protect + * @prot: indicates access rights: none, read-only or read-write + * + * Returns 0 if successfully protected and the rmap was created, -ENOMEM + * if out of memory and -EFAULT if paddr is invalid. + */ +static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, + unsigned long paddr, unsigned long len, int prot) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + while (len) { + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + if (!rmap) + return -ENOMEM; + rmap->raddr = raddr; + rc = radix_tree_preload(GFP_KERNEL); + if (rc) { + kfree(rmap); + return rc; + } + rc = -EAGAIN; + ptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (ptep) { + spin_lock(&sg->guest_table_lock); + rc = ptep_force_prot(parent->mm, paddr, ptep, prot, + PGSTE_VSIE_BIT); + if (!rc) + gmap_insert_rmap(sg, vmaddr, rmap); + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptl); + } + radix_tree_preload_end(); + if (rc) { + kfree(rmap); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + if (rc) + return rc; + continue; + } + paddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} + +#define _SHADOW_RMAP_MASK 0x7 +#define _SHADOW_RMAP_REGION1 0x5 +#define _SHADOW_RMAP_REGION2 0x4 +#define _SHADOW_RMAP_REGION3 0x3 +#define _SHADOW_RMAP_SEGMENT 0x2 +#define _SHADOW_RMAP_PGTABLE 0x1 + +/** + * gmap_idte_one - invalidate a single region or segment table entry + * @asce: region or segment table *origin* + table-type bits + * @vaddr: virtual address to identify the table entry to flush + * + * The invalid bit of a single region or segment table entry is set + * and the associated TLB entries depending on the entry are flushed. + * The table-type of the @asce identifies the portion of the @vaddr + * that is used as the invalidation index. + */ +static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) +{ + asm volatile( + " .insn rrf,0xb98e0000,%0,%1,0,0" + : : "a" (asce), "a" (vaddr) : "cc", "memory"); +} + +/** + * gmap_unshadow_page - remove a page from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ + if (!table || *table & _PAGE_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1); + ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); +} + +/** + * __gmap_unshadow_pgt - remove all entries from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @pgt: pointer to the start of a shadow page table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, + unsigned long *pgt) +{ + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < 256; i++, raddr += 1UL << 12) + pgt[i] = _PAGE_INVALID; +} + +/** + * gmap_unshadow_pgt - remove a shadow page table from a segment entry + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long sto, *ste, *pgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ + if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1); + sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff)); + gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); + pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + *ste = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, pgt); + /* Free page table */ + page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + list_del(&page->lru); + page_table_free_pgste(page); +} + +/** + * __gmap_unshadow_sgt - remove all entries from a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @sgt: pointer to the start of a shadow segment table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, + unsigned long *sgt) +{ + unsigned long asce, *pgt; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT; + for (i = 0; i < 2048; i++, raddr += 1UL << 20) { + if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) + continue; + pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + sgt[i] = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, pgt); + /* Free page table */ + page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + list_del(&page->lru); + page_table_free_pgste(page); + } +} + +/** + * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the shadow->guest_table_lock + */ +static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long r3o, *r3e, *sgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ + if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1); + r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff)); + gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); + sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + *r3e = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, sgt); + /* Free segment table */ + page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * @r3t: pointer to the start of a shadow region-3 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, + unsigned long *r3t) +{ + unsigned long asce, *sgt; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r3t | _ASCE_TYPE_REGION3; + for (i = 0; i < 2048; i++, raddr += 1UL << 31) { + if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) + continue; + sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + r3t[i] = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, sgt); + /* Free segment table */ + page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r2o, *r2e, *r3t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ + if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1); + r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff)); + gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); + r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + *r2e = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, r3t); + /* Free region 3 table */ + page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r2t: pointer to the start of a shadow region-2 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, + unsigned long *r2t) +{ + unsigned long asce, *r3t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r2t | _ASCE_TYPE_REGION2; + for (i = 0; i < 2048; i++, raddr += 1UL << 42) { + if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r2t[i] = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, r3t); + /* Free region 3 table */ + page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r1o, *r1e, *r2t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ + if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1); + r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff)); + gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); + r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + *r1e = _REGION1_ENTRY_EMPTY; + __gmap_unshadow_r2t(sg, raddr, r2t); + /* Free region 2 table */ + page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r1t: pointer to the start of a shadow region-1 table + * + * Called with the shadow->guest_table_lock + */ +static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, + unsigned long *r1t) +{ + unsigned long asce, *r2t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + for (i = 0; i < 2048; i++, raddr += 1UL << 53) { + if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); + __gmap_unshadow_r2t(sg, raddr, r2t); + /* Clear entry and flush translation r1t -> r2t */ + gmap_idte_one(asce, raddr); + r1t[i] = _REGION1_ENTRY_EMPTY; + /* Free region 2 table */ + page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow - remove a shadow page table completely + * @sg: pointer to the shadow guest address space structure + * + * Called with sg->guest_table_lock + */ +static void gmap_unshadow(struct gmap *sg) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + if (sg->removed) + return; + sg->removed = 1; + gmap_call_notifier(sg, 0, -1UL); + gmap_flush_tlb(sg); + table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + switch (sg->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + __gmap_unshadow_r1t(sg, 0, table); + break; + case _ASCE_TYPE_REGION2: + __gmap_unshadow_r2t(sg, 0, table); + break; + case _ASCE_TYPE_REGION3: + __gmap_unshadow_r3t(sg, 0, table); + break; + case _ASCE_TYPE_SEGMENT: + __gmap_unshadow_sgt(sg, 0, table); + break; + } +} + +/** + * gmap_find_shadow - find a specific asce in the list of shadow tables + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * Returns the pointer to a gmap if a shadow table with the given asce is + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL + */ +static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, + int edat_level) +{ + struct gmap *sg; + + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce != asce || sg->edat_level != edat_level || + sg->removed) + continue; + if (!sg->initialized) + return ERR_PTR(-EAGAIN); + atomic_inc(&sg->ref_count); + return sg; + } + return NULL; +} + +/** + * gmap_shadow_valid - check if a shadow guest address space matches the + * given properties and is still valid + * @sg: pointer to the shadow guest address space structure + * @asce: ASCE for which the shadow table is requested + * @edat_level: edat level to be used for the shadow translation + * + * Returns 1 if the gmap shadow is still valid and matches the given + * properties, the caller can continue using it. Returns 0 otherwise, the + * caller has to request a new shadow gmap in this case. + * + */ +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +{ + if (sg->removed) + return 0; + return sg->orig_asce == asce && sg->edat_level == edat_level; +} +EXPORT_SYMBOL_GPL(gmap_shadow_valid); + +/** + * gmap_shadow - create/find a shadow guest address space + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. + */ +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level) +{ + struct gmap *sg, *new; + unsigned long limit; + int rc; + + BUG_ON(gmap_is_shadow(parent)); + spin_lock(&parent->shadow_lock); + sg = gmap_find_shadow(parent, asce, edat_level); + spin_unlock(&parent->shadow_lock); + if (sg) + return sg; + /* Create a new shadow gmap */ + limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); + if (asce & _ASCE_REAL_SPACE) + limit = -1UL; + new = gmap_alloc(limit); + if (!new) + return ERR_PTR(-ENOMEM); + new->mm = parent->mm; + new->parent = gmap_get(parent); + new->orig_asce = asce; + new->edat_level = edat_level; + new->initialized = false; + spin_lock(&parent->shadow_lock); + /* Recheck if another CPU created the same shadow */ + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + spin_unlock(&parent->shadow_lock); + gmap_free(new); + return sg; + } + if (asce & _ASCE_REAL_SPACE) { + /* only allow one real-space gmap shadow */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce & _ASCE_REAL_SPACE) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + break; + } + } + } + atomic_set(&new->ref_count, 2); + list_add(&new->list, &parent->children); + if (asce & _ASCE_REAL_SPACE) { + /* nothing to protect, return right away */ + new->initialized = true; + spin_unlock(&parent->shadow_lock); + return new; + } + spin_unlock(&parent->shadow_lock); + /* protect after insertion, so it will get properly invalidated */ + down_read(&parent->mm->mmap_sem); + rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, + ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096, + PROT_READ, PGSTE_VSIE_BIT); + up_read(&parent->mm->mmap_sem); + spin_lock(&parent->shadow_lock); + new->initialized = true; + if (rc) { + list_del(&new->list); + gmap_free(new); + new = ERR_PTR(rc); + } + spin_unlock(&parent->shadow_lock); + return new; +} +EXPORT_SYMBOL_GPL(gmap_shadow); + +/** + * gmap_shadow_r2t - create an empty shadow region 2 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r2t: parent gmap address of the region 2 table to get shadowed + * @fake: r2t references contiguous guest memory block, not a r2t + * + * The r2t parameter specifies the address of the source table. The + * four pages of the source table are made read-only in the parent gmap + * address space. A write to the source table area @r2t will automatically + * remove the shadow r2 table and all of its decendents. + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_r2t, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = r2t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_r2t = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r2t & _REGION_ENTRY_PROTECT); + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r2t read-only in parent gmap page table */ + raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1; + origin = r2t & _REGION_ENTRY_ORIGIN; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 4); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_r2t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r2t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r2t); + +/** + * gmap_shadow_r3t - create a shadow region 3 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r3t: parent gmap address of the region 3 table to get shadowed + * @fake: r3t references contiguous guest memory block, not a r3t + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_r3t, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = r3t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_r3t = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + } + crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r3t & _REGION_ENTRY_PROTECT); + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r3t read-only in parent gmap page table */ + raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2; + origin = r3t & _REGION_ENTRY_ORIGIN; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 3); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_r3t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r3t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r3t); + +/** + * gmap_shadow_sgt - create a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @sgt: parent gmap address of the segment table to get shadowed + * @fake: sgt references contiguous guest memory block, not a sgt + * + * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_sgt, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); + /* Allocate a shadow segment table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = sgt & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_sgt = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= sgt & _REGION_ENTRY_PROTECT; + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make sgt read-only in parent gmap page table */ + raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3; + origin = sgt & _REGION_ENTRY_ORIGIN; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 2); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_sgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_sgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_sgt); + +/** + * gmap_shadow_lookup_pgtable - find a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: the address in the shadow aguest address space + * @pgt: parent gmap address of the page table to get shadowed + * @dat_protection: if the pgtable is marked as protected by dat + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if the shadow page table was found and -EAGAIN if the page + * table was not found. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, + int *fake) +{ + unsigned long *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { + /* Shadow page tables are full pages (pte+pgste) */ + page = pfn_to_page(*table >> PAGE_SHIFT); + *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; + *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); + *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); + rc = 0; + } else { + rc = -EAGAIN; + } + spin_unlock(&sg->guest_table_lock); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); + +/** + * gmap_shadow_pgt - instantiate a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: parent gmap address of the page table to get shadowed + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory, + * -EFAULT if an address in the parent gmap could not be resolved and + * + * Called with gmap->mm->mmap_sem in read + */ +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake) +{ + unsigned long raddr, origin; + unsigned long *s_pgt, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); + /* Allocate a shadow page table */ + page = page_table_alloc_pgste(sg->mm); + if (!page) + return -ENOMEM; + page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_pgt = (unsigned long *) page_to_phys(page); + /* Install shadow page table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _SEGMENT_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _SEGMENT_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | + (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; + list_add(&page->lru, &sg->pt_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_SEGMENT_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make pgt read-only in parent gmap page table (not the pgste) */ + raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT; + origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; + rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 1); + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != + (unsigned long) s_pgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_SEGMENT_ENTRY_INVALID; + } else { + gmap_unshadow_pgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + page_table_free_pgste(page); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt); + +/** + * gmap_shadow_page - create a shadow page mapping + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pte: pte in parent gmap address space to get shadowed + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr, paddr; + spinlock_t *ptl; + pte_t *sptep, *tptep; + int prot; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; + + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + if (!rmap) + return -ENOMEM; + rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; + + while (1) { + paddr = pte_val(pte) & PAGE_MASK; + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = radix_tree_preload(GFP_KERNEL); + if (rc) + break; + rc = -EAGAIN; + sptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (sptep) { + spin_lock(&sg->guest_table_lock); + /* Get page table pointer */ + tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); + if (!tptep) { + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptl); + radix_tree_preload_end(); + break; + } + rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); + if (rc > 0) { + /* Success and a new mapping */ + gmap_insert_rmap(sg, vmaddr, rmap); + rmap = NULL; + rc = 0; + } + gmap_pte_op_end(ptl); + spin_unlock(&sg->guest_table_lock); + } + radix_tree_preload_end(); + if (!rc) + break; + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + if (rc) + break; + } + kfree(rmap); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_page); + +/** + * gmap_shadow_notify - handle notifications for shadow gmap + * + * Called with sg->parent->shadow_lock. + */ +static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, + unsigned long offset, pte_t *pte) +{ + struct gmap_rmap *rmap, *rnext, *head; + unsigned long gaddr, start, end, bits, raddr; + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + spin_lock(&sg->parent->guest_table_lock); + table = radix_tree_lookup(&sg->parent->host_to_guest, + vmaddr >> PMD_SHIFT); + gaddr = table ? __gmap_segment_gaddr(table) + offset : 0; + spin_unlock(&sg->parent->guest_table_lock); + if (!table) + return; + + spin_lock(&sg->guest_table_lock); + if (sg->removed) { + spin_unlock(&sg->guest_table_lock); + return; + } + /* Check for top level table */ + start = sg->orig_asce & _ASCE_ORIGIN; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096; + if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && + gaddr < end) { + /* The complete shadow table has to go */ + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + return; + } + /* Remove the page table tree from on specific entry */ + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12); + gmap_for_each_rmap_safe(rmap, rnext, head) { + bits = rmap->raddr & _SHADOW_RMAP_MASK; + raddr = rmap->raddr ^ bits; + switch (bits) { + case _SHADOW_RMAP_REGION1: + gmap_unshadow_r2t(sg, raddr); + break; + case _SHADOW_RMAP_REGION2: + gmap_unshadow_r3t(sg, raddr); + break; + case _SHADOW_RMAP_REGION3: + gmap_unshadow_sgt(sg, raddr); + break; + case _SHADOW_RMAP_SEGMENT: + gmap_unshadow_pgt(sg, raddr); + break; + case _SHADOW_RMAP_PGTABLE: + gmap_unshadow_page(sg, raddr); + break; + } + kfree(rmap); + } + spin_unlock(&sg->guest_table_lock); +} /** * ptep_notify - call all invalidation callbacks for a specific pte. * @mm: pointer to the process mm_struct * @addr: virtual address in the process address space * @pte: pointer to the page table entry + * @bits: bits from the pgste that caused the notify call * * This function is assumed to be called with the page table lock held * for the pte to notify. */ -void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) +void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, + pte_t *pte, unsigned long bits) { unsigned long offset, gaddr; unsigned long *table; - struct gmap_notifier *nb; - struct gmap *gmap; + struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); offset = offset * (4096 / sizeof(pte_t)); - spin_lock(&gmap_notifier_lock); - list_for_each_entry(gmap, &mm->context.gmap_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, + &gmap->children, list) + gmap_shadow_notify(sg, vmaddr, offset, pte); + spin_unlock(&gmap->shadow_lock); + } + if (!(bits & PGSTE_IN_BIT)) + continue; + spin_lock(&gmap->guest_table_lock); table = radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); - if (!table) - continue; - gaddr = __gmap_segment_gaddr(table) + offset; - list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(gmap, gaddr); + if (table) + gaddr = __gmap_segment_gaddr(table) + offset; + spin_unlock(&gmap->guest_table_lock); + if (table) + gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); } - spin_unlock(&gmap_notifier_lock); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ptep_notify); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index e2565d2d0c32..995f78532cc2 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) return new; } +#ifdef CONFIG_PGSTE + +struct page *page_table_alloc_pgste(struct mm_struct *mm) +{ + struct page *page; + unsigned long *table; + + page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + if (page) { + table = (unsigned long *) page_to_phys(page); + clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); + clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); + } + return page; +} + +void page_table_free_pgste(struct page *page) +{ + __free_page(page); +} + +#endif /* CONFIG_PGSTE */ + /* * page table entry allocation/free routines. */ @@ -149,7 +172,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Try to get a fragment of a 4K page as a 2K page table */ if (!mm_alloc_pgste(mm)) { table = NULL; - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); if (!list_empty(&mm->context.pgtable_list)) { page = list_first_entry(&mm->context.pgtable_list, struct page, lru); @@ -164,7 +187,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) list_del(&page->lru); } } - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); if (table) return table; } @@ -187,9 +210,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Return the first 2K fragment of the page */ atomic_set(&page->_mapcount, 1); clear_table(table, _PAGE_INVALID, PAGE_SIZE); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); } return table; } @@ -203,13 +226,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); mask = atomic_xor_bits(&page->_mapcount, 1U << bit); if (mask & 3) list_add(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); if (mask != 0) return; } @@ -235,13 +258,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, return; } bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); if (mask & 3) list_add_tail(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); table = (unsigned long *) (__pa(table) | (1U << bit)); tlb_remove_table(tlb, table); } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index b98d1a152d46..5f092015aaa7 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -174,14 +174,17 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) return pgste; } -static inline pgste_t pgste_ipte_notify(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, pgste_t pgste) +static inline pgste_t pgste_pte_notify(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE - if (pgste_val(pgste) & PGSTE_IN_BIT) { - pgste_val(pgste) &= ~PGSTE_IN_BIT; - ptep_notify(mm, addr, ptep); + unsigned long bits; + + bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); + if (bits) { + pgste_val(pgste) ^= bits; + ptep_notify(mm, addr, ptep, bits); } #endif return pgste; @@ -194,7 +197,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm, if (mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); - pgste = pgste_ipte_notify(mm, addr, ptep, pgste); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); } return pgste; } @@ -459,6 +462,90 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) preempt_enable(); } +/** + * ptep_force_prot - change access rights of a locked pte + * @mm: pointer to the process mm_struct + * @addr: virtual address in the guest address space + * @ptep: pointer to the page table entry + * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bit: pgste bit to set (e.g. for notification) + * + * Returns 0 if the access rights were changed and -EAGAIN if the current + * and requested access rights are incompatible. + */ +int ptep_force_prot(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int prot, unsigned long bit) +{ + pte_t entry; + pgste_t pgste; + int pte_i, pte_p; + + pgste = pgste_get_lock(ptep); + entry = *ptep; + /* Check pte entry after all locks have been acquired */ + pte_i = pte_val(entry) & _PAGE_INVALID; + pte_p = pte_val(entry) & _PAGE_PROTECT; + if ((pte_i && (prot != PROT_NONE)) || + (pte_p && (prot & PROT_WRITE))) { + pgste_set_unlock(ptep, pgste); + return -EAGAIN; + } + /* Change access rights and set pgste bit */ + if (prot == PROT_NONE && !pte_i) { + ptep_flush_direct(mm, addr, ptep); + pgste = pgste_update_all(entry, pgste, mm); + pte_val(entry) |= _PAGE_INVALID; + } + if (prot == PROT_READ && !pte_p) { + ptep_flush_direct(mm, addr, ptep); + pte_val(entry) &= ~_PAGE_INVALID; + pte_val(entry) |= _PAGE_PROTECT; + } + pgste_val(pgste) |= bit; + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + return 0; +} + +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte) +{ + pgste_t spgste, tpgste; + pte_t spte, tpte; + int rc = -EAGAIN; + + if (!(pte_val(*tptep) & _PAGE_INVALID)) + return 0; /* already shadowed */ + spgste = pgste_get_lock(sptep); + spte = *sptep; + if (!(pte_val(spte) & _PAGE_INVALID) && + !((pte_val(spte) & _PAGE_PROTECT) && + !(pte_val(pte) & _PAGE_PROTECT))) { + pgste_val(spgste) |= PGSTE_VSIE_BIT; + tpgste = pgste_get_lock(tptep); + pte_val(tpte) = (pte_val(spte) & PAGE_MASK) | + (pte_val(pte) & _PAGE_PROTECT); + /* don't touch the storage key - it belongs to parent pgste */ + tpgste = pgste_set_pte(tptep, tpgste, tpte); + pgste_set_unlock(tptep, tpgste); + rc = 1; + } + pgste_set_unlock(sptep, spgste); + return rc; +} + +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) +{ + pgste_t pgste; + + pgste = pgste_get_lock(ptep); + /* notifier is called by the caller */ + ptep_flush_direct(mm, saddr, ptep); + /* don't touch the storage key - it belongs to parent pgste */ + pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); + pgste_set_unlock(ptep, pgste); +} + static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) { if (!non_swap_entry(entry)) @@ -532,7 +619,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pgste_val(pgste) &= ~PGSTE_UC_BIT; pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { - pgste = pgste_ipte_notify(mm, addr, ptep, pgste); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); __ptep_ipte(addr, ptep); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; @@ -555,12 +642,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pgste_t old, new; pte_t *ptep; - down_read(&mm->mmap_sem); ptep = get_locked_pte(mm, addr, &ptl); - if (unlikely(!ptep)) { - up_read(&mm->mmap_sem); + if (unlikely(!ptep)) return -EFAULT; - } new = old = pgste_get_lock(ptep); pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | @@ -587,45 +671,100 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); - up_read(&mm->mmap_sem); return 0; } EXPORT_SYMBOL(set_guest_storage_key); -unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr) +/** + * Conditionally set a guest storage key (handling csske). + * oldkey will be updated when either mr or mc is set and a pointer is given. + * + * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest + * storage key was updated and -EFAULT on access errors. + */ +int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, unsigned char *oldkey, + bool nq, bool mr, bool mc) +{ + unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT; + int rc; + + /* we can drop the pgste lock between getting and setting the key */ + if (mr | mc) { + rc = get_guest_storage_key(current->mm, addr, &tmp); + if (rc) + return rc; + if (oldkey) + *oldkey = tmp; + if (!mr) + mask |= _PAGE_REFERENCED; + if (!mc) + mask |= _PAGE_CHANGED; + if (!((tmp ^ key) & mask)) + return 0; + } + rc = set_guest_storage_key(current->mm, addr, key, nq); + return rc < 0 ? rc : 1; +} +EXPORT_SYMBOL(cond_set_guest_storage_key); + +/** + * Reset a guest reference bit (rrbe), returning the reference and changed bit. + * + * Returns < 0 in case of error, otherwise the cc to be reported to the guest. + */ +int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) +{ + spinlock_t *ptl; + pgste_t old, new; + pte_t *ptep; + int cc = 0; + + ptep = get_locked_pte(mm, addr, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + + new = old = pgste_get_lock(ptep); + /* Reset guest reference bit only */ + pgste_val(new) &= ~PGSTE_GR_BIT; + + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK); + /* Merge real referenced bit into host-set */ + pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; + } + /* Reflect guest's logical view, not physical */ + cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; + /* Changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) + pgste_val(new) |= PGSTE_UC_BIT; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(reset_guest_reference_bit); + +int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char *key) { - unsigned char key; spinlock_t *ptl; pgste_t pgste; pte_t *ptep; - down_read(&mm->mmap_sem); ptep = get_locked_pte(mm, addr, &ptl); - if (unlikely(!ptep)) { - up_read(&mm->mmap_sem); + if (unlikely(!ptep)) return -EFAULT; - } + pgste = pgste_get_lock(ptep); - - if (pte_val(*ptep) & _PAGE_INVALID) { - key = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56; - key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56; - key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48; - key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48; - } else { - key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK); - - /* Reflect guest's logical view, not physical */ - if (pgste_val(pgste) & PGSTE_GR_BIT) - key |= _PAGE_REFERENCED; - if (pgste_val(pgste) & PGSTE_GC_BIT) - key |= _PAGE_CHANGED; - } - + *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + if (!(pte_val(*ptep) & _PAGE_INVALID)) + *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK); + /* Reflect guest's logical view, not physical */ + *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; pgste_set_unlock(ptep, pgste); pte_unmap_unlock(ptep, ptl); - up_read(&mm->mmap_sem); - return key; + return 0; } EXPORT_SYMBOL(get_guest_storage_key); #endif diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h index 022d16008a00..2303635158f5 100644 --- a/arch/sparc/include/asm/pci_64.h +++ b/arch/sparc/include/asm/pci_64.h @@ -55,9 +55,6 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) } #define HAVE_ARCH_PCI_RESOURCE_TO_USER -void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, - resource_size_t *start, resource_size_t *end); #endif /* __KERNEL__ */ #endif /* __SPARC64_PCI_H */ diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index c2b202d763a1..9c1878f4fa9f 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -986,16 +986,18 @@ void pci_resource_to_user(const struct pci_dev *pdev, int bar, const struct resource *rp, resource_size_t *start, resource_size_t *end) { - struct pci_pbm_info *pbm = pdev->dev.archdata.host_controller; - unsigned long offset; + struct pci_bus_region region; - if (rp->flags & IORESOURCE_IO) - offset = pbm->io_space.start; - else - offset = pbm->mem_space.start; - - *start = rp->start - offset; - *end = rp->end - offset; + /* + * "User" addresses are shown in /sys/devices/pci.../.../resource + * and /proc/bus/pci/devices and used as mmap offsets for + * /proc/bus/pci/BB/DD.F files (see proc_bus_pci_mmap()). + * + * On sparc, these are PCI bus addresses, i.e., raw BAR values. + */ + pcibios_resource_to_bus(pdev->bus, ®ion, (struct resource *) rp); + *start = region.start; + *end = region.end; } void pcibios_set_master(struct pci_dev *dev) diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index cc0013475444..58650d098fb4 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -9,6 +9,7 @@ config UML select GENERIC_CPU_DEVICES select GENERIC_IO select GENERIC_CLOCKEVENTS + select HAVE_GCC_PLUGINS select TTY # Needed for line.c config MMU diff --git a/arch/um/Makefile b/arch/um/Makefile index e3abe6f3156d..0ca46ededfc7 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -78,8 +78,8 @@ include $(ARCH_DIR)/Makefile-os-$(OS) KBUILD_CPPFLAGS += -I$(srctree)/$(HOST_DIR)/include \ -I$(srctree)/$(HOST_DIR)/include/uapi \ - -I$(HOST_DIR)/include/generated \ - -I$(HOST_DIR)/include/generated/uapi + -I$(objtree)/$(HOST_DIR)/include/generated \ + -I$(objtree)/$(HOST_DIR)/include/generated/uapi # -Derrno=kernel_errno - This turns all kernel references to errno into # kernel_errno to separate them from the libc errno. This allows -fno-common diff --git a/arch/unicore32/kernel/pci.c b/arch/unicore32/kernel/pci.c index d45fa5f3e9c4..62137d13c6f9 100644 --- a/arch/unicore32/kernel/pci.c +++ b/arch/unicore32/kernel/pci.c @@ -265,10 +265,8 @@ static int __init pci_common_init(void) pci_fixup_irqs(pci_common_swizzle, pci_puv3_map_irq); - if (!pci_has_flag(PCI_PROBE_ONLY)) { - pci_bus_size_bridges(puv3_bus); - pci_bus_assign_resources(puv3_bus); - } + pci_bus_size_bridges(puv3_bus); + pci_bus_assign_resources(puv3_bus); pci_bus_add_devices(puv3_bus); return 0; } @@ -279,9 +277,6 @@ char * __init pcibios_setup(char *str) if (!strcmp(str, "debug")) { debug_pci = 1; return NULL; - } else if (!strcmp(str, "firmware")) { - pci_add_flags(PCI_PROBE_ONLY); - return NULL; } return str; } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2fa55851d2a9..3a9add58d794 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -111,6 +111,7 @@ config X86 select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER + select HAVE_GCC_PLUGINS select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_HW_BREAKPOINT select HAVE_IDE diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index be8e688fa0d4..12ea8f8384f4 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -96,7 +96,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE $(call if_changed,zoffset) -AFLAGS_header.o += -I$(obj) +AFLAGS_header.o += -I$(objtree)/$(obj) $(obj)/header.o: $(obj)/zoffset.h LDFLAGS_setup.elf := -T diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 6ba89a1ab0e5..d5409660f5de 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -75,7 +75,7 @@ CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ -fno-omit-frame-pointer -foptimize-sibling-calls \ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO -$(vobjs): KBUILD_CFLAGS += $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. @@ -145,6 +145,7 @@ KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 69e62862b622..33ae3a4d0159 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,8 +35,9 @@ #include #include -#define KVM_MAX_VCPUS 255 -#define KVM_SOFT_MAX_VCPUS 160 +#define KVM_MAX_VCPUS 288 +#define KVM_SOFT_MAX_VCPUS 240 +#define KVM_MAX_VCPU_ID 1023 #define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 @@ -599,6 +600,7 @@ struct kvm_vcpu_arch { u64 mcg_cap; u64 mcg_status; u64 mcg_ctl; + u64 mcg_ext_ctl; u64 *mce_banks; /* Cache MMIO info */ @@ -682,9 +684,12 @@ struct kvm_arch_memory_slot { struct kvm_apic_map { struct rcu_head rcu; u8 mode; - struct kvm_lapic *phys_map[256]; - /* first index is cluster id second is cpu id in a cluster */ - struct kvm_lapic *logical_map[16][16]; + u32 max_apic_id; + union { + struct kvm_lapic *xapic_flat_map[8]; + struct kvm_lapic *xapic_cluster_map[16][4]; + }; + struct kvm_lapic *phys_map[]; }; /* Hyper-V emulation context */ @@ -779,6 +784,9 @@ struct kvm_arch { u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; + + bool x2apic_format; + bool x2apic_broadcast_quirk_disabled; }; struct kvm_vm_stat { @@ -1006,6 +1014,11 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); + + int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc); + void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); + + void (*setup_mce)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1026,7 +1039,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -1077,6 +1090,10 @@ extern u32 kvm_max_guest_tsc_khz; extern u8 kvm_tsc_scaling_ratio_frac_bits; /* maximum allowed value of TSC scaling ratio */ extern u64 kvm_max_tsc_scaling_ratio; +/* 1ull << kvm_tsc_scaling_ratio_frac_bits */ +extern u64 kvm_default_tsc_scaling_ratio; + +extern u64 kvm_mce_cap_supported; enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -1352,7 +1369,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); -void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index d0fe23ec7e98..14824fc78f7e 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -193,7 +193,6 @@ struct __attribute__ ((__packed__)) vmcb { struct vmcb_save_area save; }; -#define SVM_CPUID_FEATURE_SHIFT 2 #define SVM_CPUID_FUNC 0x8000000a #define SVM_VM_CR_SVM_DISABLE 4 diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index cce9ee68e335..0116b2ee9e64 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -83,23 +83,19 @@ static inline void cpu_emergency_vmxoff(void) */ static inline int cpu_has_svm(const char **msg) { - uint32_t eax, ebx, ecx, edx; - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { if (msg) *msg = "not amd"; return 0; } - cpuid(0x80000000, &eax, &ebx, &ecx, &edx); - if (eax < SVM_CPUID_FUNC) { + if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) { if (msg) *msg = "can't execute cpuid_8000000a"; return 0; } - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { + if (!boot_cpu_has(X86_FEATURE_SVM)) { if (msg) *msg = "svm not available"; return 0; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 639a6e34500c..ab8e32f7b9a8 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -32,7 +32,6 @@ config KVM select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD - select KVM_APIC_ARCHITECTURE select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a4bf5b45d65a..5fb6c620180e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -645,7 +645,6 @@ static const struct kvm_io_device_ops speaker_dev_ops = { .write = speaker_ioport_write, }; -/* Caller must hold slots_lock */ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; @@ -690,6 +689,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_pit_set_reinject(pit, true); + mutex_lock(&kvm->slots_lock); kvm_iodevice_init(&pit->dev, &pit_dev_ops); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, KVM_PIT_MEM_LENGTH, &pit->dev); @@ -704,12 +704,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) if (ret < 0) goto fail_register_speaker; } + mutex_unlock(&kvm->slots_lock); return pit; fail_register_speaker: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail_register_pit: + mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); kthread_stop(pit->worker_task); fail_kthread: diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 95e0e6481f07..b181426f67b4 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c @@ -28,9 +28,7 @@ #include #include #include -#include #include -#include #include "assigned-dev.h" static bool allow_unsafe_assigned_interrupts; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index dfb4c6476877..25810b144b58 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -110,13 +110,17 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) { - trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ? + (u64)e->msi.address_hi << 32 : 0), + e->msi.data); irq->dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + if (kvm->arch.x2apic_format) + irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); irq->vector = (e->msi.data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; @@ -129,15 +133,24 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, } EXPORT_SYMBOL_GPL(kvm_set_msi_irq); +static inline bool kvm_msi_route_invalid(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e) +{ + return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff); +} + int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { struct kvm_lapic_irq irq; + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + if (!level) return -1; - kvm_set_msi_irq(e, &irq); + kvm_set_msi_irq(kvm, e, &irq); return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); } @@ -153,7 +166,10 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) return -EWOULDBLOCK; - kvm_set_msi_irq(e, &irq); + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + + kvm_set_msi_irq(kvm, e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) return r; @@ -248,7 +264,8 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; @@ -285,6 +302,9 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, e->msi.address_lo = ue->u.msi.address_lo; e->msi.address_hi = ue->u.msi.address_hi; e->msi.data = ue->u.msi.data; + + if (kvm_msi_route_invalid(kvm, e)) + goto out; break; case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; @@ -388,21 +408,16 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm->arch.nr_reserved_ioapic_pins); for (i = 0; i < nr_ioapic_pins; ++i) { hlist_for_each_entry(entry, &table->map[i], link) { - u32 dest_id, dest_mode; - bool level; + struct kvm_lapic_irq irq; if (entry->type != KVM_IRQ_ROUTING_MSI) continue; - dest_id = (entry->msi.address_lo >> 12) & 0xff; - dest_mode = (entry->msi.address_lo >> 2) & 0x1; - level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL; - if (level && kvm_apic_match_dest(vcpu, NULL, 0, - dest_id, dest_mode)) { - u32 vector = entry->msi.data & 0xff; - __set_bit(vector, - ioapic_handled_vectors); - } + kvm_set_msi_irq(vcpu->kvm, entry, &irq); + + if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, + irq.dest_id, irq.dest_mode)) + __set_bit(irq.vector, ioapic_handled_vectors); } } srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 57549ed47ca5..730cf174090a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -115,26 +115,43 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -/* The logical map is definitely wrong if we have multiple - * modes at the same time. (Physical map is always right.) - */ -static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map) -{ - return !(map->mode & (map->mode - 1)); +static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, + u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { + switch (map->mode) { + case KVM_APIC_MODE_X2APIC: { + u32 offset = (dest_id >> 16) * 16; + u32 max_apic_id = map->max_apic_id; + + if (offset <= max_apic_id) { + u8 cluster_size = min(max_apic_id - offset + 1, 16U); + + *cluster = &map->phys_map[offset]; + *mask = dest_id & (0xffff >> (16 - cluster_size)); + } else { + *mask = 0; + } + + return true; + } + case KVM_APIC_MODE_XAPIC_FLAT: + *cluster = map->xapic_flat_map; + *mask = dest_id & 0xff; + return true; + case KVM_APIC_MODE_XAPIC_CLUSTER: + *cluster = map->xapic_cluster_map[dest_id >> 4]; + *mask = dest_id & 0xf; + return true; + default: + /* Not optimized. */ + return false; + } } -static inline void -apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid) +static void kvm_apic_map_free(struct rcu_head *rcu) { - unsigned lid_bits; + struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu); - BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER != 4); - BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT != 8); - BUILD_BUG_ON(KVM_APIC_MODE_X2APIC != 16); - lid_bits = map->mode; - - *cid = dest_id >> lid_bits; - *lid = dest_id & ((1 << lid_bits) - 1); + kvfree(map); } static void recalculate_apic_map(struct kvm *kvm) @@ -142,17 +159,26 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; - - new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL); + u32 max_id = 255; mutex_lock(&kvm->arch.apic_map_lock); + kvm_for_each_vcpu(i, vcpu, kvm) + if (kvm_apic_present(vcpu)) + max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); + + new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); + if (!new) goto out; + new->max_apic_id = max_id; + kvm_for_each_vcpu(i, vcpu, kvm) { struct kvm_lapic *apic = vcpu->arch.apic; - u16 cid, lid; + struct kvm_lapic **cluster; + u16 mask; u32 ldr, aid; if (!kvm_apic_present(vcpu)) @@ -161,7 +187,7 @@ static void recalculate_apic_map(struct kvm *kvm) aid = kvm_apic_id(apic); ldr = kvm_lapic_get_reg(apic, APIC_LDR); - if (aid < ARRAY_SIZE(new->phys_map)) + if (aid <= new->max_apic_id) new->phys_map[aid] = apic; if (apic_x2apic_mode(apic)) { @@ -174,13 +200,11 @@ static void recalculate_apic_map(struct kvm *kvm) new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; } - if (!kvm_apic_logical_map_valid(new)) + if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask)) continue; - apic_logical_id(new, ldr, &cid, &lid); - - if (lid && cid < ARRAY_SIZE(new->logical_map)) - new->logical_map[cid][ffs(lid) - 1] = apic; + if (mask) + cluster[ffs(mask) - 1] = apic; } out: old = rcu_dereference_protected(kvm->arch.apic_map, @@ -189,7 +213,7 @@ static void recalculate_apic_map(struct kvm *kvm) mutex_unlock(&kvm->arch.apic_map_lock); if (old) - kfree_rcu(old, rcu); + call_rcu(&old->rcu, kvm_apic_map_free); kvm_make_scan_ioapic_request(kvm); } @@ -210,7 +234,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) } } -static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) +static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) { kvm_lapic_set_reg(apic, APIC_ID, id << 24); recalculate_apic_map(apic->vcpu->kvm); @@ -222,11 +246,11 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) recalculate_apic_map(apic->vcpu->kvm); } -static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id) +static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - kvm_lapic_set_reg(apic, APIC_ID, id << 24); + kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); } @@ -599,17 +623,30 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) } } -/* KVM APIC implementation has two quirks - * - dest always begins at 0 while xAPIC MDA has offset 24, - * - IOxAPIC messages have to be delivered (directly) to x2APIC. +/* The KVM local APIC implementation has two quirks: + * + * - the xAPIC MDA stores the destination at bits 24-31, while this + * is not true of struct kvm_lapic_irq's dest_id field. This is + * just a quirk in the API and is not problematic. + * + * - in-kernel IOAPIC messages have to be delivered directly to + * x2APIC, because the kernel does not support interrupt remapping. + * In order to support broadcast without interrupt remapping, x2APIC + * rewrites the destination of non-IPI messages from APIC_BROADCAST + * to X2APIC_BROADCAST. + * + * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is + * important when userspace wants to use x2APIC-format MSIs, because + * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7". */ -static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source, - struct kvm_lapic *target) +static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, + struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); - if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda) + if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && + !ipi && dest_id == APIC_BROADCAST && x2apic_mda) return X2APIC_BROADCAST; return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); @@ -619,7 +656,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; - u32 mda = kvm_apic_mda(dest, source, target); + u32 mda = kvm_apic_mda(vcpu, dest, source, target); apic_debug("target %p, source %p, dest 0x%x, " "dest_mode 0x%x, short_hand 0x%x\n", @@ -671,14 +708,105 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) } } +static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, + struct kvm_lapic_irq *irq, struct kvm_apic_map *map) +{ + if (kvm->arch.x2apic_broadcast_quirk_disabled) { + if ((irq->dest_id == APIC_BROADCAST && + map->mode != KVM_APIC_MODE_X2APIC)) + return true; + if (irq->dest_id == X2APIC_BROADCAST) + return true; + } else { + bool x2apic_ipi = src && *src && apic_x2apic_mode(*src); + if (irq->dest_id == (x2apic_ipi ? + X2APIC_BROADCAST : APIC_BROADCAST)) + return true; + } + + return false; +} + +/* Return true if the interrupt can be handled by using *bitmap as index mask + * for valid destinations in *dst array. + * Return false if kvm_apic_map_get_dest_lapic did nothing useful. + * Note: we may have zero kvm_lapic destinations when we return true, which + * means that the interrupt should be dropped. In this case, *bitmap would be + * zero and *dst undefined. + */ +static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, + struct kvm_lapic **src, struct kvm_lapic_irq *irq, + struct kvm_apic_map *map, struct kvm_lapic ***dst, + unsigned long *bitmap) +{ + int i, lowest; + + if (irq->shorthand == APIC_DEST_SELF && src) { + *dst = src; + *bitmap = 1; + return true; + } else if (irq->shorthand) + return false; + + if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map)) + return false; + + if (irq->dest_mode == APIC_DEST_PHYSICAL) { + if (irq->dest_id > map->max_apic_id) { + *bitmap = 0; + } else { + *dst = &map->phys_map[irq->dest_id]; + *bitmap = 1; + } + return true; + } + + *bitmap = 0; + if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst, + (u16 *)bitmap)) + return false; + + if (!kvm_lowest_prio_delivery(irq)) + return true; + + if (!kvm_vector_hashing_enabled()) { + lowest = -1; + for_each_set_bit(i, bitmap, 16) { + if (!(*dst)[i]) + continue; + if (lowest < 0) + lowest = i; + else if (kvm_apic_compare_prio((*dst)[i]->vcpu, + (*dst)[lowest]->vcpu) < 0) + lowest = i; + } + } else { + if (!*bitmap) + return true; + + lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap), + bitmap, 16); + + if (!(*dst)[lowest]) { + kvm_apic_disabled_lapic_found(kvm); + *bitmap = 0; + return true; + } + } + + *bitmap = (lowest >= 0) ? 1 << lowest : 0; + + return true; +} + bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) { struct kvm_apic_map *map; - unsigned long bitmap = 1; - struct kvm_lapic **dst; + unsigned long bitmap; + struct kvm_lapic **dst = NULL; int i; - bool ret, x2apic_ipi; + bool ret; *r = -1; @@ -687,86 +815,19 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, return true; } - if (irq->shorthand) - return false; - - x2apic_ipi = src && apic_x2apic_mode(src); - if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) - return false; - - ret = true; rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (!map) { - ret = false; - goto out; - } - - if (irq->dest_mode == APIC_DEST_PHYSICAL) { - if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) - goto out; - - dst = &map->phys_map[irq->dest_id]; - } else { - u16 cid; - - if (!kvm_apic_logical_map_valid(map)) { - ret = false; - goto out; + ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap); + if (ret) + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (*r < 0) + *r = 0; + *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } - apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); - - if (cid >= ARRAY_SIZE(map->logical_map)) - goto out; - - dst = map->logical_map[cid]; - - if (!kvm_lowest_prio_delivery(irq)) - goto set_irq; - - if (!kvm_vector_hashing_enabled()) { - int l = -1; - for_each_set_bit(i, &bitmap, 16) { - if (!dst[i]) - continue; - if (l < 0) - l = i; - else if (kvm_apic_compare_prio(dst[i]->vcpu, - dst[l]->vcpu) < 0) - l = i; - } - bitmap = (l >= 0) ? 1 << l : 0; - } else { - int idx; - unsigned int dest_vcpus; - - dest_vcpus = hweight16(bitmap); - if (dest_vcpus == 0) - goto out; - - idx = kvm_vector_to_index(irq->vector, - dest_vcpus, &bitmap, 16); - - if (!dst[idx]) { - kvm_apic_disabled_lapic_found(kvm); - goto out; - } - - bitmap = (idx >= 0) ? 1 << idx : 0; - } - } - -set_irq: - for_each_set_bit(i, &bitmap, 16) { - if (!dst[i]) - continue; - if (*r < 0) - *r = 0; - *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); - } -out: rcu_read_unlock(); return ret; } @@ -789,8 +850,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { struct kvm_apic_map *map; + unsigned long bitmap; + struct kvm_lapic **dst = NULL; bool ret = false; - struct kvm_lapic *dst = NULL; if (irq->shorthand) return false; @@ -798,69 +860,16 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (!map) - goto out; + if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) && + hweight16(bitmap) == 1) { + unsigned long i = find_first_bit(&bitmap, 16); - if (irq->dest_mode == APIC_DEST_PHYSICAL) { - if (irq->dest_id == 0xFF) - goto out; - - if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) - goto out; - - dst = map->phys_map[irq->dest_id]; - if (dst && kvm_apic_present(dst->vcpu)) - *dest_vcpu = dst->vcpu; - else - goto out; - } else { - u16 cid; - unsigned long bitmap = 1; - int i, r = 0; - - if (!kvm_apic_logical_map_valid(map)) - goto out; - - apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); - - if (cid >= ARRAY_SIZE(map->logical_map)) - goto out; - - if (kvm_vector_hashing_enabled() && - kvm_lowest_prio_delivery(irq)) { - int idx; - unsigned int dest_vcpus; - - dest_vcpus = hweight16(bitmap); - if (dest_vcpus == 0) - goto out; - - idx = kvm_vector_to_index(irq->vector, dest_vcpus, - &bitmap, 16); - - dst = map->logical_map[cid][idx]; - if (!dst) { - kvm_apic_disabled_lapic_found(kvm); - goto out; - } - - *dest_vcpu = dst->vcpu; - } else { - for_each_set_bit(i, &bitmap, 16) { - dst = map->logical_map[cid][i]; - if (++r == 2) - goto out; - } - - if (dst && kvm_apic_present(dst->vcpu)) - *dest_vcpu = dst->vcpu; - else - goto out; + if (dst[i]) { + *dest_vcpu = dst[i]->vcpu; + ret = true; } } - ret = true; -out: rcu_read_unlock(); return ret; } @@ -1127,12 +1136,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) return 0; switch (offset) { - case APIC_ID: - if (apic_x2apic_mode(apic)) - val = kvm_apic_id(apic); - else - val = kvm_apic_id(apic) << 24; - break; case APIC_ARBPRI: apic_debug("Access APIC ARBPRI register which is for P6\n"); break; @@ -1314,6 +1317,108 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) nsec_to_cycles(vcpu, lapic_timer_advance_ns))); } +static void start_sw_tscdeadline(struct kvm_lapic *apic) +{ + u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + u64 ns = 0; + ktime_t expire; + struct kvm_vcpu *vcpu = apic->vcpu; + unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; + unsigned long flags; + ktime_t now; + + if (unlikely(!tscdeadline || !this_tsc_khz)) + return; + + local_irq_save(flags); + + now = apic->lapic_timer.timer.base->get_time(); + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + if (likely(tscdeadline > guest_tsc)) { + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + expire = ktime_add_ns(now, ns); + expire = ktime_sub_ns(expire, lapic_timer_advance_ns); + hrtimer_start(&apic->lapic_timer.timer, + expire, HRTIMER_MODE_ABS_PINNED); + } else + apic_timer_expired(apic); + + local_irq_restore(flags); +} + +bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.apic->lapic_timer.hv_timer_in_use; +} +EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); + +static void cancel_hv_tscdeadline(struct kvm_lapic *apic) +{ + kvm_x86_ops->cancel_hv_timer(apic->vcpu); + apic->lapic_timer.hv_timer_in_use = false; +} + +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + WARN_ON(swait_active(&vcpu->wq)); + cancel_hv_tscdeadline(apic); + apic_timer_expired(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); + +static bool start_hv_tscdeadline(struct kvm_lapic *apic) +{ + u64 tscdeadline = apic->lapic_timer.tscdeadline; + + if (atomic_read(&apic->lapic_timer.pending) || + kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_tscdeadline(apic); + } else { + apic->lapic_timer.hv_timer_in_use = true; + hrtimer_cancel(&apic->lapic_timer.timer); + + /* In case the sw timer triggered in the window */ + if (atomic_read(&apic->lapic_timer.pending)) + cancel_hv_tscdeadline(apic); + } + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, + apic->lapic_timer.hv_timer_in_use); + return apic->lapic_timer.hv_timer_in_use; +} + +void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(apic->lapic_timer.hv_timer_in_use); + + if (apic_lvtt_tscdeadline(apic)) + start_hv_tscdeadline(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); + +void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + /* Possibly the TSC deadline timer is not enabled yet */ + if (!apic->lapic_timer.hv_timer_in_use) + return; + + cancel_hv_tscdeadline(apic); + + if (atomic_read(&apic->lapic_timer.pending)) + return; + + start_sw_tscdeadline(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); + static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; @@ -1360,32 +1465,8 @@ static void start_apic_timer(struct kvm_lapic *apic) ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); } else if (apic_lvtt_tscdeadline(apic)) { - /* lapic timer in tsc deadline mode */ - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; - u64 ns = 0; - ktime_t expire; - struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; - unsigned long flags; - - if (unlikely(!tscdeadline || !this_tsc_khz)) - return; - - local_irq_save(flags); - - now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - if (likely(tscdeadline > guest_tsc)) { - ns = (tscdeadline - guest_tsc) * 1000000ULL; - do_div(ns, this_tsc_khz); - expire = ktime_add_ns(now, ns); - expire = ktime_sub_ns(expire, lapic_timer_advance_ns); - hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS_PINNED); - } else - apic_timer_expired(apic); - - local_irq_restore(flags); + if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) + start_sw_tscdeadline(apic); } } @@ -1413,7 +1494,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) - kvm_apic_set_id(apic, val >> 24); + kvm_apic_set_xapic_id(apic, val >> 24); else ret = 1; break; @@ -1674,9 +1755,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { - if (value & MSR_IA32_APICBASE_ENABLE) + if (value & MSR_IA32_APICBASE_ENABLE) { + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); static_key_slow_dec_deferred(&apic_hw_disabled); - else + } else static_key_slow_inc(&apic_hw_disabled.key); recalculate_apic_map(vcpu->kvm); } @@ -1716,8 +1798,11 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - if (!init_event) - kvm_apic_set_id(apic, vcpu->vcpu_id); + if (!init_event) { + kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE); + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); + } kvm_apic_set_version(apic->vcpu); for (i = 0; i < KVM_APIC_LVT_NUM; i++) @@ -1856,9 +1941,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) * thinking that APIC satet has changed. */ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; - kvm_lapic_set_base(vcpu, - APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); - static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -1938,17 +2020,48 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) return vector; } -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) +static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s, bool set) +{ + if (apic_x2apic_mode(vcpu->arch.apic)) { + u32 *id = (u32 *)(s->regs + APIC_ID); + + if (vcpu->kvm->arch.x2apic_format) { + if (*id != vcpu->vcpu_id) + return -EINVAL; + } else { + if (set) + *id >>= 24; + else + *id <<= 24; + } + } + + return 0; +} + +int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) +{ + memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s)); + return kvm_apic_state_fixup(vcpu, s, false); +} + +int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { struct kvm_lapic *apic = vcpu->arch.apic; + int r; + kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); + + r = kvm_apic_state_fixup(vcpu, s, true); + if (r) + return r; memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); - /* call kvm_apic_set_id() to put apic into apic_map */ - kvm_apic_set_id(apic, kvm_apic_id(apic)); + + recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); apic_update_ppr(apic); @@ -1974,6 +2087,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_rtc_eoi_tracking_restore_one(vcpu); vcpu->arch.apic_arb_prio = 0; + + return 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 891c6da7d4aa..f60d01c29d51 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -20,6 +20,7 @@ struct kvm_timer { u64 tscdeadline; u64 expired_tscdeadline; atomic_t pending; /* accumulated triggered timers */ + bool hv_timer_in_use; }; struct kvm_lapic { @@ -80,8 +81,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s); +int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); +int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); @@ -199,9 +200,15 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } -static inline int kvm_apic_id(struct kvm_lapic *apic) +static inline u32 kvm_apic_id(struct kvm_lapic *apic) { - return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff; + /* To avoid a race between apic_base and following APIC_ID update when + * switching to x2apic_mode, the x2apic mode returns initial x2apic id. + */ + if (apic_x2apic_mode(apic)) + return apic->vcpu->vcpu_id; + + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; } bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); @@ -212,4 +219,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, const unsigned long *bitmap, u32 bitmap_size); +void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); +void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); +bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 745a5f445ae2..3d4cc8cc56a3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -176,6 +176,7 @@ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; +static u64 __read_mostly shadow_present_mask; static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -283,13 +284,14 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; + shadow_present_mask = p_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -305,7 +307,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return pte & PT_PRESENT_MASK && !is_mmio_spte(pte); + return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -524,7 +526,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) } /* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changged. + * Update the state bits, it means the mapped pfn is not changed. * * Whenever we overwrite a writable spte with a read-only one we * should flush remote TLBs. Otherwise rmap_write_protect @@ -2246,10 +2248,9 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, { u64 spte; - BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || - VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | + spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | shadow_user_mask | shadow_x_mask | shadow_accessed_mask; mmu_spte_set(sptep, spte); @@ -2516,13 +2517,19 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte; + u64 spte = 0; int ret = 0; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; - spte = PT_PRESENT_MASK; + /* + * For the EPT case, shadow_present_mask is 0 if hardware + * supports exec-only page table entries. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. + */ + spte |= shadow_present_mask; if (!speculative) spte |= shadow_accessed_mask; @@ -3190,7 +3197,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) MMU_WARN_ON(VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); - if (!is_present_gpte(pdptr)) { + if (!(pdptr & PT_PRESENT_MASK)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } @@ -3915,9 +3922,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, * clearer. */ smap = cr4_smap && u && !uf && !ff; - } else - /* Not really needed: no U/S accesses on ept */ - u = 1; + } fault = (ff && !x) || (uf && !u) || (wf && !w) || (smapf && smap); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 66b33b96a31b..ddc56e91f2e4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -93,11 +93,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } -static inline int is_present_gpte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bc019f70e0b6..a01105485315 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -131,7 +131,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) static inline int FNAME(is_present_gpte)(unsigned long pte) { #if PTTYPE != PTTYPE_EPT - return is_present_gpte(pte); + return pte & PT_PRESENT_MASK; #else return pte & 7; #endif @@ -181,13 +181,19 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, return true; } +/* + * For PTTYPE_EPT, a page table can be executable but not readable + * on supported processors. Therefore, set_spte does not automatically + * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK + * to signify readability since it isn't used in the EPT case + */ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) { unsigned access; #if PTTYPE == PTTYPE_EPT access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | - ACC_USER_MASK; + ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0); #else BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK); BUILD_BUG_ON(ACC_EXEC_MASK != 1); diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c index ab38af4f4947..9d4a8504a95a 100644 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/pmu_intel.c @@ -93,7 +93,7 @@ static unsigned intel_find_fixed_event(int idx) return intel_arch_events[fixed_pmc_events[idx]].event_type; } -/* check if a PMC is enabled by comparising it with globl_ctrl bits. */ +/* check if a PMC is enabled by comparing it with globl_ctrl bits. */ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 16ef31b87452..af523d84d102 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1577,7 +1577,7 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { /* - * Any change of EFLAGS.VM is accompained by a reload of SS + * Any change of EFLAGS.VM is accompanied by a reload of SS * (caused by either a task switch or an inter-privilege IRET), * so we do not need to update the CPL here. */ @@ -4940,6 +4940,12 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, static void svm_handle_external_intr(struct kvm_vcpu *vcpu) { local_irq_enable(); + /* + * We must have an instruction with interrupts enabled, so + * the timer interrupt isn't delayed by the interrupt shadow. + */ + asm("nop"); + local_irq_disable(); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 8de925031b5c..0a6cc6754ec5 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1348,6 +1348,21 @@ TRACE_EVENT(kvm_avic_unaccelerated_access, __entry->vec) ); +TRACE_EVENT(kvm_hv_timer_state, + TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), + TP_ARGS(vcpu_id, hv_timer_in_use), + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(unsigned int, hv_timer_in_use) + ), + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->hv_timer_in_use = hv_timer_in_use; + ), + TP_printk("vcpu_id %x hv_timer %x\n", + __entry->vcpu_id, + __entry->hv_timer_in_use) +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index df07a0a4611f..bc354f003ce1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -110,6 +110,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO); #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL +/* Guest_tsc -> host_tsc conversion requires 64-bit division. */ +static int __read_mostly cpu_preemption_timer_multi; +static bool __read_mostly enable_preemption_timer = 1; +#ifdef CONFIG_X86_64 +module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); +#endif + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -398,6 +405,12 @@ struct nested_vmx { /* The host-usable pointer to the above */ struct page *current_vmcs12_page; struct vmcs12 *current_vmcs12; + /* + * Cache of the guest's VMCS, existing outside of guest memory. + * Loaded from guest memory during VMPTRLD. Flushed to guest + * memory during VMXOFF, VMCLEAR, VMPTRLD. + */ + struct vmcs12 *cached_vmcs12; struct vmcs *current_shadow_vmcs; /* * Indicates if the shadow vmcs must be updated with the @@ -421,7 +434,6 @@ struct nested_vmx { struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; - u64 msr_ia32_feature_control; struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -597,11 +609,22 @@ struct vcpu_vmx { #define PML_ENTITY_NUM 512 struct page *pml_pg; + /* apic deadline value in host tsc */ + u64 hv_deadline_tsc; + u64 current_tsc_ratio; bool guest_pkru_valid; u32 guest_pkru; u32 host_pkru; + + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included + * in msr_ia32_feature_control_valid_bits. + */ + u64 msr_ia32_feature_control; + u64 msr_ia32_feature_control_valid_bits; }; enum segment_cache_field { @@ -841,7 +864,7 @@ static inline short vmcs_field_to_offset(unsigned long field) static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { - return to_vmx(vcpu)->nested.current_vmcs12; + return to_vmx(vcpu)->nested.cached_vmcs12; } static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) @@ -1056,6 +1079,58 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } +/* + * Comment's format: document - errata name - stepping - processor name. + * Refer from + * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp + */ +static u32 vmx_preemption_cpu_tfms[] = { +/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ +0x000206E6, +/* 323056.pdf - AAX65 - C2 - Xeon L3406 */ +/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ +/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020652, +/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020655, +/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ +/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ +/* + * 320767.pdf - AAP86 - B1 - + * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile + */ +0x000106E5, +/* 321333.pdf - AAM126 - C0 - Xeon 3500 */ +0x000106A0, +/* 321333.pdf - AAM126 - C1 - Xeon 3500 */ +0x000106A1, +/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ +0x000106A4, + /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ + /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ + /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ +0x000106A5, +}; + +static inline bool cpu_has_broken_vmx_preemption_timer(void) +{ + u32 eax = cpuid_eax(0x00000001), i; + + /* Clear the reserved bits */ + eax &= ~(0x3U << 14 | 0xfU << 28); + for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) + if (eax == vmx_preemption_cpu_tfms[i]) + return true; + + return false; +} + +static inline bool cpu_has_vmx_preemption_timer(void) +{ + return vmcs_config.pin_based_exec_ctrl & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + static inline bool cpu_has_vmx_posted_intr(void) { return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && @@ -1603,6 +1678,11 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) __vmcs_writel(field, __vmcs_readl(field) | mask); } +static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); +} + static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VM_ENTRY_CONTROLS, val); @@ -1631,6 +1711,11 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); } +static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); +} + static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VM_EXIT_CONTROLS, val); @@ -2121,22 +2206,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + bool already_loaded = vmx->loaded_vmcs->cpu == cpu; if (!vmm_exclusive) kvm_cpu_vmxon(phys_addr); - else if (vmx->loaded_vmcs->cpu != cpu) + else if (!already_loaded) loaded_vmcs_clear(vmx->loaded_vmcs); - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; - vmcs_load(vmx->loaded_vmcs->vmcs); - } - - if (vmx->loaded_vmcs->cpu != cpu) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); - unsigned long sysenter_esp; - - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + if (!already_loaded) { local_irq_disable(); crash_disable_local_vmclear(cpu); @@ -2151,6 +2228,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) &per_cpu(loaded_vmcss_on_cpu, cpu)); crash_enable_local_vmclear(cpu); local_irq_enable(); + } + + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); + } + + if (!already_loaded) { + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); + unsigned long sysenter_esp; + + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); /* * Linux uses per-cpu TSS and GDT, so set these when switching @@ -2716,6 +2805,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; + if (cpu_has_vmx_ept_execute_only()) + vmx->nested.nested_vmx_ept_caps |= + VMX_EPT_EXECUTE_ONLY_BIT; vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; /* * For nested guests, we don't do anything specific @@ -2864,6 +2956,14 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 0; } +static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, + uint64_t val) +{ + uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; + + return !(val & ~valid_bits); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -2905,10 +3005,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; - case MSR_IA32_FEATURE_CONTROL: - if (!nested_vmx_allowed(vcpu)) + case MSR_IA32_MCG_EXT_CTL: + if (!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) return 1; - msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + msr_info->data = vcpu->arch.mcg_ext_ctl; + break; + case MSR_IA32_FEATURE_CONTROL: + msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) @@ -2998,12 +3103,20 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; + case MSR_IA32_MCG_EXT_CTL: + if ((!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) || + (data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = data; + break; case MSR_IA32_FEATURE_CONTROL: - if (!nested_vmx_allowed(vcpu) || - (to_vmx(vcpu)->nested.msr_ia32_feature_control & + if (!vmx_feature_control_msr_valid(vcpu, data) || + (to_vmx(vcpu)->msr_ia32_feature_control & FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) return 1; - vmx->nested.msr_ia32_feature_control = data; + vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); break; @@ -3297,25 +3410,27 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = VM_EXIT_SAVE_DEBUG_CONTROLS; + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; + VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | + PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; + if (cpu_has_broken_vmx_preemption_timer()) + _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || - !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; min = VM_ENTRY_LOAD_DEBUG_CONTROLS; @@ -3364,7 +3479,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) /* * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL - * but due to arrata below it can't be used. Workaround is to use + * but due to errata below it can't be used. Workaround is to use * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. * * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] @@ -4781,6 +4896,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) if (!kvm_vcpu_apicv_active(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + /* Enable the preemption timer dynamically */ + pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; return pin_based_exec_ctrl; } @@ -4896,6 +5013,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) /* Control */ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + vmx->hv_deadline_tsc = -1; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); @@ -6016,12 +6134,14 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - /* It is a write fault? */ - error_code = exit_qualification & PFERR_WRITE_MASK; + /* it is a read fault? */ + error_code = (exit_qualification << 2) & PFERR_USER_MASK; + /* it is a write fault? */ + error_code |= exit_qualification & PFERR_WRITE_MASK; /* It is a fetch fault? */ error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; /* ept page table is present? */ - error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK; + error_code |= (exit_qualification & 0x38) != 0; vcpu->arch.exit_qualification = exit_qualification; @@ -6355,9 +6475,6 @@ static __init int hardware_setup(void) for (msr = 0x800; msr <= 0x8ff; msr++) vmx_disable_intercept_msr_read_x2apic(msr); - /* According SDM, in x2apic mode, the whole id reg is used. But in - * KVM, it only use the highest eight bits. Need to intercept it */ - vmx_enable_intercept_msr_read_x2apic(0x802); /* TMCCT */ vmx_enable_intercept_msr_read_x2apic(0x839); /* TPR */ @@ -6368,10 +6485,12 @@ static __init int hardware_setup(void) vmx_disable_intercept_msr_write_x2apic(0x83f); if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK); + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? + 0ull : VMX_EPT_READABLE_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); } else @@ -6393,8 +6512,21 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + cpu_preemption_timer_multi = + vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + } else { + kvm_x86_ops->set_hv_timer = NULL; + kvm_x86_ops->cancel_hv_timer = NULL; + } + kvm_set_posted_intr_wakeup_handler(wakeup_handler); + kvm_mce_cap_supported |= MCG_LMCE_P; + return alloc_kvm_area(); out8: @@ -6862,16 +6994,22 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); return 1; } + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_vmcs12) + return -ENOMEM; + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) + if (!shadow_vmcs) { + kfree(vmx->nested.cached_vmcs12); return -ENOMEM; + } /* mark vmcs as shadow */ shadow_vmcs->revision_id |= (1u << 31); /* init shadow vmcs */ @@ -6942,6 +7080,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); } vmx->nested.posted_intr_nv = -1; + + /* Flush VMCS12 to guest memory */ + memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12, + VMCS12_SIZE); + kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); vmx->nested.current_vmptr = -1ull; @@ -6962,6 +7105,7 @@ static void free_nested(struct vcpu_vmx *vmx) nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); + kfree(vmx->nested.cached_vmcs12); /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); @@ -7365,6 +7509,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; + /* + * Load VMCS12 from guest memory since it is not already + * cached. + */ + memcpy(vmx->nested.cached_vmcs12, + vmx->nested.current_vmcs12, VMCS12_SIZE); + if (enable_shadow_vmcs) { vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); @@ -7560,6 +7711,12 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } +static int handle_preemption_timer(struct kvm_vcpu *vcpu) +{ + kvm_lapic_expired_hv_timer(vcpu); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -7610,6 +7767,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; static const int kvm_vmx_max_exit_handlers = @@ -7918,6 +8076,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + case EXIT_REASON_PREEMPTION_TIMER: + return false; default: return true; } @@ -8303,7 +8463,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) * the next L2->L1 exit. */ if (!is_guest_mode(vcpu) || - !nested_cpu_has2(vmx->nested.current_vmcs12, + !nested_cpu_has2(get_vmcs12(&vmx->vcpu), SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) vmcs_write64(APIC_ACCESS_ADDR, hpa); } @@ -8436,7 +8596,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) "push %[sp]\n\t" #endif "pushf\n\t" - "orl $0x200, (%%" _ASM_SP ")\n\t" __ASM_SIZE(push) " $%c[cs]\n\t" "call *%[entry]\n\t" : @@ -8449,8 +8608,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); - } else - local_irq_enable(); + } } static bool vmx_has_high_real_mode_segbase(void) @@ -8601,6 +8759,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } +void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl; + u32 delta_tsc; + + if (vmx->hv_deadline_tsc == -1) + return; + + tscl = rdtsc(); + if (vmx->hv_deadline_tsc > tscl) + /* sure to be 32 bit only because checked on set_hv_timer */ + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> + cpu_preemption_timer_multi); + else + delta_tsc = 0; + + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); +} + static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8650,6 +8828,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); + vmx_arm_hv_timer(vcpu); + vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -8940,6 +9120,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; + vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + return &vmx->vcpu; free_vmcs: @@ -9080,6 +9262,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (cpu_has_secondary_exec_ctrls()) vmcs_set_secondary_exec_control(secondary_exec_ctl); + + if (nested_vmx_allowed(vcpu)) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9636,9 +9825,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; - exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + /* Preemption timer setting is only taken from vmcs01. */ + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control |= vmcs_config.pin_based_exec_ctrl; + if (vmx->hv_deadline_tsc == -1) + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + + /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { /* * Note that we use L0's vector here and in @@ -10556,8 +10750,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); - vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); - vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); + vm_entry_controls_reset_shadow(vmx); + vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); /* if no vmcs02 cache requested, remove the one we used */ @@ -10566,8 +10760,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); - /* Update TSC_OFFSET if TSC was changed while L2 ran */ + /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + if (vmx->hv_deadline_tsc == -1) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + else + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); /* This is needed for same reason as it was needed in prepare_vmcs02 */ vmx->host_rsp = 0; @@ -10647,6 +10847,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, return X86EMUL_CONTINUE; } +#ifdef CONFIG_X86_64 +/* (a << shift) / divisor, return 1 if overflow otherwise 0 */ +static inline int u64_shl_div_u64(u64 a, unsigned int shift, + u64 divisor, u64 *result) +{ + u64 low = a << shift, high = a >> (64 - shift); + + /* To avoid the overflow on divq */ + if (high >= divisor) + return 1; + + /* Low hold the result, high hold rem which is discarded */ + asm("divq %2\n\t" : "=a" (low), "=d" (high) : + "rm" (divisor), "0" (low), "1" (high)); + *result = low; + + return 0; +} + +static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl = rdtsc(); + u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); + u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; + + /* Convert to host delta tsc if tsc scaling is enabled */ + if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + u64_shl_div_u64(delta_tsc, + kvm_tsc_scaling_ratio_frac_bits, + vcpu->arch.tsc_scaling_ratio, + &delta_tsc)) + return -ERANGE; + + /* + * If the delta tsc can't fit in the 32 bit after the multi shift, + * we can't use the preemption timer. + * It's possible that it fits on later vmentries, but checking + * on every vmentry is costly so we just use an hrtimer. + */ + if (delta_tsc >> (cpu_preemption_timer_multi + 32)) + return -ERANGE; + + vmx->hv_deadline_tsc = tscl + delta_tsc; + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + return 0; +} + +static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->hv_deadline_tsc = -1; + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); +} +#endif + static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) { if (ple_gap) @@ -10691,7 +10949,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, * this case, return 1, otherwise, return 0. * */ -static int vmx_pre_block(struct kvm_vcpu *vcpu) +static int pi_pre_block(struct kvm_vcpu *vcpu) { unsigned long flags; unsigned int dest; @@ -10758,7 +11016,18 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) return 0; } -static void vmx_post_block(struct kvm_vcpu *vcpu) +static int vmx_pre_block(struct kvm_vcpu *vcpu) +{ + if (pi_pre_block(vcpu)) + return 1; + + if (kvm_lapic_hv_timer_in_use(vcpu)) + kvm_lapic_switch_to_sw_timer(vcpu); + + return 0; +} + +static void pi_post_block(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc old, new; @@ -10800,6 +11069,14 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) } } +static void vmx_post_block(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->set_hv_timer) + kvm_lapic_switch_to_hv_timer(vcpu); + + pi_post_block(vcpu); +} + /* * vmx_update_pi_irte - set IRTE for Posted-Interrupts * @@ -10844,7 +11121,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * We will support full lowest-priority interrupt later. */ - kvm_set_msi_irq(e, &irq); + kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { /* * Make sure the IRTE is in remapped mode if @@ -10889,6 +11166,16 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, return ret; } +static void vmx_setup_mce(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_LMCE; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_LMCE; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -11013,6 +11300,13 @@ static struct kvm_x86_ops vmx_x86_ops = { .pmu_ops = &intel_pmu_ops, .update_pi_irte = vmx_update_pi_irte, + +#ifdef CONFIG_X86_64 + .set_hv_timer = vmx_set_hv_timer, + .cancel_hv_timer = vmx_cancel_hv_timer, +#endif + + .setup_mce = vmx_setup_mce, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9c496c7e8c00..19f9f9e05c2a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -71,7 +71,8 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) +u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; +EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) @@ -90,8 +91,12 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) + static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); +static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); struct kvm_x86_ops *kvm_x86_ops __read_mostly; @@ -114,7 +119,8 @@ u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); u64 __read_mostly kvm_max_tsc_scaling_ratio; EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -static u64 __read_mostly kvm_default_tsc_scaling_ratio; +u64 __read_mostly kvm_default_tsc_scaling_ratio; +EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; @@ -538,7 +544,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if (is_present_gpte(pdpte[i]) && + if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) { ret = 0; @@ -983,6 +989,7 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, + MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, }; @@ -1162,7 +1169,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) int version; int r; struct pvclock_wall_clock wc; - struct timespec boot; + struct timespec64 boot; if (!wall_clock) return; @@ -1185,13 +1192,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ - getboottime(&boot); + getboottime64(&boot); if (kvm->arch.kvmclock_offset) { - struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset); - boot = timespec_sub(boot, ts); + struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); + boot = timespec64_sub(boot, ts); } - wc.sec = boot.tv_sec; + wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ wc.nsec = boot.tv_nsec; wc.version = version; @@ -2616,6 +2623,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; + case KVM_CAP_X2APIC_API: + r = KVM_X2APIC_API_VALID_FLAGS; + break; default: r = 0; break; @@ -2678,11 +2688,9 @@ long kvm_arch_dev_ioctl(struct file *filp, break; } case KVM_X86_GET_MCE_CAP_SUPPORTED: { - u64 mce_cap; - - mce_cap = KVM_MCE_CAP_SUPPORTED; r = -EFAULT; - if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) + if (copy_to_user(argp, &kvm_mce_cap_supported, + sizeof(kvm_mce_cap_supported))) goto out; r = 0; break; @@ -2734,6 +2742,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdtsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); + + if (kvm_lapic_hv_timer_in_use(vcpu) && + kvm_x86_ops->set_hv_timer(vcpu, + kvm_get_lapic_tscdeadline_msr(vcpu))) + kvm_lapic_switch_to_sw_timer(vcpu); if (check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); @@ -2767,15 +2780,17 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, if (vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); - memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - - return 0; + return kvm_apic_get_state(vcpu, s); } static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - kvm_apic_post_state_restore(vcpu, s); + int r; + + r = kvm_apic_set_state(vcpu, s); + if (r) + return r; update_cr8_intercept(vcpu); return 0; @@ -2860,7 +2875,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; @@ -2870,6 +2885,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, /* Init IA32_MCi_CTL to all 1s */ for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; + + if (kvm_x86_ops->setup_mce) + kvm_x86_ops->setup_mce(vcpu); out: return r; } @@ -3768,7 +3786,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = -EEXIST; if (irqchip_in_kernel(kvm)) goto split_irqchip_unlock; - if (atomic_read(&kvm->online_vcpus)) + if (kvm->created_vcpus) goto split_irqchip_unlock; r = kvm_setup_empty_irq_routing(kvm); if (r) @@ -3782,6 +3800,18 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, mutex_unlock(&kvm->lock); break; } + case KVM_CAP_X2APIC_API: + r = -EINVAL; + if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) + break; + + if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) + kvm->arch.x2apic_format = true; + if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) + kvm->arch.x2apic_broadcast_quirk_disabled = true; + + r = 0; + break; default: r = -EINVAL; break; @@ -3833,7 +3863,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpic) goto create_irqchip_unlock; r = -EINVAL; - if (atomic_read(&kvm->online_vcpus)) + if (kvm->created_vcpus) goto create_irqchip_unlock; r = -ENOMEM; vpic = kvm_create_pic(kvm); @@ -3873,7 +3903,7 @@ long kvm_arch_vm_ioctl(struct file *filp, sizeof(struct kvm_pit_config))) goto out; create_pit: - mutex_lock(&kvm->slots_lock); + mutex_lock(&kvm->lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; @@ -3882,7 +3912,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpit) r = 0; create_pit_unlock: - mutex_unlock(&kvm->slots_lock); + mutex_unlock(&kvm->lock); break; case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ @@ -3989,7 +4019,7 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_SET_BOOT_CPU_ID: r = 0; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus) != 0) + if (kvm->created_vcpus) r = -EBUSY; else kvm->arch.bsp_vcpu_id = arg; @@ -5297,13 +5327,8 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu) /* This is a good place to trace that we are exiting SMM. */ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); - if (unlikely(vcpu->arch.smi_pending)) { - kvm_make_request(KVM_REQ_SMI, vcpu); - vcpu->arch.smi_pending = 0; - } else { - /* Process a latched INIT, if any. */ - kvm_make_request(KVM_REQ_EVENT, vcpu); - } + /* Process a latched INIT or SMI, if any. */ + kvm_make_request(KVM_REQ_EVENT, vcpu); } kvm_mmu_reset_context(vcpu); @@ -5849,8 +5874,8 @@ int kvm_arch_init(void *opaque) kvm_x86_ops = ops; kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0); - + PT_DIRTY_MASK, PT64_NX_MASK, 0, + PT_PRESENT_MASK); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); @@ -6084,7 +6109,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) } /* try to inject new event if pending */ - if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { + if (vcpu->arch.smi_pending && !is_smm(vcpu)) { + vcpu->arch.smi_pending = false; + enter_smm(vcpu); + } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; kvm_x86_ops->set_nmi(vcpu); @@ -6107,6 +6135,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) kvm_x86_ops->set_irq(vcpu); } } + return 0; } @@ -6130,7 +6159,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val -static u32 process_smi_get_segment_flags(struct kvm_segment *seg) +static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) { u32 flags = 0; flags |= seg->g << 23; @@ -6144,7 +6173,7 @@ static u32 process_smi_get_segment_flags(struct kvm_segment *seg) return flags; } -static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) +static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) { struct kvm_segment seg; int offset; @@ -6159,11 +6188,11 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) put_smstate(u32, buf, offset + 8, seg.base); put_smstate(u32, buf, offset + 4, seg.limit); - put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg)); + put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg)); } #ifdef CONFIG_X86_64 -static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) +static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) { struct kvm_segment seg; int offset; @@ -6172,7 +6201,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) kvm_get_segment(vcpu, &seg, n); offset = 0x7e00 + n * 16; - flags = process_smi_get_segment_flags(&seg) >> 8; + flags = enter_smm_get_segment_flags(&seg) >> 8; put_smstate(u16, buf, offset, seg.selector); put_smstate(u16, buf, offset + 2, flags); put_smstate(u32, buf, offset + 4, seg.limit); @@ -6180,7 +6209,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) } #endif -static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) +static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) { struct desc_ptr dt; struct kvm_segment seg; @@ -6204,13 +6233,13 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7fc4, seg.selector); put_smstate(u32, buf, 0x7f64, seg.base); put_smstate(u32, buf, 0x7f60, seg.limit); - put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg)); + put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg)); kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); put_smstate(u32, buf, 0x7fc0, seg.selector); put_smstate(u32, buf, 0x7f80, seg.base); put_smstate(u32, buf, 0x7f7c, seg.limit); - put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg)); + put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); kvm_x86_ops->get_gdt(vcpu, &dt); put_smstate(u32, buf, 0x7f74, dt.address); @@ -6221,7 +6250,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7f54, dt.size); for (i = 0; i < 6; i++) - process_smi_save_seg_32(vcpu, buf, i); + enter_smm_save_seg_32(vcpu, buf, i); put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); @@ -6230,7 +6259,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); } -static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) +static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) { #ifdef CONFIG_X86_64 struct desc_ptr dt; @@ -6262,7 +6291,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); put_smstate(u16, buf, 0x7e90, seg.selector); - put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8); put_smstate(u32, buf, 0x7e94, seg.limit); put_smstate(u64, buf, 0x7e98, seg.base); @@ -6272,7 +6301,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); put_smstate(u16, buf, 0x7e70, seg.selector); - put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8); put_smstate(u32, buf, 0x7e74, seg.limit); put_smstate(u64, buf, 0x7e78, seg.base); @@ -6281,31 +6310,26 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u64, buf, 0x7e68, dt.address); for (i = 0; i < 6; i++) - process_smi_save_seg_64(vcpu, buf, i); + enter_smm_save_seg_64(vcpu, buf, i); #else WARN_ON_ONCE(1); #endif } -static void process_smi(struct kvm_vcpu *vcpu) +static void enter_smm(struct kvm_vcpu *vcpu) { struct kvm_segment cs, ds; struct desc_ptr dt; char buf[512]; u32 cr0; - if (is_smm(vcpu)) { - vcpu->arch.smi_pending = true; - return; - } - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); vcpu->arch.hflags |= HF_SMM_MASK; memset(buf, 0, 512); if (guest_cpuid_has_longmode(vcpu)) - process_smi_save_state_64(vcpu, buf); + enter_smm_save_state_64(vcpu, buf); else - process_smi_save_state_32(vcpu, buf); + enter_smm_save_state_32(vcpu, buf); kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); @@ -6361,6 +6385,12 @@ static void process_smi(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } +static void process_smi(struct kvm_vcpu *vcpu) +{ + vcpu->arch.smi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + void kvm_make_scan_ioapic_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); @@ -6555,8 +6585,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (inject_pending_event(vcpu, req_int_win) != 0) req_immediate_exit = true; - /* enable NMI/IRQ window open exits if needed */ else { + /* Enable NMI/IRQ window open exits if needed. + * + * SMIs have two cases: 1) they can be nested, and + * then there is nothing to do here because RSM will + * cause a vmexit anyway; 2) or the SMI can be pending + * because inject_pending_event has completed the + * injection of an IRQ or NMI from the previous vmexit, + * and then we request an immediate exit to inject the SMI. + */ + if (vcpu->arch.smi_pending && !is_smm(vcpu)) + req_immediate_exit = true; if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) @@ -6607,12 +6647,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_guest_xcr0(vcpu); - if (req_immediate_exit) + if (req_immediate_exit) { + kvm_make_request(KVM_REQ_EVENT, vcpu); smp_send_reschedule(vcpu->cpu); + } trace_kvm_entry(vcpu->vcpu_id); wait_lapic_expire(vcpu); - __kvm_guest_enter(); + guest_enter_irqoff(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -6663,16 +6705,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; - /* - * We must have an instruction between local_irq_enable() and - * kvm_guest_exit(), so the timer interrupt isn't delayed by - * the interrupt shadow. The stat.exits increment will do nicely. - * But we need to prevent reordering, hence this barrier(): - */ - barrier(); - - kvm_guest_exit(); + guest_exit_irqoff(); + local_irq_enable(); preempt_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); @@ -7409,6 +7444,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { vcpu->arch.hflags = 0; + vcpu->arch.smi_pending = 0; atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; @@ -7601,11 +7637,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) -{ - return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu); -} - struct static_key kvm_no_apic_vcpu __read_mostly; EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); @@ -7872,7 +7903,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); - kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kvm_mmu_uninit_vm(kvm); } @@ -8380,7 +8411,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, /* * When producer of consumer is unregistered, we change back to * remapped mode, so we can re-use the current implementation - * when the irq is masked/disabed or the consumer side (KVM + * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 8196054fedb0..7b6a9d14c8c0 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -133,7 +133,7 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev) if (pci_probe & PCI_NOASSIGN_BARS) { /* * If the BIOS did not assign the BAR, zero out the - * resource so the kernel doesn't attmept to assign + * resource so the kernel doesn't attempt to assign * it later on in pci_assign_unassigned_resources */ for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) { diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c index 613cac7395c4..e88b4176260f 100644 --- a/arch/x86/pci/vmd.c +++ b/arch/x86/pci/vmd.c @@ -119,10 +119,11 @@ static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) static void vmd_irq_enable(struct irq_data *data) { struct vmd_irq *vmdirq = data->chip_data; + unsigned long flags; - raw_spin_lock(&list_lock); + raw_spin_lock_irqsave(&list_lock, flags); list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list); - raw_spin_unlock(&list_lock); + raw_spin_unlock_irqrestore(&list_lock, flags); data->chip->irq_unmask(data); } @@ -130,12 +131,14 @@ static void vmd_irq_enable(struct irq_data *data) static void vmd_irq_disable(struct irq_data *data) { struct vmd_irq *vmdirq = data->chip_data; + unsigned long flags; data->chip->irq_mask(data); - raw_spin_lock(&list_lock); + raw_spin_lock_irqsave(&list_lock, flags); list_del_rcu(&vmdirq->node); - raw_spin_unlock(&list_lock); + INIT_LIST_HEAD_RCU(&vmdirq->node); + raw_spin_unlock_irqrestore(&list_lock, flags); } /* @@ -166,16 +169,20 @@ static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info, * XXX: We can be even smarter selecting the best IRQ once we solve the * affinity problem. */ -static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd) +static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc) { - int i, best = 0; + int i, best = 1; + unsigned long flags; - raw_spin_lock(&list_lock); + if (!desc->msi_attrib.is_msix || vmd->msix_count == 1) + return &vmd->irqs[0]; + + raw_spin_lock_irqsave(&list_lock, flags); for (i = 1; i < vmd->msix_count; i++) if (vmd->irqs[i].count < vmd->irqs[best].count) best = i; vmd->irqs[best].count++; - raw_spin_unlock(&list_lock); + raw_spin_unlock_irqrestore(&list_lock, flags); return &vmd->irqs[best]; } @@ -184,14 +191,15 @@ static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(arg->desc)->bus); + struct msi_desc *desc = arg->desc; + struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus); struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL); if (!vmdirq) return -ENOMEM; INIT_LIST_HEAD(&vmdirq->node); - vmdirq->irq = vmd_next_irq(vmd); + vmdirq->irq = vmd_next_irq(vmd, desc); vmdirq->virq = virq; irq_domain_set_info(domain, virq, vmdirq->irq->vmd_vector, info->chip, @@ -203,11 +211,12 @@ static void vmd_msi_free(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq) { struct vmd_irq *vmdirq = irq_get_chip_data(virq); + unsigned long flags; /* XXX: Potential optimization to rebalance */ - raw_spin_lock(&list_lock); + raw_spin_lock_irqsave(&list_lock, flags); vmdirq->irq->count--; - raw_spin_unlock(&list_lock); + raw_spin_unlock_irqrestore(&list_lock, flags); kfree_rcu(vmdirq, rcu); } @@ -261,7 +270,7 @@ static struct device *to_vmd_dev(struct device *dev) static struct dma_map_ops *vmd_dma_ops(struct device *dev) { - return to_vmd_dev(dev)->archdata.dma_ops; + return get_dma_ops(to_vmd_dev(dev)); } static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr, @@ -367,7 +376,7 @@ static void vmd_teardown_dma_ops(struct vmd_dev *vmd) { struct dma_domain *domain = &vmd->dma_domain; - if (vmd->dev->dev.archdata.dma_ops) + if (get_dma_ops(&vmd->dev->dev)) del_dma_domain(domain); } @@ -379,7 +388,7 @@ static void vmd_teardown_dma_ops(struct vmd_dev *vmd) static void vmd_setup_dma_ops(struct vmd_dev *vmd) { - const struct dma_map_ops *source = vmd->dev->dev.archdata.dma_ops; + const struct dma_map_ops *source = get_dma_ops(&vmd->dev->dev); struct dma_map_ops *dest = &vmd->dma_ops; struct dma_domain *domain = &vmd->dma_domain; @@ -594,7 +603,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) sd->node = pcibus_to_node(vmd->dev->bus); vmd->irq_domain = pci_msi_create_irq_domain(NULL, &vmd_msi_domain_info, - NULL); + x86_vector_domain); if (!vmd->irq_domain) return -ENODEV; diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 12734a96df47..ac58c1616408 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -8,6 +8,8 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib targets += purgatory.ro +KCOV_INSTRUMENT := n + # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That # in turn leaves some undefined symbols like __fentry__ in purgatory and not # sure how to relocate those. Like kexec-tools, use custom flags. diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index c556c5ae8de5..25012abc3409 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -48,7 +48,7 @@ targets += realmode.lds $(obj)/realmode.lds: $(obj)/pasyms.h LDFLAGS_realmode.elf := --emit-relocs -T -CPPFLAGS_realmode.lds += -P -C -I$(obj) +CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj) targets += realmode.elf $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index aebd944bdaa1..445ce28475b3 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -221,6 +221,9 @@ config ACPI_PROCESSOR_IDLE bool select CPU_IDLE +config ACPI_MCFG + bool + config ACPI_CPPC_LIB bool depends on ACPI_PROCESSOR diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 35a6ccbe3025..5ae9d85c5159 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -40,6 +40,7 @@ acpi-$(CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC) += processor_pdc.o acpi-y += ec.o acpi-$(CONFIG_ACPI_DOCK) += dock.o acpi-y += pci_root.o pci_link.o pci_irq.o +obj-$(CONFIG_ACPI_MCFG) += pci_mcfg.o acpi-y += acpi_lpss.o acpi_apd.o acpi-y += acpi_platform.o acpi-y += acpi_pnp.o diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c new file mode 100644 index 000000000000..b5b376e081f5 --- /dev/null +++ b/drivers/acpi/pci_mcfg.c @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2016 Broadcom + * Author: Jayachandran C + * Copyright (C) 2016 Semihalf + * Author: Tomasz Nowicki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation (the "GPL"). + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 (GPLv2) for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 (GPLv2) along with this source code. + */ + +#define pr_fmt(fmt) "ACPI: " fmt + +#include +#include +#include + +/* Structure to hold entries from the MCFG table */ +struct mcfg_entry { + struct list_head list; + phys_addr_t addr; + u16 segment; + u8 bus_start; + u8 bus_end; +}; + +/* List to save MCFG entries */ +static LIST_HEAD(pci_mcfg_list); + +phys_addr_t pci_mcfg_lookup(u16 seg, struct resource *bus_res) +{ + struct mcfg_entry *e; + + /* + * We expect exact match, unless MCFG entry end bus covers more than + * specified by caller. + */ + list_for_each_entry(e, &pci_mcfg_list, list) { + if (e->segment == seg && e->bus_start == bus_res->start && + e->bus_end >= bus_res->end) + return e->addr; + } + + return 0; +} + +static __init int pci_mcfg_parse(struct acpi_table_header *header) +{ + struct acpi_table_mcfg *mcfg; + struct acpi_mcfg_allocation *mptr; + struct mcfg_entry *e, *arr; + int i, n; + + if (header->length < sizeof(struct acpi_table_mcfg)) + return -EINVAL; + + n = (header->length - sizeof(struct acpi_table_mcfg)) / + sizeof(struct acpi_mcfg_allocation); + mcfg = (struct acpi_table_mcfg *)header; + mptr = (struct acpi_mcfg_allocation *) &mcfg[1]; + + arr = kcalloc(n, sizeof(*arr), GFP_KERNEL); + if (!arr) + return -ENOMEM; + + for (i = 0, e = arr; i < n; i++, mptr++, e++) { + e->segment = mptr->pci_segment; + e->addr = mptr->address; + e->bus_start = mptr->start_bus_number; + e->bus_end = mptr->end_bus_number; + list_add(&e->list, &pci_mcfg_list); + } + + pr_info("MCFG table detected, %d entries\n", n); + return 0; +} + +/* Interface called by ACPI - parse and save MCFG table */ +void __init pci_mmcfg_late_init(void) +{ + int err = acpi_table_parse(ACPI_SIG_MCFG, pci_mcfg_parse); + if (err) + pr_err("Failed to parse MCFG (%d)\n", err); +} diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index ae3fe4e64203..d144168d4ef9 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -720,6 +720,36 @@ static void acpi_pci_root_validate_resources(struct device *dev, } } +static void acpi_pci_root_remap_iospace(struct resource_entry *entry) +{ +#ifdef PCI_IOBASE + struct resource *res = entry->res; + resource_size_t cpu_addr = res->start; + resource_size_t pci_addr = cpu_addr - entry->offset; + resource_size_t length = resource_size(res); + unsigned long port; + + if (pci_register_io_range(cpu_addr, length)) + goto err; + + port = pci_address_to_pio(cpu_addr); + if (port == (unsigned long)-1) + goto err; + + res->start = port; + res->end = port + length - 1; + entry->offset = port - pci_addr; + + if (pci_remap_iospace(res, cpu_addr) < 0) + goto err; + + pr_info("Remapped I/O %pa to %pR\n", &cpu_addr, res); + return; +err: + res->flags |= IORESOURCE_DISABLED; +#endif +} + int acpi_pci_probe_root_resources(struct acpi_pci_root_info *info) { int ret; @@ -740,6 +770,9 @@ int acpi_pci_probe_root_resources(struct acpi_pci_root_info *info) "no IO and memory resources present in _CRS\n"); else { resource_list_for_each_entry_safe(entry, tmp, list) { + if (entry->res->flags & IORESOURCE_IO) + acpi_pci_root_remap_iospace(entry); + if (entry->res->flags & IORESOURCE_DISABLED) resource_list_destroy_entry(entry); else @@ -811,6 +844,8 @@ static void acpi_pci_root_release_info(struct pci_host_bridge *bridge) resource_list_for_each_entry(entry, &bridge->windows) { res = entry->res; + if (res->flags & IORESOURCE_IO) + pci_unmap_iospace(res); if (res->parent && (res->flags & (IORESOURCE_MEM | IORESOURCE_IO))) release_resource(res); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 450662055d97..1a04af6d2421 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1937,7 +1937,7 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", obj_request->object_name)) goto fail; @@ -1991,7 +1991,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", obj_request->object_name)) goto fail; @@ -3995,10 +3995,11 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, /* Initialize the layout used for all rbd requests */ - rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); - rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); + rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER; + rbd_dev->layout.stripe_count = 1; + rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER; + rbd_dev->layout.pool_id = spec->pool_id; + RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); /* * If this is a mapping rbd_dev (as opposed to a parent one), @@ -5187,7 +5188,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); - rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id; if (rbd_dev->image_format == 1) ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", spec->image_name, RBD_SUFFIX); diff --git a/drivers/gpu/drm/drm_dp_helper.c b/drivers/gpu/drm/drm_dp_helper.c index 8f11b8741e42..eae5ef963cb7 100644 --- a/drivers/gpu/drm/drm_dp_helper.c +++ b/drivers/gpu/drm/drm_dp_helper.c @@ -860,3 +860,35 @@ void drm_dp_aux_unregister(struct drm_dp_aux *aux) i2c_del_adapter(&aux->ddc); } EXPORT_SYMBOL(drm_dp_aux_unregister); + +#define PSR_SETUP_TIME(x) [DP_PSR_SETUP_TIME_ ## x >> DP_PSR_SETUP_TIME_SHIFT] = (x) + +/** + * drm_dp_psr_setup_time() - PSR setup in time usec + * @psr_cap: PSR capabilities from DPCD + * + * Returns: + * PSR setup time for the panel in microseconds, negative + * error code on failure. + */ +int drm_dp_psr_setup_time(const u8 psr_cap[EDP_PSR_RECEIVER_CAP_SIZE]) +{ + static const u16 psr_setup_time_us[] = { + PSR_SETUP_TIME(330), + PSR_SETUP_TIME(275), + PSR_SETUP_TIME(165), + PSR_SETUP_TIME(110), + PSR_SETUP_TIME(55), + PSR_SETUP_TIME(0), + }; + int i; + + i = (psr_cap[1] & DP_PSR_SETUP_TIME_MASK) >> DP_PSR_SETUP_TIME_SHIFT; + if (i >= ARRAY_SIZE(psr_setup_time_us)) + return -EINVAL; + + return psr_setup_time_us[i]; +} +EXPORT_SYMBOL(drm_dp_psr_setup_time); + +#undef PSR_SETUP_TIME diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 3329fc6a95f4..cc937a19b1ba 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -1730,6 +1730,8 @@ bool intel_sdvo_init(struct drm_device *dev, /* intel_sprite.c */ +int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode, + int usecs); int intel_plane_init(struct drm_device *dev, enum pipe pipe, int plane); int intel_sprite_set_colorkey(struct drm_device *dev, void *data, struct drm_file *file_priv); diff --git a/drivers/gpu/drm/i915/intel_psr.c b/drivers/gpu/drm/i915/intel_psr.c index 68bd0bb34817..2b0d1baf15b3 100644 --- a/drivers/gpu/drm/i915/intel_psr.c +++ b/drivers/gpu/drm/i915/intel_psr.c @@ -327,6 +327,9 @@ static bool intel_psr_match_conditions(struct intel_dp *intel_dp) struct drm_i915_private *dev_priv = to_i915(dev); struct drm_crtc *crtc = dig_port->base.base.crtc; struct intel_crtc *intel_crtc = to_intel_crtc(crtc); + const struct drm_display_mode *adjusted_mode = + &intel_crtc->config->base.adjusted_mode; + int psr_setup_time; lockdep_assert_held(&dev_priv->psr.lock); WARN_ON(!drm_modeset_is_locked(&dev->mode_config.connection_mutex)); @@ -365,11 +368,25 @@ static bool intel_psr_match_conditions(struct intel_dp *intel_dp) } if (IS_HASWELL(dev) && - intel_crtc->config->base.adjusted_mode.flags & DRM_MODE_FLAG_INTERLACE) { + adjusted_mode->flags & DRM_MODE_FLAG_INTERLACE) { DRM_DEBUG_KMS("PSR condition failed: Interlaced is Enabled\n"); return false; } + psr_setup_time = drm_dp_psr_setup_time(intel_dp->psr_dpcd); + if (psr_setup_time < 0) { + DRM_DEBUG_KMS("PSR condition failed: Invalid PSR setup time (0x%02x)\n", + intel_dp->psr_dpcd[1]); + return false; + } + + if (intel_usecs_to_scanlines(adjusted_mode, psr_setup_time) > + adjusted_mode->crtc_vtotal - adjusted_mode->crtc_vdisplay - 1) { + DRM_DEBUG_KMS("PSR condition failed: PSR setup time (%d us) too long\n", + psr_setup_time); + return false; + } + dev_priv->psr.source_ok = true; return true; } diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c index 0de935ad01c2..7c08e4f29032 100644 --- a/drivers/gpu/drm/i915/intel_sprite.c +++ b/drivers/gpu/drm/i915/intel_sprite.c @@ -53,8 +53,8 @@ format_is_yuv(uint32_t format) } } -static int usecs_to_scanlines(const struct drm_display_mode *adjusted_mode, - int usecs) +int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode, + int usecs) { /* paranoia */ if (!adjusted_mode->crtc_htotal) @@ -91,7 +91,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc) vblank_start = DIV_ROUND_UP(vblank_start, 2); /* FIXME needs to be calibrated sensibly */ - min = vblank_start - usecs_to_scanlines(adjusted_mode, 100); + min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100); max = vblank_start - 1; local_irq_disable(); diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 5495a5ba8039..7f8728984f44 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -21,9 +21,9 @@ config ARM_GIC_MAX_NR config ARM_GIC_V2M bool - depends on ARM_GIC - depends on PCI && PCI_MSI - select PCI_MSI_IRQ_DOMAIN + depends on PCI + select ARM_GIC + select PCI_MSI config GIC_NON_BANKED bool @@ -37,7 +37,8 @@ config ARM_GIC_V3 config ARM_GIC_V3_ITS bool - select PCI_MSI_IRQ_DOMAIN + depends on PCI + depends on PCI_MSI config ARM_NVIC bool @@ -62,13 +63,13 @@ config ARM_VIC_NR config ARMADA_370_XP_IRQ bool select GENERIC_IRQ_CHIP - select PCI_MSI_IRQ_DOMAIN if PCI_MSI + select PCI_MSI if PCI config ALPINE_MSI bool - depends on PCI && PCI_MSI + depends on PCI + select PCI_MSI select GENERIC_IRQ_CHIP - select PCI_MSI_IRQ_DOMAIN config ATMEL_AIC_IRQ bool @@ -117,7 +118,6 @@ config HISILICON_IRQ_MBIGEN bool select ARM_GIC_V3 select ARM_GIC_V3_ITS - select GENERIC_MSI_IRQ_DOMAIN config IMGPDC_IRQ bool @@ -250,12 +250,10 @@ config IRQ_MXS config MVEBU_ODMI bool - select GENERIC_MSI_IRQ_DOMAIN config LS_SCFG_MSI def_bool y if SOC_LS1021A || ARCH_LAYERSCAPE depends on PCI && PCI_MSI - select PCI_MSI_IRQ_DOMAIN config PARTITION_PERCPU bool diff --git a/drivers/memory/Kconfig b/drivers/memory/Kconfig index 133712346911..4b4c0c3c3d2f 100644 --- a/drivers/memory/Kconfig +++ b/drivers/memory/Kconfig @@ -115,7 +115,7 @@ config FSL_CORENET_CF config FSL_IFC bool - depends on FSL_SOC + depends on FSL_SOC || ARCH_LAYERSCAPE config JZ4780_NEMC bool "Ingenic JZ4780 SoC NEMC driver" diff --git a/drivers/memory/fsl_ifc.c b/drivers/memory/fsl_ifc.c index 904b4af5f142..1b182b117f9c 100644 --- a/drivers/memory/fsl_ifc.c +++ b/drivers/memory/fsl_ifc.c @@ -31,7 +31,9 @@ #include #include #include -#include +#include +#include +#include struct fsl_ifc_ctrl *fsl_ifc_ctrl_dev; EXPORT_SYMBOL(fsl_ifc_ctrl_dev); diff --git a/drivers/misc/genwqe/card_base.c b/drivers/misc/genwqe/card_base.c index 4cf8f82cfca2..a70b853fa2c9 100644 --- a/drivers/misc/genwqe/card_base.c +++ b/drivers/misc/genwqe/card_base.c @@ -182,7 +182,7 @@ static void genwqe_dev_free(struct genwqe_dev *cd) */ static int genwqe_bus_reset(struct genwqe_dev *cd) { - int bars, rc = 0; + int rc = 0; struct pci_dev *pci_dev = cd->pci_dev; void __iomem *mmio; @@ -193,8 +193,7 @@ static int genwqe_bus_reset(struct genwqe_dev *cd) cd->mmio = NULL; pci_iounmap(pci_dev, mmio); - bars = pci_select_bars(pci_dev, IORESOURCE_MEM); - pci_release_selected_regions(pci_dev, bars); + pci_release_mem_regions(pci_dev); /* * Firmware/BIOS might change memory mapping during bus reset. @@ -218,7 +217,7 @@ static int genwqe_bus_reset(struct genwqe_dev *cd) GENWQE_INJECT_GFIR_FATAL | GENWQE_INJECT_GFIR_INFO); - rc = pci_request_selected_regions(pci_dev, bars, genwqe_driver_name); + rc = pci_request_mem_regions(pci_dev, genwqe_driver_name); if (rc) { dev_err(&pci_dev->dev, "[%s] err: request bars failed (%d)\n", __func__, rc); @@ -1068,10 +1067,9 @@ static int genwqe_health_check_stop(struct genwqe_dev *cd) */ static int genwqe_pci_setup(struct genwqe_dev *cd) { - int err, bars; + int err; struct pci_dev *pci_dev = cd->pci_dev; - bars = pci_select_bars(pci_dev, IORESOURCE_MEM); err = pci_enable_device_mem(pci_dev); if (err) { dev_err(&pci_dev->dev, @@ -1080,7 +1078,7 @@ static int genwqe_pci_setup(struct genwqe_dev *cd) } /* Reserve PCI I/O and memory resources */ - err = pci_request_selected_regions(pci_dev, bars, genwqe_driver_name); + err = pci_request_mem_regions(pci_dev, genwqe_driver_name); if (err) { dev_err(&pci_dev->dev, "[%s] err: request bars failed (%d)\n", __func__, err); @@ -1142,7 +1140,7 @@ static int genwqe_pci_setup(struct genwqe_dev *cd) out_iounmap: pci_iounmap(pci_dev, cd->mmio); out_release_resources: - pci_release_selected_regions(pci_dev, bars); + pci_release_mem_regions(pci_dev); err_disable_device: pci_disable_device(pci_dev); err_out: @@ -1154,14 +1152,12 @@ static int genwqe_pci_setup(struct genwqe_dev *cd) */ static void genwqe_pci_remove(struct genwqe_dev *cd) { - int bars; struct pci_dev *pci_dev = cd->pci_dev; if (cd->mmio) pci_iounmap(pci_dev, cd->mmio); - bars = pci_select_bars(pci_dev, IORESOURCE_MEM); - pci_release_selected_regions(pci_dev, bars); + pci_release_mem_regions(pci_dev); pci_disable_device(pci_dev); } diff --git a/drivers/mtd/chips/cfi_cmdset_0020.c b/drivers/mtd/chips/cfi_cmdset_0020.c index 9a1a6ffd16b8..94d3eb42c4d5 100644 --- a/drivers/mtd/chips/cfi_cmdset_0020.c +++ b/drivers/mtd/chips/cfi_cmdset_0020.c @@ -416,7 +416,7 @@ static int cfi_staa_read (struct mtd_info *mtd, loff_t from, size_t len, size_t return ret; } -static inline int do_write_buffer(struct map_info *map, struct flchip *chip, +static int do_write_buffer(struct map_info *map, struct flchip *chip, unsigned long adr, const u_char *buf, int len) { struct cfi_private *cfi = map->fldrv_priv; diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig index 64a248556d29..58329d2dacd1 100644 --- a/drivers/mtd/devices/Kconfig +++ b/drivers/mtd/devices/Kconfig @@ -113,12 +113,12 @@ config MTD_SST25L if you want to specify device partitioning. config MTD_BCM47XXSFLASH - tristate "R/O support for serial flash on BCMA bus" + tristate "Support for serial flash on BCMA bus" depends on BCMA_SFLASH && (MIPS || ARM) help BCMA bus can have various flash memories attached, they are registered by bcma as platform devices. This enables driver for - serial flash memories (only read-only mode is implemented). + serial flash memories. config MTD_SLRAM tristate "Uncached system RAM" @@ -171,18 +171,6 @@ config MTDRAM_ERASE_SIZE as a module, it is also possible to specify this as a parameter when loading the module. -#If not a module (I don't want to test it as a module) -config MTDRAM_ABS_POS - hex "SRAM Hexadecimal Absolute position or 0" - depends on MTD_MTDRAM=y - default "0" - help - If you have system RAM accessible by the CPU but not used by Linux - in normal operation, you can give the physical address at which the - available RAM starts, and the MTDRAM driver will use it instead of - allocating space from Linux's available memory. Otherwise, leave - this set to zero. Most people will want to leave this as zero. - config MTD_BLOCK2MTD tristate "MTD using block device" depends on BLOCK diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 9d6854467651..9cf7fcd28034 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -73,14 +73,15 @@ static int m25p80_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len) return spi_write(spi, flash->command, len + 1); } -static void m25p80_write(struct spi_nor *nor, loff_t to, size_t len, - size_t *retlen, const u_char *buf) +static ssize_t m25p80_write(struct spi_nor *nor, loff_t to, size_t len, + const u_char *buf) { struct m25p *flash = nor->priv; struct spi_device *spi = flash->spi; struct spi_transfer t[2] = {}; struct spi_message m; int cmd_sz = m25p_cmdsz(nor); + ssize_t ret; spi_message_init(&m); @@ -98,9 +99,14 @@ static void m25p80_write(struct spi_nor *nor, loff_t to, size_t len, t[1].len = len; spi_message_add_tail(&t[1], &m); - spi_sync(spi, &m); + ret = spi_sync(spi, &m); + if (ret) + return ret; - *retlen += m.actual_length - cmd_sz; + ret = m.actual_length - cmd_sz; + if (ret < 0) + return -EIO; + return ret; } static inline unsigned int m25p80_rx_nbits(struct spi_nor *nor) @@ -119,21 +125,21 @@ static inline unsigned int m25p80_rx_nbits(struct spi_nor *nor) * Read an address range from the nor chip. The address range * may be any size provided it is within the physical boundaries. */ -static int m25p80_read(struct spi_nor *nor, loff_t from, size_t len, - size_t *retlen, u_char *buf) +static ssize_t m25p80_read(struct spi_nor *nor, loff_t from, size_t len, + u_char *buf) { struct m25p *flash = nor->priv; struct spi_device *spi = flash->spi; struct spi_transfer t[2]; struct spi_message m; unsigned int dummy = nor->read_dummy; + ssize_t ret; /* convert the dummy cycles to the number of bytes */ dummy /= 8; if (spi_flash_read_supported(spi)) { struct spi_flash_read_message msg; - int ret; memset(&msg, 0, sizeof(msg)); @@ -149,8 +155,9 @@ static int m25p80_read(struct spi_nor *nor, loff_t from, size_t len, msg.data_nbits = m25p80_rx_nbits(nor); ret = spi_flash_read(spi, &msg); - *retlen = msg.retlen; - return ret; + if (ret < 0) + return ret; + return msg.retlen; } spi_message_init(&m); @@ -165,13 +172,17 @@ static int m25p80_read(struct spi_nor *nor, loff_t from, size_t len, t[1].rx_buf = buf; t[1].rx_nbits = m25p80_rx_nbits(nor); - t[1].len = len; + t[1].len = min(len, spi_max_transfer_size(spi)); spi_message_add_tail(&t[1], &m); - spi_sync(spi, &m); + ret = spi_sync(spi, &m); + if (ret) + return ret; - *retlen = m.actual_length - m25p_cmdsz(nor) - dummy; - return 0; + ret = m.actual_length - m25p_cmdsz(nor) - dummy; + if (ret < 0) + return -EIO; + return ret; } /* diff --git a/drivers/mtd/maps/physmap_of.c b/drivers/mtd/maps/physmap_of.c index 22f3858c0364..3fad35942895 100644 --- a/drivers/mtd/maps/physmap_of.c +++ b/drivers/mtd/maps/physmap_of.c @@ -186,7 +186,7 @@ static int of_flash_probe(struct platform_device *dev) * consists internally of 2 non-identical NOR chips on one die. */ p = of_get_property(dp, "reg", &count); - if (count % reg_tuple_size != 0) { + if (!p || count % reg_tuple_size != 0) { dev_err(&dev->dev, "Malformed reg property on %s\n", dev->dev.of_node->full_name); err = -EINVAL; diff --git a/drivers/mtd/maps/pmcmsp-flash.c b/drivers/mtd/maps/pmcmsp-flash.c index 744ca5cacc9b..f9fa3fad728e 100644 --- a/drivers/mtd/maps/pmcmsp-flash.c +++ b/drivers/mtd/maps/pmcmsp-flash.c @@ -75,15 +75,15 @@ static int __init init_msp_flash(void) printk(KERN_NOTICE "Found %d PMC flash devices\n", fcnt); - msp_flash = kmalloc(fcnt * sizeof(struct map_info *), GFP_KERNEL); + msp_flash = kcalloc(fcnt, sizeof(*msp_flash), GFP_KERNEL); if (!msp_flash) return -ENOMEM; - msp_parts = kmalloc(fcnt * sizeof(struct mtd_partition *), GFP_KERNEL); + msp_parts = kcalloc(fcnt, sizeof(*msp_parts), GFP_KERNEL); if (!msp_parts) goto free_msp_flash; - msp_maps = kcalloc(fcnt, sizeof(struct mtd_info), GFP_KERNEL); + msp_maps = kcalloc(fcnt, sizeof(*msp_maps), GFP_KERNEL); if (!msp_maps) goto free_msp_parts; diff --git a/drivers/mtd/maps/sa1100-flash.c b/drivers/mtd/maps/sa1100-flash.c index 142fc3d79463..784c6e1a0391 100644 --- a/drivers/mtd/maps/sa1100-flash.c +++ b/drivers/mtd/maps/sa1100-flash.c @@ -230,8 +230,10 @@ static struct sa_info *sa1100_setup_mtd(struct platform_device *pdev, info->mtd = mtd_concat_create(cdev, info->num_subdev, plat->name); - if (info->mtd == NULL) + if (info->mtd == NULL) { ret = -ENXIO; + goto err; + } } info->mtd->dev.parent = &pdev->dev; diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index f05e0e9eb2f7..21ff58099f3b 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -438,7 +438,7 @@ config MTD_NAND_FSL_ELBC config MTD_NAND_FSL_IFC tristate "NAND support for Freescale IFC controller" - depends on MTD_NAND && FSL_SOC + depends on MTD_NAND && (FSL_SOC || ARCH_LAYERSCAPE) select FSL_IFC select MEMORY help @@ -539,7 +539,6 @@ config MTD_NAND_FSMC config MTD_NAND_XWAY tristate "Support for NAND on Lantiq XWAY SoC" depends on LANTIQ && SOC_TYPE_XWAY - select MTD_NAND_PLATFORM help Enables support for NAND Flash chips on Lantiq XWAY SoCs. NAND is attached to the External Bus Unit (EBU). @@ -563,4 +562,11 @@ config MTD_NAND_QCOM Enables support for NAND flash chips on SoCs containing the EBI2 NAND controller. This controller is found on IPQ806x SoC. +config MTD_NAND_MTK + tristate "Support for NAND controller on MTK SoCs" + depends on HAS_DMA + help + Enables support for NAND controller on MTK SoCs. + This controller is found on mt27xx, mt81xx, mt65xx SoCs. + endif # MTD_NAND diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index f55335373f7c..cafde6f3d957 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -57,5 +57,6 @@ obj-$(CONFIG_MTD_NAND_SUNXI) += sunxi_nand.o obj-$(CONFIG_MTD_NAND_HISI504) += hisi504_nand.o obj-$(CONFIG_MTD_NAND_BRCMNAND) += brcmnand/ obj-$(CONFIG_MTD_NAND_QCOM) += qcom_nandc.o +obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o diff --git a/drivers/mtd/nand/brcmnand/brcmnand.c b/drivers/mtd/nand/brcmnand/brcmnand.c index b76ad7c0144f..8eb2c64df38c 100644 --- a/drivers/mtd/nand/brcmnand/brcmnand.c +++ b/drivers/mtd/nand/brcmnand/brcmnand.c @@ -340,6 +340,36 @@ static const u16 brcmnand_regs_v71[] = { [BRCMNAND_FC_BASE] = 0x400, }; +/* BRCMNAND v7.2 */ +static const u16 brcmnand_regs_v72[] = { + [BRCMNAND_CMD_START] = 0x04, + [BRCMNAND_CMD_EXT_ADDRESS] = 0x08, + [BRCMNAND_CMD_ADDRESS] = 0x0c, + [BRCMNAND_INTFC_STATUS] = 0x14, + [BRCMNAND_CS_SELECT] = 0x18, + [BRCMNAND_CS_XOR] = 0x1c, + [BRCMNAND_LL_OP] = 0x20, + [BRCMNAND_CS0_BASE] = 0x50, + [BRCMNAND_CS1_BASE] = 0, + [BRCMNAND_CORR_THRESHOLD] = 0xdc, + [BRCMNAND_CORR_THRESHOLD_EXT] = 0xe0, + [BRCMNAND_UNCORR_COUNT] = 0xfc, + [BRCMNAND_CORR_COUNT] = 0x100, + [BRCMNAND_CORR_EXT_ADDR] = 0x10c, + [BRCMNAND_CORR_ADDR] = 0x110, + [BRCMNAND_UNCORR_EXT_ADDR] = 0x114, + [BRCMNAND_UNCORR_ADDR] = 0x118, + [BRCMNAND_SEMAPHORE] = 0x150, + [BRCMNAND_ID] = 0x194, + [BRCMNAND_ID_EXT] = 0x198, + [BRCMNAND_LL_RDATA] = 0x19c, + [BRCMNAND_OOB_READ_BASE] = 0x200, + [BRCMNAND_OOB_READ_10_BASE] = 0, + [BRCMNAND_OOB_WRITE_BASE] = 0x400, + [BRCMNAND_OOB_WRITE_10_BASE] = 0, + [BRCMNAND_FC_BASE] = 0x600, +}; + enum brcmnand_cs_reg { BRCMNAND_CS_CFG_EXT = 0, BRCMNAND_CS_CFG, @@ -435,7 +465,9 @@ static int brcmnand_revision_init(struct brcmnand_controller *ctrl) } /* Register offsets */ - if (ctrl->nand_version >= 0x0701) + if (ctrl->nand_version >= 0x0702) + ctrl->reg_offsets = brcmnand_regs_v72; + else if (ctrl->nand_version >= 0x0701) ctrl->reg_offsets = brcmnand_regs_v71; else if (ctrl->nand_version >= 0x0600) ctrl->reg_offsets = brcmnand_regs_v60; @@ -480,7 +512,9 @@ static int brcmnand_revision_init(struct brcmnand_controller *ctrl) } /* Maximum spare area sector size (per 512B) */ - if (ctrl->nand_version >= 0x0600) + if (ctrl->nand_version >= 0x0702) + ctrl->max_oob = 128; + else if (ctrl->nand_version >= 0x0600) ctrl->max_oob = 64; else if (ctrl->nand_version >= 0x0500) ctrl->max_oob = 32; @@ -583,14 +617,20 @@ static void brcmnand_wr_corr_thresh(struct brcmnand_host *host, u8 val) enum brcmnand_reg reg = BRCMNAND_CORR_THRESHOLD; int cs = host->cs; - if (ctrl->nand_version >= 0x0600) + if (ctrl->nand_version >= 0x0702) + bits = 7; + else if (ctrl->nand_version >= 0x0600) bits = 6; else if (ctrl->nand_version >= 0x0500) bits = 5; else bits = 4; - if (ctrl->nand_version >= 0x0600) { + if (ctrl->nand_version >= 0x0702) { + if (cs >= 4) + reg = BRCMNAND_CORR_THRESHOLD_EXT; + shift = (cs % 4) * bits; + } else if (ctrl->nand_version >= 0x0600) { if (cs >= 5) reg = BRCMNAND_CORR_THRESHOLD_EXT; shift = (cs % 5) * bits; @@ -631,19 +671,28 @@ enum { static inline u32 brcmnand_spare_area_mask(struct brcmnand_controller *ctrl) { - if (ctrl->nand_version >= 0x0600) + if (ctrl->nand_version >= 0x0702) + return GENMASK(7, 0); + else if (ctrl->nand_version >= 0x0600) return GENMASK(6, 0); else return GENMASK(5, 0); } #define NAND_ACC_CONTROL_ECC_SHIFT 16 +#define NAND_ACC_CONTROL_ECC_EXT_SHIFT 13 static inline u32 brcmnand_ecc_level_mask(struct brcmnand_controller *ctrl) { u32 mask = (ctrl->nand_version >= 0x0600) ? 0x1f : 0x0f; - return mask << NAND_ACC_CONTROL_ECC_SHIFT; + mask <<= NAND_ACC_CONTROL_ECC_SHIFT; + + /* v7.2 includes additional ECC levels */ + if (ctrl->nand_version >= 0x0702) + mask |= 0x7 << NAND_ACC_CONTROL_ECC_EXT_SHIFT; + + return mask; } static void brcmnand_set_ecc_enabled(struct brcmnand_host *host, int en) @@ -667,7 +716,9 @@ static void brcmnand_set_ecc_enabled(struct brcmnand_host *host, int en) static inline int brcmnand_sector_1k_shift(struct brcmnand_controller *ctrl) { - if (ctrl->nand_version >= 0x0600) + if (ctrl->nand_version >= 0x0702) + return 9; + else if (ctrl->nand_version >= 0x0600) return 7; else if (ctrl->nand_version >= 0x0500) return 6; @@ -773,10 +824,16 @@ enum brcmnand_llop_type { * Internal support functions ***********************************************************************/ -static inline bool is_hamming_ecc(struct brcmnand_cfg *cfg) +static inline bool is_hamming_ecc(struct brcmnand_controller *ctrl, + struct brcmnand_cfg *cfg) { - return cfg->sector_size_1k == 0 && cfg->spare_area_size == 16 && - cfg->ecc_level == 15; + if (ctrl->nand_version <= 0x0701) + return cfg->sector_size_1k == 0 && cfg->spare_area_size == 16 && + cfg->ecc_level == 15; + else + return cfg->sector_size_1k == 0 && ((cfg->spare_area_size == 16 && + cfg->ecc_level == 15) || + (cfg->spare_area_size == 28 && cfg->ecc_level == 16)); } /* @@ -931,7 +988,7 @@ static int brcmstb_choose_ecc_layout(struct brcmnand_host *host) if (p->sector_size_1k) ecc_level <<= 1; - if (is_hamming_ecc(p)) { + if (is_hamming_ecc(host->ctrl, p)) { ecc->bytes = 3 * sectors; mtd_set_ooblayout(mtd, &brcmnand_hamming_ooblayout_ops); return 0; @@ -1108,7 +1165,7 @@ static void brcmnand_send_cmd(struct brcmnand_host *host, int cmd) ctrl->cmd_pending = cmd; intfc = brcmnand_read_reg(ctrl, BRCMNAND_INTFC_STATUS); - BUG_ON(!(intfc & INTFC_CTLR_READY)); + WARN_ON(!(intfc & INTFC_CTLR_READY)); mb(); /* flush previous writes */ brcmnand_write_reg(ctrl, BRCMNAND_CMD_START, @@ -1545,6 +1602,56 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip, return ret; } +/* + * Check a page to see if it is erased (w/ bitflips) after an uncorrectable ECC + * error + * + * Because the HW ECC signals an ECC error if an erase paged has even a single + * bitflip, we must check each ECC error to see if it is actually an erased + * page with bitflips, not a truly corrupted page. + * + * On a real error, return a negative error code (-EBADMSG for ECC error), and + * buf will contain raw data. + * Otherwise, buf gets filled with 0xffs and return the maximum number of + * bitflips-per-ECC-sector to the caller. + * + */ +static int brcmstb_nand_verify_erased_page(struct mtd_info *mtd, + struct nand_chip *chip, void *buf, u64 addr) +{ + int i, sas; + void *oob = chip->oob_poi; + int bitflips = 0; + int page = addr >> chip->page_shift; + int ret; + + if (!buf) { + buf = chip->buffers->databuf; + /* Invalidate page cache */ + chip->pagebuf = -1; + } + + sas = mtd->oobsize / chip->ecc.steps; + + /* read without ecc for verification */ + chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page); + ret = chip->ecc.read_page_raw(mtd, chip, buf, true, page); + if (ret) + return ret; + + for (i = 0; i < chip->ecc.steps; i++, oob += sas) { + ret = nand_check_erased_ecc_chunk(buf, chip->ecc.size, + oob, sas, NULL, 0, + chip->ecc.strength); + if (ret < 0) + return ret; + + bitflips = max(bitflips, ret); + } + + return bitflips; +} + static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip, u64 addr, unsigned int trans, u32 *buf, u8 *oob) { @@ -1552,9 +1659,11 @@ static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip, struct brcmnand_controller *ctrl = host->ctrl; u64 err_addr = 0; int err; + bool retry = true; dev_dbg(ctrl->dev, "read %llx -> %p\n", (unsigned long long)addr, buf); +try_dmaread: brcmnand_write_reg(ctrl, BRCMNAND_UNCORR_COUNT, 0); if (has_flash_dma(ctrl) && !oob && flash_dma_buf_ok(buf)) { @@ -1575,6 +1684,34 @@ static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip, } if (mtd_is_eccerr(err)) { + /* + * On controller version and 7.0, 7.1 , DMA read after a + * prior PIO read that reported uncorrectable error, + * the DMA engine captures this error following DMA read + * cleared only on subsequent DMA read, so just retry once + * to clear a possible false error reported for current DMA + * read + */ + if ((ctrl->nand_version == 0x0700) || + (ctrl->nand_version == 0x0701)) { + if (retry) { + retry = false; + goto try_dmaread; + } + } + + /* + * Controller version 7.2 has hw encoder to detect erased page + * bitflips, apply sw verification for older controllers only + */ + if (ctrl->nand_version < 0x0702) { + err = brcmstb_nand_verify_erased_page(mtd, chip, buf, + addr); + /* erased page bitflips corrected */ + if (err > 0) + return err; + } + dev_dbg(ctrl->dev, "uncorrectable error at 0x%llx\n", (unsigned long long)err_addr); mtd->ecc_stats.failed++; @@ -1857,7 +1994,8 @@ static int brcmnand_set_cfg(struct brcmnand_host *host, return 0; } -static void brcmnand_print_cfg(char *buf, struct brcmnand_cfg *cfg) +static void brcmnand_print_cfg(struct brcmnand_host *host, + char *buf, struct brcmnand_cfg *cfg) { buf += sprintf(buf, "%lluMiB total, %uKiB blocks, %u%s pages, %uB OOB, %u-bit", @@ -1868,7 +2006,7 @@ static void brcmnand_print_cfg(char *buf, struct brcmnand_cfg *cfg) cfg->spare_area_size, cfg->device_width); /* Account for Hamming ECC and for BCH 512B vs 1KiB sectors */ - if (is_hamming_ecc(cfg)) + if (is_hamming_ecc(host->ctrl, cfg)) sprintf(buf, ", Hamming ECC"); else if (cfg->sector_size_1k) sprintf(buf, ", BCH-%u (1KiB sector)", cfg->ecc_level << 1); @@ -1987,7 +2125,7 @@ static int brcmnand_setup_dev(struct brcmnand_host *host) brcmnand_set_ecc_enabled(host, 1); - brcmnand_print_cfg(msg, cfg); + brcmnand_print_cfg(host, msg, cfg); dev_info(ctrl->dev, "detected %s\n", msg); /* Configure ACC_CONTROL */ @@ -1995,6 +2133,10 @@ static int brcmnand_setup_dev(struct brcmnand_host *host) tmp = nand_readreg(ctrl, offs); tmp &= ~ACC_CONTROL_PARTIAL_PAGE; tmp &= ~ACC_CONTROL_RD_ERASED; + + /* We need to turn on Read from erased paged protected by ECC */ + if (ctrl->nand_version >= 0x0702) + tmp |= ACC_CONTROL_RD_ERASED; tmp &= ~ACC_CONTROL_FAST_PGM_RDIN; if (ctrl->features & BRCMNAND_HAS_PREFETCH) { /* @@ -2195,6 +2337,7 @@ static const struct of_device_id brcmnand_of_match[] = { { .compatible = "brcm,brcmnand-v6.2" }, { .compatible = "brcm,brcmnand-v7.0" }, { .compatible = "brcm,brcmnand-v7.1" }, + { .compatible = "brcm,brcmnand-v7.2" }, {}, }; MODULE_DEVICE_TABLE(of, brcmnand_of_match); diff --git a/drivers/mtd/nand/jz4780_bch.c b/drivers/mtd/nand/jz4780_bch.c index d74f4ba4a6f4..731c6051d91e 100644 --- a/drivers/mtd/nand/jz4780_bch.c +++ b/drivers/mtd/nand/jz4780_bch.c @@ -375,6 +375,6 @@ static struct platform_driver jz4780_bch_driver = { module_platform_driver(jz4780_bch_driver); MODULE_AUTHOR("Alex Smith "); -MODULE_AUTHOR("Harvey Hunt "); +MODULE_AUTHOR("Harvey Hunt "); MODULE_DESCRIPTION("Ingenic JZ4780 BCH error correction driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/mtd/nand/jz4780_nand.c b/drivers/mtd/nand/jz4780_nand.c index daf3c4217f4d..175f67da25af 100644 --- a/drivers/mtd/nand/jz4780_nand.c +++ b/drivers/mtd/nand/jz4780_nand.c @@ -412,6 +412,6 @@ static struct platform_driver jz4780_nand_driver = { module_platform_driver(jz4780_nand_driver); MODULE_AUTHOR("Alex Smith "); -MODULE_AUTHOR("Harvey Hunt "); +MODULE_AUTHOR("Harvey Hunt "); MODULE_DESCRIPTION("Ingenic JZ4780 NAND driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/mtd/nand/mtk_ecc.c b/drivers/mtd/nand/mtk_ecc.c new file mode 100644 index 000000000000..25a4fbd4d24a --- /dev/null +++ b/drivers/mtd/nand/mtk_ecc.c @@ -0,0 +1,530 @@ +/* + * MTK ECC controller driver. + * Copyright (C) 2016 MediaTek Inc. + * Authors: Xiaolei Li + * Jorge Ramirez-Ortiz + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mtk_ecc.h" + +#define ECC_IDLE_MASK BIT(0) +#define ECC_IRQ_EN BIT(0) +#define ECC_OP_ENABLE (1) +#define ECC_OP_DISABLE (0) + +#define ECC_ENCCON (0x00) +#define ECC_ENCCNFG (0x04) +#define ECC_CNFG_4BIT (0) +#define ECC_CNFG_6BIT (1) +#define ECC_CNFG_8BIT (2) +#define ECC_CNFG_10BIT (3) +#define ECC_CNFG_12BIT (4) +#define ECC_CNFG_14BIT (5) +#define ECC_CNFG_16BIT (6) +#define ECC_CNFG_18BIT (7) +#define ECC_CNFG_20BIT (8) +#define ECC_CNFG_22BIT (9) +#define ECC_CNFG_24BIT (0xa) +#define ECC_CNFG_28BIT (0xb) +#define ECC_CNFG_32BIT (0xc) +#define ECC_CNFG_36BIT (0xd) +#define ECC_CNFG_40BIT (0xe) +#define ECC_CNFG_44BIT (0xf) +#define ECC_CNFG_48BIT (0x10) +#define ECC_CNFG_52BIT (0x11) +#define ECC_CNFG_56BIT (0x12) +#define ECC_CNFG_60BIT (0x13) +#define ECC_MODE_SHIFT (5) +#define ECC_MS_SHIFT (16) +#define ECC_ENCDIADDR (0x08) +#define ECC_ENCIDLE (0x0C) +#define ECC_ENCPAR(x) (0x10 + (x) * sizeof(u32)) +#define ECC_ENCIRQ_EN (0x80) +#define ECC_ENCIRQ_STA (0x84) +#define ECC_DECCON (0x100) +#define ECC_DECCNFG (0x104) +#define DEC_EMPTY_EN BIT(31) +#define DEC_CNFG_CORRECT (0x3 << 12) +#define ECC_DECIDLE (0x10C) +#define ECC_DECENUM0 (0x114) +#define ERR_MASK (0x3f) +#define ECC_DECDONE (0x124) +#define ECC_DECIRQ_EN (0x200) +#define ECC_DECIRQ_STA (0x204) + +#define ECC_TIMEOUT (500000) + +#define ECC_IDLE_REG(op) ((op) == ECC_ENCODE ? ECC_ENCIDLE : ECC_DECIDLE) +#define ECC_CTL_REG(op) ((op) == ECC_ENCODE ? ECC_ENCCON : ECC_DECCON) +#define ECC_IRQ_REG(op) ((op) == ECC_ENCODE ? \ + ECC_ENCIRQ_EN : ECC_DECIRQ_EN) + +struct mtk_ecc { + struct device *dev; + void __iomem *regs; + struct clk *clk; + + struct completion done; + struct mutex lock; + u32 sectors; +}; + +static inline void mtk_ecc_wait_idle(struct mtk_ecc *ecc, + enum mtk_ecc_operation op) +{ + struct device *dev = ecc->dev; + u32 val; + int ret; + + ret = readl_poll_timeout_atomic(ecc->regs + ECC_IDLE_REG(op), val, + val & ECC_IDLE_MASK, + 10, ECC_TIMEOUT); + if (ret) + dev_warn(dev, "%s NOT idle\n", + op == ECC_ENCODE ? "encoder" : "decoder"); +} + +static irqreturn_t mtk_ecc_irq(int irq, void *id) +{ + struct mtk_ecc *ecc = id; + enum mtk_ecc_operation op; + u32 dec, enc; + + dec = readw(ecc->regs + ECC_DECIRQ_STA) & ECC_IRQ_EN; + if (dec) { + op = ECC_DECODE; + dec = readw(ecc->regs + ECC_DECDONE); + if (dec & ecc->sectors) { + ecc->sectors = 0; + complete(&ecc->done); + } else { + return IRQ_HANDLED; + } + } else { + enc = readl(ecc->regs + ECC_ENCIRQ_STA) & ECC_IRQ_EN; + if (enc) { + op = ECC_ENCODE; + complete(&ecc->done); + } else { + return IRQ_NONE; + } + } + + writel(0, ecc->regs + ECC_IRQ_REG(op)); + + return IRQ_HANDLED; +} + +static void mtk_ecc_config(struct mtk_ecc *ecc, struct mtk_ecc_config *config) +{ + u32 ecc_bit = ECC_CNFG_4BIT, dec_sz, enc_sz; + u32 reg; + + switch (config->strength) { + case 4: + ecc_bit = ECC_CNFG_4BIT; + break; + case 6: + ecc_bit = ECC_CNFG_6BIT; + break; + case 8: + ecc_bit = ECC_CNFG_8BIT; + break; + case 10: + ecc_bit = ECC_CNFG_10BIT; + break; + case 12: + ecc_bit = ECC_CNFG_12BIT; + break; + case 14: + ecc_bit = ECC_CNFG_14BIT; + break; + case 16: + ecc_bit = ECC_CNFG_16BIT; + break; + case 18: + ecc_bit = ECC_CNFG_18BIT; + break; + case 20: + ecc_bit = ECC_CNFG_20BIT; + break; + case 22: + ecc_bit = ECC_CNFG_22BIT; + break; + case 24: + ecc_bit = ECC_CNFG_24BIT; + break; + case 28: + ecc_bit = ECC_CNFG_28BIT; + break; + case 32: + ecc_bit = ECC_CNFG_32BIT; + break; + case 36: + ecc_bit = ECC_CNFG_36BIT; + break; + case 40: + ecc_bit = ECC_CNFG_40BIT; + break; + case 44: + ecc_bit = ECC_CNFG_44BIT; + break; + case 48: + ecc_bit = ECC_CNFG_48BIT; + break; + case 52: + ecc_bit = ECC_CNFG_52BIT; + break; + case 56: + ecc_bit = ECC_CNFG_56BIT; + break; + case 60: + ecc_bit = ECC_CNFG_60BIT; + break; + default: + dev_err(ecc->dev, "invalid strength %d, default to 4 bits\n", + config->strength); + } + + if (config->op == ECC_ENCODE) { + /* configure ECC encoder (in bits) */ + enc_sz = config->len << 3; + + reg = ecc_bit | (config->mode << ECC_MODE_SHIFT); + reg |= (enc_sz << ECC_MS_SHIFT); + writel(reg, ecc->regs + ECC_ENCCNFG); + + if (config->mode != ECC_NFI_MODE) + writel(lower_32_bits(config->addr), + ecc->regs + ECC_ENCDIADDR); + + } else { + /* configure ECC decoder (in bits) */ + dec_sz = (config->len << 3) + + config->strength * ECC_PARITY_BITS; + + reg = ecc_bit | (config->mode << ECC_MODE_SHIFT); + reg |= (dec_sz << ECC_MS_SHIFT) | DEC_CNFG_CORRECT; + reg |= DEC_EMPTY_EN; + writel(reg, ecc->regs + ECC_DECCNFG); + + if (config->sectors) + ecc->sectors = 1 << (config->sectors - 1); + } +} + +void mtk_ecc_get_stats(struct mtk_ecc *ecc, struct mtk_ecc_stats *stats, + int sectors) +{ + u32 offset, i, err; + u32 bitflips = 0; + + stats->corrected = 0; + stats->failed = 0; + + for (i = 0; i < sectors; i++) { + offset = (i >> 2) << 2; + err = readl(ecc->regs + ECC_DECENUM0 + offset); + err = err >> ((i % 4) * 8); + err &= ERR_MASK; + if (err == ERR_MASK) { + /* uncorrectable errors */ + stats->failed++; + continue; + } + + stats->corrected += err; + bitflips = max_t(u32, bitflips, err); + } + + stats->bitflips = bitflips; +} +EXPORT_SYMBOL(mtk_ecc_get_stats); + +void mtk_ecc_release(struct mtk_ecc *ecc) +{ + clk_disable_unprepare(ecc->clk); + put_device(ecc->dev); +} +EXPORT_SYMBOL(mtk_ecc_release); + +static void mtk_ecc_hw_init(struct mtk_ecc *ecc) +{ + mtk_ecc_wait_idle(ecc, ECC_ENCODE); + writew(ECC_OP_DISABLE, ecc->regs + ECC_ENCCON); + + mtk_ecc_wait_idle(ecc, ECC_DECODE); + writel(ECC_OP_DISABLE, ecc->regs + ECC_DECCON); +} + +static struct mtk_ecc *mtk_ecc_get(struct device_node *np) +{ + struct platform_device *pdev; + struct mtk_ecc *ecc; + + pdev = of_find_device_by_node(np); + if (!pdev || !platform_get_drvdata(pdev)) + return ERR_PTR(-EPROBE_DEFER); + + get_device(&pdev->dev); + ecc = platform_get_drvdata(pdev); + clk_prepare_enable(ecc->clk); + mtk_ecc_hw_init(ecc); + + return ecc; +} + +struct mtk_ecc *of_mtk_ecc_get(struct device_node *of_node) +{ + struct mtk_ecc *ecc = NULL; + struct device_node *np; + + np = of_parse_phandle(of_node, "ecc-engine", 0); + if (np) { + ecc = mtk_ecc_get(np); + of_node_put(np); + } + + return ecc; +} +EXPORT_SYMBOL(of_mtk_ecc_get); + +int mtk_ecc_enable(struct mtk_ecc *ecc, struct mtk_ecc_config *config) +{ + enum mtk_ecc_operation op = config->op; + int ret; + + ret = mutex_lock_interruptible(&ecc->lock); + if (ret) { + dev_err(ecc->dev, "interrupted when attempting to lock\n"); + return ret; + } + + mtk_ecc_wait_idle(ecc, op); + mtk_ecc_config(ecc, config); + writew(ECC_OP_ENABLE, ecc->regs + ECC_CTL_REG(op)); + + init_completion(&ecc->done); + writew(ECC_IRQ_EN, ecc->regs + ECC_IRQ_REG(op)); + + return 0; +} +EXPORT_SYMBOL(mtk_ecc_enable); + +void mtk_ecc_disable(struct mtk_ecc *ecc) +{ + enum mtk_ecc_operation op = ECC_ENCODE; + + /* find out the running operation */ + if (readw(ecc->regs + ECC_CTL_REG(op)) != ECC_OP_ENABLE) + op = ECC_DECODE; + + /* disable it */ + mtk_ecc_wait_idle(ecc, op); + writew(0, ecc->regs + ECC_IRQ_REG(op)); + writew(ECC_OP_DISABLE, ecc->regs + ECC_CTL_REG(op)); + + mutex_unlock(&ecc->lock); +} +EXPORT_SYMBOL(mtk_ecc_disable); + +int mtk_ecc_wait_done(struct mtk_ecc *ecc, enum mtk_ecc_operation op) +{ + int ret; + + ret = wait_for_completion_timeout(&ecc->done, msecs_to_jiffies(500)); + if (!ret) { + dev_err(ecc->dev, "%s timeout - interrupt did not arrive)\n", + (op == ECC_ENCODE) ? "encoder" : "decoder"); + return -ETIMEDOUT; + } + + return 0; +} +EXPORT_SYMBOL(mtk_ecc_wait_done); + +int mtk_ecc_encode(struct mtk_ecc *ecc, struct mtk_ecc_config *config, + u8 *data, u32 bytes) +{ + dma_addr_t addr; + u32 *p, len, i; + int ret = 0; + + addr = dma_map_single(ecc->dev, data, bytes, DMA_TO_DEVICE); + ret = dma_mapping_error(ecc->dev, addr); + if (ret) { + dev_err(ecc->dev, "dma mapping error\n"); + return -EINVAL; + } + + config->op = ECC_ENCODE; + config->addr = addr; + ret = mtk_ecc_enable(ecc, config); + if (ret) { + dma_unmap_single(ecc->dev, addr, bytes, DMA_TO_DEVICE); + return ret; + } + + ret = mtk_ecc_wait_done(ecc, ECC_ENCODE); + if (ret) + goto timeout; + + mtk_ecc_wait_idle(ecc, ECC_ENCODE); + + /* Program ECC bytes to OOB: per sector oob = FDM + ECC + SPARE */ + len = (config->strength * ECC_PARITY_BITS + 7) >> 3; + p = (u32 *)(data + bytes); + + /* write the parity bytes generated by the ECC back to the OOB region */ + for (i = 0; i < len; i++) + p[i] = readl(ecc->regs + ECC_ENCPAR(i)); +timeout: + + dma_unmap_single(ecc->dev, addr, bytes, DMA_TO_DEVICE); + mtk_ecc_disable(ecc); + + return ret; +} +EXPORT_SYMBOL(mtk_ecc_encode); + +void mtk_ecc_adjust_strength(u32 *p) +{ + u32 ecc[] = {4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 28, 32, 36, + 40, 44, 48, 52, 56, 60}; + int i; + + for (i = 0; i < ARRAY_SIZE(ecc); i++) { + if (*p <= ecc[i]) { + if (!i) + *p = ecc[i]; + else if (*p != ecc[i]) + *p = ecc[i - 1]; + return; + } + } + + *p = ecc[ARRAY_SIZE(ecc) - 1]; +} +EXPORT_SYMBOL(mtk_ecc_adjust_strength); + +static int mtk_ecc_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct mtk_ecc *ecc; + struct resource *res; + int irq, ret; + + ecc = devm_kzalloc(dev, sizeof(*ecc), GFP_KERNEL); + if (!ecc) + return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ecc->regs = devm_ioremap_resource(dev, res); + if (IS_ERR(ecc->regs)) { + dev_err(dev, "failed to map regs: %ld\n", PTR_ERR(ecc->regs)); + return PTR_ERR(ecc->regs); + } + + ecc->clk = devm_clk_get(dev, NULL); + if (IS_ERR(ecc->clk)) { + dev_err(dev, "failed to get clock: %ld\n", PTR_ERR(ecc->clk)); + return PTR_ERR(ecc->clk); + } + + irq = platform_get_irq(pdev, 0); + if (irq < 0) { + dev_err(dev, "failed to get irq\n"); + return -EINVAL; + } + + ret = dma_set_mask(dev, DMA_BIT_MASK(32)); + if (ret) { + dev_err(dev, "failed to set DMA mask\n"); + return ret; + } + + ret = devm_request_irq(dev, irq, mtk_ecc_irq, 0x0, "mtk-ecc", ecc); + if (ret) { + dev_err(dev, "failed to request irq\n"); + return -EINVAL; + } + + ecc->dev = dev; + mutex_init(&ecc->lock); + platform_set_drvdata(pdev, ecc); + dev_info(dev, "probed\n"); + + return 0; +} + +#ifdef CONFIG_PM_SLEEP +static int mtk_ecc_suspend(struct device *dev) +{ + struct mtk_ecc *ecc = dev_get_drvdata(dev); + + clk_disable_unprepare(ecc->clk); + + return 0; +} + +static int mtk_ecc_resume(struct device *dev) +{ + struct mtk_ecc *ecc = dev_get_drvdata(dev); + int ret; + + ret = clk_prepare_enable(ecc->clk); + if (ret) { + dev_err(dev, "failed to enable clk\n"); + return ret; + } + + mtk_ecc_hw_init(ecc); + + return 0; +} + +static SIMPLE_DEV_PM_OPS(mtk_ecc_pm_ops, mtk_ecc_suspend, mtk_ecc_resume); +#endif + +static const struct of_device_id mtk_ecc_dt_match[] = { + { .compatible = "mediatek,mt2701-ecc" }, + {}, +}; + +MODULE_DEVICE_TABLE(of, mtk_ecc_dt_match); + +static struct platform_driver mtk_ecc_driver = { + .probe = mtk_ecc_probe, + .driver = { + .name = "mtk-ecc", + .of_match_table = of_match_ptr(mtk_ecc_dt_match), +#ifdef CONFIG_PM_SLEEP + .pm = &mtk_ecc_pm_ops, +#endif + }, +}; + +module_platform_driver(mtk_ecc_driver); + +MODULE_AUTHOR("Xiaolei Li "); +MODULE_DESCRIPTION("MTK Nand ECC Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/nand/mtk_ecc.h b/drivers/mtd/nand/mtk_ecc.h new file mode 100644 index 000000000000..cbeba5cd1c13 --- /dev/null +++ b/drivers/mtd/nand/mtk_ecc.h @@ -0,0 +1,50 @@ +/* + * MTK SDG1 ECC controller + * + * Copyright (c) 2016 Mediatek + * Authors: Xiaolei Li + * Jorge Ramirez-Ortiz + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef __DRIVERS_MTD_NAND_MTK_ECC_H__ +#define __DRIVERS_MTD_NAND_MTK_ECC_H__ + +#include + +#define ECC_PARITY_BITS (14) + +enum mtk_ecc_mode {ECC_DMA_MODE = 0, ECC_NFI_MODE = 1}; +enum mtk_ecc_operation {ECC_ENCODE, ECC_DECODE}; + +struct device_node; +struct mtk_ecc; + +struct mtk_ecc_stats { + u32 corrected; + u32 bitflips; + u32 failed; +}; + +struct mtk_ecc_config { + enum mtk_ecc_operation op; + enum mtk_ecc_mode mode; + dma_addr_t addr; + u32 strength; + u32 sectors; + u32 len; +}; + +int mtk_ecc_encode(struct mtk_ecc *, struct mtk_ecc_config *, u8 *, u32); +void mtk_ecc_get_stats(struct mtk_ecc *, struct mtk_ecc_stats *, int); +int mtk_ecc_wait_done(struct mtk_ecc *, enum mtk_ecc_operation); +int mtk_ecc_enable(struct mtk_ecc *, struct mtk_ecc_config *); +void mtk_ecc_disable(struct mtk_ecc *); +void mtk_ecc_adjust_strength(u32 *); + +struct mtk_ecc *of_mtk_ecc_get(struct device_node *); +void mtk_ecc_release(struct mtk_ecc *); + +#endif diff --git a/drivers/mtd/nand/mtk_nand.c b/drivers/mtd/nand/mtk_nand.c new file mode 100644 index 000000000000..ddaa2acb9dd7 --- /dev/null +++ b/drivers/mtd/nand/mtk_nand.c @@ -0,0 +1,1526 @@ +/* + * MTK NAND Flash controller driver. + * Copyright (C) 2016 MediaTek Inc. + * Authors: Xiaolei Li + * Jorge Ramirez-Ortiz + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mtk_ecc.h" + +/* NAND controller register definition */ +#define NFI_CNFG (0x00) +#define CNFG_AHB BIT(0) +#define CNFG_READ_EN BIT(1) +#define CNFG_DMA_BURST_EN BIT(2) +#define CNFG_BYTE_RW BIT(6) +#define CNFG_HW_ECC_EN BIT(8) +#define CNFG_AUTO_FMT_EN BIT(9) +#define CNFG_OP_CUST (6 << 12) +#define NFI_PAGEFMT (0x04) +#define PAGEFMT_FDM_ECC_SHIFT (12) +#define PAGEFMT_FDM_SHIFT (8) +#define PAGEFMT_SPARE_16 (0) +#define PAGEFMT_SPARE_26 (1) +#define PAGEFMT_SPARE_27 (2) +#define PAGEFMT_SPARE_28 (3) +#define PAGEFMT_SPARE_32 (4) +#define PAGEFMT_SPARE_36 (5) +#define PAGEFMT_SPARE_40 (6) +#define PAGEFMT_SPARE_44 (7) +#define PAGEFMT_SPARE_48 (8) +#define PAGEFMT_SPARE_49 (9) +#define PAGEFMT_SPARE_50 (0xa) +#define PAGEFMT_SPARE_51 (0xb) +#define PAGEFMT_SPARE_52 (0xc) +#define PAGEFMT_SPARE_62 (0xd) +#define PAGEFMT_SPARE_63 (0xe) +#define PAGEFMT_SPARE_64 (0xf) +#define PAGEFMT_SPARE_SHIFT (4) +#define PAGEFMT_SEC_SEL_512 BIT(2) +#define PAGEFMT_512_2K (0) +#define PAGEFMT_2K_4K (1) +#define PAGEFMT_4K_8K (2) +#define PAGEFMT_8K_16K (3) +/* NFI control */ +#define NFI_CON (0x08) +#define CON_FIFO_FLUSH BIT(0) +#define CON_NFI_RST BIT(1) +#define CON_BRD BIT(8) /* burst read */ +#define CON_BWR BIT(9) /* burst write */ +#define CON_SEC_SHIFT (12) +/* Timming control register */ +#define NFI_ACCCON (0x0C) +#define NFI_INTR_EN (0x10) +#define INTR_AHB_DONE_EN BIT(6) +#define NFI_INTR_STA (0x14) +#define NFI_CMD (0x20) +#define NFI_ADDRNOB (0x30) +#define NFI_COLADDR (0x34) +#define NFI_ROWADDR (0x38) +#define NFI_STRDATA (0x40) +#define STAR_EN (1) +#define STAR_DE (0) +#define NFI_CNRNB (0x44) +#define NFI_DATAW (0x50) +#define NFI_DATAR (0x54) +#define NFI_PIO_DIRDY (0x58) +#define PIO_DI_RDY (0x01) +#define NFI_STA (0x60) +#define STA_CMD BIT(0) +#define STA_ADDR BIT(1) +#define STA_BUSY BIT(8) +#define STA_EMP_PAGE BIT(12) +#define NFI_FSM_CUSTDATA (0xe << 16) +#define NFI_FSM_MASK (0xf << 16) +#define NFI_ADDRCNTR (0x70) +#define CNTR_MASK GENMASK(16, 12) +#define NFI_STRADDR (0x80) +#define NFI_BYTELEN (0x84) +#define NFI_CSEL (0x90) +#define NFI_FDML(x) (0xA0 + (x) * sizeof(u32) * 2) +#define NFI_FDMM(x) (0xA4 + (x) * sizeof(u32) * 2) +#define NFI_FDM_MAX_SIZE (8) +#define NFI_FDM_MIN_SIZE (1) +#define NFI_MASTER_STA (0x224) +#define MASTER_STA_MASK (0x0FFF) +#define NFI_EMPTY_THRESH (0x23C) + +#define MTK_NAME "mtk-nand" +#define KB(x) ((x) * 1024UL) +#define MB(x) (KB(x) * 1024UL) + +#define MTK_TIMEOUT (500000) +#define MTK_RESET_TIMEOUT (1000000) +#define MTK_MAX_SECTOR (16) +#define MTK_NAND_MAX_NSELS (2) + +struct mtk_nfc_bad_mark_ctl { + void (*bm_swap)(struct mtd_info *, u8 *buf, int raw); + u32 sec; + u32 pos; +}; + +/* + * FDM: region used to store free OOB data + */ +struct mtk_nfc_fdm { + u32 reg_size; + u32 ecc_size; +}; + +struct mtk_nfc_nand_chip { + struct list_head node; + struct nand_chip nand; + + struct mtk_nfc_bad_mark_ctl bad_mark; + struct mtk_nfc_fdm fdm; + u32 spare_per_sector; + + int nsels; + u8 sels[0]; + /* nothing after this field */ +}; + +struct mtk_nfc_clk { + struct clk *nfi_clk; + struct clk *pad_clk; +}; + +struct mtk_nfc { + struct nand_hw_control controller; + struct mtk_ecc_config ecc_cfg; + struct mtk_nfc_clk clk; + struct mtk_ecc *ecc; + + struct device *dev; + void __iomem *regs; + + struct completion done; + struct list_head chips; + + u8 *buffer; +}; + +static inline struct mtk_nfc_nand_chip *to_mtk_nand(struct nand_chip *nand) +{ + return container_of(nand, struct mtk_nfc_nand_chip, nand); +} + +static inline u8 *data_ptr(struct nand_chip *chip, const u8 *p, int i) +{ + return (u8 *)p + i * chip->ecc.size; +} + +static inline u8 *oob_ptr(struct nand_chip *chip, int i) +{ + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + u8 *poi; + + /* map the sector's FDM data to free oob: + * the beginning of the oob area stores the FDM data of bad mark sectors + */ + + if (i < mtk_nand->bad_mark.sec) + poi = chip->oob_poi + (i + 1) * mtk_nand->fdm.reg_size; + else if (i == mtk_nand->bad_mark.sec) + poi = chip->oob_poi; + else + poi = chip->oob_poi + i * mtk_nand->fdm.reg_size; + + return poi; +} + +static inline int mtk_data_len(struct nand_chip *chip) +{ + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + + return chip->ecc.size + mtk_nand->spare_per_sector; +} + +static inline u8 *mtk_data_ptr(struct nand_chip *chip, int i) +{ + struct mtk_nfc *nfc = nand_get_controller_data(chip); + + return nfc->buffer + i * mtk_data_len(chip); +} + +static inline u8 *mtk_oob_ptr(struct nand_chip *chip, int i) +{ + struct mtk_nfc *nfc = nand_get_controller_data(chip); + + return nfc->buffer + i * mtk_data_len(chip) + chip->ecc.size; +} + +static inline void nfi_writel(struct mtk_nfc *nfc, u32 val, u32 reg) +{ + writel(val, nfc->regs + reg); +} + +static inline void nfi_writew(struct mtk_nfc *nfc, u16 val, u32 reg) +{ + writew(val, nfc->regs + reg); +} + +static inline void nfi_writeb(struct mtk_nfc *nfc, u8 val, u32 reg) +{ + writeb(val, nfc->regs + reg); +} + +static inline u32 nfi_readl(struct mtk_nfc *nfc, u32 reg) +{ + return readl_relaxed(nfc->regs + reg); +} + +static inline u16 nfi_readw(struct mtk_nfc *nfc, u32 reg) +{ + return readw_relaxed(nfc->regs + reg); +} + +static inline u8 nfi_readb(struct mtk_nfc *nfc, u32 reg) +{ + return readb_relaxed(nfc->regs + reg); +} + +static void mtk_nfc_hw_reset(struct mtk_nfc *nfc) +{ + struct device *dev = nfc->dev; + u32 val; + int ret; + + /* reset all registers and force the NFI master to terminate */ + nfi_writel(nfc, CON_FIFO_FLUSH | CON_NFI_RST, NFI_CON); + + /* wait for the master to finish the last transaction */ + ret = readl_poll_timeout(nfc->regs + NFI_MASTER_STA, val, + !(val & MASTER_STA_MASK), 50, + MTK_RESET_TIMEOUT); + if (ret) + dev_warn(dev, "master active in reset [0x%x] = 0x%x\n", + NFI_MASTER_STA, val); + + /* ensure any status register affected by the NFI master is reset */ + nfi_writel(nfc, CON_FIFO_FLUSH | CON_NFI_RST, NFI_CON); + nfi_writew(nfc, STAR_DE, NFI_STRDATA); +} + +static int mtk_nfc_send_command(struct mtk_nfc *nfc, u8 command) +{ + struct device *dev = nfc->dev; + u32 val; + int ret; + + nfi_writel(nfc, command, NFI_CMD); + + ret = readl_poll_timeout_atomic(nfc->regs + NFI_STA, val, + !(val & STA_CMD), 10, MTK_TIMEOUT); + if (ret) { + dev_warn(dev, "nfi core timed out entering command mode\n"); + return -EIO; + } + + return 0; +} + +static int mtk_nfc_send_address(struct mtk_nfc *nfc, int addr) +{ + struct device *dev = nfc->dev; + u32 val; + int ret; + + nfi_writel(nfc, addr, NFI_COLADDR); + nfi_writel(nfc, 0, NFI_ROWADDR); + nfi_writew(nfc, 1, NFI_ADDRNOB); + + ret = readl_poll_timeout_atomic(nfc->regs + NFI_STA, val, + !(val & STA_ADDR), 10, MTK_TIMEOUT); + if (ret) { + dev_warn(dev, "nfi core timed out entering address mode\n"); + return -EIO; + } + + return 0; +} + +static int mtk_nfc_hw_runtime_config(struct mtd_info *mtd) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + struct mtk_nfc *nfc = nand_get_controller_data(chip); + u32 fmt, spare; + + if (!mtd->writesize) + return 0; + + spare = mtk_nand->spare_per_sector; + + switch (mtd->writesize) { + case 512: + fmt = PAGEFMT_512_2K | PAGEFMT_SEC_SEL_512; + break; + case KB(2): + if (chip->ecc.size == 512) + fmt = PAGEFMT_2K_4K | PAGEFMT_SEC_SEL_512; + else + fmt = PAGEFMT_512_2K; + break; + case KB(4): + if (chip->ecc.size == 512) + fmt = PAGEFMT_4K_8K | PAGEFMT_SEC_SEL_512; + else + fmt = PAGEFMT_2K_4K; + break; + case KB(8): + if (chip->ecc.size == 512) + fmt = PAGEFMT_8K_16K | PAGEFMT_SEC_SEL_512; + else + fmt = PAGEFMT_4K_8K; + break; + case KB(16): + fmt = PAGEFMT_8K_16K; + break; + default: + dev_err(nfc->dev, "invalid page len: %d\n", mtd->writesize); + return -EINVAL; + } + + /* + * the hardware will double the value for this eccsize, so we need to + * halve it + */ + if (chip->ecc.size == 1024) + spare >>= 1; + + switch (spare) { + case 16: + fmt |= (PAGEFMT_SPARE_16 << PAGEFMT_SPARE_SHIFT); + break; + case 26: + fmt |= (PAGEFMT_SPARE_26 << PAGEFMT_SPARE_SHIFT); + break; + case 27: + fmt |= (PAGEFMT_SPARE_27 << PAGEFMT_SPARE_SHIFT); + break; + case 28: + fmt |= (PAGEFMT_SPARE_28 << PAGEFMT_SPARE_SHIFT); + break; + case 32: + fmt |= (PAGEFMT_SPARE_32 << PAGEFMT_SPARE_SHIFT); + break; + case 36: + fmt |= (PAGEFMT_SPARE_36 << PAGEFMT_SPARE_SHIFT); + break; + case 40: + fmt |= (PAGEFMT_SPARE_40 << PAGEFMT_SPARE_SHIFT); + break; + case 44: + fmt |= (PAGEFMT_SPARE_44 << PAGEFMT_SPARE_SHIFT); + break; + case 48: + fmt |= (PAGEFMT_SPARE_48 << PAGEFMT_SPARE_SHIFT); + break; + case 49: + fmt |= (PAGEFMT_SPARE_49 << PAGEFMT_SPARE_SHIFT); + break; + case 50: + fmt |= (PAGEFMT_SPARE_50 << PAGEFMT_SPARE_SHIFT); + break; + case 51: + fmt |= (PAGEFMT_SPARE_51 << PAGEFMT_SPARE_SHIFT); + break; + case 52: + fmt |= (PAGEFMT_SPARE_52 << PAGEFMT_SPARE_SHIFT); + break; + case 62: + fmt |= (PAGEFMT_SPARE_62 << PAGEFMT_SPARE_SHIFT); + break; + case 63: + fmt |= (PAGEFMT_SPARE_63 << PAGEFMT_SPARE_SHIFT); + break; + case 64: + fmt |= (PAGEFMT_SPARE_64 << PAGEFMT_SPARE_SHIFT); + break; + default: + dev_err(nfc->dev, "invalid spare per sector %d\n", spare); + return -EINVAL; + } + + fmt |= mtk_nand->fdm.reg_size << PAGEFMT_FDM_SHIFT; + fmt |= mtk_nand->fdm.ecc_size << PAGEFMT_FDM_ECC_SHIFT; + nfi_writew(nfc, fmt, NFI_PAGEFMT); + + nfc->ecc_cfg.strength = chip->ecc.strength; + nfc->ecc_cfg.len = chip->ecc.size + mtk_nand->fdm.ecc_size; + + return 0; +} + +static void mtk_nfc_select_chip(struct mtd_info *mtd, int chip) +{ + struct nand_chip *nand = mtd_to_nand(mtd); + struct mtk_nfc *nfc = nand_get_controller_data(nand); + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(nand); + + if (chip < 0) + return; + + mtk_nfc_hw_runtime_config(mtd); + + nfi_writel(nfc, mtk_nand->sels[chip], NFI_CSEL); +} + +static int mtk_nfc_dev_ready(struct mtd_info *mtd) +{ + struct mtk_nfc *nfc = nand_get_controller_data(mtd_to_nand(mtd)); + + if (nfi_readl(nfc, NFI_STA) & STA_BUSY) + return 0; + + return 1; +} + +static void mtk_nfc_cmd_ctrl(struct mtd_info *mtd, int dat, unsigned int ctrl) +{ + struct mtk_nfc *nfc = nand_get_controller_data(mtd_to_nand(mtd)); + + if (ctrl & NAND_ALE) { + mtk_nfc_send_address(nfc, dat); + } else if (ctrl & NAND_CLE) { + mtk_nfc_hw_reset(nfc); + + nfi_writew(nfc, CNFG_OP_CUST, NFI_CNFG); + mtk_nfc_send_command(nfc, dat); + } +} + +static inline void mtk_nfc_wait_ioready(struct mtk_nfc *nfc) +{ + int rc; + u8 val; + + rc = readb_poll_timeout_atomic(nfc->regs + NFI_PIO_DIRDY, val, + val & PIO_DI_RDY, 10, MTK_TIMEOUT); + if (rc < 0) + dev_err(nfc->dev, "data not ready\n"); +} + +static inline u8 mtk_nfc_read_byte(struct mtd_info *mtd) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + struct mtk_nfc *nfc = nand_get_controller_data(chip); + u32 reg; + + /* after each byte read, the NFI_STA reg is reset by the hardware */ + reg = nfi_readl(nfc, NFI_STA) & NFI_FSM_MASK; + if (reg != NFI_FSM_CUSTDATA) { + reg = nfi_readw(nfc, NFI_CNFG); + reg |= CNFG_BYTE_RW | CNFG_READ_EN; + nfi_writew(nfc, reg, NFI_CNFG); + + /* + * set to max sector to allow the HW to continue reading over + * unaligned accesses + */ + reg = (MTK_MAX_SECTOR << CON_SEC_SHIFT) | CON_BRD; + nfi_writel(nfc, reg, NFI_CON); + + /* trigger to fetch data */ + nfi_writew(nfc, STAR_EN, NFI_STRDATA); + } + + mtk_nfc_wait_ioready(nfc); + + return nfi_readb(nfc, NFI_DATAR); +} + +static void mtk_nfc_read_buf(struct mtd_info *mtd, u8 *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) + buf[i] = mtk_nfc_read_byte(mtd); +} + +static void mtk_nfc_write_byte(struct mtd_info *mtd, u8 byte) +{ + struct mtk_nfc *nfc = nand_get_controller_data(mtd_to_nand(mtd)); + u32 reg; + + reg = nfi_readl(nfc, NFI_STA) & NFI_FSM_MASK; + + if (reg != NFI_FSM_CUSTDATA) { + reg = nfi_readw(nfc, NFI_CNFG) | CNFG_BYTE_RW; + nfi_writew(nfc, reg, NFI_CNFG); + + reg = MTK_MAX_SECTOR << CON_SEC_SHIFT | CON_BWR; + nfi_writel(nfc, reg, NFI_CON); + + nfi_writew(nfc, STAR_EN, NFI_STRDATA); + } + + mtk_nfc_wait_ioready(nfc); + nfi_writeb(nfc, byte, NFI_DATAW); +} + +static void mtk_nfc_write_buf(struct mtd_info *mtd, const u8 *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) + mtk_nfc_write_byte(mtd, buf[i]); +} + +static int mtk_nfc_sector_encode(struct nand_chip *chip, u8 *data) +{ + struct mtk_nfc *nfc = nand_get_controller_data(chip); + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + int size = chip->ecc.size + mtk_nand->fdm.reg_size; + + nfc->ecc_cfg.mode = ECC_DMA_MODE; + nfc->ecc_cfg.op = ECC_ENCODE; + + return mtk_ecc_encode(nfc->ecc, &nfc->ecc_cfg, data, size); +} + +static void mtk_nfc_no_bad_mark_swap(struct mtd_info *a, u8 *b, int c) +{ + /* nop */ +} + +static void mtk_nfc_bad_mark_swap(struct mtd_info *mtd, u8 *buf, int raw) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + struct mtk_nfc_nand_chip *nand = to_mtk_nand(chip); + u32 bad_pos = nand->bad_mark.pos; + + if (raw) + bad_pos += nand->bad_mark.sec * mtk_data_len(chip); + else + bad_pos += nand->bad_mark.sec * chip->ecc.size; + + swap(chip->oob_poi[0], buf[bad_pos]); +} + +static int mtk_nfc_format_subpage(struct mtd_info *mtd, u32 offset, + u32 len, const u8 *buf) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + struct mtk_nfc *nfc = nand_get_controller_data(chip); + struct mtk_nfc_fdm *fdm = &mtk_nand->fdm; + u32 start, end; + int i, ret; + + start = offset / chip->ecc.size; + end = DIV_ROUND_UP(offset + len, chip->ecc.size); + + memset(nfc->buffer, 0xff, mtd->writesize + mtd->oobsize); + for (i = 0; i < chip->ecc.steps; i++) { + memcpy(mtk_data_ptr(chip, i), data_ptr(chip, buf, i), + chip->ecc.size); + + if (start > i || i >= end) + continue; + + if (i == mtk_nand->bad_mark.sec) + mtk_nand->bad_mark.bm_swap(mtd, nfc->buffer, 1); + + memcpy(mtk_oob_ptr(chip, i), oob_ptr(chip, i), fdm->reg_size); + + /* program the CRC back to the OOB */ + ret = mtk_nfc_sector_encode(chip, mtk_data_ptr(chip, i)); + if (ret < 0) + return ret; + } + + return 0; +} + +static void mtk_nfc_format_page(struct mtd_info *mtd, const u8 *buf) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + struct mtk_nfc *nfc = nand_get_controller_data(chip); + struct mtk_nfc_fdm *fdm = &mtk_nand->fdm; + u32 i; + + memset(nfc->buffer, 0xff, mtd->writesize + mtd->oobsize); + for (i = 0; i < chip->ecc.steps; i++) { + if (buf) + memcpy(mtk_data_ptr(chip, i), data_ptr(chip, buf, i), + chip->ecc.size); + + if (i == mtk_nand->bad_mark.sec) + mtk_nand->bad_mark.bm_swap(mtd, nfc->buffer, 1); + + memcpy(mtk_oob_ptr(chip, i), oob_ptr(chip, i), fdm->reg_size); + } +} + +static inline void mtk_nfc_read_fdm(struct nand_chip *chip, u32 start, + u32 sectors) +{ + struct mtk_nfc *nfc = nand_get_controller_data(chip); + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + struct mtk_nfc_fdm *fdm = &mtk_nand->fdm; + u32 vall, valm; + u8 *oobptr; + int i, j; + + for (i = 0; i < sectors; i++) { + oobptr = oob_ptr(chip, start + i); + vall = nfi_readl(nfc, NFI_FDML(i)); + valm = nfi_readl(nfc, NFI_FDMM(i)); + + for (j = 0; j < fdm->reg_size; j++) + oobptr[j] = (j >= 4 ? valm : vall) >> ((j % 4) * 8); + } +} + +static inline void mtk_nfc_write_fdm(struct nand_chip *chip) +{ + struct mtk_nfc *nfc = nand_get_controller_data(chip); + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + struct mtk_nfc_fdm *fdm = &mtk_nand->fdm; + u32 vall, valm; + u8 *oobptr; + int i, j; + + for (i = 0; i < chip->ecc.steps; i++) { + oobptr = oob_ptr(chip, i); + vall = 0; + valm = 0; + for (j = 0; j < 8; j++) { + if (j < 4) + vall |= (j < fdm->reg_size ? oobptr[j] : 0xff) + << (j * 8); + else + valm |= (j < fdm->reg_size ? oobptr[j] : 0xff) + << ((j - 4) * 8); + } + nfi_writel(nfc, vall, NFI_FDML(i)); + nfi_writel(nfc, valm, NFI_FDMM(i)); + } +} + +static int mtk_nfc_do_write_page(struct mtd_info *mtd, struct nand_chip *chip, + const u8 *buf, int page, int len) +{ + struct mtk_nfc *nfc = nand_get_controller_data(chip); + struct device *dev = nfc->dev; + dma_addr_t addr; + u32 reg; + int ret; + + addr = dma_map_single(dev, (void *)buf, len, DMA_TO_DEVICE); + ret = dma_mapping_error(nfc->dev, addr); + if (ret) { + dev_err(nfc->dev, "dma mapping error\n"); + return -EINVAL; + } + + reg = nfi_readw(nfc, NFI_CNFG) | CNFG_AHB | CNFG_DMA_BURST_EN; + nfi_writew(nfc, reg, NFI_CNFG); + + nfi_writel(nfc, chip->ecc.steps << CON_SEC_SHIFT, NFI_CON); + nfi_writel(nfc, lower_32_bits(addr), NFI_STRADDR); + nfi_writew(nfc, INTR_AHB_DONE_EN, NFI_INTR_EN); + + init_completion(&nfc->done); + + reg = nfi_readl(nfc, NFI_CON) | CON_BWR; + nfi_writel(nfc, reg, NFI_CON); + nfi_writew(nfc, STAR_EN, NFI_STRDATA); + + ret = wait_for_completion_timeout(&nfc->done, msecs_to_jiffies(500)); + if (!ret) { + dev_err(dev, "program ahb done timeout\n"); + nfi_writew(nfc, 0, NFI_INTR_EN); + ret = -ETIMEDOUT; + goto timeout; + } + + ret = readl_poll_timeout_atomic(nfc->regs + NFI_ADDRCNTR, reg, + (reg & CNTR_MASK) >= chip->ecc.steps, + 10, MTK_TIMEOUT); + if (ret) + dev_err(dev, "hwecc write timeout\n"); + +timeout: + + dma_unmap_single(nfc->dev, addr, len, DMA_TO_DEVICE); + nfi_writel(nfc, 0, NFI_CON); + + return ret; +} + +static int mtk_nfc_write_page(struct mtd_info *mtd, struct nand_chip *chip, + const u8 *buf, int page, int raw) +{ + struct mtk_nfc *nfc = nand_get_controller_data(chip); + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + size_t len; + const u8 *bufpoi; + u32 reg; + int ret; + + if (!raw) { + /* OOB => FDM: from register, ECC: from HW */ + reg = nfi_readw(nfc, NFI_CNFG) | CNFG_AUTO_FMT_EN; + nfi_writew(nfc, reg | CNFG_HW_ECC_EN, NFI_CNFG); + + nfc->ecc_cfg.op = ECC_ENCODE; + nfc->ecc_cfg.mode = ECC_NFI_MODE; + ret = mtk_ecc_enable(nfc->ecc, &nfc->ecc_cfg); + if (ret) { + /* clear NFI config */ + reg = nfi_readw(nfc, NFI_CNFG); + reg &= ~(CNFG_AUTO_FMT_EN | CNFG_HW_ECC_EN); + nfi_writew(nfc, reg, NFI_CNFG); + + return ret; + } + + memcpy(nfc->buffer, buf, mtd->writesize); + mtk_nand->bad_mark.bm_swap(mtd, nfc->buffer, raw); + bufpoi = nfc->buffer; + + /* write OOB into the FDM registers (OOB area in MTK NAND) */ + mtk_nfc_write_fdm(chip); + } else { + bufpoi = buf; + } + + len = mtd->writesize + (raw ? mtd->oobsize : 0); + ret = mtk_nfc_do_write_page(mtd, chip, bufpoi, page, len); + + if (!raw) + mtk_ecc_disable(nfc->ecc); + + return ret; +} + +static int mtk_nfc_write_page_hwecc(struct mtd_info *mtd, + struct nand_chip *chip, const u8 *buf, + int oob_on, int page) +{ + return mtk_nfc_write_page(mtd, chip, buf, page, 0); +} + +static int mtk_nfc_write_page_raw(struct mtd_info *mtd, struct nand_chip *chip, + const u8 *buf, int oob_on, int pg) +{ + struct mtk_nfc *nfc = nand_get_controller_data(chip); + + mtk_nfc_format_page(mtd, buf); + return mtk_nfc_write_page(mtd, chip, nfc->buffer, pg, 1); +} + +static int mtk_nfc_write_subpage_hwecc(struct mtd_info *mtd, + struct nand_chip *chip, u32 offset, + u32 data_len, const u8 *buf, + int oob_on, int page) +{ + struct mtk_nfc *nfc = nand_get_controller_data(chip); + int ret; + + ret = mtk_nfc_format_subpage(mtd, offset, data_len, buf); + if (ret < 0) + return ret; + + /* use the data in the private buffer (now with FDM and CRC) */ + return mtk_nfc_write_page(mtd, chip, nfc->buffer, page, 1); +} + +static int mtk_nfc_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, + int page) +{ + int ret; + + chip->cmdfunc(mtd, NAND_CMD_SEQIN, 0x00, page); + + ret = mtk_nfc_write_page_raw(mtd, chip, NULL, 1, page); + if (ret < 0) + return -EIO; + + chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1); + ret = chip->waitfunc(mtd, chip); + + return ret & NAND_STATUS_FAIL ? -EIO : 0; +} + +static int mtk_nfc_update_ecc_stats(struct mtd_info *mtd, u8 *buf, u32 sectors) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + struct mtk_nfc *nfc = nand_get_controller_data(chip); + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + struct mtk_ecc_stats stats; + int rc, i; + + rc = nfi_readl(nfc, NFI_STA) & STA_EMP_PAGE; + if (rc) { + memset(buf, 0xff, sectors * chip->ecc.size); + for (i = 0; i < sectors; i++) + memset(oob_ptr(chip, i), 0xff, mtk_nand->fdm.reg_size); + return 0; + } + + mtk_ecc_get_stats(nfc->ecc, &stats, sectors); + mtd->ecc_stats.corrected += stats.corrected; + mtd->ecc_stats.failed += stats.failed; + + return stats.bitflips; +} + +static int mtk_nfc_read_subpage(struct mtd_info *mtd, struct nand_chip *chip, + u32 data_offs, u32 readlen, + u8 *bufpoi, int page, int raw) +{ + struct mtk_nfc *nfc = nand_get_controller_data(chip); + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + u32 spare = mtk_nand->spare_per_sector; + u32 column, sectors, start, end, reg; + dma_addr_t addr; + int bitflips; + size_t len; + u8 *buf; + int rc; + + start = data_offs / chip->ecc.size; + end = DIV_ROUND_UP(data_offs + readlen, chip->ecc.size); + + sectors = end - start; + column = start * (chip->ecc.size + spare); + + len = sectors * chip->ecc.size + (raw ? sectors * spare : 0); + buf = bufpoi + start * chip->ecc.size; + + if (column != 0) + chip->cmdfunc(mtd, NAND_CMD_RNDOUT, column, -1); + + addr = dma_map_single(nfc->dev, buf, len, DMA_FROM_DEVICE); + rc = dma_mapping_error(nfc->dev, addr); + if (rc) { + dev_err(nfc->dev, "dma mapping error\n"); + + return -EINVAL; + } + + reg = nfi_readw(nfc, NFI_CNFG); + reg |= CNFG_READ_EN | CNFG_DMA_BURST_EN | CNFG_AHB; + if (!raw) { + reg |= CNFG_AUTO_FMT_EN | CNFG_HW_ECC_EN; + nfi_writew(nfc, reg, NFI_CNFG); + + nfc->ecc_cfg.mode = ECC_NFI_MODE; + nfc->ecc_cfg.sectors = sectors; + nfc->ecc_cfg.op = ECC_DECODE; + rc = mtk_ecc_enable(nfc->ecc, &nfc->ecc_cfg); + if (rc) { + dev_err(nfc->dev, "ecc enable\n"); + /* clear NFI_CNFG */ + reg &= ~(CNFG_DMA_BURST_EN | CNFG_AHB | CNFG_READ_EN | + CNFG_AUTO_FMT_EN | CNFG_HW_ECC_EN); + nfi_writew(nfc, reg, NFI_CNFG); + dma_unmap_single(nfc->dev, addr, len, DMA_FROM_DEVICE); + + return rc; + } + } else { + nfi_writew(nfc, reg, NFI_CNFG); + } + + nfi_writel(nfc, sectors << CON_SEC_SHIFT, NFI_CON); + nfi_writew(nfc, INTR_AHB_DONE_EN, NFI_INTR_EN); + nfi_writel(nfc, lower_32_bits(addr), NFI_STRADDR); + + init_completion(&nfc->done); + reg = nfi_readl(nfc, NFI_CON) | CON_BRD; + nfi_writel(nfc, reg, NFI_CON); + nfi_writew(nfc, STAR_EN, NFI_STRDATA); + + rc = wait_for_completion_timeout(&nfc->done, msecs_to_jiffies(500)); + if (!rc) + dev_warn(nfc->dev, "read ahb/dma done timeout\n"); + + rc = readl_poll_timeout_atomic(nfc->regs + NFI_BYTELEN, reg, + (reg & CNTR_MASK) >= sectors, 10, + MTK_TIMEOUT); + if (rc < 0) { + dev_err(nfc->dev, "subpage done timeout\n"); + bitflips = -EIO; + } else { + bitflips = 0; + if (!raw) { + rc = mtk_ecc_wait_done(nfc->ecc, ECC_DECODE); + bitflips = rc < 0 ? -ETIMEDOUT : + mtk_nfc_update_ecc_stats(mtd, buf, sectors); + mtk_nfc_read_fdm(chip, start, sectors); + } + } + + dma_unmap_single(nfc->dev, addr, len, DMA_FROM_DEVICE); + + if (raw) + goto done; + + mtk_ecc_disable(nfc->ecc); + + if (clamp(mtk_nand->bad_mark.sec, start, end) == mtk_nand->bad_mark.sec) + mtk_nand->bad_mark.bm_swap(mtd, bufpoi, raw); +done: + nfi_writel(nfc, 0, NFI_CON); + + return bitflips; +} + +static int mtk_nfc_read_subpage_hwecc(struct mtd_info *mtd, + struct nand_chip *chip, u32 off, + u32 len, u8 *p, int pg) +{ + return mtk_nfc_read_subpage(mtd, chip, off, len, p, pg, 0); +} + +static int mtk_nfc_read_page_hwecc(struct mtd_info *mtd, + struct nand_chip *chip, u8 *p, + int oob_on, int pg) +{ + return mtk_nfc_read_subpage(mtd, chip, 0, mtd->writesize, p, pg, 0); +} + +static int mtk_nfc_read_page_raw(struct mtd_info *mtd, struct nand_chip *chip, + u8 *buf, int oob_on, int page) +{ + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + struct mtk_nfc *nfc = nand_get_controller_data(chip); + struct mtk_nfc_fdm *fdm = &mtk_nand->fdm; + int i, ret; + + memset(nfc->buffer, 0xff, mtd->writesize + mtd->oobsize); + ret = mtk_nfc_read_subpage(mtd, chip, 0, mtd->writesize, nfc->buffer, + page, 1); + if (ret < 0) + return ret; + + for (i = 0; i < chip->ecc.steps; i++) { + memcpy(oob_ptr(chip, i), mtk_oob_ptr(chip, i), fdm->reg_size); + + if (i == mtk_nand->bad_mark.sec) + mtk_nand->bad_mark.bm_swap(mtd, nfc->buffer, 1); + + if (buf) + memcpy(data_ptr(chip, buf, i), mtk_data_ptr(chip, i), + chip->ecc.size); + } + + return ret; +} + +static int mtk_nfc_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, + int page) +{ + chip->cmdfunc(mtd, NAND_CMD_READ0, 0, page); + + return mtk_nfc_read_page_raw(mtd, chip, NULL, 1, page); +} + +static inline void mtk_nfc_hw_init(struct mtk_nfc *nfc) +{ + /* + * ACCON: access timing control register + * ------------------------------------- + * 31:28: minimum required time for CS post pulling down after accessing + * the device + * 27:22: minimum required time for CS pre pulling down before accessing + * the device + * 21:16: minimum required time from NCEB low to NREB low + * 15:12: minimum required time from NWEB high to NREB low. + * 11:08: write enable hold time + * 07:04: write wait states + * 03:00: read wait states + */ + nfi_writel(nfc, 0x10804211, NFI_ACCCON); + + /* + * CNRNB: nand ready/busy register + * ------------------------------- + * 7:4: timeout register for polling the NAND busy/ready signal + * 0 : poll the status of the busy/ready signal after [7:4]*16 cycles. + */ + nfi_writew(nfc, 0xf1, NFI_CNRNB); + nfi_writew(nfc, PAGEFMT_8K_16K, NFI_PAGEFMT); + + mtk_nfc_hw_reset(nfc); + + nfi_readl(nfc, NFI_INTR_STA); + nfi_writel(nfc, 0, NFI_INTR_EN); +} + +static irqreturn_t mtk_nfc_irq(int irq, void *id) +{ + struct mtk_nfc *nfc = id; + u16 sta, ien; + + sta = nfi_readw(nfc, NFI_INTR_STA); + ien = nfi_readw(nfc, NFI_INTR_EN); + + if (!(sta & ien)) + return IRQ_NONE; + + nfi_writew(nfc, ~sta & ien, NFI_INTR_EN); + complete(&nfc->done); + + return IRQ_HANDLED; +} + +static int mtk_nfc_enable_clk(struct device *dev, struct mtk_nfc_clk *clk) +{ + int ret; + + ret = clk_prepare_enable(clk->nfi_clk); + if (ret) { + dev_err(dev, "failed to enable nfi clk\n"); + return ret; + } + + ret = clk_prepare_enable(clk->pad_clk); + if (ret) { + dev_err(dev, "failed to enable pad clk\n"); + clk_disable_unprepare(clk->nfi_clk); + return ret; + } + + return 0; +} + +static void mtk_nfc_disable_clk(struct mtk_nfc_clk *clk) +{ + clk_disable_unprepare(clk->nfi_clk); + clk_disable_unprepare(clk->pad_clk); +} + +static int mtk_nfc_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *oob_region) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + struct mtk_nfc_fdm *fdm = &mtk_nand->fdm; + u32 eccsteps; + + eccsteps = mtd->writesize / chip->ecc.size; + + if (section >= eccsteps) + return -ERANGE; + + oob_region->length = fdm->reg_size - fdm->ecc_size; + oob_region->offset = section * fdm->reg_size + fdm->ecc_size; + + return 0; +} + +static int mtk_nfc_ooblayout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *oob_region) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip); + u32 eccsteps; + + if (section) + return -ERANGE; + + eccsteps = mtd->writesize / chip->ecc.size; + oob_region->offset = mtk_nand->fdm.reg_size * eccsteps; + oob_region->length = mtd->oobsize - oob_region->offset; + + return 0; +} + +static const struct mtd_ooblayout_ops mtk_nfc_ooblayout_ops = { + .free = mtk_nfc_ooblayout_free, + .ecc = mtk_nfc_ooblayout_ecc, +}; + +static void mtk_nfc_set_fdm(struct mtk_nfc_fdm *fdm, struct mtd_info *mtd) +{ + struct nand_chip *nand = mtd_to_nand(mtd); + struct mtk_nfc_nand_chip *chip = to_mtk_nand(nand); + u32 ecc_bytes; + + ecc_bytes = DIV_ROUND_UP(nand->ecc.strength * ECC_PARITY_BITS, 8); + + fdm->reg_size = chip->spare_per_sector - ecc_bytes; + if (fdm->reg_size > NFI_FDM_MAX_SIZE) + fdm->reg_size = NFI_FDM_MAX_SIZE; + + /* bad block mark storage */ + fdm->ecc_size = 1; +} + +static void mtk_nfc_set_bad_mark_ctl(struct mtk_nfc_bad_mark_ctl *bm_ctl, + struct mtd_info *mtd) +{ + struct nand_chip *nand = mtd_to_nand(mtd); + + if (mtd->writesize == 512) { + bm_ctl->bm_swap = mtk_nfc_no_bad_mark_swap; + } else { + bm_ctl->bm_swap = mtk_nfc_bad_mark_swap; + bm_ctl->sec = mtd->writesize / mtk_data_len(nand); + bm_ctl->pos = mtd->writesize % mtk_data_len(nand); + } +} + +static void mtk_nfc_set_spare_per_sector(u32 *sps, struct mtd_info *mtd) +{ + struct nand_chip *nand = mtd_to_nand(mtd); + u32 spare[] = {16, 26, 27, 28, 32, 36, 40, 44, + 48, 49, 50, 51, 52, 62, 63, 64}; + u32 eccsteps, i; + + eccsteps = mtd->writesize / nand->ecc.size; + *sps = mtd->oobsize / eccsteps; + + if (nand->ecc.size == 1024) + *sps >>= 1; + + for (i = 0; i < ARRAY_SIZE(spare); i++) { + if (*sps <= spare[i]) { + if (!i) + *sps = spare[i]; + else if (*sps != spare[i]) + *sps = spare[i - 1]; + break; + } + } + + if (i >= ARRAY_SIZE(spare)) + *sps = spare[ARRAY_SIZE(spare) - 1]; + + if (nand->ecc.size == 1024) + *sps <<= 1; +} + +static int mtk_nfc_ecc_init(struct device *dev, struct mtd_info *mtd) +{ + struct nand_chip *nand = mtd_to_nand(mtd); + u32 spare; + int free; + + /* support only ecc hw mode */ + if (nand->ecc.mode != NAND_ECC_HW) { + dev_err(dev, "ecc.mode not supported\n"); + return -EINVAL; + } + + /* if optional dt settings not present */ + if (!nand->ecc.size || !nand->ecc.strength) { + /* use datasheet requirements */ + nand->ecc.strength = nand->ecc_strength_ds; + nand->ecc.size = nand->ecc_step_ds; + + /* + * align eccstrength and eccsize + * this controller only supports 512 and 1024 sizes + */ + if (nand->ecc.size < 1024) { + if (mtd->writesize > 512) { + nand->ecc.size = 1024; + nand->ecc.strength <<= 1; + } else { + nand->ecc.size = 512; + } + } else { + nand->ecc.size = 1024; + } + + mtk_nfc_set_spare_per_sector(&spare, mtd); + + /* calculate oob bytes except ecc parity data */ + free = ((nand->ecc.strength * ECC_PARITY_BITS) + 7) >> 3; + free = spare - free; + + /* + * enhance ecc strength if oob left is bigger than max FDM size + * or reduce ecc strength if oob size is not enough for ecc + * parity data. + */ + if (free > NFI_FDM_MAX_SIZE) { + spare -= NFI_FDM_MAX_SIZE; + nand->ecc.strength = (spare << 3) / ECC_PARITY_BITS; + } else if (free < 0) { + spare -= NFI_FDM_MIN_SIZE; + nand->ecc.strength = (spare << 3) / ECC_PARITY_BITS; + } + } + + mtk_ecc_adjust_strength(&nand->ecc.strength); + + dev_info(dev, "eccsize %d eccstrength %d\n", + nand->ecc.size, nand->ecc.strength); + + return 0; +} + +static int mtk_nfc_nand_chip_init(struct device *dev, struct mtk_nfc *nfc, + struct device_node *np) +{ + struct mtk_nfc_nand_chip *chip; + struct nand_chip *nand; + struct mtd_info *mtd; + int nsels, len; + u32 tmp; + int ret; + int i; + + if (!of_get_property(np, "reg", &nsels)) + return -ENODEV; + + nsels /= sizeof(u32); + if (!nsels || nsels > MTK_NAND_MAX_NSELS) { + dev_err(dev, "invalid reg property size %d\n", nsels); + return -EINVAL; + } + + chip = devm_kzalloc(dev, sizeof(*chip) + nsels * sizeof(u8), + GFP_KERNEL); + if (!chip) + return -ENOMEM; + + chip->nsels = nsels; + for (i = 0; i < nsels; i++) { + ret = of_property_read_u32_index(np, "reg", i, &tmp); + if (ret) { + dev_err(dev, "reg property failure : %d\n", ret); + return ret; + } + chip->sels[i] = tmp; + } + + nand = &chip->nand; + nand->controller = &nfc->controller; + + nand_set_flash_node(nand, np); + nand_set_controller_data(nand, nfc); + + nand->options |= NAND_USE_BOUNCE_BUFFER | NAND_SUBPAGE_READ; + nand->dev_ready = mtk_nfc_dev_ready; + nand->select_chip = mtk_nfc_select_chip; + nand->write_byte = mtk_nfc_write_byte; + nand->write_buf = mtk_nfc_write_buf; + nand->read_byte = mtk_nfc_read_byte; + nand->read_buf = mtk_nfc_read_buf; + nand->cmd_ctrl = mtk_nfc_cmd_ctrl; + + /* set default mode in case dt entry is missing */ + nand->ecc.mode = NAND_ECC_HW; + + nand->ecc.write_subpage = mtk_nfc_write_subpage_hwecc; + nand->ecc.write_page_raw = mtk_nfc_write_page_raw; + nand->ecc.write_page = mtk_nfc_write_page_hwecc; + nand->ecc.write_oob_raw = mtk_nfc_write_oob_std; + nand->ecc.write_oob = mtk_nfc_write_oob_std; + + nand->ecc.read_subpage = mtk_nfc_read_subpage_hwecc; + nand->ecc.read_page_raw = mtk_nfc_read_page_raw; + nand->ecc.read_page = mtk_nfc_read_page_hwecc; + nand->ecc.read_oob_raw = mtk_nfc_read_oob_std; + nand->ecc.read_oob = mtk_nfc_read_oob_std; + + mtd = nand_to_mtd(nand); + mtd->owner = THIS_MODULE; + mtd->dev.parent = dev; + mtd->name = MTK_NAME; + mtd_set_ooblayout(mtd, &mtk_nfc_ooblayout_ops); + + mtk_nfc_hw_init(nfc); + + ret = nand_scan_ident(mtd, nsels, NULL); + if (ret) + return -ENODEV; + + /* store bbt magic in page, cause OOB is not protected */ + if (nand->bbt_options & NAND_BBT_USE_FLASH) + nand->bbt_options |= NAND_BBT_NO_OOB; + + ret = mtk_nfc_ecc_init(dev, mtd); + if (ret) + return -EINVAL; + + if (nand->options & NAND_BUSWIDTH_16) { + dev_err(dev, "16bits buswidth not supported"); + return -EINVAL; + } + + mtk_nfc_set_spare_per_sector(&chip->spare_per_sector, mtd); + mtk_nfc_set_fdm(&chip->fdm, mtd); + mtk_nfc_set_bad_mark_ctl(&chip->bad_mark, mtd); + + len = mtd->writesize + mtd->oobsize; + nfc->buffer = devm_kzalloc(dev, len, GFP_KERNEL); + if (!nfc->buffer) + return -ENOMEM; + + ret = nand_scan_tail(mtd); + if (ret) + return -ENODEV; + + ret = mtd_device_parse_register(mtd, NULL, NULL, NULL, 0); + if (ret) { + dev_err(dev, "mtd parse partition error\n"); + nand_release(mtd); + return ret; + } + + list_add_tail(&chip->node, &nfc->chips); + + return 0; +} + +static int mtk_nfc_nand_chips_init(struct device *dev, struct mtk_nfc *nfc) +{ + struct device_node *np = dev->of_node; + struct device_node *nand_np; + int ret; + + for_each_child_of_node(np, nand_np) { + ret = mtk_nfc_nand_chip_init(dev, nfc, nand_np); + if (ret) { + of_node_put(nand_np); + return ret; + } + } + + return 0; +} + +static int mtk_nfc_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + struct mtk_nfc *nfc; + struct resource *res; + int ret, irq; + + nfc = devm_kzalloc(dev, sizeof(*nfc), GFP_KERNEL); + if (!nfc) + return -ENOMEM; + + spin_lock_init(&nfc->controller.lock); + init_waitqueue_head(&nfc->controller.wq); + INIT_LIST_HEAD(&nfc->chips); + + /* probe defer if not ready */ + nfc->ecc = of_mtk_ecc_get(np); + if (IS_ERR(nfc->ecc)) + return PTR_ERR(nfc->ecc); + else if (!nfc->ecc) + return -ENODEV; + + nfc->dev = dev; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + nfc->regs = devm_ioremap_resource(dev, res); + if (IS_ERR(nfc->regs)) { + ret = PTR_ERR(nfc->regs); + dev_err(dev, "no nfi base\n"); + goto release_ecc; + } + + nfc->clk.nfi_clk = devm_clk_get(dev, "nfi_clk"); + if (IS_ERR(nfc->clk.nfi_clk)) { + dev_err(dev, "no clk\n"); + ret = PTR_ERR(nfc->clk.nfi_clk); + goto release_ecc; + } + + nfc->clk.pad_clk = devm_clk_get(dev, "pad_clk"); + if (IS_ERR(nfc->clk.pad_clk)) { + dev_err(dev, "no pad clk\n"); + ret = PTR_ERR(nfc->clk.pad_clk); + goto release_ecc; + } + + ret = mtk_nfc_enable_clk(dev, &nfc->clk); + if (ret) + goto release_ecc; + + irq = platform_get_irq(pdev, 0); + if (irq < 0) { + dev_err(dev, "no nfi irq resource\n"); + ret = -EINVAL; + goto clk_disable; + } + + ret = devm_request_irq(dev, irq, mtk_nfc_irq, 0x0, "mtk-nand", nfc); + if (ret) { + dev_err(dev, "failed to request nfi irq\n"); + goto clk_disable; + } + + ret = dma_set_mask(dev, DMA_BIT_MASK(32)); + if (ret) { + dev_err(dev, "failed to set dma mask\n"); + goto clk_disable; + } + + platform_set_drvdata(pdev, nfc); + + ret = mtk_nfc_nand_chips_init(dev, nfc); + if (ret) { + dev_err(dev, "failed to init nand chips\n"); + goto clk_disable; + } + + return 0; + +clk_disable: + mtk_nfc_disable_clk(&nfc->clk); + +release_ecc: + mtk_ecc_release(nfc->ecc); + + return ret; +} + +static int mtk_nfc_remove(struct platform_device *pdev) +{ + struct mtk_nfc *nfc = platform_get_drvdata(pdev); + struct mtk_nfc_nand_chip *chip; + + while (!list_empty(&nfc->chips)) { + chip = list_first_entry(&nfc->chips, struct mtk_nfc_nand_chip, + node); + nand_release(nand_to_mtd(&chip->nand)); + list_del(&chip->node); + } + + mtk_ecc_release(nfc->ecc); + mtk_nfc_disable_clk(&nfc->clk); + + return 0; +} + +#ifdef CONFIG_PM_SLEEP +static int mtk_nfc_suspend(struct device *dev) +{ + struct mtk_nfc *nfc = dev_get_drvdata(dev); + + mtk_nfc_disable_clk(&nfc->clk); + + return 0; +} + +static int mtk_nfc_resume(struct device *dev) +{ + struct mtk_nfc *nfc = dev_get_drvdata(dev); + struct mtk_nfc_nand_chip *chip; + struct nand_chip *nand; + struct mtd_info *mtd; + int ret; + u32 i; + + udelay(200); + + ret = mtk_nfc_enable_clk(dev, &nfc->clk); + if (ret) + return ret; + + mtk_nfc_hw_init(nfc); + + /* reset NAND chip if VCC was powered off */ + list_for_each_entry(chip, &nfc->chips, node) { + nand = &chip->nand; + mtd = nand_to_mtd(nand); + for (i = 0; i < chip->nsels; i++) { + nand->select_chip(mtd, i); + nand->cmdfunc(mtd, NAND_CMD_RESET, -1, -1); + } + } + + return 0; +} + +static SIMPLE_DEV_PM_OPS(mtk_nfc_pm_ops, mtk_nfc_suspend, mtk_nfc_resume); +#endif + +static const struct of_device_id mtk_nfc_id_table[] = { + { .compatible = "mediatek,mt2701-nfc" }, + {} +}; +MODULE_DEVICE_TABLE(of, mtk_nfc_id_table); + +static struct platform_driver mtk_nfc_driver = { + .probe = mtk_nfc_probe, + .remove = mtk_nfc_remove, + .driver = { + .name = MTK_NAME, + .of_match_table = mtk_nfc_id_table, +#ifdef CONFIG_PM_SLEEP + .pm = &mtk_nfc_pm_ops, +#endif + }, +}; + +module_platform_driver(mtk_nfc_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Xiaolei Li "); +MODULE_DESCRIPTION("MTK Nand Flash Controller Driver"); diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 0b0dc29d2af7..77533f7f2429 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2610,7 +2610,7 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, int cached = writelen > bytes && page != blockmask; uint8_t *wbuf = buf; int use_bufpoi; - int part_pagewr = (column || writelen < (mtd->writesize - 1)); + int part_pagewr = (column || writelen < mtd->writesize); if (part_pagewr) use_bufpoi = 1; diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index ccc05f5b2695..2af9869a115e 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -168,6 +168,7 @@ struct nand_flash_dev nand_flash_ids[] = { /* Manufacturer IDs */ struct nand_manufacturers nand_manuf_ids[] = { {NAND_MFR_TOSHIBA, "Toshiba"}, + {NAND_MFR_ESMT, "ESMT"}, {NAND_MFR_SAMSUNG, "Samsung"}, {NAND_MFR_FUJITSU, "Fujitsu"}, {NAND_MFR_NATIONAL, "National"}, diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c index a136da8df6fe..a59361c36f40 100644 --- a/drivers/mtd/nand/omap2.c +++ b/drivers/mtd/nand/omap2.c @@ -118,8 +118,6 @@ #define PREFETCH_STATUS_FIFO_CNT(val) ((val >> 24) & 0x7F) #define STATUS_BUFF_EMPTY 0x00000001 -#define OMAP24XX_DMA_GPMC 4 - #define SECTOR_BYTES 512 /* 4 bit padding to make byte aligned, 56 = 52 + 4 */ #define BCH4_BIT_PAD 4 @@ -1811,7 +1809,6 @@ static int omap_nand_probe(struct platform_device *pdev) struct nand_chip *nand_chip; int err; dma_cap_mask_t mask; - unsigned sig; struct resource *res; struct device *dev = &pdev->dev; int min_oobbytes = BADBLOCK_MARKER_LENGTH; @@ -1924,11 +1921,11 @@ static int omap_nand_probe(struct platform_device *pdev) case NAND_OMAP_PREFETCH_DMA: dma_cap_zero(mask); dma_cap_set(DMA_SLAVE, mask); - sig = OMAP24XX_DMA_GPMC; - info->dma = dma_request_channel(mask, omap_dma_filter_fn, &sig); - if (!info->dma) { + info->dma = dma_request_chan(pdev->dev.parent, "rxtx"); + + if (IS_ERR(info->dma)) { dev_err(&pdev->dev, "DMA engine request failed\n"); - err = -ENXIO; + err = PTR_ERR(info->dma); goto return_error; } else { struct dma_slave_config cfg; diff --git a/drivers/mtd/nand/sunxi_nand.c b/drivers/mtd/nand/sunxi_nand.c index a83a690688b4..e414b31b71c1 100644 --- a/drivers/mtd/nand/sunxi_nand.c +++ b/drivers/mtd/nand/sunxi_nand.c @@ -39,6 +39,7 @@ #include #include #include +#include #define NFC_REG_CTL 0x0000 #define NFC_REG_ST 0x0004 @@ -153,6 +154,7 @@ /* define bit use in NFC_ECC_ST */ #define NFC_ECC_ERR(x) BIT(x) +#define NFC_ECC_ERR_MSK GENMASK(15, 0) #define NFC_ECC_PAT_FOUND(x) BIT(x + 16) #define NFC_ECC_ERR_CNT(b, x) (((x) >> (((b) % 4) * 8)) & 0xff) @@ -269,10 +271,12 @@ struct sunxi_nfc { void __iomem *regs; struct clk *ahb_clk; struct clk *mod_clk; + struct reset_control *reset; unsigned long assigned_cs; unsigned long clk_rate; struct list_head chips; struct completion complete; + struct dma_chan *dmac; }; static inline struct sunxi_nfc *to_sunxi_nfc(struct nand_hw_control *ctrl) @@ -365,6 +369,67 @@ static int sunxi_nfc_rst(struct sunxi_nfc *nfc) return ret; } +static int sunxi_nfc_dma_op_prepare(struct mtd_info *mtd, const void *buf, + int chunksize, int nchunks, + enum dma_data_direction ddir, + struct scatterlist *sg) +{ + struct nand_chip *nand = mtd_to_nand(mtd); + struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller); + struct dma_async_tx_descriptor *dmad; + enum dma_transfer_direction tdir; + dma_cookie_t dmat; + int ret; + + if (ddir == DMA_FROM_DEVICE) + tdir = DMA_DEV_TO_MEM; + else + tdir = DMA_MEM_TO_DEV; + + sg_init_one(sg, buf, nchunks * chunksize); + ret = dma_map_sg(nfc->dev, sg, 1, ddir); + if (!ret) + return -ENOMEM; + + dmad = dmaengine_prep_slave_sg(nfc->dmac, sg, 1, tdir, DMA_CTRL_ACK); + if (!dmad) { + ret = -EINVAL; + goto err_unmap_buf; + } + + writel(readl(nfc->regs + NFC_REG_CTL) | NFC_RAM_METHOD, + nfc->regs + NFC_REG_CTL); + writel(nchunks, nfc->regs + NFC_REG_SECTOR_NUM); + writel(chunksize, nfc->regs + NFC_REG_CNT); + dmat = dmaengine_submit(dmad); + + ret = dma_submit_error(dmat); + if (ret) + goto err_clr_dma_flag; + + return 0; + +err_clr_dma_flag: + writel(readl(nfc->regs + NFC_REG_CTL) & ~NFC_RAM_METHOD, + nfc->regs + NFC_REG_CTL); + +err_unmap_buf: + dma_unmap_sg(nfc->dev, sg, 1, ddir); + return ret; +} + +static void sunxi_nfc_dma_op_cleanup(struct mtd_info *mtd, + enum dma_data_direction ddir, + struct scatterlist *sg) +{ + struct nand_chip *nand = mtd_to_nand(mtd); + struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller); + + dma_unmap_sg(nfc->dev, sg, 1, ddir); + writel(readl(nfc->regs + NFC_REG_CTL) & ~NFC_RAM_METHOD, + nfc->regs + NFC_REG_CTL); +} + static int sunxi_nfc_dev_ready(struct mtd_info *mtd) { struct nand_chip *nand = mtd_to_nand(mtd); @@ -822,17 +887,15 @@ static void sunxi_nfc_hw_ecc_update_stats(struct mtd_info *mtd, } static int sunxi_nfc_hw_ecc_correct(struct mtd_info *mtd, u8 *data, u8 *oob, - int step, bool *erased) + int step, u32 status, bool *erased) { struct nand_chip *nand = mtd_to_nand(mtd); struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller); struct nand_ecc_ctrl *ecc = &nand->ecc; - u32 status, tmp; + u32 tmp; *erased = false; - status = readl(nfc->regs + NFC_REG_ECC_ST); - if (status & NFC_ECC_ERR(step)) return -EBADMSG; @@ -898,6 +961,7 @@ static int sunxi_nfc_hw_ecc_read_chunk(struct mtd_info *mtd, *cur_off = oob_off + ecc->bytes + 4; ret = sunxi_nfc_hw_ecc_correct(mtd, data, oob_required ? oob : NULL, 0, + readl(nfc->regs + NFC_REG_ECC_ST), &erased); if (erased) return 1; @@ -967,6 +1031,130 @@ static void sunxi_nfc_hw_ecc_read_extra_oob(struct mtd_info *mtd, *cur_off = mtd->oobsize + mtd->writesize; } +static int sunxi_nfc_hw_ecc_read_chunks_dma(struct mtd_info *mtd, uint8_t *buf, + int oob_required, int page, + int nchunks) +{ + struct nand_chip *nand = mtd_to_nand(mtd); + bool randomized = nand->options & NAND_NEED_SCRAMBLING; + struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller); + struct nand_ecc_ctrl *ecc = &nand->ecc; + unsigned int max_bitflips = 0; + int ret, i, raw_mode = 0; + struct scatterlist sg; + u32 status; + + ret = sunxi_nfc_wait_cmd_fifo_empty(nfc); + if (ret) + return ret; + + ret = sunxi_nfc_dma_op_prepare(mtd, buf, ecc->size, nchunks, + DMA_FROM_DEVICE, &sg); + if (ret) + return ret; + + sunxi_nfc_hw_ecc_enable(mtd); + sunxi_nfc_randomizer_config(mtd, page, false); + sunxi_nfc_randomizer_enable(mtd); + + writel((NAND_CMD_RNDOUTSTART << 16) | (NAND_CMD_RNDOUT << 8) | + NAND_CMD_READSTART, nfc->regs + NFC_REG_RCMD_SET); + + dma_async_issue_pending(nfc->dmac); + + writel(NFC_PAGE_OP | NFC_DATA_SWAP_METHOD | NFC_DATA_TRANS, + nfc->regs + NFC_REG_CMD); + + ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0); + if (ret) + dmaengine_terminate_all(nfc->dmac); + + sunxi_nfc_randomizer_disable(mtd); + sunxi_nfc_hw_ecc_disable(mtd); + + sunxi_nfc_dma_op_cleanup(mtd, DMA_FROM_DEVICE, &sg); + + if (ret) + return ret; + + status = readl(nfc->regs + NFC_REG_ECC_ST); + + for (i = 0; i < nchunks; i++) { + int data_off = i * ecc->size; + int oob_off = i * (ecc->bytes + 4); + u8 *data = buf + data_off; + u8 *oob = nand->oob_poi + oob_off; + bool erased; + + ret = sunxi_nfc_hw_ecc_correct(mtd, randomized ? data : NULL, + oob_required ? oob : NULL, + i, status, &erased); + + /* ECC errors are handled in the second loop. */ + if (ret < 0) + continue; + + if (oob_required && !erased) { + /* TODO: use DMA to retrieve OOB */ + nand->cmdfunc(mtd, NAND_CMD_RNDOUT, + mtd->writesize + oob_off, -1); + nand->read_buf(mtd, oob, ecc->bytes + 4); + + sunxi_nfc_hw_ecc_get_prot_oob_bytes(mtd, oob, i, + !i, page); + } + + if (erased) + raw_mode = 1; + + sunxi_nfc_hw_ecc_update_stats(mtd, &max_bitflips, ret); + } + + if (status & NFC_ECC_ERR_MSK) { + for (i = 0; i < nchunks; i++) { + int data_off = i * ecc->size; + int oob_off = i * (ecc->bytes + 4); + u8 *data = buf + data_off; + u8 *oob = nand->oob_poi + oob_off; + + if (!(status & NFC_ECC_ERR(i))) + continue; + + /* + * Re-read the data with the randomizer disabled to + * identify bitflips in erased pages. + */ + if (randomized) { + /* TODO: use DMA to read page in raw mode */ + nand->cmdfunc(mtd, NAND_CMD_RNDOUT, + data_off, -1); + nand->read_buf(mtd, data, ecc->size); + } + + /* TODO: use DMA to retrieve OOB */ + nand->cmdfunc(mtd, NAND_CMD_RNDOUT, + mtd->writesize + oob_off, -1); + nand->read_buf(mtd, oob, ecc->bytes + 4); + + ret = nand_check_erased_ecc_chunk(data, ecc->size, + oob, ecc->bytes + 4, + NULL, 0, + ecc->strength); + if (ret >= 0) + raw_mode = 1; + + sunxi_nfc_hw_ecc_update_stats(mtd, &max_bitflips, ret); + } + } + + if (oob_required) + sunxi_nfc_hw_ecc_read_extra_oob(mtd, nand->oob_poi, + NULL, !raw_mode, + page); + + return max_bitflips; +} + static int sunxi_nfc_hw_ecc_write_chunk(struct mtd_info *mtd, const u8 *data, int data_off, const u8 *oob, int oob_off, @@ -1065,6 +1253,23 @@ static int sunxi_nfc_hw_ecc_read_page(struct mtd_info *mtd, return max_bitflips; } +static int sunxi_nfc_hw_ecc_read_page_dma(struct mtd_info *mtd, + struct nand_chip *chip, u8 *buf, + int oob_required, int page) +{ + int ret; + + ret = sunxi_nfc_hw_ecc_read_chunks_dma(mtd, buf, oob_required, page, + chip->ecc.steps); + if (ret >= 0) + return ret; + + /* Fallback to PIO mode */ + chip->cmdfunc(mtd, NAND_CMD_RNDOUT, 0, -1); + + return sunxi_nfc_hw_ecc_read_page(mtd, chip, buf, oob_required, page); +} + static int sunxi_nfc_hw_ecc_read_subpage(struct mtd_info *mtd, struct nand_chip *chip, u32 data_offs, u32 readlen, @@ -1098,6 +1303,25 @@ static int sunxi_nfc_hw_ecc_read_subpage(struct mtd_info *mtd, return max_bitflips; } +static int sunxi_nfc_hw_ecc_read_subpage_dma(struct mtd_info *mtd, + struct nand_chip *chip, + u32 data_offs, u32 readlen, + u8 *buf, int page) +{ + int nchunks = DIV_ROUND_UP(data_offs + readlen, chip->ecc.size); + int ret; + + ret = sunxi_nfc_hw_ecc_read_chunks_dma(mtd, buf, false, page, nchunks); + if (ret >= 0) + return ret; + + /* Fallback to PIO mode */ + chip->cmdfunc(mtd, NAND_CMD_RNDOUT, 0, -1); + + return sunxi_nfc_hw_ecc_read_subpage(mtd, chip, data_offs, readlen, + buf, page); +} + static int sunxi_nfc_hw_ecc_write_page(struct mtd_info *mtd, struct nand_chip *chip, const uint8_t *buf, int oob_required, @@ -1130,6 +1354,99 @@ static int sunxi_nfc_hw_ecc_write_page(struct mtd_info *mtd, return 0; } +static int sunxi_nfc_hw_ecc_write_subpage(struct mtd_info *mtd, + struct nand_chip *chip, + u32 data_offs, u32 data_len, + const u8 *buf, int oob_required, + int page) +{ + struct nand_ecc_ctrl *ecc = &chip->ecc; + int ret, i, cur_off = 0; + + sunxi_nfc_hw_ecc_enable(mtd); + + for (i = data_offs / ecc->size; + i < DIV_ROUND_UP(data_offs + data_len, ecc->size); i++) { + int data_off = i * ecc->size; + int oob_off = i * (ecc->bytes + 4); + const u8 *data = buf + data_off; + const u8 *oob = chip->oob_poi + oob_off; + + ret = sunxi_nfc_hw_ecc_write_chunk(mtd, data, data_off, oob, + oob_off + mtd->writesize, + &cur_off, !i, page); + if (ret) + return ret; + } + + sunxi_nfc_hw_ecc_disable(mtd); + + return 0; +} + +static int sunxi_nfc_hw_ecc_write_page_dma(struct mtd_info *mtd, + struct nand_chip *chip, + const u8 *buf, + int oob_required, + int page) +{ + struct nand_chip *nand = mtd_to_nand(mtd); + struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller); + struct nand_ecc_ctrl *ecc = &nand->ecc; + struct scatterlist sg; + int ret, i; + + ret = sunxi_nfc_wait_cmd_fifo_empty(nfc); + if (ret) + return ret; + + ret = sunxi_nfc_dma_op_prepare(mtd, buf, ecc->size, ecc->steps, + DMA_TO_DEVICE, &sg); + if (ret) + goto pio_fallback; + + for (i = 0; i < ecc->steps; i++) { + const u8 *oob = nand->oob_poi + (i * (ecc->bytes + 4)); + + sunxi_nfc_hw_ecc_set_prot_oob_bytes(mtd, oob, i, !i, page); + } + + sunxi_nfc_hw_ecc_enable(mtd); + sunxi_nfc_randomizer_config(mtd, page, false); + sunxi_nfc_randomizer_enable(mtd); + + writel((NAND_CMD_RNDIN << 8) | NAND_CMD_PAGEPROG, + nfc->regs + NFC_REG_RCMD_SET); + + dma_async_issue_pending(nfc->dmac); + + writel(NFC_PAGE_OP | NFC_DATA_SWAP_METHOD | + NFC_DATA_TRANS | NFC_ACCESS_DIR, + nfc->regs + NFC_REG_CMD); + + ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0); + if (ret) + dmaengine_terminate_all(nfc->dmac); + + sunxi_nfc_randomizer_disable(mtd); + sunxi_nfc_hw_ecc_disable(mtd); + + sunxi_nfc_dma_op_cleanup(mtd, DMA_TO_DEVICE, &sg); + + if (ret) + return ret; + + if (oob_required || (chip->options & NAND_NEED_SCRAMBLING)) + /* TODO: use DMA to transfer extra OOB bytes ? */ + sunxi_nfc_hw_ecc_write_extra_oob(mtd, chip->oob_poi, + NULL, page); + + return 0; + +pio_fallback: + return sunxi_nfc_hw_ecc_write_page(mtd, chip, buf, oob_required, page); +} + static int sunxi_nfc_hw_syndrome_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip, uint8_t *buf, int oob_required, @@ -1497,10 +1814,19 @@ static int sunxi_nand_hw_common_ecc_ctrl_init(struct mtd_info *mtd, int ret; int i; + if (ecc->size != 512 && ecc->size != 1024) + return -EINVAL; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; + /* Prefer 1k ECC chunk over 512 ones */ + if (ecc->size == 512 && mtd->writesize > 512) { + ecc->size = 1024; + ecc->strength *= 2; + } + /* Add ECC info retrieval from DT */ for (i = 0; i < ARRAY_SIZE(strengths); i++) { if (ecc->strength <= strengths[i]) @@ -1550,14 +1876,28 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct mtd_info *mtd, struct nand_ecc_ctrl *ecc, struct device_node *np) { + struct nand_chip *nand = mtd_to_nand(mtd); + struct sunxi_nand_chip *sunxi_nand = to_sunxi_nand(nand); + struct sunxi_nfc *nfc = to_sunxi_nfc(sunxi_nand->nand.controller); int ret; ret = sunxi_nand_hw_common_ecc_ctrl_init(mtd, ecc, np); if (ret) return ret; - ecc->read_page = sunxi_nfc_hw_ecc_read_page; - ecc->write_page = sunxi_nfc_hw_ecc_write_page; + if (nfc->dmac) { + ecc->read_page = sunxi_nfc_hw_ecc_read_page_dma; + ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage_dma; + ecc->write_page = sunxi_nfc_hw_ecc_write_page_dma; + nand->options |= NAND_USE_BOUNCE_BUFFER; + } else { + ecc->read_page = sunxi_nfc_hw_ecc_read_page; + ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage; + ecc->write_page = sunxi_nfc_hw_ecc_write_page; + } + + /* TODO: support DMA for raw accesses and subpage write */ + ecc->write_subpage = sunxi_nfc_hw_ecc_write_subpage; ecc->read_oob_raw = nand_read_oob_std; ecc->write_oob_raw = nand_write_oob_std; ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage; @@ -1871,26 +2211,59 @@ static int sunxi_nfc_probe(struct platform_device *pdev) if (ret) goto out_ahb_clk_unprepare; + nfc->reset = devm_reset_control_get_optional(dev, "ahb"); + if (!IS_ERR(nfc->reset)) { + ret = reset_control_deassert(nfc->reset); + if (ret) { + dev_err(dev, "reset err %d\n", ret); + goto out_mod_clk_unprepare; + } + } else if (PTR_ERR(nfc->reset) != -ENOENT) { + ret = PTR_ERR(nfc->reset); + goto out_mod_clk_unprepare; + } + ret = sunxi_nfc_rst(nfc); if (ret) - goto out_mod_clk_unprepare; + goto out_ahb_reset_reassert; writel(0, nfc->regs + NFC_REG_INT); ret = devm_request_irq(dev, irq, sunxi_nfc_interrupt, 0, "sunxi-nand", nfc); if (ret) - goto out_mod_clk_unprepare; + goto out_ahb_reset_reassert; + + nfc->dmac = dma_request_slave_channel(dev, "rxtx"); + if (nfc->dmac) { + struct dma_slave_config dmac_cfg = { }; + + dmac_cfg.src_addr = r->start + NFC_REG_IO_DATA; + dmac_cfg.dst_addr = dmac_cfg.src_addr; + dmac_cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; + dmac_cfg.dst_addr_width = dmac_cfg.src_addr_width; + dmac_cfg.src_maxburst = 4; + dmac_cfg.dst_maxburst = 4; + dmaengine_slave_config(nfc->dmac, &dmac_cfg); + } else { + dev_warn(dev, "failed to request rxtx DMA channel\n"); + } platform_set_drvdata(pdev, nfc); ret = sunxi_nand_chips_init(dev, nfc); if (ret) { dev_err(dev, "failed to init nand chips\n"); - goto out_mod_clk_unprepare; + goto out_release_dmac; } return 0; +out_release_dmac: + if (nfc->dmac) + dma_release_channel(nfc->dmac); +out_ahb_reset_reassert: + if (!IS_ERR(nfc->reset)) + reset_control_assert(nfc->reset); out_mod_clk_unprepare: clk_disable_unprepare(nfc->mod_clk); out_ahb_clk_unprepare: @@ -1904,6 +2277,12 @@ static int sunxi_nfc_remove(struct platform_device *pdev) struct sunxi_nfc *nfc = platform_get_drvdata(pdev); sunxi_nand_chips_cleanup(nfc); + + if (!IS_ERR(nfc->reset)) + reset_control_assert(nfc->reset); + + if (nfc->dmac) + dma_release_channel(nfc->dmac); clk_disable_unprepare(nfc->mod_clk); clk_disable_unprepare(nfc->ahb_clk); diff --git a/drivers/mtd/nand/xway_nand.c b/drivers/mtd/nand/xway_nand.c index 0cf0ac07a8c2..1f2948c0c458 100644 --- a/drivers/mtd/nand/xway_nand.c +++ b/drivers/mtd/nand/xway_nand.c @@ -4,6 +4,7 @@ * by the Free Software Foundation. * * Copyright © 2012 John Crispin + * Copyright © 2016 Hauke Mehrtens */ #include @@ -16,20 +17,28 @@ #define EBU_ADDSEL1 0x24 #define EBU_NAND_CON 0xB0 #define EBU_NAND_WAIT 0xB4 +#define NAND_WAIT_RD BIT(0) /* NAND flash status output */ +#define NAND_WAIT_WR_C BIT(3) /* NAND Write/Read complete */ #define EBU_NAND_ECC0 0xB8 #define EBU_NAND_ECC_AC 0xBC -/* nand commands */ -#define NAND_CMD_ALE (1 << 2) -#define NAND_CMD_CLE (1 << 3) -#define NAND_CMD_CS (1 << 4) -#define NAND_WRITE_CMD_RESET 0xff +/* + * nand commands + * The pins of the NAND chip are selected based on the address bits of the + * "register" read and write. There are no special registers, but an + * address range and the lower address bits are used to activate the + * correct line. For example when the bit (1 << 2) is set in the address + * the ALE pin will be activated. + */ +#define NAND_CMD_ALE BIT(2) /* address latch enable */ +#define NAND_CMD_CLE BIT(3) /* command latch enable */ +#define NAND_CMD_CS BIT(4) /* chip select */ +#define NAND_CMD_SE BIT(5) /* spare area access latch */ +#define NAND_CMD_WP BIT(6) /* write protect */ #define NAND_WRITE_CMD (NAND_CMD_CS | NAND_CMD_CLE) #define NAND_WRITE_ADDR (NAND_CMD_CS | NAND_CMD_ALE) #define NAND_WRITE_DATA (NAND_CMD_CS) #define NAND_READ_DATA (NAND_CMD_CS) -#define NAND_WAIT_WR_C (1 << 3) -#define NAND_WAIT_RD (0x1) /* we need to tel the ebu which addr we mapped the nand to */ #define ADDSEL1_MASK(x) (x << 4) @@ -54,31 +63,41 @@ #define NAND_CON_CSMUX (1 << 1) #define NAND_CON_NANDM 1 -static void xway_reset_chip(struct nand_chip *chip) +struct xway_nand_data { + struct nand_chip chip; + unsigned long csflags; + void __iomem *nandaddr; +}; + +static u8 xway_readb(struct mtd_info *mtd, int op) { - unsigned long nandaddr = (unsigned long) chip->IO_ADDR_W; - unsigned long flags; + struct nand_chip *chip = mtd_to_nand(mtd); + struct xway_nand_data *data = nand_get_controller_data(chip); - nandaddr &= ~NAND_WRITE_ADDR; - nandaddr |= NAND_WRITE_CMD; - - /* finish with a reset */ - spin_lock_irqsave(&ebu_lock, flags); - writeb(NAND_WRITE_CMD_RESET, (void __iomem *) nandaddr); - while ((ltq_ebu_r32(EBU_NAND_WAIT) & NAND_WAIT_WR_C) == 0) - ; - spin_unlock_irqrestore(&ebu_lock, flags); + return readb(data->nandaddr + op); } -static void xway_select_chip(struct mtd_info *mtd, int chip) +static void xway_writeb(struct mtd_info *mtd, int op, u8 value) { + struct nand_chip *chip = mtd_to_nand(mtd); + struct xway_nand_data *data = nand_get_controller_data(chip); - switch (chip) { + writeb(value, data->nandaddr + op); +} + +static void xway_select_chip(struct mtd_info *mtd, int select) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + struct xway_nand_data *data = nand_get_controller_data(chip); + + switch (select) { case -1: ltq_ebu_w32_mask(NAND_CON_CE, 0, EBU_NAND_CON); ltq_ebu_w32_mask(NAND_CON_NANDM, 0, EBU_NAND_CON); + spin_unlock_irqrestore(&ebu_lock, data->csflags); break; case 0: + spin_lock_irqsave(&ebu_lock, data->csflags); ltq_ebu_w32_mask(0, NAND_CON_NANDM, EBU_NAND_CON); ltq_ebu_w32_mask(0, NAND_CON_CE, EBU_NAND_CON); break; @@ -89,26 +108,16 @@ static void xway_select_chip(struct mtd_info *mtd, int chip) static void xway_cmd_ctrl(struct mtd_info *mtd, int cmd, unsigned int ctrl) { - struct nand_chip *this = mtd_to_nand(mtd); - unsigned long nandaddr = (unsigned long) this->IO_ADDR_W; - unsigned long flags; + if (cmd == NAND_CMD_NONE) + return; - if (ctrl & NAND_CTRL_CHANGE) { - nandaddr &= ~(NAND_WRITE_CMD | NAND_WRITE_ADDR); - if (ctrl & NAND_CLE) - nandaddr |= NAND_WRITE_CMD; - else - nandaddr |= NAND_WRITE_ADDR; - this->IO_ADDR_W = (void __iomem *) nandaddr; - } + if (ctrl & NAND_CLE) + xway_writeb(mtd, NAND_WRITE_CMD, cmd); + else if (ctrl & NAND_ALE) + xway_writeb(mtd, NAND_WRITE_ADDR, cmd); - if (cmd != NAND_CMD_NONE) { - spin_lock_irqsave(&ebu_lock, flags); - writeb(cmd, this->IO_ADDR_W); - while ((ltq_ebu_r32(EBU_NAND_WAIT) & NAND_WAIT_WR_C) == 0) - ; - spin_unlock_irqrestore(&ebu_lock, flags); - } + while ((ltq_ebu_r32(EBU_NAND_WAIT) & NAND_WAIT_WR_C) == 0) + ; } static int xway_dev_ready(struct mtd_info *mtd) @@ -118,80 +127,122 @@ static int xway_dev_ready(struct mtd_info *mtd) static unsigned char xway_read_byte(struct mtd_info *mtd) { - struct nand_chip *this = mtd_to_nand(mtd); - unsigned long nandaddr = (unsigned long) this->IO_ADDR_R; - unsigned long flags; - int ret; - - spin_lock_irqsave(&ebu_lock, flags); - ret = ltq_r8((void __iomem *)(nandaddr + NAND_READ_DATA)); - spin_unlock_irqrestore(&ebu_lock, flags); - - return ret; + return xway_readb(mtd, NAND_READ_DATA); } +static void xway_read_buf(struct mtd_info *mtd, u_char *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) + buf[i] = xway_readb(mtd, NAND_WRITE_DATA); +} + +static void xway_write_buf(struct mtd_info *mtd, const u_char *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) + xway_writeb(mtd, NAND_WRITE_DATA, buf[i]); +} + +/* + * Probe for the NAND device. + */ static int xway_nand_probe(struct platform_device *pdev) { - struct nand_chip *this = platform_get_drvdata(pdev); - unsigned long nandaddr = (unsigned long) this->IO_ADDR_W; - const __be32 *cs = of_get_property(pdev->dev.of_node, - "lantiq,cs", NULL); + struct xway_nand_data *data; + struct mtd_info *mtd; + struct resource *res; + int err; + u32 cs; u32 cs_flag = 0; + /* Allocate memory for the device structure (and zero it) */ + data = devm_kzalloc(&pdev->dev, sizeof(struct xway_nand_data), + GFP_KERNEL); + if (!data) + return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + data->nandaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(data->nandaddr)) + return PTR_ERR(data->nandaddr); + + nand_set_flash_node(&data->chip, pdev->dev.of_node); + mtd = nand_to_mtd(&data->chip); + mtd->dev.parent = &pdev->dev; + + data->chip.cmd_ctrl = xway_cmd_ctrl; + data->chip.dev_ready = xway_dev_ready; + data->chip.select_chip = xway_select_chip; + data->chip.write_buf = xway_write_buf; + data->chip.read_buf = xway_read_buf; + data->chip.read_byte = xway_read_byte; + data->chip.chip_delay = 30; + + data->chip.ecc.mode = NAND_ECC_SOFT; + data->chip.ecc.algo = NAND_ECC_HAMMING; + + platform_set_drvdata(pdev, data); + nand_set_controller_data(&data->chip, data); + /* load our CS from the DT. Either we find a valid 1 or default to 0 */ - if (cs && (*cs == 1)) + err = of_property_read_u32(pdev->dev.of_node, "lantiq,cs", &cs); + if (!err && cs == 1) cs_flag = NAND_CON_IN_CS1 | NAND_CON_OUT_CS1; /* setup the EBU to run in NAND mode on our base addr */ - ltq_ebu_w32(CPHYSADDR(nandaddr) - | ADDSEL1_MASK(3) | ADDSEL1_REGEN, EBU_ADDSEL1); + ltq_ebu_w32(CPHYSADDR(data->nandaddr) + | ADDSEL1_MASK(3) | ADDSEL1_REGEN, EBU_ADDSEL1); ltq_ebu_w32(BUSCON1_SETUP | BUSCON1_BCGEN_RES | BUSCON1_WAITWRC2 - | BUSCON1_WAITRDC2 | BUSCON1_HOLDC1 | BUSCON1_RECOVC1 - | BUSCON1_CMULT4, LTQ_EBU_BUSCON1); + | BUSCON1_WAITRDC2 | BUSCON1_HOLDC1 | BUSCON1_RECOVC1 + | BUSCON1_CMULT4, LTQ_EBU_BUSCON1); ltq_ebu_w32(NAND_CON_NANDM | NAND_CON_CSMUX | NAND_CON_CS_P - | NAND_CON_SE_P | NAND_CON_WP_P | NAND_CON_PRE_P - | cs_flag, EBU_NAND_CON); + | NAND_CON_SE_P | NAND_CON_WP_P | NAND_CON_PRE_P + | cs_flag, EBU_NAND_CON); - /* finish with a reset */ - xway_reset_chip(this); + /* Scan to find existence of the device */ + err = nand_scan(mtd, 1); + if (err) + return err; - return 0; + err = mtd_device_register(mtd, NULL, 0); + if (err) + nand_release(mtd); + + return err; } -static struct platform_nand_data xway_nand_data = { - .chip = { - .nr_chips = 1, - .chip_delay = 30, - }, - .ctrl = { - .probe = xway_nand_probe, - .cmd_ctrl = xway_cmd_ctrl, - .dev_ready = xway_dev_ready, - .select_chip = xway_select_chip, - .read_byte = xway_read_byte, - } -}; - /* - * Try to find the node inside the DT. If it is available attach out - * platform_nand_data + * Remove a NAND device. */ -static int __init xway_register_nand(void) +static int xway_nand_remove(struct platform_device *pdev) { - struct device_node *node; - struct platform_device *pdev; + struct xway_nand_data *data = platform_get_drvdata(pdev); + + nand_release(nand_to_mtd(&data->chip)); - node = of_find_compatible_node(NULL, NULL, "lantiq,nand-xway"); - if (!node) - return -ENOENT; - pdev = of_find_device_by_node(node); - if (!pdev) - return -EINVAL; - pdev->dev.platform_data = &xway_nand_data; - of_node_put(node); return 0; } -subsys_initcall(xway_register_nand); +static const struct of_device_id xway_nand_match[] = { + { .compatible = "lantiq,nand-xway" }, + {}, +}; +MODULE_DEVICE_TABLE(of, xway_nand_match); + +static struct platform_driver xway_nand_driver = { + .probe = xway_nand_probe, + .remove = xway_nand_remove, + .driver = { + .name = "lantiq,nand-xway", + .of_match_table = xway_nand_match, + }, +}; + +module_platform_driver(xway_nand_driver); + +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index a4b029a417f0..1a6d0e367b89 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -3188,13 +3188,13 @@ static int onenand_otp_walk(struct mtd_info *mtd, loff_t from, size_t len, size_t tmp_retlen; ret = action(mtd, from, len, &tmp_retlen, buf); + if (ret) + break; buf += tmp_retlen; len -= tmp_retlen; *retlen += tmp_retlen; - if (ret) - break; } otp_pages--; } diff --git a/drivers/mtd/spi-nor/Kconfig b/drivers/mtd/spi-nor/Kconfig index d42c98e1f581..4a682ee0f632 100644 --- a/drivers/mtd/spi-nor/Kconfig +++ b/drivers/mtd/spi-nor/Kconfig @@ -29,6 +29,26 @@ config MTD_SPI_NOR_USE_4K_SECTORS Please note that some tools/drivers/filesystems may not work with 4096 B erase size (e.g. UBIFS requires 15 KiB as a minimum). +config SPI_ATMEL_QUADSPI + tristate "Atmel Quad SPI Controller" + depends on ARCH_AT91 || (ARM && COMPILE_TEST) + depends on OF && HAS_IOMEM + help + This enables support for the Quad SPI controller in master mode. + This driver does not support generic SPI. The implementation only + supports SPI NOR. + +config SPI_CADENCE_QUADSPI + tristate "Cadence Quad SPI controller" + depends on OF && ARM + help + Enable support for the Cadence Quad SPI Flash controller. + + Cadence QSPI is a specialized controller for connecting an SPI + Flash over 1/2/4-bit wide bus. Enable this option if you have a + device with a Cadence QSPI controller and want to access the + Flash as an MTD device. + config SPI_FSL_QUADSPI tristate "Freescale Quad SPI controller" depends on ARCH_MXC || SOC_LS1021A || ARCH_LAYERSCAPE || COMPILE_TEST @@ -38,6 +58,13 @@ config SPI_FSL_QUADSPI This controller does not support generic SPI. It only supports SPI NOR. +config SPI_HISI_SFC + tristate "Hisilicon SPI-NOR Flash Controller(SFC)" + depends on ARCH_HISI || COMPILE_TEST + depends on HAS_IOMEM && HAS_DMA + help + This enables support for hisilicon SPI-NOR flash controller. + config SPI_NXP_SPIFI tristate "NXP SPI Flash Interface (SPIFI)" depends on OF && (ARCH_LPC18XX || COMPILE_TEST) diff --git a/drivers/mtd/spi-nor/Makefile b/drivers/mtd/spi-nor/Makefile index 0bf3a7f81675..121695e83542 100644 --- a/drivers/mtd/spi-nor/Makefile +++ b/drivers/mtd/spi-nor/Makefile @@ -1,4 +1,7 @@ obj-$(CONFIG_MTD_SPI_NOR) += spi-nor.o +obj-$(CONFIG_SPI_ATMEL_QUADSPI) += atmel-quadspi.o +obj-$(CONFIG_SPI_CADENCE_QUADSPI) += cadence-quadspi.o obj-$(CONFIG_SPI_FSL_QUADSPI) += fsl-quadspi.o +obj-$(CONFIG_SPI_HISI_SFC) += hisi-sfc.o obj-$(CONFIG_MTD_MT81xx_NOR) += mtk-quadspi.o obj-$(CONFIG_SPI_NXP_SPIFI) += nxp-spifi.o diff --git a/drivers/mtd/spi-nor/atmel-quadspi.c b/drivers/mtd/spi-nor/atmel-quadspi.c new file mode 100644 index 000000000000..47937d9beec6 --- /dev/null +++ b/drivers/mtd/spi-nor/atmel-quadspi.c @@ -0,0 +1,732 @@ +/* + * Driver for Atmel QSPI Controller + * + * Copyright (C) 2015 Atmel Corporation + * + * Author: Cyrille Pitchen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * This driver is based on drivers/mtd/spi-nor/fsl-quadspi.c from Freescale. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* QSPI register offsets */ +#define QSPI_CR 0x0000 /* Control Register */ +#define QSPI_MR 0x0004 /* Mode Register */ +#define QSPI_RD 0x0008 /* Receive Data Register */ +#define QSPI_TD 0x000c /* Transmit Data Register */ +#define QSPI_SR 0x0010 /* Status Register */ +#define QSPI_IER 0x0014 /* Interrupt Enable Register */ +#define QSPI_IDR 0x0018 /* Interrupt Disable Register */ +#define QSPI_IMR 0x001c /* Interrupt Mask Register */ +#define QSPI_SCR 0x0020 /* Serial Clock Register */ + +#define QSPI_IAR 0x0030 /* Instruction Address Register */ +#define QSPI_ICR 0x0034 /* Instruction Code Register */ +#define QSPI_IFR 0x0038 /* Instruction Frame Register */ + +#define QSPI_SMR 0x0040 /* Scrambling Mode Register */ +#define QSPI_SKR 0x0044 /* Scrambling Key Register */ + +#define QSPI_WPMR 0x00E4 /* Write Protection Mode Register */ +#define QSPI_WPSR 0x00E8 /* Write Protection Status Register */ + +#define QSPI_VERSION 0x00FC /* Version Register */ + + +/* Bitfields in QSPI_CR (Control Register) */ +#define QSPI_CR_QSPIEN BIT(0) +#define QSPI_CR_QSPIDIS BIT(1) +#define QSPI_CR_SWRST BIT(7) +#define QSPI_CR_LASTXFER BIT(24) + +/* Bitfields in QSPI_MR (Mode Register) */ +#define QSPI_MR_SSM BIT(0) +#define QSPI_MR_LLB BIT(1) +#define QSPI_MR_WDRBT BIT(2) +#define QSPI_MR_SMRM BIT(3) +#define QSPI_MR_CSMODE_MASK GENMASK(5, 4) +#define QSPI_MR_CSMODE_NOT_RELOADED (0 << 4) +#define QSPI_MR_CSMODE_LASTXFER (1 << 4) +#define QSPI_MR_CSMODE_SYSTEMATICALLY (2 << 4) +#define QSPI_MR_NBBITS_MASK GENMASK(11, 8) +#define QSPI_MR_NBBITS(n) ((((n) - 8) << 8) & QSPI_MR_NBBITS_MASK) +#define QSPI_MR_DLYBCT_MASK GENMASK(23, 16) +#define QSPI_MR_DLYBCT(n) (((n) << 16) & QSPI_MR_DLYBCT_MASK) +#define QSPI_MR_DLYCS_MASK GENMASK(31, 24) +#define QSPI_MR_DLYCS(n) (((n) << 24) & QSPI_MR_DLYCS_MASK) + +/* Bitfields in QSPI_SR/QSPI_IER/QSPI_IDR/QSPI_IMR */ +#define QSPI_SR_RDRF BIT(0) +#define QSPI_SR_TDRE BIT(1) +#define QSPI_SR_TXEMPTY BIT(2) +#define QSPI_SR_OVRES BIT(3) +#define QSPI_SR_CSR BIT(8) +#define QSPI_SR_CSS BIT(9) +#define QSPI_SR_INSTRE BIT(10) +#define QSPI_SR_QSPIENS BIT(24) + +#define QSPI_SR_CMD_COMPLETED (QSPI_SR_INSTRE | QSPI_SR_CSR) + +/* Bitfields in QSPI_SCR (Serial Clock Register) */ +#define QSPI_SCR_CPOL BIT(0) +#define QSPI_SCR_CPHA BIT(1) +#define QSPI_SCR_SCBR_MASK GENMASK(15, 8) +#define QSPI_SCR_SCBR(n) (((n) << 8) & QSPI_SCR_SCBR_MASK) +#define QSPI_SCR_DLYBS_MASK GENMASK(23, 16) +#define QSPI_SCR_DLYBS(n) (((n) << 16) & QSPI_SCR_DLYBS_MASK) + +/* Bitfields in QSPI_ICR (Instruction Code Register) */ +#define QSPI_ICR_INST_MASK GENMASK(7, 0) +#define QSPI_ICR_INST(inst) (((inst) << 0) & QSPI_ICR_INST_MASK) +#define QSPI_ICR_OPT_MASK GENMASK(23, 16) +#define QSPI_ICR_OPT(opt) (((opt) << 16) & QSPI_ICR_OPT_MASK) + +/* Bitfields in QSPI_IFR (Instruction Frame Register) */ +#define QSPI_IFR_WIDTH_MASK GENMASK(2, 0) +#define QSPI_IFR_WIDTH_SINGLE_BIT_SPI (0 << 0) +#define QSPI_IFR_WIDTH_DUAL_OUTPUT (1 << 0) +#define QSPI_IFR_WIDTH_QUAD_OUTPUT (2 << 0) +#define QSPI_IFR_WIDTH_DUAL_IO (3 << 0) +#define QSPI_IFR_WIDTH_QUAD_IO (4 << 0) +#define QSPI_IFR_WIDTH_DUAL_CMD (5 << 0) +#define QSPI_IFR_WIDTH_QUAD_CMD (6 << 0) +#define QSPI_IFR_INSTEN BIT(4) +#define QSPI_IFR_ADDREN BIT(5) +#define QSPI_IFR_OPTEN BIT(6) +#define QSPI_IFR_DATAEN BIT(7) +#define QSPI_IFR_OPTL_MASK GENMASK(9, 8) +#define QSPI_IFR_OPTL_1BIT (0 << 8) +#define QSPI_IFR_OPTL_2BIT (1 << 8) +#define QSPI_IFR_OPTL_4BIT (2 << 8) +#define QSPI_IFR_OPTL_8BIT (3 << 8) +#define QSPI_IFR_ADDRL BIT(10) +#define QSPI_IFR_TFRTYP_MASK GENMASK(13, 12) +#define QSPI_IFR_TFRTYP_TRSFR_READ (0 << 12) +#define QSPI_IFR_TFRTYP_TRSFR_READ_MEM (1 << 12) +#define QSPI_IFR_TFRTYP_TRSFR_WRITE (2 << 12) +#define QSPI_IFR_TFRTYP_TRSFR_WRITE_MEM (3 << 13) +#define QSPI_IFR_CRM BIT(14) +#define QSPI_IFR_NBDUM_MASK GENMASK(20, 16) +#define QSPI_IFR_NBDUM(n) (((n) << 16) & QSPI_IFR_NBDUM_MASK) + +/* Bitfields in QSPI_SMR (Scrambling Mode Register) */ +#define QSPI_SMR_SCREN BIT(0) +#define QSPI_SMR_RVDIS BIT(1) + +/* Bitfields in QSPI_WPMR (Write Protection Mode Register) */ +#define QSPI_WPMR_WPEN BIT(0) +#define QSPI_WPMR_WPKEY_MASK GENMASK(31, 8) +#define QSPI_WPMR_WPKEY(wpkey) (((wpkey) << 8) & QSPI_WPMR_WPKEY_MASK) + +/* Bitfields in QSPI_WPSR (Write Protection Status Register) */ +#define QSPI_WPSR_WPVS BIT(0) +#define QSPI_WPSR_WPVSRC_MASK GENMASK(15, 8) +#define QSPI_WPSR_WPVSRC(src) (((src) << 8) & QSPI_WPSR_WPVSRC) + + +struct atmel_qspi { + void __iomem *regs; + void __iomem *mem; + struct clk *clk; + struct platform_device *pdev; + u32 pending; + + struct spi_nor nor; + u32 clk_rate; + struct completion cmd_completion; +}; + +struct atmel_qspi_command { + union { + struct { + u32 instruction:1; + u32 address:3; + u32 mode:1; + u32 dummy:1; + u32 data:1; + u32 reserved:25; + } bits; + u32 word; + } enable; + u8 instruction; + u8 mode; + u8 num_mode_cycles; + u8 num_dummy_cycles; + u32 address; + + size_t buf_len; + const void *tx_buf; + void *rx_buf; +}; + +/* Register access functions */ +static inline u32 qspi_readl(struct atmel_qspi *aq, u32 reg) +{ + return readl_relaxed(aq->regs + reg); +} + +static inline void qspi_writel(struct atmel_qspi *aq, u32 reg, u32 value) +{ + writel_relaxed(value, aq->regs + reg); +} + +static int atmel_qspi_run_transfer(struct atmel_qspi *aq, + const struct atmel_qspi_command *cmd) +{ + void __iomem *ahb_mem; + + /* Then fallback to a PIO transfer (memcpy() DOES NOT work!) */ + ahb_mem = aq->mem; + if (cmd->enable.bits.address) + ahb_mem += cmd->address; + if (cmd->tx_buf) + _memcpy_toio(ahb_mem, cmd->tx_buf, cmd->buf_len); + else + _memcpy_fromio(cmd->rx_buf, ahb_mem, cmd->buf_len); + + return 0; +} + +#ifdef DEBUG +static void atmel_qspi_debug_command(struct atmel_qspi *aq, + const struct atmel_qspi_command *cmd, + u32 ifr) +{ + u8 cmd_buf[SPI_NOR_MAX_CMD_SIZE]; + size_t len = 0; + int i; + + if (cmd->enable.bits.instruction) + cmd_buf[len++] = cmd->instruction; + + for (i = cmd->enable.bits.address-1; i >= 0; --i) + cmd_buf[len++] = (cmd->address >> (i << 3)) & 0xff; + + if (cmd->enable.bits.mode) + cmd_buf[len++] = cmd->mode; + + if (cmd->enable.bits.dummy) { + int num = cmd->num_dummy_cycles; + + switch (ifr & QSPI_IFR_WIDTH_MASK) { + case QSPI_IFR_WIDTH_SINGLE_BIT_SPI: + case QSPI_IFR_WIDTH_DUAL_OUTPUT: + case QSPI_IFR_WIDTH_QUAD_OUTPUT: + num >>= 3; + break; + case QSPI_IFR_WIDTH_DUAL_IO: + case QSPI_IFR_WIDTH_DUAL_CMD: + num >>= 2; + break; + case QSPI_IFR_WIDTH_QUAD_IO: + case QSPI_IFR_WIDTH_QUAD_CMD: + num >>= 1; + break; + default: + return; + } + + for (i = 0; i < num; ++i) + cmd_buf[len++] = 0; + } + + /* Dump the SPI command */ + print_hex_dump(KERN_DEBUG, "qspi cmd: ", DUMP_PREFIX_NONE, + 32, 1, cmd_buf, len, false); + +#ifdef VERBOSE_DEBUG + /* If verbose debug is enabled, also dump the TX data */ + if (cmd->enable.bits.data && cmd->tx_buf) + print_hex_dump(KERN_DEBUG, "qspi tx : ", DUMP_PREFIX_NONE, + 32, 1, cmd->tx_buf, cmd->buf_len, false); +#endif +} +#else +#define atmel_qspi_debug_command(aq, cmd, ifr) +#endif + +static int atmel_qspi_run_command(struct atmel_qspi *aq, + const struct atmel_qspi_command *cmd, + u32 ifr_tfrtyp, u32 ifr_width) +{ + u32 iar, icr, ifr, sr; + int err = 0; + + iar = 0; + icr = 0; + ifr = ifr_tfrtyp | ifr_width; + + /* Compute instruction parameters */ + if (cmd->enable.bits.instruction) { + icr |= QSPI_ICR_INST(cmd->instruction); + ifr |= QSPI_IFR_INSTEN; + } + + /* Compute address parameters */ + switch (cmd->enable.bits.address) { + case 4: + ifr |= QSPI_IFR_ADDRL; + /* fall through to the 24bit (3 byte) address case. */ + case 3: + iar = (cmd->enable.bits.data) ? 0 : cmd->address; + ifr |= QSPI_IFR_ADDREN; + break; + case 0: + break; + default: + return -EINVAL; + } + + /* Compute option parameters */ + if (cmd->enable.bits.mode && cmd->num_mode_cycles) { + u32 mode_cycle_bits, mode_bits; + + icr |= QSPI_ICR_OPT(cmd->mode); + ifr |= QSPI_IFR_OPTEN; + + switch (ifr & QSPI_IFR_WIDTH_MASK) { + case QSPI_IFR_WIDTH_SINGLE_BIT_SPI: + case QSPI_IFR_WIDTH_DUAL_OUTPUT: + case QSPI_IFR_WIDTH_QUAD_OUTPUT: + mode_cycle_bits = 1; + break; + case QSPI_IFR_WIDTH_DUAL_IO: + case QSPI_IFR_WIDTH_DUAL_CMD: + mode_cycle_bits = 2; + break; + case QSPI_IFR_WIDTH_QUAD_IO: + case QSPI_IFR_WIDTH_QUAD_CMD: + mode_cycle_bits = 4; + break; + default: + return -EINVAL; + } + + mode_bits = cmd->num_mode_cycles * mode_cycle_bits; + switch (mode_bits) { + case 1: + ifr |= QSPI_IFR_OPTL_1BIT; + break; + + case 2: + ifr |= QSPI_IFR_OPTL_2BIT; + break; + + case 4: + ifr |= QSPI_IFR_OPTL_4BIT; + break; + + case 8: + ifr |= QSPI_IFR_OPTL_8BIT; + break; + + default: + return -EINVAL; + } + } + + /* Set number of dummy cycles */ + if (cmd->enable.bits.dummy) + ifr |= QSPI_IFR_NBDUM(cmd->num_dummy_cycles); + + /* Set data enable */ + if (cmd->enable.bits.data) { + ifr |= QSPI_IFR_DATAEN; + + /* Special case for Continuous Read Mode */ + if (!cmd->tx_buf && !cmd->rx_buf) + ifr |= QSPI_IFR_CRM; + } + + /* Clear pending interrupts */ + (void)qspi_readl(aq, QSPI_SR); + + /* Set QSPI Instruction Frame registers */ + atmel_qspi_debug_command(aq, cmd, ifr); + qspi_writel(aq, QSPI_IAR, iar); + qspi_writel(aq, QSPI_ICR, icr); + qspi_writel(aq, QSPI_IFR, ifr); + + /* Skip to the final steps if there is no data */ + if (!cmd->enable.bits.data) + goto no_data; + + /* Dummy read of QSPI_IFR to synchronize APB and AHB accesses */ + (void)qspi_readl(aq, QSPI_IFR); + + /* Stop here for continuous read */ + if (!cmd->tx_buf && !cmd->rx_buf) + return 0; + /* Send/Receive data */ + err = atmel_qspi_run_transfer(aq, cmd); + + /* Release the chip-select */ + qspi_writel(aq, QSPI_CR, QSPI_CR_LASTXFER); + + if (err) + return err; + +#if defined(DEBUG) && defined(VERBOSE_DEBUG) + /* + * If verbose debug is enabled, also dump the RX data in addition to + * the SPI command previously dumped by atmel_qspi_debug_command() + */ + if (cmd->rx_buf) + print_hex_dump(KERN_DEBUG, "qspi rx : ", DUMP_PREFIX_NONE, + 32, 1, cmd->rx_buf, cmd->buf_len, false); +#endif +no_data: + /* Poll INSTRuction End status */ + sr = qspi_readl(aq, QSPI_SR); + if ((sr & QSPI_SR_CMD_COMPLETED) == QSPI_SR_CMD_COMPLETED) + return err; + + /* Wait for INSTRuction End interrupt */ + reinit_completion(&aq->cmd_completion); + aq->pending = sr & QSPI_SR_CMD_COMPLETED; + qspi_writel(aq, QSPI_IER, QSPI_SR_CMD_COMPLETED); + if (!wait_for_completion_timeout(&aq->cmd_completion, + msecs_to_jiffies(1000))) + err = -ETIMEDOUT; + qspi_writel(aq, QSPI_IDR, QSPI_SR_CMD_COMPLETED); + + return err; +} + +static int atmel_qspi_read_reg(struct spi_nor *nor, u8 opcode, + u8 *buf, int len) +{ + struct atmel_qspi *aq = nor->priv; + struct atmel_qspi_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.enable.bits.instruction = 1; + cmd.enable.bits.data = 1; + cmd.instruction = opcode; + cmd.rx_buf = buf; + cmd.buf_len = len; + return atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_READ, + QSPI_IFR_WIDTH_SINGLE_BIT_SPI); +} + +static int atmel_qspi_write_reg(struct spi_nor *nor, u8 opcode, + u8 *buf, int len) +{ + struct atmel_qspi *aq = nor->priv; + struct atmel_qspi_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.enable.bits.instruction = 1; + cmd.enable.bits.data = (buf != NULL && len > 0); + cmd.instruction = opcode; + cmd.tx_buf = buf; + cmd.buf_len = len; + return atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_WRITE, + QSPI_IFR_WIDTH_SINGLE_BIT_SPI); +} + +static ssize_t atmel_qspi_write(struct spi_nor *nor, loff_t to, size_t len, + const u_char *write_buf) +{ + struct atmel_qspi *aq = nor->priv; + struct atmel_qspi_command cmd; + ssize_t ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.enable.bits.instruction = 1; + cmd.enable.bits.address = nor->addr_width; + cmd.enable.bits.data = 1; + cmd.instruction = nor->program_opcode; + cmd.address = (u32)to; + cmd.tx_buf = write_buf; + cmd.buf_len = len; + ret = atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_WRITE_MEM, + QSPI_IFR_WIDTH_SINGLE_BIT_SPI); + return (ret < 0) ? ret : len; +} + +static int atmel_qspi_erase(struct spi_nor *nor, loff_t offs) +{ + struct atmel_qspi *aq = nor->priv; + struct atmel_qspi_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.enable.bits.instruction = 1; + cmd.enable.bits.address = nor->addr_width; + cmd.instruction = nor->erase_opcode; + cmd.address = (u32)offs; + return atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_WRITE, + QSPI_IFR_WIDTH_SINGLE_BIT_SPI); +} + +static ssize_t atmel_qspi_read(struct spi_nor *nor, loff_t from, size_t len, + u_char *read_buf) +{ + struct atmel_qspi *aq = nor->priv; + struct atmel_qspi_command cmd; + u8 num_mode_cycles, num_dummy_cycles; + u32 ifr_width; + ssize_t ret; + + switch (nor->flash_read) { + case SPI_NOR_NORMAL: + case SPI_NOR_FAST: + ifr_width = QSPI_IFR_WIDTH_SINGLE_BIT_SPI; + break; + + case SPI_NOR_DUAL: + ifr_width = QSPI_IFR_WIDTH_DUAL_OUTPUT; + break; + + case SPI_NOR_QUAD: + ifr_width = QSPI_IFR_WIDTH_QUAD_OUTPUT; + break; + + default: + return -EINVAL; + } + + if (nor->read_dummy >= 2) { + num_mode_cycles = 2; + num_dummy_cycles = nor->read_dummy - 2; + } else { + num_mode_cycles = nor->read_dummy; + num_dummy_cycles = 0; + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.enable.bits.instruction = 1; + cmd.enable.bits.address = nor->addr_width; + cmd.enable.bits.mode = (num_mode_cycles > 0); + cmd.enable.bits.dummy = (num_dummy_cycles > 0); + cmd.enable.bits.data = 1; + cmd.instruction = nor->read_opcode; + cmd.address = (u32)from; + cmd.mode = 0xff; /* This value prevents from entering the 0-4-4 mode */ + cmd.num_mode_cycles = num_mode_cycles; + cmd.num_dummy_cycles = num_dummy_cycles; + cmd.rx_buf = read_buf; + cmd.buf_len = len; + ret = atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_READ_MEM, + ifr_width); + return (ret < 0) ? ret : len; +} + +static int atmel_qspi_init(struct atmel_qspi *aq) +{ + unsigned long src_rate; + u32 mr, scr, scbr; + + /* Reset the QSPI controller */ + qspi_writel(aq, QSPI_CR, QSPI_CR_SWRST); + + /* Set the QSPI controller in Serial Memory Mode */ + mr = QSPI_MR_NBBITS(8) | QSPI_MR_SSM; + qspi_writel(aq, QSPI_MR, mr); + + src_rate = clk_get_rate(aq->clk); + if (!src_rate) + return -EINVAL; + + /* Compute the QSPI baudrate */ + scbr = DIV_ROUND_UP(src_rate, aq->clk_rate); + if (scbr > 0) + scbr--; + scr = QSPI_SCR_SCBR(scbr); + qspi_writel(aq, QSPI_SCR, scr); + + /* Enable the QSPI controller */ + qspi_writel(aq, QSPI_CR, QSPI_CR_QSPIEN); + + return 0; +} + +static irqreturn_t atmel_qspi_interrupt(int irq, void *dev_id) +{ + struct atmel_qspi *aq = (struct atmel_qspi *)dev_id; + u32 status, mask, pending; + + status = qspi_readl(aq, QSPI_SR); + mask = qspi_readl(aq, QSPI_IMR); + pending = status & mask; + + if (!pending) + return IRQ_NONE; + + aq->pending |= pending; + if ((aq->pending & QSPI_SR_CMD_COMPLETED) == QSPI_SR_CMD_COMPLETED) + complete(&aq->cmd_completion); + + return IRQ_HANDLED; +} + +static int atmel_qspi_probe(struct platform_device *pdev) +{ + struct device_node *child, *np = pdev->dev.of_node; + struct atmel_qspi *aq; + struct resource *res; + struct spi_nor *nor; + struct mtd_info *mtd; + int irq, err = 0; + + if (of_get_child_count(np) != 1) + return -ENODEV; + child = of_get_next_child(np, NULL); + + aq = devm_kzalloc(&pdev->dev, sizeof(*aq), GFP_KERNEL); + if (!aq) { + err = -ENOMEM; + goto exit; + } + + platform_set_drvdata(pdev, aq); + init_completion(&aq->cmd_completion); + aq->pdev = pdev; + + /* Map the registers */ + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qspi_base"); + aq->regs = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(aq->regs)) { + dev_err(&pdev->dev, "missing registers\n"); + err = PTR_ERR(aq->regs); + goto exit; + } + + /* Map the AHB memory */ + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qspi_mmap"); + aq->mem = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(aq->mem)) { + dev_err(&pdev->dev, "missing AHB memory\n"); + err = PTR_ERR(aq->mem); + goto exit; + } + + /* Get the peripheral clock */ + aq->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(aq->clk)) { + dev_err(&pdev->dev, "missing peripheral clock\n"); + err = PTR_ERR(aq->clk); + goto exit; + } + + /* Enable the peripheral clock */ + err = clk_prepare_enable(aq->clk); + if (err) { + dev_err(&pdev->dev, "failed to enable the peripheral clock\n"); + goto exit; + } + + /* Request the IRQ */ + irq = platform_get_irq(pdev, 0); + if (irq < 0) { + dev_err(&pdev->dev, "missing IRQ\n"); + err = irq; + goto disable_clk; + } + err = devm_request_irq(&pdev->dev, irq, atmel_qspi_interrupt, + 0, dev_name(&pdev->dev), aq); + if (err) + goto disable_clk; + + /* Setup the spi-nor */ + nor = &aq->nor; + mtd = &nor->mtd; + + nor->dev = &pdev->dev; + spi_nor_set_flash_node(nor, child); + nor->priv = aq; + mtd->priv = nor; + + nor->read_reg = atmel_qspi_read_reg; + nor->write_reg = atmel_qspi_write_reg; + nor->read = atmel_qspi_read; + nor->write = atmel_qspi_write; + nor->erase = atmel_qspi_erase; + + err = of_property_read_u32(child, "spi-max-frequency", &aq->clk_rate); + if (err < 0) + goto disable_clk; + + err = atmel_qspi_init(aq); + if (err) + goto disable_clk; + + err = spi_nor_scan(nor, NULL, SPI_NOR_QUAD); + if (err) + goto disable_clk; + + err = mtd_device_register(mtd, NULL, 0); + if (err) + goto disable_clk; + + of_node_put(child); + + return 0; + +disable_clk: + clk_disable_unprepare(aq->clk); +exit: + of_node_put(child); + + return err; +} + +static int atmel_qspi_remove(struct platform_device *pdev) +{ + struct atmel_qspi *aq = platform_get_drvdata(pdev); + + mtd_device_unregister(&aq->nor.mtd); + qspi_writel(aq, QSPI_CR, QSPI_CR_QSPIDIS); + clk_disable_unprepare(aq->clk); + return 0; +} + + +static const struct of_device_id atmel_qspi_dt_ids[] = { + { .compatible = "atmel,sama5d2-qspi" }, + { /* sentinel */ } +}; + +MODULE_DEVICE_TABLE(of, atmel_qspi_dt_ids); + +static struct platform_driver atmel_qspi_driver = { + .driver = { + .name = "atmel_qspi", + .of_match_table = atmel_qspi_dt_ids, + }, + .probe = atmel_qspi_probe, + .remove = atmel_qspi_remove, +}; +module_platform_driver(atmel_qspi_driver); + +MODULE_AUTHOR("Cyrille Pitchen "); +MODULE_DESCRIPTION("Atmel QSPI Controller driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/mtd/spi-nor/cadence-quadspi.c b/drivers/mtd/spi-nor/cadence-quadspi.c new file mode 100644 index 000000000000..d403ba7b8f43 --- /dev/null +++ b/drivers/mtd/spi-nor/cadence-quadspi.c @@ -0,0 +1,1299 @@ +/* + * Driver for Cadence QSPI Controller + * + * Copyright Altera Corporation (C) 2012-2014. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CQSPI_NAME "cadence-qspi" +#define CQSPI_MAX_CHIPSELECT 16 + +struct cqspi_st; + +struct cqspi_flash_pdata { + struct spi_nor nor; + struct cqspi_st *cqspi; + u32 clk_rate; + u32 read_delay; + u32 tshsl_ns; + u32 tsd2d_ns; + u32 tchsh_ns; + u32 tslch_ns; + u8 inst_width; + u8 addr_width; + u8 data_width; + u8 cs; + bool registered; +}; + +struct cqspi_st { + struct platform_device *pdev; + + struct clk *clk; + unsigned int sclk; + + void __iomem *iobase; + void __iomem *ahb_base; + struct completion transfer_complete; + struct mutex bus_mutex; + + int current_cs; + int current_page_size; + int current_erase_size; + int current_addr_width; + unsigned long master_ref_clk_hz; + bool is_decoded_cs; + u32 fifo_depth; + u32 fifo_width; + u32 trigger_address; + struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT]; +}; + +/* Operation timeout value */ +#define CQSPI_TIMEOUT_MS 500 +#define CQSPI_READ_TIMEOUT_MS 10 + +/* Instruction type */ +#define CQSPI_INST_TYPE_SINGLE 0 +#define CQSPI_INST_TYPE_DUAL 1 +#define CQSPI_INST_TYPE_QUAD 2 + +#define CQSPI_DUMMY_CLKS_PER_BYTE 8 +#define CQSPI_DUMMY_BYTES_MAX 4 +#define CQSPI_DUMMY_CLKS_MAX 31 + +#define CQSPI_STIG_DATA_LEN_MAX 8 + +/* Register map */ +#define CQSPI_REG_CONFIG 0x00 +#define CQSPI_REG_CONFIG_ENABLE_MASK BIT(0) +#define CQSPI_REG_CONFIG_DECODE_MASK BIT(9) +#define CQSPI_REG_CONFIG_CHIPSELECT_LSB 10 +#define CQSPI_REG_CONFIG_DMA_MASK BIT(15) +#define CQSPI_REG_CONFIG_BAUD_LSB 19 +#define CQSPI_REG_CONFIG_IDLE_LSB 31 +#define CQSPI_REG_CONFIG_CHIPSELECT_MASK 0xF +#define CQSPI_REG_CONFIG_BAUD_MASK 0xF + +#define CQSPI_REG_RD_INSTR 0x04 +#define CQSPI_REG_RD_INSTR_OPCODE_LSB 0 +#define CQSPI_REG_RD_INSTR_TYPE_INSTR_LSB 8 +#define CQSPI_REG_RD_INSTR_TYPE_ADDR_LSB 12 +#define CQSPI_REG_RD_INSTR_TYPE_DATA_LSB 16 +#define CQSPI_REG_RD_INSTR_MODE_EN_LSB 20 +#define CQSPI_REG_RD_INSTR_DUMMY_LSB 24 +#define CQSPI_REG_RD_INSTR_TYPE_INSTR_MASK 0x3 +#define CQSPI_REG_RD_INSTR_TYPE_ADDR_MASK 0x3 +#define CQSPI_REG_RD_INSTR_TYPE_DATA_MASK 0x3 +#define CQSPI_REG_RD_INSTR_DUMMY_MASK 0x1F + +#define CQSPI_REG_WR_INSTR 0x08 +#define CQSPI_REG_WR_INSTR_OPCODE_LSB 0 +#define CQSPI_REG_WR_INSTR_TYPE_ADDR_LSB 12 +#define CQSPI_REG_WR_INSTR_TYPE_DATA_LSB 16 + +#define CQSPI_REG_DELAY 0x0C +#define CQSPI_REG_DELAY_TSLCH_LSB 0 +#define CQSPI_REG_DELAY_TCHSH_LSB 8 +#define CQSPI_REG_DELAY_TSD2D_LSB 16 +#define CQSPI_REG_DELAY_TSHSL_LSB 24 +#define CQSPI_REG_DELAY_TSLCH_MASK 0xFF +#define CQSPI_REG_DELAY_TCHSH_MASK 0xFF +#define CQSPI_REG_DELAY_TSD2D_MASK 0xFF +#define CQSPI_REG_DELAY_TSHSL_MASK 0xFF + +#define CQSPI_REG_READCAPTURE 0x10 +#define CQSPI_REG_READCAPTURE_BYPASS_LSB 0 +#define CQSPI_REG_READCAPTURE_DELAY_LSB 1 +#define CQSPI_REG_READCAPTURE_DELAY_MASK 0xF + +#define CQSPI_REG_SIZE 0x14 +#define CQSPI_REG_SIZE_ADDRESS_LSB 0 +#define CQSPI_REG_SIZE_PAGE_LSB 4 +#define CQSPI_REG_SIZE_BLOCK_LSB 16 +#define CQSPI_REG_SIZE_ADDRESS_MASK 0xF +#define CQSPI_REG_SIZE_PAGE_MASK 0xFFF +#define CQSPI_REG_SIZE_BLOCK_MASK 0x3F + +#define CQSPI_REG_SRAMPARTITION 0x18 +#define CQSPI_REG_INDIRECTTRIGGER 0x1C + +#define CQSPI_REG_DMA 0x20 +#define CQSPI_REG_DMA_SINGLE_LSB 0 +#define CQSPI_REG_DMA_BURST_LSB 8 +#define CQSPI_REG_DMA_SINGLE_MASK 0xFF +#define CQSPI_REG_DMA_BURST_MASK 0xFF + +#define CQSPI_REG_REMAP 0x24 +#define CQSPI_REG_MODE_BIT 0x28 + +#define CQSPI_REG_SDRAMLEVEL 0x2C +#define CQSPI_REG_SDRAMLEVEL_RD_LSB 0 +#define CQSPI_REG_SDRAMLEVEL_WR_LSB 16 +#define CQSPI_REG_SDRAMLEVEL_RD_MASK 0xFFFF +#define CQSPI_REG_SDRAMLEVEL_WR_MASK 0xFFFF + +#define CQSPI_REG_IRQSTATUS 0x40 +#define CQSPI_REG_IRQMASK 0x44 + +#define CQSPI_REG_INDIRECTRD 0x60 +#define CQSPI_REG_INDIRECTRD_START_MASK BIT(0) +#define CQSPI_REG_INDIRECTRD_CANCEL_MASK BIT(1) +#define CQSPI_REG_INDIRECTRD_DONE_MASK BIT(5) + +#define CQSPI_REG_INDIRECTRDWATERMARK 0x64 +#define CQSPI_REG_INDIRECTRDSTARTADDR 0x68 +#define CQSPI_REG_INDIRECTRDBYTES 0x6C + +#define CQSPI_REG_CMDCTRL 0x90 +#define CQSPI_REG_CMDCTRL_EXECUTE_MASK BIT(0) +#define CQSPI_REG_CMDCTRL_INPROGRESS_MASK BIT(1) +#define CQSPI_REG_CMDCTRL_WR_BYTES_LSB 12 +#define CQSPI_REG_CMDCTRL_WR_EN_LSB 15 +#define CQSPI_REG_CMDCTRL_ADD_BYTES_LSB 16 +#define CQSPI_REG_CMDCTRL_ADDR_EN_LSB 19 +#define CQSPI_REG_CMDCTRL_RD_BYTES_LSB 20 +#define CQSPI_REG_CMDCTRL_RD_EN_LSB 23 +#define CQSPI_REG_CMDCTRL_OPCODE_LSB 24 +#define CQSPI_REG_CMDCTRL_WR_BYTES_MASK 0x7 +#define CQSPI_REG_CMDCTRL_ADD_BYTES_MASK 0x3 +#define CQSPI_REG_CMDCTRL_RD_BYTES_MASK 0x7 + +#define CQSPI_REG_INDIRECTWR 0x70 +#define CQSPI_REG_INDIRECTWR_START_MASK BIT(0) +#define CQSPI_REG_INDIRECTWR_CANCEL_MASK BIT(1) +#define CQSPI_REG_INDIRECTWR_DONE_MASK BIT(5) + +#define CQSPI_REG_INDIRECTWRWATERMARK 0x74 +#define CQSPI_REG_INDIRECTWRSTARTADDR 0x78 +#define CQSPI_REG_INDIRECTWRBYTES 0x7C + +#define CQSPI_REG_CMDADDRESS 0x94 +#define CQSPI_REG_CMDREADDATALOWER 0xA0 +#define CQSPI_REG_CMDREADDATAUPPER 0xA4 +#define CQSPI_REG_CMDWRITEDATALOWER 0xA8 +#define CQSPI_REG_CMDWRITEDATAUPPER 0xAC + +/* Interrupt status bits */ +#define CQSPI_REG_IRQ_MODE_ERR BIT(0) +#define CQSPI_REG_IRQ_UNDERFLOW BIT(1) +#define CQSPI_REG_IRQ_IND_COMP BIT(2) +#define CQSPI_REG_IRQ_IND_RD_REJECT BIT(3) +#define CQSPI_REG_IRQ_WR_PROTECTED_ERR BIT(4) +#define CQSPI_REG_IRQ_ILLEGAL_AHB_ERR BIT(5) +#define CQSPI_REG_IRQ_WATERMARK BIT(6) +#define CQSPI_REG_IRQ_IND_SRAM_FULL BIT(12) + +#define CQSPI_IRQ_MASK_RD (CQSPI_REG_IRQ_WATERMARK | \ + CQSPI_REG_IRQ_IND_SRAM_FULL | \ + CQSPI_REG_IRQ_IND_COMP) + +#define CQSPI_IRQ_MASK_WR (CQSPI_REG_IRQ_IND_COMP | \ + CQSPI_REG_IRQ_WATERMARK | \ + CQSPI_REG_IRQ_UNDERFLOW) + +#define CQSPI_IRQ_STATUS_MASK 0x1FFFF + +static int cqspi_wait_for_bit(void __iomem *reg, const u32 mask, bool clear) +{ + unsigned long end = jiffies + msecs_to_jiffies(CQSPI_TIMEOUT_MS); + u32 val; + + while (1) { + val = readl(reg); + if (clear) + val = ~val; + val &= mask; + + if (val == mask) + return 0; + + if (time_after(jiffies, end)) + return -ETIMEDOUT; + } +} + +static bool cqspi_is_idle(struct cqspi_st *cqspi) +{ + u32 reg = readl(cqspi->iobase + CQSPI_REG_CONFIG); + + return reg & (1 << CQSPI_REG_CONFIG_IDLE_LSB); +} + +static u32 cqspi_get_rd_sram_level(struct cqspi_st *cqspi) +{ + u32 reg = readl(cqspi->iobase + CQSPI_REG_SDRAMLEVEL); + + reg >>= CQSPI_REG_SDRAMLEVEL_RD_LSB; + return reg & CQSPI_REG_SDRAMLEVEL_RD_MASK; +} + +static irqreturn_t cqspi_irq_handler(int this_irq, void *dev) +{ + struct cqspi_st *cqspi = dev; + unsigned int irq_status; + + /* Read interrupt status */ + irq_status = readl(cqspi->iobase + CQSPI_REG_IRQSTATUS); + + /* Clear interrupt */ + writel(irq_status, cqspi->iobase + CQSPI_REG_IRQSTATUS); + + irq_status &= CQSPI_IRQ_MASK_RD | CQSPI_IRQ_MASK_WR; + + if (irq_status) + complete(&cqspi->transfer_complete); + + return IRQ_HANDLED; +} + +static unsigned int cqspi_calc_rdreg(struct spi_nor *nor, const u8 opcode) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + u32 rdreg = 0; + + rdreg |= f_pdata->inst_width << CQSPI_REG_RD_INSTR_TYPE_INSTR_LSB; + rdreg |= f_pdata->addr_width << CQSPI_REG_RD_INSTR_TYPE_ADDR_LSB; + rdreg |= f_pdata->data_width << CQSPI_REG_RD_INSTR_TYPE_DATA_LSB; + + return rdreg; +} + +static int cqspi_wait_idle(struct cqspi_st *cqspi) +{ + const unsigned int poll_idle_retry = 3; + unsigned int count = 0; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(CQSPI_TIMEOUT_MS); + while (1) { + /* + * Read few times in succession to ensure the controller + * is indeed idle, that is, the bit does not transition + * low again. + */ + if (cqspi_is_idle(cqspi)) + count++; + else + count = 0; + + if (count >= poll_idle_retry) + return 0; + + if (time_after(jiffies, timeout)) { + /* Timeout, in busy mode. */ + dev_err(&cqspi->pdev->dev, + "QSPI is still busy after %dms timeout.\n", + CQSPI_TIMEOUT_MS); + return -ETIMEDOUT; + } + + cpu_relax(); + } +} + +static int cqspi_exec_flash_cmd(struct cqspi_st *cqspi, unsigned int reg) +{ + void __iomem *reg_base = cqspi->iobase; + int ret; + + /* Write the CMDCTRL without start execution. */ + writel(reg, reg_base + CQSPI_REG_CMDCTRL); + /* Start execute */ + reg |= CQSPI_REG_CMDCTRL_EXECUTE_MASK; + writel(reg, reg_base + CQSPI_REG_CMDCTRL); + + /* Polling for completion. */ + ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_CMDCTRL, + CQSPI_REG_CMDCTRL_INPROGRESS_MASK, 1); + if (ret) { + dev_err(&cqspi->pdev->dev, + "Flash command execution timed out.\n"); + return ret; + } + + /* Polling QSPI idle status. */ + return cqspi_wait_idle(cqspi); +} + +static int cqspi_command_read(struct spi_nor *nor, + const u8 *txbuf, const unsigned n_tx, + u8 *rxbuf, const unsigned n_rx) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + void __iomem *reg_base = cqspi->iobase; + unsigned int rdreg; + unsigned int reg; + unsigned int read_len; + int status; + + if (!n_rx || n_rx > CQSPI_STIG_DATA_LEN_MAX || !rxbuf) { + dev_err(nor->dev, "Invalid input argument, len %d rxbuf 0x%p\n", + n_rx, rxbuf); + return -EINVAL; + } + + reg = txbuf[0] << CQSPI_REG_CMDCTRL_OPCODE_LSB; + + rdreg = cqspi_calc_rdreg(nor, txbuf[0]); + writel(rdreg, reg_base + CQSPI_REG_RD_INSTR); + + reg |= (0x1 << CQSPI_REG_CMDCTRL_RD_EN_LSB); + + /* 0 means 1 byte. */ + reg |= (((n_rx - 1) & CQSPI_REG_CMDCTRL_RD_BYTES_MASK) + << CQSPI_REG_CMDCTRL_RD_BYTES_LSB); + status = cqspi_exec_flash_cmd(cqspi, reg); + if (status) + return status; + + reg = readl(reg_base + CQSPI_REG_CMDREADDATALOWER); + + /* Put the read value into rx_buf */ + read_len = (n_rx > 4) ? 4 : n_rx; + memcpy(rxbuf, ®, read_len); + rxbuf += read_len; + + if (n_rx > 4) { + reg = readl(reg_base + CQSPI_REG_CMDREADDATAUPPER); + + read_len = n_rx - read_len; + memcpy(rxbuf, ®, read_len); + } + + return 0; +} + +static int cqspi_command_write(struct spi_nor *nor, const u8 opcode, + const u8 *txbuf, const unsigned n_tx) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + void __iomem *reg_base = cqspi->iobase; + unsigned int reg; + unsigned int data; + int ret; + + if (n_tx > 4 || (n_tx && !txbuf)) { + dev_err(nor->dev, + "Invalid input argument, cmdlen %d txbuf 0x%p\n", + n_tx, txbuf); + return -EINVAL; + } + + reg = opcode << CQSPI_REG_CMDCTRL_OPCODE_LSB; + if (n_tx) { + reg |= (0x1 << CQSPI_REG_CMDCTRL_WR_EN_LSB); + reg |= ((n_tx - 1) & CQSPI_REG_CMDCTRL_WR_BYTES_MASK) + << CQSPI_REG_CMDCTRL_WR_BYTES_LSB; + data = 0; + memcpy(&data, txbuf, n_tx); + writel(data, reg_base + CQSPI_REG_CMDWRITEDATALOWER); + } + + ret = cqspi_exec_flash_cmd(cqspi, reg); + return ret; +} + +static int cqspi_command_write_addr(struct spi_nor *nor, + const u8 opcode, const unsigned int addr) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + void __iomem *reg_base = cqspi->iobase; + unsigned int reg; + + reg = opcode << CQSPI_REG_CMDCTRL_OPCODE_LSB; + reg |= (0x1 << CQSPI_REG_CMDCTRL_ADDR_EN_LSB); + reg |= ((nor->addr_width - 1) & CQSPI_REG_CMDCTRL_ADD_BYTES_MASK) + << CQSPI_REG_CMDCTRL_ADD_BYTES_LSB; + + writel(addr, reg_base + CQSPI_REG_CMDADDRESS); + + return cqspi_exec_flash_cmd(cqspi, reg); +} + +static int cqspi_indirect_read_setup(struct spi_nor *nor, + const unsigned int from_addr) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + void __iomem *reg_base = cqspi->iobase; + unsigned int dummy_clk = 0; + unsigned int reg; + + writel(from_addr, reg_base + CQSPI_REG_INDIRECTRDSTARTADDR); + + reg = nor->read_opcode << CQSPI_REG_RD_INSTR_OPCODE_LSB; + reg |= cqspi_calc_rdreg(nor, nor->read_opcode); + + /* Setup dummy clock cycles */ + dummy_clk = nor->read_dummy; + if (dummy_clk > CQSPI_DUMMY_CLKS_MAX) + dummy_clk = CQSPI_DUMMY_CLKS_MAX; + + if (dummy_clk / 8) { + reg |= (1 << CQSPI_REG_RD_INSTR_MODE_EN_LSB); + /* Set mode bits high to ensure chip doesn't enter XIP */ + writel(0xFF, reg_base + CQSPI_REG_MODE_BIT); + + /* Need to subtract the mode byte (8 clocks). */ + if (f_pdata->inst_width != CQSPI_INST_TYPE_QUAD) + dummy_clk -= 8; + + if (dummy_clk) + reg |= (dummy_clk & CQSPI_REG_RD_INSTR_DUMMY_MASK) + << CQSPI_REG_RD_INSTR_DUMMY_LSB; + } + + writel(reg, reg_base + CQSPI_REG_RD_INSTR); + + /* Set address width */ + reg = readl(reg_base + CQSPI_REG_SIZE); + reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK; + reg |= (nor->addr_width - 1); + writel(reg, reg_base + CQSPI_REG_SIZE); + return 0; +} + +static int cqspi_indirect_read_execute(struct spi_nor *nor, + u8 *rxbuf, const unsigned n_rx) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + void __iomem *reg_base = cqspi->iobase; + void __iomem *ahb_base = cqspi->ahb_base; + unsigned int remaining = n_rx; + unsigned int bytes_to_read = 0; + int ret = 0; + + writel(remaining, reg_base + CQSPI_REG_INDIRECTRDBYTES); + + /* Clear all interrupts. */ + writel(CQSPI_IRQ_STATUS_MASK, reg_base + CQSPI_REG_IRQSTATUS); + + writel(CQSPI_IRQ_MASK_RD, reg_base + CQSPI_REG_IRQMASK); + + reinit_completion(&cqspi->transfer_complete); + writel(CQSPI_REG_INDIRECTRD_START_MASK, + reg_base + CQSPI_REG_INDIRECTRD); + + while (remaining > 0) { + ret = wait_for_completion_timeout(&cqspi->transfer_complete, + msecs_to_jiffies + (CQSPI_READ_TIMEOUT_MS)); + + bytes_to_read = cqspi_get_rd_sram_level(cqspi); + + if (!ret && bytes_to_read == 0) { + dev_err(nor->dev, "Indirect read timeout, no bytes\n"); + ret = -ETIMEDOUT; + goto failrd; + } + + while (bytes_to_read != 0) { + bytes_to_read *= cqspi->fifo_width; + bytes_to_read = bytes_to_read > remaining ? + remaining : bytes_to_read; + readsl(ahb_base, rxbuf, DIV_ROUND_UP(bytes_to_read, 4)); + rxbuf += bytes_to_read; + remaining -= bytes_to_read; + bytes_to_read = cqspi_get_rd_sram_level(cqspi); + } + + if (remaining > 0) + reinit_completion(&cqspi->transfer_complete); + } + + /* Check indirect done status */ + ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_INDIRECTRD, + CQSPI_REG_INDIRECTRD_DONE_MASK, 0); + if (ret) { + dev_err(nor->dev, + "Indirect read completion error (%i)\n", ret); + goto failrd; + } + + /* Disable interrupt */ + writel(0, reg_base + CQSPI_REG_IRQMASK); + + /* Clear indirect completion status */ + writel(CQSPI_REG_INDIRECTRD_DONE_MASK, reg_base + CQSPI_REG_INDIRECTRD); + + return 0; + +failrd: + /* Disable interrupt */ + writel(0, reg_base + CQSPI_REG_IRQMASK); + + /* Cancel the indirect read */ + writel(CQSPI_REG_INDIRECTWR_CANCEL_MASK, + reg_base + CQSPI_REG_INDIRECTRD); + return ret; +} + +static int cqspi_indirect_write_setup(struct spi_nor *nor, + const unsigned int to_addr) +{ + unsigned int reg; + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + void __iomem *reg_base = cqspi->iobase; + + /* Set opcode. */ + reg = nor->program_opcode << CQSPI_REG_WR_INSTR_OPCODE_LSB; + writel(reg, reg_base + CQSPI_REG_WR_INSTR); + reg = cqspi_calc_rdreg(nor, nor->program_opcode); + writel(reg, reg_base + CQSPI_REG_RD_INSTR); + + writel(to_addr, reg_base + CQSPI_REG_INDIRECTWRSTARTADDR); + + reg = readl(reg_base + CQSPI_REG_SIZE); + reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK; + reg |= (nor->addr_width - 1); + writel(reg, reg_base + CQSPI_REG_SIZE); + return 0; +} + +static int cqspi_indirect_write_execute(struct spi_nor *nor, + const u8 *txbuf, const unsigned n_tx) +{ + const unsigned int page_size = nor->page_size; + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + void __iomem *reg_base = cqspi->iobase; + unsigned int remaining = n_tx; + unsigned int write_bytes; + int ret; + + writel(remaining, reg_base + CQSPI_REG_INDIRECTWRBYTES); + + /* Clear all interrupts. */ + writel(CQSPI_IRQ_STATUS_MASK, reg_base + CQSPI_REG_IRQSTATUS); + + writel(CQSPI_IRQ_MASK_WR, reg_base + CQSPI_REG_IRQMASK); + + reinit_completion(&cqspi->transfer_complete); + writel(CQSPI_REG_INDIRECTWR_START_MASK, + reg_base + CQSPI_REG_INDIRECTWR); + + while (remaining > 0) { + write_bytes = remaining > page_size ? page_size : remaining; + writesl(cqspi->ahb_base, txbuf, DIV_ROUND_UP(write_bytes, 4)); + + ret = wait_for_completion_timeout(&cqspi->transfer_complete, + msecs_to_jiffies + (CQSPI_TIMEOUT_MS)); + if (!ret) { + dev_err(nor->dev, "Indirect write timeout\n"); + ret = -ETIMEDOUT; + goto failwr; + } + + txbuf += write_bytes; + remaining -= write_bytes; + + if (remaining > 0) + reinit_completion(&cqspi->transfer_complete); + } + + /* Check indirect done status */ + ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_INDIRECTWR, + CQSPI_REG_INDIRECTWR_DONE_MASK, 0); + if (ret) { + dev_err(nor->dev, + "Indirect write completion error (%i)\n", ret); + goto failwr; + } + + /* Disable interrupt. */ + writel(0, reg_base + CQSPI_REG_IRQMASK); + + /* Clear indirect completion status */ + writel(CQSPI_REG_INDIRECTWR_DONE_MASK, reg_base + CQSPI_REG_INDIRECTWR); + + cqspi_wait_idle(cqspi); + + return 0; + +failwr: + /* Disable interrupt. */ + writel(0, reg_base + CQSPI_REG_IRQMASK); + + /* Cancel the indirect write */ + writel(CQSPI_REG_INDIRECTWR_CANCEL_MASK, + reg_base + CQSPI_REG_INDIRECTWR); + return ret; +} + +static void cqspi_chipselect(struct spi_nor *nor) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + void __iomem *reg_base = cqspi->iobase; + unsigned int chip_select = f_pdata->cs; + unsigned int reg; + + reg = readl(reg_base + CQSPI_REG_CONFIG); + if (cqspi->is_decoded_cs) { + reg |= CQSPI_REG_CONFIG_DECODE_MASK; + } else { + reg &= ~CQSPI_REG_CONFIG_DECODE_MASK; + + /* Convert CS if without decoder. + * CS0 to 4b'1110 + * CS1 to 4b'1101 + * CS2 to 4b'1011 + * CS3 to 4b'0111 + */ + chip_select = 0xF & ~(1 << chip_select); + } + + reg &= ~(CQSPI_REG_CONFIG_CHIPSELECT_MASK + << CQSPI_REG_CONFIG_CHIPSELECT_LSB); + reg |= (chip_select & CQSPI_REG_CONFIG_CHIPSELECT_MASK) + << CQSPI_REG_CONFIG_CHIPSELECT_LSB; + writel(reg, reg_base + CQSPI_REG_CONFIG); +} + +static void cqspi_configure_cs_and_sizes(struct spi_nor *nor) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + void __iomem *iobase = cqspi->iobase; + unsigned int reg; + + /* configure page size and block size. */ + reg = readl(iobase + CQSPI_REG_SIZE); + reg &= ~(CQSPI_REG_SIZE_PAGE_MASK << CQSPI_REG_SIZE_PAGE_LSB); + reg &= ~(CQSPI_REG_SIZE_BLOCK_MASK << CQSPI_REG_SIZE_BLOCK_LSB); + reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK; + reg |= (nor->page_size << CQSPI_REG_SIZE_PAGE_LSB); + reg |= (ilog2(nor->mtd.erasesize) << CQSPI_REG_SIZE_BLOCK_LSB); + reg |= (nor->addr_width - 1); + writel(reg, iobase + CQSPI_REG_SIZE); + + /* configure the chip select */ + cqspi_chipselect(nor); + + /* Store the new configuration of the controller */ + cqspi->current_page_size = nor->page_size; + cqspi->current_erase_size = nor->mtd.erasesize; + cqspi->current_addr_width = nor->addr_width; +} + +static unsigned int calculate_ticks_for_ns(const unsigned int ref_clk_hz, + const unsigned int ns_val) +{ + unsigned int ticks; + + ticks = ref_clk_hz / 1000; /* kHz */ + ticks = DIV_ROUND_UP(ticks * ns_val, 1000000); + + return ticks; +} + +static void cqspi_delay(struct spi_nor *nor) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + void __iomem *iobase = cqspi->iobase; + const unsigned int ref_clk_hz = cqspi->master_ref_clk_hz; + unsigned int tshsl, tchsh, tslch, tsd2d; + unsigned int reg; + unsigned int tsclk; + + /* calculate the number of ref ticks for one sclk tick */ + tsclk = DIV_ROUND_UP(ref_clk_hz, cqspi->sclk); + + tshsl = calculate_ticks_for_ns(ref_clk_hz, f_pdata->tshsl_ns); + /* this particular value must be at least one sclk */ + if (tshsl < tsclk) + tshsl = tsclk; + + tchsh = calculate_ticks_for_ns(ref_clk_hz, f_pdata->tchsh_ns); + tslch = calculate_ticks_for_ns(ref_clk_hz, f_pdata->tslch_ns); + tsd2d = calculate_ticks_for_ns(ref_clk_hz, f_pdata->tsd2d_ns); + + reg = (tshsl & CQSPI_REG_DELAY_TSHSL_MASK) + << CQSPI_REG_DELAY_TSHSL_LSB; + reg |= (tchsh & CQSPI_REG_DELAY_TCHSH_MASK) + << CQSPI_REG_DELAY_TCHSH_LSB; + reg |= (tslch & CQSPI_REG_DELAY_TSLCH_MASK) + << CQSPI_REG_DELAY_TSLCH_LSB; + reg |= (tsd2d & CQSPI_REG_DELAY_TSD2D_MASK) + << CQSPI_REG_DELAY_TSD2D_LSB; + writel(reg, iobase + CQSPI_REG_DELAY); +} + +static void cqspi_config_baudrate_div(struct cqspi_st *cqspi) +{ + const unsigned int ref_clk_hz = cqspi->master_ref_clk_hz; + void __iomem *reg_base = cqspi->iobase; + u32 reg, div; + + /* Recalculate the baudrate divisor based on QSPI specification. */ + div = DIV_ROUND_UP(ref_clk_hz, 2 * cqspi->sclk) - 1; + + reg = readl(reg_base + CQSPI_REG_CONFIG); + reg &= ~(CQSPI_REG_CONFIG_BAUD_MASK << CQSPI_REG_CONFIG_BAUD_LSB); + reg |= (div & CQSPI_REG_CONFIG_BAUD_MASK) << CQSPI_REG_CONFIG_BAUD_LSB; + writel(reg, reg_base + CQSPI_REG_CONFIG); +} + +static void cqspi_readdata_capture(struct cqspi_st *cqspi, + const unsigned int bypass, + const unsigned int delay) +{ + void __iomem *reg_base = cqspi->iobase; + unsigned int reg; + + reg = readl(reg_base + CQSPI_REG_READCAPTURE); + + if (bypass) + reg |= (1 << CQSPI_REG_READCAPTURE_BYPASS_LSB); + else + reg &= ~(1 << CQSPI_REG_READCAPTURE_BYPASS_LSB); + + reg &= ~(CQSPI_REG_READCAPTURE_DELAY_MASK + << CQSPI_REG_READCAPTURE_DELAY_LSB); + + reg |= (delay & CQSPI_REG_READCAPTURE_DELAY_MASK) + << CQSPI_REG_READCAPTURE_DELAY_LSB; + + writel(reg, reg_base + CQSPI_REG_READCAPTURE); +} + +static void cqspi_controller_enable(struct cqspi_st *cqspi, bool enable) +{ + void __iomem *reg_base = cqspi->iobase; + unsigned int reg; + + reg = readl(reg_base + CQSPI_REG_CONFIG); + + if (enable) + reg |= CQSPI_REG_CONFIG_ENABLE_MASK; + else + reg &= ~CQSPI_REG_CONFIG_ENABLE_MASK; + + writel(reg, reg_base + CQSPI_REG_CONFIG); +} + +static void cqspi_configure(struct spi_nor *nor) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + const unsigned int sclk = f_pdata->clk_rate; + int switch_cs = (cqspi->current_cs != f_pdata->cs); + int switch_ck = (cqspi->sclk != sclk); + + if ((cqspi->current_page_size != nor->page_size) || + (cqspi->current_erase_size != nor->mtd.erasesize) || + (cqspi->current_addr_width != nor->addr_width)) + switch_cs = 1; + + if (switch_cs || switch_ck) + cqspi_controller_enable(cqspi, 0); + + /* Switch chip select. */ + if (switch_cs) { + cqspi->current_cs = f_pdata->cs; + cqspi_configure_cs_and_sizes(nor); + } + + /* Setup baudrate divisor and delays */ + if (switch_ck) { + cqspi->sclk = sclk; + cqspi_config_baudrate_div(cqspi); + cqspi_delay(nor); + cqspi_readdata_capture(cqspi, 1, f_pdata->read_delay); + } + + if (switch_cs || switch_ck) + cqspi_controller_enable(cqspi, 1); +} + +static int cqspi_set_protocol(struct spi_nor *nor, const int read) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + + f_pdata->inst_width = CQSPI_INST_TYPE_SINGLE; + f_pdata->addr_width = CQSPI_INST_TYPE_SINGLE; + f_pdata->data_width = CQSPI_INST_TYPE_SINGLE; + + if (read) { + switch (nor->flash_read) { + case SPI_NOR_NORMAL: + case SPI_NOR_FAST: + f_pdata->data_width = CQSPI_INST_TYPE_SINGLE; + break; + case SPI_NOR_DUAL: + f_pdata->data_width = CQSPI_INST_TYPE_DUAL; + break; + case SPI_NOR_QUAD: + f_pdata->data_width = CQSPI_INST_TYPE_QUAD; + break; + default: + return -EINVAL; + } + } + + cqspi_configure(nor); + + return 0; +} + +static ssize_t cqspi_write(struct spi_nor *nor, loff_t to, + size_t len, const u_char *buf) +{ + int ret; + + ret = cqspi_set_protocol(nor, 0); + if (ret) + return ret; + + ret = cqspi_indirect_write_setup(nor, to); + if (ret) + return ret; + + ret = cqspi_indirect_write_execute(nor, buf, len); + if (ret) + return ret; + + return (ret < 0) ? ret : len; +} + +static ssize_t cqspi_read(struct spi_nor *nor, loff_t from, + size_t len, u_char *buf) +{ + int ret; + + ret = cqspi_set_protocol(nor, 1); + if (ret) + return ret; + + ret = cqspi_indirect_read_setup(nor, from); + if (ret) + return ret; + + ret = cqspi_indirect_read_execute(nor, buf, len); + if (ret) + return ret; + + return (ret < 0) ? ret : len; +} + +static int cqspi_erase(struct spi_nor *nor, loff_t offs) +{ + int ret; + + ret = cqspi_set_protocol(nor, 0); + if (ret) + return ret; + + /* Send write enable, then erase commands. */ + ret = nor->write_reg(nor, SPINOR_OP_WREN, NULL, 0); + if (ret) + return ret; + + /* Set up command buffer. */ + ret = cqspi_command_write_addr(nor, nor->erase_opcode, offs); + if (ret) + return ret; + + return 0; +} + +static int cqspi_prep(struct spi_nor *nor, enum spi_nor_ops ops) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + + mutex_lock(&cqspi->bus_mutex); + + return 0; +} + +static void cqspi_unprep(struct spi_nor *nor, enum spi_nor_ops ops) +{ + struct cqspi_flash_pdata *f_pdata = nor->priv; + struct cqspi_st *cqspi = f_pdata->cqspi; + + mutex_unlock(&cqspi->bus_mutex); +} + +static int cqspi_read_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len) +{ + int ret; + + ret = cqspi_set_protocol(nor, 0); + if (!ret) + ret = cqspi_command_read(nor, &opcode, 1, buf, len); + + return ret; +} + +static int cqspi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len) +{ + int ret; + + ret = cqspi_set_protocol(nor, 0); + if (!ret) + ret = cqspi_command_write(nor, opcode, buf, len); + + return ret; +} + +static int cqspi_of_get_flash_pdata(struct platform_device *pdev, + struct cqspi_flash_pdata *f_pdata, + struct device_node *np) +{ + if (of_property_read_u32(np, "cdns,read-delay", &f_pdata->read_delay)) { + dev_err(&pdev->dev, "couldn't determine read-delay\n"); + return -ENXIO; + } + + if (of_property_read_u32(np, "cdns,tshsl-ns", &f_pdata->tshsl_ns)) { + dev_err(&pdev->dev, "couldn't determine tshsl-ns\n"); + return -ENXIO; + } + + if (of_property_read_u32(np, "cdns,tsd2d-ns", &f_pdata->tsd2d_ns)) { + dev_err(&pdev->dev, "couldn't determine tsd2d-ns\n"); + return -ENXIO; + } + + if (of_property_read_u32(np, "cdns,tchsh-ns", &f_pdata->tchsh_ns)) { + dev_err(&pdev->dev, "couldn't determine tchsh-ns\n"); + return -ENXIO; + } + + if (of_property_read_u32(np, "cdns,tslch-ns", &f_pdata->tslch_ns)) { + dev_err(&pdev->dev, "couldn't determine tslch-ns\n"); + return -ENXIO; + } + + if (of_property_read_u32(np, "spi-max-frequency", &f_pdata->clk_rate)) { + dev_err(&pdev->dev, "couldn't determine spi-max-frequency\n"); + return -ENXIO; + } + + return 0; +} + +static int cqspi_of_get_pdata(struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + struct cqspi_st *cqspi = platform_get_drvdata(pdev); + + cqspi->is_decoded_cs = of_property_read_bool(np, "cdns,is-decoded-cs"); + + if (of_property_read_u32(np, "cdns,fifo-depth", &cqspi->fifo_depth)) { + dev_err(&pdev->dev, "couldn't determine fifo-depth\n"); + return -ENXIO; + } + + if (of_property_read_u32(np, "cdns,fifo-width", &cqspi->fifo_width)) { + dev_err(&pdev->dev, "couldn't determine fifo-width\n"); + return -ENXIO; + } + + if (of_property_read_u32(np, "cdns,trigger-address", + &cqspi->trigger_address)) { + dev_err(&pdev->dev, "couldn't determine trigger-address\n"); + return -ENXIO; + } + + return 0; +} + +static void cqspi_controller_init(struct cqspi_st *cqspi) +{ + cqspi_controller_enable(cqspi, 0); + + /* Configure the remap address register, no remap */ + writel(0, cqspi->iobase + CQSPI_REG_REMAP); + + /* Disable all interrupts. */ + writel(0, cqspi->iobase + CQSPI_REG_IRQMASK); + + /* Configure the SRAM split to 1:1 . */ + writel(cqspi->fifo_depth / 2, cqspi->iobase + CQSPI_REG_SRAMPARTITION); + + /* Load indirect trigger address. */ + writel(cqspi->trigger_address, + cqspi->iobase + CQSPI_REG_INDIRECTTRIGGER); + + /* Program read watermark -- 1/2 of the FIFO. */ + writel(cqspi->fifo_depth * cqspi->fifo_width / 2, + cqspi->iobase + CQSPI_REG_INDIRECTRDWATERMARK); + /* Program write watermark -- 1/8 of the FIFO. */ + writel(cqspi->fifo_depth * cqspi->fifo_width / 8, + cqspi->iobase + CQSPI_REG_INDIRECTWRWATERMARK); + + cqspi_controller_enable(cqspi, 1); +} + +static int cqspi_setup_flash(struct cqspi_st *cqspi, struct device_node *np) +{ + struct platform_device *pdev = cqspi->pdev; + struct device *dev = &pdev->dev; + struct cqspi_flash_pdata *f_pdata; + struct spi_nor *nor; + struct mtd_info *mtd; + unsigned int cs; + int i, ret; + + /* Get flash device data */ + for_each_available_child_of_node(dev->of_node, np) { + if (of_property_read_u32(np, "reg", &cs)) { + dev_err(dev, "Couldn't determine chip select.\n"); + goto err; + } + + if (cs > CQSPI_MAX_CHIPSELECT) { + dev_err(dev, "Chip select %d out of range.\n", cs); + goto err; + } + + f_pdata = &cqspi->f_pdata[cs]; + f_pdata->cqspi = cqspi; + f_pdata->cs = cs; + + ret = cqspi_of_get_flash_pdata(pdev, f_pdata, np); + if (ret) + goto err; + + nor = &f_pdata->nor; + mtd = &nor->mtd; + + mtd->priv = nor; + + nor->dev = dev; + spi_nor_set_flash_node(nor, np); + nor->priv = f_pdata; + + nor->read_reg = cqspi_read_reg; + nor->write_reg = cqspi_write_reg; + nor->read = cqspi_read; + nor->write = cqspi_write; + nor->erase = cqspi_erase; + nor->prepare = cqspi_prep; + nor->unprepare = cqspi_unprep; + + mtd->name = devm_kasprintf(dev, GFP_KERNEL, "%s.%d", + dev_name(dev), cs); + if (!mtd->name) { + ret = -ENOMEM; + goto err; + } + + ret = spi_nor_scan(nor, NULL, SPI_NOR_QUAD); + if (ret) + goto err; + + ret = mtd_device_register(mtd, NULL, 0); + if (ret) + goto err; + + f_pdata->registered = true; + } + + return 0; + +err: + for (i = 0; i < CQSPI_MAX_CHIPSELECT; i++) + if (cqspi->f_pdata[i].registered) + mtd_device_unregister(&cqspi->f_pdata[i].nor.mtd); + return ret; +} + +static int cqspi_probe(struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + struct device *dev = &pdev->dev; + struct cqspi_st *cqspi; + struct resource *res; + struct resource *res_ahb; + int ret; + int irq; + + cqspi = devm_kzalloc(dev, sizeof(*cqspi), GFP_KERNEL); + if (!cqspi) + return -ENOMEM; + + mutex_init(&cqspi->bus_mutex); + cqspi->pdev = pdev; + platform_set_drvdata(pdev, cqspi); + + /* Obtain configuration from OF. */ + ret = cqspi_of_get_pdata(pdev); + if (ret) { + dev_err(dev, "Cannot get mandatory OF data.\n"); + return -ENODEV; + } + + /* Obtain QSPI clock. */ + cqspi->clk = devm_clk_get(dev, NULL); + if (IS_ERR(cqspi->clk)) { + dev_err(dev, "Cannot claim QSPI clock.\n"); + return PTR_ERR(cqspi->clk); + } + + /* Obtain and remap controller address. */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + cqspi->iobase = devm_ioremap_resource(dev, res); + if (IS_ERR(cqspi->iobase)) { + dev_err(dev, "Cannot remap controller address.\n"); + return PTR_ERR(cqspi->iobase); + } + + /* Obtain and remap AHB address. */ + res_ahb = platform_get_resource(pdev, IORESOURCE_MEM, 1); + cqspi->ahb_base = devm_ioremap_resource(dev, res_ahb); + if (IS_ERR(cqspi->ahb_base)) { + dev_err(dev, "Cannot remap AHB address.\n"); + return PTR_ERR(cqspi->ahb_base); + } + + init_completion(&cqspi->transfer_complete); + + /* Obtain IRQ line. */ + irq = platform_get_irq(pdev, 0); + if (irq < 0) { + dev_err(dev, "Cannot obtain IRQ.\n"); + return -ENXIO; + } + + ret = clk_prepare_enable(cqspi->clk); + if (ret) { + dev_err(dev, "Cannot enable QSPI clock.\n"); + return ret; + } + + cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk); + + ret = devm_request_irq(dev, irq, cqspi_irq_handler, 0, + pdev->name, cqspi); + if (ret) { + dev_err(dev, "Cannot request IRQ.\n"); + goto probe_irq_failed; + } + + cqspi_wait_idle(cqspi); + cqspi_controller_init(cqspi); + cqspi->current_cs = -1; + cqspi->sclk = 0; + + ret = cqspi_setup_flash(cqspi, np); + if (ret) { + dev_err(dev, "Cadence QSPI NOR probe failed %d\n", ret); + goto probe_setup_failed; + } + + return ret; +probe_irq_failed: + cqspi_controller_enable(cqspi, 0); +probe_setup_failed: + clk_disable_unprepare(cqspi->clk); + return ret; +} + +static int cqspi_remove(struct platform_device *pdev) +{ + struct cqspi_st *cqspi = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < CQSPI_MAX_CHIPSELECT; i++) + if (cqspi->f_pdata[i].registered) + mtd_device_unregister(&cqspi->f_pdata[i].nor.mtd); + + cqspi_controller_enable(cqspi, 0); + + clk_disable_unprepare(cqspi->clk); + + return 0; +} + +#ifdef CONFIG_PM_SLEEP +static int cqspi_suspend(struct device *dev) +{ + struct cqspi_st *cqspi = dev_get_drvdata(dev); + + cqspi_controller_enable(cqspi, 0); + return 0; +} + +static int cqspi_resume(struct device *dev) +{ + struct cqspi_st *cqspi = dev_get_drvdata(dev); + + cqspi_controller_enable(cqspi, 1); + return 0; +} + +static const struct dev_pm_ops cqspi__dev_pm_ops = { + .suspend = cqspi_suspend, + .resume = cqspi_resume, +}; + +#define CQSPI_DEV_PM_OPS (&cqspi__dev_pm_ops) +#else +#define CQSPI_DEV_PM_OPS NULL +#endif + +static struct of_device_id const cqspi_dt_ids[] = { + {.compatible = "cdns,qspi-nor",}, + { /* end of table */ } +}; + +MODULE_DEVICE_TABLE(of, cqspi_dt_ids); + +static struct platform_driver cqspi_platform_driver = { + .probe = cqspi_probe, + .remove = cqspi_remove, + .driver = { + .name = CQSPI_NAME, + .pm = CQSPI_DEV_PM_OPS, + .of_match_table = cqspi_dt_ids, + }, +}; + +module_platform_driver(cqspi_platform_driver); + +MODULE_DESCRIPTION("Cadence QSPI Controller Driver"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:" CQSPI_NAME); +MODULE_AUTHOR("Ley Foon Tan "); +MODULE_AUTHOR("Graham Moore "); diff --git a/drivers/mtd/spi-nor/fsl-quadspi.c b/drivers/mtd/spi-nor/fsl-quadspi.c index 9ab2b51d54b8..5c82e4ef1904 100644 --- a/drivers/mtd/spi-nor/fsl-quadspi.c +++ b/drivers/mtd/spi-nor/fsl-quadspi.c @@ -618,9 +618,9 @@ static inline void fsl_qspi_invalid(struct fsl_qspi *q) qspi_writel(q, reg, q->iobase + QUADSPI_MCR); } -static int fsl_qspi_nor_write(struct fsl_qspi *q, struct spi_nor *nor, +static ssize_t fsl_qspi_nor_write(struct fsl_qspi *q, struct spi_nor *nor, u8 opcode, unsigned int to, u32 *txbuf, - unsigned count, size_t *retlen) + unsigned count) { int ret, i, j; u32 tmp; @@ -647,8 +647,8 @@ static int fsl_qspi_nor_write(struct fsl_qspi *q, struct spi_nor *nor, /* Trigger it */ ret = fsl_qspi_runcmd(q, opcode, to, count); - if (ret == 0 && retlen) - *retlen += count; + if (ret == 0) + return count; return ret; } @@ -859,7 +859,9 @@ static int fsl_qspi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len) } else if (len > 0) { ret = fsl_qspi_nor_write(q, nor, opcode, 0, - (u32 *)buf, len, NULL); + (u32 *)buf, len); + if (ret > 0) + return 0; } else { dev_err(q->dev, "invalid cmd %d\n", opcode); ret = -EINVAL; @@ -868,20 +870,20 @@ static int fsl_qspi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len) return ret; } -static void fsl_qspi_write(struct spi_nor *nor, loff_t to, - size_t len, size_t *retlen, const u_char *buf) +static ssize_t fsl_qspi_write(struct spi_nor *nor, loff_t to, + size_t len, const u_char *buf) { struct fsl_qspi *q = nor->priv; - - fsl_qspi_nor_write(q, nor, nor->program_opcode, to, - (u32 *)buf, len, retlen); + ssize_t ret = fsl_qspi_nor_write(q, nor, nor->program_opcode, to, + (u32 *)buf, len); /* invalid the data in the AHB buffer. */ fsl_qspi_invalid(q); + return ret; } -static int fsl_qspi_read(struct spi_nor *nor, loff_t from, - size_t len, size_t *retlen, u_char *buf) +static ssize_t fsl_qspi_read(struct spi_nor *nor, loff_t from, + size_t len, u_char *buf) { struct fsl_qspi *q = nor->priv; u8 cmd = nor->read_opcode; @@ -923,8 +925,7 @@ static int fsl_qspi_read(struct spi_nor *nor, loff_t from, memcpy(buf, q->ahb_addr + q->chip_base_addr + from - q->memmap_offs, len); - *retlen += len; - return 0; + return len; } static int fsl_qspi_erase(struct spi_nor *nor, loff_t offs) diff --git a/drivers/mtd/spi-nor/hisi-sfc.c b/drivers/mtd/spi-nor/hisi-sfc.c new file mode 100644 index 000000000000..20378b0d55e9 --- /dev/null +++ b/drivers/mtd/spi-nor/hisi-sfc.c @@ -0,0 +1,489 @@ +/* + * HiSilicon SPI Nor Flash Controller Driver + * + * Copyright (c) 2015-2016 HiSilicon Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Hardware register offsets and field definitions */ +#define FMC_CFG 0x00 +#define FMC_CFG_OP_MODE_MASK BIT_MASK(0) +#define FMC_CFG_OP_MODE_BOOT 0 +#define FMC_CFG_OP_MODE_NORMAL 1 +#define FMC_CFG_FLASH_SEL(type) (((type) & 0x3) << 1) +#define FMC_CFG_FLASH_SEL_MASK 0x6 +#define FMC_ECC_TYPE(type) (((type) & 0x7) << 5) +#define FMC_ECC_TYPE_MASK GENMASK(7, 5) +#define SPI_NOR_ADDR_MODE_MASK BIT_MASK(10) +#define SPI_NOR_ADDR_MODE_3BYTES (0x0 << 10) +#define SPI_NOR_ADDR_MODE_4BYTES (0x1 << 10) +#define FMC_GLOBAL_CFG 0x04 +#define FMC_GLOBAL_CFG_WP_ENABLE BIT(6) +#define FMC_SPI_TIMING_CFG 0x08 +#define TIMING_CFG_TCSH(nr) (((nr) & 0xf) << 8) +#define TIMING_CFG_TCSS(nr) (((nr) & 0xf) << 4) +#define TIMING_CFG_TSHSL(nr) ((nr) & 0xf) +#define CS_HOLD_TIME 0x6 +#define CS_SETUP_TIME 0x6 +#define CS_DESELECT_TIME 0xf +#define FMC_INT 0x18 +#define FMC_INT_OP_DONE BIT(0) +#define FMC_INT_CLR 0x20 +#define FMC_CMD 0x24 +#define FMC_CMD_CMD1(cmd) ((cmd) & 0xff) +#define FMC_ADDRL 0x2c +#define FMC_OP_CFG 0x30 +#define OP_CFG_FM_CS(cs) ((cs) << 11) +#define OP_CFG_MEM_IF_TYPE(type) (((type) & 0x7) << 7) +#define OP_CFG_ADDR_NUM(addr) (((addr) & 0x7) << 4) +#define OP_CFG_DUMMY_NUM(dummy) ((dummy) & 0xf) +#define FMC_DATA_NUM 0x38 +#define FMC_DATA_NUM_CNT(cnt) ((cnt) & GENMASK(13, 0)) +#define FMC_OP 0x3c +#define FMC_OP_DUMMY_EN BIT(8) +#define FMC_OP_CMD1_EN BIT(7) +#define FMC_OP_ADDR_EN BIT(6) +#define FMC_OP_WRITE_DATA_EN BIT(5) +#define FMC_OP_READ_DATA_EN BIT(2) +#define FMC_OP_READ_STATUS_EN BIT(1) +#define FMC_OP_REG_OP_START BIT(0) +#define FMC_DMA_LEN 0x40 +#define FMC_DMA_LEN_SET(len) ((len) & GENMASK(27, 0)) +#define FMC_DMA_SADDR_D0 0x4c +#define HIFMC_DMA_MAX_LEN (4096) +#define HIFMC_DMA_MASK (HIFMC_DMA_MAX_LEN - 1) +#define FMC_OP_DMA 0x68 +#define OP_CTRL_RD_OPCODE(code) (((code) & 0xff) << 16) +#define OP_CTRL_WR_OPCODE(code) (((code) & 0xff) << 8) +#define OP_CTRL_RW_OP(op) ((op) << 1) +#define OP_CTRL_DMA_OP_READY BIT(0) +#define FMC_OP_READ 0x0 +#define FMC_OP_WRITE 0x1 +#define FMC_WAIT_TIMEOUT 1000000 + +enum hifmc_iftype { + IF_TYPE_STD, + IF_TYPE_DUAL, + IF_TYPE_DIO, + IF_TYPE_QUAD, + IF_TYPE_QIO, +}; + +struct hifmc_priv { + u32 chipselect; + u32 clkrate; + struct hifmc_host *host; +}; + +#define HIFMC_MAX_CHIP_NUM 2 +struct hifmc_host { + struct device *dev; + struct mutex lock; + + void __iomem *regbase; + void __iomem *iobase; + struct clk *clk; + void *buffer; + dma_addr_t dma_buffer; + + struct spi_nor *nor[HIFMC_MAX_CHIP_NUM]; + u32 num_chip; +}; + +static inline int wait_op_finish(struct hifmc_host *host) +{ + u32 reg; + + return readl_poll_timeout(host->regbase + FMC_INT, reg, + (reg & FMC_INT_OP_DONE), 0, FMC_WAIT_TIMEOUT); +} + +static int get_if_type(enum read_mode flash_read) +{ + enum hifmc_iftype if_type; + + switch (flash_read) { + case SPI_NOR_DUAL: + if_type = IF_TYPE_DUAL; + break; + case SPI_NOR_QUAD: + if_type = IF_TYPE_QUAD; + break; + case SPI_NOR_NORMAL: + case SPI_NOR_FAST: + default: + if_type = IF_TYPE_STD; + break; + } + + return if_type; +} + +static void hisi_spi_nor_init(struct hifmc_host *host) +{ + u32 reg; + + reg = TIMING_CFG_TCSH(CS_HOLD_TIME) + | TIMING_CFG_TCSS(CS_SETUP_TIME) + | TIMING_CFG_TSHSL(CS_DESELECT_TIME); + writel(reg, host->regbase + FMC_SPI_TIMING_CFG); +} + +static int hisi_spi_nor_prep(struct spi_nor *nor, enum spi_nor_ops ops) +{ + struct hifmc_priv *priv = nor->priv; + struct hifmc_host *host = priv->host; + int ret; + + mutex_lock(&host->lock); + + ret = clk_set_rate(host->clk, priv->clkrate); + if (ret) + goto out; + + ret = clk_prepare_enable(host->clk); + if (ret) + goto out; + + return 0; + +out: + mutex_unlock(&host->lock); + return ret; +} + +static void hisi_spi_nor_unprep(struct spi_nor *nor, enum spi_nor_ops ops) +{ + struct hifmc_priv *priv = nor->priv; + struct hifmc_host *host = priv->host; + + clk_disable_unprepare(host->clk); + mutex_unlock(&host->lock); +} + +static int hisi_spi_nor_op_reg(struct spi_nor *nor, + u8 opcode, int len, u8 optype) +{ + struct hifmc_priv *priv = nor->priv; + struct hifmc_host *host = priv->host; + u32 reg; + + reg = FMC_CMD_CMD1(opcode); + writel(reg, host->regbase + FMC_CMD); + + reg = FMC_DATA_NUM_CNT(len); + writel(reg, host->regbase + FMC_DATA_NUM); + + reg = OP_CFG_FM_CS(priv->chipselect); + writel(reg, host->regbase + FMC_OP_CFG); + + writel(0xff, host->regbase + FMC_INT_CLR); + reg = FMC_OP_CMD1_EN | FMC_OP_REG_OP_START | optype; + writel(reg, host->regbase + FMC_OP); + + return wait_op_finish(host); +} + +static int hisi_spi_nor_read_reg(struct spi_nor *nor, u8 opcode, u8 *buf, + int len) +{ + struct hifmc_priv *priv = nor->priv; + struct hifmc_host *host = priv->host; + int ret; + + ret = hisi_spi_nor_op_reg(nor, opcode, len, FMC_OP_READ_DATA_EN); + if (ret) + return ret; + + memcpy_fromio(buf, host->iobase, len); + return 0; +} + +static int hisi_spi_nor_write_reg(struct spi_nor *nor, u8 opcode, + u8 *buf, int len) +{ + struct hifmc_priv *priv = nor->priv; + struct hifmc_host *host = priv->host; + + if (len) + memcpy_toio(host->iobase, buf, len); + + return hisi_spi_nor_op_reg(nor, opcode, len, FMC_OP_WRITE_DATA_EN); +} + +static int hisi_spi_nor_dma_transfer(struct spi_nor *nor, loff_t start_off, + dma_addr_t dma_buf, size_t len, u8 op_type) +{ + struct hifmc_priv *priv = nor->priv; + struct hifmc_host *host = priv->host; + u8 if_type = 0; + u32 reg; + + reg = readl(host->regbase + FMC_CFG); + reg &= ~(FMC_CFG_OP_MODE_MASK | SPI_NOR_ADDR_MODE_MASK); + reg |= FMC_CFG_OP_MODE_NORMAL; + reg |= (nor->addr_width == 4) ? SPI_NOR_ADDR_MODE_4BYTES + : SPI_NOR_ADDR_MODE_3BYTES; + writel(reg, host->regbase + FMC_CFG); + + writel(start_off, host->regbase + FMC_ADDRL); + writel(dma_buf, host->regbase + FMC_DMA_SADDR_D0); + writel(FMC_DMA_LEN_SET(len), host->regbase + FMC_DMA_LEN); + + reg = OP_CFG_FM_CS(priv->chipselect); + if_type = get_if_type(nor->flash_read); + reg |= OP_CFG_MEM_IF_TYPE(if_type); + if (op_type == FMC_OP_READ) + reg |= OP_CFG_DUMMY_NUM(nor->read_dummy >> 3); + writel(reg, host->regbase + FMC_OP_CFG); + + writel(0xff, host->regbase + FMC_INT_CLR); + reg = OP_CTRL_RW_OP(op_type) | OP_CTRL_DMA_OP_READY; + reg |= (op_type == FMC_OP_READ) + ? OP_CTRL_RD_OPCODE(nor->read_opcode) + : OP_CTRL_WR_OPCODE(nor->program_opcode); + writel(reg, host->regbase + FMC_OP_DMA); + + return wait_op_finish(host); +} + +static ssize_t hisi_spi_nor_read(struct spi_nor *nor, loff_t from, size_t len, + u_char *read_buf) +{ + struct hifmc_priv *priv = nor->priv; + struct hifmc_host *host = priv->host; + size_t offset; + int ret; + + for (offset = 0; offset < len; offset += HIFMC_DMA_MAX_LEN) { + size_t trans = min_t(size_t, HIFMC_DMA_MAX_LEN, len - offset); + + ret = hisi_spi_nor_dma_transfer(nor, + from + offset, host->dma_buffer, trans, FMC_OP_READ); + if (ret) { + dev_warn(nor->dev, "DMA read timeout\n"); + return ret; + } + memcpy(read_buf + offset, host->buffer, trans); + } + + return len; +} + +static ssize_t hisi_spi_nor_write(struct spi_nor *nor, loff_t to, + size_t len, const u_char *write_buf) +{ + struct hifmc_priv *priv = nor->priv; + struct hifmc_host *host = priv->host; + size_t offset; + int ret; + + for (offset = 0; offset < len; offset += HIFMC_DMA_MAX_LEN) { + size_t trans = min_t(size_t, HIFMC_DMA_MAX_LEN, len - offset); + + memcpy(host->buffer, write_buf + offset, trans); + ret = hisi_spi_nor_dma_transfer(nor, + to + offset, host->dma_buffer, trans, FMC_OP_WRITE); + if (ret) { + dev_warn(nor->dev, "DMA write timeout\n"); + return ret; + } + } + + return len; +} + +/** + * Get spi flash device information and register it as a mtd device. + */ +static int hisi_spi_nor_register(struct device_node *np, + struct hifmc_host *host) +{ + struct device *dev = host->dev; + struct spi_nor *nor; + struct hifmc_priv *priv; + struct mtd_info *mtd; + int ret; + + nor = devm_kzalloc(dev, sizeof(*nor), GFP_KERNEL); + if (!nor) + return -ENOMEM; + + nor->dev = dev; + spi_nor_set_flash_node(nor, np); + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + ret = of_property_read_u32(np, "reg", &priv->chipselect); + if (ret) { + dev_err(dev, "There's no reg property for %s\n", + np->full_name); + return ret; + } + + ret = of_property_read_u32(np, "spi-max-frequency", + &priv->clkrate); + if (ret) { + dev_err(dev, "There's no spi-max-frequency property for %s\n", + np->full_name); + return ret; + } + priv->host = host; + nor->priv = priv; + + nor->prepare = hisi_spi_nor_prep; + nor->unprepare = hisi_spi_nor_unprep; + nor->read_reg = hisi_spi_nor_read_reg; + nor->write_reg = hisi_spi_nor_write_reg; + nor->read = hisi_spi_nor_read; + nor->write = hisi_spi_nor_write; + nor->erase = NULL; + ret = spi_nor_scan(nor, NULL, SPI_NOR_QUAD); + if (ret) + return ret; + + mtd = &nor->mtd; + mtd->name = np->name; + ret = mtd_device_register(mtd, NULL, 0); + if (ret) + return ret; + + host->nor[host->num_chip] = nor; + host->num_chip++; + return 0; +} + +static void hisi_spi_nor_unregister_all(struct hifmc_host *host) +{ + int i; + + for (i = 0; i < host->num_chip; i++) + mtd_device_unregister(&host->nor[i]->mtd); +} + +static int hisi_spi_nor_register_all(struct hifmc_host *host) +{ + struct device *dev = host->dev; + struct device_node *np; + int ret; + + for_each_available_child_of_node(dev->of_node, np) { + ret = hisi_spi_nor_register(np, host); + if (ret) + goto fail; + + if (host->num_chip == HIFMC_MAX_CHIP_NUM) { + dev_warn(dev, "Flash device number exceeds the maximum chipselect number\n"); + break; + } + } + + return 0; + +fail: + hisi_spi_nor_unregister_all(host); + return ret; +} + +static int hisi_spi_nor_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct resource *res; + struct hifmc_host *host; + int ret; + + host = devm_kzalloc(dev, sizeof(*host), GFP_KERNEL); + if (!host) + return -ENOMEM; + + platform_set_drvdata(pdev, host); + host->dev = dev; + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "control"); + host->regbase = devm_ioremap_resource(dev, res); + if (IS_ERR(host->regbase)) + return PTR_ERR(host->regbase); + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "memory"); + host->iobase = devm_ioremap_resource(dev, res); + if (IS_ERR(host->iobase)) + return PTR_ERR(host->iobase); + + host->clk = devm_clk_get(dev, NULL); + if (IS_ERR(host->clk)) + return PTR_ERR(host->clk); + + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)); + if (ret) { + dev_warn(dev, "Unable to set dma mask\n"); + return ret; + } + + host->buffer = dmam_alloc_coherent(dev, HIFMC_DMA_MAX_LEN, + &host->dma_buffer, GFP_KERNEL); + if (!host->buffer) + return -ENOMEM; + + mutex_init(&host->lock); + clk_prepare_enable(host->clk); + hisi_spi_nor_init(host); + ret = hisi_spi_nor_register_all(host); + if (ret) + mutex_destroy(&host->lock); + + clk_disable_unprepare(host->clk); + return ret; +} + +static int hisi_spi_nor_remove(struct platform_device *pdev) +{ + struct hifmc_host *host = platform_get_drvdata(pdev); + + hisi_spi_nor_unregister_all(host); + mutex_destroy(&host->lock); + clk_disable_unprepare(host->clk); + return 0; +} + +static const struct of_device_id hisi_spi_nor_dt_ids[] = { + { .compatible = "hisilicon,fmc-spi-nor"}, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, hisi_spi_nor_dt_ids); + +static struct platform_driver hisi_spi_nor_driver = { + .driver = { + .name = "hisi-sfc", + .of_match_table = hisi_spi_nor_dt_ids, + }, + .probe = hisi_spi_nor_probe, + .remove = hisi_spi_nor_remove, +}; +module_platform_driver(hisi_spi_nor_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("HiSilicon SPI Nor Flash Controller Driver"); diff --git a/drivers/mtd/spi-nor/mtk-quadspi.c b/drivers/mtd/spi-nor/mtk-quadspi.c index 8bed1a4cb79c..e661877c23de 100644 --- a/drivers/mtd/spi-nor/mtk-quadspi.c +++ b/drivers/mtd/spi-nor/mtk-quadspi.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -243,8 +242,8 @@ static void mt8173_nor_set_addr(struct mt8173_nor *mt8173_nor, u32 addr) writeb(addr & 0xff, mt8173_nor->base + MTK_NOR_RADR3_REG); } -static int mt8173_nor_read(struct spi_nor *nor, loff_t from, size_t length, - size_t *retlen, u_char *buffer) +static ssize_t mt8173_nor_read(struct spi_nor *nor, loff_t from, size_t length, + u_char *buffer) { int i, ret; int addr = (int)from; @@ -255,13 +254,13 @@ static int mt8173_nor_read(struct spi_nor *nor, loff_t from, size_t length, mt8173_nor_set_read_mode(mt8173_nor); mt8173_nor_set_addr(mt8173_nor, addr); - for (i = 0; i < length; i++, (*retlen)++) { + for (i = 0; i < length; i++) { ret = mt8173_nor_execute_cmd(mt8173_nor, MTK_NOR_PIO_READ_CMD); if (ret < 0) return ret; buf[i] = readb(mt8173_nor->base + MTK_NOR_RDATA_REG); } - return 0; + return length; } static int mt8173_nor_write_single_byte(struct mt8173_nor *mt8173_nor, @@ -297,36 +296,44 @@ static int mt8173_nor_write_buffer(struct mt8173_nor *mt8173_nor, int addr, return mt8173_nor_execute_cmd(mt8173_nor, MTK_NOR_WR_CMD); } -static void mt8173_nor_write(struct spi_nor *nor, loff_t to, size_t len, - size_t *retlen, const u_char *buf) +static ssize_t mt8173_nor_write(struct spi_nor *nor, loff_t to, size_t len, + const u_char *buf) { int ret; struct mt8173_nor *mt8173_nor = nor->priv; + size_t i; ret = mt8173_nor_write_buffer_enable(mt8173_nor); - if (ret < 0) + if (ret < 0) { dev_warn(mt8173_nor->dev, "write buffer enable failed!\n"); + return ret; + } - while (len >= SFLASH_WRBUF_SIZE) { + for (i = 0; i + SFLASH_WRBUF_SIZE <= len; i += SFLASH_WRBUF_SIZE) { ret = mt8173_nor_write_buffer(mt8173_nor, to, buf); - if (ret < 0) + if (ret < 0) { dev_err(mt8173_nor->dev, "write buffer failed!\n"); - len -= SFLASH_WRBUF_SIZE; + return ret; + } to += SFLASH_WRBUF_SIZE; buf += SFLASH_WRBUF_SIZE; - (*retlen) += SFLASH_WRBUF_SIZE; } ret = mt8173_nor_write_buffer_disable(mt8173_nor); - if (ret < 0) + if (ret < 0) { dev_warn(mt8173_nor->dev, "write buffer disable failed!\n"); - - if (len) { - ret = mt8173_nor_write_single_byte(mt8173_nor, to, (int)len, - (u8 *)buf); - if (ret < 0) - dev_err(mt8173_nor->dev, "write single byte failed!\n"); - (*retlen) += len; + return ret; } + + if (i < len) { + ret = mt8173_nor_write_single_byte(mt8173_nor, to, + (int)(len - i), (u8 *)buf); + if (ret < 0) { + dev_err(mt8173_nor->dev, "write single byte failed!\n"); + return ret; + } + } + + return len; } static int mt8173_nor_read_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len) diff --git a/drivers/mtd/spi-nor/nxp-spifi.c b/drivers/mtd/spi-nor/nxp-spifi.c index ae428cb0e04b..73a14f40928b 100644 --- a/drivers/mtd/spi-nor/nxp-spifi.c +++ b/drivers/mtd/spi-nor/nxp-spifi.c @@ -172,8 +172,8 @@ static int nxp_spifi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len) return nxp_spifi_wait_for_cmd(spifi); } -static int nxp_spifi_read(struct spi_nor *nor, loff_t from, size_t len, - size_t *retlen, u_char *buf) +static ssize_t nxp_spifi_read(struct spi_nor *nor, loff_t from, size_t len, + u_char *buf) { struct nxp_spifi *spifi = nor->priv; int ret; @@ -183,24 +183,23 @@ static int nxp_spifi_read(struct spi_nor *nor, loff_t from, size_t len, return ret; memcpy_fromio(buf, spifi->flash_base + from, len); - *retlen += len; - return 0; + return len; } -static void nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len, - size_t *retlen, const u_char *buf) +static ssize_t nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len, + const u_char *buf) { struct nxp_spifi *spifi = nor->priv; u32 cmd; int ret; + size_t i; ret = nxp_spifi_set_memory_mode_off(spifi); if (ret) - return; + return ret; writel(to, spifi->io_base + SPIFI_ADDR); - *retlen += len; cmd = SPIFI_CMD_DOUT | SPIFI_CMD_DATALEN(len) | @@ -209,10 +208,14 @@ static void nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len, SPIFI_CMD_FRAMEFORM(spifi->nor.addr_width + 1); writel(cmd, spifi->io_base + SPIFI_CMD); - while (len--) - writeb(*buf++, spifi->io_base + SPIFI_DATA); + for (i = 0; i < len; i++) + writeb(buf[i], spifi->io_base + SPIFI_DATA); - nxp_spifi_wait_for_cmd(spifi); + ret = nxp_spifi_wait_for_cmd(spifi); + if (ret) + return ret; + + return len; } static int nxp_spifi_erase(struct spi_nor *nor, loff_t offs) diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index c52e45594bfd..d0fc165d7d66 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -661,7 +661,7 @@ static int stm_unlock(struct spi_nor *nor, loff_t ofs, uint64_t len) status_new = (status_old & ~mask & ~SR_TB) | val; /* Don't protect status register if we're fully unlocked */ - if (lock_len == mtd->size) + if (lock_len == 0) status_new &= ~SR_SRWD; if (!use_top) @@ -830,10 +830,26 @@ static const struct flash_info spi_nor_ids[] = { { "mb85rs1mt", INFO(0x047f27, 0, 128 * 1024, 1, SPI_NOR_NO_ERASE) }, /* GigaDevice */ - { "gd25q32", INFO(0xc84016, 0, 64 * 1024, 64, SECT_4K) }, - { "gd25q64", INFO(0xc84017, 0, 64 * 1024, 128, SECT_4K) }, - { "gd25lq64c", INFO(0xc86017, 0, 64 * 1024, 128, SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, - { "gd25q128", INFO(0xc84018, 0, 64 * 1024, 256, SECT_4K) }, + { + "gd25q32", INFO(0xc84016, 0, 64 * 1024, 64, + SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ | + SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB) + }, + { + "gd25q64", INFO(0xc84017, 0, 64 * 1024, 128, + SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ | + SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB) + }, + { + "gd25lq64c", INFO(0xc86017, 0, 64 * 1024, 128, + SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ | + SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB) + }, + { + "gd25q128", INFO(0xc84018, 0, 64 * 1024, 256, + SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ | + SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB) + }, /* Intel/Numonyx -- xxxs33b */ { "160s33b", INFO(0x898911, 0, 64 * 1024, 32, 0) }, @@ -871,6 +887,7 @@ static const struct flash_info spi_nor_ids[] = { { "n25q512a", INFO(0x20bb20, 0, 64 * 1024, 1024, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) }, { "n25q512ax3", INFO(0x20ba20, 0, 64 * 1024, 1024, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) }, { "n25q00", INFO(0x20ba21, 0, 64 * 1024, 2048, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) }, + { "n25q00a", INFO(0x20bb21, 0, 64 * 1024, 2048, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) }, /* PMC */ { "pm25lv512", INFO(0, 0, 32 * 1024, 2, SECT_4K_PMC) }, @@ -1031,8 +1048,25 @@ static int spi_nor_read(struct mtd_info *mtd, loff_t from, size_t len, if (ret) return ret; - ret = nor->read(nor, from, len, retlen, buf); + while (len) { + ret = nor->read(nor, from, len, buf); + if (ret == 0) { + /* We shouldn't see 0-length reads */ + ret = -EIO; + goto read_err; + } + if (ret < 0) + goto read_err; + WARN_ON(ret > len); + *retlen += ret; + buf += ret; + from += ret; + len -= ret; + } + ret = 0; + +read_err: spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_READ); return ret; } @@ -1060,10 +1094,14 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len, nor->program_opcode = SPINOR_OP_BP; /* write one byte. */ - nor->write(nor, to, 1, retlen, buf); + ret = nor->write(nor, to, 1, buf); + if (ret < 0) + goto sst_write_err; + WARN(ret != 1, "While writing 1 byte written %i bytes\n", + (int)ret); ret = spi_nor_wait_till_ready(nor); if (ret) - goto time_out; + goto sst_write_err; } to += actual; @@ -1072,10 +1110,14 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len, nor->program_opcode = SPINOR_OP_AAI_WP; /* write two bytes. */ - nor->write(nor, to, 2, retlen, buf + actual); + ret = nor->write(nor, to, 2, buf + actual); + if (ret < 0) + goto sst_write_err; + WARN(ret != 2, "While writing 2 bytes written %i bytes\n", + (int)ret); ret = spi_nor_wait_till_ready(nor); if (ret) - goto time_out; + goto sst_write_err; to += 2; nor->sst_write_second = true; } @@ -1084,21 +1126,26 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len, write_disable(nor); ret = spi_nor_wait_till_ready(nor); if (ret) - goto time_out; + goto sst_write_err; /* Write out trailing byte if it exists. */ if (actual != len) { write_enable(nor); nor->program_opcode = SPINOR_OP_BP; - nor->write(nor, to, 1, retlen, buf + actual); - + ret = nor->write(nor, to, 1, buf + actual); + if (ret < 0) + goto sst_write_err; + WARN(ret != 1, "While writing 1 byte written %i bytes\n", + (int)ret); ret = spi_nor_wait_till_ready(nor); if (ret) - goto time_out; + goto sst_write_err; write_disable(nor); + actual += 1; } -time_out: +sst_write_err: + *retlen += actual; spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_WRITE); return ret; } @@ -1112,8 +1159,8 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf) { struct spi_nor *nor = mtd_to_spi_nor(mtd); - u32 page_offset, page_size, i; - int ret; + size_t page_offset, page_remain, i; + ssize_t ret; dev_dbg(nor->dev, "to 0x%08x, len %zd\n", (u32)to, len); @@ -1121,35 +1168,37 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len, if (ret) return ret; - write_enable(nor); + for (i = 0; i < len; ) { + ssize_t written; - page_offset = to & (nor->page_size - 1); - - /* do all the bytes fit onto one page? */ - if (page_offset + len <= nor->page_size) { - nor->write(nor, to, len, retlen, buf); - } else { + page_offset = (to + i) & (nor->page_size - 1); + WARN_ONCE(page_offset, + "Writing at offset %zu into a NOR page. Writing partial pages may decrease reliability and increase wear of NOR flash.", + page_offset); /* the size of data remaining on the first page */ - page_size = nor->page_size - page_offset; - nor->write(nor, to, page_size, retlen, buf); + page_remain = min_t(size_t, + nor->page_size - page_offset, len - i); - /* write everything in nor->page_size chunks */ - for (i = page_size; i < len; i += page_size) { - page_size = len - i; - if (page_size > nor->page_size) - page_size = nor->page_size; + write_enable(nor); + ret = nor->write(nor, to + i, page_remain, buf + i); + if (ret < 0) + goto write_err; + written = ret; - ret = spi_nor_wait_till_ready(nor); - if (ret) - goto write_err; - - write_enable(nor); - - nor->write(nor, to + i, page_size, retlen, buf + i); + ret = spi_nor_wait_till_ready(nor); + if (ret) + goto write_err; + *retlen += written; + i += written; + if (written != page_remain) { + dev_err(nor->dev, + "While writing %zu bytes written %zd bytes\n", + page_remain, written); + ret = -EIO; + goto write_err; } } - ret = spi_nor_wait_till_ready(nor); write_err: spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_WRITE); return ret; diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c index daf82ba7aba0..41b13d1cdcc4 100644 --- a/drivers/mtd/ssfdc.c +++ b/drivers/mtd/ssfdc.c @@ -380,8 +380,7 @@ static int ssfdcr_readsect(struct mtd_blktrans_dev *dev, " block_addr=%d\n", logic_sect_no, sectors_per_block, offset, block_address); - if (block_address >= ssfdc->map_len) - BUG(); + BUG_ON(block_address >= ssfdc->map_len); block_address = ssfdc->logic_block_map[block_address]; diff --git a/drivers/mtd/tests/nandbiterrs.c b/drivers/mtd/tests/nandbiterrs.c index 09a4ccac53a2..f26dec896afa 100644 --- a/drivers/mtd/tests/nandbiterrs.c +++ b/drivers/mtd/tests/nandbiterrs.c @@ -290,7 +290,7 @@ static int overwrite_test(void) while (opno < max_overwrite) { - err = rewrite_page(0); + err = write_page(0); if (err) break; diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c index e708e360a9e3..6453148d066a 100644 --- a/drivers/net/ethernet/atheros/alx/main.c +++ b/drivers/net/ethernet/atheros/alx/main.c @@ -1251,7 +1251,7 @@ static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) struct alx_priv *alx; struct alx_hw *hw; bool phy_configured; - int bars, err; + int err; err = pci_enable_device_mem(pdev); if (err) @@ -1271,11 +1271,10 @@ static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } } - bars = pci_select_bars(pdev, IORESOURCE_MEM); - err = pci_request_selected_regions(pdev, bars, alx_drv_name); + err = pci_request_mem_regions(pdev, alx_drv_name); if (err) { dev_err(&pdev->dev, - "pci_request_selected_regions failed(bars:%d)\n", bars); + "pci_request_mem_regions failed\n"); goto out_pci_disable; } @@ -1401,7 +1400,7 @@ static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) out_free_netdev: free_netdev(netdev); out_pci_release: - pci_release_selected_regions(pdev, bars); + pci_release_mem_regions(pdev); out_pci_disable: pci_disable_device(pdev); return err; @@ -1420,8 +1419,7 @@ static void alx_remove(struct pci_dev *pdev) unregister_netdev(alx->dev); iounmap(hw->hw_addr); - pci_release_selected_regions(pdev, - pci_select_bars(pdev, IORESOURCE_MEM)); + pci_release_mem_regions(pdev); pci_disable_pcie_error_reporting(pdev); pci_disable_device(pdev); diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 41f32c0b341e..02f443958f31 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -7330,8 +7330,7 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err_ioremap: free_netdev(netdev); err_alloc_etherdev: - pci_release_selected_regions(pdev, - pci_select_bars(pdev, IORESOURCE_MEM)); + pci_release_mem_regions(pdev); err_pci_reg: err_dma: pci_disable_device(pdev); @@ -7398,8 +7397,7 @@ static void e1000_remove(struct pci_dev *pdev) if ((adapter->hw.flash_address) && (adapter->hw.mac.type < e1000_pch_spt)) iounmap(adapter->hw.flash_address); - pci_release_selected_regions(pdev, - pci_select_bars(pdev, IORESOURCE_MEM)); + pci_release_mem_regions(pdev); free_netdev(netdev); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index b8245c734c96..774a5654bf42 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -1963,10 +1963,7 @@ static int fm10k_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_dma; } - err = pci_request_selected_regions(pdev, - pci_select_bars(pdev, - IORESOURCE_MEM), - fm10k_driver_name); + err = pci_request_mem_regions(pdev, fm10k_driver_name); if (err) { dev_err(&pdev->dev, "pci_request_selected_regions failed: %d\n", err); @@ -2070,8 +2067,7 @@ static int fm10k_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err_ioremap: free_netdev(netdev); err_alloc_netdev: - pci_release_selected_regions(pdev, - pci_select_bars(pdev, IORESOURCE_MEM)); + pci_release_mem_regions(pdev); err_pci_reg: err_dma: pci_disable_device(pdev); @@ -2119,8 +2115,7 @@ static void fm10k_remove(struct pci_dev *pdev) free_netdev(netdev); - pci_release_selected_regions(pdev, - pci_select_bars(pdev, IORESOURCE_MEM)); + pci_release_mem_regions(pdev); pci_disable_pcie_error_reporting(pdev); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 339d99be4702..81c99e1be708 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -10710,8 +10710,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } /* set up pci connections */ - err = pci_request_selected_regions(pdev, pci_select_bars(pdev, - IORESOURCE_MEM), i40e_driver_name); + err = pci_request_mem_regions(pdev, i40e_driver_name); if (err) { dev_info(&pdev->dev, "pci_request_selected_regions failed %d\n", err); @@ -11208,8 +11207,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) kfree(pf); err_pf_alloc: pci_disable_pcie_error_reporting(pdev); - pci_release_selected_regions(pdev, - pci_select_bars(pdev, IORESOURCE_MEM)); + pci_release_mem_regions(pdev); err_pci_reg: err_dma: pci_disable_device(pdev); @@ -11320,8 +11318,7 @@ static void i40e_remove(struct pci_dev *pdev) iounmap(hw->hw_addr); kfree(pf); - pci_release_selected_regions(pdev, - pci_select_bars(pdev, IORESOURCE_MEM)); + pci_release_mem_regions(pdev); pci_disable_pcie_error_reporting(pdev); pci_disable_device(pdev); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 9bcba42abb91..942a89fb0090 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2324,9 +2324,7 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } } - err = pci_request_selected_regions(pdev, pci_select_bars(pdev, - IORESOURCE_MEM), - igb_driver_name); + err = pci_request_mem_regions(pdev, igb_driver_name); if (err) goto err_pci_reg; @@ -2750,8 +2748,7 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err_ioremap: free_netdev(netdev); err_alloc_etherdev: - pci_release_selected_regions(pdev, - pci_select_bars(pdev, IORESOURCE_MEM)); + pci_release_mem_regions(pdev); err_pci_reg: err_dma: pci_disable_device(pdev); @@ -2916,8 +2913,7 @@ static void igb_remove(struct pci_dev *pdev) pci_iounmap(pdev, adapter->io_addr); if (hw->flash_address) iounmap(hw->flash_address); - pci_release_selected_regions(pdev, - pci_select_bars(pdev, IORESOURCE_MEM)); + pci_release_mem_regions(pdev); kfree(adapter->shadow_vfta); free_netdev(netdev); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 7871f538f0ad..5418c69a7463 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9353,8 +9353,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_using_dac = 0; } - err = pci_request_selected_regions(pdev, pci_select_bars(pdev, - IORESOURCE_MEM), ixgbe_driver_name); + err = pci_request_mem_regions(pdev, ixgbe_driver_name); if (err) { dev_err(&pdev->dev, "pci_request_selected_regions failed 0x%x\n", err); @@ -9740,8 +9739,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state); free_netdev(netdev); err_alloc_etherdev: - pci_release_selected_regions(pdev, - pci_select_bars(pdev, IORESOURCE_MEM)); + pci_release_mem_regions(pdev); err_pci_reg: err_dma: if (!adapter || disable_dev) @@ -9808,8 +9806,7 @@ static void ixgbe_remove(struct pci_dev *pdev) #endif iounmap(adapter->io_addr); - pci_release_selected_regions(pdev, pci_select_bars(pdev, - IORESOURCE_MEM)); + pci_release_mem_regions(pdev); e_dev_info("complete\n"); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4cb9b156cab7..d7c33f9361aa 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1661,14 +1661,9 @@ static int nvme_pci_enable(struct nvme_dev *dev) static void nvme_dev_unmap(struct nvme_dev *dev) { - struct pci_dev *pdev = to_pci_dev(dev->dev); - int bars; - if (dev->bar) iounmap(dev->bar); - - bars = pci_select_bars(pdev, IORESOURCE_MEM); - pci_release_selected_regions(pdev, bars); + pci_release_mem_regions(to_pci_dev(dev->dev)); } static void nvme_pci_disable(struct nvme_dev *dev) @@ -1897,13 +1892,9 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { static int nvme_dev_map(struct nvme_dev *dev) { - int bars; struct pci_dev *pdev = to_pci_dev(dev->dev); - bars = pci_select_bars(pdev, IORESOURCE_MEM); - if (!bars) - return -ENODEV; - if (pci_request_selected_regions(pdev, bars, "nvme")) + if (pci_request_mem_regions(pdev, "nvme")) return -ENODEV; dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); @@ -1912,7 +1903,7 @@ static int nvme_dev_map(struct nvme_dev *dev) return 0; release: - pci_release_selected_regions(pdev, bars); + pci_release_mem_regions(pdev); return -ENODEV; } diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 56389be5d08b..67f9916ff14d 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -25,7 +25,7 @@ config PCI_MSI If you don't know what to do here, say Y. config PCI_MSI_IRQ_DOMAIN - bool + def_bool ARM || ARM64 || X86 depends on PCI_MSI select GENERIC_MSI_IRQ_DOMAIN diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index dd7cdbee8029..c288e5a52575 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -91,6 +91,35 @@ void pci_bus_remove_resources(struct pci_bus *bus) } } +int devm_request_pci_bus_resources(struct device *dev, + struct list_head *resources) +{ + struct resource_entry *win; + struct resource *parent, *res; + int err; + + resource_list_for_each_entry(win, resources) { + res = win->res; + switch (resource_type(res)) { + case IORESOURCE_IO: + parent = &ioport_resource; + break; + case IORESOURCE_MEM: + parent = &iomem_resource; + break; + default: + continue; + } + + err = devm_request_resource(dev, parent, res); + if (err) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(devm_request_pci_bus_resources); + static struct pci_bus_region pci_32_bit = {0, 0xffffffffULL}; #ifdef CONFIG_PCI_BUS_ADDR_T_64BIT static struct pci_bus_region pci_64_bit = {0, @@ -291,6 +320,7 @@ void pci_bus_add_device(struct pci_dev *dev) pci_fixup_device(pci_fixup_final, dev); pci_create_sysfs_dev_files(dev); pci_proc_attach_device(dev); + pci_bridge_d3_device_changed(dev); dev->match_driver = true; retval = device_attach(&dev->dev); @@ -397,4 +427,3 @@ void pci_bus_put(struct pci_bus *bus) put_device(&bus->dev); } EXPORT_SYMBOL(pci_bus_put); - diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c index f9832ad8efe2..66e0d718472f 100644 --- a/drivers/pci/ecam.c +++ b/drivers/pci/ecam.c @@ -19,10 +19,9 @@ #include #include #include +#include #include -#include "ecam.h" - /* * On 64-bit systems, we do a single ioremap for the whole config space * since we have enough virtual address range available. On 32-bit, we @@ -52,6 +51,7 @@ struct pci_config_window *pci_ecam_create(struct device *dev, if (!cfg) return ERR_PTR(-ENOMEM); + cfg->parent = dev; cfg->ops = ops; cfg->busr.start = busr->start; cfg->busr.end = busr->end; @@ -95,7 +95,7 @@ struct pci_config_window *pci_ecam_create(struct device *dev, } if (ops->init) { - err = ops->init(dev, cfg); + err = ops->init(cfg); if (err) goto err_exit; } diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig index 5d2374e4ee7f..9b485d873b0d 100644 --- a/drivers/pci/host/Kconfig +++ b/drivers/pci/host/Kconfig @@ -3,8 +3,9 @@ menu "PCI host controller drivers" config PCI_DRA7XX bool "TI DRA7xx PCIe controller" - select PCIE_DW depends on OF && HAS_IOMEM && TI_PIPE3 + depends on PCI_MSI_IRQ_DOMAIN + select PCIE_DW help Enables support for the PCIe controller in the DRA7xx SoC. There are two instances of PCIe controller in DRA7xx. This controller can @@ -16,11 +17,20 @@ config PCI_MVEBU depends on ARM depends on OF +config PCI_AARDVARK + bool "Aardvark PCIe controller" + depends on ARCH_MVEBU && ARM64 + depends on OF + depends on PCI_MSI_IRQ_DOMAIN + help + Add support for Aardvark 64bit PCIe Host Controller. This + controller is part of the South Bridge of the Marvel Armada + 3700 SoC. config PCIE_XILINX_NWL bool "NWL PCIe Core" depends on ARCH_ZYNQMP - select PCI_MSI_IRQ_DOMAIN if PCI_MSI + depends on PCI_MSI_IRQ_DOMAIN help Say 'Y' here if you want kernel support for Xilinx NWL PCIe controller. The controller can act as Root Port @@ -29,6 +39,7 @@ config PCIE_XILINX_NWL config PCIE_DW_PLAT bool "Platform bus based DesignWare PCIe Controller" + depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW ---help--- This selects the DesignWare PCIe controller support. Select this if @@ -40,16 +51,19 @@ config PCIE_DW_PLAT config PCIE_DW bool + depends on PCI_MSI_IRQ_DOMAIN config PCI_EXYNOS bool "Samsung Exynos PCIe controller" depends on SOC_EXYNOS5440 + depends on PCI_MSI_IRQ_DOMAIN select PCIEPORTBUS select PCIE_DW config PCI_IMX6 bool "Freescale i.MX6 PCIe controller" depends on SOC_IMX6Q + depends on PCI_MSI_IRQ_DOMAIN select PCIEPORTBUS select PCIE_DW @@ -72,8 +86,7 @@ config PCI_RCAR_GEN2 config PCIE_RCAR bool "Renesas R-Car PCIe controller" depends on ARCH_RENESAS || (ARM && COMPILE_TEST) - select PCI_MSI - select PCI_MSI_IRQ_DOMAIN + depends on PCI_MSI_IRQ_DOMAIN help Say Y here if you want PCIe controller support on R-Car SoCs. @@ -85,6 +98,7 @@ config PCI_HOST_GENERIC bool "Generic PCI host controller" depends on (ARM || ARM64) && OF select PCI_HOST_COMMON + select IRQ_DOMAIN help Say Y here if you want to support a simple generic PCI host controller, such as the one emulated by kvmtool. @@ -92,6 +106,7 @@ config PCI_HOST_GENERIC config PCIE_SPEAR13XX bool "STMicroelectronics SPEAr PCIe controller" depends on ARCH_SPEAR13XX + depends on PCI_MSI_IRQ_DOMAIN select PCIEPORTBUS select PCIE_DW help @@ -100,6 +115,7 @@ config PCIE_SPEAR13XX config PCI_KEYSTONE bool "TI Keystone PCIe controller" depends on ARCH_KEYSTONE + depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW select PCIEPORTBUS help @@ -120,7 +136,6 @@ config PCI_XGENE depends on ARCH_XGENE depends on OF select PCIEPORTBUS - select PCI_MSI_IRQ_DOMAIN if PCI_MSI help Say Y here if you want internal PCI support on APM X-Gene SoC. There are 5 internal PCIe ports available. Each port is GEN3 capable @@ -128,7 +143,8 @@ config PCI_XGENE config PCI_XGENE_MSI bool "X-Gene v1 PCIe MSI feature" - depends on PCI_XGENE && PCI_MSI + depends on PCI_XGENE + depends on PCI_MSI_IRQ_DOMAIN default y help Say Y here if you want PCIe MSI support for the APM X-Gene v1 SoC. @@ -137,6 +153,7 @@ config PCI_XGENE_MSI config PCI_LAYERSCAPE bool "Freescale Layerscape PCIe controller" depends on OF && (ARM || ARCH_LAYERSCAPE) + depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW select MFD_SYSCON help @@ -177,8 +194,7 @@ config PCIE_IPROC_BCMA config PCIE_IPROC_MSI bool "Broadcom iProc PCIe MSI support" depends on PCIE_IPROC_PLATFORM || PCIE_IPROC_BCMA - depends on PCI_MSI - select PCI_MSI_IRQ_DOMAIN + depends on PCI_MSI_IRQ_DOMAIN default ARCH_BCM_IPROC help Say Y here if you want to enable MSI support for Broadcom's iProc @@ -195,8 +211,8 @@ config PCIE_ALTERA config PCIE_ALTERA_MSI bool "Altera PCIe MSI feature" - depends on PCIE_ALTERA && PCI_MSI - select PCI_MSI_IRQ_DOMAIN + depends on PCIE_ALTERA + depends on PCI_MSI_IRQ_DOMAIN help Say Y here if you want PCIe MSI support for the Altera FPGA. This MSI driver supports Altera MSI to GIC controller IP. @@ -204,6 +220,7 @@ config PCIE_ALTERA_MSI config PCI_HISI depends on OF && ARM64 bool "HiSilicon Hip05 and Hip06 SoCs PCIe controllers" + depends on PCI_MSI_IRQ_DOMAIN select PCIEPORTBUS select PCIE_DW help @@ -213,6 +230,7 @@ config PCI_HISI config PCIE_QCOM bool "Qualcomm PCIe controller" depends on ARCH_QCOM && OF + depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW select PCIEPORTBUS help @@ -237,6 +255,7 @@ config PCI_HOST_THUNDER_ECAM config PCIE_ARMADA_8K bool "Marvell Armada-8K PCIe controller" depends on ARCH_MVEBU + depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW select PCIEPORTBUS help @@ -245,4 +264,14 @@ config PCIE_ARMADA_8K Designware hardware and therefore the driver re-uses the Designware core functions to implement the driver. +config PCIE_ARTPEC6 + bool "Axis ARTPEC-6 PCIe controller" + depends on MACH_ARTPEC6 + depends on PCI_MSI_IRQ_DOMAIN + select PCIE_DW + select PCIEPORTBUS + help + Say Y here to enable PCIe controller support on Axis ARTPEC-6 + SoCs. This PCIe controller uses the DesignWare core. + endmenu diff --git a/drivers/pci/host/Makefile b/drivers/pci/host/Makefile index 9c8698e89e96..88434101e4c4 100644 --- a/drivers/pci/host/Makefile +++ b/drivers/pci/host/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_PCI_EXYNOS) += pci-exynos.o obj-$(CONFIG_PCI_IMX6) += pci-imx6.o obj-$(CONFIG_PCI_HYPERV) += pci-hyperv.o obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o +obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o obj-$(CONFIG_PCI_RCAR_GEN2) += pci-rcar-gen2.o obj-$(CONFIG_PCIE_RCAR) += pcie-rcar.o @@ -29,3 +30,4 @@ obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o obj-$(CONFIG_PCI_HOST_THUNDER_ECAM) += pci-thunder-ecam.o obj-$(CONFIG_PCI_HOST_THUNDER_PEM) += pci-thunder-pem.o obj-$(CONFIG_PCIE_ARMADA_8K) += pcie-armada8k.o +obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o diff --git a/drivers/pci/host/pci-aardvark.c b/drivers/pci/host/pci-aardvark.c new file mode 100644 index 000000000000..ef9893fa3176 --- /dev/null +++ b/drivers/pci/host/pci-aardvark.c @@ -0,0 +1,1001 @@ +/* + * Driver for the Aardvark PCIe controller, used on Marvell Armada + * 3700. + * + * Copyright (C) 2016 Marvell + * + * Author: Hezi Shahmoon + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* PCIe core registers */ +#define PCIE_CORE_CMD_STATUS_REG 0x4 +#define PCIE_CORE_CMD_IO_ACCESS_EN BIT(0) +#define PCIE_CORE_CMD_MEM_ACCESS_EN BIT(1) +#define PCIE_CORE_CMD_MEM_IO_REQ_EN BIT(2) +#define PCIE_CORE_DEV_CTRL_STATS_REG 0xc8 +#define PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE (0 << 4) +#define PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT 5 +#define PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE (0 << 11) +#define PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT 12 +#define PCIE_CORE_LINK_CTRL_STAT_REG 0xd0 +#define PCIE_CORE_LINK_L0S_ENTRY BIT(0) +#define PCIE_CORE_LINK_TRAINING BIT(5) +#define PCIE_CORE_LINK_WIDTH_SHIFT 20 +#define PCIE_CORE_ERR_CAPCTL_REG 0x118 +#define PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX BIT(5) +#define PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX_EN BIT(6) +#define PCIE_CORE_ERR_CAPCTL_ECRC_CHCK BIT(7) +#define PCIE_CORE_ERR_CAPCTL_ECRC_CHCK_RCV BIT(8) + +/* PIO registers base address and register offsets */ +#define PIO_BASE_ADDR 0x4000 +#define PIO_CTRL (PIO_BASE_ADDR + 0x0) +#define PIO_CTRL_TYPE_MASK GENMASK(3, 0) +#define PIO_CTRL_ADDR_WIN_DISABLE BIT(24) +#define PIO_STAT (PIO_BASE_ADDR + 0x4) +#define PIO_COMPLETION_STATUS_SHIFT 7 +#define PIO_COMPLETION_STATUS_MASK GENMASK(9, 7) +#define PIO_COMPLETION_STATUS_OK 0 +#define PIO_COMPLETION_STATUS_UR 1 +#define PIO_COMPLETION_STATUS_CRS 2 +#define PIO_COMPLETION_STATUS_CA 4 +#define PIO_NON_POSTED_REQ BIT(0) +#define PIO_ADDR_LS (PIO_BASE_ADDR + 0x8) +#define PIO_ADDR_MS (PIO_BASE_ADDR + 0xc) +#define PIO_WR_DATA (PIO_BASE_ADDR + 0x10) +#define PIO_WR_DATA_STRB (PIO_BASE_ADDR + 0x14) +#define PIO_RD_DATA (PIO_BASE_ADDR + 0x18) +#define PIO_START (PIO_BASE_ADDR + 0x1c) +#define PIO_ISR (PIO_BASE_ADDR + 0x20) +#define PIO_ISRM (PIO_BASE_ADDR + 0x24) + +/* Aardvark Control registers */ +#define CONTROL_BASE_ADDR 0x4800 +#define PCIE_CORE_CTRL0_REG (CONTROL_BASE_ADDR + 0x0) +#define PCIE_GEN_SEL_MSK 0x3 +#define PCIE_GEN_SEL_SHIFT 0x0 +#define SPEED_GEN_1 0 +#define SPEED_GEN_2 1 +#define SPEED_GEN_3 2 +#define IS_RC_MSK 1 +#define IS_RC_SHIFT 2 +#define LANE_CNT_MSK 0x18 +#define LANE_CNT_SHIFT 0x3 +#define LANE_COUNT_1 (0 << LANE_CNT_SHIFT) +#define LANE_COUNT_2 (1 << LANE_CNT_SHIFT) +#define LANE_COUNT_4 (2 << LANE_CNT_SHIFT) +#define LANE_COUNT_8 (3 << LANE_CNT_SHIFT) +#define LINK_TRAINING_EN BIT(6) +#define LEGACY_INTA BIT(28) +#define LEGACY_INTB BIT(29) +#define LEGACY_INTC BIT(30) +#define LEGACY_INTD BIT(31) +#define PCIE_CORE_CTRL1_REG (CONTROL_BASE_ADDR + 0x4) +#define HOT_RESET_GEN BIT(0) +#define PCIE_CORE_CTRL2_REG (CONTROL_BASE_ADDR + 0x8) +#define PCIE_CORE_CTRL2_RESERVED 0x7 +#define PCIE_CORE_CTRL2_TD_ENABLE BIT(4) +#define PCIE_CORE_CTRL2_STRICT_ORDER_ENABLE BIT(5) +#define PCIE_CORE_CTRL2_OB_WIN_ENABLE BIT(6) +#define PCIE_CORE_CTRL2_MSI_ENABLE BIT(10) +#define PCIE_ISR0_REG (CONTROL_BASE_ADDR + 0x40) +#define PCIE_ISR0_MASK_REG (CONTROL_BASE_ADDR + 0x44) +#define PCIE_ISR0_MSI_INT_PENDING BIT(24) +#define PCIE_ISR0_INTX_ASSERT(val) BIT(16 + (val)) +#define PCIE_ISR0_INTX_DEASSERT(val) BIT(20 + (val)) +#define PCIE_ISR0_ALL_MASK GENMASK(26, 0) +#define PCIE_ISR1_REG (CONTROL_BASE_ADDR + 0x48) +#define PCIE_ISR1_MASK_REG (CONTROL_BASE_ADDR + 0x4C) +#define PCIE_ISR1_POWER_STATE_CHANGE BIT(4) +#define PCIE_ISR1_FLUSH BIT(5) +#define PCIE_ISR1_ALL_MASK GENMASK(5, 4) +#define PCIE_MSI_ADDR_LOW_REG (CONTROL_BASE_ADDR + 0x50) +#define PCIE_MSI_ADDR_HIGH_REG (CONTROL_BASE_ADDR + 0x54) +#define PCIE_MSI_STATUS_REG (CONTROL_BASE_ADDR + 0x58) +#define PCIE_MSI_MASK_REG (CONTROL_BASE_ADDR + 0x5C) +#define PCIE_MSI_PAYLOAD_REG (CONTROL_BASE_ADDR + 0x9C) + +/* PCIe window configuration */ +#define OB_WIN_BASE_ADDR 0x4c00 +#define OB_WIN_BLOCK_SIZE 0x20 +#define OB_WIN_REG_ADDR(win, offset) (OB_WIN_BASE_ADDR + \ + OB_WIN_BLOCK_SIZE * (win) + \ + (offset)) +#define OB_WIN_MATCH_LS(win) OB_WIN_REG_ADDR(win, 0x00) +#define OB_WIN_MATCH_MS(win) OB_WIN_REG_ADDR(win, 0x04) +#define OB_WIN_REMAP_LS(win) OB_WIN_REG_ADDR(win, 0x08) +#define OB_WIN_REMAP_MS(win) OB_WIN_REG_ADDR(win, 0x0c) +#define OB_WIN_MASK_LS(win) OB_WIN_REG_ADDR(win, 0x10) +#define OB_WIN_MASK_MS(win) OB_WIN_REG_ADDR(win, 0x14) +#define OB_WIN_ACTIONS(win) OB_WIN_REG_ADDR(win, 0x18) + +/* PCIe window types */ +#define OB_PCIE_MEM 0x0 +#define OB_PCIE_IO 0x4 + +/* LMI registers base address and register offsets */ +#define LMI_BASE_ADDR 0x6000 +#define CFG_REG (LMI_BASE_ADDR + 0x0) +#define LTSSM_SHIFT 24 +#define LTSSM_MASK 0x3f +#define LTSSM_L0 0x10 +#define RC_BAR_CONFIG 0x300 + +/* PCIe core controller registers */ +#define CTRL_CORE_BASE_ADDR 0x18000 +#define CTRL_CONFIG_REG (CTRL_CORE_BASE_ADDR + 0x0) +#define CTRL_MODE_SHIFT 0x0 +#define CTRL_MODE_MASK 0x1 +#define PCIE_CORE_MODE_DIRECT 0x0 +#define PCIE_CORE_MODE_COMMAND 0x1 + +/* PCIe Central Interrupts Registers */ +#define CENTRAL_INT_BASE_ADDR 0x1b000 +#define HOST_CTRL_INT_STATUS_REG (CENTRAL_INT_BASE_ADDR + 0x0) +#define HOST_CTRL_INT_MASK_REG (CENTRAL_INT_BASE_ADDR + 0x4) +#define PCIE_IRQ_CMDQ_INT BIT(0) +#define PCIE_IRQ_MSI_STATUS_INT BIT(1) +#define PCIE_IRQ_CMD_SENT_DONE BIT(3) +#define PCIE_IRQ_DMA_INT BIT(4) +#define PCIE_IRQ_IB_DXFERDONE BIT(5) +#define PCIE_IRQ_OB_DXFERDONE BIT(6) +#define PCIE_IRQ_OB_RXFERDONE BIT(7) +#define PCIE_IRQ_COMPQ_INT BIT(12) +#define PCIE_IRQ_DIR_RD_DDR_DET BIT(13) +#define PCIE_IRQ_DIR_WR_DDR_DET BIT(14) +#define PCIE_IRQ_CORE_INT BIT(16) +#define PCIE_IRQ_CORE_INT_PIO BIT(17) +#define PCIE_IRQ_DPMU_INT BIT(18) +#define PCIE_IRQ_PCIE_MIS_INT BIT(19) +#define PCIE_IRQ_MSI_INT1_DET BIT(20) +#define PCIE_IRQ_MSI_INT2_DET BIT(21) +#define PCIE_IRQ_RC_DBELL_DET BIT(22) +#define PCIE_IRQ_EP_STATUS BIT(23) +#define PCIE_IRQ_ALL_MASK 0xfff0fb +#define PCIE_IRQ_ENABLE_INTS_MASK PCIE_IRQ_CORE_INT + +/* Transaction types */ +#define PCIE_CONFIG_RD_TYPE0 0x8 +#define PCIE_CONFIG_RD_TYPE1 0x9 +#define PCIE_CONFIG_WR_TYPE0 0xa +#define PCIE_CONFIG_WR_TYPE1 0xb + +/* PCI_BDF shifts 8bit, so we need extra 4bit shift */ +#define PCIE_BDF(dev) (dev << 4) +#define PCIE_CONF_BUS(bus) (((bus) & 0xff) << 20) +#define PCIE_CONF_DEV(dev) (((dev) & 0x1f) << 15) +#define PCIE_CONF_FUNC(fun) (((fun) & 0x7) << 12) +#define PCIE_CONF_REG(reg) ((reg) & 0xffc) +#define PCIE_CONF_ADDR(bus, devfn, where) \ + (PCIE_CONF_BUS(bus) | PCIE_CONF_DEV(PCI_SLOT(devfn)) | \ + PCIE_CONF_FUNC(PCI_FUNC(devfn)) | PCIE_CONF_REG(where)) + +#define PIO_TIMEOUT_MS 1 + +#define LINK_WAIT_MAX_RETRIES 10 +#define LINK_WAIT_USLEEP_MIN 90000 +#define LINK_WAIT_USLEEP_MAX 100000 + +#define LEGACY_IRQ_NUM 4 +#define MSI_IRQ_NUM 32 + +struct advk_pcie { + struct platform_device *pdev; + void __iomem *base; + struct list_head resources; + struct irq_domain *irq_domain; + struct irq_chip irq_chip; + struct msi_controller msi; + struct irq_domain *msi_domain; + struct irq_chip msi_irq_chip; + DECLARE_BITMAP(msi_irq_in_use, MSI_IRQ_NUM); + struct mutex msi_used_lock; + u16 msi_msg; + int root_bus_nr; +}; + +static inline void advk_writel(struct advk_pcie *pcie, u32 val, u64 reg) +{ + writel(val, pcie->base + reg); +} + +static inline u32 advk_readl(struct advk_pcie *pcie, u64 reg) +{ + return readl(pcie->base + reg); +} + +static int advk_pcie_link_up(struct advk_pcie *pcie) +{ + u32 val, ltssm_state; + + val = advk_readl(pcie, CFG_REG); + ltssm_state = (val >> LTSSM_SHIFT) & LTSSM_MASK; + return ltssm_state >= LTSSM_L0; +} + +static int advk_pcie_wait_for_link(struct advk_pcie *pcie) +{ + int retries; + + /* check if the link is up or not */ + for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) { + if (advk_pcie_link_up(pcie)) { + dev_info(&pcie->pdev->dev, "link up\n"); + return 0; + } + + usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX); + } + + dev_err(&pcie->pdev->dev, "link never came up\n"); + + return -ETIMEDOUT; +} + +/* + * Set PCIe address window register which could be used for memory + * mapping. + */ +static void advk_pcie_set_ob_win(struct advk_pcie *pcie, + u32 win_num, u32 match_ms, + u32 match_ls, u32 mask_ms, + u32 mask_ls, u32 remap_ms, + u32 remap_ls, u32 action) +{ + advk_writel(pcie, match_ls, OB_WIN_MATCH_LS(win_num)); + advk_writel(pcie, match_ms, OB_WIN_MATCH_MS(win_num)); + advk_writel(pcie, mask_ms, OB_WIN_MASK_MS(win_num)); + advk_writel(pcie, mask_ls, OB_WIN_MASK_LS(win_num)); + advk_writel(pcie, remap_ms, OB_WIN_REMAP_MS(win_num)); + advk_writel(pcie, remap_ls, OB_WIN_REMAP_LS(win_num)); + advk_writel(pcie, action, OB_WIN_ACTIONS(win_num)); + advk_writel(pcie, match_ls | BIT(0), OB_WIN_MATCH_LS(win_num)); +} + +static void advk_pcie_setup_hw(struct advk_pcie *pcie) +{ + u32 reg; + int i; + + /* Point PCIe unit MBUS decode windows to DRAM space */ + for (i = 0; i < 8; i++) + advk_pcie_set_ob_win(pcie, i, 0, 0, 0, 0, 0, 0, 0); + + /* Set to Direct mode */ + reg = advk_readl(pcie, CTRL_CONFIG_REG); + reg &= ~(CTRL_MODE_MASK << CTRL_MODE_SHIFT); + reg |= ((PCIE_CORE_MODE_DIRECT & CTRL_MODE_MASK) << CTRL_MODE_SHIFT); + advk_writel(pcie, reg, CTRL_CONFIG_REG); + + /* Set PCI global control register to RC mode */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg |= (IS_RC_MSK << IS_RC_SHIFT); + advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); + + /* Set Advanced Error Capabilities and Control PF0 register */ + reg = PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX | + PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX_EN | + PCIE_CORE_ERR_CAPCTL_ECRC_CHCK | + PCIE_CORE_ERR_CAPCTL_ECRC_CHCK_RCV; + advk_writel(pcie, reg, PCIE_CORE_ERR_CAPCTL_REG); + + /* Set PCIe Device Control and Status 1 PF0 register */ + reg = PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE | + (7 << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) | + PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE | + PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT; + advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG); + + /* Program PCIe Control 2 to disable strict ordering */ + reg = PCIE_CORE_CTRL2_RESERVED | + PCIE_CORE_CTRL2_TD_ENABLE; + advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG); + + /* Set GEN2 */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg &= ~PCIE_GEN_SEL_MSK; + reg |= SPEED_GEN_2; + advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); + + /* Set lane X1 */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg &= ~LANE_CNT_MSK; + reg |= LANE_COUNT_1; + advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); + + /* Enable link training */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg |= LINK_TRAINING_EN; + advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); + + /* Enable MSI */ + reg = advk_readl(pcie, PCIE_CORE_CTRL2_REG); + reg |= PCIE_CORE_CTRL2_MSI_ENABLE; + advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG); + + /* Clear all interrupts */ + advk_writel(pcie, PCIE_ISR0_ALL_MASK, PCIE_ISR0_REG); + advk_writel(pcie, PCIE_ISR1_ALL_MASK, PCIE_ISR1_REG); + advk_writel(pcie, PCIE_IRQ_ALL_MASK, HOST_CTRL_INT_STATUS_REG); + + /* Disable All ISR0/1 Sources */ + reg = PCIE_ISR0_ALL_MASK; + reg &= ~PCIE_ISR0_MSI_INT_PENDING; + advk_writel(pcie, reg, PCIE_ISR0_MASK_REG); + + advk_writel(pcie, PCIE_ISR1_ALL_MASK, PCIE_ISR1_MASK_REG); + + /* Unmask all MSI's */ + advk_writel(pcie, 0, PCIE_MSI_MASK_REG); + + /* Enable summary interrupt for GIC SPI source */ + reg = PCIE_IRQ_ALL_MASK & (~PCIE_IRQ_ENABLE_INTS_MASK); + advk_writel(pcie, reg, HOST_CTRL_INT_MASK_REG); + + reg = advk_readl(pcie, PCIE_CORE_CTRL2_REG); + reg |= PCIE_CORE_CTRL2_OB_WIN_ENABLE; + advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG); + + /* Bypass the address window mapping for PIO */ + reg = advk_readl(pcie, PIO_CTRL); + reg |= PIO_CTRL_ADDR_WIN_DISABLE; + advk_writel(pcie, reg, PIO_CTRL); + + /* Start link training */ + reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); + reg |= PCIE_CORE_LINK_TRAINING; + advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG); + + advk_pcie_wait_for_link(pcie); + + reg = PCIE_CORE_LINK_L0S_ENTRY | + (1 << PCIE_CORE_LINK_WIDTH_SHIFT); + advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG); + + reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); + reg |= PCIE_CORE_CMD_MEM_ACCESS_EN | + PCIE_CORE_CMD_IO_ACCESS_EN | + PCIE_CORE_CMD_MEM_IO_REQ_EN; + advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG); +} + +static void advk_pcie_check_pio_status(struct advk_pcie *pcie) +{ + u32 reg; + unsigned int status; + char *strcomp_status, *str_posted; + + reg = advk_readl(pcie, PIO_STAT); + status = (reg & PIO_COMPLETION_STATUS_MASK) >> + PIO_COMPLETION_STATUS_SHIFT; + + if (!status) + return; + + switch (status) { + case PIO_COMPLETION_STATUS_UR: + strcomp_status = "UR"; + break; + case PIO_COMPLETION_STATUS_CRS: + strcomp_status = "CRS"; + break; + case PIO_COMPLETION_STATUS_CA: + strcomp_status = "CA"; + break; + default: + strcomp_status = "Unknown"; + break; + } + + if (reg & PIO_NON_POSTED_REQ) + str_posted = "Non-posted"; + else + str_posted = "Posted"; + + dev_err(&pcie->pdev->dev, "%s PIO Response Status: %s, %#x @ %#x\n", + str_posted, strcomp_status, reg, advk_readl(pcie, PIO_ADDR_LS)); +} + +static int advk_pcie_wait_pio(struct advk_pcie *pcie) +{ + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(PIO_TIMEOUT_MS); + + while (time_before(jiffies, timeout)) { + u32 start, isr; + + start = advk_readl(pcie, PIO_START); + isr = advk_readl(pcie, PIO_ISR); + if (!start && isr) + return 0; + } + + dev_err(&pcie->pdev->dev, "config read/write timed out\n"); + return -ETIMEDOUT; +} + +static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, + int where, int size, u32 *val) +{ + struct advk_pcie *pcie = bus->sysdata; + u32 reg; + int ret; + + if (PCI_SLOT(devfn) != 0) { + *val = 0xffffffff; + return PCIBIOS_DEVICE_NOT_FOUND; + } + + /* Start PIO */ + advk_writel(pcie, 0, PIO_START); + advk_writel(pcie, 1, PIO_ISR); + + /* Program the control register */ + reg = advk_readl(pcie, PIO_CTRL); + reg &= ~PIO_CTRL_TYPE_MASK; + if (bus->number == pcie->root_bus_nr) + reg |= PCIE_CONFIG_RD_TYPE0; + else + reg |= PCIE_CONFIG_RD_TYPE1; + advk_writel(pcie, reg, PIO_CTRL); + + /* Program the address registers */ + reg = PCIE_BDF(devfn) | PCIE_CONF_REG(where); + advk_writel(pcie, reg, PIO_ADDR_LS); + advk_writel(pcie, 0, PIO_ADDR_MS); + + /* Program the data strobe */ + advk_writel(pcie, 0xf, PIO_WR_DATA_STRB); + + /* Start the transfer */ + advk_writel(pcie, 1, PIO_START); + + ret = advk_pcie_wait_pio(pcie); + if (ret < 0) + return PCIBIOS_SET_FAILED; + + advk_pcie_check_pio_status(pcie); + + /* Get the read result */ + *val = advk_readl(pcie, PIO_RD_DATA); + if (size == 1) + *val = (*val >> (8 * (where & 3))) & 0xff; + else if (size == 2) + *val = (*val >> (8 * (where & 3))) & 0xffff; + + return PCIBIOS_SUCCESSFUL; +} + +static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, + int where, int size, u32 val) +{ + struct advk_pcie *pcie = bus->sysdata; + u32 reg; + u32 data_strobe = 0x0; + int offset; + int ret; + + if (PCI_SLOT(devfn) != 0) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (where % size) + return PCIBIOS_SET_FAILED; + + /* Start PIO */ + advk_writel(pcie, 0, PIO_START); + advk_writel(pcie, 1, PIO_ISR); + + /* Program the control register */ + reg = advk_readl(pcie, PIO_CTRL); + reg &= ~PIO_CTRL_TYPE_MASK; + if (bus->number == pcie->root_bus_nr) + reg |= PCIE_CONFIG_WR_TYPE0; + else + reg |= PCIE_CONFIG_WR_TYPE1; + advk_writel(pcie, reg, PIO_CTRL); + + /* Program the address registers */ + reg = PCIE_CONF_ADDR(bus->number, devfn, where); + advk_writel(pcie, reg, PIO_ADDR_LS); + advk_writel(pcie, 0, PIO_ADDR_MS); + + /* Calculate the write strobe */ + offset = where & 0x3; + reg = val << (8 * offset); + data_strobe = GENMASK(size - 1, 0) << offset; + + /* Program the data register */ + advk_writel(pcie, reg, PIO_WR_DATA); + + /* Program the data strobe */ + advk_writel(pcie, data_strobe, PIO_WR_DATA_STRB); + + /* Start the transfer */ + advk_writel(pcie, 1, PIO_START); + + ret = advk_pcie_wait_pio(pcie); + if (ret < 0) + return PCIBIOS_SET_FAILED; + + advk_pcie_check_pio_status(pcie); + + return PCIBIOS_SUCCESSFUL; +} + +static struct pci_ops advk_pcie_ops = { + .read = advk_pcie_rd_conf, + .write = advk_pcie_wr_conf, +}; + +static int advk_pcie_alloc_msi(struct advk_pcie *pcie) +{ + int hwirq; + + mutex_lock(&pcie->msi_used_lock); + hwirq = find_first_zero_bit(pcie->msi_irq_in_use, MSI_IRQ_NUM); + if (hwirq >= MSI_IRQ_NUM) + hwirq = -ENOSPC; + else + set_bit(hwirq, pcie->msi_irq_in_use); + mutex_unlock(&pcie->msi_used_lock); + + return hwirq; +} + +static void advk_pcie_free_msi(struct advk_pcie *pcie, int hwirq) +{ + mutex_lock(&pcie->msi_used_lock); + if (!test_bit(hwirq, pcie->msi_irq_in_use)) + dev_err(&pcie->pdev->dev, "trying to free unused MSI#%d\n", + hwirq); + else + clear_bit(hwirq, pcie->msi_irq_in_use); + mutex_unlock(&pcie->msi_used_lock); +} + +static int advk_pcie_setup_msi_irq(struct msi_controller *chip, + struct pci_dev *pdev, + struct msi_desc *desc) +{ + struct advk_pcie *pcie = pdev->bus->sysdata; + struct msi_msg msg; + int virq, hwirq; + phys_addr_t msi_msg_phys; + + /* We support MSI, but not MSI-X */ + if (desc->msi_attrib.is_msix) + return -EINVAL; + + hwirq = advk_pcie_alloc_msi(pcie); + if (hwirq < 0) + return hwirq; + + virq = irq_create_mapping(pcie->msi_domain, hwirq); + if (!virq) { + advk_pcie_free_msi(pcie, hwirq); + return -EINVAL; + } + + irq_set_msi_desc(virq, desc); + + msi_msg_phys = virt_to_phys(&pcie->msi_msg); + + msg.address_lo = lower_32_bits(msi_msg_phys); + msg.address_hi = upper_32_bits(msi_msg_phys); + msg.data = virq; + + pci_write_msi_msg(virq, &msg); + + return 0; +} + +static void advk_pcie_teardown_msi_irq(struct msi_controller *chip, + unsigned int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + struct msi_desc *msi = irq_data_get_msi_desc(d); + struct advk_pcie *pcie = msi_desc_to_pci_sysdata(msi); + unsigned long hwirq = d->hwirq; + + irq_dispose_mapping(irq); + advk_pcie_free_msi(pcie, hwirq); +} + +static int advk_pcie_msi_map(struct irq_domain *domain, + unsigned int virq, irq_hw_number_t hw) +{ + struct advk_pcie *pcie = domain->host_data; + + irq_set_chip_and_handler(virq, &pcie->msi_irq_chip, + handle_simple_irq); + + return 0; +} + +static const struct irq_domain_ops advk_pcie_msi_irq_ops = { + .map = advk_pcie_msi_map, +}; + +static void advk_pcie_irq_mask(struct irq_data *d) +{ + struct advk_pcie *pcie = d->domain->host_data; + irq_hw_number_t hwirq = irqd_to_hwirq(d); + u32 mask; + + mask = advk_readl(pcie, PCIE_ISR0_MASK_REG); + mask |= PCIE_ISR0_INTX_ASSERT(hwirq); + advk_writel(pcie, mask, PCIE_ISR0_MASK_REG); +} + +static void advk_pcie_irq_unmask(struct irq_data *d) +{ + struct advk_pcie *pcie = d->domain->host_data; + irq_hw_number_t hwirq = irqd_to_hwirq(d); + u32 mask; + + mask = advk_readl(pcie, PCIE_ISR0_MASK_REG); + mask &= ~PCIE_ISR0_INTX_ASSERT(hwirq); + advk_writel(pcie, mask, PCIE_ISR0_MASK_REG); +} + +static int advk_pcie_irq_map(struct irq_domain *h, + unsigned int virq, irq_hw_number_t hwirq) +{ + struct advk_pcie *pcie = h->host_data; + + advk_pcie_irq_mask(irq_get_irq_data(virq)); + irq_set_status_flags(virq, IRQ_LEVEL); + irq_set_chip_and_handler(virq, &pcie->irq_chip, + handle_level_irq); + irq_set_chip_data(virq, pcie); + + return 0; +} + +static const struct irq_domain_ops advk_pcie_irq_domain_ops = { + .map = advk_pcie_irq_map, + .xlate = irq_domain_xlate_onecell, +}; + +static int advk_pcie_init_msi_irq_domain(struct advk_pcie *pcie) +{ + struct device *dev = &pcie->pdev->dev; + struct device_node *node = dev->of_node; + struct irq_chip *msi_irq_chip; + struct msi_controller *msi; + phys_addr_t msi_msg_phys; + int ret; + + msi_irq_chip = &pcie->msi_irq_chip; + + msi_irq_chip->name = devm_kasprintf(dev, GFP_KERNEL, "%s-msi", + dev_name(dev)); + if (!msi_irq_chip->name) + return -ENOMEM; + + msi_irq_chip->irq_enable = pci_msi_unmask_irq; + msi_irq_chip->irq_disable = pci_msi_mask_irq; + msi_irq_chip->irq_mask = pci_msi_mask_irq; + msi_irq_chip->irq_unmask = pci_msi_unmask_irq; + + msi = &pcie->msi; + + msi->setup_irq = advk_pcie_setup_msi_irq; + msi->teardown_irq = advk_pcie_teardown_msi_irq; + msi->of_node = node; + + mutex_init(&pcie->msi_used_lock); + + msi_msg_phys = virt_to_phys(&pcie->msi_msg); + + advk_writel(pcie, lower_32_bits(msi_msg_phys), + PCIE_MSI_ADDR_LOW_REG); + advk_writel(pcie, upper_32_bits(msi_msg_phys), + PCIE_MSI_ADDR_HIGH_REG); + + pcie->msi_domain = + irq_domain_add_linear(NULL, MSI_IRQ_NUM, + &advk_pcie_msi_irq_ops, pcie); + if (!pcie->msi_domain) + return -ENOMEM; + + ret = of_pci_msi_chip_add(msi); + if (ret < 0) { + irq_domain_remove(pcie->msi_domain); + return ret; + } + + return 0; +} + +static void advk_pcie_remove_msi_irq_domain(struct advk_pcie *pcie) +{ + of_pci_msi_chip_remove(&pcie->msi); + irq_domain_remove(pcie->msi_domain); +} + +static int advk_pcie_init_irq_domain(struct advk_pcie *pcie) +{ + struct device *dev = &pcie->pdev->dev; + struct device_node *node = dev->of_node; + struct device_node *pcie_intc_node; + struct irq_chip *irq_chip; + + pcie_intc_node = of_get_next_child(node, NULL); + if (!pcie_intc_node) { + dev_err(dev, "No PCIe Intc node found\n"); + return -ENODEV; + } + + irq_chip = &pcie->irq_chip; + + irq_chip->name = devm_kasprintf(dev, GFP_KERNEL, "%s-irq", + dev_name(dev)); + if (!irq_chip->name) { + of_node_put(pcie_intc_node); + return -ENOMEM; + } + + irq_chip->irq_mask = advk_pcie_irq_mask; + irq_chip->irq_mask_ack = advk_pcie_irq_mask; + irq_chip->irq_unmask = advk_pcie_irq_unmask; + + pcie->irq_domain = + irq_domain_add_linear(pcie_intc_node, LEGACY_IRQ_NUM, + &advk_pcie_irq_domain_ops, pcie); + if (!pcie->irq_domain) { + dev_err(dev, "Failed to get a INTx IRQ domain\n"); + of_node_put(pcie_intc_node); + return -ENOMEM; + } + + return 0; +} + +static void advk_pcie_remove_irq_domain(struct advk_pcie *pcie) +{ + irq_domain_remove(pcie->irq_domain); +} + +static void advk_pcie_handle_msi(struct advk_pcie *pcie) +{ + u32 msi_val, msi_mask, msi_status, msi_idx; + u16 msi_data; + + msi_mask = advk_readl(pcie, PCIE_MSI_MASK_REG); + msi_val = advk_readl(pcie, PCIE_MSI_STATUS_REG); + msi_status = msi_val & ~msi_mask; + + for (msi_idx = 0; msi_idx < MSI_IRQ_NUM; msi_idx++) { + if (!(BIT(msi_idx) & msi_status)) + continue; + + advk_writel(pcie, BIT(msi_idx), PCIE_MSI_STATUS_REG); + msi_data = advk_readl(pcie, PCIE_MSI_PAYLOAD_REG) & 0xFF; + generic_handle_irq(msi_data); + } + + advk_writel(pcie, PCIE_ISR0_MSI_INT_PENDING, + PCIE_ISR0_REG); +} + +static void advk_pcie_handle_int(struct advk_pcie *pcie) +{ + u32 val, mask, status; + int i, virq; + + val = advk_readl(pcie, PCIE_ISR0_REG); + mask = advk_readl(pcie, PCIE_ISR0_MASK_REG); + status = val & ((~mask) & PCIE_ISR0_ALL_MASK); + + if (!status) { + advk_writel(pcie, val, PCIE_ISR0_REG); + return; + } + + /* Process MSI interrupts */ + if (status & PCIE_ISR0_MSI_INT_PENDING) + advk_pcie_handle_msi(pcie); + + /* Process legacy interrupts */ + for (i = 0; i < LEGACY_IRQ_NUM; i++) { + if (!(status & PCIE_ISR0_INTX_ASSERT(i))) + continue; + + advk_writel(pcie, PCIE_ISR0_INTX_ASSERT(i), + PCIE_ISR0_REG); + + virq = irq_find_mapping(pcie->irq_domain, i); + generic_handle_irq(virq); + } +} + +static irqreturn_t advk_pcie_irq_handler(int irq, void *arg) +{ + struct advk_pcie *pcie = arg; + u32 status; + + status = advk_readl(pcie, HOST_CTRL_INT_STATUS_REG); + if (!(status & PCIE_IRQ_CORE_INT)) + return IRQ_NONE; + + advk_pcie_handle_int(pcie); + + /* Clear interrupt */ + advk_writel(pcie, PCIE_IRQ_CORE_INT, HOST_CTRL_INT_STATUS_REG); + + return IRQ_HANDLED; +} + +static int advk_pcie_parse_request_of_pci_ranges(struct advk_pcie *pcie) +{ + int err, res_valid = 0; + struct device *dev = &pcie->pdev->dev; + struct device_node *np = dev->of_node; + struct resource_entry *win; + resource_size_t iobase; + + INIT_LIST_HEAD(&pcie->resources); + + err = of_pci_get_host_bridge_resources(np, 0, 0xff, &pcie->resources, + &iobase); + if (err) + return err; + + err = devm_request_pci_bus_resources(dev, &pcie->resources); + if (err) + goto out_release_res; + + resource_list_for_each_entry(win, &pcie->resources) { + struct resource *res = win->res; + + switch (resource_type(res)) { + case IORESOURCE_IO: + advk_pcie_set_ob_win(pcie, 1, + upper_32_bits(res->start), + lower_32_bits(res->start), + 0, 0xF8000000, 0, + lower_32_bits(res->start), + OB_PCIE_IO); + err = pci_remap_iospace(res, iobase); + if (err) + dev_warn(dev, "error %d: failed to map resource %pR\n", + err, res); + break; + case IORESOURCE_MEM: + advk_pcie_set_ob_win(pcie, 0, + upper_32_bits(res->start), + lower_32_bits(res->start), + 0x0, 0xF8000000, 0, + lower_32_bits(res->start), + (2 << 20) | OB_PCIE_MEM); + res_valid |= !(res->flags & IORESOURCE_PREFETCH); + break; + case IORESOURCE_BUS: + pcie->root_bus_nr = res->start; + break; + } + } + + if (!res_valid) { + dev_err(dev, "non-prefetchable memory resource required\n"); + err = -EINVAL; + goto out_release_res; + } + + return 0; + +out_release_res: + pci_free_resource_list(&pcie->resources); + return err; +} + +static int advk_pcie_probe(struct platform_device *pdev) +{ + struct advk_pcie *pcie; + struct resource *res; + struct pci_bus *bus, *child; + struct msi_controller *msi; + struct device_node *msi_node; + int ret, irq; + + pcie = devm_kzalloc(&pdev->dev, sizeof(struct advk_pcie), + GFP_KERNEL); + if (!pcie) + return -ENOMEM; + + pcie->pdev = pdev; + platform_set_drvdata(pdev, pcie); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + pcie->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pcie->base)) { + dev_err(&pdev->dev, "Failed to map registers\n"); + return PTR_ERR(pcie->base); + } + + irq = platform_get_irq(pdev, 0); + ret = devm_request_irq(&pdev->dev, irq, advk_pcie_irq_handler, + IRQF_SHARED | IRQF_NO_THREAD, "advk-pcie", + pcie); + if (ret) { + dev_err(&pdev->dev, "Failed to register interrupt\n"); + return ret; + } + + ret = advk_pcie_parse_request_of_pci_ranges(pcie); + if (ret) { + dev_err(&pdev->dev, "Failed to parse resources\n"); + return ret; + } + + advk_pcie_setup_hw(pcie); + + ret = advk_pcie_init_irq_domain(pcie); + if (ret) { + dev_err(&pdev->dev, "Failed to initialize irq\n"); + return ret; + } + + ret = advk_pcie_init_msi_irq_domain(pcie); + if (ret) { + dev_err(&pdev->dev, "Failed to initialize irq\n"); + advk_pcie_remove_irq_domain(pcie); + return ret; + } + + msi_node = of_parse_phandle(pdev->dev.of_node, "msi-parent", 0); + if (msi_node) + msi = of_pci_find_msi_chip_by_node(msi_node); + else + msi = NULL; + + bus = pci_scan_root_bus_msi(&pdev->dev, 0, &advk_pcie_ops, + pcie, &pcie->resources, &pcie->msi); + if (!bus) { + advk_pcie_remove_msi_irq_domain(pcie); + advk_pcie_remove_irq_domain(pcie); + return -ENOMEM; + } + + pci_bus_assign_resources(bus); + + list_for_each_entry(child, &bus->children, node) + pcie_bus_configure_settings(child); + + pci_bus_add_devices(bus); + + return 0; +} + +static const struct of_device_id advk_pcie_of_match_table[] = { + { .compatible = "marvell,armada-3700-pcie", }, + {}, +}; + +static struct platform_driver advk_pcie_driver = { + .driver = { + .name = "advk-pcie", + .of_match_table = advk_pcie_of_match_table, + /* Driver unloading/unbinding currently not supported */ + .suppress_bind_attrs = true, + }, + .probe = advk_pcie_probe, +}; +builtin_platform_driver(advk_pcie_driver); diff --git a/drivers/pci/host/pci-dra7xx.c b/drivers/pci/host/pci-dra7xx.c index f441130407e7..81b3949a26db 100644 --- a/drivers/pci/host/pci-dra7xx.c +++ b/drivers/pci/host/pci-dra7xx.c @@ -181,14 +181,14 @@ static int dra7xx_pcie_init_irq_domain(struct pcie_port *pp) if (!pcie_intc_node) { dev_err(dev, "No PCIe Intc node found\n"); - return PTR_ERR(pcie_intc_node); + return -ENODEV; } pp->irq_domain = irq_domain_add_linear(pcie_intc_node, 4, &intx_domain_ops, pp); if (!pp->irq_domain) { dev_err(dev, "Failed to get a INTx IRQ domain\n"); - return PTR_ERR(pp->irq_domain); + return -ENODEV; } return 0; diff --git a/drivers/pci/host/pci-host-common.c b/drivers/pci/host/pci-host-common.c index 8cba7ab73df9..9d9d34e959b6 100644 --- a/drivers/pci/host/pci-host-common.c +++ b/drivers/pci/host/pci-host-common.c @@ -20,10 +20,9 @@ #include #include #include +#include #include -#include "../ecam.h" - static int gen_pci_parse_request_of_pci_ranges(struct device *dev, struct list_head *resources, struct resource **bus_range) { @@ -36,44 +35,34 @@ static int gen_pci_parse_request_of_pci_ranges(struct device *dev, if (err) return err; + err = devm_request_pci_bus_resources(dev, resources); + if (err) + return err; + resource_list_for_each_entry(win, resources) { - struct resource *parent, *res = win->res; + struct resource *res = win->res; switch (resource_type(res)) { case IORESOURCE_IO: - parent = &ioport_resource; err = pci_remap_iospace(res, iobase); - if (err) { + if (err) dev_warn(dev, "error %d: failed to map resource %pR\n", err, res); - continue; - } break; case IORESOURCE_MEM: - parent = &iomem_resource; res_valid |= !(res->flags & IORESOURCE_PREFETCH); break; case IORESOURCE_BUS: *bus_range = res; - default: - continue; + break; } - - err = devm_request_resource(dev, parent, res); - if (err) - goto out_release_res; } - if (!res_valid) { - dev_err(dev, "non-prefetchable memory resource required\n"); - err = -EINVAL; - goto out_release_res; - } + if (res_valid) + return 0; - return 0; - -out_release_res: - return err; + dev_err(dev, "non-prefetchable memory resource required\n"); + return -EINVAL; } static void gen_pci_unmap_cfg(void *ptr) @@ -155,7 +144,14 @@ int pci_host_common_probe(struct platform_device *pdev, pci_fixup_irqs(pci_common_swizzle, of_irq_parse_and_map_pci); - if (!pci_has_flag(PCI_PROBE_ONLY)) { + /* + * We insert PCI resources into the iomem_resource and + * ioport_resource trees in either pci_bus_claim_resources() + * or pci_bus_assign_resources(). + */ + if (pci_has_flag(PCI_PROBE_ONLY)) { + pci_bus_claim_resources(bus); + } else { pci_bus_size_bridges(bus); pci_bus_assign_resources(bus); diff --git a/drivers/pci/host/pci-host-generic.c b/drivers/pci/host/pci-host-generic.c index 6eaceab1bf04..c05ea9d72f69 100644 --- a/drivers/pci/host/pci-host-generic.c +++ b/drivers/pci/host/pci-host-generic.c @@ -20,13 +20,12 @@ */ #include -#include +#include #include #include +#include #include -#include "../ecam.h" - static struct pci_ecam_ops gen_pci_cfg_cam_bus_ops = { .bus_shift = 16, .pci_ops = { @@ -46,8 +45,6 @@ static const struct of_device_id gen_pci_of_match[] = { { }, }; -MODULE_DEVICE_TABLE(of, gen_pci_of_match); - static int gen_pci_probe(struct platform_device *pdev) { const struct of_device_id *of_id; @@ -66,8 +63,4 @@ static struct platform_driver gen_pci_driver = { }, .probe = gen_pci_probe, }; -module_platform_driver(gen_pci_driver); - -MODULE_DESCRIPTION("Generic PCI host driver"); -MODULE_AUTHOR("Will Deacon "); -MODULE_LICENSE("GPL v2"); +builtin_platform_driver(gen_pci_driver); diff --git a/drivers/pci/host/pci-hyperv.c b/drivers/pci/host/pci-hyperv.c index 7e9b2de2aa24..6955ffdb89f3 100644 --- a/drivers/pci/host/pci-hyperv.c +++ b/drivers/pci/host/pci-hyperv.c @@ -732,16 +732,18 @@ static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info, pdev = msi_desc_to_pci_dev(msi); hbus = info->data; - hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); - if (!hpdev) + int_desc = irq_data_get_irq_chip_data(irq_data); + if (!int_desc) return; - int_desc = irq_data_get_irq_chip_data(irq_data); - if (int_desc) { - irq_data->chip_data = NULL; - hv_int_desc_free(hpdev, int_desc); + irq_data->chip_data = NULL; + hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); + if (!hpdev) { + kfree(int_desc); + return; } + hv_int_desc_free(hpdev, int_desc); put_pcichild(hpdev, hv_pcidev_ref_by_slot); } @@ -1657,14 +1659,16 @@ static void hv_pci_onchannelcallback(void *context) continue; } + /* Zero length indicates there are no more packets. */ + if (ret || !bytes_recvd) + break; + /* * All incoming packets must be at least as large as a * response. */ - if (bytes_recvd <= sizeof(struct pci_response)) { - kfree(buffer); - return; - } + if (bytes_recvd <= sizeof(struct pci_response)) + continue; desc = (struct vmpacket_descriptor *)buffer; switch (desc->type) { @@ -1679,8 +1683,7 @@ static void hv_pci_onchannelcallback(void *context) comp_packet->completion_func(comp_packet->compl_ctxt, response, bytes_recvd); - kfree(buffer); - return; + break; case VM_PKT_DATA_INBAND: @@ -1727,8 +1730,9 @@ static void hv_pci_onchannelcallback(void *context) desc->type, req_id, bytes_recvd); break; } - break; } + + kfree(buffer); } /** diff --git a/drivers/pci/host/pci-keystone.c b/drivers/pci/host/pci-keystone.c index 6b8301ef21ca..8ba28834d470 100644 --- a/drivers/pci/host/pci-keystone.c +++ b/drivers/pci/host/pci-keystone.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -360,7 +360,6 @@ static const struct of_device_id ks_pcie_of_match[] = { }, { }, }; -MODULE_DEVICE_TABLE(of, ks_pcie_of_match); static int __exit ks_pcie_remove(struct platform_device *pdev) { @@ -439,9 +438,4 @@ static struct platform_driver ks_pcie_driver __refdata = { .of_match_table = of_match_ptr(ks_pcie_of_match), }, }; - -module_platform_driver(ks_pcie_driver); - -MODULE_AUTHOR("Murali Karicheri "); -MODULE_DESCRIPTION("Keystone PCIe host controller driver"); -MODULE_LICENSE("GPL v2"); +builtin_platform_driver(ks_pcie_driver); diff --git a/drivers/pci/host/pci-layerscape.c b/drivers/pci/host/pci-layerscape.c index a21e229d95e0..114ba819277a 100644 --- a/drivers/pci/host/pci-layerscape.c +++ b/drivers/pci/host/pci-layerscape.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include @@ -211,7 +211,6 @@ static const struct of_device_id ls_pcie_of_match[] = { { .compatible = "fsl,ls2085a-pcie", .data = &ls2080_drvdata }, { }, }; -MODULE_DEVICE_TABLE(of, ls_pcie_of_match); static int __init ls_add_pcie_port(struct pcie_port *pp, struct platform_device *pdev) @@ -275,9 +274,4 @@ static struct platform_driver ls_pcie_driver = { .of_match_table = ls_pcie_of_match, }, }; - -module_platform_driver_probe(ls_pcie_driver, ls_pcie_probe); - -MODULE_AUTHOR("Minghuan Lian "); -MODULE_DESCRIPTION("Freescale Layerscape PCIe host controller driver"); -MODULE_LICENSE("GPL v2"); +builtin_platform_driver_probe(ls_pcie_driver, ls_pcie_probe); diff --git a/drivers/pci/host/pci-mvebu.c b/drivers/pci/host/pci-mvebu.c index 6b451df6502c..307f81d6b479 100644 --- a/drivers/pci/host/pci-mvebu.c +++ b/drivers/pci/host/pci-mvebu.c @@ -1,6 +1,8 @@ /* * PCIe driver for Marvell Armada 370 and Armada XP SoCs * + * Author: Thomas Petazzoni + * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without any * warranty of any kind, whether express or implied. @@ -11,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -839,25 +841,22 @@ static struct pci_ops mvebu_pcie_ops = { static int mvebu_pcie_setup(int nr, struct pci_sys_data *sys) { struct mvebu_pcie *pcie = sys_to_pcie(sys); - int i; + int err, i; pcie->mem.name = "PCI MEM"; pcie->realio.name = "PCI I/O"; - if (request_resource(&iomem_resource, &pcie->mem)) - return 0; - - if (resource_size(&pcie->realio) != 0) { - if (request_resource(&ioport_resource, &pcie->realio)) { - release_resource(&pcie->mem); - return 0; - } + if (resource_size(&pcie->realio) != 0) pci_add_resource_offset(&sys->resources, &pcie->realio, sys->io_offset); - } + pci_add_resource_offset(&sys->resources, &pcie->mem, sys->mem_offset); pci_add_resource(&sys->resources, &pcie->busn); + err = devm_request_pci_bus_resources(&pcie->pdev->dev, &sys->resources); + if (err) + return 0; + for (i = 0; i < pcie->nports; i++) { struct mvebu_pcie_port *port = &pcie->ports[i]; @@ -1298,7 +1297,6 @@ static const struct of_device_id mvebu_pcie_of_match_table[] = { { .compatible = "marvell,kirkwood-pcie", }, {}, }; -MODULE_DEVICE_TABLE(of, mvebu_pcie_of_match_table); static const struct dev_pm_ops mvebu_pcie_pm_ops = { SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(mvebu_pcie_suspend, mvebu_pcie_resume) @@ -1314,8 +1312,4 @@ static struct platform_driver mvebu_pcie_driver = { }, .probe = mvebu_pcie_probe, }; -module_platform_driver(mvebu_pcie_driver); - -MODULE_AUTHOR("Thomas Petazzoni "); -MODULE_DESCRIPTION("Marvell EBU PCIe driver"); -MODULE_LICENSE("GPL v2"); +builtin_platform_driver(mvebu_pcie_driver); diff --git a/drivers/pci/host/pci-rcar-gen2.c b/drivers/pci/host/pci-rcar-gen2.c index 9980a4bdae7e..597566f96f5e 100644 --- a/drivers/pci/host/pci-rcar-gen2.c +++ b/drivers/pci/host/pci-rcar-gen2.c @@ -4,6 +4,8 @@ * Copyright (C) 2013 Renesas Solutions Corp. * Copyright (C) 2013 Cogent Embedded, Inc. * + * Author: Valentine Barshak + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -14,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -97,7 +98,6 @@ struct rcar_pci_priv { struct device *dev; void __iomem *reg; - struct resource io_res; struct resource mem_res; struct resource *cfg_res; unsigned busnr; @@ -194,6 +194,7 @@ static int rcar_pci_setup(int nr, struct pci_sys_data *sys) struct rcar_pci_priv *priv = sys->private_data; void __iomem *reg = priv->reg; u32 val; + int ret; pm_runtime_enable(priv->dev); pm_runtime_get_sync(priv->dev); @@ -273,8 +274,10 @@ static int rcar_pci_setup(int nr, struct pci_sys_data *sys) rcar_pci_setup_errirq(priv); /* Add PCI resources */ - pci_add_resource(&sys->resources, &priv->io_res); pci_add_resource(&sys->resources, &priv->mem_res); + ret = devm_request_pci_bus_resources(priv->dev, &sys->resources); + if (ret < 0) + return ret; /* Setup bus number based on platform device id / of bus-range */ sys->busnr = priv->busnr; @@ -371,14 +374,6 @@ static int rcar_pci_probe(struct platform_device *pdev) return -ENOMEM; priv->mem_res = *mem_res; - /* - * The controller does not support/use port I/O, - * so setup a dummy port I/O region here. - */ - priv->io_res.start = priv->mem_res.start; - priv->io_res.end = priv->mem_res.end; - priv->io_res.flags = IORESOURCE_IO; - priv->cfg_res = cfg_res; priv->irq = platform_get_irq(pdev, 0); @@ -421,6 +416,7 @@ static int rcar_pci_probe(struct platform_device *pdev) hw_private[0] = priv; memset(&hw, 0, sizeof(hw)); hw.nr_controllers = ARRAY_SIZE(hw_private); + hw.io_optional = 1; hw.private_data = hw_private; hw.map_irq = rcar_pci_map_irq; hw.ops = &rcar_pci_ops; @@ -437,8 +433,6 @@ static struct of_device_id rcar_pci_of_match[] = { { }, }; -MODULE_DEVICE_TABLE(of, rcar_pci_of_match); - static struct platform_driver rcar_pci_driver = { .driver = { .name = "pci-rcar-gen2", @@ -447,9 +441,4 @@ static struct platform_driver rcar_pci_driver = { }, .probe = rcar_pci_probe, }; - -module_platform_driver(rcar_pci_driver); - -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Renesas R-Car Gen2 internal PCI"); -MODULE_AUTHOR("Valentine Barshak "); +builtin_platform_driver(rcar_pci_driver); diff --git a/drivers/pci/host/pci-tegra.c b/drivers/pci/host/pci-tegra.c index c388468c202a..6de0757b11e4 100644 --- a/drivers/pci/host/pci-tegra.c +++ b/drivers/pci/host/pci-tegra.c @@ -9,6 +9,8 @@ * * Bits taken from arch/arm/mach-dove/pcie.c * + * Author: Thierry Reding + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -32,7 +34,7 @@ #include #include #include -#include +#include #include #include #include @@ -183,26 +185,26 @@ #define AFI_PEXBIAS_CTRL_0 0x168 -#define RP_VEND_XP 0x00000F00 +#define RP_VEND_XP 0x00000f00 #define RP_VEND_XP_DL_UP (1 << 30) -#define RP_PRIV_MISC 0x00000FE0 -#define RP_PRIV_MISC_PRSNT_MAP_EP_PRSNT (0xE << 0) -#define RP_PRIV_MISC_PRSNT_MAP_EP_ABSNT (0xF << 0) +#define RP_PRIV_MISC 0x00000fe0 +#define RP_PRIV_MISC_PRSNT_MAP_EP_PRSNT (0xe << 0) +#define RP_PRIV_MISC_PRSNT_MAP_EP_ABSNT (0xf << 0) #define RP_LINK_CONTROL_STATUS 0x00000090 #define RP_LINK_CONTROL_STATUS_DL_LINK_ACTIVE 0x20000000 #define RP_LINK_CONTROL_STATUS_LINKSTAT_MASK 0x3fff0000 -#define PADS_CTL_SEL 0x0000009C +#define PADS_CTL_SEL 0x0000009c -#define PADS_CTL 0x000000A0 +#define PADS_CTL 0x000000a0 #define PADS_CTL_IDDQ_1L (1 << 0) #define PADS_CTL_TX_DATA_EN_1L (1 << 6) #define PADS_CTL_RX_DATA_EN_1L (1 << 10) -#define PADS_PLL_CTL_TEGRA20 0x000000B8 -#define PADS_PLL_CTL_TEGRA30 0x000000B4 +#define PADS_PLL_CTL_TEGRA20 0x000000b8 +#define PADS_PLL_CTL_TEGRA30 0x000000b4 #define PADS_PLL_CTL_RST_B4SM (1 << 1) #define PADS_PLL_CTL_LOCKDET (1 << 8) #define PADS_PLL_CTL_REFCLK_MASK (0x3 << 16) @@ -214,9 +216,9 @@ #define PADS_PLL_CTL_TXCLKREF_DIV5 (1 << 20) #define PADS_PLL_CTL_TXCLKREF_BUF_EN (1 << 22) -#define PADS_REFCLK_CFG0 0x000000C8 -#define PADS_REFCLK_CFG1 0x000000CC -#define PADS_REFCLK_BIAS 0x000000D0 +#define PADS_REFCLK_CFG0 0x000000c8 +#define PADS_REFCLK_CFG1 0x000000cc +#define PADS_REFCLK_BIAS 0x000000d0 /* * Fields in PADS_REFCLK_CFG*. Those registers form an array of 16-bit @@ -228,15 +230,6 @@ #define PADS_REFCLK_CFG_PREDI_SHIFT 8 /* 11:8 */ #define PADS_REFCLK_CFG_DRVI_SHIFT 12 /* 15:12 */ -/* Default value provided by HW engineering is 0xfa5c */ -#define PADS_REFCLK_CFG_VALUE \ - ( \ - (0x17 << PADS_REFCLK_CFG_TERM_SHIFT) | \ - (0 << PADS_REFCLK_CFG_E_TERM_SHIFT) | \ - (0xa << PADS_REFCLK_CFG_PREDI_SHIFT) | \ - (0xf << PADS_REFCLK_CFG_DRVI_SHIFT) \ - ) - struct tegra_msi { struct msi_controller chip; DECLARE_BITMAP(used, INT_PCI_MSI_NR); @@ -252,6 +245,8 @@ struct tegra_pcie_soc_data { unsigned int msi_base_shift; u32 pads_pll_ctl; u32 tx_ref_sel; + u32 pads_refclk_cfg0; + u32 pads_refclk_cfg1; bool has_pex_clkreq_en; bool has_pex_bias_ctrl; bool has_intr_prsnt_sense; @@ -274,7 +269,6 @@ struct tegra_pcie { struct list_head buses; struct resource *cs; - struct resource all; struct resource io; struct resource pio; struct resource mem; @@ -623,30 +617,21 @@ static int tegra_pcie_setup(int nr, struct pci_sys_data *sys) sys->mem_offset = pcie->offset.mem; sys->io_offset = pcie->offset.io; - err = devm_request_resource(pcie->dev, &pcie->all, &pcie->io); + err = devm_request_resource(pcie->dev, &iomem_resource, &pcie->io); if (err < 0) return err; - err = devm_request_resource(pcie->dev, &ioport_resource, &pcie->pio); - if (err < 0) - return err; - - err = devm_request_resource(pcie->dev, &pcie->all, &pcie->mem); - if (err < 0) - return err; - - err = devm_request_resource(pcie->dev, &pcie->all, &pcie->prefetch); - if (err) - return err; - pci_add_resource_offset(&sys->resources, &pcie->pio, sys->io_offset); pci_add_resource_offset(&sys->resources, &pcie->mem, sys->mem_offset); pci_add_resource_offset(&sys->resources, &pcie->prefetch, sys->mem_offset); pci_add_resource(&sys->resources, &pcie->busn); - pci_ioremap_io(pcie->pio.start, pcie->io.start); + err = devm_request_pci_bus_resources(pcie->dev, &sys->resources); + if (err < 0) + return err; + pci_remap_iospace(&pcie->pio, pcie->io.start); return 1; } @@ -838,12 +823,6 @@ static int tegra_pcie_phy_enable(struct tegra_pcie *pcie) value |= PADS_PLL_CTL_RST_B4SM; pads_writel(pcie, value, soc->pads_pll_ctl); - /* Configure the reference clock driver */ - value = PADS_REFCLK_CFG_VALUE | (PADS_REFCLK_CFG_VALUE << 16); - pads_writel(pcie, value, PADS_REFCLK_CFG0); - if (soc->num_ports > 2) - pads_writel(pcie, PADS_REFCLK_CFG_VALUE, PADS_REFCLK_CFG1); - /* wait for the PLL to lock */ err = tegra_pcie_pll_wait(pcie, 500); if (err < 0) { @@ -927,6 +906,7 @@ static int tegra_pcie_port_phy_power_off(struct tegra_pcie_port *port) static int tegra_pcie_phy_power_on(struct tegra_pcie *pcie) { + const struct tegra_pcie_soc_data *soc = pcie->soc_data; struct tegra_pcie_port *port; int err; @@ -952,6 +932,12 @@ static int tegra_pcie_phy_power_on(struct tegra_pcie *pcie) } } + /* Configure the reference clock driver */ + pads_writel(pcie, soc->pads_refclk_cfg0, PADS_REFCLK_CFG0); + + if (soc->num_ports > 2) + pads_writel(pcie, soc->pads_refclk_cfg1, PADS_REFCLK_CFG1); + return 0; } @@ -1822,12 +1808,6 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) struct resource res; int err; - memset(&pcie->all, 0, sizeof(pcie->all)); - pcie->all.flags = IORESOURCE_MEM; - pcie->all.name = np->full_name; - pcie->all.start = ~0; - pcie->all.end = 0; - if (of_pci_range_parser_init(&parser, np)) { dev_err(pcie->dev, "missing \"ranges\" property\n"); return -EINVAL; @@ -1880,18 +1860,8 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) } break; } - - if (res.start <= pcie->all.start) - pcie->all.start = res.start; - - if (res.end >= pcie->all.end) - pcie->all.end = res.end; } - err = devm_request_resource(pcie->dev, &iomem_resource, &pcie->all); - if (err < 0) - return err; - err = of_pci_parse_bus_range(np, &pcie->busn); if (err < 0) { dev_err(pcie->dev, "failed to parse ranges property: %d\n", @@ -2078,6 +2048,7 @@ static const struct tegra_pcie_soc_data tegra20_pcie_data = { .msi_base_shift = 0, .pads_pll_ctl = PADS_PLL_CTL_TEGRA20, .tx_ref_sel = PADS_PLL_CTL_TXCLKREF_DIV10, + .pads_refclk_cfg0 = 0xfa5cfa5c, .has_pex_clkreq_en = false, .has_pex_bias_ctrl = false, .has_intr_prsnt_sense = false, @@ -2090,6 +2061,8 @@ static const struct tegra_pcie_soc_data tegra30_pcie_data = { .msi_base_shift = 8, .pads_pll_ctl = PADS_PLL_CTL_TEGRA30, .tx_ref_sel = PADS_PLL_CTL_TXCLKREF_BUF_EN, + .pads_refclk_cfg0 = 0xfa5cfa5c, + .pads_refclk_cfg1 = 0xfa5cfa5c, .has_pex_clkreq_en = true, .has_pex_bias_ctrl = true, .has_intr_prsnt_sense = true, @@ -2102,6 +2075,7 @@ static const struct tegra_pcie_soc_data tegra124_pcie_data = { .msi_base_shift = 8, .pads_pll_ctl = PADS_PLL_CTL_TEGRA30, .tx_ref_sel = PADS_PLL_CTL_TXCLKREF_BUF_EN, + .pads_refclk_cfg0 = 0x44ac44ac, .has_pex_clkreq_en = true, .has_pex_bias_ctrl = true, .has_intr_prsnt_sense = true, @@ -2115,7 +2089,6 @@ static const struct of_device_id tegra_pcie_of_match[] = { { .compatible = "nvidia,tegra20-pcie", .data = &tegra20_pcie_data }, { }, }; -MODULE_DEVICE_TABLE(of, tegra_pcie_of_match); static void *tegra_pcie_ports_seq_start(struct seq_file *s, loff_t *pos) { @@ -2249,8 +2222,6 @@ static int tegra_pcie_probe(struct platform_device *pdev) if (err < 0) return err; - pcibios_min_mem = 0; - err = tegra_pcie_get_resources(pcie); if (err < 0) { dev_err(&pdev->dev, "failed to request resources: %d\n", err); @@ -2306,8 +2277,4 @@ static struct platform_driver tegra_pcie_driver = { }, .probe = tegra_pcie_probe, }; -module_platform_driver(tegra_pcie_driver); - -MODULE_AUTHOR("Thierry Reding "); -MODULE_DESCRIPTION("NVIDIA Tegra PCIe driver"); -MODULE_LICENSE("GPL v2"); +builtin_platform_driver(tegra_pcie_driver); diff --git a/drivers/pci/host/pci-thunder-ecam.c b/drivers/pci/host/pci-thunder-ecam.c index 540d030613eb..d50a3dc2d8db 100644 --- a/drivers/pci/host/pci-thunder-ecam.c +++ b/drivers/pci/host/pci-thunder-ecam.c @@ -7,14 +7,13 @@ */ #include -#include +#include #include #include #include +#include #include -#include "../ecam.h" - static void set_val(u32 v, int where, int size, u32 *val) { int shift = (where & 3) * 8; @@ -360,7 +359,6 @@ static const struct of_device_id thunder_ecam_of_match[] = { { .compatible = "cavium,pci-host-thunder-ecam" }, { }, }; -MODULE_DEVICE_TABLE(of, thunder_ecam_of_match); static int thunder_ecam_probe(struct platform_device *pdev) { @@ -374,7 +372,4 @@ static struct platform_driver thunder_ecam_driver = { }, .probe = thunder_ecam_probe, }; -module_platform_driver(thunder_ecam_driver); - -MODULE_DESCRIPTION("Thunder ECAM PCI host driver"); -MODULE_LICENSE("GPL v2"); +builtin_platform_driver(thunder_ecam_driver); diff --git a/drivers/pci/host/pci-thunder-pem.c b/drivers/pci/host/pci-thunder-pem.c index 9b8ab94f3c8c..6abaf80ffb39 100644 --- a/drivers/pci/host/pci-thunder-pem.c +++ b/drivers/pci/host/pci-thunder-pem.c @@ -15,13 +15,12 @@ */ #include -#include +#include #include #include +#include #include -#include "../ecam.h" - #define PEM_CFG_WR 0x28 #define PEM_CFG_RD 0x30 @@ -285,8 +284,9 @@ static int thunder_pem_config_write(struct pci_bus *bus, unsigned int devfn, return pci_generic_config_write(bus, devfn, where, size, val); } -static int thunder_pem_init(struct device *dev, struct pci_config_window *cfg) +static int thunder_pem_init(struct pci_config_window *cfg) { + struct device *dev = cfg->parent; resource_size_t bar4_start; struct resource *res_pem; struct thunder_pem_pci *pem_pci; @@ -346,7 +346,6 @@ static const struct of_device_id thunder_pem_of_match[] = { { .compatible = "cavium,pci-host-thunder-pem" }, { }, }; -MODULE_DEVICE_TABLE(of, thunder_pem_of_match); static int thunder_pem_probe(struct platform_device *pdev) { @@ -360,7 +359,4 @@ static struct platform_driver thunder_pem_driver = { }, .probe = thunder_pem_probe, }; -module_platform_driver(thunder_pem_driver); - -MODULE_DESCRIPTION("Thunder PEM PCIe host driver"); -MODULE_LICENSE("GPL v2"); +builtin_platform_driver(thunder_pem_driver); diff --git a/drivers/pci/host/pci-versatile.c b/drivers/pci/host/pci-versatile.c index f843a72dc51c..f234405770ab 100644 --- a/drivers/pci/host/pci-versatile.c +++ b/drivers/pci/host/pci-versatile.c @@ -80,21 +80,21 @@ static int versatile_pci_parse_request_of_pci_ranges(struct device *dev, if (err) return err; + err = devm_request_pci_bus_resources(dev, res); + if (err) + goto out_release_res; + resource_list_for_each_entry(win, res) { - struct resource *parent, *res = win->res; + struct resource *res = win->res; switch (resource_type(res)) { case IORESOURCE_IO: - parent = &ioport_resource; err = pci_remap_iospace(res, iobase); - if (err) { + if (err) dev_warn(dev, "error %d: failed to map resource %pR\n", err, res); - continue; - } break; case IORESOURCE_MEM: - parent = &iomem_resource; res_valid |= !(res->flags & IORESOURCE_PREFETCH); writel(res->start >> 28, PCI_IMAP(mem)); @@ -102,23 +102,14 @@ static int versatile_pci_parse_request_of_pci_ranges(struct device *dev, mem++; break; - case IORESOURCE_BUS: - default: - continue; } - - err = devm_request_resource(dev, parent, res); - if (err) - goto out_release_res; } - if (!res_valid) { - dev_err(dev, "non-prefetchable memory resource required\n"); - err = -EINVAL; - goto out_release_res; - } + if (res_valid) + return 0; - return 0; + dev_err(dev, "non-prefetchable memory resource required\n"); + err = -EINVAL; out_release_res: pci_free_resource_list(res); diff --git a/drivers/pci/host/pci-xgene.c b/drivers/pci/host/pci-xgene.c index ae00ce22d5a6..a81273c23341 100644 --- a/drivers/pci/host/pci-xgene.c +++ b/drivers/pci/host/pci-xgene.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -540,14 +540,20 @@ static int xgene_pcie_probe_bridge(struct platform_device *pdev) if (ret) return ret; + ret = devm_request_pci_bus_resources(&pdev->dev, &res); + if (ret) + goto error; + ret = xgene_pcie_setup(port, &res, iobase); if (ret) - return ret; + goto error; bus = pci_create_root_bus(&pdev->dev, 0, &xgene_pcie_ops, port, &res); - if (!bus) - return -ENOMEM; + if (!bus) { + ret = -ENOMEM; + goto error; + } pci_scan_child_bus(bus); pci_assign_unassigned_bus_resources(bus); @@ -555,6 +561,10 @@ static int xgene_pcie_probe_bridge(struct platform_device *pdev) platform_set_drvdata(pdev, port); return 0; + +error: + pci_free_resource_list(&res); + return ret; } static const struct of_device_id xgene_pcie_match_table[] = { @@ -569,8 +579,4 @@ static struct platform_driver xgene_pcie_driver = { }, .probe = xgene_pcie_probe_bridge, }; -module_platform_driver(xgene_pcie_driver); - -MODULE_AUTHOR("Tanmay Inamdar "); -MODULE_DESCRIPTION("APM X-Gene PCIe driver"); -MODULE_LICENSE("GPL v2"); +builtin_platform_driver(xgene_pcie_driver); diff --git a/drivers/pci/host/pcie-altera.c b/drivers/pci/host/pcie-altera.c index dbac6fb3f0bd..2b7837650db8 100644 --- a/drivers/pci/host/pcie-altera.c +++ b/drivers/pci/host/pcie-altera.c @@ -61,6 +61,8 @@ #define TLP_LOOP 500 #define RP_DEVFN 0 +#define LINK_UP_TIMEOUT 5000 + #define INTX_NUM 4 #define DWORD_MASK 3 @@ -81,9 +83,30 @@ struct tlp_rp_regpair_t { u32 reg1; }; +static inline void cra_writel(struct altera_pcie *pcie, const u32 value, + const u32 reg) +{ + writel_relaxed(value, pcie->cra_base + reg); +} + +static inline u32 cra_readl(struct altera_pcie *pcie, const u32 reg) +{ + return readl_relaxed(pcie->cra_base + reg); +} + +static bool altera_pcie_link_is_up(struct altera_pcie *pcie) +{ + return !!((cra_readl(pcie, RP_LTSSM) & RP_LTSSM_MASK) == LTSSM_L0); +} + static void altera_pcie_retrain(struct pci_dev *dev) { u16 linkcap, linkstat; + struct altera_pcie *pcie = dev->bus->sysdata; + int timeout = 0; + + if (!altera_pcie_link_is_up(pcie)) + return; /* * Set the retrain bit if the PCIe rootport support > 2.5GB/s, but @@ -95,9 +118,16 @@ static void altera_pcie_retrain(struct pci_dev *dev) return; pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &linkstat); - if ((linkstat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB) + if ((linkstat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB) { pcie_capability_set_word(dev, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_RL); + while (!altera_pcie_link_is_up(pcie)) { + timeout++; + if (timeout > LINK_UP_TIMEOUT) + break; + udelay(5); + } + } } DECLARE_PCI_FIXUP_EARLY(0x1172, PCI_ANY_ID, altera_pcie_retrain); @@ -120,17 +150,6 @@ static bool altera_pcie_hide_rc_bar(struct pci_bus *bus, unsigned int devfn, return false; } -static inline void cra_writel(struct altera_pcie *pcie, const u32 value, - const u32 reg) -{ - writel_relaxed(value, pcie->cra_base + reg); -} - -static inline u32 cra_readl(struct altera_pcie *pcie, const u32 reg) -{ - return readl_relaxed(pcie->cra_base + reg); -} - static void tlp_write_tx(struct altera_pcie *pcie, struct tlp_rp_regpair_t *tlp_rp_regdata) { @@ -139,11 +158,6 @@ static void tlp_write_tx(struct altera_pcie *pcie, cra_writel(pcie, tlp_rp_regdata->ctrl, RP_TX_CNTRL); } -static bool altera_pcie_link_is_up(struct altera_pcie *pcie) -{ - return !!((cra_readl(pcie, RP_LTSSM) & RP_LTSSM_MASK) == LTSSM_L0); -} - static bool altera_pcie_valid_config(struct altera_pcie *pcie, struct pci_bus *bus, int dev) { @@ -415,11 +429,6 @@ static void altera_pcie_isr(struct irq_desc *desc) chained_irq_exit(chip, desc); } -static void altera_pcie_release_of_pci_ranges(struct altera_pcie *pcie) -{ - pci_free_resource_list(&pcie->resources); -} - static int altera_pcie_parse_request_of_pci_ranges(struct altera_pcie *pcie) { int err, res_valid = 0; @@ -432,33 +441,25 @@ static int altera_pcie_parse_request_of_pci_ranges(struct altera_pcie *pcie) if (err) return err; - resource_list_for_each_entry(win, &pcie->resources) { - struct resource *parent, *res = win->res; - - switch (resource_type(res)) { - case IORESOURCE_MEM: - parent = &iomem_resource; - res_valid |= !(res->flags & IORESOURCE_PREFETCH); - break; - default: - continue; - } - - err = devm_request_resource(dev, parent, res); - if (err) - goto out_release_res; - } - - if (!res_valid) { - dev_err(dev, "non-prefetchable memory resource required\n"); - err = -EINVAL; + err = devm_request_pci_bus_resources(dev, &pcie->resources); + if (err) goto out_release_res; + + resource_list_for_each_entry(win, &pcie->resources) { + struct resource *res = win->res; + + if (resource_type(res) == IORESOURCE_MEM) + res_valid |= !(res->flags & IORESOURCE_PREFETCH); } - return 0; + if (res_valid) + return 0; + + dev_err(dev, "non-prefetchable memory resource required\n"); + err = -EINVAL; out_release_res: - altera_pcie_release_of_pci_ranges(pcie); + pci_free_resource_list(&pcie->resources); return err; } diff --git a/drivers/pci/host/pcie-armada8k.c b/drivers/pci/host/pcie-armada8k.c index 55723567b5d4..0f4f570068e3 100644 --- a/drivers/pci/host/pcie-armada8k.c +++ b/drivers/pci/host/pcie-armada8k.c @@ -5,6 +5,9 @@ * * Copyright (C) 2016 Marvell Technology Group Ltd. * + * Author: Yehuda Yitshak + * Author: Shadi Ammouri + * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without any * warranty of any kind, whether express or implied. @@ -14,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -244,7 +247,6 @@ static const struct of_device_id armada8k_pcie_of_match[] = { { .compatible = "marvell,armada8k-pcie", }, {}, }; -MODULE_DEVICE_TABLE(of, armada8k_pcie_of_match); static struct platform_driver armada8k_pcie_driver = { .probe = armada8k_pcie_probe, @@ -253,10 +255,4 @@ static struct platform_driver armada8k_pcie_driver = { .of_match_table = of_match_ptr(armada8k_pcie_of_match), }, }; - -module_platform_driver(armada8k_pcie_driver); - -MODULE_DESCRIPTION("Armada 8k PCIe host controller driver"); -MODULE_AUTHOR("Yehuda Yitshak "); -MODULE_AUTHOR("Shadi Ammouri "); -MODULE_LICENSE("GPL v2"); +builtin_platform_driver(armada8k_pcie_driver); diff --git a/drivers/pci/host/pcie-artpec6.c b/drivers/pci/host/pcie-artpec6.c new file mode 100644 index 000000000000..16ba70b7ec65 --- /dev/null +++ b/drivers/pci/host/pcie-artpec6.c @@ -0,0 +1,280 @@ +/* + * PCIe host controller driver for Axis ARTPEC-6 SoC + * + * Author: Niklas Cassel + * + * Based on work done by Phil Edworthy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pcie-designware.h" + +#define to_artpec6_pcie(x) container_of(x, struct artpec6_pcie, pp) + +struct artpec6_pcie { + struct pcie_port pp; + struct regmap *regmap; + void __iomem *phy_base; +}; + +/* PCIe Port Logic registers (memory-mapped) */ +#define PL_OFFSET 0x700 +#define PCIE_PHY_DEBUG_R0 (PL_OFFSET + 0x28) +#define PCIE_PHY_DEBUG_R1 (PL_OFFSET + 0x2c) + +#define MISC_CONTROL_1_OFF (PL_OFFSET + 0x1bc) +#define DBI_RO_WR_EN 1 + +/* ARTPEC-6 specific registers */ +#define PCIECFG 0x18 +#define PCIECFG_DBG_OEN (1 << 24) +#define PCIECFG_CORE_RESET_REQ (1 << 21) +#define PCIECFG_LTSSM_ENABLE (1 << 20) +#define PCIECFG_CLKREQ_B (1 << 11) +#define PCIECFG_REFCLK_ENABLE (1 << 10) +#define PCIECFG_PLL_ENABLE (1 << 9) +#define PCIECFG_PCLK_ENABLE (1 << 8) +#define PCIECFG_RISRCREN (1 << 4) +#define PCIECFG_MODE_TX_DRV_EN (1 << 3) +#define PCIECFG_CISRREN (1 << 2) +#define PCIECFG_MACRO_ENABLE (1 << 0) + +#define NOCCFG 0x40 +#define NOCCFG_ENABLE_CLK_PCIE (1 << 4) +#define NOCCFG_POWER_PCIE_IDLEACK (1 << 3) +#define NOCCFG_POWER_PCIE_IDLE (1 << 2) +#define NOCCFG_POWER_PCIE_IDLEREQ (1 << 1) + +#define PHY_STATUS 0x118 +#define PHY_COSPLLLOCK (1 << 0) + +#define ARTPEC6_CPU_TO_BUS_ADDR 0x0fffffff + +static int artpec6_pcie_establish_link(struct pcie_port *pp) +{ + struct artpec6_pcie *artpec6_pcie = to_artpec6_pcie(pp); + u32 val; + unsigned int retries; + + /* Hold DW core in reset */ + regmap_read(artpec6_pcie->regmap, PCIECFG, &val); + val |= PCIECFG_CORE_RESET_REQ; + regmap_write(artpec6_pcie->regmap, PCIECFG, val); + + regmap_read(artpec6_pcie->regmap, PCIECFG, &val); + val |= PCIECFG_RISRCREN | /* Receiver term. 50 Ohm */ + PCIECFG_MODE_TX_DRV_EN | + PCIECFG_CISRREN | /* Reference clock term. 100 Ohm */ + PCIECFG_MACRO_ENABLE; + val |= PCIECFG_REFCLK_ENABLE; + val &= ~PCIECFG_DBG_OEN; + val &= ~PCIECFG_CLKREQ_B; + regmap_write(artpec6_pcie->regmap, PCIECFG, val); + usleep_range(5000, 6000); + + regmap_read(artpec6_pcie->regmap, NOCCFG, &val); + val |= NOCCFG_ENABLE_CLK_PCIE; + regmap_write(artpec6_pcie->regmap, NOCCFG, val); + usleep_range(20, 30); + + regmap_read(artpec6_pcie->regmap, PCIECFG, &val); + val |= PCIECFG_PCLK_ENABLE | PCIECFG_PLL_ENABLE; + regmap_write(artpec6_pcie->regmap, PCIECFG, val); + usleep_range(6000, 7000); + + regmap_read(artpec6_pcie->regmap, NOCCFG, &val); + val &= ~NOCCFG_POWER_PCIE_IDLEREQ; + regmap_write(artpec6_pcie->regmap, NOCCFG, val); + + retries = 50; + do { + usleep_range(1000, 2000); + regmap_read(artpec6_pcie->regmap, NOCCFG, &val); + retries--; + } while (retries && + (val & (NOCCFG_POWER_PCIE_IDLEACK | NOCCFG_POWER_PCIE_IDLE))); + + retries = 50; + do { + usleep_range(1000, 2000); + val = readl(artpec6_pcie->phy_base + PHY_STATUS); + retries--; + } while (retries && !(val & PHY_COSPLLLOCK)); + + /* Take DW core out of reset */ + regmap_read(artpec6_pcie->regmap, PCIECFG, &val); + val &= ~PCIECFG_CORE_RESET_REQ; + regmap_write(artpec6_pcie->regmap, PCIECFG, val); + usleep_range(100, 200); + + /* + * Enable writing to config regs. This is required as the Synopsys + * driver changes the class code. That register needs DBI write enable. + */ + writel(DBI_RO_WR_EN, pp->dbi_base + MISC_CONTROL_1_OFF); + + pp->io_base &= ARTPEC6_CPU_TO_BUS_ADDR; + pp->mem_base &= ARTPEC6_CPU_TO_BUS_ADDR; + pp->cfg0_base &= ARTPEC6_CPU_TO_BUS_ADDR; + pp->cfg1_base &= ARTPEC6_CPU_TO_BUS_ADDR; + + /* setup root complex */ + dw_pcie_setup_rc(pp); + + /* assert LTSSM enable */ + regmap_read(artpec6_pcie->regmap, PCIECFG, &val); + val |= PCIECFG_LTSSM_ENABLE; + regmap_write(artpec6_pcie->regmap, PCIECFG, val); + + /* check if the link is up or not */ + if (!dw_pcie_wait_for_link(pp)) + return 0; + + dev_dbg(pp->dev, "DEBUG_R0: 0x%08x, DEBUG_R1: 0x%08x\n", + readl(pp->dbi_base + PCIE_PHY_DEBUG_R0), + readl(pp->dbi_base + PCIE_PHY_DEBUG_R1)); + + return -ETIMEDOUT; +} + +static void artpec6_pcie_enable_interrupts(struct pcie_port *pp) +{ + if (IS_ENABLED(CONFIG_PCI_MSI)) + dw_pcie_msi_init(pp); +} + +static void artpec6_pcie_host_init(struct pcie_port *pp) +{ + artpec6_pcie_establish_link(pp); + artpec6_pcie_enable_interrupts(pp); +} + +static int artpec6_pcie_link_up(struct pcie_port *pp) +{ + u32 rc; + + /* + * Get status from Synopsys IP + * link is debug bit 36, debug register 1 starts at bit 32 + */ + rc = readl(pp->dbi_base + PCIE_PHY_DEBUG_R1) & (0x1 << (36 - 32)); + if (rc) + return 1; + + return 0; +} + +static struct pcie_host_ops artpec6_pcie_host_ops = { + .link_up = artpec6_pcie_link_up, + .host_init = artpec6_pcie_host_init, +}; + +static irqreturn_t artpec6_pcie_msi_handler(int irq, void *arg) +{ + struct pcie_port *pp = arg; + + return dw_handle_msi_irq(pp); +} + +static int __init artpec6_add_pcie_port(struct pcie_port *pp, + struct platform_device *pdev) +{ + int ret; + + if (IS_ENABLED(CONFIG_PCI_MSI)) { + pp->msi_irq = platform_get_irq_byname(pdev, "msi"); + if (pp->msi_irq <= 0) { + dev_err(&pdev->dev, "failed to get MSI irq\n"); + return -ENODEV; + } + + ret = devm_request_irq(&pdev->dev, pp->msi_irq, + artpec6_pcie_msi_handler, + IRQF_SHARED | IRQF_NO_THREAD, + "artpec6-pcie-msi", pp); + if (ret) { + dev_err(&pdev->dev, "failed to request MSI irq\n"); + return ret; + } + } + + pp->root_bus_nr = -1; + pp->ops = &artpec6_pcie_host_ops; + + ret = dw_pcie_host_init(pp); + if (ret) { + dev_err(&pdev->dev, "failed to initialize host\n"); + return ret; + } + + return 0; +} + +static int artpec6_pcie_probe(struct platform_device *pdev) +{ + struct artpec6_pcie *artpec6_pcie; + struct pcie_port *pp; + struct resource *dbi_base; + struct resource *phy_base; + int ret; + + artpec6_pcie = devm_kzalloc(&pdev->dev, sizeof(*artpec6_pcie), + GFP_KERNEL); + if (!artpec6_pcie) + return -ENOMEM; + + pp = &artpec6_pcie->pp; + pp->dev = &pdev->dev; + + dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); + pp->dbi_base = devm_ioremap_resource(&pdev->dev, dbi_base); + if (IS_ERR(pp->dbi_base)) + return PTR_ERR(pp->dbi_base); + + phy_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "phy"); + artpec6_pcie->phy_base = devm_ioremap_resource(&pdev->dev, phy_base); + if (IS_ERR(artpec6_pcie->phy_base)) + return PTR_ERR(artpec6_pcie->phy_base); + + artpec6_pcie->regmap = + syscon_regmap_lookup_by_phandle(pdev->dev.of_node, + "axis,syscon-pcie"); + if (IS_ERR(artpec6_pcie->regmap)) + return PTR_ERR(artpec6_pcie->regmap); + + ret = artpec6_add_pcie_port(pp, pdev); + if (ret < 0) + return ret; + + platform_set_drvdata(pdev, artpec6_pcie); + return 0; +} + +static const struct of_device_id artpec6_pcie_of_match[] = { + { .compatible = "axis,artpec6-pcie", }, + {}, +}; + +static struct platform_driver artpec6_pcie_driver = { + .probe = artpec6_pcie_probe, + .driver = { + .name = "artpec6-pcie", + .of_match_table = artpec6_pcie_of_match, + }, +}; +builtin_platform_driver(artpec6_pcie_driver); diff --git a/drivers/pci/host/pcie-designware-plat.c b/drivers/pci/host/pcie-designware-plat.c index b3500994d08a..c8079dc81c10 100644 --- a/drivers/pci/host/pcie-designware-plat.c +++ b/drivers/pci/host/pcie-designware-plat.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -121,7 +121,6 @@ static const struct of_device_id dw_plat_pcie_of_match[] = { { .compatible = "snps,dw-pcie", }, {}, }; -MODULE_DEVICE_TABLE(of, dw_plat_pcie_of_match); static struct platform_driver dw_plat_pcie_driver = { .driver = { @@ -130,9 +129,4 @@ static struct platform_driver dw_plat_pcie_driver = { }, .probe = dw_plat_pcie_probe, }; - -module_platform_driver(dw_plat_pcie_driver); - -MODULE_AUTHOR("Joao Pinto "); -MODULE_DESCRIPTION("Synopsys PCIe host controller glue platform driver"); -MODULE_LICENSE("GPL v2"); +builtin_platform_driver(dw_plat_pcie_driver); diff --git a/drivers/pci/host/pcie-designware.c b/drivers/pci/host/pcie-designware.c index aafd766546f3..12afce19890b 100644 --- a/drivers/pci/host/pcie-designware.c +++ b/drivers/pci/host/pcie-designware.c @@ -452,6 +452,10 @@ int dw_pcie_host_init(struct pcie_port *pp) if (ret) return ret; + ret = devm_request_pci_bus_resources(&pdev->dev, &res); + if (ret) + goto error; + /* Get the I/O and memory ranges from DT */ resource_list_for_each_entry(win, &res) { switch (resource_type(win->res)) { @@ -461,11 +465,9 @@ int dw_pcie_host_init(struct pcie_port *pp) pp->io_size = resource_size(pp->io); pp->io_bus_addr = pp->io->start - win->offset; ret = pci_remap_iospace(pp->io, pp->io_base); - if (ret) { + if (ret) dev_warn(pp->dev, "error %d: failed to map resource %pR\n", ret, pp->io); - continue; - } break; case IORESOURCE_MEM: pp->mem = win->res; @@ -483,8 +485,6 @@ int dw_pcie_host_init(struct pcie_port *pp) case IORESOURCE_BUS: pp->busn = win->res; break; - default: - continue; } } @@ -493,7 +493,8 @@ int dw_pcie_host_init(struct pcie_port *pp) resource_size(pp->cfg)); if (!pp->dbi_base) { dev_err(pp->dev, "error with ioremap\n"); - return -ENOMEM; + ret = -ENOMEM; + goto error; } } @@ -504,7 +505,8 @@ int dw_pcie_host_init(struct pcie_port *pp) pp->cfg0_size); if (!pp->va_cfg0_base) { dev_err(pp->dev, "error with ioremap in function\n"); - return -ENOMEM; + ret = -ENOMEM; + goto error; } } @@ -513,7 +515,8 @@ int dw_pcie_host_init(struct pcie_port *pp) pp->cfg1_size); if (!pp->va_cfg1_base) { dev_err(pp->dev, "error with ioremap\n"); - return -ENOMEM; + ret = -ENOMEM; + goto error; } } @@ -528,7 +531,8 @@ int dw_pcie_host_init(struct pcie_port *pp) &dw_pcie_msi_chip); if (!pp->irq_domain) { dev_err(pp->dev, "irq domain init failed\n"); - return -ENXIO; + ret = -ENXIO; + goto error; } for (i = 0; i < MAX_MSI_IRQS; i++) @@ -536,7 +540,7 @@ int dw_pcie_host_init(struct pcie_port *pp) } else { ret = pp->ops->msi_host_init(pp, &dw_pcie_msi_chip); if (ret < 0) - return ret; + goto error; } } @@ -552,8 +556,10 @@ int dw_pcie_host_init(struct pcie_port *pp) } else bus = pci_scan_root_bus(pp->dev, pp->root_bus_nr, &dw_pcie_ops, pp, &res); - if (!bus) - return -ENOMEM; + if (!bus) { + ret = -ENOMEM; + goto error; + } if (pp->ops->scan_bus) pp->ops->scan_bus(pp); @@ -571,6 +577,10 @@ int dw_pcie_host_init(struct pcie_port *pp) pci_bus_add_devices(bus); return 0; + +error: + pci_free_resource_list(&res); + return ret; } static int dw_pcie_rd_other_conf(struct pcie_port *pp, struct pci_bus *bus, diff --git a/drivers/pci/host/pcie-hisi.c b/drivers/pci/host/pcie-hisi.c index 3e98d4edae2d..7ee9dfcc45fb 100644 --- a/drivers/pci/host/pcie-hisi.c +++ b/drivers/pci/host/pcie-hisi.c @@ -12,7 +12,7 @@ * published by the Free Software Foundation. */ #include -#include +#include #include #include #include @@ -235,9 +235,6 @@ static const struct of_device_id hisi_pcie_of_match[] = { {}, }; - -MODULE_DEVICE_TABLE(of, hisi_pcie_of_match); - static struct platform_driver hisi_pcie_driver = { .probe = hisi_pcie_probe, .driver = { @@ -245,10 +242,4 @@ static struct platform_driver hisi_pcie_driver = { .of_match_table = hisi_pcie_of_match, }, }; - -module_platform_driver(hisi_pcie_driver); - -MODULE_AUTHOR("Zhou Wang "); -MODULE_AUTHOR("Dacai Zhu "); -MODULE_AUTHOR("Gabriele Paoloni "); -MODULE_LICENSE("GPL v2"); +builtin_platform_driver(hisi_pcie_driver); diff --git a/drivers/pci/host/pcie-iproc.c b/drivers/pci/host/pcie-iproc.c index a576aeeb22da..e167b2f0098d 100644 --- a/drivers/pci/host/pcie-iproc.c +++ b/drivers/pci/host/pcie-iproc.c @@ -462,6 +462,10 @@ int iproc_pcie_setup(struct iproc_pcie *pcie, struct list_head *res) if (!pcie || !pcie->dev || !pcie->base) return -EINVAL; + ret = devm_request_pci_bus_resources(pcie->dev, res); + if (ret) + return ret; + ret = phy_init(pcie->phy); if (ret) { dev_err(pcie->dev, "unable to initialize PCIe PHY\n"); diff --git a/drivers/pci/host/pcie-rcar.c b/drivers/pci/host/pcie-rcar.c index 35092188039b..65db7a221509 100644 --- a/drivers/pci/host/pcie-rcar.c +++ b/drivers/pci/host/pcie-rcar.c @@ -7,6 +7,8 @@ * arch/sh/drivers/pci/ops-sh7786.c * Copyright (C) 2009 - 2011 Paul Mundt * + * Author: Phil Edworthy + * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without any * warranty of any kind, whether express or implied. @@ -18,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -936,12 +938,6 @@ static const struct of_device_id rcar_pcie_of_match[] = { { .compatible = "renesas,pcie-r8a7795", .data = rcar_pcie_hw_init }, {}, }; -MODULE_DEVICE_TABLE(of, rcar_pcie_of_match); - -static void rcar_pcie_release_of_pci_ranges(struct rcar_pcie *pci) -{ - pci_free_resource_list(&pci->resources); -} static int rcar_pcie_parse_request_of_pci_ranges(struct rcar_pcie *pci) { @@ -955,37 +951,25 @@ static int rcar_pcie_parse_request_of_pci_ranges(struct rcar_pcie *pci) if (err) return err; - resource_list_for_each_entry(win, &pci->resources) { - struct resource *parent, *res = win->res; + err = devm_request_pci_bus_resources(dev, &pci->resources); + if (err) + goto out_release_res; - switch (resource_type(res)) { - case IORESOURCE_IO: - parent = &ioport_resource; + resource_list_for_each_entry(win, &pci->resources) { + struct resource *res = win->res; + + if (resource_type(res) == IORESOURCE_IO) { err = pci_remap_iospace(res, iobase); - if (err) { + if (err) dev_warn(dev, "error %d: failed to map resource %pR\n", err, res); - continue; - } - break; - case IORESOURCE_MEM: - parent = &iomem_resource; - break; - - case IORESOURCE_BUS: - default: - continue; } - - err = devm_request_resource(dev, parent, res); - if (err) - goto out_release_res; } return 0; out_release_res: - rcar_pcie_release_of_pci_ranges(pci); + pci_free_resource_list(&pci->resources); return err; } @@ -1073,8 +1057,4 @@ static struct platform_driver rcar_pcie_driver = { }, .probe = rcar_pcie_probe, }; -module_platform_driver(rcar_pcie_driver); - -MODULE_AUTHOR("Phil Edworthy "); -MODULE_DESCRIPTION("Renesas R-Car PCIe driver"); -MODULE_LICENSE("GPL v2"); +builtin_platform_driver(rcar_pcie_driver); diff --git a/drivers/pci/host/pcie-xilinx-nwl.c b/drivers/pci/host/pcie-xilinx-nwl.c index 3479d30e2be8..0b597d9190b4 100644 --- a/drivers/pci/host/pcie-xilinx-nwl.c +++ b/drivers/pci/host/pcie-xilinx-nwl.c @@ -825,27 +825,33 @@ static int nwl_pcie_probe(struct platform_device *pdev) err = of_pci_get_host_bridge_resources(node, 0, 0xff, &res, &iobase); if (err) { - pr_err("Getting bridge resources failed\n"); + dev_err(pcie->dev, "Getting bridge resources failed\n"); return err; } + err = devm_request_pci_bus_resources(pcie->dev, &res); + if (err) + goto error; + err = nwl_pcie_init_irq_domain(pcie); if (err) { dev_err(pcie->dev, "Failed creating IRQ Domain\n"); - return err; + goto error; } bus = pci_create_root_bus(&pdev->dev, pcie->root_busno, &nwl_pcie_ops, pcie, &res); - if (!bus) - return -ENOMEM; + if (!bus) { + err = -ENOMEM; + goto error; + } if (IS_ENABLED(CONFIG_PCI_MSI)) { err = nwl_pcie_enable_msi(pcie, bus); if (err < 0) { dev_err(&pdev->dev, "failed to enable MSI support: %d\n", err); - return err; + goto error; } } pci_scan_child_bus(bus); @@ -855,6 +861,10 @@ static int nwl_pcie_probe(struct platform_device *pdev) pci_bus_add_devices(bus); platform_set_drvdata(pdev, pcie); return 0; + +error: + pci_free_resource_list(&res); + return err; } static int nwl_pcie_remove(struct platform_device *pdev) diff --git a/drivers/pci/host/pcie-xilinx.c b/drivers/pci/host/pcie-xilinx.c index 65f0fe0c2eaf..a30e01639557 100644 --- a/drivers/pci/host/pcie-xilinx.c +++ b/drivers/pci/host/pcie-xilinx.c @@ -550,7 +550,7 @@ static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port) pcie_intc_node = of_get_next_child(node, NULL); if (!pcie_intc_node) { dev_err(dev, "No PCIe Intc node found\n"); - return PTR_ERR(pcie_intc_node); + return -ENODEV; } port->irq_domain = irq_domain_add_linear(pcie_intc_node, 4, @@ -558,7 +558,7 @@ static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port) port); if (!port->irq_domain) { dev_err(dev, "Failed to get a INTx IRQ domain\n"); - return PTR_ERR(port->irq_domain); + return -ENODEV; } /* Setup MSI */ @@ -569,7 +569,7 @@ static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port) &xilinx_pcie_msi_chip); if (!port->irq_domain) { dev_err(dev, "Failed to get a MSI IRQ domain\n"); - return PTR_ERR(port->irq_domain); + return -ENODEV; } xilinx_pcie_enable_msi(port); @@ -660,7 +660,6 @@ static int xilinx_pcie_probe(struct platform_device *pdev) struct xilinx_pcie_port *port; struct device *dev = &pdev->dev; struct pci_bus *bus; - int err; resource_size_t iobase = 0; LIST_HEAD(res); @@ -694,10 +693,17 @@ static int xilinx_pcie_probe(struct platform_device *pdev) dev_err(dev, "Getting bridge resources failed\n"); return err; } + + err = devm_request_pci_bus_resources(dev, &res); + if (err) + goto error; + bus = pci_create_root_bus(&pdev->dev, 0, &xilinx_pcie_ops, port, &res); - if (!bus) - return -ENOMEM; + if (!bus) { + err = -ENOMEM; + goto error; + } #ifdef CONFIG_PCI_MSI xilinx_pcie_msi_chip.dev = port->dev; @@ -712,6 +718,10 @@ static int xilinx_pcie_probe(struct platform_device *pdev) platform_set_drvdata(pdev, port); return 0; + +error: + pci_free_resource_list(&res); + return err; } /** diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index fa49f9143b80..6a33ddcfa20b 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -675,6 +675,8 @@ static void acpiphp_check_bridge(struct acpiphp_bridge *bridge) if (bridge->is_going_away) return; + pm_runtime_get_sync(&bridge->pci_dev->dev); + list_for_each_entry(slot, &bridge->slots, node) { struct pci_bus *bus = slot->bus; struct pci_dev *dev, *tmp; @@ -694,6 +696,8 @@ static void acpiphp_check_bridge(struct acpiphp_bridge *bridge) disable_slot(slot); } } + + pm_runtime_put(&bridge->pci_dev->dev); } /* diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 5c24e938042f..08e84d61874e 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -546,6 +546,10 @@ static irqreturn_t pcie_isr(int irq, void *dev_id) u8 present; bool link; + /* Interrupts cannot originate from a controller that's asleep */ + if (pdev->current_state == PCI_D3cold) + return IRQ_NONE; + /* * In order to guarantee that all interrupt events are * serviced, we need to re-inspect Slot Status register after diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index a080f4496fe2..a02981efdad5 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -4,6 +4,7 @@ * * Copyright (C) 2003-2004 Intel * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) + * Copyright (C) 2016 Christoph Hellwig. */ #include @@ -207,6 +208,12 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag); } +static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) +{ + return desc->mask_base + + desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; +} + /* * This internal function does not flush PCI writes to the device. * All users must ensure that they read from the device before either @@ -217,8 +224,6 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag) { u32 mask_bits = desc->masked; - unsigned offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_VECTOR_CTRL; if (pci_msi_ignore_mask) return 0; @@ -226,7 +231,7 @@ u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag) mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; if (flag) mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT; - writel(mask_bits, desc->mask_base + offset); + writel(mask_bits, pci_msix_desc_addr(desc) + PCI_MSIX_ENTRY_VECTOR_CTRL); return mask_bits; } @@ -284,8 +289,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) BUG_ON(dev->current_state != PCI_D0); if (entry->msi_attrib.is_msix) { - void __iomem *base = entry->mask_base + - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + void __iomem *base = pci_msix_desc_addr(entry); msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR); msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR); @@ -315,9 +319,7 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) if (dev->current_state != PCI_D0) { /* Don't touch the hardware now */ } else if (entry->msi_attrib.is_msix) { - void __iomem *base; - base = entry->mask_base + - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + void __iomem *base = pci_msix_desc_addr(entry); writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR); writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR); @@ -567,6 +569,7 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec) entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1; entry->msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec)); entry->nvec_used = nvec; + entry->affinity = dev->irq_affinity; if (control & PCI_MSI_FLAGS_64BIT) entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64; @@ -678,10 +681,18 @@ static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries) static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, struct msix_entry *entries, int nvec) { + const struct cpumask *mask = NULL; struct msi_desc *entry; - int i; + int cpu = -1, i; for (i = 0; i < nvec; i++) { + if (dev->irq_affinity) { + cpu = cpumask_next(cpu, dev->irq_affinity); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(dev->irq_affinity); + mask = cpumask_of(cpu); + } + entry = alloc_msi_entry(&dev->dev); if (!entry) { if (!i) @@ -694,10 +705,14 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; - entry->msi_attrib.entry_nr = entries[i].entry; + if (entries) + entry->msi_attrib.entry_nr = entries[i].entry; + else + entry->msi_attrib.entry_nr = i; entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; entry->nvec_used = 1; + entry->affinity = mask; list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); } @@ -712,13 +727,11 @@ static void msix_program_entries(struct pci_dev *dev, int i = 0; for_each_pci_msi_entry(entry, dev) { - int offset = entries[i].entry * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_VECTOR_CTRL; - - entries[i].vector = entry->irq; - entry->masked = readl(entry->mask_base + offset); + if (entries) + entries[i++].vector = entry->irq; + entry->masked = readl(pci_msix_desc_addr(entry) + + PCI_MSIX_ENTRY_VECTOR_CTRL); msix_mask_irq(entry, 1); - i++; } } @@ -931,7 +944,7 @@ EXPORT_SYMBOL(pci_msix_vec_count); /** * pci_enable_msix - configure device's MSI-X capability structure * @dev: pointer to the pci_dev data structure of MSI-X device function - * @entries: pointer to an array of MSI-X entries + * @entries: pointer to an array of MSI-X entries (optional) * @nvec: number of MSI-X irqs requested for allocation by device driver * * Setup the MSI-X capability structure of device function with the number @@ -951,22 +964,21 @@ int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) if (!pci_msi_supported(dev, nvec)) return -EINVAL; - if (!entries) - return -EINVAL; - nr_entries = pci_msix_vec_count(dev); if (nr_entries < 0) return nr_entries; if (nvec > nr_entries) return nr_entries; - /* Check for any invalid entries */ - for (i = 0; i < nvec; i++) { - if (entries[i].entry >= nr_entries) - return -EINVAL; /* invalid entry */ - for (j = i + 1; j < nvec; j++) { - if (entries[i].entry == entries[j].entry) - return -EINVAL; /* duplicate entry */ + if (entries) { + /* Check for any invalid entries */ + for (i = 0; i < nvec; i++) { + if (entries[i].entry >= nr_entries) + return -EINVAL; /* invalid entry */ + for (j = i + 1; j < nvec; j++) { + if (entries[i].entry == entries[j].entry) + return -EINVAL; /* duplicate entry */ + } } } WARN_ON(!!dev->msix_enabled); @@ -1026,19 +1038,8 @@ int pci_msi_enabled(void) } EXPORT_SYMBOL(pci_msi_enabled); -/** - * pci_enable_msi_range - configure device's MSI capability structure - * @dev: device to configure - * @minvec: minimal number of interrupts to configure - * @maxvec: maximum number of interrupts to configure - * - * This function tries to allocate a maximum possible number of interrupts in a - * range between @minvec and @maxvec. It returns a negative errno if an error - * occurs. If it succeeds, it returns the actual number of interrupts allocated - * and updates the @dev's irq member to the lowest new interrupt number; - * the other interrupt numbers allocated to this device are consecutive. - **/ -int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) +static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, + unsigned int flags) { int nvec; int rc; @@ -1061,26 +1062,86 @@ int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) nvec = pci_msi_vec_count(dev); if (nvec < 0) return nvec; - else if (nvec < minvec) + if (nvec < minvec) return -EINVAL; - else if (nvec > maxvec) + + if (nvec > maxvec) nvec = maxvec; - do { - rc = msi_capability_init(dev, nvec); - if (rc < 0) { - return rc; - } else if (rc > 0) { - if (rc < minvec) + for (;;) { + if (!(flags & PCI_IRQ_NOAFFINITY)) { + dev->irq_affinity = irq_create_affinity_mask(&nvec); + if (nvec < minvec) return -ENOSPC; - nvec = rc; } - } while (rc); - return nvec; + rc = msi_capability_init(dev, nvec); + if (rc == 0) + return nvec; + + kfree(dev->irq_affinity); + dev->irq_affinity = NULL; + + if (rc < 0) + return rc; + if (rc < minvec) + return -ENOSPC; + + nvec = rc; + } +} + +/** + * pci_enable_msi_range - configure device's MSI capability structure + * @dev: device to configure + * @minvec: minimal number of interrupts to configure + * @maxvec: maximum number of interrupts to configure + * + * This function tries to allocate a maximum possible number of interrupts in a + * range between @minvec and @maxvec. It returns a negative errno if an error + * occurs. If it succeeds, it returns the actual number of interrupts allocated + * and updates the @dev's irq member to the lowest new interrupt number; + * the other interrupt numbers allocated to this device are consecutive. + **/ +int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) +{ + return __pci_enable_msi_range(dev, minvec, maxvec, PCI_IRQ_NOAFFINITY); } EXPORT_SYMBOL(pci_enable_msi_range); +static int __pci_enable_msix_range(struct pci_dev *dev, + struct msix_entry *entries, int minvec, int maxvec, + unsigned int flags) +{ + int nvec = maxvec; + int rc; + + if (maxvec < minvec) + return -ERANGE; + + for (;;) { + if (!(flags & PCI_IRQ_NOAFFINITY)) { + dev->irq_affinity = irq_create_affinity_mask(&nvec); + if (nvec < minvec) + return -ENOSPC; + } + + rc = pci_enable_msix(dev, entries, nvec); + if (rc == 0) + return nvec; + + kfree(dev->irq_affinity); + dev->irq_affinity = NULL; + + if (rc < 0) + return rc; + if (rc < minvec) + return -ENOSPC; + + nvec = rc; + } +} + /** * pci_enable_msix_range - configure device's MSI-X capability structure * @dev: pointer to the pci_dev data structure of MSI-X device function @@ -1097,29 +1158,102 @@ EXPORT_SYMBOL(pci_enable_msi_range); * with new allocated MSI-X interrupts. **/ int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, - int minvec, int maxvec) + int minvec, int maxvec) { - int nvec = maxvec; - int rc; - - if (maxvec < minvec) - return -ERANGE; - - do { - rc = pci_enable_msix(dev, entries, nvec); - if (rc < 0) { - return rc; - } else if (rc > 0) { - if (rc < minvec) - return -ENOSPC; - nvec = rc; - } - } while (rc); - - return nvec; + return __pci_enable_msix_range(dev, entries, minvec, maxvec, + PCI_IRQ_NOAFFINITY); } EXPORT_SYMBOL(pci_enable_msix_range); +/** + * pci_alloc_irq_vectors - allocate multiple IRQs for a device + * @dev: PCI device to operate on + * @min_vecs: minimum number of vectors required (must be >= 1) + * @max_vecs: maximum (desired) number of vectors + * @flags: flags or quirks for the allocation + * + * Allocate up to @max_vecs interrupt vectors for @dev, using MSI-X or MSI + * vectors if available, and fall back to a single legacy vector + * if neither is available. Return the number of vectors allocated, + * (which might be smaller than @max_vecs) if successful, or a negative + * error code on error. If less than @min_vecs interrupt vectors are + * available for @dev the function will fail with -ENOSPC. + * + * To get the Linux IRQ number used for a vector that can be passed to + * request_irq() use the pci_irq_vector() helper. + */ +int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags) +{ + int vecs = -ENOSPC; + + if (!(flags & PCI_IRQ_NOMSIX)) { + vecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs, + flags); + if (vecs > 0) + return vecs; + } + + if (!(flags & PCI_IRQ_NOMSI)) { + vecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, flags); + if (vecs > 0) + return vecs; + } + + /* use legacy irq if allowed */ + if (!(flags & PCI_IRQ_NOLEGACY) && min_vecs == 1) + return 1; + return vecs; +} +EXPORT_SYMBOL(pci_alloc_irq_vectors); + +/** + * pci_free_irq_vectors - free previously allocated IRQs for a device + * @dev: PCI device to operate on + * + * Undoes the allocations and enabling in pci_alloc_irq_vectors(). + */ +void pci_free_irq_vectors(struct pci_dev *dev) +{ + pci_disable_msix(dev); + pci_disable_msi(dev); +} +EXPORT_SYMBOL(pci_free_irq_vectors); + +/** + * pci_irq_vector - return Linux IRQ number of a device vector + * @dev: PCI device to operate on + * @nr: device-relative interrupt vector index (0-based). + */ +int pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ + if (dev->msix_enabled) { + struct msi_desc *entry; + int i = 0; + + for_each_pci_msi_entry(entry, dev) { + if (i == nr) + return entry->irq; + i++; + } + WARN_ON_ONCE(1); + return -EINVAL; + } + + if (dev->msi_enabled) { + struct msi_desc *entry = first_pci_msi_entry(dev); + + if (WARN_ON_ONCE(nr >= entry->nvec_used)) + return -EINVAL; + } else { + if (WARN_ON_ONCE(nr > 0)) + return -EINVAL; + } + + return dev->irq + nr; +} +EXPORT_SYMBOL(pci_irq_vector); + struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc) { return to_pci_dev(desc->dev); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index d7ffd66814bb..e39a67c8ef39 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -777,7 +777,7 @@ static int pci_pm_suspend_noirq(struct device *dev) if (!pci_dev->state_saved) { pci_save_state(pci_dev); - if (!pci_has_subordinate(pci_dev)) + if (pci_power_manageable(pci_dev)) pci_prepare_to_sleep(pci_dev); } @@ -1144,7 +1144,6 @@ static int pci_pm_runtime_suspend(struct device *dev) return -ENOSYS; pci_dev->state_saved = false; - pci_dev->no_d3cold = false; error = pm->runtime_suspend(dev); if (error) { /* @@ -1161,8 +1160,6 @@ static int pci_pm_runtime_suspend(struct device *dev) return error; } - if (!pci_dev->d3cold_allowed) - pci_dev->no_d3cold = true; pci_fixup_device(pci_fixup_suspend, pci_dev); diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index d319a9ca9b7b..bcd10c795284 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -406,6 +406,11 @@ static ssize_t d3cold_allowed_store(struct device *dev, return -EINVAL; pdev->d3cold_allowed = !!val; + if (pdev->d3cold_allowed) + pci_d3cold_enable(pdev); + else + pci_d3cold_disable(pdev); + pm_runtime_resume(dev); return count; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index badbddc683f0..aab9d5115a5f 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -7,8 +7,10 @@ * Copyright 1997 -- 2000 Martin Mares */ +#include #include #include +#include #include #include #include @@ -25,7 +27,9 @@ #include #include #include +#include #include +#include #include #include "pci.h" @@ -81,6 +85,9 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE; unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE; unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE; +#define DEFAULT_HOTPLUG_BUS_SIZE 1 +unsigned long pci_hotplug_bus_size = DEFAULT_HOTPLUG_BUS_SIZE; + enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_DEFAULT; /* @@ -101,6 +108,21 @@ unsigned int pcibios_max_latency = 255; /* If set, the PCIe ARI capability will not be used. */ static bool pcie_ari_disabled; +/* Disable bridge_d3 for all PCIe ports */ +static bool pci_bridge_d3_disable; +/* Force bridge_d3 for all PCIe ports */ +static bool pci_bridge_d3_force; + +static int __init pcie_port_pm_setup(char *str) +{ + if (!strcmp(str, "off")) + pci_bridge_d3_disable = true; + else if (!strcmp(str, "force")) + pci_bridge_d3_force = true; + return 1; +} +__setup("pcie_port_pm=", pcie_port_pm_setup); + /** * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children * @bus: pointer to PCI bus structure to search @@ -2155,6 +2177,164 @@ void pci_config_pm_runtime_put(struct pci_dev *pdev) pm_runtime_put_sync(parent); } +/** + * pci_bridge_d3_possible - Is it possible to put the bridge into D3 + * @bridge: Bridge to check + * + * This function checks if it is possible to move the bridge to D3. + * Currently we only allow D3 for recent enough PCIe ports. + */ +static bool pci_bridge_d3_possible(struct pci_dev *bridge) +{ + unsigned int year; + + if (!pci_is_pcie(bridge)) + return false; + + switch (pci_pcie_type(bridge)) { + case PCI_EXP_TYPE_ROOT_PORT: + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_DOWNSTREAM: + if (pci_bridge_d3_disable) + return false; + if (pci_bridge_d3_force) + return true; + + /* + * It should be safe to put PCIe ports from 2015 or newer + * to D3. + */ + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && + year >= 2015) { + return true; + } + break; + } + + return false; +} + +static int pci_dev_check_d3cold(struct pci_dev *dev, void *data) +{ + bool *d3cold_ok = data; + bool no_d3cold; + + /* + * The device needs to be allowed to go D3cold and if it is wake + * capable to do so from D3cold. + */ + no_d3cold = dev->no_d3cold || !dev->d3cold_allowed || + (device_may_wakeup(&dev->dev) && !pci_pme_capable(dev, PCI_D3cold)) || + !pci_power_manageable(dev); + + *d3cold_ok = !no_d3cold; + + return no_d3cold; +} + +/* + * pci_bridge_d3_update - Update bridge D3 capabilities + * @dev: PCI device which is changed + * @remove: Is the device being removed + * + * Update upstream bridge PM capabilities accordingly depending on if the + * device PM configuration was changed or the device is being removed. The + * change is also propagated upstream. + */ +static void pci_bridge_d3_update(struct pci_dev *dev, bool remove) +{ + struct pci_dev *bridge; + bool d3cold_ok = true; + + bridge = pci_upstream_bridge(dev); + if (!bridge || !pci_bridge_d3_possible(bridge)) + return; + + pci_dev_get(bridge); + /* + * If the device is removed we do not care about its D3cold + * capabilities. + */ + if (!remove) + pci_dev_check_d3cold(dev, &d3cold_ok); + + if (d3cold_ok) { + /* + * We need to go through all children to find out if all of + * them can still go to D3cold. + */ + pci_walk_bus(bridge->subordinate, pci_dev_check_d3cold, + &d3cold_ok); + } + + if (bridge->bridge_d3 != d3cold_ok) { + bridge->bridge_d3 = d3cold_ok; + /* Propagate change to upstream bridges */ + pci_bridge_d3_update(bridge, false); + } + + pci_dev_put(bridge); +} + +/** + * pci_bridge_d3_device_changed - Update bridge D3 capabilities on change + * @dev: PCI device that was changed + * + * If a device is added or its PM configuration, such as is it allowed to + * enter D3cold, is changed this function updates upstream bridge PM + * capabilities accordingly. + */ +void pci_bridge_d3_device_changed(struct pci_dev *dev) +{ + pci_bridge_d3_update(dev, false); +} + +/** + * pci_bridge_d3_device_removed - Update bridge D3 capabilities on remove + * @dev: PCI device being removed + * + * Function updates upstream bridge PM capabilities based on other devices + * still left on the bus. + */ +void pci_bridge_d3_device_removed(struct pci_dev *dev) +{ + pci_bridge_d3_update(dev, true); +} + +/** + * pci_d3cold_enable - Enable D3cold for device + * @dev: PCI device to handle + * + * This function can be used in drivers to enable D3cold from the device + * they handle. It also updates upstream PCI bridge PM capabilities + * accordingly. + */ +void pci_d3cold_enable(struct pci_dev *dev) +{ + if (dev->no_d3cold) { + dev->no_d3cold = false; + pci_bridge_d3_device_changed(dev); + } +} +EXPORT_SYMBOL_GPL(pci_d3cold_enable); + +/** + * pci_d3cold_disable - Disable D3cold for device + * @dev: PCI device to handle + * + * This function can be used in drivers to disable D3cold from the device + * they handle. It also updates upstream PCI bridge PM capabilities + * accordingly. + */ +void pci_d3cold_disable(struct pci_dev *dev) +{ + if (!dev->no_d3cold) { + dev->no_d3cold = true; + pci_bridge_d3_device_changed(dev); + } +} +EXPORT_SYMBOL_GPL(pci_d3cold_disable); + /** * pci_pm_init - Initialize PM functions of given PCI device * @dev: PCI device to handle. @@ -2189,6 +2369,7 @@ void pci_pm_init(struct pci_dev *dev) dev->pm_cap = pm; dev->d3_delay = PCI_PM_D3_WAIT; dev->d3cold_delay = PCI_PM_D3COLD_WAIT; + dev->bridge_d3 = pci_bridge_d3_possible(dev); dev->d3cold_allowed = true; dev->d1_support = false; @@ -3165,6 +3346,23 @@ int __weak pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr) #endif } +/** + * pci_unmap_iospace - Unmap the memory mapped I/O space + * @res: resource to be unmapped + * + * Unmap the CPU virtual address @res from virtual address space. + * Only architectures that have memory mapped IO functions defined + * (and the PCI_IOBASE value defined) should call this function. + */ +void pci_unmap_iospace(struct resource *res) +{ +#if defined(PCI_IOBASE) && defined(CONFIG_MMU) + unsigned long vaddr = (unsigned long)PCI_IOBASE + res->start; + + unmap_kernel_range(vaddr, resource_size(res)); +#endif +} + static void __pci_set_master(struct pci_dev *dev, bool enable) { u16 old_cmd, cmd; @@ -4755,6 +4953,7 @@ static DEFINE_SPINLOCK(resource_alignment_lock); static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev) { int seg, bus, slot, func, align_order, count; + unsigned short vendor, device, subsystem_vendor, subsystem_device; resource_size_t align = 0; char *p; @@ -4768,28 +4967,55 @@ static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev) } else { align_order = -1; } - if (sscanf(p, "%x:%x:%x.%x%n", - &seg, &bus, &slot, &func, &count) != 4) { - seg = 0; - if (sscanf(p, "%x:%x.%x%n", - &bus, &slot, &func, &count) != 3) { - /* Invalid format */ - printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: %s\n", - p); + if (strncmp(p, "pci:", 4) == 0) { + /* PCI vendor/device (subvendor/subdevice) ids are specified */ + p += 4; + if (sscanf(p, "%hx:%hx:%hx:%hx%n", + &vendor, &device, &subsystem_vendor, &subsystem_device, &count) != 4) { + if (sscanf(p, "%hx:%hx%n", &vendor, &device, &count) != 2) { + printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: pci:%s\n", + p); + break; + } + subsystem_vendor = subsystem_device = 0; + } + p += count; + if ((!vendor || (vendor == dev->vendor)) && + (!device || (device == dev->device)) && + (!subsystem_vendor || (subsystem_vendor == dev->subsystem_vendor)) && + (!subsystem_device || (subsystem_device == dev->subsystem_device))) { + if (align_order == -1) + align = PAGE_SIZE; + else + align = 1 << align_order; + /* Found */ break; } } - p += count; - if (seg == pci_domain_nr(dev->bus) && - bus == dev->bus->number && - slot == PCI_SLOT(dev->devfn) && - func == PCI_FUNC(dev->devfn)) { - if (align_order == -1) - align = PAGE_SIZE; - else - align = 1 << align_order; - /* Found */ - break; + else { + if (sscanf(p, "%x:%x:%x.%x%n", + &seg, &bus, &slot, &func, &count) != 4) { + seg = 0; + if (sscanf(p, "%x:%x.%x%n", + &bus, &slot, &func, &count) != 3) { + /* Invalid format */ + printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: %s\n", + p); + break; + } + } + p += count; + if (seg == pci_domain_nr(dev->bus) && + bus == dev->bus->number && + slot == PCI_SLOT(dev->devfn) && + func == PCI_FUNC(dev->devfn)) { + if (align_order == -1) + align = PAGE_SIZE; + else + align = 1 << align_order; + /* Found */ + break; + } } if (*p != ';' && *p != ',') { /* End of param or invalid format */ @@ -4897,7 +5123,7 @@ static ssize_t pci_resource_alignment_store(struct bus_type *bus, return pci_set_resource_alignment_param(buf, count); } -BUS_ATTR(resource_alignment, 0644, pci_resource_alignment_show, +static BUS_ATTR(resource_alignment, 0644, pci_resource_alignment_show, pci_resource_alignment_store); static int __init pci_resource_alignment_sysfs_init(void) @@ -4923,7 +5149,7 @@ int pci_get_new_domain_nr(void) } #ifdef CONFIG_PCI_DOMAINS_GENERIC -void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent) +static int of_pci_bus_find_domain_nr(struct device *parent) { static int use_dt_domains = -1; int domain = -1; @@ -4967,7 +5193,13 @@ void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent) domain = -1; } - bus->domain_nr = domain; + return domain; +} + +int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent) +{ + return acpi_disabled ? of_pci_bus_find_domain_nr(parent) : + acpi_pci_bus_find_domain_nr(bus); } #endif #endif @@ -5021,6 +5253,11 @@ static int __init pci_setup(char *str) pci_hotplug_io_size = memparse(str + 9, &str); } else if (!strncmp(str, "hpmemsize=", 10)) { pci_hotplug_mem_size = memparse(str + 10, &str); + } else if (!strncmp(str, "hpbussize=", 10)) { + pci_hotplug_bus_size = + simple_strtoul(str + 10, &str, 0); + if (pci_hotplug_bus_size > 0xff) + pci_hotplug_bus_size = DEFAULT_HOTPLUG_BUS_SIZE; } else if (!strncmp(str, "pcie_bus_tune_off", 17)) { pcie_bus_config = PCIE_BUS_TUNE_OFF; } else if (!strncmp(str, "pcie_bus_safe", 13)) { diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index a814bbb80fcb..9730c474b016 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -82,6 +82,8 @@ void pci_pm_init(struct pci_dev *dev); void pci_ea_init(struct pci_dev *dev); void pci_allocate_cap_save_buffers(struct pci_dev *dev); void pci_free_cap_save_buffers(struct pci_dev *dev); +void pci_bridge_d3_device_changed(struct pci_dev *dev); +void pci_bridge_d3_device_removed(struct pci_dev *dev); static inline void pci_wakeup_event(struct pci_dev *dev) { @@ -94,6 +96,15 @@ static inline bool pci_has_subordinate(struct pci_dev *pci_dev) return !!(pci_dev->subordinate); } +static inline bool pci_power_manageable(struct pci_dev *pci_dev) +{ + /* + * Currently we allow normal PCI devices and PCI bridges transition + * into D3 if their bridge_d3 is set. + */ + return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3; +} + struct pci_vpd_ops { ssize_t (*read)(struct pci_dev *dev, loff_t pos, size_t count, void *buf); ssize_t (*write)(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig index 22ca6412bd15..7fcea75afa4c 100644 --- a/drivers/pci/pcie/Kconfig +++ b/drivers/pci/pcie/Kconfig @@ -83,7 +83,7 @@ config PCIE_PME depends on PCIEPORTBUS && PM config PCIE_DPC - tristate "PCIe Downstream Port Containment support" + bool "PCIe Downstream Port Containment support" depends on PCIEPORTBUS default n help @@ -92,6 +92,3 @@ config PCIE_DPC will be handled by the DPC driver. If your system doesn't have this capability or you do not want to use this feature, it is safe to answer N. - - To compile this driver as a module, choose M here: the module - will be called pcie-dpc. diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 2dfe7fdb77e7..0ec649d961d7 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -139,7 +139,7 @@ static void pcie_set_clkpm_nocheck(struct pcie_link_state *link, int enable) static void pcie_set_clkpm(struct pcie_link_state *link, int enable) { /* Don't enable Clock PM if the link is not Clock PM capable */ - if (!link->clkpm_capable && enable) + if (!link->clkpm_capable) enable = 0; /* Need nothing if the specified equals to current state */ if (link->clkpm_enabled == enable) diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c index ab552f1bc08f..250f87861786 100644 --- a/drivers/pci/pcie/pcie-dpc.c +++ b/drivers/pci/pcie/pcie-dpc.c @@ -15,8 +15,8 @@ struct dpc_dev { struct pcie_device *dev; - struct work_struct work; - int cap_pos; + struct work_struct work; + int cap_pos; }; static void dpc_wait_link_inactive(struct pci_dev *pdev) @@ -89,7 +89,7 @@ static int dpc_probe(struct pcie_device *dev) int status; u16 ctl, cap; - dpc = kzalloc(sizeof(*dpc), GFP_KERNEL); + dpc = devm_kzalloc(&dev->device, sizeof(*dpc), GFP_KERNEL); if (!dpc) return -ENOMEM; @@ -98,11 +98,12 @@ static int dpc_probe(struct pcie_device *dev) INIT_WORK(&dpc->work, interrupt_event_handler); set_service_data(dev, dpc); - status = request_irq(dev->irq, dpc_irq, IRQF_SHARED, "pcie-dpc", dpc); + status = devm_request_irq(&dev->device, dev->irq, dpc_irq, IRQF_SHARED, + "pcie-dpc", dpc); if (status) { dev_warn(&dev->device, "request IRQ%d failed: %d\n", dev->irq, status); - goto out; + return status; } pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CAP, &cap); @@ -117,9 +118,6 @@ static int dpc_probe(struct pcie_device *dev) FLAG(cap, PCI_EXP_DPC_CAP_SW_TRIGGER), (cap >> 8) & 0xf, FLAG(cap, PCI_EXP_DPC_CAP_DL_ACTIVE)); return status; - out: - kfree(dpc); - return status; } static void dpc_remove(struct pcie_device *dev) @@ -131,14 +129,11 @@ static void dpc_remove(struct pcie_device *dev) pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl); ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN); pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl); - - free_irq(dev->irq, dpc); - kfree(dpc); } static struct pcie_port_service_driver dpcdriver = { .name = "dpc", - .port_type = PCI_EXP_TYPE_ROOT_PORT | PCI_EXP_TYPE_DOWNSTREAM, + .port_type = PCIE_ANY_PORT, .service = PCIE_PORT_SERVICE_DPC, .probe = dpc_probe, .remove = dpc_remove, diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 32d4d0a3d20e..e9270b4026f3 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -342,6 +343,8 @@ static int pcie_device_init(struct pci_dev *pdev, int service, int irq) return retval; } + pm_runtime_no_callbacks(device); + return 0; } diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index be35da2e105e..70d7ad8c6d17 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -93,6 +93,26 @@ static int pcie_port_resume_noirq(struct device *dev) return 0; } +static int pcie_port_runtime_suspend(struct device *dev) +{ + return to_pci_dev(dev)->bridge_d3 ? 0 : -EBUSY; +} + +static int pcie_port_runtime_resume(struct device *dev) +{ + return 0; +} + +static int pcie_port_runtime_idle(struct device *dev) +{ + /* + * Assume the PCI core has set bridge_d3 whenever it thinks the port + * should be good to go to D3. Everything else, including moving + * the port to D3, is handled by the PCI core. + */ + return to_pci_dev(dev)->bridge_d3 ? 0 : -EBUSY; +} + static const struct dev_pm_ops pcie_portdrv_pm_ops = { .suspend = pcie_port_device_suspend, .resume = pcie_port_device_resume, @@ -101,6 +121,9 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = { .poweroff = pcie_port_device_suspend, .restore = pcie_port_device_resume, .resume_noirq = pcie_port_resume_noirq, + .runtime_suspend = pcie_port_runtime_suspend, + .runtime_resume = pcie_port_runtime_resume, + .runtime_idle = pcie_port_runtime_idle, }; #define PCIE_PORTDRV_PM_OPS (&pcie_portdrv_pm_ops) @@ -134,16 +157,39 @@ static int pcie_portdrv_probe(struct pci_dev *dev, return status; pci_save_state(dev); + /* - * D3cold may not work properly on some PCIe port, so disable - * it by default. + * Prevent runtime PM if the port is advertising support for PCIe + * hotplug. Otherwise the BIOS hotplug SMI code might not be able + * to enumerate devices behind this port properly (the port is + * powered down preventing all config space accesses to the + * subordinate devices). We can't be sure for native PCIe hotplug + * either so prevent that as well. */ - dev->d3cold_allowed = false; + if (!dev->is_hotplug_bridge) { + /* + * Keep the port resumed 100ms to make sure things like + * config space accesses from userspace (lspci) will not + * cause the port to repeatedly suspend and resume. + */ + pm_runtime_set_autosuspend_delay(&dev->dev, 100); + pm_runtime_use_autosuspend(&dev->dev); + pm_runtime_mark_last_busy(&dev->dev); + pm_runtime_put_autosuspend(&dev->dev); + pm_runtime_allow(&dev->dev); + } + return 0; } static void pcie_portdrv_remove(struct pci_dev *dev) { + if (!dev->is_hotplug_bridge) { + pm_runtime_forbid(&dev->dev); + pm_runtime_get_noresume(&dev->dev); + pm_runtime_dont_use_autosuspend(&dev->dev); + } + pcie_port_device_remove(dev); } diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8e3ef720997d..93f280df3428 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "pci.h" #define CARDBUS_LATENCY_TIMER 176 /* secondary latency timer */ @@ -832,6 +833,12 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass) u8 primary, secondary, subordinate; int broken = 0; + /* + * Make sure the bridge is powered on to be able to access config + * space of devices below it. + */ + pm_runtime_get_sync(&dev->dev); + pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses); primary = buses & 0xFF; secondary = (buses >> 8) & 0xFF; @@ -1012,6 +1019,8 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass) out: pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl); + pm_runtime_put(&dev->dev); + return max; } EXPORT_SYMBOL(pci_scan_bridge); @@ -2076,6 +2085,15 @@ unsigned int pci_scan_child_bus(struct pci_bus *bus) max = pci_scan_bridge(bus, dev, max, pass); } + /* + * Make sure a hotplug bridge has at least the minimum requested + * number of buses. + */ + if (bus->self && bus->self->is_hotplug_bridge && pci_hotplug_bus_size) { + if (max - bus->busn_res.start < pci_hotplug_bus_size - 1) + max = bus->busn_res.start + pci_hotplug_bus_size - 1; + } + /* * We've scanned the bus and so we know all about what's on * the other side of any bridges that may be on this bus plus @@ -2127,7 +2145,9 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, b->sysdata = sysdata; b->ops = ops; b->number = b->busn_res.start = bus; - pci_bus_assign_domain_nr(b, parent); +#ifdef CONFIG_PCI_DOMAINS_GENERIC + b->domain_nr = pci_bus_find_domain_nr(b, parent); +#endif b2 = pci_find_bus(pci_domain_nr(b), bus); if (b2) { /* If we already got to this bus through a different bridge, ignore it */ diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index 3f155e78513f..2408abe4ee8c 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -231,7 +231,7 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma) { struct pci_dev *dev = PDE_DATA(file_inode(file)); struct pci_filp_private *fpriv = file->private_data; - int i, ret; + int i, ret, write_combine; if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -245,9 +245,12 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma) if (i >= PCI_ROM_RESOURCE) return -ENODEV; + if (fpriv->mmap_state == pci_mmap_mem) + write_combine = fpriv->write_combine; + else + write_combine = 0; ret = pci_mmap_page_range(dev, vma, - fpriv->mmap_state, - fpriv->write_combine); + fpriv->mmap_state, write_combine); if (ret < 0) return ret; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index ee72ebe18f4b..37ff0158e45f 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3189,13 +3189,15 @@ static void quirk_no_bus_reset(struct pci_dev *dev) } /* - * Atheros AR93xx chips do not behave after a bus reset. The device will - * throw a Link Down error on AER-capable systems and regardless of AER, - * config space of the device is never accessible again and typically - * causes the system to hang or reset when access is attempted. + * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. + * The device will throw a Link Down error on AER-capable systems and + * regardless of AER, config space of the device is never accessible again + * and typically causes the system to hang or reset when access is attempted. * http://www.spinics.net/lists/linux-pci/msg34797.html */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0030, quirk_no_bus_reset); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0032, quirk_no_bus_reset); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003c, quirk_no_bus_reset); static void quirk_no_pm_reset(struct pci_dev *dev) { @@ -3711,6 +3713,9 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x9172, /* https://bugzilla.kernel.org/show_bug.cgi?id=42679#c59 */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x917a, quirk_dma_func1_alias); +/* https://bugzilla.kernel.org/show_bug.cgi?id=42679#c78 */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x9182, + quirk_dma_func1_alias); /* https://bugzilla.kernel.org/show_bug.cgi?id=42679#c46 */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x91a0, quirk_dma_func1_alias); @@ -3747,6 +3752,9 @@ static const struct pci_device_id fixed_dma_alias_tbl[] = { { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x0285, PCI_VENDOR_ID_ADAPTEC2, 0x02bb), /* Adaptec 3405 */ .driver_data = PCI_DEVFN(1, 0) }, + { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x0285, + PCI_VENDOR_ID_ADAPTEC2, 0x02bc), /* Adaptec 3805 */ + .driver_data = PCI_DEVFN(1, 0) }, { 0 } }; @@ -4087,6 +4095,7 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_AMD, 0x7809, pci_quirk_amd_sb_acs }, { PCI_VENDOR_ID_SOLARFLARE, 0x0903, pci_quirk_mf_endpoint_acs }, { PCI_VENDOR_ID_SOLARFLARE, 0x0923, pci_quirk_mf_endpoint_acs }, + { PCI_VENDOR_ID_SOLARFLARE, 0x0A03, pci_quirk_mf_endpoint_acs }, { PCI_VENDOR_ID_INTEL, 0x10C6, pci_quirk_mf_endpoint_acs }, { PCI_VENDOR_ID_INTEL, 0x10DB, pci_quirk_mf_endpoint_acs }, { PCI_VENDOR_ID_INTEL, 0x10DD, pci_quirk_mf_endpoint_acs }, diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index 8982026637d5..d1ef7acf6930 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -96,6 +96,8 @@ static void pci_remove_bus_device(struct pci_dev *dev) dev->subordinate = NULL; } + pci_bridge_d3_device_removed(dev); + pci_destroy_dev(dev); } diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index d678c46e5f03..c74059e10a6d 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1428,6 +1428,74 @@ void pci_bus_assign_resources(const struct pci_bus *bus) } EXPORT_SYMBOL(pci_bus_assign_resources); +static void pci_claim_device_resources(struct pci_dev *dev) +{ + int i; + + for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) { + struct resource *r = &dev->resource[i]; + + if (!r->flags || r->parent) + continue; + + pci_claim_resource(dev, i); + } +} + +static void pci_claim_bridge_resources(struct pci_dev *dev) +{ + int i; + + for (i = PCI_BRIDGE_RESOURCES; i < PCI_NUM_RESOURCES; i++) { + struct resource *r = &dev->resource[i]; + + if (!r->flags || r->parent) + continue; + + pci_claim_bridge_resource(dev, i); + } +} + +static void pci_bus_allocate_dev_resources(struct pci_bus *b) +{ + struct pci_dev *dev; + struct pci_bus *child; + + list_for_each_entry(dev, &b->devices, bus_list) { + pci_claim_device_resources(dev); + + child = dev->subordinate; + if (child) + pci_bus_allocate_dev_resources(child); + } +} + +static void pci_bus_allocate_resources(struct pci_bus *b) +{ + struct pci_bus *child; + + /* + * Carry out a depth-first search on the PCI bus + * tree to allocate bridge apertures. Read the + * programmed bridge bases and recursively claim + * the respective bridge resources. + */ + if (b->self) { + pci_read_bridge_bases(b); + pci_claim_bridge_resources(b->self); + } + + list_for_each_entry(child, &b->children, node) + pci_bus_allocate_resources(child); +} + +void pci_bus_claim_resources(struct pci_bus *b) +{ + pci_bus_allocate_resources(b); + pci_bus_allocate_dev_resources(b); +} +EXPORT_SYMBOL(pci_bus_claim_resources); + static void __pci_bridge_assign_resources(const struct pci_dev *bridge, struct list_head *add_head, struct list_head *fail_head) diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index 0ac520dd1b21..c71df0c7dedc 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -46,7 +46,8 @@ struct read_info_sccb { u64 rnmax2; /* 104-111 */ u8 _pad_112[116 - 112]; /* 112-115 */ u8 fac116; /* 116 */ - u8 _pad_117[119 - 117]; /* 117-118 */ + u8 fac117; /* 117 */ + u8 _pad_118; /* 118 */ u8 fac119; /* 119 */ u16 hcpua; /* 120-121 */ u8 _pad_122[124 - 122]; /* 122-123 */ @@ -114,7 +115,12 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp.facilities = sccb->facilities; sclp.has_sprp = !!(sccb->fac84 & 0x02); sclp.has_core_type = !!(sccb->fac84 & 0x01); + sclp.has_gsls = !!(sccb->fac85 & 0x80); + sclp.has_64bscao = !!(sccb->fac116 & 0x80); + sclp.has_cmma = !!(sccb->fac116 & 0x40); sclp.has_esca = !!(sccb->fac116 & 0x08); + sclp.has_pfmfi = !!(sccb->fac117 & 0x40); + sclp.has_ibs = !!(sccb->fac117 & 0x20); sclp.has_hvs = !!(sccb->fac119 & 0x80); if (sccb->fac85 & 0x02) S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP; @@ -145,6 +151,10 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp.has_siif = cpue->siif; sclp.has_sigpif = cpue->sigpif; sclp.has_sief2 = cpue->sief2; + sclp.has_gpere = cpue->gpere; + sclp.has_ib = cpue->ib; + sclp.has_cei = cpue->cei; + sclp.has_skey = cpue->skey; break; } diff --git a/drivers/s390/char/sclp_ocf.c b/drivers/s390/char/sclp_ocf.c index 2553db0fdb52..f59b71776bbd 100644 --- a/drivers/s390/char/sclp_ocf.c +++ b/drivers/s390/char/sclp_ocf.c @@ -26,7 +26,7 @@ #define OCF_LENGTH_CPC_NAME 8UL static char hmc_network[OCF_LENGTH_HMC_NETWORK + 1]; -static char cpc_name[OCF_LENGTH_CPC_NAME + 1]; +static char cpc_name[OCF_LENGTH_CPC_NAME]; /* in EBCDIC */ static DEFINE_SPINLOCK(sclp_ocf_lock); static struct work_struct sclp_ocf_change_work; @@ -72,9 +72,8 @@ static void sclp_ocf_handler(struct evbuf_header *evbuf) } if (cpc) { size = min(OCF_LENGTH_CPC_NAME, (size_t) cpc->length); + memset(cpc_name, 0, OCF_LENGTH_CPC_NAME); memcpy(cpc_name, cpc + 1, size); - EBCASC(cpc_name, size); - cpc_name[size] = 0; } spin_unlock(&sclp_ocf_lock); schedule_work(&sclp_ocf_change_work); @@ -85,15 +84,23 @@ static struct sclp_register sclp_ocf_event = { .receiver_fn = sclp_ocf_handler, }; +void sclp_ocf_cpc_name_copy(char *dst) +{ + spin_lock_irq(&sclp_ocf_lock); + memcpy(dst, cpc_name, OCF_LENGTH_CPC_NAME); + spin_unlock_irq(&sclp_ocf_lock); +} +EXPORT_SYMBOL(sclp_ocf_cpc_name_copy); + static ssize_t cpc_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - int rc; + char name[OCF_LENGTH_CPC_NAME + 1]; - spin_lock_irq(&sclp_ocf_lock); - rc = snprintf(page, PAGE_SIZE, "%s\n", cpc_name); - spin_unlock_irq(&sclp_ocf_lock); - return rc; + sclp_ocf_cpc_name_copy(name); + name[OCF_LENGTH_CPC_NAME] = 0; + EBCASC(name, OCF_LENGTH_CPC_NAME); + return snprintf(page, PAGE_SIZE, "%s\n", name); } static struct kobj_attribute cpc_name_attr = diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index adf61b43eb70..734a0428ef0e 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -4854,20 +4854,17 @@ static int lpfc_enable_pci_dev(struct lpfc_hba *phba) { struct pci_dev *pdev; - int bars = 0; /* Obtain PCI device reference */ if (!phba->pcidev) goto out_error; else pdev = phba->pcidev; - /* Select PCI BARs */ - bars = pci_select_bars(pdev, IORESOURCE_MEM); /* Enable PCI device */ if (pci_enable_device_mem(pdev)) goto out_error; /* Request PCI resource for the device */ - if (pci_request_selected_regions(pdev, bars, LPFC_DRIVER_NAME)) + if (pci_request_mem_regions(pdev, LPFC_DRIVER_NAME)) goto out_disable_device; /* Set up device as PCI master and save state for EEH */ pci_set_master(pdev); @@ -4884,7 +4881,7 @@ lpfc_enable_pci_dev(struct lpfc_hba *phba) pci_disable_device(pdev); out_error: lpfc_printf_log(phba, KERN_ERR, LOG_INIT, - "1401 Failed to enable pci device, bars:x%x\n", bars); + "1401 Failed to enable pci device\n"); return -ENODEV; } @@ -4899,17 +4896,14 @@ static void lpfc_disable_pci_dev(struct lpfc_hba *phba) { struct pci_dev *pdev; - int bars; /* Obtain PCI device reference */ if (!phba->pcidev) return; else pdev = phba->pcidev; - /* Select PCI BARs */ - bars = pci_select_bars(pdev, IORESOURCE_MEM); /* Release PCI resource and disable PCI device */ - pci_release_selected_regions(pdev, bars); + pci_release_mem_regions(pdev); pci_disable_device(pdev); return; @@ -9811,7 +9805,6 @@ lpfc_pci_remove_one_s3(struct pci_dev *pdev) struct lpfc_vport **vports; struct lpfc_hba *phba = vport->phba; int i; - int bars = pci_select_bars(pdev, IORESOURCE_MEM); spin_lock_irq(&phba->hbalock); vport->load_flag |= FC_UNLOADING; @@ -9886,7 +9879,7 @@ lpfc_pci_remove_one_s3(struct pci_dev *pdev) lpfc_hba_free(phba); - pci_release_selected_regions(pdev, bars); + pci_release_mem_regions(pdev); pci_disable_device(pdev); } diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index c10972fcc8e4..4fd041bec332 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -387,7 +387,7 @@ static int xhci_pci_suspend(struct usb_hcd *hcd, bool do_wakeup) * need to have the registers polled during D3, so avoid D3cold. */ if (xhci->quirks & XHCI_COMP_MODE_QUIRK) - pdev->no_d3cold = true; + pci_d3cold_disable(pdev); if (xhci->quirks & XHCI_PME_STUCK_QUIRK) xhci_pme_quirk(hcd); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 26a9d10d75e9..d5b6f959a3c3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1730,7 +1730,8 @@ enum { POOL_WRITE = 2, }; -static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) +static int __ceph_pool_perm_get(struct ceph_inode_info *ci, + s64 pool, struct ceph_string *pool_ns) { struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -1738,6 +1739,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) struct rb_node **p, *parent; struct ceph_pool_perm *perm; struct page **pages; + size_t pool_ns_len; int err = 0, err2 = 0, have = 0; down_read(&mdsc->pool_perm_rwsem); @@ -1749,17 +1751,31 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) else if (pool > perm->pool) p = &(*p)->rb_right; else { - have = perm->perm; - break; + int ret = ceph_compare_string(pool_ns, + perm->pool_ns, + perm->pool_ns_len); + if (ret < 0) + p = &(*p)->rb_left; + else if (ret > 0) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } } } up_read(&mdsc->pool_perm_rwsem); if (*p) goto out; - dout("__ceph_pool_perm_get pool %u no perm cached\n", pool); + if (pool_ns) + dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n", + pool, (int)pool_ns->len, pool_ns->str); + else + dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool); down_write(&mdsc->pool_perm_rwsem); + p = &mdsc->pool_perm_tree.rb_node; parent = NULL; while (*p) { parent = *p; @@ -1769,8 +1785,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) else if (pool > perm->pool) p = &(*p)->rb_right; else { - have = perm->perm; - break; + int ret = ceph_compare_string(pool_ns, + perm->pool_ns, + perm->pool_ns_len); + if (ret < 0) + p = &(*p)->rb_left; + else if (ret > 0) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } } } if (*p) { @@ -1788,6 +1813,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) rd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); rd_req->r_base_oloc.pool = pool; + if (pool_ns) + rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns); ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); @@ -1841,7 +1868,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) goto out_unlock; } - perm = kmalloc(sizeof(*perm), GFP_NOFS); + pool_ns_len = pool_ns ? pool_ns->len : 0; + perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS); if (!perm) { err = -ENOMEM; goto out_unlock; @@ -1849,6 +1877,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) perm->pool = pool; perm->perm = have; + perm->pool_ns_len = pool_ns_len; + if (pool_ns_len > 0) + memcpy(perm->pool_ns, pool_ns->str, pool_ns_len); + perm->pool_ns[pool_ns_len] = 0; + rb_link_node(&perm->node, parent, p); rb_insert_color(&perm->node, &mdsc->pool_perm_tree); err = 0; @@ -1860,43 +1893,46 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) out: if (!err) err = have; - dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err); + if (pool_ns) + dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n", + pool, (int)pool_ns->len, pool_ns->str, err); + else + dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err); return err; } int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) { - u32 pool; + s64 pool; + struct ceph_string *pool_ns; int ret, flags; - /* does not support pool namespace yet */ - if (ci->i_pool_ns_len) - return -EIO; - if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), NOPOOLPERM)) return 0; spin_lock(&ci->i_ceph_lock); flags = ci->i_ceph_flags; - pool = ceph_file_layout_pg_pool(ci->i_layout); + pool = ci->i_layout.pool_id; spin_unlock(&ci->i_ceph_lock); check: if (flags & CEPH_I_POOL_PERM) { if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { - dout("ceph_pool_perm_check pool %u no read perm\n", + dout("ceph_pool_perm_check pool %lld no read perm\n", pool); return -EPERM; } if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { - dout("ceph_pool_perm_check pool %u no write perm\n", + dout("ceph_pool_perm_check pool %lld no write perm\n", pool); return -EPERM; } return 0; } - ret = __ceph_pool_perm_get(ci, pool); + pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); + ret = __ceph_pool_perm_get(ci, pool, pool_ns); + ceph_put_string(pool_ns); if (ret < 0) return ret; @@ -1907,10 +1943,11 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) flags |= CEPH_I_POOL_WR; spin_lock(&ci->i_ceph_lock); - if (pool == ceph_file_layout_pg_pool(ci->i_layout)) { - ci->i_ceph_flags = flags; + if (pool == ci->i_layout.pool_id && + pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) { + ci->i_ceph_flags |= flags; } else { - pool = ceph_file_layout_pg_pool(ci->i_layout); + pool = ci->i_layout.pool_id; flags = ci->i_ceph_flags; } spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 238c55b01723..5bc5d37b1217 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -71,7 +71,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) &ceph_fscache_fsid_object_def, fsc, true); if (!fsc->fscache) - pr_err("Unable to resgister fsid: %p fscache cookie", fsc); + pr_err("Unable to register fsid: %p fscache cookie\n", fsc); return 0; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6f60d0a3d0f9..99115cae1652 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -40,6 +40,11 @@ * cluster to release server state. */ +static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc); +static void __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci, + u64 oldest_flush_tid); /* * Generate readable cap strings for debugging output. @@ -849,12 +854,14 @@ int __ceph_caps_used(struct ceph_inode_info *ci) */ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { - int want = 0; - int mode; - for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) - if (ci->i_nr_by_mode[mode]) - want |= ceph_caps_for_mode(mode); - return want; + int i, bits = 0; + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + if (ci->i_nr_by_mode[i]) + bits |= 1 << i; + } + if (bits == 0) + return 0; + return ceph_caps_for_mode(bits >> 1); } /* @@ -991,7 +998,7 @@ static int send_cap_msg(struct ceph_mds_session *session, u32 seq, u64 flush_tid, u64 oldest_flush_tid, u32 issue_seq, u32 mseq, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, - struct timespec *ctime, u64 time_warp_seq, + struct timespec *ctime, u32 time_warp_seq, kuid_t uid, kgid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, @@ -1116,8 +1123,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, struct inode *inode = &ci->vfs_inode; u64 cap_id = cap->cap_id; int held, revoking, dropping, keep; - u64 seq, issue_seq, mseq, time_warp_seq, follows; - u64 size, max_size; + u64 follows, size, max_size; + u32 seq, issue_seq, mseq, time_warp_seq; struct timespec mtime, atime, ctime; int wake = 0; umode_t mode; @@ -1215,6 +1222,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, return delayed; } +static inline int __send_flush_snap(struct inode *inode, + struct ceph_mds_session *session, + struct ceph_cap_snap *capsnap, + u32 mseq, u64 oldest_flush_tid) +{ + return send_cap_msg(session, ceph_vino(inode).ino, 0, + CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, + capsnap->dirty, 0, capsnap->cap_flush.tid, + oldest_flush_tid, 0, mseq, capsnap->size, 0, + &capsnap->mtime, &capsnap->atime, + &capsnap->ctime, capsnap->time_warp_seq, + capsnap->uid, capsnap->gid, capsnap->mode, + capsnap->xattr_version, capsnap->xattr_blob, + capsnap->follows, capsnap->inline_data); +} + /* * When a snapshot is taken, clients accumulate dirty metadata on * inodes with capabilities in ceph_cap_snaps to describe the file @@ -1222,37 +1245,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * - * Unless @kick is true, skip cap_snaps that were already sent to - * the MDS (i.e., during this session). - * * Called under i_ceph_lock. Takes s_mutex as needed. */ -void __ceph_flush_snaps(struct ceph_inode_info *ci, - struct ceph_mds_session **psession, - int kick) +static void __ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session *session) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->vfs_inode; - int mds; + struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_cap_snap *capsnap; - u32 mseq; - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - struct ceph_mds_session *session = NULL; /* if session != NULL, we hold - session->s_mutex */ - u64 next_follows = 0; /* keep track of how far we've gotten through the - i_cap_snaps list, and skip these entries next time - around to avoid an infinite loop */ + u64 oldest_flush_tid = 0; + u64 first_tid = 1, last_tid = 0; - if (psession) - session = *psession; + dout("__flush_snaps %p session %p\n", inode, session); - dout("__flush_snaps %p\n", inode); -retry: list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - /* avoid an infiniute loop after retry */ - if (capsnap->follows < next_follows) - continue; /* * we need to wait for sync writes to complete and for dirty * pages to be written out. @@ -1263,97 +1271,129 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci, /* should be removed by ceph_try_drop_cap_snap() */ BUG_ON(!capsnap->need_flush); - /* pick mds, take s_mutex */ - if (ci->i_auth_cap == NULL) { - dout("no auth cap (migrating?), doing nothing\n"); - goto out; - } - /* only flush each capsnap once */ - if (!kick && !list_empty(&capsnap->flushing_item)) { - dout("already flushed %p, skipping\n", capsnap); + if (capsnap->cap_flush.tid > 0) { + dout(" already flushed %p, skipping\n", capsnap); continue; } - mds = ci->i_auth_cap->session->s_mds; - mseq = ci->i_auth_cap->mseq; - - if (session && session->s_mds != mds) { - dout("oops, wrong session %p mutex\n", session); - if (kick) - goto out; - - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - session = NULL; - } - if (!session) { - spin_unlock(&ci->i_ceph_lock); - mutex_lock(&mdsc->mutex); - session = __ceph_lookup_mds_session(mdsc, mds); - mutex_unlock(&mdsc->mutex); - if (session) { - dout("inverting session/ino locks on %p\n", - session); - mutex_lock(&session->s_mutex); - } - /* - * if session == NULL, we raced against a cap - * deletion or migration. retry, and we'll - * get a better @mds value next time. - */ - spin_lock(&ci->i_ceph_lock); - goto retry; - } - spin_lock(&mdsc->cap_dirty_lock); - capsnap->flush_tid = ++mdsc->last_cap_flush_tid; + capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid; + list_add_tail(&capsnap->cap_flush.g_list, + &mdsc->cap_flush_list); + if (oldest_flush_tid == 0) + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + if (list_empty(&ci->i_flushing_item)) { + list_add_tail(&ci->i_flushing_item, + &session->s_cap_flushing); + } spin_unlock(&mdsc->cap_dirty_lock); + list_add_tail(&capsnap->cap_flush.i_list, + &ci->i_cap_flush_list); + + if (first_tid == 1) + first_tid = capsnap->cap_flush.tid; + last_tid = capsnap->cap_flush.tid; + } + + ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS; + + while (first_tid <= last_tid) { + struct ceph_cap *cap = ci->i_auth_cap; + struct ceph_cap_flush *cf; + int ret; + + if (!(cap && cap->session == session)) { + dout("__flush_snaps %p auth cap %p not mds%d, " + "stop\n", inode, cap, session->s_mds); + break; + } + + ret = -ENOENT; + list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { + if (cf->tid >= first_tid) { + ret = 0; + break; + } + } + if (ret < 0) + break; + + first_tid = cf->tid + 1; + + capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); atomic_inc(&capsnap->nref); - if (list_empty(&capsnap->flushing_item)) - list_add_tail(&capsnap->flushing_item, - &session->s_cap_snaps_flushing); spin_unlock(&ci->i_ceph_lock); - dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", - inode, capsnap, capsnap->follows, capsnap->flush_tid); - send_cap_msg(session, ceph_vino(inode).ino, 0, - CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, - capsnap->dirty, 0, capsnap->flush_tid, 0, - 0, mseq, capsnap->size, 0, - &capsnap->mtime, &capsnap->atime, - &capsnap->ctime, capsnap->time_warp_seq, - capsnap->uid, capsnap->gid, capsnap->mode, - capsnap->xattr_version, capsnap->xattr_blob, - capsnap->follows, capsnap->inline_data); + dout("__flush_snaps %p capsnap %p tid %llu %s\n", + inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); + + ret = __send_flush_snap(inode, session, capsnap, cap->mseq, + oldest_flush_tid); + if (ret < 0) { + pr_err("__flush_snaps: error sending cap flushsnap, " + "ino (%llx.%llx) tid %llu follows %llu\n", + ceph_vinop(inode), cf->tid, capsnap->follows); + } - next_follows = capsnap->follows + 1; ceph_put_cap_snap(capsnap); - spin_lock(&ci->i_ceph_lock); + } +} + +void ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *session = *psession; + int mds; + dout("ceph_flush_snaps %p\n", inode); +retry: + spin_lock(&ci->i_ceph_lock); + if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { + dout(" no capsnap needs flush, doing nothing\n"); + goto out; + } + if (!ci->i_auth_cap) { + dout(" no auth cap (migrating?), doing nothing\n"); + goto out; + } + + mds = ci->i_auth_cap->session->s_mds; + if (session && session->s_mds != mds) { + dout(" oops, wrong session %p mutex\n", session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + session = NULL; + } + if (!session) { + spin_unlock(&ci->i_ceph_lock); + mutex_lock(&mdsc->mutex); + session = __ceph_lookup_mds_session(mdsc, mds); + mutex_unlock(&mdsc->mutex); + if (session) { + dout(" inverting session/ino locks on %p\n", session); + mutex_lock(&session->s_mutex); + } goto retry; } + __ceph_flush_snaps(ci, session); +out: + spin_unlock(&ci->i_ceph_lock); + + if (psession) { + *psession = session; + } else { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } /* we flushed them all; remove this inode from the queue */ spin_lock(&mdsc->snap_flush_lock); list_del_init(&ci->i_snap_flush_item); spin_unlock(&mdsc->snap_flush_lock); - -out: - if (psession) - *psession = session; - else if (session) { - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - } -} - -static void ceph_flush_snaps(struct ceph_inode_info *ci) -{ - spin_lock(&ci->i_ceph_lock); - __ceph_flush_snaps(ci, NULL, 0); - spin_unlock(&ci->i_ceph_lock); } /* @@ -1411,52 +1451,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, return dirty; } -static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci, - struct ceph_cap_flush *cf) -{ - struct rb_node **p = &ci->i_cap_flush_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_cap_flush *other = NULL; - - while (*p) { - parent = *p; - other = rb_entry(parent, struct ceph_cap_flush, i_node); - - if (cf->tid < other->tid) - p = &(*p)->rb_left; - else if (cf->tid > other->tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&cf->i_node, parent, p); - rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); -} - -static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc, - struct ceph_cap_flush *cf) -{ - struct rb_node **p = &mdsc->cap_flush_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_cap_flush *other = NULL; - - while (*p) { - parent = *p; - other = rb_entry(parent, struct ceph_cap_flush, g_node); - - if (cf->tid < other->tid) - p = &(*p)->rb_left; - else if (cf->tid > other->tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&cf->g_node, parent, p); - rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree); -} - struct ceph_cap_flush *ceph_alloc_cap_flush(void) { return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); @@ -1470,15 +1464,46 @@ void ceph_free_cap_flush(struct ceph_cap_flush *cf) static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) { - struct rb_node *n = rb_first(&mdsc->cap_flush_tree); - if (n) { + if (!list_empty(&mdsc->cap_flush_list)) { struct ceph_cap_flush *cf = - rb_entry(n, struct ceph_cap_flush, g_node); + list_first_entry(&mdsc->cap_flush_list, + struct ceph_cap_flush, g_list); return cf->tid; } return 0; } +/* + * Remove cap_flush from the mdsc's or inode's flushing cap list. + * Return true if caller needs to wake up flush waiters. + */ +static bool __finish_cap_flush(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci, + struct ceph_cap_flush *cf) +{ + struct ceph_cap_flush *prev; + bool wake = cf->wake; + if (mdsc) { + /* are there older pending cap flushes? */ + if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { + prev = list_prev_entry(cf, g_list); + prev->wake = true; + wake = false; + } + list_del(&cf->g_list); + } else if (ci) { + if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { + prev = list_prev_entry(cf, i_list); + prev->wake = true; + wake = false; + } + list_del(&cf->i_list); + } else { + BUG_ON(1); + } + return wake; +} + /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. @@ -1486,7 +1511,7 @@ static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) * Called under i_ceph_lock. */ static int __mark_caps_flushing(struct inode *inode, - struct ceph_mds_session *session, + struct ceph_mds_session *session, bool wake, u64 *flush_tid, u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; @@ -1509,26 +1534,22 @@ static int __mark_caps_flushing(struct inode *inode, swap(cf, ci->i_prealloc_cap_flush); cf->caps = flushing; + cf->wake = wake; spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); cf->tid = ++mdsc->last_cap_flush_tid; - __add_cap_flushing_to_mdsc(mdsc, cf); + list_add_tail(&cf->g_list, &mdsc->cap_flush_list); *oldest_flush_tid = __get_oldest_flush_tid(mdsc); if (list_empty(&ci->i_flushing_item)) { list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; - dout(" inode %p now flushing tid %llu\n", inode, cf->tid); - } else { - list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); - dout(" inode %p now flushing (more) tid %llu\n", - inode, cf->tid); } spin_unlock(&mdsc->cap_dirty_lock); - __add_cap_flushing_to_inode(ci, cf); + list_add_tail(&cf->i_list, &ci->i_cap_flush_list); *flush_tid = cf->tid; return flushing; @@ -1583,10 +1604,11 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ struct rb_node *p; - int tried_invalidate = 0; - int delayed = 0, sent = 0, force_requeue = 0, num; - int queue_invalidate = 0; - int is_delayed = flags & CHECK_CAPS_NODELAY; + int delayed = 0, sent = 0, num; + bool is_delayed = flags & CHECK_CAPS_NODELAY; + bool queue_invalidate = false; + bool force_requeue = false; + bool tried_invalidate = false; /* if we are unmounting, flush any unused caps immediately. */ if (mdsc->stopping) @@ -1597,9 +1619,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; - /* flush snaps first time around only */ - if (!list_empty(&ci->i_cap_snaps)) - __ceph_flush_snaps(ci, &session, 0); goto retry_locked; retry: spin_lock(&ci->i_ceph_lock); @@ -1666,17 +1685,17 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, if (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) { dout("check_caps queuing invalidate\n"); - queue_invalidate = 1; + queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } else { dout("check_caps failed to invalidate pages\n"); /* we failed to invalidate pages. check these caps again later. */ - force_requeue = 1; + force_requeue = true; __cap_set_timeouts(mdsc, ci); } } - tried_invalidate = 1; + tried_invalidate = true; goto retry_locked; } @@ -1720,10 +1739,15 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, } } /* flush anything dirty? */ - if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) && - ci->i_dirty_caps) { - dout("flushing dirty caps\n"); - goto ack; + if (cap == ci->i_auth_cap) { + if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) { + dout("flushing dirty caps\n"); + goto ack; + } + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) { + dout("flushing snap caps\n"); + goto ack; + } } /* completed revocation? going down and there are no caps? */ @@ -1782,6 +1806,26 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, goto retry; } } + + /* kick flushing and flush snaps before sending normal + * cap message */ + if (cap == ci->i_auth_cap && + (ci->i_ceph_flags & + (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + __kick_flushing_caps(mdsc, session, ci, + oldest_flush_tid); + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + } + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) + __ceph_flush_snaps(ci, session); + + goto retry_locked; + } + /* take snap_rwsem after session mutex */ if (!took_snap_rwsem) { if (down_read_trylock(&mdsc->snap_rwsem) == 0) { @@ -1796,7 +1840,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { - flushing = __mark_caps_flushing(inode, session, + flushing = __mark_caps_flushing(inode, session, false, &flush_tid, &oldest_flush_tid); } else { @@ -1822,7 +1866,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, * otherwise cancel. */ if (delayed && is_delayed) - force_requeue = 1; /* __send_cap delayed release; requeue */ + force_requeue = true; /* __send_cap delayed release; requeue */ if (!delayed && !is_delayed) __cap_delay_cancel(mdsc, ci); else if (!is_delayed || force_requeue) @@ -1873,8 +1917,8 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; - flushing = __mark_caps_flushing(inode, session, &flush_tid, - &oldest_flush_tid); + flushing = __mark_caps_flushing(inode, session, true, + &flush_tid, &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, @@ -1887,10 +1931,11 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) spin_unlock(&ci->i_ceph_lock); } } else { - struct rb_node *n = rb_last(&ci->i_cap_flush_tree); - if (n) { + if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush *cf = - rb_entry(n, struct ceph_cap_flush, i_node); + list_last_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); + cf->wake = true; flush_tid = cf->tid; } flushing = ci->i_flushing_caps; @@ -1910,14 +1955,13 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) static int caps_are_flushed(struct inode *inode, u64 flush_tid) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap_flush *cf; - struct rb_node *n; int ret = 1; spin_lock(&ci->i_ceph_lock); - n = rb_first(&ci->i_cap_flush_tree); - if (n) { - cf = rb_entry(n, struct ceph_cap_flush, i_node); + if (!list_empty(&ci->i_cap_flush_list)) { + struct ceph_cap_flush * cf = + list_first_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); if (cf->tid <= flush_tid) ret = 0; } @@ -1925,53 +1969,6 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid) return ret; } -/* - * Wait on any unsafe replies for the given inode. First wait on the - * newest request, and make that the upper bound. Then, if there are - * more requests, keep waiting on the oldest as long as it is still older - * than the original request. - */ -static void sync_write_wait(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - struct list_head *head = &ci->i_unsafe_writes; - struct ceph_osd_request *req; - u64 last_tid; - - if (!S_ISREG(inode->i_mode)) - return; - - spin_lock(&ci->i_unsafe_lock); - if (list_empty(head)) - goto out; - - /* set upper bound as _last_ entry in chain */ - req = list_last_entry(head, struct ceph_osd_request, - r_unsafe_item); - last_tid = req->r_tid; - - do { - ceph_osdc_get_request(req); - spin_unlock(&ci->i_unsafe_lock); - dout("sync_write_wait on tid %llu (until %llu)\n", - req->r_tid, last_tid); - wait_for_completion(&req->r_safe_completion); - spin_lock(&ci->i_unsafe_lock); - ceph_osdc_put_request(req); - - /* - * from here on look at first entry in chain, since we - * only want to wait for anything older than last_tid - */ - if (list_empty(head)) - break; - req = list_first_entry(head, struct ceph_osd_request, - r_unsafe_item); - } while (req->r_tid < last_tid); -out: - spin_unlock(&ci->i_unsafe_lock); -} - /* * wait for any unsafe requests to complete. */ @@ -2024,7 +2021,8 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) int dirty; dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); - sync_write_wait(inode); + + ceph_sync_write_wait(inode); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) @@ -2087,87 +2085,74 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } -/* - * After a recovering MDS goes active, we need to resend any caps - * we were flushing. - * - * Caller holds session->s_mutex. - */ -static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) -{ - struct ceph_cap_snap *capsnap; - - dout("kick_flushing_capsnaps mds%d\n", session->s_mds); - list_for_each_entry(capsnap, &session->s_cap_snaps_flushing, - flushing_item) { - struct ceph_inode_info *ci = capsnap->ci; - struct inode *inode = &ci->vfs_inode; - struct ceph_cap *cap; - - spin_lock(&ci->i_ceph_lock); - cap = ci->i_auth_cap; - if (cap && cap->session == session) { - dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, - cap, capsnap); - __ceph_flush_snaps(ci, &session, 1); - } else { - pr_err("%p auth cap %p not mds%d ???\n", inode, - cap, session->s_mds); - } - spin_unlock(&ci->i_ceph_lock); - } -} - -static int __kick_flushing_caps(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - struct ceph_inode_info *ci) +static void __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci, + u64 oldest_flush_tid) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; struct ceph_cap_flush *cf; - struct rb_node *n; - int delayed = 0; + int ret; u64 first_tid = 0; - u64 oldest_flush_tid; - spin_lock(&mdsc->cap_dirty_lock); - oldest_flush_tid = __get_oldest_flush_tid(mdsc); - spin_unlock(&mdsc->cap_dirty_lock); + list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { + if (cf->tid < first_tid) + continue; - while (true) { - spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { - pr_err("%p auth cap %p not mds%d ???\n", inode, - cap, session->s_mds); - spin_unlock(&ci->i_ceph_lock); + pr_err("%p auth cap %p not mds%d ???\n", + inode, cap, session->s_mds); break; } - for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { - cf = rb_entry(n, struct ceph_cap_flush, i_node); - if (cf->tid >= first_tid) - break; - } - if (!n) { - spin_unlock(&ci->i_ceph_lock); - break; - } - - cf = rb_entry(n, struct ceph_cap_flush, i_node); - first_tid = cf->tid + 1; - dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, - cap, cf->tid, ceph_cap_string(cf->caps)); - delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - cap->issued | cap->implemented, - cf->caps, cf->tid, oldest_flush_tid); + if (cf->caps) { + dout("kick_flushing_caps %p cap %p tid %llu %s\n", + inode, cap, cf->tid, ceph_cap_string(cf->caps)); + ci->i_ceph_flags |= CEPH_I_NODELAY; + ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + cf->caps, cf->tid, oldest_flush_tid); + if (ret) { + pr_err("kick_flushing_caps: error sending " + "cap flush, ino (%llx.%llx) " + "tid %llu flushing %s\n", + ceph_vinop(inode), cf->tid, + ceph_cap_string(cf->caps)); + } + } else { + struct ceph_cap_snap *capsnap = + container_of(cf, struct ceph_cap_snap, + cap_flush); + dout("kick_flushing_caps %p capsnap %p tid %llu %s\n", + inode, capsnap, cf->tid, + ceph_cap_string(capsnap->dirty)); + + atomic_inc(&capsnap->nref); + spin_unlock(&ci->i_ceph_lock); + + ret = __send_flush_snap(inode, session, capsnap, cap->mseq, + oldest_flush_tid); + if (ret < 0) { + pr_err("kick_flushing_caps: error sending " + "cap flushsnap, ino (%llx.%llx) " + "tid %llu follows %llu\n", + ceph_vinop(inode), cf->tid, + capsnap->follows); + } + + ceph_put_cap_snap(capsnap); + } + + spin_lock(&ci->i_ceph_lock); } - return delayed; } void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, @@ -2175,8 +2160,14 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, { struct ceph_inode_info *ci; struct ceph_cap *cap; + u64 oldest_flush_tid; dout("early_kick_flushing_caps mds%d\n", session->s_mds); + + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; @@ -2196,10 +2187,11 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, */ if ((cap->issued & ci->i_flushing_caps) != ci->i_flushing_caps) { - spin_unlock(&ci->i_ceph_lock); - if (!__kick_flushing_caps(mdsc, session, ci)) - continue; - spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + __kick_flushing_caps(mdsc, session, ci, + oldest_flush_tid); + } else { + ci->i_ceph_flags |= CEPH_I_KICK_FLUSH; } spin_unlock(&ci->i_ceph_lock); @@ -2210,50 +2202,56 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_inode_info *ci; - - kick_flushing_capsnaps(mdsc, session); + struct ceph_cap *cap; + u64 oldest_flush_tid; dout("kick_flushing_caps mds%d\n", session->s_mds); + + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { - int delayed = __kick_flushing_caps(mdsc, session, ci); - if (delayed) { - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", + &ci->vfs_inode, cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); + continue; } + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + __kick_flushing_caps(mdsc, session, ci, + oldest_flush_tid); + } + spin_unlock(&ci->i_ceph_lock); } } static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct inode *inode) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; - spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; dout("kick_flushing_inode_caps %p flushing %s\n", inode, ceph_cap_string(ci->i_flushing_caps)); - __ceph_flush_snaps(ci, &session, 1); - - if (ci->i_flushing_caps) { - int delayed; - + if (!list_empty(&ci->i_cap_flush_list)) { + u64 oldest_flush_tid; spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &cap->session->s_cap_flushing); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); - - delayed = __kick_flushing_caps(mdsc, session, ci); - if (delayed) { - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); - spin_unlock(&ci->i_ceph_lock); - } } else { spin_unlock(&ci->i_ceph_lock); } @@ -2580,16 +2578,19 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) * drop cap_snap that is not associated with any snapshot. * we don't need to send FLUSHSNAP message for it. */ -static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap) +static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap) { if (!capsnap->need_flush && !capsnap->writing && !capsnap->dirty_pages) { - dout("dropping cap_snap %p follows %llu\n", capsnap, capsnap->follows); + BUG_ON(capsnap->cap_flush.tid > 0); ceph_put_snap_context(capsnap->context); + if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps)) + ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; + list_del(&capsnap->ci_item); - list_del(&capsnap->flushing_item); ceph_put_cap_snap(capsnap); return 1; } @@ -2636,7 +2637,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) struct ceph_cap_snap, ci_item); capsnap->writing = 0; - if (ceph_try_drop_cap_snap(capsnap)) + if (ceph_try_drop_cap_snap(ci, capsnap)) put++; else if (__ceph_finish_cap_snap(ci, capsnap)) flushsnaps = 1; @@ -2661,7 +2662,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) if (last && !flushsnaps) ceph_check_caps(ci, 0, NULL); else if (flushsnaps) - ceph_flush_snaps(ci); + ceph_flush_snaps(ci, NULL); if (wake) wake_up_all(&ci->i_cap_wq); while (put-- > 0) @@ -2679,15 +2680,19 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { struct inode *inode = &ci->vfs_inode; - int last = 0; - int complete_capsnap = 0; - int drop_capsnap = 0; - int found = 0; struct ceph_cap_snap *capsnap = NULL; + int put = 0; + bool last = false; + bool found = false; + bool flush_snaps = false; + bool complete_capsnap = false; spin_lock(&ci->i_ceph_lock); ci->i_wrbuffer_ref -= nr; - last = !ci->i_wrbuffer_ref; + if (ci->i_wrbuffer_ref == 0) { + last = true; + put++; + } if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; @@ -2707,15 +2712,22 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, } else { list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->context == snapc) { - found = 1; + found = true; break; } } BUG_ON(!found); capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { - complete_capsnap = 1; - drop_capsnap = ceph_try_drop_cap_snap(capsnap); + complete_capsnap = true; + if (!capsnap->writing) { + if (ceph_try_drop_cap_snap(ci, capsnap)) { + put++; + } else { + ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; + flush_snaps = true; + } + } } dout("put_wrbuffer_cap_refs on %p cap_snap %p " " snap %lld %d/%d -> %d/%d %s%s\n", @@ -2730,12 +2742,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (last) { ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); - iput(inode); - } else if (complete_capsnap) { - ceph_flush_snaps(ci); - wake_up_all(&ci->i_cap_wq); + } else if (flush_snaps) { + ceph_flush_snaps(ci, NULL); } - if (drop_capsnap) + if (complete_capsnap) + wake_up_all(&ci->i_cap_wq); + while (put-- > 0) iput(inode); } @@ -2779,12 +2791,11 @@ static void invalidate_aliases(struct inode *inode) */ static void handle_cap_grant(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *grant, - u64 inline_version, - void *inline_data, int inline_len, + struct ceph_string **pns, u64 inline_version, + void *inline_data, u32 inline_len, struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, - struct ceph_cap *cap, int issued, - u32 pool_ns_len) + struct ceph_cap *cap, int issued) __releases(ci->i_ceph_lock) __releases(mdsc->snap_rwsem) { @@ -2895,8 +2906,18 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { /* file layout may have changed */ - ci->i_layout = grant->layout; - ci->i_pool_ns_len = pool_ns_len; + s64 old_pool = ci->i_layout.pool_id; + struct ceph_string *old_ns; + + ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout); + old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, + lockdep_is_held(&ci->i_ceph_lock)); + rcu_assign_pointer(ci->i_layout.pool_ns, *pns); + + if (ci->i_layout.pool_id != old_pool || *pns != old_ns) + ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; + + *pns = old_ns; /* size/truncate_seq? */ queue_trunc = ceph_fill_file_size(inode, issued, @@ -2979,13 +3000,13 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, fill_inline = true; } - spin_unlock(&ci->i_ceph_lock); - if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { - kick_flushing_inode_caps(mdsc, session, inode); - up_read(&mdsc->snap_rwsem); if (newcaps & ~issued) wake = true; + kick_flushing_inode_caps(mdsc, session, inode); + up_read(&mdsc->snap_rwsem); + } else { + spin_unlock(&ci->i_ceph_lock); } if (fill_inline) @@ -3029,23 +3050,24 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; - struct ceph_cap_flush *cf; - struct rb_node *n; + struct ceph_cap_flush *cf, *tmp_cf; LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; - int drop = 0; + bool drop = false; + bool wake_ci = 0; + bool wake_mdsc = 0; - n = rb_first(&ci->i_cap_flush_tree); - while (n) { - cf = rb_entry(n, struct ceph_cap_flush, i_node); - n = rb_next(&cf->i_node); + list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { if (cf->tid == flush_tid) cleaned = cf->caps; + if (cf->caps == 0) /* capsnap */ + continue; if (cf->tid <= flush_tid) { - rb_erase(&cf->i_node, &ci->i_cap_flush_tree); - list_add_tail(&cf->list, &to_remove); + if (__finish_cap_flush(NULL, ci, cf)) + wake_ci = true; + list_add_tail(&cf->i_list, &to_remove); } else { cleaned &= ~cf->caps; if (!cleaned) @@ -3066,31 +3088,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&to_remove)) { - list_for_each_entry(cf, &to_remove, list) - rb_erase(&cf->g_node, &mdsc->cap_flush_tree); - - n = rb_first(&mdsc->cap_flush_tree); - cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; - if (!cf || cf->tid > flush_tid) - wake_up_all(&mdsc->cap_flushing_wq); + list_for_each_entry(cf, &to_remove, i_list) { + if (__finish_cap_flush(mdsc, NULL, cf)) + wake_mdsc = true; } if (ci->i_flushing_caps == 0) { - list_del_init(&ci->i_flushing_item); - if (!list_empty(&session->s_cap_flushing)) - dout(" mds%d still flushing cap on %p\n", - session->s_mds, - &list_entry(session->s_cap_flushing.next, - struct ceph_inode_info, - i_flushing_item)->vfs_inode); + if (list_empty(&ci->i_cap_flush_list)) { + list_del_init(&ci->i_flushing_item); + if (!list_empty(&session->s_cap_flushing)) { + dout(" mds%d still flushing cap on %p\n", + session->s_mds, + &list_first_entry(&session->s_cap_flushing, + struct ceph_inode_info, + i_flushing_item)->vfs_inode); + } + } mdsc->num_cap_flushing--; dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { dout(" inode %p now clean\n", inode); BUG_ON(!list_empty(&ci->i_dirty_item)); - drop = 1; + drop = true; if (ci->i_wr_ref == 0 && ci->i_wrbuffer_ref_head == 0) { BUG_ON(!ci->i_head_snapc); @@ -3102,17 +3122,21 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, } } spin_unlock(&mdsc->cap_dirty_lock); - wake_up_all(&ci->i_cap_wq); out: spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { cf = list_first_entry(&to_remove, - struct ceph_cap_flush, list); - list_del(&cf->list); + struct ceph_cap_flush, i_list); + list_del(&cf->i_list); ceph_free_cap_flush(cf); } + + if (wake_ci) + wake_up_all(&ci->i_cap_wq); + if (wake_mdsc) + wake_up_all(&mdsc->cap_flushing_wq); if (drop) iput(inode); } @@ -3131,7 +3155,9 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; u64 follows = le64_to_cpu(m->snap_follows); struct ceph_cap_snap *capsnap; - int drop = 0; + bool flushed = false; + bool wake_ci = false; + bool wake_mdsc = false; dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", inode, ci, session->s_mds, follows); @@ -3139,30 +3165,47 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->follows == follows) { - if (capsnap->flush_tid != flush_tid) { + if (capsnap->cap_flush.tid != flush_tid) { dout(" cap_snap %p follows %lld tid %lld !=" " %lld\n", capsnap, follows, - flush_tid, capsnap->flush_tid); + flush_tid, capsnap->cap_flush.tid); break; } - WARN_ON(capsnap->dirty_pages || capsnap->writing); - dout(" removing %p cap_snap %p follows %lld\n", - inode, capsnap, follows); - ceph_put_snap_context(capsnap->context); - list_del(&capsnap->ci_item); - list_del(&capsnap->flushing_item); - ceph_put_cap_snap(capsnap); - wake_up_all(&mdsc->cap_flushing_wq); - drop = 1; + flushed = true; break; } else { dout(" skipping cap_snap %p follows %lld\n", capsnap, capsnap->follows); } } + if (flushed) { + WARN_ON(capsnap->dirty_pages || capsnap->writing); + dout(" removing %p cap_snap %p follows %lld\n", + inode, capsnap, follows); + list_del(&capsnap->ci_item); + if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush)) + wake_ci = true; + + spin_lock(&mdsc->cap_dirty_lock); + + if (list_empty(&ci->i_cap_flush_list)) + list_del_init(&ci->i_flushing_item); + + if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush)) + wake_mdsc = true; + + spin_unlock(&mdsc->cap_dirty_lock); + } spin_unlock(&ci->i_ceph_lock); - if (drop) + if (flushed) { + ceph_put_snap_context(capsnap->context); + ceph_put_cap_snap(capsnap); + if (wake_ci) + wake_up_all(&ci->i_cap_wq); + if (wake_mdsc) + wake_up_all(&mdsc->cap_flushing_wq); iput(inode); + } } /* @@ -3267,7 +3310,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, tcap->implemented |= issued; if (cap == ci->i_auth_cap) ci->i_auth_cap = tcap; - if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { + if (!list_empty(&ci->i_cap_flush_list) && + ci->i_auth_cap == tcap) { spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &tcap->session->s_cap_flushing); @@ -3420,20 +3464,18 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_cap *cap; struct ceph_mds_caps *h; struct ceph_mds_cap_peer *peer = NULL; - struct ceph_snap_realm *realm; + struct ceph_snap_realm *realm = NULL; + struct ceph_string *pool_ns = NULL; int mds = session->s_mds; int op, issued; u32 seq, mseq; struct ceph_vino vino; - u64 cap_id; - u64 size, max_size; u64 tid; u64 inline_version = 0; void *inline_data = NULL; u32 inline_len = 0; void *snaptrace; size_t snaptrace_len; - u32 pool_ns_len = 0; void *p, *end; dout("handle_caps from mds%d\n", mds); @@ -3447,11 +3489,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, op = le32_to_cpu(h->op); vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; - cap_id = le64_to_cpu(h->cap_id); seq = le32_to_cpu(h->seq); mseq = le32_to_cpu(h->migrate_seq); - size = le64_to_cpu(h->size); - max_size = le64_to_cpu(h->max_size); snaptrace = h + 1; snaptrace_len = le32_to_cpu(h->snap_trace_len); @@ -3490,6 +3529,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, u64 flush_tid; u32 caller_uid, caller_gid; u32 osd_epoch_barrier; + u32 pool_ns_len; /* version >= 5 */ ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad); /* version >= 6 */ @@ -3499,6 +3539,11 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_decode_32_safe(&p, end, caller_gid, bad); /* version >= 8 */ ceph_decode_32_safe(&p, end, pool_ns_len, bad); + if (pool_ns_len > 0) { + ceph_decode_need(&p, end, pool_ns_len, bad); + pool_ns = ceph_find_or_create_string(p, pool_ns_len); + p += pool_ns_len; + } } /* lookup ino */ @@ -3519,7 +3564,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, cap = ceph_get_cap(mdsc, NULL); cap->cap_ino = vino.ino; cap->queue_release = 1; - cap->cap_id = cap_id; + cap->cap_id = le64_to_cpu(h->cap_id); cap->mseq = mseq; cap->seq = seq; spin_lock(&session->s_cap_lock); @@ -3554,10 +3599,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, } handle_cap_import(mdsc, inode, h, peer, session, &cap, &issued); - handle_cap_grant(mdsc, inode, h, + handle_cap_grant(mdsc, inode, h, &pool_ns, inline_version, inline_data, inline_len, - msg->middle, session, cap, issued, - pool_ns_len); + msg->middle, session, cap, issued); if (realm) ceph_put_snap_realm(mdsc, realm); goto done_unlocked; @@ -3579,10 +3623,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_GRANT: __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); - handle_cap_grant(mdsc, inode, h, + handle_cap_grant(mdsc, inode, h, &pool_ns, inline_version, inline_data, inline_len, - msg->middle, session, cap, issued, - pool_ns_len); + msg->middle, session, cap, issued); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: @@ -3613,6 +3656,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, mutex_unlock(&session->s_mutex); done_unlocked: iput(inode); + ceph_put_string(pool_ns); return; bad: @@ -3673,6 +3717,16 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) dout("flush_dirty_caps done\n"); } +void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode) +{ + int i; + int bits = (fmode << 1) | 1; + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + if (bits & (1 << i)) + ci->i_nr_by_mode[i]++; + } +} + /* * Drop open file reference. If we were the last open file, * we may need to release capabilities to the MDS (or schedule @@ -3680,15 +3734,20 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) { - struct inode *inode = &ci->vfs_inode; - int last = 0; - + int i, last = 0; + int bits = (fmode << 1) | 1; spin_lock(&ci->i_ceph_lock); - dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, - ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); - BUG_ON(ci->i_nr_by_mode[fmode] == 0); - if (--ci->i_nr_by_mode[fmode] == 0) - last++; + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + if (bits & (1 << i)) { + BUG_ON(ci->i_nr_by_mode[i] == 0); + if (--ci->i_nr_by_mode[i] == 0) + last++; + } + } + dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n", + &ci->vfs_inode, fmode, + ci->i_nr_by_mode[0], ci->i_nr_by_mode[1], + ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]); spin_unlock(&ci->i_ceph_lock); if (last && ci->i_vino.snap == CEPH_NOSNAP) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6e0fedf6713b..c64a0b794d49 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -59,7 +59,7 @@ int ceph_init_dentry(struct dentry *dentry) di->dentry = dentry; di->lease_session = NULL; - dentry->d_time = jiffies; + di->time = jiffies; /* avoid reordering d_fsdata setup so that the check above is safe */ smp_mb(); dentry->d_fsdata = di; @@ -1124,7 +1124,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, void ceph_invalidate_dentry_lease(struct dentry *dentry) { spin_lock(&dentry->d_lock); - dentry->d_time = jiffies; + ceph_dentry(dentry)->time = jiffies; ceph_dentry(dentry)->lease_shared_gen = 0; spin_unlock(&dentry->d_lock); } @@ -1133,7 +1133,8 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry) * Check if dentry lease is valid. If not, delete the lease. Try to * renew if the least is more than half up. */ -static int dentry_lease_is_valid(struct dentry *dentry) +static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags, + struct inode *dir) { struct ceph_dentry_info *di; struct ceph_mds_session *s; @@ -1141,12 +1142,11 @@ static int dentry_lease_is_valid(struct dentry *dentry) u32 gen; unsigned long ttl; struct ceph_mds_session *session = NULL; - struct inode *dir = NULL; u32 seq = 0; spin_lock(&dentry->d_lock); di = ceph_dentry(dentry); - if (di->lease_session) { + if (di && di->lease_session) { s = di->lease_session; spin_lock(&s->s_gen_ttl_lock); gen = s->s_cap_gen; @@ -1154,17 +1154,24 @@ static int dentry_lease_is_valid(struct dentry *dentry) spin_unlock(&s->s_gen_ttl_lock); if (di->lease_gen == gen && - time_before(jiffies, dentry->d_time) && + time_before(jiffies, di->time) && time_before(jiffies, ttl)) { valid = 1; if (di->lease_renew_after && time_after(jiffies, di->lease_renew_after)) { - /* we should renew */ - dir = d_inode(dentry->d_parent); - session = ceph_get_mds_session(s); - seq = di->lease_seq; - di->lease_renew_after = 0; - di->lease_renew_from = jiffies; + /* + * We should renew. If we're in RCU walk mode + * though, we can't do that so just return + * -ECHILD. + */ + if (flags & LOOKUP_RCU) { + valid = -ECHILD; + } else { + session = ceph_get_mds_session(s); + seq = di->lease_seq; + di->lease_renew_after = 0; + di->lease_renew_from = jiffies; + } } } } @@ -1207,15 +1214,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct dentry *parent; struct inode *dir; - if (flags & LOOKUP_RCU) - return -ECHILD; + if (flags & LOOKUP_RCU) { + parent = ACCESS_ONCE(dentry->d_parent); + dir = d_inode_rcu(parent); + if (!dir) + return -ECHILD; + } else { + parent = dget_parent(dentry); + dir = d_inode(parent); + } dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset); - parent = dget_parent(dentry); - dir = d_inode(parent); - /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, @@ -1224,12 +1235,16 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) } else if (d_really_is_positive(dentry) && ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) { valid = 1; - } else if (dentry_lease_is_valid(dentry) || - dir_lease_is_valid(dir, dentry)) { - if (d_really_is_positive(dentry)) - valid = ceph_is_any_caps(d_inode(dentry)); - else - valid = 1; + } else { + valid = dentry_lease_is_valid(dentry, flags, dir); + if (valid == -ECHILD) + return valid; + if (valid || dir_lease_is_valid(dir, dentry)) { + if (d_really_is_positive(dentry)) + valid = ceph_is_any_caps(d_inode(dentry)); + else + valid = 1; + } } if (!valid) { @@ -1238,6 +1253,9 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct ceph_mds_request *req; int op, mask, err; + if (flags & LOOKUP_RCU) + return -ECHILD; + op = ceph_snap(dir) == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); @@ -1273,7 +1291,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) ceph_dir_clear_complete(dir); } - dput(parent); + if (!(flags & LOOKUP_RCU)) + dput(parent); return valid; } @@ -1286,10 +1305,14 @@ static void ceph_d_release(struct dentry *dentry) dout("d_release %p\n", dentry); ceph_dentry_lru_del(dentry); + + spin_lock(&dentry->d_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry->d_lock); + if (di->lease_session) ceph_put_mds_session(di->lease_session); kmem_cache_free(ceph_dentry_cachep, di); - dentry->d_fsdata = NULL; } static int ceph_snapdir_d_revalidate(struct dentry *dentry, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 0daaf7ceedc5..0f5375d8e030 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -708,7 +708,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } } - ceph_put_page_vector(osd_data->pages, num_pages, false); + ceph_put_page_vector(osd_data->pages, num_pages, !aio_req->write); ceph_osdc_put_request(req); if (rc < 0) @@ -821,6 +821,54 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } } +/* + * Wait on any unsafe replies for the given inode. First wait on the + * newest request, and make that the upper bound. Then, if there are + * more requests, keep waiting on the oldest as long as it is still older + * than the original request. + */ +void ceph_sync_write_wait(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_writes; + struct ceph_osd_request *req; + u64 last_tid; + + if (!S_ISREG(inode->i_mode)) + return; + + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + /* set upper bound as _last_ entry in chain */ + + req = list_last_entry(head, struct ceph_osd_request, + r_unsafe_item); + last_tid = req->r_tid; + + do { + ceph_osdc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + + dout("sync_write_wait on tid %llu (until %llu)\n", + req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + ceph_osdc_put_request(req); + + spin_lock(&ci->i_unsafe_lock); + /* + * from here on look at first entry in chain, since we + * only want to wait for anything older than last_tid + */ + if (list_empty(head)) + break; + req = list_first_entry(head, struct ceph_osd_request, + r_unsafe_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); +} static ssize_t ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, @@ -964,7 +1012,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, len = ret; } - ceph_put_page_vector(pages, num_pages, false); + ceph_put_page_vector(pages, num_pages, !write); ceph_osdc_put_request(req); if (ret < 0) @@ -985,6 +1033,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } if (aio_req) { + LIST_HEAD(osd_reqs); + if (aio_req->num_reqs == 0) { kfree(aio_req); return ret; @@ -993,8 +1043,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : CEPH_CAP_FILE_RD); - while (!list_empty(&aio_req->osd_reqs)) { - req = list_first_entry(&aio_req->osd_reqs, + list_splice(&aio_req->osd_reqs, &osd_reqs); + while (!list_empty(&osd_reqs)) { + req = list_first_entry(&osd_reqs, struct ceph_osd_request, r_unsafe_item); list_del_init(&req->r_unsafe_item); @@ -1448,16 +1499,14 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t i_size; - int ret; + loff_t ret; inode_lock(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); - if (ret < 0) { - offset = ret; + if (ret < 0) goto out; - } } i_size = i_size_read(inode); @@ -1473,7 +1522,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) * write() or lseek() might have altered it */ if (offset == 0) { - offset = file->f_pos; + ret = file->f_pos; goto out; } offset += file->f_pos; @@ -1493,11 +1542,11 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) break; } - offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); + ret = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: inode_unlock(inode); - return offset; + return ret; } static inline void ceph_zero_partial_page( @@ -1583,9 +1632,9 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) { int ret = 0; struct ceph_inode_info *ci = ceph_inode(inode); - s32 stripe_unit = ceph_file_layout_su(ci->i_layout); - s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout); - s32 object_size = ceph_file_layout_object_size(ci->i_layout); + s32 stripe_unit = ci->i_layout.stripe_unit; + s32 stripe_count = ci->i_layout.stripe_count; + s32 object_size = ci->i_layout.object_size; u64 object_set_size = object_size * stripe_count; u64 nearly, t; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 99bdef66213a..dd3a6dbf71eb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -446,7 +446,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); - ci->i_pool_ns_len = 0; + RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); @@ -468,7 +468,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ci->i_dirty_item); INIT_LIST_HEAD(&ci->i_flushing_item); ci->i_prealloc_cap_flush = NULL; - ci->i_cap_flush_tree = RB_ROOT; + INIT_LIST_HEAD(&ci->i_cap_flush_list); init_waitqueue_head(&ci->i_cap_wq); ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; @@ -477,7 +477,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_head_snapc = NULL; ci->i_snap_caps = 0; - for (i = 0; i < CEPH_FILE_MODE_NUM; i++) + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) ci->i_nr_by_mode[i] = 0; mutex_init(&ci->i_truncate_mutex); @@ -570,6 +570,8 @@ void ceph_destroy_inode(struct inode *inode) if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); + call_rcu(&inode->i_rcu, ceph_i_callback); } @@ -583,6 +585,14 @@ int ceph_drop_inode(struct inode *inode) return 1; } +void ceph_evict_inode(struct inode *inode) +{ + /* wait unsafe sync writes */ + ceph_sync_write_wait(inode); + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); +} + static inline blkcnt_t calc_inode_blocks(u64 size) { return (size + (1<<9) - 1) >> 9; @@ -733,6 +743,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, int issued = 0, implemented, new_issued; struct timespec mtime, atime, ctime; struct ceph_buffer *xattr_blob = NULL; + struct ceph_string *pool_ns = NULL; struct ceph_cap *new_cap = NULL; int err = 0; bool wake = false; @@ -760,6 +771,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, iinfo->xattr_len); } + if (iinfo->pool_ns_len > 0) + pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, + iinfo->pool_ns_len); + spin_lock(&ci->i_ceph_lock); /* @@ -814,10 +829,18 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { - if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool) + s64 old_pool = ci->i_layout.pool_id; + struct ceph_string *old_ns; + + ceph_file_layout_from_legacy(&ci->i_layout, &info->layout); + old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, + lockdep_is_held(&ci->i_ceph_lock)); + rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns); + + if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns) ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; - ci->i_layout = info->layout; - ci->i_pool_ns_len = iinfo->pool_ns_len; + + pool_ns = old_ns; queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), @@ -985,6 +1008,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ceph_put_cap(mdsc, new_cap); if (xattr_blob) ceph_buffer_put(xattr_blob); + ceph_put_string(pool_ns); return err; } @@ -1018,7 +1042,7 @@ static void update_dentry_lease(struct dentry *dentry, goto out_unlock; if (di->lease_gen == session->s_cap_gen && - time_before(ttl, dentry->d_time)) + time_before(ttl, di->time)) goto out_unlock; /* we already have a newer lease. */ if (di->lease_session && di->lease_session != session) @@ -1032,7 +1056,7 @@ static void update_dentry_lease(struct dentry *dentry, di->lease_seq = le32_to_cpu(lease->seq); di->lease_renew_after = half_ttl; di->lease_renew_from = 0; - dentry->d_time = ttl; + di->time = ttl; out_unlock: spin_unlock(&dentry->d_lock); return; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index be6b1657b1af..7d752d53353a 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -21,10 +21,10 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false); if (!err) { - l.stripe_unit = ceph_file_layout_su(ci->i_layout); - l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); - l.object_size = ceph_file_layout_object_size(ci->i_layout); - l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); + l.stripe_unit = ci->i_layout.stripe_unit; + l.stripe_count = ci->i_layout.stripe_count; + l.object_size = ci->i_layout.object_size; + l.data_pool = ci->i_layout.pool_id; l.preferred_osd = (s32)-1; if (copy_to_user(arg, &l, sizeof(l))) return -EFAULT; @@ -82,19 +82,19 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) if (l.stripe_count) nl.stripe_count = l.stripe_count; else - nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + nl.stripe_count = ci->i_layout.stripe_count; if (l.stripe_unit) nl.stripe_unit = l.stripe_unit; else - nl.stripe_unit = ceph_file_layout_su(ci->i_layout); + nl.stripe_unit = ci->i_layout.stripe_unit; if (l.object_size) nl.object_size = l.object_size; else - nl.object_size = ceph_file_layout_object_size(ci->i_layout); + nl.object_size = ci->i_layout.object_size; if (l.data_pool) nl.data_pool = l.data_pool; else - nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout); + nl.data_pool = ci->i_layout.pool_id; /* this is obsolete, and always -1 */ nl.preferred_osd = le64_to_cpu(-1); @@ -183,7 +183,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; struct ceph_object_locator oloc; - struct ceph_object_id oid; + CEPH_DEFINE_OID_ONSTACK(oid); u64 len = 1, olen; u64 tmp; struct ceph_pg pgid; @@ -202,8 +202,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return -EIO; } dl.file_offset -= dl.object_offset; - dl.object_size = ceph_file_layout_object_size(ci->i_layout); - dl.block_size = ceph_file_layout_su(ci->i_layout); + dl.object_size = ci->i_layout.object_size; + dl.block_size = ci->i_layout.stripe_unit; /* block_offset = object_offset % block_size */ tmp = dl.object_offset; @@ -212,10 +212,13 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); + oloc.pool = ci->i_layout.pool_id; + oloc.pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); ceph_oid_printf(&oid, "%s", dl.object_name); r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); + + ceph_oloc_destroy(&oloc); if (r < 0) { up_read(&osdc->lock); return r; @@ -247,9 +250,8 @@ static long ceph_ioctl_lazyio(struct file *file) if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { spin_lock(&ci->i_ceph_lock); - ci->i_nr_by_mode[fi->fmode]--; fi->fmode |= CEPH_FILE_MODE_LAZY; - ci->i_nr_by_mode[fi->fmode]++; + ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++; spin_unlock(&ci->i_ceph_lock); dout("ioctl_layzio: file %p marked lazy\n", file); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4e8678a612b6..fa59a85226b2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -48,7 +48,7 @@ struct ceph_reconnect_state { int nr_caps; struct ceph_pagelist *pagelist; - bool flock; + unsigned msg_version; }; static void __wake_requests(struct ceph_mds_client *mdsc, @@ -100,12 +100,15 @@ static int parse_reply_info_in(void **p, void *end, } else info->inline_version = CEPH_INLINE_NONE; + info->pool_ns_len = 0; + info->pool_ns_data = NULL; if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { ceph_decode_32_safe(p, end, info->pool_ns_len, bad); - ceph_decode_need(p, end, info->pool_ns_len, bad); - *p += info->pool_ns_len; - } else { - info->pool_ns_len = 0; + if (info->pool_ns_len > 0) { + ceph_decode_need(p, end, info->pool_ns_len, bad); + info->pool_ns_data = *p; + *p += info->pool_ns_len; + } } return 0; @@ -469,7 +472,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); INIT_LIST_HEAD(&s->s_cap_flushing); - INIT_LIST_HEAD(&s->s_cap_snaps_flushing); dout("register_session mds%d\n", mds); if (mds >= mdsc->max_sessions) { @@ -1145,19 +1147,17 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) invalidate = true; - while (true) { - struct rb_node *n = rb_first(&ci->i_cap_flush_tree); - if (!n) - break; - cf = rb_entry(n, struct ceph_cap_flush, i_node); - rb_erase(&cf->i_node, &ci->i_cap_flush_tree); - list_add(&cf->list, &to_remove); + while (!list_empty(&ci->i_cap_flush_list)) { + cf = list_first_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); + list_del(&cf->i_list); + list_add(&cf->i_list, &to_remove); } spin_lock(&mdsc->cap_dirty_lock); - list_for_each_entry(cf, &to_remove, list) - rb_erase(&cf->g_node, &mdsc->cap_flush_tree); + list_for_each_entry(cf, &to_remove, i_list) + list_del(&cf->g_list); if (!list_empty(&ci->i_dirty_item)) { pr_warn_ratelimited( @@ -1181,7 +1181,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_unlock(&mdsc->cap_dirty_lock); if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { - list_add(&ci->i_prealloc_cap_flush->list, &to_remove); + list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } } @@ -1189,8 +1189,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, while (!list_empty(&to_remove)) { struct ceph_cap_flush *cf; cf = list_first_entry(&to_remove, - struct ceph_cap_flush, list); - list_del(&cf->list); + struct ceph_cap_flush, i_list); + list_del(&cf->i_list); ceph_free_cap_flush(cf); } @@ -1212,6 +1212,8 @@ static void remove_session_caps(struct ceph_mds_session *session) dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, fsc); + wake_up_all(&fsc->mdsc->cap_flushing_wq); + spin_lock(&session->s_cap_lock); if (session->s_nr_caps > 0) { struct inode *inode; @@ -1478,35 +1480,21 @@ static int trim_caps(struct ceph_mds_client *mdsc, return 0; } -static int check_capsnap_flush(struct ceph_inode_info *ci, - u64 want_snap_seq) -{ - int ret = 1; - spin_lock(&ci->i_ceph_lock); - if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) { - struct ceph_cap_snap *capsnap = - list_first_entry(&ci->i_cap_snaps, - struct ceph_cap_snap, ci_item); - ret = capsnap->follows >= want_snap_seq; - } - spin_unlock(&ci->i_ceph_lock); - return ret; -} - static int check_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_tid) { - struct rb_node *n; - struct ceph_cap_flush *cf; int ret = 1; spin_lock(&mdsc->cap_dirty_lock); - n = rb_first(&mdsc->cap_flush_tree); - cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; - if (cf && cf->tid <= want_flush_tid) { - dout("check_caps_flush still flushing tid %llu <= %llu\n", - cf->tid, want_flush_tid); - ret = 0; + if (!list_empty(&mdsc->cap_flush_list)) { + struct ceph_cap_flush *cf = + list_first_entry(&mdsc->cap_flush_list, + struct ceph_cap_flush, g_list); + if (cf->tid <= want_flush_tid) { + dout("check_caps_flush still flushing tid " + "%llu <= %llu\n", cf->tid, want_flush_tid); + ret = 0; + } } spin_unlock(&mdsc->cap_dirty_lock); return ret; @@ -1518,54 +1506,9 @@ static int check_caps_flush(struct ceph_mds_client *mdsc, * returns true if we've flushed through want_flush_tid */ static void wait_caps_flush(struct ceph_mds_client *mdsc, - u64 want_flush_tid, u64 want_snap_seq) + u64 want_flush_tid) { - int mds; - - dout("check_caps_flush want %llu snap want %llu\n", - want_flush_tid, want_snap_seq); - mutex_lock(&mdsc->mutex); - for (mds = 0; mds < mdsc->max_sessions; ) { - struct ceph_mds_session *session = mdsc->sessions[mds]; - struct inode *inode = NULL; - - if (!session) { - mds++; - continue; - } - get_session(session); - mutex_unlock(&mdsc->mutex); - - mutex_lock(&session->s_mutex); - if (!list_empty(&session->s_cap_snaps_flushing)) { - struct ceph_cap_snap *capsnap = - list_first_entry(&session->s_cap_snaps_flushing, - struct ceph_cap_snap, - flushing_item); - struct ceph_inode_info *ci = capsnap->ci; - if (!check_capsnap_flush(ci, want_snap_seq)) { - dout("check_cap_flush still flushing snap %p " - "follows %lld <= %lld to mds%d\n", - &ci->vfs_inode, capsnap->follows, - want_snap_seq, mds); - inode = igrab(&ci->vfs_inode); - } - } - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - - if (inode) { - wait_event(mdsc->cap_flushing_wq, - check_capsnap_flush(ceph_inode(inode), - want_snap_seq)); - iput(inode); - } else { - mds++; - } - - mutex_lock(&mdsc->mutex); - } - mutex_unlock(&mdsc->mutex); + dout("check_caps_flush want %llu\n", want_flush_tid); wait_event(mdsc->cap_flushing_wq, check_caps_flush(mdsc, want_flush_tid)); @@ -2163,6 +2106,11 @@ static int __do_request(struct ceph_mds_client *mdsc, mds = __choose_mds(mdsc, req); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { + if (mdsc->mdsmap_err) { + err = mdsc->mdsmap_err; + dout("do_request mdsmap err %d\n", err); + goto finish; + } dout("do_request no mds or not active, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); goto out; @@ -2292,14 +2240,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); - /* deny access to directories with pool_ns layouts */ - if (req->r_inode && S_ISDIR(req->r_inode->i_mode) && - ceph_inode(req->r_inode)->i_pool_ns_len) - return -EIO; - if (req->r_locked_dir && - ceph_inode(req->r_locked_dir)->i_pool_ns_len) - return -EIO; - /* issue */ mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); @@ -2791,13 +2731,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; } rec; - size_t reclen; struct ceph_inode_info *ci; struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; char *path; int pathlen, err; u64 pathbase; + u64 snap_follows; struct dentry *dentry; ci = cap->ci; @@ -2820,9 +2760,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, path = NULL; pathlen = 0; } - err = ceph_pagelist_encode_string(pagelist, path, pathlen); - if (err) - goto out_free; spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ @@ -2830,14 +2767,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, cap->mseq = 0; /* and migrate_seq */ cap->cap_gen = cap->session->s_cap_gen; - if (recon_state->flock) { + if (recon_state->msg_version >= 2) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v2.pathbase = cpu_to_le64(pathbase); rec.v2.flock_len = 0; - reclen = sizeof(rec.v2); } else { rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); @@ -2847,13 +2783,23 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v1.pathbase = cpu_to_le64(pathbase); - reclen = sizeof(rec.v1); + } + + if (list_empty(&ci->i_cap_snaps)) { + snap_follows = 0; + } else { + struct ceph_cap_snap *capsnap = + list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + snap_follows = capsnap->follows; } spin_unlock(&ci->i_ceph_lock); - if (recon_state->flock) { + if (recon_state->msg_version >= 2) { int num_fcntl_locks, num_flock_locks; struct ceph_filelock *flocks; + size_t struct_len, total_len = 0; + u8 struct_v = 0; encode_again: ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); @@ -2872,20 +2818,51 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, goto encode_again; goto out_free; } + + if (recon_state->msg_version >= 3) { + /* version, compat_version and struct_len */ + total_len = 2 * sizeof(u8) + sizeof(u32); + struct_v = 2; + } /* * number of encoded locks is stable, so copy to pagelist */ - rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + - (num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock)); - err = ceph_pagelist_append(pagelist, &rec, reclen); - if (!err) - err = ceph_locks_to_pagelist(flocks, pagelist, - num_fcntl_locks, - num_flock_locks); + struct_len = 2 * sizeof(u32) + + (num_fcntl_locks + num_flock_locks) * + sizeof(struct ceph_filelock); + rec.v2.flock_len = cpu_to_le32(struct_len); + + struct_len += sizeof(rec.v2); + struct_len += sizeof(u32) + pathlen; + + if (struct_v >= 2) + struct_len += sizeof(u64); /* snap_follows */ + + total_len += struct_len; + err = ceph_pagelist_reserve(pagelist, total_len); + + if (!err) { + if (recon_state->msg_version >= 3) { + ceph_pagelist_encode_8(pagelist, struct_v); + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_32(pagelist, struct_len); + } + ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); + ceph_locks_to_pagelist(flocks, pagelist, + num_fcntl_locks, + num_flock_locks); + if (struct_v >= 2) + ceph_pagelist_encode_64(pagelist, snap_follows); + } kfree(flocks); } else { - err = ceph_pagelist_append(pagelist, &rec, reclen); + size_t size = sizeof(u32) + pathlen + sizeof(rec.v1); + err = ceph_pagelist_reserve(pagelist, size); + if (!err) { + ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); + } } recon_state->nr_caps++; @@ -2976,7 +2953,12 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, recon_state.nr_caps = 0; recon_state.pagelist = pagelist; - recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; + if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) + recon_state.msg_version = 3; + else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK) + recon_state.msg_version = 2; + else + recon_state.msg_version = 1; err = iterate_session_caps(session, encode_caps_cb, &recon_state); if (err < 0) goto fail; @@ -3005,8 +2987,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail; } - if (recon_state.flock) - reply->hdr.version = cpu_to_le16(2); + reply->hdr.version = cpu_to_le16(recon_state.msg_version); /* raced with cap release? */ if (s_nr_caps != recon_state.nr_caps) { @@ -3231,7 +3212,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, msecs_to_jiffies(le32_to_cpu(h->duration_ms)); di->lease_seq = seq; - dentry->d_time = di->lease_renew_from + duration; + di->time = di->lease_renew_from + duration; di->lease_renew_after = di->lease_renew_from + (duration >> 1); di->lease_renew_from = 0; @@ -3296,47 +3277,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, ceph_con_send(&session->s_con, msg); } -/* - * Preemptively release a lease we expect to invalidate anyway. - * Pass @inode always, @dentry is optional. - */ -void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dentry) -{ - struct ceph_dentry_info *di; - struct ceph_mds_session *session; - u32 seq; - - BUG_ON(inode == NULL); - BUG_ON(dentry == NULL); - - /* is dentry lease valid? */ - spin_lock(&dentry->d_lock); - di = ceph_dentry(dentry); - if (!di || !di->lease_session || - di->lease_session->s_mds < 0 || - di->lease_gen != di->lease_session->s_cap_gen || - !time_before(jiffies, dentry->d_time)) { - dout("lease_release inode %p dentry %p -- " - "no lease\n", - inode, dentry); - spin_unlock(&dentry->d_lock); - return; - } - - /* we do have a lease on this dentry; note mds and seq */ - session = ceph_get_mds_session(di->lease_session); - seq = di->lease_seq; - __ceph_mdsc_drop_dentry_lease(dentry); - spin_unlock(&dentry->d_lock); - - dout("lease_release inode %p dentry %p to mds%d\n", - inode, dentry, session->s_mds); - ceph_mdsc_lease_send_msg(session, inode, dentry, - CEPH_MDS_LEASE_RELEASE, seq); - ceph_put_mds_session(session); -} - /* * drop all leases (and dentry refs) in preparation for umount */ @@ -3470,7 +3410,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; - mdsc->cap_flush_tree = RB_ROOT; + INIT_LIST_HEAD(&mdsc->cap_flush_list); INIT_LIST_HEAD(&mdsc->cap_dirty); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; @@ -3585,7 +3525,7 @@ static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { - u64 want_tid, want_flush, want_snap; + u64 want_tid, want_flush; if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return; @@ -3598,17 +3538,19 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) ceph_flush_dirty_caps(mdsc); spin_lock(&mdsc->cap_dirty_lock); want_flush = mdsc->last_cap_flush_tid; + if (!list_empty(&mdsc->cap_flush_list)) { + struct ceph_cap_flush *cf = + list_last_entry(&mdsc->cap_flush_list, + struct ceph_cap_flush, g_list); + cf->wake = true; + } spin_unlock(&mdsc->cap_dirty_lock); - down_read(&mdsc->snap_rwsem); - want_snap = mdsc->last_snap_seq; - up_read(&mdsc->snap_rwsem); - - dout("sync want tid %lld flush_seq %lld snap_seq %lld\n", - want_tid, want_flush, want_snap); + dout("sync want tid %lld flush_seq %lld\n", + want_tid, want_flush); wait_unsafe_requests(mdsc, want_tid); - wait_caps_flush(mdsc, want_flush, want_snap); + wait_caps_flush(mdsc, want_flush); } /* @@ -3729,11 +3671,86 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc) dout("mdsc_destroy %p done\n", mdsc); } +void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) +{ + struct ceph_fs_client *fsc = mdsc->fsc; + const char *mds_namespace = fsc->mount_options->mds_namespace; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + u32 epoch; + u32 map_len; + u32 num_fs; + u32 mount_fscid = (u32)-1; + u8 struct_v, struct_cv; + int err = -EINVAL; + + ceph_decode_need(&p, end, sizeof(u32), bad); + epoch = ceph_decode_32(&p); + + dout("handle_fsmap epoch %u\n", epoch); + + ceph_decode_need(&p, end, 2 + sizeof(u32), bad); + struct_v = ceph_decode_8(&p); + struct_cv = ceph_decode_8(&p); + map_len = ceph_decode_32(&p); + + ceph_decode_need(&p, end, sizeof(u32) * 3, bad); + p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */ + + num_fs = ceph_decode_32(&p); + while (num_fs-- > 0) { + void *info_p, *info_end; + u32 info_len; + u8 info_v, info_cv; + u32 fscid, namelen; + + ceph_decode_need(&p, end, 2 + sizeof(u32), bad); + info_v = ceph_decode_8(&p); + info_cv = ceph_decode_8(&p); + info_len = ceph_decode_32(&p); + ceph_decode_need(&p, end, info_len, bad); + info_p = p; + info_end = p + info_len; + p = info_end; + + ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad); + fscid = ceph_decode_32(&info_p); + namelen = ceph_decode_32(&info_p); + ceph_decode_need(&info_p, info_end, namelen, bad); + + if (mds_namespace && + strlen(mds_namespace) == namelen && + !strncmp(mds_namespace, (char *)info_p, namelen)) { + mount_fscid = fscid; + break; + } + } + + ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch); + if (mount_fscid != (u32)-1) { + fsc->client->monc.fs_cluster_id = mount_fscid; + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, + 0, true); + ceph_monc_renew_subs(&fsc->client->monc); + } else { + err = -ENOENT; + goto err_out; + } + return; +bad: + pr_err("error decoding fsmap\n"); +err_out: + mutex_lock(&mdsc->mutex); + mdsc->mdsmap_err = -ENOENT; + __wake_requests(mdsc, &mdsc->waiting_for_map); + mutex_unlock(&mdsc->mutex); + return; +} /* * handle mds map update. */ -void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) +void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) { u32 epoch; u32 maplen; @@ -3840,7 +3857,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) switch (type) { case CEPH_MSG_MDS_MAP: - ceph_mdsc_handle_map(mdsc, msg); + ceph_mdsc_handle_mdsmap(mdsc, msg); + break; + case CEPH_MSG_FS_MAP_USER: + ceph_mdsc_handle_fsmap(mdsc, msg); break; case CEPH_MSG_CLIENT_SESSION: handle_session(s, msg); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index e7d38aac7109..6b3679737d4a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -45,6 +45,7 @@ struct ceph_mds_reply_info_in { u32 inline_len; char *inline_data; u32 pool_ns_len; + char *pool_ns_data; }; struct ceph_mds_reply_dir_entry { @@ -151,7 +152,6 @@ struct ceph_mds_session { /* protected by mutex */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */ - struct list_head s_cap_snaps_flushing; unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; @@ -275,8 +275,10 @@ struct ceph_mds_request { struct ceph_pool_perm { struct rb_node node; - u32 pool; int perm; + s64 pool; + size_t pool_ns_len; + char pool_ns[]; }; /* @@ -290,6 +292,7 @@ struct ceph_mds_client { struct completion safe_umount_waiters; wait_queue_head_t session_close_wq; struct list_head waiting_for_map; + int mdsmap_err; struct ceph_mds_session **sessions; /* NULL for mds if no session */ atomic_t num_sessions; @@ -321,7 +324,7 @@ struct ceph_mds_client { spinlock_t snap_flush_lock; u64 last_cap_flush_tid; - struct rb_root cap_flush_tree; + struct list_head cap_flush_list; struct list_head cap_dirty; /* inodes with dirty caps */ struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ @@ -382,10 +385,6 @@ extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); -extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, - struct inode *inode, - struct dentry *dn); - extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct inode *dir); @@ -420,8 +419,10 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, struct dentry *dentry, char action, u32 seq); -extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, - struct ceph_msg *msg); +extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, + struct ceph_msg *msg); +extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, + struct ceph_msg *msg); extern struct ceph_mds_session * ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 9caaa7ffc93f..9ff5219d849e 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -520,9 +520,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) ihold(inode); atomic_set(&capsnap->nref, 1); - capsnap->ci = ci; INIT_LIST_HEAD(&capsnap->ci_item); - INIT_LIST_HEAD(&capsnap->flushing_item); capsnap->follows = old_snapc->seq; capsnap->issued = __ceph_caps_issued(ci, NULL); @@ -551,7 +549,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) ci->i_wrbuffer_ref_head = 0; capsnap->context = old_snapc; list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); - old_snapc = NULL; if (used & CEPH_CAP_FILE_WR) { dout("queue_cap_snap %p cap_snap %p snapc %p" @@ -563,6 +560,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) __ceph_finish_cap_snap(ci, capsnap); } capsnap = NULL; + old_snapc = NULL; update_snapc: if (ci->i_head_snapc) { @@ -603,6 +601,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->dirty_pages); return 0; } + + ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", inode, capsnap, capsnap->context, capsnap->context->seq, ceph_cap_string(capsnap->dirty), @@ -799,9 +799,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) inode = &ci->vfs_inode; ihold(inode); spin_unlock(&mdsc->snap_flush_lock); - spin_lock(&ci->i_ceph_lock); - __ceph_flush_snaps(ci, &session, 0); - spin_unlock(&ci->i_ceph_lock); + ceph_flush_snaps(ci, &session); iput(inode); spin_lock(&mdsc->snap_flush_lock); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 91e02481ce06..e247f6f0feb7 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -108,7 +108,6 @@ static int ceph_sync_fs(struct super_block *sb, int wait) * mount options */ enum { - Opt_mds_namespace, Opt_wsize, Opt_rsize, Opt_rasize, @@ -121,6 +120,7 @@ enum { Opt_last_int, /* int args above */ Opt_snapdirname, + Opt_mds_namespace, Opt_last_string, /* string args above */ Opt_dirstat, @@ -144,7 +144,6 @@ enum { }; static match_table_t fsopt_tokens = { - {Opt_mds_namespace, "mds_namespace=%d"}, {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, {Opt_rasize, "rasize=%d"}, @@ -156,6 +155,7 @@ static match_table_t fsopt_tokens = { {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, + {Opt_mds_namespace, "mds_namespace=%s"}, /* string args above */ {Opt_dirstat, "dirstat"}, {Opt_nodirstat, "nodirstat"}, @@ -212,11 +212,14 @@ static int parse_fsopt_token(char *c, void *private) if (!fsopt->snapdir_name) return -ENOMEM; break; - - /* misc */ case Opt_mds_namespace: - fsopt->mds_namespace = intval; + fsopt->mds_namespace = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + if (!fsopt->mds_namespace) + return -ENOMEM; break; + /* misc */ case Opt_wsize: fsopt->wsize = intval; break; @@ -302,6 +305,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); kfree(args->snapdir_name); + kfree(args->mds_namespace); kfree(args->server_path); kfree(args); } @@ -331,6 +335,9 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, return ret; ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); + if (ret) + return ret; + ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace); if (ret) return ret; @@ -376,7 +383,6 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); - fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE; /* * Distinguish the server list from the path in "dev_name". @@ -469,8 +475,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noacl"); #endif - if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE) - seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace); + if (fsopt->mds_namespace) + seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) @@ -509,9 +515,11 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) switch (type) { case CEPH_MSG_MDS_MAP: - ceph_mdsc_handle_map(fsc->mdsc, msg); + ceph_mdsc_handle_mdsmap(fsc->mdsc, msg); + return 0; + case CEPH_MSG_FS_MAP_USER: + ceph_mdsc_handle_fsmap(fsc->mdsc, msg); return 0; - default: return -1; } @@ -543,8 +551,14 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->monc.fs_cluster_id = fsopt->mds_namespace; - ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); + + if (fsopt->mds_namespace == NULL) { + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, + 0, true); + } else { + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP, + 0, false); + } fsc->mount_options = fsopt; @@ -672,8 +686,8 @@ static int __init init_caches(void) if (ceph_dentry_cachep == NULL) goto bad_dentry; - ceph_file_cachep = KMEM_CACHE(ceph_file_info, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); + if (ceph_file_cachep == NULL) goto bad_file; @@ -731,6 +745,7 @@ static const struct super_operations ceph_super_ops = { .destroy_inode = ceph_destroy_inode, .write_inode = ceph_write_inode, .drop_inode = ceph_drop_inode, + .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0168b49fb6ad..3e3fa9163059 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -62,7 +62,6 @@ struct ceph_mount_options { int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ - int mds_namespace; /* * everything above this point can be memcmp'd; everything below @@ -70,6 +69,7 @@ struct ceph_mount_options { */ char *snapdir_name; /* default ".snap" */ + char *mds_namespace; /* default NULL */ char *server_path; /* default "/" */ }; @@ -147,6 +147,14 @@ struct ceph_cap { #define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */ #define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */ +struct ceph_cap_flush { + u64 tid; + int caps; /* 0 means capsnap */ + bool wake; /* wake up flush waiters when finish ? */ + struct list_head g_list; // global + struct list_head i_list; // per inode +}; + /* * Snapped cap state that is pending flush to mds. When a snapshot occurs, * we first complete any in-process sync writes and writeback any dirty @@ -154,10 +162,11 @@ struct ceph_cap { */ struct ceph_cap_snap { atomic_t nref; - struct ceph_inode_info *ci; - struct list_head ci_item, flushing_item; + struct list_head ci_item; - u64 follows, flush_tid; + struct ceph_cap_flush cap_flush; + + u64 follows; int issued, dirty; struct ceph_snap_context *context; @@ -186,16 +195,6 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) } } -struct ceph_cap_flush { - u64 tid; - int caps; - struct rb_node g_node; // global - union { - struct rb_node i_node; // inode - struct list_head list; - }; -}; - /* * The frag tree describes how a directory is fragmented, potentially across * multiple metadata servers. It is also used to indicate points where @@ -246,7 +245,7 @@ struct ceph_dentry_info { unsigned long lease_renew_after, lease_renew_from; struct list_head lru; struct dentry *dentry; - u64 time; + unsigned long time; u64 offset; }; @@ -287,7 +286,6 @@ struct ceph_inode_info { struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; - size_t i_pool_ns_len; char *i_symlink; /* for dirs */ @@ -311,7 +309,7 @@ struct ceph_inode_info { * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. */ struct ceph_cap_flush *i_prealloc_cap_flush; - struct rb_root i_cap_flush_tree; + struct list_head i_cap_flush_list; wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ @@ -322,7 +320,7 @@ struct ceph_inode_info { dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ - int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */ struct mutex i_truncate_mutex; u32 i_truncate_seq; /* last truncate to smaller size */ @@ -471,6 +469,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ #define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ +#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ +#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, @@ -750,6 +750,7 @@ extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_destroy_inode(struct inode *inode); extern int ceph_drop_inode(struct inode *inode); +extern void ceph_evict_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino); @@ -890,9 +891,8 @@ extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); -extern void __ceph_flush_snaps(struct ceph_inode_info *ci, - struct ceph_mds_session **psession, - int again); +extern void ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession); extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session); extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); @@ -907,10 +907,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page); /* for counting open files by mode */ -static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) -{ - ci->i_nr_by_mode[mode]++; -} +extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); /* addr.c */ @@ -931,6 +928,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); +extern void ceph_sync_write_wait(struct inode *inode); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 4870b29df224..adc231892b0d 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -57,81 +57,88 @@ struct ceph_vxattr { static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) { - size_t s; - char *p = (char *)&ci->i_layout; - - for (s = 0; s < sizeof(ci->i_layout); s++, p++) - if (*p) - return true; - return false; + struct ceph_file_layout *fl = &ci->i_layout; + return (fl->stripe_unit > 0 || fl->stripe_count > 0 || + fl->object_size > 0 || fl->pool_id >= 0 || + rcu_dereference_raw(fl->pool_ns) != NULL); } static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, size_t size) { - int ret; struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; - s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + struct ceph_string *pool_ns; + s64 pool = ci->i_layout.pool_id; const char *pool_name; + const char *ns_field = " pool_namespace="; char buf[128]; + size_t len, total_len = 0; + int ret; + + pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { - size_t len = strlen(pool_name); - ret = snprintf(buf, sizeof(buf), - "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=", - (unsigned long long)ceph_file_layout_su(ci->i_layout), - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); - if (!size) { - ret += len; - } else if (ret + len > size) { - ret = -ERANGE; - } else { - memcpy(val, buf, ret); + len = snprintf(buf, sizeof(buf), + "stripe_unit=%u stripe_count=%u object_size=%u pool=", + ci->i_layout.stripe_unit, ci->i_layout.stripe_count, + ci->i_layout.object_size); + total_len = len + strlen(pool_name); + } else { + len = snprintf(buf, sizeof(buf), + "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld", + ci->i_layout.stripe_unit, ci->i_layout.stripe_count, + ci->i_layout.object_size, (unsigned long long)pool); + total_len = len; + } + + if (pool_ns) + total_len += strlen(ns_field) + pool_ns->len; + + if (!size) { + ret = total_len; + } else if (total_len > size) { + ret = -ERANGE; + } else { + memcpy(val, buf, len); + ret = len; + if (pool_name) { + len = strlen(pool_name); memcpy(val + ret, pool_name, len); ret += len; } - } else { - ret = snprintf(buf, sizeof(buf), - "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", - (unsigned long long)ceph_file_layout_su(ci->i_layout), - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout), - (unsigned long long)pool); - if (size) { - if (ret <= size) - memcpy(val, buf, ret); - else - ret = -ERANGE; + if (pool_ns) { + len = strlen(ns_field); + memcpy(val + ret, ns_field, len); + ret += len; + memcpy(val + ret, pool_ns->str, pool_ns->len); + ret += pool_ns->len; } } up_read(&osdc->lock); + ceph_put_string(pool_ns); return ret; } static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%lld", - (unsigned long long)ceph_file_layout_su(ci->i_layout)); + return snprintf(val, size, "%u", ci->i_layout.stripe_unit); } static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%lld", - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout)); + return snprintf(val, size, "%u", ci->i_layout.stripe_count); } static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%lld", - (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + return snprintf(val, size, "%u", ci->i_layout.object_size); } static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, @@ -140,7 +147,7 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, int ret; struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; - s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + s64 pool = ci->i_layout.pool_id; const char *pool_name; down_read(&osdc->lock); @@ -153,6 +160,18 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, return ret; } +static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci, + char *val, size_t size) +{ + int ret = 0; + struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns); + if (ns) { + ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str); + ceph_put_string(ns); + } + return ret; +} + /* directories */ static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, @@ -241,6 +260,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_LAYOUT_FIELD(dir, layout, stripe_count), XATTR_LAYOUT_FIELD(dir, layout, object_size), XATTR_LAYOUT_FIELD(dir, layout, pool), + XATTR_LAYOUT_FIELD(dir, layout, pool_namespace), XATTR_NAME_CEPH(dir, entries), XATTR_NAME_CEPH(dir, files), XATTR_NAME_CEPH(dir, subdirs), @@ -268,6 +288,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = { XATTR_LAYOUT_FIELD(file, layout, stripe_count), XATTR_LAYOUT_FIELD(file, layout, object_size), XATTR_LAYOUT_FIELD(file, layout, pool), + XATTR_LAYOUT_FIELD(file, layout, pool_namespace), { .name = NULL, 0 } /* Required table terminator */ }; static size_t ceph_file_vxattrs_name_size; /* total size of all names */ diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h index 66b99210f1f9..db6e8722a89e 100644 --- a/fs/orangefs/downcall.h +++ b/fs/orangefs/downcall.h @@ -83,7 +83,10 @@ struct orangefs_listxattr_response { }; struct orangefs_param_response { - __s64 value; + union { + __s64 value64; + __s32 value32[2]; + } u; }; #define PERF_COUNT_BUF_SIZE 4096 diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 526040e09f78..43c08b5c7168 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -14,6 +14,32 @@ #include #include +static int flush_racache(struct inode *inode) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op; + int ret; + + gossip_debug(GOSSIP_UTILS_DEBUG, + "%s: %pU: Handle is %pU | fs_id %d\n", __func__, + get_khandle_from_ino(inode), &orangefs_inode->refn.khandle, + orangefs_inode->refn.fs_id); + + new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH); + if (!new_op) + return -ENOMEM; + new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn; + + ret = service_operation(new_op, "orangefs_flush_racache", + get_interruptible_flag(inode)); + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n", + __func__, ret); + + op_release(new_op); + return ret; +} + /* * Copy to client-core's address space from the buffers specified * by the iovec upto total_size bytes. @@ -591,15 +617,21 @@ static int orangefs_file_release(struct inode *inode, struct file *file) orangefs_flush_inode(inode); /* - * remove all associated inode pages from the page cache and mmap + * remove all associated inode pages from the page cache and * readahead cache (if any); this forces an expensive refresh of * data for the next caller of mmap (or 'get_block' accesses) */ if (file->f_path.dentry->d_inode && file->f_path.dentry->d_inode->i_mapping && - mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) + mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) { + gossip_debug(GOSSIP_INODE_DEBUG, + "calling flush_racache on %pU\n", + get_khandle_from_ino(inode)); + flush_racache(inode); + gossip_debug(GOSSIP_INODE_DEBUG, "flush_racache finished\n"); truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping, 0); + } return 0; } diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c index b6edbe9fb309..eb0b6e00b519 100644 --- a/fs/orangefs/orangefs-cache.c +++ b/fs/orangefs/orangefs-cache.c @@ -73,8 +73,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op) return "OP_STATFS"; else if (type == ORANGEFS_VFS_OP_TRUNCATE) return "OP_TRUNCATE"; - else if (type == ORANGEFS_VFS_OP_MMAP_RA_FLUSH) - return "OP_MMAP_RA_FLUSH"; + else if (type == ORANGEFS_VFS_OP_RA_FLUSH) + return "OP_RA_FLUSH"; else if (type == ORANGEFS_VFS_OP_FS_MOUNT) return "OP_FS_MOUNT"; else if (type == ORANGEFS_VFS_OP_FS_UMOUNT) diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h index 9eac9d9a3f3a..71902899b1da 100644 --- a/fs/orangefs/orangefs-dev-proto.h +++ b/fs/orangefs/orangefs-dev-proto.h @@ -28,7 +28,7 @@ #define ORANGEFS_VFS_OP_RENAME 0xFF00000A #define ORANGEFS_VFS_OP_STATFS 0xFF00000B #define ORANGEFS_VFS_OP_TRUNCATE 0xFF00000C -#define ORANGEFS_VFS_OP_MMAP_RA_FLUSH 0xFF00000D +#define ORANGEFS_VFS_OP_RA_FLUSH 0xFF00000D #define ORANGEFS_VFS_OP_FS_MOUNT 0xFF00000E #define ORANGEFS_VFS_OP_FS_UMOUNT 0xFF00000F #define ORANGEFS_VFS_OP_GETXATTR 0xFF000010 diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index 375708c2db87..2fe9a3a2117b 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -73,6 +73,24 @@ * Description: * Time getattr is valid in milliseconds. * + * What: /sys/fs/orangefs/readahead_count + * Date: Aug 2016 + * Contact: Martin Brandenburg + * Description: + * Readahead cache buffer count. + * + * What: /sys/fs/orangefs/readahead_size + * Date: Aug 2016 + * Contact: Martin Brandenburg + * Description: + * Readahead cache buffer size. + * + * What: /sys/fs/orangefs/readahead_count_size + * Date: Aug 2016 + * Contact: Martin Brandenburg + * Description: + * Readahead cache buffer count and size. + * * What: /sys/fs/orangefs/acache/... * Date: Jun 2015 * Contact: Martin Brandenburg @@ -836,6 +854,20 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_RESET; + else if (!strcmp(orangefs_attr->attr.name, + "readahead_count")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT; + + else if (!strcmp(orangefs_attr->attr.name, + "readahead_size")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE; + + else if (!strcmp(orangefs_attr->attr.name, + "readahead_count_size")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) { acache_attr = (struct acache_orangefs_attribute *)attr; @@ -949,10 +981,17 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr) out: if (!rc) { if (strcmp(kobj_id, PC_KOBJ_ID)) { - rc = scnprintf(buf, - PAGE_SIZE, - "%d\n", - (int)new_op->downcall.resp.param.value); + if (new_op->upcall.req.param.op == + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) { + rc = scnprintf(buf, PAGE_SIZE, "%d %d\n", + (int)new_op->downcall.resp.param.u. + value32[0], + (int)new_op->downcall.resp.param.u. + value32[1]); + } else { + rc = scnprintf(buf, PAGE_SIZE, "%d\n", + (int)new_op->downcall.resp.param.u.value64); + } } else { rc = scnprintf( buf, @@ -1079,11 +1118,18 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) } /* - * The value we want to send back to userspace is in buf. + * The value we want to send back to userspace is in buf, unless this + * there are two parameters, which is specially handled below. */ - rc = kstrtoint(buf, 0, &val); - if (rc) - goto out; + if (strcmp(kobj_id, ORANGEFS_KOBJ_ID) || + strcmp(((struct orangefs_attribute *)attr)->attr.name, + "readahead_count_size")) { + rc = kstrtoint(buf, 0, &val); + if (rc) + goto out; + } + + new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET; if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) { orangefs_attr = (struct orangefs_attribute *)attr; @@ -1114,6 +1160,51 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } + } else if (!strcmp(orangefs_attr->attr.name, + "readahead_count")) { + if ((val >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(orangefs_attr->attr.name, + "readahead_size")) { + if ((val >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(orangefs_attr->attr.name, + "readahead_count_size")) { + int val1, val2; + rc = sscanf(buf, "%d %d", &val1, &val2); + if (rc < 2) { + rc = 0; + goto out; + } + if ((val1 >= 0) && (val2 >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; + } else { + rc = 0; + goto out; + } + new_op->upcall.req.param.u.value32[0] = val1; + new_op->upcall.req.param.u.value32[1] = val2; + goto value_set; + } else if (!strcmp(orangefs_attr->attr.name, + "perf_counter_reset")) { + if ((val > 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; + } else { + rc = 0; + goto out; + } } } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) { @@ -1275,9 +1366,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) goto out; } - new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET; - - new_op->upcall.req.param.value = val; + new_op->upcall.req.param.u.value64 = val; +value_set: /* * The service_operation will return a errno return code on @@ -1400,6 +1490,18 @@ static struct orangefs_attribute dcache_timeout_msecs_attribute = static struct orangefs_attribute getattr_timeout_msecs_attribute = __ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store); +static struct orangefs_attribute readahead_count_attribute = + __ATTR(readahead_count, 0664, service_orangefs_show, + service_orangefs_store); + +static struct orangefs_attribute readahead_size_attribute = + __ATTR(readahead_size, 0664, service_orangefs_show, + service_orangefs_store); + +static struct orangefs_attribute readahead_count_size_attribute = + __ATTR(readahead_count_size, 0664, service_orangefs_show, + service_orangefs_store); + static struct orangefs_attribute perf_counter_reset_attribute = __ATTR(perf_counter_reset, 0664, @@ -1423,6 +1525,9 @@ static struct attribute *orangefs_default_attrs[] = { &slot_timeout_secs_attribute.attr, &dcache_timeout_msecs_attribute.attr, &getattr_timeout_msecs_attribute.attr, + &readahead_count_attribute.attr, + &readahead_size_attribute.attr, + &readahead_count_size_attribute.attr, &perf_counter_reset_attribute.attr, &perf_history_size_attribute.attr, &perf_time_interval_secs_attribute.attr, diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index d13c7291fd05..9a99285fe310 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -50,7 +50,7 @@ __s32 fsid_of_op(struct orangefs_kernel_op_s *op) case ORANGEFS_VFS_OP_TRUNCATE: fsid = op->upcall.req.truncate.refn.fs_id; break; - case ORANGEFS_VFS_OP_MMAP_RA_FLUSH: + case ORANGEFS_VFS_OP_RA_FLUSH: fsid = op->upcall.req.ra_cache_flush.refn.fs_id; break; case ORANGEFS_VFS_OP_FS_UMOUNT: diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h index 001b20239407..7c29fdf08ec5 100644 --- a/fs/orangefs/upcall.h +++ b/fs/orangefs/upcall.h @@ -98,7 +98,7 @@ struct orangefs_truncate_request_s { __s64 size; }; -struct orangefs_mmap_ra_cache_flush_request_s { +struct orangefs_ra_cache_flush_request_s { struct orangefs_object_kref refn; }; @@ -179,12 +179,18 @@ enum orangefs_param_request_op { ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT = 23, ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE = 24, ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES = 25, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE = 26, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT = 27, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE = 28, }; struct orangefs_param_request_s { enum orangefs_param_request_type type; enum orangefs_param_request_op op; - __s64 value; + union { + __s64 value64; + __s32 value32[2]; + } u; char s_value[ORANGEFS_MAX_DEBUG_STRING_LEN]; }; @@ -228,7 +234,7 @@ struct orangefs_upcall_s { struct orangefs_rename_request_s rename; struct orangefs_statfs_request_s statfs; struct orangefs_truncate_request_s truncate; - struct orangefs_mmap_ra_cache_flush_request_s ra_cache_flush; + struct orangefs_ra_cache_flush_request_s ra_cache_flush; struct orangefs_fs_mount_request_s fs_mount; struct orangefs_fs_umount_request_s fs_umount; struct orangefs_getxattr_request_s getxattr; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 54643d1f5af4..24563970ff7b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -164,7 +164,7 @@ #define ___OF_TABLE(cfg, name) _OF_TABLE_##cfg(name) #define __OF_TABLE(cfg, name) ___OF_TABLE(cfg, name) -#define OF_TABLE(cfg, name) __OF_TABLE(config_enabled(cfg), name) +#define OF_TABLE(cfg, name) __OF_TABLE(IS_ENABLED(cfg), name) #define _OF_TABLE_0(name) #define _OF_TABLE_1(name) \ . = ALIGN(8); \ diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h index 72dee1213268..63b8bd502444 100644 --- a/include/drm/drm_dp_helper.h +++ b/include/drm/drm_dp_helper.h @@ -657,6 +657,8 @@ struct edp_vsc_psr { #define EDP_VSC_PSR_UPDATE_RFB (1<<1) #define EDP_VSC_PSR_CRC_VALUES_VALID (1<<2) +int drm_dp_psr_setup_time(const u8 psr_cap[EDP_PSR_RECEIVER_CAP_SIZE]); + static inline int drm_dp_max_link_rate(const u8 dpcd[DP_RECEIVER_CAP_SIZE]) { diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index da0a524802cb..540da5149ba7 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -1,6 +1,5 @@ /* - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier + * Copyright (C) 2015, 2016 ARM Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -12,16 +11,10 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program. If not, see . */ - -#ifndef __ASM_ARM_KVM_VGIC_H -#define __ASM_ARM_KVM_VGIC_H - -#ifdef CONFIG_KVM_NEW_VGIC -#include -#else +#ifndef __KVM_ARM_VGIC_H +#define __KVM_ARM_VGIC_H #include #include @@ -29,248 +22,187 @@ #include #include #include -#include +#include -#define VGIC_NR_IRQS_LEGACY 256 +#define VGIC_V3_MAX_CPUS 255 +#define VGIC_V2_MAX_CPUS 8 +#define VGIC_NR_IRQS_LEGACY 256 #define VGIC_NR_SGIS 16 #define VGIC_NR_PPIS 16 #define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS) - -#define VGIC_V2_MAX_LRS (1 << 6) -#define VGIC_V3_MAX_LRS 16 -#define VGIC_MAX_IRQS 1024 -#define VGIC_V2_MAX_CPUS 8 -#define VGIC_V3_MAX_CPUS 255 - -#if (VGIC_NR_IRQS_LEGACY & 31) -#error "VGIC_NR_IRQS must be a multiple of 32" -#endif - -#if (VGIC_NR_IRQS_LEGACY > VGIC_MAX_IRQS) -#error "VGIC_NR_IRQS must be <= 1024" -#endif - -/* - * The GIC distributor registers describing interrupts have two parts: - * - 32 per-CPU interrupts (SGI + PPI) - * - a bunch of shared interrupts (SPI) - */ -struct vgic_bitmap { - /* - * - One UL per VCPU for private interrupts (assumes UL is at - * least 32 bits) - * - As many UL as necessary for shared interrupts. - * - * The private interrupts are accessed via the "private" - * field, one UL per vcpu (the state for vcpu n is in - * private[n]). The shared interrupts are accessed via the - * "shared" pointer (IRQn state is at bit n-32 in the bitmap). - */ - unsigned long *private; - unsigned long *shared; -}; - -struct vgic_bytemap { - /* - * - 8 u32 per VCPU for private interrupts - * - As many u32 as necessary for shared interrupts. - * - * The private interrupts are accessed via the "private" - * field, (the state for vcpu n is in private[n*8] to - * private[n*8 + 7]). The shared interrupts are accessed via - * the "shared" pointer (IRQn state is at byte (n-32)%4 of the - * shared[(n-32)/4] word). - */ - u32 *private; - u32 *shared; -}; - -struct kvm_vcpu; +#define VGIC_MAX_PRIVATE (VGIC_NR_PRIVATE_IRQS - 1) +#define VGIC_MAX_SPI 1019 +#define VGIC_MAX_RESERVED 1023 +#define VGIC_MIN_LPI 8192 enum vgic_type { VGIC_V2, /* Good ol' GICv2 */ VGIC_V3, /* New fancy GICv3 */ }; -#define LR_STATE_PENDING (1 << 0) -#define LR_STATE_ACTIVE (1 << 1) -#define LR_STATE_MASK (3 << 0) -#define LR_EOI_INT (1 << 2) -#define LR_HW (1 << 3) +/* same for all guests, as depending only on the _host's_ GIC model */ +struct vgic_global { + /* type of the host GIC */ + enum vgic_type type; -struct vgic_lr { - unsigned irq:10; - union { - unsigned hwirq:10; - unsigned source:3; - }; - unsigned state:4; -}; - -struct vgic_vmcr { - u32 ctlr; - u32 abpr; - u32 bpr; - u32 pmr; -}; - -struct vgic_ops { - struct vgic_lr (*get_lr)(const struct kvm_vcpu *, int); - void (*set_lr)(struct kvm_vcpu *, int, struct vgic_lr); - u64 (*get_elrsr)(const struct kvm_vcpu *vcpu); - u64 (*get_eisr)(const struct kvm_vcpu *vcpu); - void (*clear_eisr)(struct kvm_vcpu *vcpu); - u32 (*get_interrupt_status)(const struct kvm_vcpu *vcpu); - void (*enable_underflow)(struct kvm_vcpu *vcpu); - void (*disable_underflow)(struct kvm_vcpu *vcpu); - void (*get_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); - void (*set_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); - void (*enable)(struct kvm_vcpu *vcpu); -}; - -struct vgic_params { - /* vgic type */ - enum vgic_type type; /* Physical address of vgic virtual cpu interface */ - phys_addr_t vcpu_base; - /* Number of list registers */ - u32 nr_lr; - /* Interrupt number */ - unsigned int maint_irq; - /* Virtual control interface base address */ - void __iomem *vctrl_base; - int max_gic_vcpus; + phys_addr_t vcpu_base; + + /* virtual control interface mapping */ + void __iomem *vctrl_base; + + /* Number of implemented list registers */ + int nr_lr; + + /* Maintenance IRQ number */ + unsigned int maint_irq; + + /* maximum number of VCPUs allowed (GICv2 limits us to 8) */ + int max_gic_vcpus; + /* Only needed for the legacy KVM_CREATE_IRQCHIP */ - bool can_emulate_gicv2; + bool can_emulate_gicv2; }; -struct vgic_vm_ops { - bool (*queue_sgi)(struct kvm_vcpu *, int irq); - void (*add_sgi_source)(struct kvm_vcpu *, int irq, int source); - int (*init_model)(struct kvm *); - int (*map_resources)(struct kvm *, const struct vgic_params *); +extern struct vgic_global kvm_vgic_global_state; + +#define VGIC_V2_MAX_LRS (1 << 6) +#define VGIC_V3_MAX_LRS 16 +#define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr) + +enum vgic_irq_config { + VGIC_CONFIG_EDGE = 0, + VGIC_CONFIG_LEVEL +}; + +struct vgic_irq { + spinlock_t irq_lock; /* Protects the content of the struct */ + struct list_head lpi_list; /* Used to link all LPIs together */ + struct list_head ap_list; + + struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU + * SPIs and LPIs: The VCPU whose ap_list + * this is queued on. + */ + + struct kvm_vcpu *target_vcpu; /* The VCPU that this interrupt should + * be sent to, as a result of the + * targets reg (v2) or the + * affinity reg (v3). + */ + + u32 intid; /* Guest visible INTID */ + bool pending; + bool line_level; /* Level only */ + bool soft_pending; /* Level only */ + bool active; /* not used for LPIs */ + bool enabled; + bool hw; /* Tied to HW IRQ */ + struct kref refcount; /* Used for LPIs */ + u32 hwintid; /* HW INTID number */ + union { + u8 targets; /* GICv2 target VCPUs mask */ + u32 mpidr; /* GICv3 target VCPU */ + }; + u8 source; /* GICv2 SGIs only */ + u8 priority; + enum vgic_irq_config config; /* Level or edge */ +}; + +struct vgic_register_region; +struct vgic_its; + +enum iodev_type { + IODEV_CPUIF, + IODEV_DIST, + IODEV_REDIST, + IODEV_ITS }; struct vgic_io_device { - gpa_t addr; - int len; - const struct vgic_io_range *reg_ranges; - struct kvm_vcpu *redist_vcpu; + gpa_t base_addr; + union { + struct kvm_vcpu *redist_vcpu; + struct vgic_its *its; + }; + const struct vgic_register_region *regions; + enum iodev_type iodev_type; + int nr_regions; struct kvm_io_device dev; }; -struct irq_phys_map { - u32 virt_irq; - u32 phys_irq; -}; +struct vgic_its { + /* The base address of the ITS control register frame */ + gpa_t vgic_its_base; -struct irq_phys_map_entry { - struct list_head entry; - struct rcu_head rcu; - struct irq_phys_map map; + bool enabled; + bool initialized; + struct vgic_io_device iodev; + struct kvm_device *dev; + + /* These registers correspond to GITS_BASER{0,1} */ + u64 baser_device_table; + u64 baser_coll_table; + + /* Protects the command queue */ + struct mutex cmd_lock; + u64 cbaser; + u32 creadr; + u32 cwriter; + + /* Protects the device and collection lists */ + struct mutex its_lock; + struct list_head device_list; + struct list_head collection_list; }; struct vgic_dist { - spinlock_t lock; bool in_kernel; bool ready; + bool initialized; /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */ u32 vgic_model; - int nr_cpus; - int nr_irqs; + /* Do injected MSIs require an additional device ID? */ + bool msis_require_devid; + int nr_spis; + + /* TODO: Consider moving to global state */ /* Virtual control interface mapping */ void __iomem *vctrl_base; - /* Distributor and vcpu interface mapping in the guest */ - phys_addr_t vgic_dist_base; - /* GICv2 and GICv3 use different mapped register blocks */ + /* base addresses in guest physical address space: */ + gpa_t vgic_dist_base; /* distributor */ union { - phys_addr_t vgic_cpu_base; - phys_addr_t vgic_redist_base; + /* either a GICv2 CPU interface */ + gpa_t vgic_cpu_base; + /* or a number of GICv3 redistributor regions */ + gpa_t vgic_redist_base; }; - /* Distributor enabled */ - u32 enabled; + /* distributor enabled */ + bool enabled; - /* Interrupt enabled (one bit per IRQ) */ - struct vgic_bitmap irq_enabled; + struct vgic_irq *spis; - /* Level-triggered interrupt external input is asserted */ - struct vgic_bitmap irq_level; - - /* - * Interrupt state is pending on the distributor - */ - struct vgic_bitmap irq_pending; - - /* - * Tracks writes to GICD_ISPENDRn and GICD_ICPENDRn for level-triggered - * interrupts. Essentially holds the state of the flip-flop in - * Figure 4-10 on page 4-101 in ARM IHI 0048B.b. - * Once set, it is only cleared for level-triggered interrupts on - * guest ACKs (when we queue it) or writes to GICD_ICPENDRn. - */ - struct vgic_bitmap irq_soft_pend; - - /* Level-triggered interrupt queued on VCPU interface */ - struct vgic_bitmap irq_queued; - - /* Interrupt was active when unqueue from VCPU interface */ - struct vgic_bitmap irq_active; - - /* Interrupt priority. Not used yet. */ - struct vgic_bytemap irq_priority; - - /* Level/edge triggered */ - struct vgic_bitmap irq_cfg; - - /* - * Source CPU per SGI and target CPU: - * - * Each byte represent a SGI observable on a VCPU, each bit of - * this byte indicating if the corresponding VCPU has - * generated this interrupt. This is a GICv2 feature only. - * - * For VCPUn (n < 8), irq_sgi_sources[n*16] to [n*16 + 15] are - * the SGIs observable on VCPUn. - */ - u8 *irq_sgi_sources; - - /* - * Target CPU for each SPI: - * - * Array of available SPI, each byte indicating the target - * VCPU for SPI. IRQn (n >=32) is at irq_spi_cpu[n-32]. - */ - u8 *irq_spi_cpu; - - /* - * Reverse lookup of irq_spi_cpu for faster compute pending: - * - * Array of bitmaps, one per VCPU, describing if IRQn is - * routed to a particular VCPU. - */ - struct vgic_bitmap *irq_spi_target; - - /* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */ - u32 *irq_spi_mpidr; - - /* Bitmap indicating which CPU has something pending */ - unsigned long *irq_pending_on_cpu; - - /* Bitmap indicating which CPU has active IRQs */ - unsigned long *irq_active_on_cpu; - - struct vgic_vm_ops vm_ops; struct vgic_io_device dist_iodev; - struct vgic_io_device *redist_iodevs; - /* Virtual irq to hwirq mapping */ - spinlock_t irq_phys_map_lock; - struct list_head irq_phys_map_list; + bool has_its; + + /* + * Contains the attributes and gpa of the LPI configuration table. + * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share + * one address across all redistributors. + * GICv3 spec: 6.1.2 "LPI Configuration tables" + */ + u64 propbaser; + + /* Protects the lpi_list and the count value below. */ + spinlock_t lpi_list_lock; + struct list_head lpi_list_head; + int lpi_list_count; }; struct vgic_v2_cpu_if { @@ -298,78 +230,88 @@ struct vgic_v3_cpu_if { }; struct vgic_cpu { - /* Pending/active/both interrupts on this VCPU */ - DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS); - DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS); - DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS); - - /* Pending/active/both shared interrupts, dynamically sized */ - unsigned long *pending_shared; - unsigned long *active_shared; - unsigned long *pend_act_shared; - /* CPU vif control registers for world switch */ union { struct vgic_v2_cpu_if vgic_v2; struct vgic_v3_cpu_if vgic_v3; }; - /* Protected by the distributor's irq_phys_map_lock */ - struct list_head irq_phys_map_list; + unsigned int used_lrs; + struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS]; - u64 live_lrs; + spinlock_t ap_list_lock; /* Protects the ap_list */ + + /* + * List of IRQs that this VCPU should consider because they are either + * Active or Pending (hence the name; AP list), or because they recently + * were one of the two and need to be migrated off this list to another + * VCPU. + */ + struct list_head ap_list_head; + + u64 live_lrs; + + /* + * Members below are used with GICv3 emulation only and represent + * parts of the redistributor. + */ + struct vgic_io_device rd_iodev; + struct vgic_io_device sgi_iodev; + + /* Contains the attributes and gpa of the LPI pending tables. */ + u64 pendbaser; + + bool lpis_enabled; }; -#define LR_EMPTY 0xff - -#define INT_STATUS_EOI (1 << 0) -#define INT_STATUS_UNDERFLOW (1 << 1) - -struct kvm; -struct kvm_vcpu; - int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); -int kvm_vgic_hyp_init(void); -int kvm_vgic_map_resources(struct kvm *kvm); -int kvm_vgic_get_max_vcpus(void); void kvm_vgic_early_init(struct kvm *kvm); int kvm_vgic_create(struct kvm *kvm, u32 type); void kvm_vgic_destroy(struct kvm *kvm); void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu); void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); -void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); -void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, +int kvm_vgic_map_resources(struct kvm *kvm); +int kvm_vgic_hyp_init(void); + +int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, bool level); -int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, - unsigned int virt_irq, bool level); -void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); -int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); -int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq); +int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid, + bool level); +int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq); int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq); bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq); +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); + #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) -#define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus)) +#define vgic_initialized(k) ((k)->arch.vgic.initialized) #define vgic_ready(k) ((k)->arch.vgic.ready) #define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \ - ((i) < (k)->arch.vgic.nr_irqs)) + ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) + +bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); +void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); +void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); -int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info, - const struct vgic_ops **ops, - const struct vgic_params **params); #ifdef CONFIG_KVM_ARM_VGIC_V3 -int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, - const struct vgic_ops **ops, - const struct vgic_params **params); +void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); #else -static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, - const struct vgic_ops **ops, - const struct vgic_params **params) +static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) { - return -ENODEV; } #endif -#endif /* old VGIC include */ -#endif +/** + * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW + * + * The host's GIC naturally limits the maximum amount of VCPUs a guest + * can use. + */ +static inline int kvm_vgic_get_max_vcpus(void) +{ + return kvm_vgic_global_state.max_gic_vcpus; +} + +int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); + +#endif /* __KVM_ARM_VGIC_H */ diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h deleted file mode 100644 index 3fbd175265ae..000000000000 --- a/include/kvm/vgic/vgic.h +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ -#ifndef __ASM_ARM_KVM_VGIC_VGIC_H -#define __ASM_ARM_KVM_VGIC_VGIC_H - -#include -#include -#include -#include -#include -#include - -#define VGIC_V3_MAX_CPUS 255 -#define VGIC_V2_MAX_CPUS 8 -#define VGIC_NR_IRQS_LEGACY 256 -#define VGIC_NR_SGIS 16 -#define VGIC_NR_PPIS 16 -#define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS) -#define VGIC_MAX_PRIVATE (VGIC_NR_PRIVATE_IRQS - 1) -#define VGIC_MAX_SPI 1019 -#define VGIC_MAX_RESERVED 1023 -#define VGIC_MIN_LPI 8192 - -enum vgic_type { - VGIC_V2, /* Good ol' GICv2 */ - VGIC_V3, /* New fancy GICv3 */ -}; - -/* same for all guests, as depending only on the _host's_ GIC model */ -struct vgic_global { - /* type of the host GIC */ - enum vgic_type type; - - /* Physical address of vgic virtual cpu interface */ - phys_addr_t vcpu_base; - - /* virtual control interface mapping */ - void __iomem *vctrl_base; - - /* Number of implemented list registers */ - int nr_lr; - - /* Maintenance IRQ number */ - unsigned int maint_irq; - - /* maximum number of VCPUs allowed (GICv2 limits us to 8) */ - int max_gic_vcpus; - - /* Only needed for the legacy KVM_CREATE_IRQCHIP */ - bool can_emulate_gicv2; -}; - -extern struct vgic_global kvm_vgic_global_state; - -#define VGIC_V2_MAX_LRS (1 << 6) -#define VGIC_V3_MAX_LRS 16 -#define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr) - -enum vgic_irq_config { - VGIC_CONFIG_EDGE = 0, - VGIC_CONFIG_LEVEL -}; - -struct vgic_irq { - spinlock_t irq_lock; /* Protects the content of the struct */ - struct list_head ap_list; - - struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU - * SPIs and LPIs: The VCPU whose ap_list - * this is queued on. - */ - - struct kvm_vcpu *target_vcpu; /* The VCPU that this interrupt should - * be sent to, as a result of the - * targets reg (v2) or the - * affinity reg (v3). - */ - - u32 intid; /* Guest visible INTID */ - bool pending; - bool line_level; /* Level only */ - bool soft_pending; /* Level only */ - bool active; /* not used for LPIs */ - bool enabled; - bool hw; /* Tied to HW IRQ */ - u32 hwintid; /* HW INTID number */ - union { - u8 targets; /* GICv2 target VCPUs mask */ - u32 mpidr; /* GICv3 target VCPU */ - }; - u8 source; /* GICv2 SGIs only */ - u8 priority; - enum vgic_irq_config config; /* Level or edge */ -}; - -struct vgic_register_region; - -struct vgic_io_device { - gpa_t base_addr; - struct kvm_vcpu *redist_vcpu; - const struct vgic_register_region *regions; - int nr_regions; - struct kvm_io_device dev; -}; - -struct vgic_dist { - bool in_kernel; - bool ready; - bool initialized; - - /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */ - u32 vgic_model; - - int nr_spis; - - /* TODO: Consider moving to global state */ - /* Virtual control interface mapping */ - void __iomem *vctrl_base; - - /* base addresses in guest physical address space: */ - gpa_t vgic_dist_base; /* distributor */ - union { - /* either a GICv2 CPU interface */ - gpa_t vgic_cpu_base; - /* or a number of GICv3 redistributor regions */ - gpa_t vgic_redist_base; - }; - - /* distributor enabled */ - bool enabled; - - struct vgic_irq *spis; - - struct vgic_io_device dist_iodev; - struct vgic_io_device *redist_iodevs; -}; - -struct vgic_v2_cpu_if { - u32 vgic_hcr; - u32 vgic_vmcr; - u32 vgic_misr; /* Saved only */ - u64 vgic_eisr; /* Saved only */ - u64 vgic_elrsr; /* Saved only */ - u32 vgic_apr; - u32 vgic_lr[VGIC_V2_MAX_LRS]; -}; - -struct vgic_v3_cpu_if { -#ifdef CONFIG_KVM_ARM_VGIC_V3 - u32 vgic_hcr; - u32 vgic_vmcr; - u32 vgic_sre; /* Restored only, change ignored */ - u32 vgic_misr; /* Saved only */ - u32 vgic_eisr; /* Saved only */ - u32 vgic_elrsr; /* Saved only */ - u32 vgic_ap0r[4]; - u32 vgic_ap1r[4]; - u64 vgic_lr[VGIC_V3_MAX_LRS]; -#endif -}; - -struct vgic_cpu { - /* CPU vif control registers for world switch */ - union { - struct vgic_v2_cpu_if vgic_v2; - struct vgic_v3_cpu_if vgic_v3; - }; - - unsigned int used_lrs; - struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS]; - - spinlock_t ap_list_lock; /* Protects the ap_list */ - - /* - * List of IRQs that this VCPU should consider because they are either - * Active or Pending (hence the name; AP list), or because they recently - * were one of the two and need to be migrated off this list to another - * VCPU. - */ - struct list_head ap_list_head; - - u64 live_lrs; -}; - -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); -void kvm_vgic_early_init(struct kvm *kvm); -int kvm_vgic_create(struct kvm *kvm, u32 type); -void kvm_vgic_destroy(struct kvm *kvm); -void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu); -void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); -int kvm_vgic_map_resources(struct kvm *kvm); -int kvm_vgic_hyp_init(void); - -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level); -int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level); -int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq); -int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq); -bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq); - -int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); - -#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) -#define vgic_initialized(k) ((k)->arch.vgic.initialized) -#define vgic_ready(k) ((k)->arch.vgic.ready) -#define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \ - ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) - -bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); -void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); -void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); - -#ifdef CONFIG_KVM_ARM_VGIC_V3 -void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); -#else -static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) -{ -} -#endif - -/** - * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW - * - * The host's GIC naturally limits the maximum amount of VCPUs a guest - * can use. - */ -static inline int kvm_vgic_get_max_vcpus(void) -{ - return kvm_vgic_global_state.max_gic_vcpus; -} - -#endif /* __ASM_ARM_KVM_VGIC_VGIC_H */ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index dfce616002ad..7868d602c0a0 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -34,9 +34,9 @@ #define CEPH_MAX_MON 31 /* - * ceph_file_layout - describe data layout for a file/inode + * legacy ceph_file_layoute */ -struct ceph_file_layout { +struct ceph_file_layout_legacy { /* file -> object mapping */ __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple of page size. */ @@ -53,33 +53,27 @@ struct ceph_file_layout { __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ } __attribute__ ((packed)); -#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) -#define ceph_file_layout_stripe_count(l) \ - ((__s32)le32_to_cpu((l).fl_stripe_count)) -#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) -#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) -#define ceph_file_layout_object_su(l) \ - ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) -#define ceph_file_layout_pg_pool(l) \ - ((__s32)le32_to_cpu((l).fl_pg_pool)) +struct ceph_string; +/* + * ceph_file_layout - describe data layout for a file/inode + */ +struct ceph_file_layout { + /* file -> object mapping */ + u32 stripe_unit; /* stripe unit, in bytes */ + u32 stripe_count; /* over this many objects */ + u32 object_size; /* until objects are this big */ + s64 pool_id; /* rados pool id */ + struct ceph_string __rcu *pool_ns; /* rados pool namespace */ +}; -static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) -{ - return le32_to_cpu(l->fl_stripe_unit) * - le32_to_cpu(l->fl_stripe_count); -} - -/* "period" == bytes before i start on a new set of objects */ -static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) -{ - return le32_to_cpu(l->fl_object_size) * - le32_to_cpu(l->fl_stripe_count); -} +extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); +extern void ceph_file_layout_from_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy); +extern void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy); #define CEPH_MIN_STRIPE_UNIT 65536 -int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); - struct ceph_dir_layout { __u8 dl_dir_hash; /* see ceph_hash.h for ids */ __u8 dl_unused1; @@ -127,6 +121,7 @@ struct ceph_dir_layout { /* client <-> mds */ #define CEPH_MSG_MDS_MAP 21 +#define CEPH_MSG_FS_MAP_USER 103 #define CEPH_MSG_CLIENT_SESSION 22 #define CEPH_MSG_CLIENT_RECONNECT 23 @@ -399,7 +394,7 @@ union ceph_mds_request_args { __le32 flags; } __attribute__ ((packed)) setxattr; struct { - struct ceph_file_layout layout; + struct ceph_file_layout_legacy layout; } __attribute__ ((packed)) setlayout; struct { __u8 rule; /* currently fcntl or flock */ @@ -478,7 +473,7 @@ struct ceph_mds_reply_inode { __le64 version; /* inode version */ __le64 xattr_version; /* version for xattr blob */ struct ceph_mds_reply_cap cap; /* caps issued for this inode */ - struct ceph_file_layout layout; + struct ceph_file_layout_legacy layout; struct ceph_timespec ctime, mtime, atime; __le32 time_warp_seq; __le64 size, max_size, truncate_size; @@ -531,7 +526,7 @@ struct ceph_filelock { #define CEPH_FILE_MODE_WR 2 #define CEPH_FILE_MODE_RDWR 3 /* RD | WR */ #define CEPH_FILE_MODE_LAZY 4 /* lazy io */ -#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */ +#define CEPH_FILE_MODE_BITS 4 int ceph_flags_to_mode(int flags); @@ -673,7 +668,7 @@ struct ceph_mds_caps { __le64 size, max_size, truncate_size; __le32 truncate_seq; struct ceph_timespec mtime, atime, ctime; - struct ceph_file_layout layout; + struct ceph_file_layout_legacy layout; __le32 time_warp_seq; } __attribute__ ((packed)); diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 19e9932f3e77..f990f2cc907a 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -217,6 +218,60 @@ static inline void ceph_encode_string(void **p, void *end, *p += len; } +/* + * version and length starting block encoders/decoders + */ + +/* current code version (u8) + compat code version (u8) + len of struct (u32) */ +#define CEPH_ENCODING_START_BLK_LEN 6 + +/** + * ceph_start_encoding - start encoding block + * @struct_v: current (code) version of the encoding + * @struct_compat: oldest code version that can decode it + * @struct_len: length of struct encoding + */ +static inline void ceph_start_encoding(void **p, u8 struct_v, u8 struct_compat, + u32 struct_len) +{ + ceph_encode_8(p, struct_v); + ceph_encode_8(p, struct_compat); + ceph_encode_32(p, struct_len); +} + +/** + * ceph_start_decoding - start decoding block + * @v: current version of the encoding that the code supports + * @name: name of the struct (free-form) + * @struct_v: out param for the encoding version + * @struct_len: out param for the length of struct encoding + * + * Validates the length of struct encoding, so unsafe ceph_decode_* + * variants can be used for decoding. + */ +static inline int ceph_start_decoding(void **p, void *end, u8 v, + const char *name, u8 *struct_v, + u32 *struct_len) +{ + u8 struct_compat; + + ceph_decode_need(p, end, CEPH_ENCODING_START_BLK_LEN, bad); + *struct_v = ceph_decode_8(p); + struct_compat = ceph_decode_8(p); + if (v < struct_compat) { + pr_warn("got struct_v %d struct_compat %d > %d of %s\n", + *struct_v, struct_compat, v, name); + return -EINVAL; + } + + *struct_len = ceph_decode_32(p); + ceph_decode_need(p, end, *struct_len, bad); + return 0; + +bad: + return -ERANGE; +} + #define ceph_encode_need(p, end, n, bad) \ do { \ if (!likely(ceph_has_room(p, end, n))) \ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 690985daad1c..83fc1fff7061 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -21,6 +21,7 @@ #include #include #include +#include /* * mount options @@ -214,8 +215,9 @@ static void erase_##name(struct rb_root *root, type *t) \ } #define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ +extern type __lookup_##name##_key; \ static type *lookup_##name(struct rb_root *root, \ - typeof(((type *)0)->keyfld) key) \ + typeof(__lookup_##name##_key.keyfld) key) \ { \ struct rb_node *n = root->rb_node; \ \ diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index e2a92df08b47..24d704d1ea5c 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -95,7 +95,7 @@ struct ceph_mon_client { struct ceph_mon_subscribe_item item; bool want; u32 have; /* epoch */ - } subs[3]; + } subs[4]; int fs_cluster_id; /* "mdsmap." sub */ #ifdef CONFIG_DEBUG_FS @@ -111,9 +111,10 @@ extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); extern void ceph_monc_stop(struct ceph_mon_client *monc); enum { - CEPH_SUB_MDSMAP = 0, - CEPH_SUB_MONMAP, + CEPH_SUB_MONMAP = 0, CEPH_SUB_OSDMAP, + CEPH_SUB_FSMAP, + CEPH_SUB_MDSMAP, }; extern const char *ceph_sub_str[]; diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h index 4b0d38960726..ddd0d48d0384 100644 --- a/include/linux/ceph/msgpool.h +++ b/include/linux/ceph/msgpool.h @@ -2,7 +2,6 @@ #define _FS_CEPH_MSGPOOL #include -#include /* * we use memory pools for preallocating messages we may receive, to diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 1b3b6e155392..858932304260 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 9ccf4dbe55f8..9a9041784dcf 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -63,11 +63,13 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) struct ceph_object_locator { s64 pool; + struct ceph_string *pool_ns; }; static inline void ceph_oloc_init(struct ceph_object_locator *oloc) { oloc->pool = -1; + oloc->pool_ns = NULL; } static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) @@ -75,11 +77,9 @@ static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) return oloc->pool == -1; } -static inline void ceph_oloc_copy(struct ceph_object_locator *dest, - const struct ceph_object_locator *src) -{ - dest->pool = src->pool; -} +void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src); +void ceph_oloc_destroy(struct ceph_object_locator *oloc); /* * Maximum supported by kernel client object name length @@ -115,6 +115,11 @@ static inline void ceph_oid_init(struct ceph_object_id *oid) oid->name_len = 0; } +#define CEPH_OID_INIT_ONSTACK(oid) \ + ({ ceph_oid_init(&oid); oid; }) +#define CEPH_DEFINE_OID_ONSTACK(oid) \ + struct ceph_object_id oid = CEPH_OID_INIT_ONSTACK(oid) + static inline bool ceph_oid_empty(const struct ceph_object_id *oid) { return oid->name == oid->inline_name && !oid->name_len; diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h new file mode 100644 index 000000000000..1b02c96daf75 --- /dev/null +++ b/include/linux/ceph/string_table.h @@ -0,0 +1,62 @@ +#ifndef _FS_CEPH_STRING_TABLE_H +#define _FS_CEPH_STRING_TABLE_H + +#include +#include +#include +#include + +struct ceph_string { + struct kref kref; + union { + struct rb_node node; + struct rcu_head rcu; + }; + size_t len; + char str[]; +}; + +extern void ceph_release_string(struct kref *ref); +extern struct ceph_string *ceph_find_or_create_string(const char *str, + size_t len); +extern bool ceph_strings_empty(void); + +static inline struct ceph_string *ceph_get_string(struct ceph_string *str) +{ + kref_get(&str->kref); + return str; +} + +static inline void ceph_put_string(struct ceph_string *str) +{ + if (!str) + return; + kref_put(&str->kref, ceph_release_string); +} + +static inline int ceph_compare_string(struct ceph_string *cs, + const char* str, size_t len) +{ + size_t cs_len = cs ? cs->len : 0; + if (cs_len != len) + return cs_len - len; + if (len == 0) + return 0; + return strncmp(cs->str, str, len); +} + +#define ceph_try_get_string(x) \ +({ \ + struct ceph_string *___str; \ + rcu_read_lock(); \ + for (;;) { \ + ___str = rcu_dereference(x); \ + if (!___str || \ + kref_get_unless_zero(&___str->kref)) \ + break; \ + } \ + rcu_read_unlock(); \ + (___str); \ +}) + +#endif diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d9aef2a0ec8e..c78fc27418f2 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -99,7 +99,8 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static inline void guest_enter(void) +/* must be called with irqs disabled */ +static inline void guest_enter_irqoff(void) { if (vtime_accounting_cpu_enabled()) vtime_guest_enter(current); @@ -108,9 +109,19 @@ static inline void guest_enter(void) if (context_tracking_is_enabled()) __context_tracking_enter(CONTEXT_GUEST); + + /* KVM does not hold any references to rcu protected data when it + * switches CPU into a guest mode. In fact switching to a guest mode + * is very similar to exiting to userspace from rcu point of view. In + * addition CPU may stay in a guest mode for quite a long time (up to + * one time slice). Lets treat guest mode as quiescent state, just like + * we do with user-mode execution. + */ + if (!context_tracking_cpu_is_enabled()) + rcu_virt_note_context_switch(smp_processor_id()); } -static inline void guest_exit(void) +static inline void guest_exit_irqoff(void) { if (context_tracking_is_enabled()) __context_tracking_exit(CONTEXT_GUEST); @@ -122,7 +133,7 @@ static inline void guest_exit(void) } #else -static inline void guest_enter(void) +static inline void guest_enter_irqoff(void) { /* * This is running in ioctl context so its safe @@ -131,9 +142,10 @@ static inline void guest_enter(void) */ vtime_account_system(current); current->flags |= PF_VCPU; + rcu_virt_note_context_switch(smp_processor_id()); } -static inline void guest_exit(void) +static inline void guest_exit_irqoff(void) { /* Flush the guest cputime we spent on the guest */ vtime_account_system(current); @@ -141,4 +153,22 @@ static inline void guest_exit(void) } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ +static inline void guest_enter(void) +{ + unsigned long flags; + + local_irq_save(flags); + guest_enter_irqoff(); + local_irq_restore(flags); +} + +static inline void guest_exit(void) +{ + unsigned long flags; + + local_irq_save(flags); + guest_exit_irqoff(); + local_irq_restore(flags); +} + #endif diff --git a/include/linux/export.h b/include/linux/export.h index 2f9ccbe6a639..c565f87f005e 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -82,7 +82,7 @@ extern struct module __this_module; #include #define __EXPORT_SYMBOL(sym, sec) \ - __cond_export_sym(sym, sec, config_enabled(__KSYM_##sym)) + __cond_export_sym(sym, sec, __is_defined(__KSYM_##sym)) #define __cond_export_sym(sym, sec, conf) \ ___cond_export_sym(sym, sec, conf) #define ___cond_export_sym(sym, sec, enabled) \ diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 107eed475b94..56b0b7ec66aa 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -112,34 +112,76 @@ #define GICR_WAKER_ProcessorSleep (1U << 1) #define GICR_WAKER_ChildrenAsleep (1U << 2) -#define GICR_PROPBASER_NonShareable (0U << 10) -#define GICR_PROPBASER_InnerShareable (1U << 10) -#define GICR_PROPBASER_OuterShareable (2U << 10) -#define GICR_PROPBASER_SHAREABILITY_MASK (3UL << 10) -#define GICR_PROPBASER_nCnB (0U << 7) -#define GICR_PROPBASER_nC (1U << 7) -#define GICR_PROPBASER_RaWt (2U << 7) -#define GICR_PROPBASER_RaWb (3U << 7) -#define GICR_PROPBASER_WaWt (4U << 7) -#define GICR_PROPBASER_WaWb (5U << 7) -#define GICR_PROPBASER_RaWaWt (6U << 7) -#define GICR_PROPBASER_RaWaWb (7U << 7) -#define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7) -#define GICR_PROPBASER_IDBITS_MASK (0x1f) +#define GIC_BASER_CACHE_nCnB 0ULL +#define GIC_BASER_CACHE_SameAsInner 0ULL +#define GIC_BASER_CACHE_nC 1ULL +#define GIC_BASER_CACHE_RaWt 2ULL +#define GIC_BASER_CACHE_RaWb 3ULL +#define GIC_BASER_CACHE_WaWt 4ULL +#define GIC_BASER_CACHE_WaWb 5ULL +#define GIC_BASER_CACHE_RaWaWt 6ULL +#define GIC_BASER_CACHE_RaWaWb 7ULL +#define GIC_BASER_CACHE_MASK 7ULL +#define GIC_BASER_NonShareable 0ULL +#define GIC_BASER_InnerShareable 1ULL +#define GIC_BASER_OuterShareable 2ULL +#define GIC_BASER_SHAREABILITY_MASK 3ULL -#define GICR_PENDBASER_NonShareable (0U << 10) -#define GICR_PENDBASER_InnerShareable (1U << 10) -#define GICR_PENDBASER_OuterShareable (2U << 10) -#define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10) -#define GICR_PENDBASER_nCnB (0U << 7) -#define GICR_PENDBASER_nC (1U << 7) -#define GICR_PENDBASER_RaWt (2U << 7) -#define GICR_PENDBASER_RaWb (3U << 7) -#define GICR_PENDBASER_WaWt (4U << 7) -#define GICR_PENDBASER_WaWb (5U << 7) -#define GICR_PENDBASER_RaWaWt (6U << 7) -#define GICR_PENDBASER_RaWaWb (7U << 7) -#define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7) +#define GIC_BASER_CACHEABILITY(reg, inner_outer, type) \ + (GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT) + +#define GIC_BASER_SHAREABILITY(reg, type) \ + (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) + +#define GICR_PROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK) +#define GICR_PROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK) +#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK) +#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_PROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable) + +#define GICR_PROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB) +#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) +#define GICR_PROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt) +#define GICR_PROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt) +#define GICR_PROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt) +#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) +#define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt) +#define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) + +#define GICR_PROPBASER_IDBITS_MASK (0x1f) + +#define GICR_PENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK) +#define GICR_PENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK) +#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK) +#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_PENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable) + +#define GICR_PENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB) +#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) +#define GICR_PENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt) +#define GICR_PENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt) +#define GICR_PENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt) +#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) +#define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt) +#define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb) + +#define GICR_PENDBASER_PTZ BIT_ULL(62) /* * Re-Distributor registers, offsets from SGI_base @@ -175,54 +217,83 @@ #define GITS_CWRITER 0x0088 #define GITS_CREADR 0x0090 #define GITS_BASER 0x0100 +#define GITS_IDREGS_BASE 0xffd0 +#define GITS_PIDR0 0xffe0 +#define GITS_PIDR1 0xffe4 #define GITS_PIDR2 GICR_PIDR2 +#define GITS_PIDR4 0xffd0 +#define GITS_CIDR0 0xfff0 +#define GITS_CIDR1 0xfff4 +#define GITS_CIDR2 0xfff8 +#define GITS_CIDR3 0xfffc #define GITS_TRANSLATER 0x10040 #define GITS_CTLR_ENABLE (1U << 0) #define GITS_CTLR_QUIESCENT (1U << 31) +#define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_IDBITS_SHIFT 8 #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1) #define GITS_TYPER_PTA (1UL << 19) +#define GITS_TYPER_HWCOLLCNT_SHIFT 24 -#define GITS_CBASER_VALID (1UL << 63) -#define GITS_CBASER_nCnB (0UL << 59) -#define GITS_CBASER_nC (1UL << 59) -#define GITS_CBASER_RaWt (2UL << 59) -#define GITS_CBASER_RaWb (3UL << 59) -#define GITS_CBASER_WaWt (4UL << 59) -#define GITS_CBASER_WaWb (5UL << 59) -#define GITS_CBASER_RaWaWt (6UL << 59) -#define GITS_CBASER_RaWaWb (7UL << 59) -#define GITS_CBASER_CACHEABILITY_MASK (7UL << 59) -#define GITS_CBASER_NonShareable (0UL << 10) -#define GITS_CBASER_InnerShareable (1UL << 10) -#define GITS_CBASER_OuterShareable (2UL << 10) -#define GITS_CBASER_SHAREABILITY_MASK (3UL << 10) +#define GITS_CBASER_VALID (1UL << 63) +#define GITS_CBASER_SHAREABILITY_SHIFT (10) +#define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_CBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK) +#define GITS_CBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK) +#define GITS_CBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK) +#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK + +#define GITS_CBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable) + +#define GITS_CBASER_nCnB GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB) +#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) +#define GITS_CBASER_RaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt) +#define GITS_CBASER_RaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt) +#define GITS_CBASER_WaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt) +#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) +#define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) +#define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) #define GITS_BASER_NR_REGS 8 -#define GITS_BASER_VALID (1UL << 63) -#define GITS_BASER_INDIRECT (1UL << 62) -#define GITS_BASER_nCnB (0UL << 59) -#define GITS_BASER_nC (1UL << 59) -#define GITS_BASER_RaWt (2UL << 59) -#define GITS_BASER_RaWb (3UL << 59) -#define GITS_BASER_WaWt (4UL << 59) -#define GITS_BASER_WaWb (5UL << 59) -#define GITS_BASER_RaWaWt (6UL << 59) -#define GITS_BASER_RaWaWb (7UL << 59) -#define GITS_BASER_CACHEABILITY_MASK (7UL << 59) -#define GITS_BASER_TYPE_SHIFT (56) +#define GITS_BASER_VALID (1UL << 63) +#define GITS_BASER_INDIRECT (1ULL << 62) + +#define GITS_BASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_BASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_BASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK) +#define GITS_BASER_CACHEABILITY_MASK GITS_BASER_INNER_CACHEABILITY_MASK +#define GITS_BASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK) +#define GITS_BASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK) + +#define GITS_BASER_nCnB GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB) +#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) +#define GITS_BASER_RaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt) +#define GITS_BASER_RaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt) +#define GITS_BASER_WaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt) +#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) +#define GITS_BASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt) +#define GITS_BASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb) + +#define GITS_BASER_TYPE_SHIFT (56) #define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) -#define GITS_BASER_ENTRY_SIZE_SHIFT (48) +#define GITS_BASER_ENTRY_SIZE_SHIFT (48) #define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0xff) + 1) -#define GITS_BASER_NonShareable (0UL << 10) -#define GITS_BASER_InnerShareable (1UL << 10) -#define GITS_BASER_OuterShareable (2UL << 10) #define GITS_BASER_SHAREABILITY_SHIFT (10) -#define GITS_BASER_SHAREABILITY_MASK (3UL << GITS_BASER_SHAREABILITY_SHIFT) +#define GITS_BASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) #define GITS_BASER_PAGE_SIZE_SHIFT (8) #define GITS_BASER_PAGE_SIZE_4K (0UL << GITS_BASER_PAGE_SIZE_SHIFT) #define GITS_BASER_PAGE_SIZE_16K (1UL << GITS_BASER_PAGE_SIZE_SHIFT) @@ -230,6 +301,7 @@ #define GITS_BASER_PAGE_SIZE_MASK (3UL << GITS_BASER_PAGE_SIZE_SHIFT) #define GITS_BASER_PAGES_MAX 256 #define GITS_BASER_PAGES_SHIFT (0) +#define GITS_BASER_NR_PAGES(r) (((r) & 0xff) + 1) #define GITS_BASER_TYPE_NONE 0 #define GITS_BASER_TYPE_DEVICE 1 @@ -247,7 +319,10 @@ */ #define GITS_CMD_MAPD 0x08 #define GITS_CMD_MAPC 0x09 -#define GITS_CMD_MAPVI 0x0a +#define GITS_CMD_MAPTI 0x0a +/* older GIC documentation used MAPVI for this command */ +#define GITS_CMD_MAPVI GITS_CMD_MAPTI +#define GITS_CMD_MAPI 0x0b #define GITS_CMD_MOVI 0x01 #define GITS_CMD_DISCARD 0x0f #define GITS_CMD_INV 0x0c @@ -257,6 +332,22 @@ #define GITS_CMD_CLEAR 0x04 #define GITS_CMD_SYNC 0x05 +/* + * ITS error numbers + */ +#define E_ITS_MOVI_UNMAPPED_INTERRUPT 0x010107 +#define E_ITS_MOVI_UNMAPPED_COLLECTION 0x010109 +#define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 +#define E_ITS_MAPD_DEVICE_OOR 0x010801 +#define E_ITS_MAPC_PROCNUM_OOR 0x010902 +#define E_ITS_MAPC_COLLECTION_OOR 0x010903 +#define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 +#define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 +#define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 +#define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 +#define E_ITS_MOVALL_PROCNUM_OOR 0x010e01 +#define E_ITS_DISCARD_UNMAPPED_INTERRUPT 0x010f07 + /* * CPU interface registers */ diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index b33c7797eb57..15ec117ec537 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -3,6 +3,21 @@ #include +#define __ARG_PLACEHOLDER_1 0, +#define __take_second_arg(__ignored, val, ...) val + +/* + * The use of "&&" / "||" is limited in certain expressions. + * The followings enable to calculate "and" / "or" with macro expansion only. + */ +#define __and(x, y) ___and(x, y) +#define ___and(x, y) ____and(__ARG_PLACEHOLDER_##x, y) +#define ____and(arg1_or_junk, y) __take_second_arg(arg1_or_junk y, 0) + +#define __or(x, y) ___or(x, y) +#define ___or(x, y) ____or(__ARG_PLACEHOLDER_##x, y) +#define ____or(arg1_or_junk, y) __take_second_arg(arg1_or_junk 1, y) + /* * Helper macros to use CONFIG_ options in C/CPP expressions. Note that * these only work with boolean and tristate options. @@ -16,11 +31,10 @@ * When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when * the last step cherry picks the 2nd arg, we get a zero. */ -#define __ARG_PLACEHOLDER_1 0, -#define config_enabled(cfg) _config_enabled(cfg) -#define _config_enabled(value) __config_enabled(__ARG_PLACEHOLDER_##value) -#define __config_enabled(arg1_or_junk) ___config_enabled(arg1_or_junk 1, 0) -#define ___config_enabled(__ignored, val, ...) val +#define config_enabled(cfg) ___is_defined(cfg) +#define __is_defined(x) ___is_defined(x) +#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val) +#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0) /* * IS_BUILTIN(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y', 0 @@ -41,14 +55,13 @@ * This is similar to IS_ENABLED(), but returns false when invoked from * built-in code when CONFIG_FOO is set to 'm'. */ -#define IS_REACHABLE(option) (config_enabled(option) || \ - (config_enabled(option##_MODULE) && config_enabled(MODULE))) +#define IS_REACHABLE(option) __or(IS_BUILTIN(option), \ + __and(IS_MODULE(option), __is_defined(MODULE))) /* * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm', * 0 otherwise. */ -#define IS_ENABLED(option) \ - (IS_BUILTIN(option) || IS_MODULE(option)) +#define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option)) #endif /* __LINUX_KCONFIG_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1c9c973a7dd9..aafd702f3e21 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -164,6 +164,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev); int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); +struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, + gpa_t addr); #ifdef CONFIG_KVM_ASYNC_PF struct kvm_async_pf { @@ -371,7 +373,15 @@ struct kvm { struct srcu_struct srcu; struct srcu_struct irq_srcu; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + + /* + * created_vcpus is protected by kvm->lock, and is incremented + * at the beginning of KVM_CREATE_VCPU. online_vcpus is only + * incremented after storing the kvm_vcpu pointer in vcpus, + * and is accessed atomically. + */ atomic_t online_vcpus; + int created_vcpus; int last_boosted_vcpu; struct list_head vm_list; struct mutex lock; @@ -867,45 +877,6 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm, } #endif -/* must be called with irqs disabled */ -static inline void __kvm_guest_enter(void) -{ - guest_enter(); - /* KVM does not hold any references to rcu protected data when it - * switches CPU into a guest mode. In fact switching to a guest mode - * is very similar to exiting to userspace from rcu point of view. In - * addition CPU may stay in a guest mode for quite a long time (up to - * one time slice). Lets treat guest mode as quiescent state, just like - * we do with user-mode execution. - */ - if (!context_tracking_cpu_is_enabled()) - rcu_virt_note_context_switch(smp_processor_id()); -} - -/* must be called with irqs disabled */ -static inline void __kvm_guest_exit(void) -{ - guest_exit(); -} - -static inline void kvm_guest_enter(void) -{ - unsigned long flags; - - local_irq_save(flags); - __kvm_guest_enter(); - local_irq_restore(flags); -} - -static inline void kvm_guest_exit(void) -{ - unsigned long flags; - - local_irq_save(flags); - __kvm_guest_exit(); - local_irq_restore(flags); -} - /* * search_memslots() and __gfn_to_memslot() are here because they are * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. @@ -1042,7 +1013,8 @@ int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, unsigned flags); -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); void kvm_free_irq_routing(struct kvm *kvm); @@ -1097,12 +1069,6 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) #endif /* CONFIG_HAVE_KVM_EVENTFD */ -#ifdef CONFIG_KVM_APIC_ARCHITECTURE -bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu); -#else -static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; } -#endif - static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) { /* diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index fbe8e164a4ee..8dd6e01f45c0 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -783,6 +783,7 @@ static inline void nand_set_controller_data(struct nand_chip *chip, void *priv) * NAND Flash Manufacturer ID Codes */ #define NAND_MFR_TOSHIBA 0x98 +#define NAND_MFR_ESMT 0xc8 #define NAND_MFR_SAMSUNG 0xec #define NAND_MFR_FUJITSU 0x04 #define NAND_MFR_NATIONAL 0x8f diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 7f041bd88b82..c425c7b4c2a0 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -173,10 +173,10 @@ struct spi_nor { int (*read_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len); int (*write_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len); - int (*read)(struct spi_nor *nor, loff_t from, - size_t len, size_t *retlen, u_char *read_buf); - void (*write)(struct spi_nor *nor, loff_t to, - size_t len, size_t *retlen, const u_char *write_buf); + ssize_t (*read)(struct spi_nor *nor, loff_t from, + size_t len, u_char *read_buf); + ssize_t (*write)(struct spi_nor *nor, loff_t to, + size_t len, const u_char *write_buf); int (*erase)(struct spi_nor *nor, loff_t offs); int (*flash_lock)(struct spi_nor *nor, loff_t ofs, uint64_t len); diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 8b5e0a9f2431..610e13271918 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -124,6 +124,15 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) return ret; } +static inline int page_ref_inc_return(struct page *page) +{ + int ret = atomic_inc_return(&page->_refcount); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) + __page_ref_mod_and_return(page, 1, ret); + return ret; +} + static inline int page_ref_dec_and_test(struct page *page) { int ret = atomic_dec_and_test(&page->_refcount); diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 89ab0572dbc6..7d63a66e8ed4 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -24,6 +24,8 @@ static inline acpi_status pci_acpi_remove_pm_notifier(struct acpi_device *dev) } extern phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle); +extern phys_addr_t pci_mcfg_lookup(u16 domain, struct resource *bus_res); + static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { struct pci_bus *pbus = pdev->bus; diff --git a/drivers/pci/ecam.h b/include/linux/pci-ecam.h similarity index 95% rename from drivers/pci/ecam.h rename to include/linux/pci-ecam.h index 9878bebd45bb..7adad206b1f4 100644 --- a/drivers/pci/ecam.h +++ b/include/linux/pci-ecam.h @@ -27,8 +27,7 @@ struct pci_config_window; struct pci_ecam_ops { unsigned int bus_shift; struct pci_ops pci_ops; - int (*init)(struct device *, - struct pci_config_window *); + int (*init)(struct pci_config_window *); }; /* @@ -45,6 +44,7 @@ struct pci_config_window { void __iomem *win; /* 64-bit single mapping */ void __iomem **winp; /* 32-bit per-bus mapping */ }; + struct device *parent;/* ECAM res was from this dev */ }; /* create and free pci_config_window */ diff --git a/include/linux/pci.h b/include/linux/pci.h index c40ac910cce4..2599a980340f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -101,6 +101,10 @@ enum { DEVICE_COUNT_RESOURCE = PCI_NUM_RESOURCES, }; +/* + * pci_power_t values must match the bits in the Capabilities PME_Support + * and Control/Status PowerState fields in the Power Management capability. + */ typedef int __bitwise pci_power_t; #define PCI_D0 ((pci_power_t __force) 0) @@ -116,7 +120,7 @@ extern const char *pci_power_names[]; static inline const char *pci_power_name(pci_power_t state) { - return pci_power_names[1 + (int) state]; + return pci_power_names[1 + (__force int) state]; } #define PCI_PM_D2_DELAY 200 @@ -294,6 +298,7 @@ struct pci_dev { unsigned int d2_support:1; /* Low power state D2 is supported */ unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ unsigned int no_d3cold:1; /* D3cold is forbidden */ + unsigned int bridge_d3:1; /* Allow D3 for bridge */ unsigned int d3cold_allowed:1; /* D3cold is allowed by user */ unsigned int mmio_always_on:1; /* disallow turning off io/mem decoding during bar sizing */ @@ -320,6 +325,7 @@ struct pci_dev { * directly, use the values stored here. They might be different! */ unsigned int irq; + struct cpumask *irq_affinity; struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ bool match_driver; /* Skip attaching driver */ @@ -1084,6 +1090,8 @@ int pci_back_from_sleep(struct pci_dev *dev); bool pci_dev_run_wake(struct pci_dev *dev); bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); +void pci_d3cold_enable(struct pci_dev *dev); +void pci_d3cold_disable(struct pci_dev *dev); static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable) @@ -1115,6 +1123,7 @@ int pci_set_vpd_size(struct pci_dev *dev, size_t len); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx); void pci_bus_assign_resources(const struct pci_bus *bus); +void pci_bus_claim_resources(struct pci_bus *bus); void pci_bus_size_bridges(struct pci_bus *bus); int pci_claim_resource(struct pci_dev *, int); int pci_claim_bridge_resource(struct pci_dev *bridge, int i); @@ -1144,9 +1153,12 @@ void pci_add_resource(struct list_head *resources, struct resource *res); void pci_add_resource_offset(struct list_head *resources, struct resource *res, resource_size_t offset); void pci_free_resource_list(struct list_head *resources); -void pci_bus_add_resource(struct pci_bus *bus, struct resource *res, unsigned int flags); +void pci_bus_add_resource(struct pci_bus *bus, struct resource *res, + unsigned int flags); struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n); void pci_bus_remove_resources(struct pci_bus *bus); +int devm_request_pci_bus_resources(struct device *dev, + struct list_head *resources); #define pci_bus_for_each_resource(bus, res, i) \ for (i = 0; \ @@ -1168,6 +1180,7 @@ int pci_register_io_range(phys_addr_t addr, resource_size_t size); unsigned long pci_address_to_pio(phys_addr_t addr); phys_addr_t pci_pio_to_address(unsigned long pio); int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr); +void pci_unmap_iospace(struct resource *res); static inline pci_bus_addr_t pci_bus_address(struct pci_dev *pdev, int bar) { @@ -1238,6 +1251,11 @@ resource_size_t pcibios_iov_resource_alignment(struct pci_dev *dev, int resno); int pci_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags); +#define PCI_IRQ_NOLEGACY (1 << 0) /* don't use legacy interrupts */ +#define PCI_IRQ_NOMSI (1 << 1) /* don't use MSI interrupts */ +#define PCI_IRQ_NOMSIX (1 << 2) /* don't use MSI-X interrupts */ +#define PCI_IRQ_NOAFFINITY (1 << 3) /* don't auto-assign affinity */ + /* kmem_cache style wrapper around pci_alloc_consistent() */ #include @@ -1285,6 +1303,11 @@ static inline int pci_enable_msix_exact(struct pci_dev *dev, return rc; return 0; } +int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags); +void pci_free_irq_vectors(struct pci_dev *dev); +int pci_irq_vector(struct pci_dev *dev, unsigned int nr); + #else static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; } static inline void pci_msi_shutdown(struct pci_dev *dev) { } @@ -1308,6 +1331,24 @@ static inline int pci_enable_msix_range(struct pci_dev *dev, static inline int pci_enable_msix_exact(struct pci_dev *dev, struct msix_entry *entries, int nvec) { return -ENOSYS; } +static inline int pci_alloc_irq_vectors(struct pci_dev *dev, + unsigned int min_vecs, unsigned int max_vecs, + unsigned int flags) +{ + if (min_vecs > 1) + return -EINVAL; + return 1; +} +static inline void pci_free_irq_vectors(struct pci_dev *dev) +{ +} + +static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ + if (WARN_ON_ONCE(nr > 0)) + return -EINVAL; + return dev->irq; +} #endif #ifdef CONFIG_PCIEPORTBUS @@ -1390,12 +1431,13 @@ static inline int pci_domain_nr(struct pci_bus *bus) { return bus->domain_nr; } -void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent); +#ifdef CONFIG_ACPI +int acpi_pci_bus_find_domain_nr(struct pci_bus *bus); #else -static inline void pci_bus_assign_domain_nr(struct pci_bus *bus, - struct device *parent) -{ -} +static inline int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) +{ return 0; } +#endif +int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent); #endif /* some architectures require additional setup to direct VGA traffic */ @@ -1403,6 +1445,34 @@ typedef int (*arch_set_vga_state_t)(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags); void pci_register_set_vga_state(arch_set_vga_state_t func); +static inline int +pci_request_io_regions(struct pci_dev *pdev, const char *name) +{ + return pci_request_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_IO), name); +} + +static inline void +pci_release_io_regions(struct pci_dev *pdev) +{ + return pci_release_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_IO)); +} + +static inline int +pci_request_mem_regions(struct pci_dev *pdev, const char *name) +{ + return pci_request_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_MEM), name); +} + +static inline void +pci_release_mem_regions(struct pci_dev *pdev) +{ + return pci_release_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_MEM)); +} + #else /* CONFIG_PCI is not enabled */ static inline void pci_set_flags(int flags) { } @@ -1555,7 +1625,11 @@ static inline const char *pci_name(const struct pci_dev *pdev) /* Some archs don't want to expose struct resource to userland as-is * in sysfs and /proc */ -#ifndef HAVE_ARCH_PCI_RESOURCE_TO_USER +#ifdef HAVE_ARCH_PCI_RESOURCE_TO_USER +void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + resource_size_t *start, resource_size_t *end); +#else static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, const struct resource *rsrc, resource_size_t *start, resource_size_t *end) @@ -1707,6 +1781,7 @@ extern u8 pci_cache_line_size; extern unsigned long pci_hotplug_io_size; extern unsigned long pci_hotplug_mem_size; +extern unsigned long pci_hotplug_bus_size; /* Architecture-specific versions may override these (weak) */ void pcibios_disable_device(struct pci_dev *dev); @@ -1723,7 +1798,7 @@ void pcibios_free_irq(struct pci_dev *dev); extern struct dev_pm_ops pcibios_pm_ops; #endif -#ifdef CONFIG_PCI_MMCONFIG +#if defined(CONFIG_PCI_MMCONFIG) || defined(CONFIG_ACPI_MCFG) void __init pci_mmcfg_early_init(void); void __init pci_mmcfg_late_init(void); #else diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index f28292d73ddb..8ade3eb6c640 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -151,8 +151,9 @@ TRACE_EVENT(kvm_msi_set_irq, __entry->data = data; ), - TP_printk("dst %u vec %u (%s|%s|%s%s)", - (u8)(__entry->address >> 12), (u8)__entry->data, + TP_printk("dst %llx vec %u (%s|%s|%s%s)", + (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00), + (u8)__entry->data, __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), (__entry->address & (1<<2)) ? "logical" : "physical", (__entry->data & (1<<15)) ? "level" : "edge", diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 05ebf475104c..e98bb4cce639 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -866,6 +866,10 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PMU_V3 126 #define KVM_CAP_VCPU_ATTRIBUTES 127 #define KVM_CAP_MAX_VCPU_ID 128 +#define KVM_CAP_X2APIC_API 129 +#define KVM_CAP_S390_USER_INSTR0 130 +#define KVM_CAP_MSI_DEVID 131 +#define KVM_CAP_PPC_HTM 132 #ifdef KVM_CAP_IRQ_ROUTING @@ -1024,12 +1028,14 @@ struct kvm_one_reg { __u64 addr; }; +#define KVM_MSI_VALID_DEVID (1U << 0) struct kvm_msi { __u32 address_lo; __u32 address_hi; __u32 data; __u32 flags; - __u8 pad[16]; + __u32 devid; + __u8 pad[12]; }; struct kvm_arm_device_addr { @@ -1074,6 +1080,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_ARM_VGIC_V3, #define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3 + KVM_DEV_TYPE_ARM_VGIC_ITS, +#define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_MAX, }; @@ -1313,4 +1321,7 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + #endif /* __LINUX_KVM_H */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f07842e2d69f..eb8917a71489 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -709,6 +709,8 @@ config KCOV bool "Code coverage for fuzzing" depends on ARCH_HAS_KCOV select DEBUG_FS + select GCC_PLUGINS if !COMPILE_TEST + select GCC_PLUGIN_SANCOV if !COMPILE_TEST help KCOV exposes kernel code coverage information in a form suitable for coverage-guided fuzzing (randomized testing). diff --git a/mm/gup.c b/mm/gup.c index 547741f5f7a7..96b2b2fd0fbd 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -723,6 +723,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, } return 0; } +EXPORT_SYMBOL_GPL(fixup_user_fault); static __always_inline long __get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, diff --git a/net/ceph/Makefile b/net/ceph/Makefile index 958d9856912c..84cbed630c4b 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_fs.o ceph_strings.o ceph_hash.o \ - pagevec.o snapshot.o + pagevec.o snapshot.o string_table.o diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 55d2bfee16d7..bddfcf6f09c2 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -747,6 +747,8 @@ static int __init init_ceph_lib(void) static void __exit exit_ceph_lib(void) { dout("exit_ceph_lib\n"); + WARN_ON(!ceph_strings_empty()); + ceph_osdc_cleanup(); ceph_msgr_exit(); ceph_crypto_shutdown(); diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c index 41466ccb972a..7d54e944de5e 100644 --- a/net/ceph/ceph_fs.c +++ b/net/ceph/ceph_fs.c @@ -9,9 +9,9 @@ */ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) { - __u32 su = le32_to_cpu(layout->fl_stripe_unit); - __u32 sc = le32_to_cpu(layout->fl_stripe_count); - __u32 os = le32_to_cpu(layout->fl_object_size); + __u32 su = layout->stripe_unit; + __u32 sc = layout->stripe_count; + __u32 os = layout->object_size; /* stripe unit, object size must be non-zero, 64k increment */ if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1))) @@ -27,6 +27,30 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) return 1; } +void ceph_file_layout_from_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy) +{ + fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit); + fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count); + fl->object_size = le32_to_cpu(legacy->fl_object_size); + fl->pool_id = le32_to_cpu(legacy->fl_pg_pool); + if (fl->pool_id == 0) + fl->pool_id = -1; +} +EXPORT_SYMBOL(ceph_file_layout_from_legacy); + +void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy) +{ + legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit); + legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count); + legacy->fl_object_size = cpu_to_le32(fl->object_size); + if (fl->pool_id >= 0) + legacy->fl_pg_pool = cpu_to_le32(fl->pool_id); + else + legacy->fl_pg_pool = 0; +} +EXPORT_SYMBOL(ceph_file_layout_to_legacy); int ceph_flags_to_mode(int flags) { diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index e77b04ca7802..c62b2b029a6e 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -156,8 +156,16 @@ static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t) seq_printf(s, "]/%d\t[", t->up.primary); for (i = 0; i < t->acting.size; i++) seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]); - seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary, - t->target_oid.name_len, t->target_oid.name, t->flags); + seq_printf(s, "]/%d\t", t->acting.primary); + if (t->target_oloc.pool_ns) { + seq_printf(s, "%*pE/%*pE\t0x%x", + (int)t->target_oloc.pool_ns->len, + t->target_oloc.pool_ns->str, + t->target_oid.name_len, t->target_oid.name, t->flags); + } else { + seq_printf(s, "%*pE\t0x%x", t->target_oid.name_len, + t->target_oid.name, t->flags); + } if (t->paused) seq_puts(s, "\tP"); } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 37c38a7fb5c5..c83326c5ba58 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -227,9 +227,10 @@ static void __schedule_delayed(struct ceph_mon_client *monc) } const char *ceph_sub_str[] = { - [CEPH_SUB_MDSMAP] = "mdsmap", [CEPH_SUB_MONMAP] = "monmap", [CEPH_SUB_OSDMAP] = "osdmap", + [CEPH_SUB_FSMAP] = "fsmap.user", + [CEPH_SUB_MDSMAP] = "mdsmap", }; /* @@ -1193,6 +1194,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: + case CEPH_MSG_FS_MAP_USER: m = ceph_msg_new(type, front_len, GFP_NOFS, false); if (!m) return NULL; /* ENOMEM--return skip == 0 */ diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index ddec1c10ac80..aaed59a47b1d 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -5,6 +5,7 @@ #include #include +#include #include static void *msgpool_alloc(gfp_t gfp_mask, void *arg) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 89469592076c..b5ec09612ff7 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -387,7 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest, static void target_destroy(struct ceph_osd_request_target *t) { ceph_oid_destroy(&t->base_oid); + ceph_oloc_destroy(&t->base_oloc); ceph_oid_destroy(&t->target_oid); + ceph_oloc_destroy(&t->target_oloc); } /* @@ -533,6 +535,11 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_alloc_request); +static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc) +{ + return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0); +} + int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) { struct ceph_osd_client *osdc = req->r_osdc; @@ -540,11 +547,13 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) int msg_size; WARN_ON(ceph_oid_empty(&req->r_base_oid)); + WARN_ON(ceph_oloc_empty(&req->r_base_oloc)); /* create request message */ msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ - msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ + msg_size += CEPH_ENCODING_START_BLK_LEN + + ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pgid */ msg_size += 4 + req->r_base_oid.name_len; /* oid */ msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op); @@ -932,7 +941,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { osd_req_op_init(req, which, opcode, 0); } else { - u32 object_size = le32_to_cpu(layout->fl_object_size); + u32 object_size = layout->object_size; u32 object_base = off - objoff; if (!(truncate_seq == 1 && truncate_size == -1ULL)) { if (truncate_size <= object_base) { @@ -948,7 +957,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } req->r_flags = flags; - req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); + req->r_base_oloc.pool = layout->pool_id; + req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); req->r_snapid = vino.snap; @@ -1489,12 +1499,16 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) p += sizeof(req->r_replay_version); /* oloc */ - ceph_encode_8(&p, 4); - ceph_encode_8(&p, 4); - ceph_encode_32(&p, 8 + 4 + 4); + ceph_start_encoding(&p, 5, 4, + ceph_oloc_encoding_size(&req->r_t.target_oloc)); ceph_encode_64(&p, req->r_t.target_oloc.pool); ceph_encode_32(&p, -1); /* preferred */ ceph_encode_32(&p, 0); /* key len */ + if (req->r_t.target_oloc.pool_ns) + ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str, + req->r_t.target_oloc.pool_ns->len); + else + ceph_encode_32(&p, 0); /* pgid */ ceph_encode_8(&p, 1); @@ -2594,9 +2608,22 @@ static int ceph_oloc_decode(void **p, void *end, } if (struct_v >= 5) { + bool changed = false; + len = ceph_decode_32(p); if (len > 0) { - pr_warn("ceph_object_locator::nspace is set\n"); + ceph_decode_need(p, end, len, e_inval); + if (!oloc->pool_ns || + ceph_compare_string(oloc->pool_ns, *p, len)) + changed = true; + *p += len; + } else { + if (oloc->pool_ns) + changed = true; + } + if (changed) { + /* redirect changes namespace */ + pr_warn("ceph_object_locator::nspace is changed\n"); goto e_inval; } } @@ -2806,7 +2833,9 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) goto out_unlock_session; } + m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns; ret = decode_MOSDOpReply(msg, &m); + m.redirect.oloc.pool_ns = NULL; if (ret) { pr_err("failed to decode MOSDOpReply for tid %llu: %d\n", req->r_tid, ret); @@ -2835,7 +2864,11 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) unlink_request(osd, req); mutex_unlock(&osd->lock); - ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc); + /* + * Not ceph_oloc_copy() - changing pool_ns is not + * supported. + */ + req->r_t.target_oloc.pool = m.redirect.oloc.pool; req->r_flags |= CEPH_OSD_FLAG_REDIRECTED; req->r_tid = 0; __submit_request(req, false); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7e480bf75bcf..d2436880b305 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1510,6 +1510,24 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, return ERR_PTR(err); } +void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src) +{ + WARN_ON(!ceph_oloc_empty(dest)); + WARN_ON(dest->pool_ns); /* empty() only covers ->pool */ + + dest->pool = src->pool; + if (src->pool_ns) + dest->pool_ns = ceph_get_string(src->pool_ns); +} +EXPORT_SYMBOL(ceph_oloc_copy); + +void ceph_oloc_destroy(struct ceph_object_locator *oloc) +{ + ceph_put_string(oloc->pool_ns); +} +EXPORT_SYMBOL(ceph_oloc_destroy); + void ceph_oid_copy(struct ceph_object_id *dest, const struct ceph_object_id *src) { @@ -1770,9 +1788,9 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 *ono, u64 *oxoff, u64 *oxlen) { - u32 osize = le32_to_cpu(layout->fl_object_size); - u32 su = le32_to_cpu(layout->fl_stripe_unit); - u32 sc = le32_to_cpu(layout->fl_stripe_count); + u32 osize = layout->object_size; + u32 su = layout->stripe_unit; + u32 sc = layout->stripe_count; u32 bl, stripeno, stripepos, objsetno; u32 su_per_object; u64 t, su_offset; @@ -1844,12 +1862,34 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, if (!pi) return -ENOENT; - raw_pgid->pool = oloc->pool; - raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, - oid->name_len); + if (!oloc->pool_ns) { + raw_pgid->pool = oloc->pool; + raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); + dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, + raw_pgid->pool, raw_pgid->seed); + } else { + char stack_buf[256]; + char *buf = stack_buf; + int nsl = oloc->pool_ns->len; + size_t total = nsl + 1 + oid->name_len; - dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, - raw_pgid->pool, raw_pgid->seed); + if (total > sizeof(stack_buf)) { + buf = kmalloc(total, GFP_NOIO); + if (!buf) + return -ENOMEM; + } + memcpy(buf, oloc->pool_ns->str, nsl); + buf[nsl] = '\037'; + memcpy(buf + nsl + 1, oid->name, oid->name_len); + raw_pgid->pool = oloc->pool; + raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total); + if (buf != stack_buf) + kfree(buf); + dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__, + oid->name, nsl, oloc->pool_ns->str, + raw_pgid->pool, raw_pgid->seed); + } return 0; } EXPORT_SYMBOL(ceph_object_locator_to_pg); diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c new file mode 100644 index 000000000000..ca53c8319209 --- /dev/null +++ b/net/ceph/string_table.c @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(string_tree_lock); +static struct rb_root string_tree = RB_ROOT; + +struct ceph_string *ceph_find_or_create_string(const char* str, size_t len) +{ + struct ceph_string *cs, *exist; + struct rb_node **p, *parent; + int ret; + + exist = NULL; + spin_lock(&string_tree_lock); + p = &string_tree.rb_node; + while (*p) { + exist = rb_entry(*p, struct ceph_string, node); + ret = ceph_compare_string(exist, str, len); + if (ret > 0) + p = &(*p)->rb_left; + else if (ret < 0) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + if (exist && !kref_get_unless_zero(&exist->kref)) { + rb_erase(&exist->node, &string_tree); + RB_CLEAR_NODE(&exist->node); + exist = NULL; + } + spin_unlock(&string_tree_lock); + if (exist) + return exist; + + cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS); + if (!cs) + return NULL; + + kref_init(&cs->kref); + cs->len = len; + memcpy(cs->str, str, len); + cs->str[len] = 0; + +retry: + exist = NULL; + parent = NULL; + p = &string_tree.rb_node; + spin_lock(&string_tree_lock); + while (*p) { + parent = *p; + exist = rb_entry(*p, struct ceph_string, node); + ret = ceph_compare_string(exist, str, len); + if (ret > 0) + p = &(*p)->rb_left; + else if (ret < 0) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + ret = 0; + if (!exist) { + rb_link_node(&cs->node, parent, p); + rb_insert_color(&cs->node, &string_tree); + } else if (!kref_get_unless_zero(&exist->kref)) { + rb_erase(&exist->node, &string_tree); + RB_CLEAR_NODE(&exist->node); + ret = -EAGAIN; + } + spin_unlock(&string_tree_lock); + if (ret == -EAGAIN) + goto retry; + + if (exist) { + kfree(cs); + cs = exist; + } + + return cs; +} +EXPORT_SYMBOL(ceph_find_or_create_string); + +static void ceph_free_string(struct rcu_head *head) +{ + struct ceph_string *cs = container_of(head, struct ceph_string, rcu); + kfree(cs); +} + +void ceph_release_string(struct kref *ref) +{ + struct ceph_string *cs = container_of(ref, struct ceph_string, kref); + + spin_lock(&string_tree_lock); + if (!RB_EMPTY_NODE(&cs->node)) { + rb_erase(&cs->node, &string_tree); + RB_CLEAR_NODE(&cs->node); + } + spin_unlock(&string_tree_lock); + + call_rcu(&cs->rcu, ceph_free_string); +} +EXPORT_SYMBOL(ceph_release_string); + +bool ceph_strings_empty(void) +{ + return RB_EMPTY_ROOT(&string_tree); +} diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 0f82314621f2..15b196fc2f49 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -202,7 +202,7 @@ hdr-inst := -f $(srctree)/scripts/Makefile.headersinst obj # Prefix -I with $(srctree) if it is not an absolute path. # skip if -I has no parameter addtree = $(if $(patsubst -I%,%,$(1)), \ -$(if $(filter-out -I/%,$(1)),$(patsubst -I%,-I$(srctree)/%,$(1))) $(1)) +$(if $(filter-out -I/% -I./% -I../%,$(1)),$(patsubst -I%,-I$(srctree)/%,$(1)),$(1))) # Find all -I options and call addtree flags = $(foreach o,$($(1)),$(if $(filter -I%,$(o)),$(call addtree,$(o)),$(o))) diff --git a/scripts/Makefile b/scripts/Makefile index 822ab4a6a4aa..1d80897a9644 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -47,4 +47,4 @@ subdir-$(CONFIG_DTC) += dtc subdir-$(CONFIG_GDB_SCRIPTS) += gdb # Let clean descend into subdirs -subdir- += basic kconfig package +subdir- += basic kconfig package gcc-plugins diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 0d1ca5bf42fb..11602e5efb3b 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -60,7 +60,7 @@ endif endif # Do not include host rules unless needed -ifneq ($(hostprogs-y)$(hostprogs-m),) +ifneq ($(hostprogs-y)$(hostprogs-m)$(hostlibs-y)$(hostlibs-m)$(hostcxxlibs-y)$(hostcxxlibs-m),) include scripts/Makefile.host endif diff --git a/scripts/Makefile.clean b/scripts/Makefile.clean index 55c96cb8070f..50616ea25131 100644 --- a/scripts/Makefile.clean +++ b/scripts/Makefile.clean @@ -38,7 +38,9 @@ subdir-ymn := $(addprefix $(obj)/,$(subdir-ymn)) __clean-files := $(extra-y) $(extra-m) $(extra-) \ $(always) $(targets) $(clean-files) \ $(host-progs) \ - $(hostprogs-y) $(hostprogs-m) $(hostprogs-) + $(hostprogs-y) $(hostprogs-m) $(hostprogs-) \ + $(hostlibs-y) $(hostlibs-m) $(hostlibs-) \ + $(hostcxxlibs-y) $(hostcxxlibs-m) __clean-files := $(filter-out $(no-clean-files), $(__clean-files)) diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins new file mode 100644 index 000000000000..5e22b60589c1 --- /dev/null +++ b/scripts/Makefile.gcc-plugins @@ -0,0 +1,43 @@ +ifdef CONFIG_GCC_PLUGINS + __PLUGINCC := $(call cc-ifversion, -ge, 0408, $(HOSTCXX), $(HOSTCC)) + PLUGINCC := $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-plugin.sh "$(__PLUGINCC)" "$(HOSTCXX)" "$(CC)") + + SANCOV_PLUGIN := -fplugin=$(objtree)/scripts/gcc-plugins/sancov_plugin.so + + gcc-plugin-$(CONFIG_GCC_PLUGIN_CYC_COMPLEXITY) += cyc_complexity_plugin.so + + ifdef CONFIG_GCC_PLUGIN_SANCOV + ifeq ($(CFLAGS_KCOV),) + # It is needed because of the gcc-plugin.sh and gcc version checks. + gcc-plugin-$(CONFIG_GCC_PLUGIN_SANCOV) += sancov_plugin.so + + ifneq ($(PLUGINCC),) + CFLAGS_KCOV := $(SANCOV_PLUGIN) + else + $(warning warning: cannot use CONFIG_KCOV: -fsanitize-coverage=trace-pc is not supported by compiler) + endif + endif + endif + + GCC_PLUGINS_CFLAGS := $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) + + export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN SANCOV_PLUGIN + + ifeq ($(PLUGINCC),) + ifneq ($(GCC_PLUGINS_CFLAGS),) + ifeq ($(call cc-ifversion, -ge, 0405, y), y) + PLUGINCC := $(shell $(CONFIG_SHELL) -x $(srctree)/scripts/gcc-plugin.sh "$(__PLUGINCC)" "$(HOSTCXX)" "$(CC)") + $(warning warning: your gcc installation does not support plugins, perhaps the necessary headers are missing?) + else + $(warning warning: your gcc version does not support plugins, you should upgrade it to gcc 4.5 at least) + endif + endif + else + # SANCOV_PLUGIN can be only in CFLAGS_KCOV because avoid duplication. + GCC_PLUGINS_CFLAGS := $(filter-out $(SANCOV_PLUGIN), $(GCC_PLUGINS_CFLAGS)) + endif + + KBUILD_CFLAGS += $(GCC_PLUGINS_CFLAGS) + GCC_PLUGIN := $(gcc-plugin-y) + +endif diff --git a/scripts/Makefile.host b/scripts/Makefile.host index 133edfae5b8a..45b5b1aaedbd 100644 --- a/scripts/Makefile.host +++ b/scripts/Makefile.host @@ -20,7 +20,15 @@ # Will compile qconf as a C++ program, and menu as a C program. # They are linked as C++ code to the executable qconf +# hostcc-option +# Usage: cflags-y += $(call hostcc-option,-march=winchip-c6,-march=i586) + +hostcc-option = $(call try-run,\ + $(HOSTCC) $(HOSTCFLAGS) $(HOST_EXTRACFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) + __hostprogs := $(sort $(hostprogs-y) $(hostprogs-m)) +host-cshlib := $(sort $(hostlibs-y) $(hostlibs-m)) +host-cxxshlib := $(sort $(hostcxxlibs-y) $(hostcxxlibs-m)) # C code # Executables compiled from a single .c file @@ -42,6 +50,10 @@ host-cxxmulti := $(foreach m,$(__hostprogs),$(if $($(m)-cxxobjs),$(m))) # C++ Object (.o) files compiled from .cc files host-cxxobjs := $(sort $(foreach m,$(host-cxxmulti),$($(m)-cxxobjs))) +# Object (.o) files used by the shared libaries +host-cshobjs := $(sort $(foreach m,$(host-cshlib),$($(m:.so=-objs)))) +host-cxxshobjs := $(sort $(foreach m,$(host-cxxshlib),$($(m:.so=-objs)))) + # output directory for programs/.o files # hostprogs-y := tools/build may have been specified. # Retrieve also directory of .o files from prog-objs or prog-cxxobjs notation @@ -56,6 +68,10 @@ host-cmulti := $(addprefix $(obj)/,$(host-cmulti)) host-cobjs := $(addprefix $(obj)/,$(host-cobjs)) host-cxxmulti := $(addprefix $(obj)/,$(host-cxxmulti)) host-cxxobjs := $(addprefix $(obj)/,$(host-cxxobjs)) +host-cshlib := $(addprefix $(obj)/,$(host-cshlib)) +host-cxxshlib := $(addprefix $(obj)/,$(host-cxxshlib)) +host-cshobjs := $(addprefix $(obj)/,$(host-cshobjs)) +host-cxxshobjs := $(addprefix $(obj)/,$(host-cxxshobjs)) host-objdirs := $(addprefix $(obj)/,$(host-objdirs)) obj-dirs += $(host-objdirs) @@ -124,5 +140,42 @@ quiet_cmd_host-cxxobjs = HOSTCXX $@ $(host-cxxobjs): $(obj)/%.o: $(src)/%.cc FORCE $(call if_changed_dep,host-cxxobjs) +# Compile .c file, create position independent .o file +# host-cshobjs -> .o +quiet_cmd_host-cshobjs = HOSTCC -fPIC $@ + cmd_host-cshobjs = $(HOSTCC) $(hostc_flags) -fPIC -c -o $@ $< +$(host-cshobjs): $(obj)/%.o: $(src)/%.c FORCE + $(call if_changed_dep,host-cshobjs) + +# Compile .c file, create position independent .o file +# Note that plugin capable gcc versions can be either C or C++ based +# therefore plugin source files have to be compilable in both C and C++ mode. +# This is why a C++ compiler is invoked on a .c file. +# host-cxxshobjs -> .o +quiet_cmd_host-cxxshobjs = HOSTCXX -fPIC $@ + cmd_host-cxxshobjs = $(HOSTCXX) $(hostcxx_flags) -fPIC -c -o $@ $< +$(host-cxxshobjs): $(obj)/%.o: $(src)/%.c FORCE + $(call if_changed_dep,host-cxxshobjs) + +# Link a shared library, based on position independent .o files +# *.o -> .so shared library (host-cshlib) +quiet_cmd_host-cshlib = HOSTLLD -shared $@ + cmd_host-cshlib = $(HOSTCC) $(HOSTLDFLAGS) -shared -o $@ \ + $(addprefix $(obj)/,$($(@F:.so=-objs))) \ + $(HOST_LOADLIBES) $(HOSTLOADLIBES_$(@F)) +$(host-cshlib): FORCE + $(call if_changed,host-cshlib) +$(call multi_depend, $(host-cshlib), .so, -objs) + +# Link a shared library, based on position independent .o files +# *.o -> .so shared library (host-cxxshlib) +quiet_cmd_host-cxxshlib = HOSTLLD -shared $@ + cmd_host-cxxshlib = $(HOSTCXX) $(HOSTLDFLAGS) -shared -o $@ \ + $(addprefix $(obj)/,$($(@F:.so=-objs))) \ + $(HOST_LOADLIBES) $(HOSTLOADLIBES_$(@F)) +$(host-cxxshlib): FORCE + $(call if_changed,host-cxxshlib) +$(call multi_depend, $(host-cxxshlib), .so, -objs) + targets += $(host-csingle) $(host-cmulti) $(host-cobjs)\ - $(host-cxxmulti) $(host-cxxobjs) + $(host-cxxmulti) $(host-cxxobjs) $(host-cshlib) $(host-cshobjs) $(host-cxxshlib) $(host-cxxshobjs) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index e7df0f5db7ec..e89c214745eb 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -155,9 +155,10 @@ else # $(call addtree,-I$(obj)) locates .h files in srctree, from generated .c files # and locates generated .h files # FIXME: Replace both with specific CFLAGS* statements in the makefiles -__c_flags = $(call addtree,-I$(obj)) $(call flags,_c_flags) -__a_flags = $(call flags,_a_flags) -__cpp_flags = $(call flags,_cpp_flags) +__c_flags = $(if $(obj),-I$(srctree)/$(src) -I$(obj)) \ + $(call flags,_c_flags) +__a_flags = $(call flags,_a_flags) +__cpp_flags = $(call flags,_cpp_flags) endif c_flags = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \ diff --git a/scripts/basic/bin2c.c b/scripts/basic/bin2c.c index af187e695345..c3d7eef3ad06 100644 --- a/scripts/basic/bin2c.c +++ b/scripts/basic/bin2c.c @@ -29,7 +29,8 @@ int main(int argc, char *argv[]) } while (ch != EOF); if (argc > 1) - printf("\t;\n\nconst int %s_size = %d;\n", argv[1], total); + printf("\t;\n\n#include \n\nconst size_t %s_size = %d;\n", + argv[1], total); return 0; } diff --git a/scripts/coccicheck b/scripts/coccicheck index dd85a455b2ba..c92c1528a54d 100755 --- a/scripts/coccicheck +++ b/scripts/coccicheck @@ -1,14 +1,24 @@ #!/bin/bash - +# Linux kernel coccicheck +# +# Read Documentation/coccinelle.txt # # This script requires at least spatch # version 1.0.0-rc11. -# +DIR="$(dirname $(readlink -f $0))/.." SPATCH="`which ${SPATCH:=spatch}`" -trap kill_running SIGTERM SIGINT -declare -a SPATCH_PID +if [ ! -x "$SPATCH" ]; then + echo 'spatch is part of the Coccinelle project and is available at http://coccinelle.lip6.fr/' + exit 1 +fi + +SPATCH_VERSION=$($SPATCH --version | head -1 | awk '{print $3}') +SPATCH_VERSION_NUM=$(echo $SPATCH_VERSION | ${DIR}/scripts/ld-version.sh) + +USE_JOBS="no" +$SPATCH --help | grep "\-\-jobs" > /dev/null && USE_JOBS="yes" # The verbosity may be set by the environmental parameter V= # as for example with 'make V=1 coccicheck' @@ -25,7 +35,28 @@ else NPROC="$J" fi -FLAGS="$SPFLAGS --very-quiet" +FLAGS="--very-quiet" + +# You can use SPFLAGS to append extra arguments to coccicheck or override any +# heuristics done in this file as Coccinelle accepts the last options when +# options conflict. +# +# A good example for use of SPFLAGS is if you want to debug your cocci script, +# you can for instance use the following: +# +# $ export COCCI=scripts/coccinelle/misc/irqf_oneshot.cocci +# $ make coccicheck MODE=report DEBUG_FILE="all.err" SPFLAGS="--profile --show-trying" M=./drivers/mfd/arizona-irq.c +# +# "--show-trying" should show you what rule is being processed as it goes to +# stdout, you do not need a debug file for that. The profile output will be +# be sent to stdout, if you provide a DEBUG_FILE the profiling data can be +# inspected there. +# +# --profile will not output if --very-quiet is used, so avoid it. +echo $SPFLAGS | egrep -e "--profile|--show-trying" 2>&1 > /dev/null +if [ $? -eq 0 ]; then + FLAGS="--quiet" +fi # spatch only allows include directories with the syntax "-I include" # while gcc also allows "-Iinclude" and "-include include" @@ -51,9 +82,14 @@ if [ "$KBUILD_EXTMOD" != "" ] ; then OPTIONS="--patch $srctree $OPTIONS" fi -if [ ! -x "$SPATCH" ]; then - echo 'spatch is part of the Coccinelle project and is available at http://coccinelle.lip6.fr/' - exit 1 +# You can override by using SPFLAGS +if [ "$USE_JOBS" = "no" ]; then + trap kill_running SIGTERM SIGINT + declare -a SPATCH_PID +elif [ "$NPROC" != "1" ]; then + # Using 0 should work as well, refer to _SC_NPROCESSORS_ONLN use on + # https://github.com/rdicosmo/parmap/blob/master/setcore_stubs.c + OPTIONS="$OPTIONS --jobs $NPROC --chunksize 1" fi if [ "$MODE" = "" ] ; then @@ -72,7 +108,7 @@ if [ "$MODE" = "chain" ] ; then echo 'All available modes will be tried (in that order): patch, report, context, org' fi elif [ "$MODE" = "report" -o "$MODE" = "org" ] ; then - FLAGS="$FLAGS --no-show-diff" + FLAGS="--no-show-diff $FLAGS" fi if [ "$ONLINE" = "0" ] ; then @@ -82,7 +118,26 @@ if [ "$ONLINE" = "0" ] ; then echo '' fi -run_cmd() { +run_cmd_parmap() { + if [ $VERBOSE -ne 0 ] ; then + echo "Running ($NPROC in parallel): $@" + fi + if [ "$DEBUG_FILE" != "/dev/null" -a "$DEBUG_FILE" != "" ]; then + if [ -f $DEBUG_FILE ]; then + echo "Debug file $DEBUG_FILE exists, bailing" + exit + fi + else + DEBUG_FILE="/dev/null" + fi + $@ 2>$DEBUG_FILE + if [[ $? -ne 0 ]]; then + echo "coccicheck failed" + exit $? + fi +} + +run_cmd_old() { local i if [ $VERBOSE -ne 0 ] ; then echo "Running ($NPROC in parallel): $@" @@ -97,6 +152,14 @@ run_cmd() { wait } +run_cmd() { + if [ "$USE_JOBS" = "yes" ]; then + run_cmd_parmap $@ + else + run_cmd_old $@ + fi +} + kill_running() { for i in $(seq 0 $(( NPROC - 1 )) ); do if [ $VERBOSE -eq 2 ] ; then @@ -106,10 +169,23 @@ kill_running() { done } +# You can override heuristics with SPFLAGS, these must always go last +OPTIONS="$OPTIONS $SPFLAGS" + coccinelle () { COCCI="$1" OPT=`grep "Option" $COCCI | cut -d':' -f2` + REQ=`grep "Requires" $COCCI | cut -d':' -f2 | sed "s| ||"` + REQ_NUM=$(echo $REQ | ${DIR}/scripts/ld-version.sh) + if [ "$REQ_NUM" != "0" ] ; then + if [ "$SPATCH_VERSION_NUM" -lt "$REQ_NUM" ] ; then + echo "Skipping coccinele SmPL patch: $COCCI" + echo "You have coccinelle: $SPATCH_VERSION" + echo "This SmPL patch requires: $REQ" + return + fi + fi # The option '--parse-cocci' can be used to syntactically check the SmPL files. # diff --git a/scripts/coccinelle/free/devm_free.cocci b/scripts/coccinelle/free/devm_free.cocci index 3d9349012bb3..c990d2c7ee16 100644 --- a/scripts/coccinelle/free/devm_free.cocci +++ b/scripts/coccinelle/free/devm_free.cocci @@ -29,7 +29,23 @@ expression x; @@ ( + x = devm_kmalloc(...) +| + x = devm_kvasprintf(...) +| + x = devm_kasprintf(...) +| x = devm_kzalloc(...) +| + x = devm_kmalloc_array(...) +| + x = devm_kcalloc(...) +| + x = devm_kstrdup(...) +| + x = devm_kmemdup(...) +| + x = devm_get_free_pages(...) | x = devm_request_irq(...) | @@ -48,6 +64,16 @@ position p; ( * kfree@p(x) | +* kzfree@p(x) +| +* __krealloc@p(x, ...) +| +* krealloc@p(x, ...) +| +* free_pages@p(x, ...) +| +* free_page@p(x) +| * free_irq@p(x) | * iounmap@p(x) diff --git a/scripts/coccinelle/free/ifnullfree.cocci b/scripts/coccinelle/free/ifnullfree.cocci index 52bd235286fa..14a4cd98e83b 100644 --- a/scripts/coccinelle/free/ifnullfree.cocci +++ b/scripts/coccinelle/free/ifnullfree.cocci @@ -19,6 +19,8 @@ expression E; - if (E != NULL) ( kfree(E); +| + kzfree(E); | debugfs_remove(E); | @@ -39,7 +41,7 @@ position p; @@ * if (E != NULL) -* \(kfree@p\|debugfs_remove@p\|debugfs_remove_recursive@p\| +* \(kfree@p\|kzfree@p\|debugfs_remove@p\|debugfs_remove_recursive@p\| * usb_free_urb@p\|kmem_cache_destroy@p\|mempool_destroy@p\| * dma_pool_destroy@p\)(E); diff --git a/scripts/coccinelle/free/kfree.cocci b/scripts/coccinelle/free/kfree.cocci index 577b78056990..ac438da4fd7b 100644 --- a/scripts/coccinelle/free/kfree.cocci +++ b/scripts/coccinelle/free/kfree.cocci @@ -20,7 +20,11 @@ expression E; position p1; @@ -kfree@p1(E) +( +* kfree@p1(E) +| +* kzfree@p1(E) +) @print expression@ constant char [] c; @@ -60,7 +64,11 @@ position ok; @@ while (1) { ... - kfree@ok(E) +( +* kfree@ok(E) +| +* kzfree@ok(E) +) ... when != break; when != goto l; when forall @@ -74,7 +82,11 @@ statement S; position free.p1!=loop.ok,p2!={print.p,sz.p}; @@ -kfree@p1(E,...) +( +* kfree@p1(E,...) +| +* kzfree@p1(E,...) +) ... ( iter(...,subE,...) S // no use diff --git a/scripts/coccinelle/free/kfreeaddr.cocci b/scripts/coccinelle/free/kfreeaddr.cocci index ce8aacc314cb..d46063b1db8b 100644 --- a/scripts/coccinelle/free/kfreeaddr.cocci +++ b/scripts/coccinelle/free/kfreeaddr.cocci @@ -16,7 +16,11 @@ identifier f; position p; @@ +( * kfree@p(&e->f) +| +* kzfree@p(&e->f) +) @script:python depends on org@ p << r.p; @@ -28,5 +32,5 @@ cocci.print_main("kfree",p) p << r.p; @@ -msg = "ERROR: kfree of structure field" +msg = "ERROR: invalid free of structure field" coccilib.report.print_report(p[0],msg) diff --git a/scripts/coccinelle/iterators/device_node_continue.cocci b/scripts/coccinelle/iterators/device_node_continue.cocci index 38ab744a4037..a36c16db171b 100644 --- a/scripts/coccinelle/iterators/device_node_continue.cocci +++ b/scripts/coccinelle/iterators/device_node_continue.cocci @@ -5,8 +5,11 @@ // Copyright: (C) 2015 Julia Lawall, Inria. GPLv2. // URL: http://coccinelle.lip6.fr/ // Options: --no-includes --include-headers +// Requires: 1.0.4 // Keywords: for_each_child_of_node, etc. +// This uses a conjunction, which requires at least coccinelle >= 1.0.4 + virtual patch virtual context virtual org diff --git a/scripts/coccinelle/misc/noderef.cocci b/scripts/coccinelle/misc/noderef.cocci index 80a831c91161..007f0de0c715 100644 --- a/scripts/coccinelle/misc/noderef.cocci +++ b/scripts/coccinelle/misc/noderef.cocci @@ -16,6 +16,7 @@ virtual patch @depends on patch@ expression *x; expression f; +expression i; type T; @@ @@ -30,15 +31,26 @@ f(...,(T)(x),...,sizeof( + *x ),...) | -f(...,sizeof(x),...,(T)( +f(...,sizeof( +- x ++ *x + ),...,(T)(x),...) +| +f(...,(T)(x),...,i*sizeof( - x + *x ),...) +| +f(...,i*sizeof( +- x ++ *x + ),...,(T)(x),...) ) @r depends on !patch@ expression *x; expression f; +expression i; position p; type T; @@ @@ -49,6 +61,10 @@ type T; *f(...,(T)(x),...,sizeof@p(x),...) | *f(...,sizeof@p(x),...,(T)(x),...) +| +*f(...,(T)(x),...,i*sizeof@p(x),...) +| +*f(...,i*sizeof@p(x),...,(T)(x),...) ) @script:python depends on org@ diff --git a/scripts/gcc-plugin.sh b/scripts/gcc-plugin.sh new file mode 100755 index 000000000000..fb9207565471 --- /dev/null +++ b/scripts/gcc-plugin.sh @@ -0,0 +1,51 @@ +#!/bin/sh +srctree=$(dirname "$0") +gccplugins_dir=$($3 -print-file-name=plugin) +plugincc=$($1 -E -x c++ - -o /dev/null -I"${srctree}"/gcc-plugins -I"${gccplugins_dir}"/include 2>&1 <= 4008 || defined(ENABLE_BUILD_WITH_CXX) +#warning $2 CXX +#else +#warning $1 CC +#endif +EOF +) + +if [ $? -ne 0 ] +then + exit 1 +fi + +case "$plugincc" in + *"$1 CC"*) + echo "$1" + exit 0 + ;; + + *"$2 CXX"*) + # the c++ compiler needs another test, see below + ;; + + *) + exit 1 + ;; +esac + +# we need a c++ compiler that supports the designated initializer GNU extension +plugincc=$($2 -c -x c++ -std=gnu++98 - -fsyntax-only -I"${srctree}"/gcc-plugins -I"${gccplugins_dir}"/include 2>&1 < + * Licensed under the GPL v2, or (at your option) v3 + * + * Homepage: + * https://github.com/ephox-gcc-plugins/cyclomatic_complexity + * + * http://en.wikipedia.org/wiki/Cyclomatic_complexity + * The complexity M is then defined as: + * M = E - N + 2P + * where + * + * E = the number of edges of the graph + * N = the number of nodes of the graph + * P = the number of connected components (exit nodes). + * + * Usage (4.5 - 5): + * $ make clean; make run + */ + +#include "gcc-common.h" + +int plugin_is_GPL_compatible; + +static struct plugin_info cyc_complexity_plugin_info = { + .version = "20160225", + .help = "Cyclomatic Complexity\n", +}; + +static unsigned int cyc_complexity_execute(void) +{ + int complexity; + expanded_location xloc; + + /* M = E - N + 2P */ + complexity = n_edges_for_fn(cfun) - n_basic_blocks_for_fn(cfun) + 2; + + xloc = expand_location(DECL_SOURCE_LOCATION(current_function_decl)); + fprintf(stderr, "Cyclomatic Complexity %d %s:%s\n", complexity, + xloc.file, DECL_NAME_POINTER(current_function_decl)); + + return 0; +} + +#define PASS_NAME cyc_complexity + +#define NO_GATE +#define TODO_FLAGS_FINISH TODO_dump_func + +#include "gcc-generate-gimple-pass.h" + +int plugin_init(struct plugin_name_args *plugin_info, struct plugin_gcc_version *version) +{ + const char * const plugin_name = plugin_info->base_name; + struct register_pass_info cyc_complexity_pass_info; + + cyc_complexity_pass_info.pass = make_cyc_complexity_pass(); + cyc_complexity_pass_info.reference_pass_name = "ssa"; + cyc_complexity_pass_info.ref_pass_instance_number = 1; + cyc_complexity_pass_info.pos_op = PASS_POS_INSERT_AFTER; + + if (!plugin_default_version_check(version, &gcc_version)) { + error(G_("incompatible gcc/plugin versions")); + return 1; + } + + register_callback(plugin_name, PLUGIN_INFO, NULL, + &cyc_complexity_plugin_info); + register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, + &cyc_complexity_pass_info); + + return 0; +} diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h new file mode 100644 index 000000000000..172850bcd0d9 --- /dev/null +++ b/scripts/gcc-plugins/gcc-common.h @@ -0,0 +1,830 @@ +#ifndef GCC_COMMON_H_INCLUDED +#define GCC_COMMON_H_INCLUDED + +#include "bversion.h" +#if BUILDING_GCC_VERSION >= 6000 +#include "gcc-plugin.h" +#else +#include "plugin.h" +#endif +#include "plugin-version.h" +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "line-map.h" +#include "input.h" +#include "tree.h" + +#include "tree-inline.h" +#include "version.h" +#include "rtl.h" +#include "tm_p.h" +#include "flags.h" +#include "hard-reg-set.h" +#include "output.h" +#include "except.h" +#include "function.h" +#include "toplev.h" +#include "basic-block.h" +#include "intl.h" +#include "ggc.h" +#include "timevar.h" + +#include "params.h" + +#if BUILDING_GCC_VERSION <= 4009 +#include "pointer-set.h" +#else +#include "hash-map.h" +#endif + +#include "emit-rtl.h" +#include "debug.h" +#include "target.h" +#include "langhooks.h" +#include "cfgloop.h" +#include "cgraph.h" +#include "opts.h" + +#if BUILDING_GCC_VERSION == 4005 +#include +#endif + +#if BUILDING_GCC_VERSION >= 4007 +#include "tree-pretty-print.h" +#include "gimple-pretty-print.h" +#endif + +#if BUILDING_GCC_VERSION >= 4006 +#include "c-family/c-common.h" +#else +#include "c-common.h" +#endif + +#if BUILDING_GCC_VERSION <= 4008 +#include "tree-flow.h" +#else +#include "tree-cfgcleanup.h" +#include "tree-ssa-operands.h" +#include "tree-into-ssa.h" +#endif + +#if BUILDING_GCC_VERSION >= 4008 +#include "is-a.h" +#endif + +#include "diagnostic.h" +#include "tree-dump.h" +#include "tree-pass.h" +#include "predict.h" +#include "ipa-utils.h" + +#if BUILDING_GCC_VERSION >= 4009 +#include "attribs.h" +#include "varasm.h" +#include "stor-layout.h" +#include "internal-fn.h" +#include "gimple-expr.h" +#include "gimple-fold.h" +#include "context.h" +#include "tree-ssa-alias.h" +#include "tree-ssa.h" +#include "stringpool.h" +#include "tree-ssanames.h" +#include "print-tree.h" +#include "tree-eh.h" +#include "stmt.h" +#include "gimplify.h" +#endif + +#include "gimple.h" + +#if BUILDING_GCC_VERSION >= 4009 +#include "tree-ssa-operands.h" +#include "tree-phinodes.h" +#include "tree-cfg.h" +#include "gimple-iterator.h" +#include "gimple-ssa.h" +#include "ssa-iterators.h" +#endif + +#if BUILDING_GCC_VERSION >= 5000 +#include "builtins.h" +#endif + +/* #include "expr.h" where are you... */ +extern rtx emit_move_insn(rtx x, rtx y); + +/* missing from basic_block.h... */ +extern void debug_dominance_info(enum cdi_direction dir); +extern void debug_dominance_tree(enum cdi_direction dir, basic_block root); + +#if BUILDING_GCC_VERSION == 4006 +extern void debug_gimple_stmt(gimple); +extern void debug_gimple_seq(gimple_seq); +extern void print_gimple_seq(FILE *, gimple_seq, int, int); +extern void print_gimple_stmt(FILE *, gimple, int, int); +extern void print_gimple_expr(FILE *, gimple, int, int); +extern void dump_gimple_stmt(pretty_printer *, gimple, int, int); +#endif + +#define __unused __attribute__((__unused__)) + +#define DECL_NAME_POINTER(node) IDENTIFIER_POINTER(DECL_NAME(node)) +#define DECL_NAME_LENGTH(node) IDENTIFIER_LENGTH(DECL_NAME(node)) +#define TYPE_NAME_POINTER(node) IDENTIFIER_POINTER(TYPE_NAME(node)) +#define TYPE_NAME_LENGTH(node) IDENTIFIER_LENGTH(TYPE_NAME(node)) + +/* should come from c-tree.h if only it were installed for gcc 4.5... */ +#define C_TYPE_FIELDS_READONLY(TYPE) TREE_LANG_FLAG_1(TYPE) + +#if BUILDING_GCC_VERSION == 4005 +#define FOR_EACH_LOCAL_DECL(FUN, I, D) \ + for (tree vars = (FUN)->local_decls, (I) = 0; \ + vars && ((D) = TREE_VALUE(vars)); \ + vars = TREE_CHAIN(vars), (I)++) +#define DECL_CHAIN(NODE) (TREE_CHAIN(DECL_MINIMAL_CHECK(NODE))) +#define FOR_EACH_VEC_ELT(T, V, I, P) \ + for (I = 0; VEC_iterate(T, (V), (I), (P)); ++(I)) +#define TODO_rebuild_cgraph_edges 0 +#define SCOPE_FILE_SCOPE_P(EXP) (!(EXP)) + +#ifndef O_BINARY +#define O_BINARY 0 +#endif + +typedef struct varpool_node *varpool_node_ptr; + +static inline bool gimple_call_builtin_p(gimple stmt, enum built_in_function code) +{ + tree fndecl; + + if (!is_gimple_call(stmt)) + return false; + fndecl = gimple_call_fndecl(stmt); + if (!fndecl || DECL_BUILT_IN_CLASS(fndecl) != BUILT_IN_NORMAL) + return false; + return DECL_FUNCTION_CODE(fndecl) == code; +} + +static inline bool is_simple_builtin(tree decl) +{ + if (decl && DECL_BUILT_IN_CLASS(decl) != BUILT_IN_NORMAL) + return false; + + switch (DECL_FUNCTION_CODE(decl)) { + /* Builtins that expand to constants. */ + case BUILT_IN_CONSTANT_P: + case BUILT_IN_EXPECT: + case BUILT_IN_OBJECT_SIZE: + case BUILT_IN_UNREACHABLE: + /* Simple register moves or loads from stack. */ + case BUILT_IN_RETURN_ADDRESS: + case BUILT_IN_EXTRACT_RETURN_ADDR: + case BUILT_IN_FROB_RETURN_ADDR: + case BUILT_IN_RETURN: + case BUILT_IN_AGGREGATE_INCOMING_ADDRESS: + case BUILT_IN_FRAME_ADDRESS: + case BUILT_IN_VA_END: + case BUILT_IN_STACK_SAVE: + case BUILT_IN_STACK_RESTORE: + /* Exception state returns or moves registers around. */ + case BUILT_IN_EH_FILTER: + case BUILT_IN_EH_POINTER: + case BUILT_IN_EH_COPY_VALUES: + return true; + + default: + return false; + } +} + +static inline void add_local_decl(struct function *fun, tree d) +{ + gcc_assert(TREE_CODE(d) == VAR_DECL); + fun->local_decls = tree_cons(NULL_TREE, d, fun->local_decls); +} +#endif + +#if BUILDING_GCC_VERSION <= 4006 +#define ANY_RETURN_P(rtx) (GET_CODE(rtx) == RETURN) +#define C_DECL_REGISTER(EXP) DECL_LANG_FLAG_4(EXP) +#define EDGE_PRESERVE 0ULL +#define HOST_WIDE_INT_PRINT_HEX_PURE "%" HOST_WIDE_INT_PRINT "x" +#define flag_fat_lto_objects true + +#define get_random_seed(noinit) ({ \ + unsigned HOST_WIDE_INT seed; \ + sscanf(get_random_seed(noinit), "%" HOST_WIDE_INT_PRINT "x", &seed); \ + seed * seed; }) + +#define int_const_binop(code, arg1, arg2) \ + int_const_binop((code), (arg1), (arg2), 0) + +static inline bool gimple_clobber_p(gimple s __unused) +{ + return false; +} + +static inline bool gimple_asm_clobbers_memory_p(const_gimple stmt) +{ + unsigned i; + + for (i = 0; i < gimple_asm_nclobbers(stmt); i++) { + tree op = gimple_asm_clobber_op(stmt, i); + + if (!strcmp(TREE_STRING_POINTER(TREE_VALUE(op)), "memory")) + return true; + } + + return false; +} + +static inline tree builtin_decl_implicit(enum built_in_function fncode) +{ + return implicit_built_in_decls[fncode]; +} + +static inline int ipa_reverse_postorder(struct cgraph_node **order) +{ + return cgraph_postorder(order); +} + +static inline struct cgraph_node *cgraph_create_node(tree decl) +{ + return cgraph_node(decl); +} + +static inline struct cgraph_node *cgraph_get_create_node(tree decl) +{ + struct cgraph_node *node = cgraph_get_node(decl); + + return node ? node : cgraph_node(decl); +} + +static inline bool cgraph_function_with_gimple_body_p(struct cgraph_node *node) +{ + return node->analyzed && !node->thunk.thunk_p && !node->alias; +} + +static inline struct cgraph_node *cgraph_first_function_with_gimple_body(void) +{ + struct cgraph_node *node; + + for (node = cgraph_nodes; node; node = node->next) + if (cgraph_function_with_gimple_body_p(node)) + return node; + return NULL; +} + +static inline struct cgraph_node *cgraph_next_function_with_gimple_body(struct cgraph_node *node) +{ + for (node = node->next; node; node = node->next) + if (cgraph_function_with_gimple_body_p(node)) + return node; + return NULL; +} + +#define FOR_EACH_FUNCTION_WITH_GIMPLE_BODY(node) \ + for ((node) = cgraph_first_function_with_gimple_body(); (node); \ + (node) = cgraph_next_function_with_gimple_body(node)) + +static inline void varpool_add_new_variable(tree decl) +{ + varpool_finalize_decl(decl); +} +#endif + +#if BUILDING_GCC_VERSION <= 4007 +#define FOR_EACH_FUNCTION(node) \ + for (node = cgraph_nodes; node; node = node->next) +#define FOR_EACH_VARIABLE(node) \ + for (node = varpool_nodes; node; node = node->next) +#define PROP_loops 0 +#define NODE_SYMBOL(node) (node) +#define NODE_DECL(node) (node)->decl +#define INSN_LOCATION(INSN) RTL_LOCATION(INSN) +#define vNULL NULL + +static inline int bb_loop_depth(const_basic_block bb) +{ + return bb->loop_father ? loop_depth(bb->loop_father) : 0; +} + +static inline bool gimple_store_p(gimple gs) +{ + tree lhs = gimple_get_lhs(gs); + + return lhs && !is_gimple_reg(lhs); +} + +static inline void gimple_init_singleton(gimple g __unused) +{ +} +#endif + +#if BUILDING_GCC_VERSION == 4007 || BUILDING_GCC_VERSION == 4008 +static inline struct cgraph_node *cgraph_alias_target(struct cgraph_node *n) +{ + return cgraph_alias_aliased_node(n); +} +#endif + +#if BUILDING_GCC_VERSION >= 4007 && BUILDING_GCC_VERSION <= 4009 +#define cgraph_create_edge(caller, callee, call_stmt, count, freq, nest) \ + cgraph_create_edge((caller), (callee), (call_stmt), (count), (freq)) +#define cgraph_create_edge_including_clones(caller, callee, old_call_stmt, call_stmt, count, freq, nest, reason) \ + cgraph_create_edge_including_clones((caller), (callee), (old_call_stmt), (call_stmt), (count), (freq), (reason)) +#endif + +#if BUILDING_GCC_VERSION <= 4008 +#define ENTRY_BLOCK_PTR_FOR_FN(FN) ENTRY_BLOCK_PTR_FOR_FUNCTION(FN) +#define EXIT_BLOCK_PTR_FOR_FN(FN) EXIT_BLOCK_PTR_FOR_FUNCTION(FN) +#define basic_block_info_for_fn(FN) ((FN)->cfg->x_basic_block_info) +#define n_basic_blocks_for_fn(FN) ((FN)->cfg->x_n_basic_blocks) +#define n_edges_for_fn(FN) ((FN)->cfg->x_n_edges) +#define last_basic_block_for_fn(FN) ((FN)->cfg->x_last_basic_block) +#define label_to_block_map_for_fn(FN) ((FN)->cfg->x_label_to_block_map) +#define profile_status_for_fn(FN) ((FN)->cfg->x_profile_status) +#define BASIC_BLOCK_FOR_FN(FN, N) BASIC_BLOCK_FOR_FUNCTION((FN), (N)) +#define NODE_IMPLICIT_ALIAS(node) (node)->same_body_alias +#define VAR_P(NODE) (TREE_CODE(NODE) == VAR_DECL) + +static inline bool tree_fits_shwi_p(const_tree t) +{ + if (t == NULL_TREE || TREE_CODE(t) != INTEGER_CST) + return false; + + if (TREE_INT_CST_HIGH(t) == 0 && (HOST_WIDE_INT)TREE_INT_CST_LOW(t) >= 0) + return true; + + if (TREE_INT_CST_HIGH(t) == -1 && (HOST_WIDE_INT)TREE_INT_CST_LOW(t) < 0 && !TYPE_UNSIGNED(TREE_TYPE(t))) + return true; + + return false; +} + +static inline bool tree_fits_uhwi_p(const_tree t) +{ + if (t == NULL_TREE || TREE_CODE(t) != INTEGER_CST) + return false; + + return TREE_INT_CST_HIGH(t) == 0; +} + +static inline HOST_WIDE_INT tree_to_shwi(const_tree t) +{ + gcc_assert(tree_fits_shwi_p(t)); + return TREE_INT_CST_LOW(t); +} + +static inline unsigned HOST_WIDE_INT tree_to_uhwi(const_tree t) +{ + gcc_assert(tree_fits_uhwi_p(t)); + return TREE_INT_CST_LOW(t); +} + +static inline const char *get_tree_code_name(enum tree_code code) +{ + gcc_assert(code < MAX_TREE_CODES); + return tree_code_name[code]; +} + +#define ipa_remove_stmt_references(cnode, stmt) + +typedef union gimple_statement_d gasm; +typedef union gimple_statement_d gassign; +typedef union gimple_statement_d gcall; +typedef union gimple_statement_d gcond; +typedef union gimple_statement_d gdebug; +typedef union gimple_statement_d gphi; +typedef union gimple_statement_d greturn; + +static inline gasm *as_a_gasm(gimple stmt) +{ + return stmt; +} + +static inline const gasm *as_a_const_gasm(const_gimple stmt) +{ + return stmt; +} + +static inline gassign *as_a_gassign(gimple stmt) +{ + return stmt; +} + +static inline const gassign *as_a_const_gassign(const_gimple stmt) +{ + return stmt; +} + +static inline gcall *as_a_gcall(gimple stmt) +{ + return stmt; +} + +static inline const gcall *as_a_const_gcall(const_gimple stmt) +{ + return stmt; +} + +static inline gcond *as_a_gcond(gimple stmt) +{ + return stmt; +} + +static inline const gcond *as_a_const_gcond(const_gimple stmt) +{ + return stmt; +} + +static inline gdebug *as_a_gdebug(gimple stmt) +{ + return stmt; +} + +static inline const gdebug *as_a_const_gdebug(const_gimple stmt) +{ + return stmt; +} + +static inline gphi *as_a_gphi(gimple stmt) +{ + return stmt; +} + +static inline const gphi *as_a_const_gphi(const_gimple stmt) +{ + return stmt; +} + +static inline greturn *as_a_greturn(gimple stmt) +{ + return stmt; +} + +static inline const greturn *as_a_const_greturn(const_gimple stmt) +{ + return stmt; +} +#endif + +#if BUILDING_GCC_VERSION == 4008 +#define NODE_SYMBOL(node) (&(node)->symbol) +#define NODE_DECL(node) (node)->symbol.decl +#endif + +#if BUILDING_GCC_VERSION >= 4008 +#define add_referenced_var(var) +#define mark_sym_for_renaming(var) +#define varpool_mark_needed_node(node) +#define create_var_ann(var) +#define TODO_dump_func 0 +#define TODO_dump_cgraph 0 +#endif + +#if BUILDING_GCC_VERSION <= 4009 +#define TODO_verify_il 0 +#define AVAIL_INTERPOSABLE AVAIL_OVERWRITABLE + +#define section_name_prefix LTO_SECTION_NAME_PREFIX +#define fatal_error(loc, gmsgid, ...) fatal_error((gmsgid), __VA_ARGS__) + +typedef struct rtx_def rtx_insn; + +static inline void set_decl_section_name(tree node, const char *value) +{ + if (value) + DECL_SECTION_NAME(node) = build_string(strlen(value) + 1, value); + else + DECL_SECTION_NAME(node) = NULL; +} +#endif + +#if BUILDING_GCC_VERSION == 4009 +typedef struct gimple_statement_asm gasm; +typedef struct gimple_statement_base gassign; +typedef struct gimple_statement_call gcall; +typedef struct gimple_statement_base gcond; +typedef struct gimple_statement_base gdebug; +typedef struct gimple_statement_phi gphi; +typedef struct gimple_statement_base greturn; + +static inline gasm *as_a_gasm(gimple stmt) +{ + return as_a(stmt); +} + +static inline const gasm *as_a_const_gasm(const_gimple stmt) +{ + return as_a(stmt); +} + +static inline gassign *as_a_gassign(gimple stmt) +{ + return stmt; +} + +static inline const gassign *as_a_const_gassign(const_gimple stmt) +{ + return stmt; +} + +static inline gcall *as_a_gcall(gimple stmt) +{ + return as_a(stmt); +} + +static inline const gcall *as_a_const_gcall(const_gimple stmt) +{ + return as_a(stmt); +} + +static inline gcond *as_a_gcond(gimple stmt) +{ + return stmt; +} + +static inline const gcond *as_a_const_gcond(const_gimple stmt) +{ + return stmt; +} + +static inline gdebug *as_a_gdebug(gimple stmt) +{ + return stmt; +} + +static inline const gdebug *as_a_const_gdebug(const_gimple stmt) +{ + return stmt; +} + +static inline gphi *as_a_gphi(gimple stmt) +{ + return as_a(stmt); +} + +static inline const gphi *as_a_const_gphi(const_gimple stmt) +{ + return as_a(stmt); +} + +static inline greturn *as_a_greturn(gimple stmt) +{ + return stmt; +} + +static inline const greturn *as_a_const_greturn(const_gimple stmt) +{ + return stmt; +} +#endif + +#if BUILDING_GCC_VERSION >= 4009 +#define TODO_ggc_collect 0 +#define NODE_SYMBOL(node) (node) +#define NODE_DECL(node) (node)->decl +#define cgraph_node_name(node) (node)->name() +#define NODE_IMPLICIT_ALIAS(node) (node)->cpp_implicit_alias +#endif + +#if BUILDING_GCC_VERSION >= 5000 && BUILDING_GCC_VERSION < 6000 +/* gimple related */ +template <> +template <> +inline bool is_a_helper::test(const_gimple gs) +{ + return gs->code == GIMPLE_ASSIGN; +} +#endif + +#if BUILDING_GCC_VERSION >= 5000 +#define TODO_verify_ssa TODO_verify_il +#define TODO_verify_flow TODO_verify_il +#define TODO_verify_stmts TODO_verify_il +#define TODO_verify_rtl_sharing TODO_verify_il + +#define INSN_DELETED_P(insn) (insn)->deleted() + +/* symtab/cgraph related */ +#define debug_cgraph_node(node) (node)->debug() +#define cgraph_get_node(decl) cgraph_node::get(decl) +#define cgraph_get_create_node(decl) cgraph_node::get_create(decl) +#define cgraph_create_node(decl) cgraph_node::create(decl) +#define cgraph_n_nodes symtab->cgraph_count +#define cgraph_max_uid symtab->cgraph_max_uid +#define varpool_get_node(decl) varpool_node::get(decl) + +#define cgraph_create_edge(caller, callee, call_stmt, count, freq, nest) \ + (caller)->create_edge((callee), (call_stmt), (count), (freq)) +#define cgraph_create_edge_including_clones(caller, callee, old_call_stmt, call_stmt, count, freq, nest, reason) \ + (caller)->create_edge_including_clones((callee), (old_call_stmt), (call_stmt), (count), (freq), (reason)) + +typedef struct cgraph_node *cgraph_node_ptr; +typedef struct cgraph_edge *cgraph_edge_p; +typedef struct varpool_node *varpool_node_ptr; + +static inline void change_decl_assembler_name(tree decl, tree name) +{ + symtab->change_decl_assembler_name(decl, name); +} + +static inline void varpool_finalize_decl(tree decl) +{ + varpool_node::finalize_decl(decl); +} + +static inline void varpool_add_new_variable(tree decl) +{ + varpool_node::add(decl); +} + +static inline unsigned int rebuild_cgraph_edges(void) +{ + return cgraph_edge::rebuild_edges(); +} + +static inline cgraph_node_ptr cgraph_function_node(cgraph_node_ptr node, enum availability *availability) +{ + return node->function_symbol(availability); +} + +static inline cgraph_node_ptr cgraph_function_or_thunk_node(cgraph_node_ptr node, enum availability *availability = NULL) +{ + return node->ultimate_alias_target(availability); +} + +static inline bool cgraph_only_called_directly_p(cgraph_node_ptr node) +{ + return node->only_called_directly_p(); +} + +static inline enum availability cgraph_function_body_availability(cgraph_node_ptr node) +{ + return node->get_availability(); +} + +static inline cgraph_node_ptr cgraph_alias_target(cgraph_node_ptr node) +{ + return node->get_alias_target(); +} + +static inline struct cgraph_node_hook_list *cgraph_add_function_insertion_hook(cgraph_node_hook hook, void *data) +{ + return symtab->add_cgraph_insertion_hook(hook, data); +} + +static inline void cgraph_remove_function_insertion_hook(struct cgraph_node_hook_list *entry) +{ + symtab->remove_cgraph_insertion_hook(entry); +} + +static inline struct cgraph_node_hook_list *cgraph_add_node_removal_hook(cgraph_node_hook hook, void *data) +{ + return symtab->add_cgraph_removal_hook(hook, data); +} + +static inline void cgraph_remove_node_removal_hook(struct cgraph_node_hook_list *entry) +{ + symtab->remove_cgraph_removal_hook(entry); +} + +static inline struct cgraph_2node_hook_list *cgraph_add_node_duplication_hook(cgraph_2node_hook hook, void *data) +{ + return symtab->add_cgraph_duplication_hook(hook, data); +} + +static inline void cgraph_remove_node_duplication_hook(struct cgraph_2node_hook_list *entry) +{ + symtab->remove_cgraph_duplication_hook(entry); +} + +static inline void cgraph_call_node_duplication_hooks(cgraph_node_ptr node, cgraph_node_ptr node2) +{ + symtab->call_cgraph_duplication_hooks(node, node2); +} + +static inline void cgraph_call_edge_duplication_hooks(cgraph_edge *cs1, cgraph_edge *cs2) +{ + symtab->call_edge_duplication_hooks(cs1, cs2); +} + +#if BUILDING_GCC_VERSION >= 6000 +typedef gimple *gimple_ptr; +typedef const gimple *const_gimple_ptr; +#define gimple gimple_ptr +#define const_gimple const_gimple_ptr +#undef CONST_CAST_GIMPLE +#define CONST_CAST_GIMPLE(X) CONST_CAST(gimple, (X)) +#endif + +/* gimple related */ +static inline gimple gimple_build_assign_with_ops(enum tree_code subcode, tree lhs, tree op1, tree op2 MEM_STAT_DECL) +{ + return gimple_build_assign(lhs, subcode, op1, op2 PASS_MEM_STAT); +} + +template <> +template <> +inline bool is_a_helper::test(const_gimple gs) +{ + return gs->code == GIMPLE_RETURN; +} + +static inline gasm *as_a_gasm(gimple stmt) +{ + return as_a(stmt); +} + +static inline const gasm *as_a_const_gasm(const_gimple stmt) +{ + return as_a(stmt); +} + +static inline gassign *as_a_gassign(gimple stmt) +{ + return as_a(stmt); +} + +static inline const gassign *as_a_const_gassign(const_gimple stmt) +{ + return as_a(stmt); +} + +static inline gcall *as_a_gcall(gimple stmt) +{ + return as_a(stmt); +} + +static inline const gcall *as_a_const_gcall(const_gimple stmt) +{ + return as_a(stmt); +} + +static inline gphi *as_a_gphi(gimple stmt) +{ + return as_a(stmt); +} + +static inline const gphi *as_a_const_gphi(const_gimple stmt) +{ + return as_a(stmt); +} + +static inline greturn *as_a_greturn(gimple stmt) +{ + return as_a(stmt); +} + +static inline const greturn *as_a_const_greturn(const_gimple stmt) +{ + return as_a(stmt); +} + +/* IPA/LTO related */ +#define ipa_ref_list_referring_iterate(L, I, P) \ + (L)->referring.iterate((I), &(P)) +#define ipa_ref_list_reference_iterate(L, I, P) \ + (L)->reference.iterate((I), &(P)) + +static inline cgraph_node_ptr ipa_ref_referring_node(struct ipa_ref *ref) +{ + return dyn_cast(ref->referring); +} + +static inline void ipa_remove_stmt_references(symtab_node *referring_node, gimple stmt) +{ + referring_node->remove_stmt_references(stmt); +} +#endif + +#if BUILDING_GCC_VERSION < 6000 +#define get_inner_reference(exp, pbitsize, pbitpos, poffset, pmode, punsignedp, preversep, pvolatilep, keep_aligning) \ + get_inner_reference(exp, pbitsize, pbitpos, poffset, pmode, punsignedp, pvolatilep, keep_aligning) +#define gen_rtx_set(ARG0, ARG1) gen_rtx_SET(VOIDmode, (ARG0), (ARG1)) +#endif + +#if BUILDING_GCC_VERSION >= 6000 +#define gen_rtx_set(ARG0, ARG1) gen_rtx_SET((ARG0), (ARG1)) +#endif + +#ifdef __cplusplus +static inline void debug_tree(const_tree t) +{ + debug_tree(CONST_CAST_TREE(t)); +} + +static inline void debug_gimple_stmt(const_gimple s) +{ + debug_gimple_stmt(CONST_CAST_GIMPLE(s)); +} +#else +#define debug_tree(t) debug_tree(CONST_CAST_TREE(t)) +#define debug_gimple_stmt(s) debug_gimple_stmt(CONST_CAST_GIMPLE(s)) +#endif + +#endif diff --git a/scripts/gcc-plugins/gcc-generate-gimple-pass.h b/scripts/gcc-plugins/gcc-generate-gimple-pass.h new file mode 100644 index 000000000000..526c3c79b68e --- /dev/null +++ b/scripts/gcc-plugins/gcc-generate-gimple-pass.h @@ -0,0 +1,175 @@ +/* + * Generator for GIMPLE pass related boilerplate code/data + * + * Supports gcc 4.5-6 + * + * Usage: + * + * 1. before inclusion define PASS_NAME + * 2. before inclusion define NO_* for unimplemented callbacks + * NO_GATE + * NO_EXECUTE + * 3. before inclusion define PROPERTIES_* and TODO_FLAGS_* to override + * the default 0 values + * 4. for convenience, all the above will be undefined after inclusion! + * 5. the only exported name is make_PASS_NAME_pass() to register with gcc + */ + +#ifndef PASS_NAME +#error at least PASS_NAME must be defined +#else +#define __GCC_PLUGIN_STRINGIFY(n) #n +#define _GCC_PLUGIN_STRINGIFY(n) __GCC_PLUGIN_STRINGIFY(n) +#define _GCC_PLUGIN_CONCAT2(x, y) x ## y +#define _GCC_PLUGIN_CONCAT3(x, y, z) x ## y ## z + +#define __PASS_NAME_PASS_DATA(n) _GCC_PLUGIN_CONCAT2(n, _pass_data) +#define _PASS_NAME_PASS_DATA __PASS_NAME_PASS_DATA(PASS_NAME) + +#define __PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT2(n, _pass) +#define _PASS_NAME_PASS __PASS_NAME_PASS(PASS_NAME) + +#define _PASS_NAME_NAME _GCC_PLUGIN_STRINGIFY(PASS_NAME) + +#define __MAKE_PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT3(make_, n, _pass) +#define _MAKE_PASS_NAME_PASS __MAKE_PASS_NAME_PASS(PASS_NAME) + +#ifdef NO_GATE +#define _GATE NULL +#define _HAS_GATE false +#else +#define __GATE(n) _GCC_PLUGIN_CONCAT2(n, _gate) +#define _GATE __GATE(PASS_NAME) +#define _HAS_GATE true +#endif + +#ifdef NO_EXECUTE +#define _EXECUTE NULL +#define _HAS_EXECUTE false +#else +#define __EXECUTE(n) _GCC_PLUGIN_CONCAT2(n, _execute) +#define _EXECUTE __EXECUTE(PASS_NAME) +#define _HAS_EXECUTE true +#endif + +#ifndef PROPERTIES_REQUIRED +#define PROPERTIES_REQUIRED 0 +#endif + +#ifndef PROPERTIES_PROVIDED +#define PROPERTIES_PROVIDED 0 +#endif + +#ifndef PROPERTIES_DESTROYED +#define PROPERTIES_DESTROYED 0 +#endif + +#ifndef TODO_FLAGS_START +#define TODO_FLAGS_START 0 +#endif + +#ifndef TODO_FLAGS_FINISH +#define TODO_FLAGS_FINISH 0 +#endif + +#if BUILDING_GCC_VERSION >= 4009 +namespace { +static const pass_data _PASS_NAME_PASS_DATA = { +#else +static struct gimple_opt_pass _PASS_NAME_PASS = { + .pass = { +#endif + .type = GIMPLE_PASS, + .name = _PASS_NAME_NAME, +#if BUILDING_GCC_VERSION >= 4008 + .optinfo_flags = OPTGROUP_NONE, +#endif +#if BUILDING_GCC_VERSION >= 5000 +#elif BUILDING_GCC_VERSION == 4009 + .has_gate = _HAS_GATE, + .has_execute = _HAS_EXECUTE, +#else + .gate = _GATE, + .execute = _EXECUTE, + .sub = NULL, + .next = NULL, + .static_pass_number = 0, +#endif + .tv_id = TV_NONE, + .properties_required = PROPERTIES_REQUIRED, + .properties_provided = PROPERTIES_PROVIDED, + .properties_destroyed = PROPERTIES_DESTROYED, + .todo_flags_start = TODO_FLAGS_START, + .todo_flags_finish = TODO_FLAGS_FINISH, +#if BUILDING_GCC_VERSION < 4009 + } +#endif +}; + +#if BUILDING_GCC_VERSION >= 4009 +class _PASS_NAME_PASS : public gimple_opt_pass { +public: + _PASS_NAME_PASS() : gimple_opt_pass(_PASS_NAME_PASS_DATA, g) {} + +#ifndef NO_GATE +#if BUILDING_GCC_VERSION >= 5000 + virtual bool gate(function *) { return _GATE(); } +#else + virtual bool gate(void) { return _GATE(); } +#endif +#endif + + virtual opt_pass * clone () { return new _PASS_NAME_PASS(); } + +#ifndef NO_EXECUTE +#if BUILDING_GCC_VERSION >= 5000 + virtual unsigned int execute(function *) { return _EXECUTE(); } +#else + virtual unsigned int execute(void) { return _EXECUTE(); } +#endif +#endif +}; +} + +opt_pass *_MAKE_PASS_NAME_PASS(void) +{ + return new _PASS_NAME_PASS(); +} +#else +struct opt_pass *_MAKE_PASS_NAME_PASS(void) +{ + return &_PASS_NAME_PASS.pass; +} +#endif + +/* clean up user provided defines */ +#undef PASS_NAME +#undef NO_GATE +#undef NO_EXECUTE + +#undef PROPERTIES_DESTROYED +#undef PROPERTIES_PROVIDED +#undef PROPERTIES_REQUIRED +#undef TODO_FLAGS_FINISH +#undef TODO_FLAGS_START + +/* clean up generated defines */ +#undef _EXECUTE +#undef __EXECUTE +#undef _GATE +#undef __GATE +#undef _GCC_PLUGIN_CONCAT2 +#undef _GCC_PLUGIN_CONCAT3 +#undef _GCC_PLUGIN_STRINGIFY +#undef __GCC_PLUGIN_STRINGIFY +#undef _HAS_EXECUTE +#undef _HAS_GATE +#undef _MAKE_PASS_NAME_PASS +#undef __MAKE_PASS_NAME_PASS +#undef _PASS_NAME_NAME +#undef _PASS_NAME_PASS +#undef __PASS_NAME_PASS +#undef _PASS_NAME_PASS_DATA +#undef __PASS_NAME_PASS_DATA + +#endif /* PASS_NAME */ diff --git a/scripts/gcc-plugins/gcc-generate-ipa-pass.h b/scripts/gcc-plugins/gcc-generate-ipa-pass.h new file mode 100644 index 000000000000..9bd926e072f0 --- /dev/null +++ b/scripts/gcc-plugins/gcc-generate-ipa-pass.h @@ -0,0 +1,289 @@ +/* + * Generator for IPA pass related boilerplate code/data + * + * Supports gcc 4.5-6 + * + * Usage: + * + * 1. before inclusion define PASS_NAME + * 2. before inclusion define NO_* for unimplemented callbacks + * NO_GENERATE_SUMMARY + * NO_READ_SUMMARY + * NO_WRITE_SUMMARY + * NO_READ_OPTIMIZATION_SUMMARY + * NO_WRITE_OPTIMIZATION_SUMMARY + * NO_STMT_FIXUP + * NO_FUNCTION_TRANSFORM + * NO_VARIABLE_TRANSFORM + * NO_GATE + * NO_EXECUTE + * 3. before inclusion define PROPERTIES_* and *TODO_FLAGS_* to override + * the default 0 values + * 4. for convenience, all the above will be undefined after inclusion! + * 5. the only exported name is make_PASS_NAME_pass() to register with gcc + */ + +#ifndef PASS_NAME +#error at least PASS_NAME must be defined +#else +#define __GCC_PLUGIN_STRINGIFY(n) #n +#define _GCC_PLUGIN_STRINGIFY(n) __GCC_PLUGIN_STRINGIFY(n) +#define _GCC_PLUGIN_CONCAT2(x, y) x ## y +#define _GCC_PLUGIN_CONCAT3(x, y, z) x ## y ## z + +#define __PASS_NAME_PASS_DATA(n) _GCC_PLUGIN_CONCAT2(n, _pass_data) +#define _PASS_NAME_PASS_DATA __PASS_NAME_PASS_DATA(PASS_NAME) + +#define __PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT2(n, _pass) +#define _PASS_NAME_PASS __PASS_NAME_PASS(PASS_NAME) + +#define _PASS_NAME_NAME _GCC_PLUGIN_STRINGIFY(PASS_NAME) + +#define __MAKE_PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT3(make_, n, _pass) +#define _MAKE_PASS_NAME_PASS __MAKE_PASS_NAME_PASS(PASS_NAME) + +#ifdef NO_GENERATE_SUMMARY +#define _GENERATE_SUMMARY NULL +#else +#define __GENERATE_SUMMARY(n) _GCC_PLUGIN_CONCAT2(n, _generate_summary) +#define _GENERATE_SUMMARY __GENERATE_SUMMARY(PASS_NAME) +#endif + +#ifdef NO_READ_SUMMARY +#define _READ_SUMMARY NULL +#else +#define __READ_SUMMARY(n) _GCC_PLUGIN_CONCAT2(n, _read_summary) +#define _READ_SUMMARY __READ_SUMMARY(PASS_NAME) +#endif + +#ifdef NO_WRITE_SUMMARY +#define _WRITE_SUMMARY NULL +#else +#define __WRITE_SUMMARY(n) _GCC_PLUGIN_CONCAT2(n, _write_summary) +#define _WRITE_SUMMARY __WRITE_SUMMARY(PASS_NAME) +#endif + +#ifdef NO_READ_OPTIMIZATION_SUMMARY +#define _READ_OPTIMIZATION_SUMMARY NULL +#else +#define __READ_OPTIMIZATION_SUMMARY(n) _GCC_PLUGIN_CONCAT2(n, _read_optimization_summary) +#define _READ_OPTIMIZATION_SUMMARY __READ_OPTIMIZATION_SUMMARY(PASS_NAME) +#endif + +#ifdef NO_WRITE_OPTIMIZATION_SUMMARY +#define _WRITE_OPTIMIZATION_SUMMARY NULL +#else +#define __WRITE_OPTIMIZATION_SUMMARY(n) _GCC_PLUGIN_CONCAT2(n, _write_optimization_summary) +#define _WRITE_OPTIMIZATION_SUMMARY __WRITE_OPTIMIZATION_SUMMARY(PASS_NAME) +#endif + +#ifdef NO_STMT_FIXUP +#define _STMT_FIXUP NULL +#else +#define __STMT_FIXUP(n) _GCC_PLUGIN_CONCAT2(n, _stmt_fixup) +#define _STMT_FIXUP __STMT_FIXUP(PASS_NAME) +#endif + +#ifdef NO_FUNCTION_TRANSFORM +#define _FUNCTION_TRANSFORM NULL +#else +#define __FUNCTION_TRANSFORM(n) _GCC_PLUGIN_CONCAT2(n, _function_transform) +#define _FUNCTION_TRANSFORM __FUNCTION_TRANSFORM(PASS_NAME) +#endif + +#ifdef NO_VARIABLE_TRANSFORM +#define _VARIABLE_TRANSFORM NULL +#else +#define __VARIABLE_TRANSFORM(n) _GCC_PLUGIN_CONCAT2(n, _variable_transform) +#define _VARIABLE_TRANSFORM __VARIABLE_TRANSFORM(PASS_NAME) +#endif + +#ifdef NO_GATE +#define _GATE NULL +#define _HAS_GATE false +#else +#define __GATE(n) _GCC_PLUGIN_CONCAT2(n, _gate) +#define _GATE __GATE(PASS_NAME) +#define _HAS_GATE true +#endif + +#ifdef NO_EXECUTE +#define _EXECUTE NULL +#define _HAS_EXECUTE false +#else +#define __EXECUTE(n) _GCC_PLUGIN_CONCAT2(n, _execute) +#define _EXECUTE __EXECUTE(PASS_NAME) +#define _HAS_EXECUTE true +#endif + +#ifndef PROPERTIES_REQUIRED +#define PROPERTIES_REQUIRED 0 +#endif + +#ifndef PROPERTIES_PROVIDED +#define PROPERTIES_PROVIDED 0 +#endif + +#ifndef PROPERTIES_DESTROYED +#define PROPERTIES_DESTROYED 0 +#endif + +#ifndef TODO_FLAGS_START +#define TODO_FLAGS_START 0 +#endif + +#ifndef TODO_FLAGS_FINISH +#define TODO_FLAGS_FINISH 0 +#endif + +#ifndef FUNCTION_TRANSFORM_TODO_FLAGS_START +#define FUNCTION_TRANSFORM_TODO_FLAGS_START 0 +#endif + +#if BUILDING_GCC_VERSION >= 4009 +namespace { +static const pass_data _PASS_NAME_PASS_DATA = { +#else +static struct ipa_opt_pass_d _PASS_NAME_PASS = { + .pass = { +#endif + .type = IPA_PASS, + .name = _PASS_NAME_NAME, +#if BUILDING_GCC_VERSION >= 4008 + .optinfo_flags = OPTGROUP_NONE, +#endif +#if BUILDING_GCC_VERSION >= 5000 +#elif BUILDING_GCC_VERSION == 4009 + .has_gate = _HAS_GATE, + .has_execute = _HAS_EXECUTE, +#else + .gate = _GATE, + .execute = _EXECUTE, + .sub = NULL, + .next = NULL, + .static_pass_number = 0, +#endif + .tv_id = TV_NONE, + .properties_required = PROPERTIES_REQUIRED, + .properties_provided = PROPERTIES_PROVIDED, + .properties_destroyed = PROPERTIES_DESTROYED, + .todo_flags_start = TODO_FLAGS_START, + .todo_flags_finish = TODO_FLAGS_FINISH, +#if BUILDING_GCC_VERSION < 4009 + }, + .generate_summary = _GENERATE_SUMMARY, + .write_summary = _WRITE_SUMMARY, + .read_summary = _READ_SUMMARY, +#if BUILDING_GCC_VERSION >= 4006 + .write_optimization_summary = _WRITE_OPTIMIZATION_SUMMARY, + .read_optimization_summary = _READ_OPTIMIZATION_SUMMARY, +#endif + .stmt_fixup = _STMT_FIXUP, + .function_transform_todo_flags_start = FUNCTION_TRANSFORM_TODO_FLAGS_START, + .function_transform = _FUNCTION_TRANSFORM, + .variable_transform = _VARIABLE_TRANSFORM, +#endif +}; + +#if BUILDING_GCC_VERSION >= 4009 +class _PASS_NAME_PASS : public ipa_opt_pass_d { +public: + _PASS_NAME_PASS() : ipa_opt_pass_d(_PASS_NAME_PASS_DATA, + g, + _GENERATE_SUMMARY, + _WRITE_SUMMARY, + _READ_SUMMARY, + _WRITE_OPTIMIZATION_SUMMARY, + _READ_OPTIMIZATION_SUMMARY, + _STMT_FIXUP, + FUNCTION_TRANSFORM_TODO_FLAGS_START, + _FUNCTION_TRANSFORM, + _VARIABLE_TRANSFORM) {} + +#ifndef NO_GATE +#if BUILDING_GCC_VERSION >= 5000 + virtual bool gate(function *) { return _GATE(); } +#else + virtual bool gate(void) { return _GATE(); } +#endif +#endif + + virtual opt_pass *clone() { return new _PASS_NAME_PASS(); } + +#ifndef NO_EXECUTE +#if BUILDING_GCC_VERSION >= 5000 + virtual unsigned int execute(function *) { return _EXECUTE(); } +#else + virtual unsigned int execute(void) { return _EXECUTE(); } +#endif +#endif +}; +} + +opt_pass *_MAKE_PASS_NAME_PASS(void) +{ + return new _PASS_NAME_PASS(); +} +#else +struct opt_pass *_MAKE_PASS_NAME_PASS(void) +{ + return &_PASS_NAME_PASS.pass; +} +#endif + +/* clean up user provided defines */ +#undef PASS_NAME +#undef NO_GENERATE_SUMMARY +#undef NO_WRITE_SUMMARY +#undef NO_READ_SUMMARY +#undef NO_WRITE_OPTIMIZATION_SUMMARY +#undef NO_READ_OPTIMIZATION_SUMMARY +#undef NO_STMT_FIXUP +#undef NO_FUNCTION_TRANSFORM +#undef NO_VARIABLE_TRANSFORM +#undef NO_GATE +#undef NO_EXECUTE + +#undef FUNCTION_TRANSFORM_TODO_FLAGS_START +#undef PROPERTIES_DESTROYED +#undef PROPERTIES_PROVIDED +#undef PROPERTIES_REQUIRED +#undef TODO_FLAGS_FINISH +#undef TODO_FLAGS_START + +/* clean up generated defines */ +#undef _EXECUTE +#undef __EXECUTE +#undef _FUNCTION_TRANSFORM +#undef __FUNCTION_TRANSFORM +#undef _GATE +#undef __GATE +#undef _GCC_PLUGIN_CONCAT2 +#undef _GCC_PLUGIN_CONCAT3 +#undef _GCC_PLUGIN_STRINGIFY +#undef __GCC_PLUGIN_STRINGIFY +#undef _GENERATE_SUMMARY +#undef __GENERATE_SUMMARY +#undef _HAS_EXECUTE +#undef _HAS_GATE +#undef _MAKE_PASS_NAME_PASS +#undef __MAKE_PASS_NAME_PASS +#undef _PASS_NAME_NAME +#undef _PASS_NAME_PASS +#undef __PASS_NAME_PASS +#undef _PASS_NAME_PASS_DATA +#undef __PASS_NAME_PASS_DATA +#undef _READ_OPTIMIZATION_SUMMARY +#undef __READ_OPTIMIZATION_SUMMARY +#undef _READ_SUMMARY +#undef __READ_SUMMARY +#undef _STMT_FIXUP +#undef __STMT_FIXUP +#undef _VARIABLE_TRANSFORM +#undef __VARIABLE_TRANSFORM +#undef _WRITE_OPTIMIZATION_SUMMARY +#undef __WRITE_OPTIMIZATION_SUMMARY +#undef _WRITE_SUMMARY +#undef __WRITE_SUMMARY + +#endif /* PASS_NAME */ diff --git a/scripts/gcc-plugins/gcc-generate-rtl-pass.h b/scripts/gcc-plugins/gcc-generate-rtl-pass.h new file mode 100644 index 000000000000..1dc67a5aeadf --- /dev/null +++ b/scripts/gcc-plugins/gcc-generate-rtl-pass.h @@ -0,0 +1,175 @@ +/* + * Generator for RTL pass related boilerplate code/data + * + * Supports gcc 4.5-6 + * + * Usage: + * + * 1. before inclusion define PASS_NAME + * 2. before inclusion define NO_* for unimplemented callbacks + * NO_GATE + * NO_EXECUTE + * 3. before inclusion define PROPERTIES_* and TODO_FLAGS_* to override + * the default 0 values + * 4. for convenience, all the above will be undefined after inclusion! + * 5. the only exported name is make_PASS_NAME_pass() to register with gcc + */ + +#ifndef PASS_NAME +#error at least PASS_NAME must be defined +#else +#define __GCC_PLUGIN_STRINGIFY(n) #n +#define _GCC_PLUGIN_STRINGIFY(n) __GCC_PLUGIN_STRINGIFY(n) +#define _GCC_PLUGIN_CONCAT2(x, y) x ## y +#define _GCC_PLUGIN_CONCAT3(x, y, z) x ## y ## z + +#define __PASS_NAME_PASS_DATA(n) _GCC_PLUGIN_CONCAT2(n, _pass_data) +#define _PASS_NAME_PASS_DATA __PASS_NAME_PASS_DATA(PASS_NAME) + +#define __PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT2(n, _pass) +#define _PASS_NAME_PASS __PASS_NAME_PASS(PASS_NAME) + +#define _PASS_NAME_NAME _GCC_PLUGIN_STRINGIFY(PASS_NAME) + +#define __MAKE_PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT3(make_, n, _pass) +#define _MAKE_PASS_NAME_PASS __MAKE_PASS_NAME_PASS(PASS_NAME) + +#ifdef NO_GATE +#define _GATE NULL +#define _HAS_GATE false +#else +#define __GATE(n) _GCC_PLUGIN_CONCAT2(n, _gate) +#define _GATE __GATE(PASS_NAME) +#define _HAS_GATE true +#endif + +#ifdef NO_EXECUTE +#define _EXECUTE NULL +#define _HAS_EXECUTE false +#else +#define __EXECUTE(n) _GCC_PLUGIN_CONCAT2(n, _execute) +#define _EXECUTE __EXECUTE(PASS_NAME) +#define _HAS_EXECUTE true +#endif + +#ifndef PROPERTIES_REQUIRED +#define PROPERTIES_REQUIRED 0 +#endif + +#ifndef PROPERTIES_PROVIDED +#define PROPERTIES_PROVIDED 0 +#endif + +#ifndef PROPERTIES_DESTROYED +#define PROPERTIES_DESTROYED 0 +#endif + +#ifndef TODO_FLAGS_START +#define TODO_FLAGS_START 0 +#endif + +#ifndef TODO_FLAGS_FINISH +#define TODO_FLAGS_FINISH 0 +#endif + +#if BUILDING_GCC_VERSION >= 4009 +namespace { +static const pass_data _PASS_NAME_PASS_DATA = { +#else +static struct rtl_opt_pass _PASS_NAME_PASS = { + .pass = { +#endif + .type = RTL_PASS, + .name = _PASS_NAME_NAME, +#if BUILDING_GCC_VERSION >= 4008 + .optinfo_flags = OPTGROUP_NONE, +#endif +#if BUILDING_GCC_VERSION >= 5000 +#elif BUILDING_GCC_VERSION == 4009 + .has_gate = _HAS_GATE, + .has_execute = _HAS_EXECUTE, +#else + .gate = _GATE, + .execute = _EXECUTE, + .sub = NULL, + .next = NULL, + .static_pass_number = 0, +#endif + .tv_id = TV_NONE, + .properties_required = PROPERTIES_REQUIRED, + .properties_provided = PROPERTIES_PROVIDED, + .properties_destroyed = PROPERTIES_DESTROYED, + .todo_flags_start = TODO_FLAGS_START, + .todo_flags_finish = TODO_FLAGS_FINISH, +#if BUILDING_GCC_VERSION < 4009 + } +#endif +}; + +#if BUILDING_GCC_VERSION >= 4009 +class _PASS_NAME_PASS : public rtl_opt_pass { +public: + _PASS_NAME_PASS() : rtl_opt_pass(_PASS_NAME_PASS_DATA, g) {} + +#ifndef NO_GATE +#if BUILDING_GCC_VERSION >= 5000 + virtual bool gate(function *) { return _GATE(); } +#else + virtual bool gate(void) { return _GATE(); } +#endif +#endif + + virtual opt_pass *clone() { return new _PASS_NAME_PASS(); } + +#ifndef NO_EXECUTE +#if BUILDING_GCC_VERSION >= 5000 + virtual unsigned int execute(function *) { return _EXECUTE(); } +#else + virtual unsigned int execute(void) { return _EXECUTE(); } +#endif +#endif +}; +} + +opt_pass *_MAKE_PASS_NAME_PASS(void) +{ + return new _PASS_NAME_PASS(); +} +#else +struct opt_pass *_MAKE_PASS_NAME_PASS(void) +{ + return &_PASS_NAME_PASS.pass; +} +#endif + +/* clean up user provided defines */ +#undef PASS_NAME +#undef NO_GATE +#undef NO_EXECUTE + +#undef PROPERTIES_DESTROYED +#undef PROPERTIES_PROVIDED +#undef PROPERTIES_REQUIRED +#undef TODO_FLAGS_FINISH +#undef TODO_FLAGS_START + +/* clean up generated defines */ +#undef _EXECUTE +#undef __EXECUTE +#undef _GATE +#undef __GATE +#undef _GCC_PLUGIN_CONCAT2 +#undef _GCC_PLUGIN_CONCAT3 +#undef _GCC_PLUGIN_STRINGIFY +#undef __GCC_PLUGIN_STRINGIFY +#undef _HAS_EXECUTE +#undef _HAS_GATE +#undef _MAKE_PASS_NAME_PASS +#undef __MAKE_PASS_NAME_PASS +#undef _PASS_NAME_NAME +#undef _PASS_NAME_PASS +#undef __PASS_NAME_PASS +#undef _PASS_NAME_PASS_DATA +#undef __PASS_NAME_PASS_DATA + +#endif /* PASS_NAME */ diff --git a/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h b/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h new file mode 100644 index 000000000000..a27e2b36afaa --- /dev/null +++ b/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h @@ -0,0 +1,175 @@ +/* + * Generator for SIMPLE_IPA pass related boilerplate code/data + * + * Supports gcc 4.5-6 + * + * Usage: + * + * 1. before inclusion define PASS_NAME + * 2. before inclusion define NO_* for unimplemented callbacks + * NO_GATE + * NO_EXECUTE + * 3. before inclusion define PROPERTIES_* and TODO_FLAGS_* to override + * the default 0 values + * 4. for convenience, all the above will be undefined after inclusion! + * 5. the only exported name is make_PASS_NAME_pass() to register with gcc + */ + +#ifndef PASS_NAME +#error at least PASS_NAME must be defined +#else +#define __GCC_PLUGIN_STRINGIFY(n) #n +#define _GCC_PLUGIN_STRINGIFY(n) __GCC_PLUGIN_STRINGIFY(n) +#define _GCC_PLUGIN_CONCAT2(x, y) x ## y +#define _GCC_PLUGIN_CONCAT3(x, y, z) x ## y ## z + +#define __PASS_NAME_PASS_DATA(n) _GCC_PLUGIN_CONCAT2(n, _pass_data) +#define _PASS_NAME_PASS_DATA __PASS_NAME_PASS_DATA(PASS_NAME) + +#define __PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT2(n, _pass) +#define _PASS_NAME_PASS __PASS_NAME_PASS(PASS_NAME) + +#define _PASS_NAME_NAME _GCC_PLUGIN_STRINGIFY(PASS_NAME) + +#define __MAKE_PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT3(make_, n, _pass) +#define _MAKE_PASS_NAME_PASS __MAKE_PASS_NAME_PASS(PASS_NAME) + +#ifdef NO_GATE +#define _GATE NULL +#define _HAS_GATE false +#else +#define __GATE(n) _GCC_PLUGIN_CONCAT2(n, _gate) +#define _GATE __GATE(PASS_NAME) +#define _HAS_GATE true +#endif + +#ifdef NO_EXECUTE +#define _EXECUTE NULL +#define _HAS_EXECUTE false +#else +#define __EXECUTE(n) _GCC_PLUGIN_CONCAT2(n, _execute) +#define _EXECUTE __EXECUTE(PASS_NAME) +#define _HAS_EXECUTE true +#endif + +#ifndef PROPERTIES_REQUIRED +#define PROPERTIES_REQUIRED 0 +#endif + +#ifndef PROPERTIES_PROVIDED +#define PROPERTIES_PROVIDED 0 +#endif + +#ifndef PROPERTIES_DESTROYED +#define PROPERTIES_DESTROYED 0 +#endif + +#ifndef TODO_FLAGS_START +#define TODO_FLAGS_START 0 +#endif + +#ifndef TODO_FLAGS_FINISH +#define TODO_FLAGS_FINISH 0 +#endif + +#if BUILDING_GCC_VERSION >= 4009 +namespace { +static const pass_data _PASS_NAME_PASS_DATA = { +#else +static struct simple_ipa_opt_pass _PASS_NAME_PASS = { + .pass = { +#endif + .type = SIMPLE_IPA_PASS, + .name = _PASS_NAME_NAME, +#if BUILDING_GCC_VERSION >= 4008 + .optinfo_flags = OPTGROUP_NONE, +#endif +#if BUILDING_GCC_VERSION >= 5000 +#elif BUILDING_GCC_VERSION == 4009 + .has_gate = _HAS_GATE, + .has_execute = _HAS_EXECUTE, +#else + .gate = _GATE, + .execute = _EXECUTE, + .sub = NULL, + .next = NULL, + .static_pass_number = 0, +#endif + .tv_id = TV_NONE, + .properties_required = PROPERTIES_REQUIRED, + .properties_provided = PROPERTIES_PROVIDED, + .properties_destroyed = PROPERTIES_DESTROYED, + .todo_flags_start = TODO_FLAGS_START, + .todo_flags_finish = TODO_FLAGS_FINISH, +#if BUILDING_GCC_VERSION < 4009 + } +#endif +}; + +#if BUILDING_GCC_VERSION >= 4009 +class _PASS_NAME_PASS : public simple_ipa_opt_pass { +public: + _PASS_NAME_PASS() : simple_ipa_opt_pass(_PASS_NAME_PASS_DATA, g) {} + +#ifndef NO_GATE +#if BUILDING_GCC_VERSION >= 5000 + virtual bool gate(function *) { return _GATE(); } +#else + virtual bool gate(void) { return _GATE(); } +#endif +#endif + + virtual opt_pass *clone() { return new _PASS_NAME_PASS(); } + +#ifndef NO_EXECUTE +#if BUILDING_GCC_VERSION >= 5000 + virtual unsigned int execute(function *) { return _EXECUTE(); } +#else + virtual unsigned int execute(void) { return _EXECUTE(); } +#endif +#endif +}; +} + +opt_pass *_MAKE_PASS_NAME_PASS(void) +{ + return new _PASS_NAME_PASS(); +} +#else +struct opt_pass *_MAKE_PASS_NAME_PASS(void) +{ + return &_PASS_NAME_PASS.pass; +} +#endif + +/* clean up user provided defines */ +#undef PASS_NAME +#undef NO_GATE +#undef NO_EXECUTE + +#undef PROPERTIES_DESTROYED +#undef PROPERTIES_PROVIDED +#undef PROPERTIES_REQUIRED +#undef TODO_FLAGS_FINISH +#undef TODO_FLAGS_START + +/* clean up generated defines */ +#undef _EXECUTE +#undef __EXECUTE +#undef _GATE +#undef __GATE +#undef _GCC_PLUGIN_CONCAT2 +#undef _GCC_PLUGIN_CONCAT3 +#undef _GCC_PLUGIN_STRINGIFY +#undef __GCC_PLUGIN_STRINGIFY +#undef _HAS_EXECUTE +#undef _HAS_GATE +#undef _MAKE_PASS_NAME_PASS +#undef __MAKE_PASS_NAME_PASS +#undef _PASS_NAME_NAME +#undef _PASS_NAME_PASS +#undef __PASS_NAME_PASS +#undef _PASS_NAME_PASS_DATA +#undef __PASS_NAME_PASS_DATA + +#endif /* PASS_NAME */ diff --git a/scripts/gcc-plugins/sancov_plugin.c b/scripts/gcc-plugins/sancov_plugin.c new file mode 100644 index 000000000000..aedd6113cb73 --- /dev/null +++ b/scripts/gcc-plugins/sancov_plugin.c @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2016 by Emese Revfy + * Licensed under the GPL v2, or (at your option) v3 + * + * Homepage: + * https://github.com/ephox-gcc-plugins/sancov + * + * This plugin inserts a __sanitizer_cov_trace_pc() call at the start of basic blocks. + * It supports all gcc versions with plugin support (from gcc-4.5 on). + * It is based on the commit "Add fuzzing coverage support" by Dmitry Vyukov . + * + * You can read about it more here: + * https://gcc.gnu.org/viewcvs/gcc?limit_changes=0&view=revision&revision=231296 + * http://lwn.net/Articles/674854/ + * https://github.com/google/syzkaller + * https://lwn.net/Articles/677764/ + * + * Usage: + * make run + */ + +#include "gcc-common.h" + +int plugin_is_GPL_compatible; + +tree sancov_fndecl; + +static struct plugin_info sancov_plugin_info = { + .version = "20160402", + .help = "sancov plugin\n", +}; + +static unsigned int sancov_execute(void) +{ + basic_block bb; + + /* Remove this line when this plugin and kcov will be in the kernel. + if (!strcmp(DECL_NAME_POINTER(current_function_decl), DECL_NAME_POINTER(sancov_fndecl))) + return 0; + */ + + FOR_EACH_BB_FN(bb, cfun) { + const_gimple stmt; + gcall *gcall; + gimple_stmt_iterator gsi = gsi_after_labels(bb); + + if (gsi_end_p(gsi)) + continue; + + stmt = gsi_stmt(gsi); + gcall = as_a_gcall(gimple_build_call(sancov_fndecl, 0)); + gimple_set_location(gcall, gimple_location(stmt)); + gsi_insert_before(&gsi, gcall, GSI_SAME_STMT); + } + return 0; +} + +#define PASS_NAME sancov + +#define NO_GATE +#define TODO_FLAGS_FINISH TODO_dump_func | TODO_verify_stmts | TODO_update_ssa_no_phi | TODO_verify_flow + +#include "gcc-generate-gimple-pass.h" + +static void sancov_start_unit(void __unused *gcc_data, void __unused *user_data) +{ + tree leaf_attr, nothrow_attr; + tree BT_FN_VOID = build_function_type_list(void_type_node, NULL_TREE); + + sancov_fndecl = build_fn_decl("__sanitizer_cov_trace_pc", BT_FN_VOID); + + DECL_ASSEMBLER_NAME(sancov_fndecl); + TREE_PUBLIC(sancov_fndecl) = 1; + DECL_EXTERNAL(sancov_fndecl) = 1; + DECL_ARTIFICIAL(sancov_fndecl) = 1; + DECL_PRESERVE_P(sancov_fndecl) = 1; + DECL_UNINLINABLE(sancov_fndecl) = 1; + TREE_USED(sancov_fndecl) = 1; + + nothrow_attr = tree_cons(get_identifier("nothrow"), NULL, NULL); + decl_attributes(&sancov_fndecl, nothrow_attr, 0); + gcc_assert(TREE_NOTHROW(sancov_fndecl)); +#if BUILDING_GCC_VERSION > 4005 + leaf_attr = tree_cons(get_identifier("leaf"), NULL, NULL); + decl_attributes(&sancov_fndecl, leaf_attr, 0); +#endif +} + +int plugin_init(struct plugin_name_args *plugin_info, struct plugin_gcc_version *version) +{ + int i; + struct register_pass_info sancov_plugin_pass_info; + const char * const plugin_name = plugin_info->base_name; + const int argc = plugin_info->argc; + const struct plugin_argument * const argv = plugin_info->argv; + bool enable = true; + + static const struct ggc_root_tab gt_ggc_r_gt_sancov[] = { + { + .base = &sancov_fndecl, + .nelt = 1, + .stride = sizeof(sancov_fndecl), + .cb = >_ggc_mx_tree_node, + .pchw = >_pch_nx_tree_node + }, + LAST_GGC_ROOT_TAB + }; + + /* BBs can be split afterwards?? */ + sancov_plugin_pass_info.pass = make_sancov_pass(); +#if BUILDING_GCC_VERSION >= 4009 + sancov_plugin_pass_info.reference_pass_name = "asan"; +#else + sancov_plugin_pass_info.reference_pass_name = "nrv"; +#endif + sancov_plugin_pass_info.ref_pass_instance_number = 0; + sancov_plugin_pass_info.pos_op = PASS_POS_INSERT_BEFORE; + + if (!plugin_default_version_check(version, &gcc_version)) { + error(G_("incompatible gcc/plugin versions")); + return 1; + } + + for (i = 0; i < argc; ++i) { + if (!strcmp(argv[i].key, "no-sancov")) { + enable = false; + continue; + } + error(G_("unkown option '-fplugin-arg-%s-%s'"), plugin_name, argv[i].key); + } + + register_callback(plugin_name, PLUGIN_INFO, NULL, &sancov_plugin_info); + + if (!enable) + return 0; + +#if BUILDING_GCC_VERSION < 6000 + register_callback(plugin_name, PLUGIN_START_UNIT, &sancov_start_unit, NULL); + register_callback(plugin_name, PLUGIN_REGISTER_GGC_ROOTS, NULL, (void *)>_ggc_r_gt_sancov); + register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, &sancov_plugin_pass_info); +#endif + + return 0; +} diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index f0f6d9d75435..4f727eb5ec43 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -180,7 +180,7 @@ else fi; # final build of init/ -${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init +${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init GCC_PLUGINS_CFLAGS="${GCC_PLUGINS_CFLAGS}" kallsymso="" kallsyms_vmlinux="" diff --git a/scripts/package/builddeb b/scripts/package/builddeb index 86e56fef7473..e1c09e2f9be7 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -26,6 +26,8 @@ create_package() { # Fix ownership and permissions chown -R root:root "$pdir" chmod -R go-w "$pdir" + # in case we are in a restrictive umask environment like 0077 + chmod -R a+rX "$pdir" # Create the package dpkg-gencontrol $forcearch -Vkernel:debarch="${debarch}" -p$pname -P"$pdir" @@ -238,7 +240,8 @@ maintainer="$name <$email>" # Try to determine distribution if [ -n "$KDEB_CHANGELOG_DIST" ]; then distribution=$KDEB_CHANGELOG_DIST -elif distribution=$(lsb_release -cs 2>/dev/null) && [ -n "$distribution" ]; then +# In some cases lsb_release returns the codename as n/a, which breaks dpkg-parsechangelog +elif distribution=$(lsb_release -cs 2>/dev/null) && [ -n "$distribution" ] && [ "$distribution" != "n/a" ]; then : # nothing to do in this case else distribution="unstable" @@ -322,13 +325,14 @@ fi # Build kernel header package (cd $srctree; find . -name Makefile\* -o -name Kconfig\* -o -name \*.pl) > "$objtree/debian/hdrsrcfiles" -if grep -q '^CONFIG_STACK_VALIDATION=y' $KCONFIG_CONFIG ; then - (cd $srctree; find tools/objtool -type f -executable) >> "$objtree/debian/hdrsrcfiles" -fi (cd $srctree; find arch/*/include include scripts -type f) >> "$objtree/debian/hdrsrcfiles" (cd $srctree; find arch/$SRCARCH -name module.lds -o -name Kbuild.platforms -o -name Platform) >> "$objtree/debian/hdrsrcfiles" (cd $srctree; find $(find arch/$SRCARCH -name include -o -name scripts -type d) -type f) >> "$objtree/debian/hdrsrcfiles" +if grep -q '^CONFIG_STACK_VALIDATION=y' $KCONFIG_CONFIG ; then + (cd $objtree; find tools/objtool -type f -executable) >> "$objtree/debian/hdrobjfiles" +fi (cd $objtree; find arch/$SRCARCH/include Module.symvers include scripts -type f) >> "$objtree/debian/hdrobjfiles" +(cd $objtree; find scripts/gcc-plugins -name \*.so -o -name gcc-common.h) >> "$objtree/debian/hdrobjfiles" destdir=$kernel_headers_dir/usr/src/linux-headers-$version mkdir -p "$destdir" (cd $srctree; tar -c -f - -T -) < "$objtree/debian/hdrsrcfiles" | (cd $destdir; tar -xf -) diff --git a/scripts/setlocalversion b/scripts/setlocalversion index 63d91e22ed7c..966dd3924ea9 100755 --- a/scripts/setlocalversion +++ b/scripts/setlocalversion @@ -143,7 +143,7 @@ fi if test -e include/config/auto.conf; then . include/config/auto.conf else - echo "Error: kernelrelease not valid - run 'make prepare' to update it" + echo "Error: kernelrelease not valid - run 'make prepare' to update it" >&2 exit 1 fi diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index e5d6108f5e85..b0cc1a34db27 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -16,9 +16,6 @@ config HAVE_KVM_EVENTFD bool select EVENTFD -config KVM_APIC_ARCHITECTURE - bool - config KVM_MMIO bool diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index 3a3a699b7489..7cffd9338c49 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -21,18 +21,11 @@ #include -#ifdef CONFIG_KVM_NEW_VGIC -extern struct vgic_global kvm_vgic_global_state; -#define vgic_v2_params kvm_vgic_global_state -#else -extern struct vgic_params vgic_v2_params; -#endif - static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr; + int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; u32 eisr0, eisr1; int i; bool expect_mi; @@ -74,7 +67,7 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr; + int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; u32 elrsr0, elrsr1; elrsr0 = readl_relaxed(base + GICH_ELRSR0); @@ -93,7 +86,7 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr; + int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; int i; for (i = 0; i < nr_lr; i++) { @@ -147,7 +140,7 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu) struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; struct vgic_dist *vgic = &kvm->arch.vgic; void __iomem *base = kern_hyp_va(vgic->vctrl_base); - int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr; + int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; int i; u64 live_lrs = 0; diff --git a/virt/kvm/arm/vgic-v2-emul.c b/virt/kvm/arm/vgic-v2-emul.c deleted file mode 100644 index 1b0bee095427..000000000000 --- a/virt/kvm/arm/vgic-v2-emul.c +++ /dev/null @@ -1,856 +0,0 @@ -/* - * Contains GICv2 specific emulation code, was in vgic.c before. - * - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include "vgic.h" - -#define GICC_ARCH_VERSION_V2 0x2 - -static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg); -static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi) -{ - return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi; -} - -static bool handle_mmio_misc(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg; - u32 word_offset = offset & 3; - - switch (offset & ~3) { - case 0: /* GICD_CTLR */ - reg = vcpu->kvm->arch.vgic.enabled; - vgic_reg_access(mmio, ®, word_offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - vcpu->kvm->arch.vgic.enabled = reg & 1; - vgic_update_state(vcpu->kvm); - return true; - } - break; - - case 4: /* GICD_TYPER */ - reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; - reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1; - vgic_reg_access(mmio, ®, word_offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - break; - - case 8: /* GICD_IIDR */ - reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); - vgic_reg_access(mmio, ®, word_offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - break; - } - - return false; -} - -static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id, ACCESS_WRITE_SETBIT); -} - -static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id, ACCESS_WRITE_CLEARBIT); -} - -static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); -} - -static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); -} - -static bool handle_mmio_set_active_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); -} - -static bool handle_mmio_clear_active_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); -} - -static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority, - vcpu->vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - return false; -} - -#define GICD_ITARGETSR_SIZE 32 -#define GICD_CPUTARGETS_BITS 8 -#define GICD_IRQS_PER_ITARGETSR (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS) -static u32 vgic_get_target_reg(struct kvm *kvm, int irq) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - int i; - u32 val = 0; - - irq -= VGIC_NR_PRIVATE_IRQS; - - for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) - val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8); - - return val; -} - -static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int i, c; - unsigned long *bmap; - u32 target; - - irq -= VGIC_NR_PRIVATE_IRQS; - - /* - * Pick the LSB in each byte. This ensures we target exactly - * one vcpu per IRQ. If the byte is null, assume we target - * CPU0. - */ - for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) { - int shift = i * GICD_CPUTARGETS_BITS; - - target = ffs((val >> shift) & 0xffU); - target = target ? (target - 1) : 0; - dist->irq_spi_cpu[irq + i] = target; - kvm_for_each_vcpu(c, vcpu, kvm) { - bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]); - if (c == target) - set_bit(irq + i, bmap); - else - clear_bit(irq + i, bmap); - } - } -} - -static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 reg; - - /* We treat the banked interrupts targets as read-only */ - if (offset < 32) { - u32 roreg; - - roreg = 1 << vcpu->vcpu_id; - roreg |= roreg << 8; - roreg |= roreg << 16; - - vgic_reg_access(mmio, &roreg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - return false; - } - - reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U); - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U); - vgic_update_state(vcpu->kvm); - return true; - } - - return false; -} - -static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 *reg; - - reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, - vcpu->vcpu_id, offset >> 1); - - return vgic_handle_cfg_reg(reg, mmio, offset); -} - -static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg; - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - vgic_dispatch_sgi(vcpu, reg); - vgic_update_state(vcpu->kvm); - return true; - } - - return false; -} - -/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */ -static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int sgi; - int min_sgi = (offset & ~0x3); - int max_sgi = min_sgi + 3; - int vcpu_id = vcpu->vcpu_id; - u32 reg = 0; - - /* Copy source SGIs from distributor side */ - for (sgi = min_sgi; sgi <= max_sgi; sgi++) { - u8 sources = *vgic_get_sgi_sources(dist, vcpu_id, sgi); - - reg |= ((u32)sources) << (8 * (sgi - min_sgi)); - } - - mmio_data_write(mmio, ~0, reg); - return false; -} - -static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, bool set) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int sgi; - int min_sgi = (offset & ~0x3); - int max_sgi = min_sgi + 3; - int vcpu_id = vcpu->vcpu_id; - u32 reg; - bool updated = false; - - reg = mmio_data_read(mmio, ~0); - - /* Clear pending SGIs on the distributor */ - for (sgi = min_sgi; sgi <= max_sgi; sgi++) { - u8 mask = reg >> (8 * (sgi - min_sgi)); - u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi); - - if (set) { - if ((*src & mask) != mask) - updated = true; - *src |= mask; - } else { - if (*src & mask) - updated = true; - *src &= ~mask; - } - } - - if (updated) - vgic_update_state(vcpu->kvm); - - return updated; -} - -static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (!mmio->is_write) - return read_set_clear_sgi_pend_reg(vcpu, mmio, offset); - else - return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true); -} - -static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (!mmio->is_write) - return read_set_clear_sgi_pend_reg(vcpu, mmio, offset); - else - return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false); -} - -static const struct vgic_io_range vgic_dist_ranges[] = { - { - .base = GIC_DIST_SOFTINT, - .len = 4, - .handle_mmio = handle_mmio_sgi_reg, - }, - { - .base = GIC_DIST_CTRL, - .len = 12, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_misc, - }, - { - .base = GIC_DIST_IGROUP, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GIC_DIST_ENABLE_SET, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_enable_reg, - }, - { - .base = GIC_DIST_ENABLE_CLEAR, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_enable_reg, - }, - { - .base = GIC_DIST_PENDING_SET, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_pending_reg, - }, - { - .base = GIC_DIST_PENDING_CLEAR, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_pending_reg, - }, - { - .base = GIC_DIST_ACTIVE_SET, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_active_reg, - }, - { - .base = GIC_DIST_ACTIVE_CLEAR, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_active_reg, - }, - { - .base = GIC_DIST_PRI, - .len = VGIC_MAX_IRQS, - .bits_per_irq = 8, - .handle_mmio = handle_mmio_priority_reg, - }, - { - .base = GIC_DIST_TARGET, - .len = VGIC_MAX_IRQS, - .bits_per_irq = 8, - .handle_mmio = handle_mmio_target_reg, - }, - { - .base = GIC_DIST_CONFIG, - .len = VGIC_MAX_IRQS / 4, - .bits_per_irq = 2, - .handle_mmio = handle_mmio_cfg_reg, - }, - { - .base = GIC_DIST_SGI_PENDING_CLEAR, - .len = VGIC_NR_SGIS, - .handle_mmio = handle_mmio_sgi_clear, - }, - { - .base = GIC_DIST_SGI_PENDING_SET, - .len = VGIC_NR_SGIS, - .handle_mmio = handle_mmio_sgi_set, - }, - {} -}; - -static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) -{ - struct kvm *kvm = vcpu->kvm; - struct vgic_dist *dist = &kvm->arch.vgic; - int nrcpus = atomic_read(&kvm->online_vcpus); - u8 target_cpus; - int sgi, mode, c, vcpu_id; - - vcpu_id = vcpu->vcpu_id; - - sgi = reg & 0xf; - target_cpus = (reg >> 16) & 0xff; - mode = (reg >> 24) & 3; - - switch (mode) { - case 0: - if (!target_cpus) - return; - break; - - case 1: - target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff; - break; - - case 2: - target_cpus = 1 << vcpu_id; - break; - } - - kvm_for_each_vcpu(c, vcpu, kvm) { - if (target_cpus & 1) { - /* Flag the SGI as pending */ - vgic_dist_irq_set_pending(vcpu, sgi); - *vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id; - kvm_debug("SGI%d from CPU%d to CPU%d\n", - sgi, vcpu_id, c); - } - - target_cpus >>= 1; - } -} - -static bool vgic_v2_queue_sgi(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - unsigned long sources; - int vcpu_id = vcpu->vcpu_id; - int c; - - sources = *vgic_get_sgi_sources(dist, vcpu_id, irq); - - for_each_set_bit(c, &sources, dist->nr_cpus) { - if (vgic_queue_irq(vcpu, c, irq)) - clear_bit(c, &sources); - } - - *vgic_get_sgi_sources(dist, vcpu_id, irq) = sources; - - /* - * If the sources bitmap has been cleared it means that we - * could queue all the SGIs onto link registers (see the - * clear_bit above), and therefore we are done with them in - * our emulated gic and can get rid of them. - */ - if (!sources) { - vgic_dist_irq_clear_pending(vcpu, irq); - vgic_cpu_irq_clear(vcpu, irq); - return true; - } - - return false; -} - -/** - * kvm_vgic_map_resources - Configure global VGIC state before running any VCPUs - * @kvm: pointer to the kvm struct - * - * Map the virtual CPU interface into the VM before running any VCPUs. We - * can't do this at creation time, because user space must first set the - * virtual CPU interface address in the guest physical address space. - */ -static int vgic_v2_map_resources(struct kvm *kvm, - const struct vgic_params *params) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - int ret = 0; - - if (!irqchip_in_kernel(kvm)) - return 0; - - mutex_lock(&kvm->lock); - - if (vgic_ready(kvm)) - goto out; - - if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || - IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) { - kvm_err("Need to set vgic cpu and dist addresses first\n"); - ret = -ENXIO; - goto out; - } - - vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base, - KVM_VGIC_V2_DIST_SIZE, - vgic_dist_ranges, -1, &dist->dist_iodev); - - /* - * Initialize the vgic if this hasn't already been done on demand by - * accessing the vgic state from userspace. - */ - ret = vgic_init(kvm); - if (ret) { - kvm_err("Unable to allocate maps\n"); - goto out_unregister; - } - - ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, - params->vcpu_base, KVM_VGIC_V2_CPU_SIZE, - true); - if (ret) { - kvm_err("Unable to remap VGIC CPU to VCPU\n"); - goto out_unregister; - } - - dist->ready = true; - goto out; - -out_unregister: - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev); - -out: - if (ret) - kvm_vgic_destroy(kvm); - mutex_unlock(&kvm->lock); - return ret; -} - -static void vgic_v2_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - *vgic_get_sgi_sources(dist, vcpu->vcpu_id, irq) |= 1 << source; -} - -static int vgic_v2_init_model(struct kvm *kvm) -{ - int i; - - for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4) - vgic_set_target_reg(kvm, 0, i); - - return 0; -} - -void vgic_v2_init_emulation(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - dist->vm_ops.queue_sgi = vgic_v2_queue_sgi; - dist->vm_ops.add_sgi_source = vgic_v2_add_sgi_source; - dist->vm_ops.init_model = vgic_v2_init_model; - dist->vm_ops.map_resources = vgic_v2_map_resources; - - kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS; -} - -static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - bool updated = false; - struct vgic_vmcr vmcr; - u32 *vmcr_field; - u32 reg; - - vgic_get_vmcr(vcpu, &vmcr); - - switch (offset & ~0x3) { - case GIC_CPU_CTRL: - vmcr_field = &vmcr.ctlr; - break; - case GIC_CPU_PRIMASK: - vmcr_field = &vmcr.pmr; - break; - case GIC_CPU_BINPOINT: - vmcr_field = &vmcr.bpr; - break; - case GIC_CPU_ALIAS_BINPOINT: - vmcr_field = &vmcr.abpr; - break; - default: - BUG(); - } - - if (!mmio->is_write) { - reg = *vmcr_field; - mmio_data_write(mmio, ~0, reg); - } else { - reg = mmio_data_read(mmio, ~0); - if (reg != *vmcr_field) { - *vmcr_field = reg; - vgic_set_vmcr(vcpu, &vmcr); - updated = true; - } - } - return updated; -} - -static bool handle_mmio_abpr(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT); -} - -static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 reg; - - if (mmio->is_write) - return false; - - /* GICC_IIDR */ - reg = (PRODUCT_ID_KVM << 20) | - (GICC_ARCH_VERSION_V2 << 16) | - (IMPLEMENTER_ARM << 0); - mmio_data_write(mmio, ~0, reg); - return false; -} - -/* - * CPU Interface Register accesses - these are not accessed by the VM, but by - * user space for saving and restoring VGIC state. - */ -static const struct vgic_io_range vgic_cpu_ranges[] = { - { - .base = GIC_CPU_CTRL, - .len = 12, - .handle_mmio = handle_cpu_mmio_misc, - }, - { - .base = GIC_CPU_ALIAS_BINPOINT, - .len = 4, - .handle_mmio = handle_mmio_abpr, - }, - { - .base = GIC_CPU_ACTIVEPRIO, - .len = 16, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GIC_CPU_IDENT, - .len = 4, - .handle_mmio = handle_cpu_mmio_ident, - }, -}; - -static int vgic_attr_regs_access(struct kvm_device *dev, - struct kvm_device_attr *attr, - u32 *reg, bool is_write) -{ - const struct vgic_io_range *r = NULL, *ranges; - phys_addr_t offset; - int ret, cpuid, c; - struct kvm_vcpu *vcpu, *tmp_vcpu; - struct vgic_dist *vgic; - struct kvm_exit_mmio mmio; - u32 data; - - offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; - cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> - KVM_DEV_ARM_VGIC_CPUID_SHIFT; - - mutex_lock(&dev->kvm->lock); - - ret = vgic_init(dev->kvm); - if (ret) - goto out; - - if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) { - ret = -EINVAL; - goto out; - } - - vcpu = kvm_get_vcpu(dev->kvm, cpuid); - vgic = &dev->kvm->arch.vgic; - - mmio.len = 4; - mmio.is_write = is_write; - mmio.data = &data; - if (is_write) - mmio_data_write(&mmio, ~0, *reg); - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - mmio.phys_addr = vgic->vgic_dist_base + offset; - ranges = vgic_dist_ranges; - break; - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - mmio.phys_addr = vgic->vgic_cpu_base + offset; - ranges = vgic_cpu_ranges; - break; - default: - BUG(); - } - r = vgic_find_range(ranges, 4, offset); - - if (unlikely(!r || !r->handle_mmio)) { - ret = -ENXIO; - goto out; - } - - - spin_lock(&vgic->lock); - - /* - * Ensure that no other VCPU is running by checking the vcpu->cpu - * field. If no other VPCUs are running we can safely access the VGIC - * state, because even if another VPU is run after this point, that - * VCPU will not touch the vgic state, because it will block on - * getting the vgic->lock in kvm_vgic_sync_hwstate(). - */ - kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) { - if (unlikely(tmp_vcpu->cpu != -1)) { - ret = -EBUSY; - goto out_vgic_unlock; - } - } - - /* - * Move all pending IRQs from the LRs on all VCPUs so the pending - * state can be properly represented in the register state accessible - * through this API. - */ - kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) - vgic_unqueue_irqs(tmp_vcpu); - - offset -= r->base; - r->handle_mmio(vcpu, &mmio, offset); - - if (!is_write) - *reg = mmio_data_read(&mmio, ~0); - - ret = 0; -out_vgic_unlock: - spin_unlock(&vgic->lock); -out: - mutex_unlock(&dev->kvm->lock); - return ret; -} - -static int vgic_v2_create(struct kvm_device *dev, u32 type) -{ - return kvm_vgic_create(dev->kvm, type); -} - -static void vgic_v2_destroy(struct kvm_device *dev) -{ - kfree(dev); -} - -static int vgic_v2_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_attr_regs_access(dev, attr, ®, true); - } - - } - - return -ENXIO; -} - -static int vgic_v2_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg = 0; - - ret = vgic_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } - - } - - return -ENXIO; -} - -static int vgic_v2_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - phys_addr_t offset; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - case KVM_VGIC_V2_ADDR_TYPE_CPU: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; - return vgic_has_attr_regs(vgic_dist_ranges, offset); - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; - return vgic_has_attr_regs(vgic_cpu_ranges, offset); - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: - return 0; - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - } - } - return -ENXIO; -} - -struct kvm_device_ops kvm_arm_vgic_v2_ops = { - .name = "kvm-arm-vgic-v2", - .create = vgic_v2_create, - .destroy = vgic_v2_destroy, - .set_attr = vgic_v2_set_attr, - .get_attr = vgic_v2_get_attr, - .has_attr = vgic_v2_has_attr, -}; diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c deleted file mode 100644 index 334cd7a89106..000000000000 --- a/virt/kvm/arm/vgic-v2.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (C) 2012,2013 ARM Limited, All Rights Reserved. - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr) -{ - struct vgic_lr lr_desc; - u32 val = vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr]; - - lr_desc.irq = val & GICH_LR_VIRTUALID; - if (lr_desc.irq <= 15) - lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7; - else - lr_desc.source = 0; - lr_desc.state = 0; - - if (val & GICH_LR_PENDING_BIT) - lr_desc.state |= LR_STATE_PENDING; - if (val & GICH_LR_ACTIVE_BIT) - lr_desc.state |= LR_STATE_ACTIVE; - if (val & GICH_LR_EOI) - lr_desc.state |= LR_EOI_INT; - if (val & GICH_LR_HW) { - lr_desc.state |= LR_HW; - lr_desc.hwirq = (val & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT; - } - - return lr_desc; -} - -static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr, - struct vgic_lr lr_desc) -{ - u32 lr_val; - - lr_val = lr_desc.irq; - - if (lr_desc.state & LR_STATE_PENDING) - lr_val |= GICH_LR_PENDING_BIT; - if (lr_desc.state & LR_STATE_ACTIVE) - lr_val |= GICH_LR_ACTIVE_BIT; - if (lr_desc.state & LR_EOI_INT) - lr_val |= GICH_LR_EOI; - - if (lr_desc.state & LR_HW) { - lr_val |= GICH_LR_HW; - lr_val |= (u32)lr_desc.hwirq << GICH_LR_PHYSID_CPUID_SHIFT; - } - - if (lr_desc.irq < VGIC_NR_SGIS) - lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT); - - vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val; - - if (!(lr_desc.state & LR_STATE_MASK)) - vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr); - else - vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr &= ~(1ULL << lr); -} - -static u64 vgic_v2_get_elrsr(const struct kvm_vcpu *vcpu) -{ - return vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr; -} - -static u64 vgic_v2_get_eisr(const struct kvm_vcpu *vcpu) -{ - return vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr; -} - -static void vgic_v2_clear_eisr(struct kvm_vcpu *vcpu) -{ - vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr = 0; -} - -static u32 vgic_v2_get_interrupt_status(const struct kvm_vcpu *vcpu) -{ - u32 misr = vcpu->arch.vgic_cpu.vgic_v2.vgic_misr; - u32 ret = 0; - - if (misr & GICH_MISR_EOI) - ret |= INT_STATUS_EOI; - if (misr & GICH_MISR_U) - ret |= INT_STATUS_UNDERFLOW; - - return ret; -} - -static void vgic_v2_enable_underflow(struct kvm_vcpu *vcpu) -{ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= GICH_HCR_UIE; -} - -static void vgic_v2_disable_underflow(struct kvm_vcpu *vcpu) -{ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_UIE; -} - -static void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr; - - vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> GICH_VMCR_CTRL_SHIFT; - vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> GICH_VMCR_ALIAS_BINPOINT_SHIFT; - vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> GICH_VMCR_BINPOINT_SHIFT; - vmcrp->pmr = (vmcr & GICH_VMCR_PRIMASK_MASK) >> GICH_VMCR_PRIMASK_SHIFT; -} - -static void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - u32 vmcr; - - vmcr = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK; - vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & GICH_VMCR_ALIAS_BINPOINT_MASK; - vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & GICH_VMCR_BINPOINT_MASK; - vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; - - vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr; -} - -static void vgic_v2_enable(struct kvm_vcpu *vcpu) -{ - /* - * By forcing VMCR to zero, the GIC will restore the binary - * points to their reset values. Anything else resets to zero - * anyway. - */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; - vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0; - - /* Get the show on the road... */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; -} - -static const struct vgic_ops vgic_v2_ops = { - .get_lr = vgic_v2_get_lr, - .set_lr = vgic_v2_set_lr, - .get_elrsr = vgic_v2_get_elrsr, - .get_eisr = vgic_v2_get_eisr, - .clear_eisr = vgic_v2_clear_eisr, - .get_interrupt_status = vgic_v2_get_interrupt_status, - .enable_underflow = vgic_v2_enable_underflow, - .disable_underflow = vgic_v2_disable_underflow, - .get_vmcr = vgic_v2_get_vmcr, - .set_vmcr = vgic_v2_set_vmcr, - .enable = vgic_v2_enable, -}; - -struct vgic_params __section(.hyp.text) vgic_v2_params; - -static void vgic_cpu_init_lrs(void *params) -{ - struct vgic_params *vgic = params; - int i; - - for (i = 0; i < vgic->nr_lr; i++) - writel_relaxed(0, vgic->vctrl_base + GICH_LR0 + (i * 4)); -} - -/** - * vgic_v2_probe - probe for a GICv2 compatible interrupt controller - * @gic_kvm_info: pointer to the GIC description - * @ops: address of a pointer to the GICv2 operations - * @params: address of a pointer to HW-specific parameters - * - * Returns 0 if a GICv2 has been found, with the low level operations - * in *ops and the HW parameters in *params. Returns an error code - * otherwise. - */ -int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info, - const struct vgic_ops **ops, - const struct vgic_params **params) -{ - int ret; - struct vgic_params *vgic = &vgic_v2_params; - const struct resource *vctrl_res = &gic_kvm_info->vctrl; - const struct resource *vcpu_res = &gic_kvm_info->vcpu; - - memset(vgic, 0, sizeof(*vgic)); - - if (!gic_kvm_info->maint_irq) { - kvm_err("error getting vgic maintenance irq\n"); - ret = -ENXIO; - goto out; - } - vgic->maint_irq = gic_kvm_info->maint_irq; - - if (!gic_kvm_info->vctrl.start) { - kvm_err("GICH not present in the firmware table\n"); - ret = -ENXIO; - goto out; - } - - vgic->vctrl_base = ioremap(gic_kvm_info->vctrl.start, - resource_size(&gic_kvm_info->vctrl)); - if (!vgic->vctrl_base) { - kvm_err("Cannot ioremap GICH\n"); - ret = -ENOMEM; - goto out; - } - - vgic->nr_lr = readl_relaxed(vgic->vctrl_base + GICH_VTR); - vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1; - - ret = create_hyp_io_mappings(vgic->vctrl_base, - vgic->vctrl_base + resource_size(vctrl_res), - vctrl_res->start); - if (ret) { - kvm_err("Cannot map VCTRL into hyp\n"); - goto out_unmap; - } - - if (!PAGE_ALIGNED(vcpu_res->start)) { - kvm_err("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)vcpu_res->start); - ret = -ENXIO; - goto out_unmap; - } - - if (!PAGE_ALIGNED(resource_size(vcpu_res))) { - kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n", - (unsigned long long)resource_size(vcpu_res), - PAGE_SIZE); - ret = -ENXIO; - goto out_unmap; - } - - vgic->can_emulate_gicv2 = true; - kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2); - - vgic->vcpu_base = vcpu_res->start; - - kvm_info("GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n", - gic_kvm_info->vctrl.start, vgic->vcpu_base, vgic->maint_irq); - - vgic->type = VGIC_V2; - vgic->max_gic_vcpus = VGIC_V2_MAX_CPUS; - - on_each_cpu(vgic_cpu_init_lrs, vgic, 1); - - *ops = &vgic_v2_ops; - *params = vgic; - goto out; - -out_unmap: - iounmap(vgic->vctrl_base); -out: - return ret; -} diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c deleted file mode 100644 index e661e7fb9d91..000000000000 --- a/virt/kvm/arm/vgic-v3-emul.c +++ /dev/null @@ -1,1074 +0,0 @@ -/* - * GICv3 distributor and redistributor emulation - * - * GICv3 emulation is currently only supported on a GICv3 host (because - * we rely on the hardware's CPU interface virtualization support), but - * supports both hardware with or without the optional GICv2 backwards - * compatibility features. - * - * Limitations of the emulation: - * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore) - * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI. - * - We do not support the message based interrupts (MBIs) triggered by - * writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0. - * - We do not support the (optional) backwards compatibility feature. - * GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports - * the compatiblity feature, you can use a GICv2 in the guest, though. - * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI. - * - Priorities are not emulated (same as the GICv2 emulation). Linux - * as a guest is fine with this, because it does not use priorities. - * - We only support Group1 interrupts. Again Linux uses only those. - * - * Copyright (C) 2014 ARM Ltd. - * Author: Andre Przywara - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include "vgic.h" - -static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg = 0xffffffff; - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - - return false; -} - -static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg = 0; - - /* - * Force ARE and DS to 1, the guest cannot change this. - * For the time being we only support Group1 interrupts. - */ - if (vcpu->kvm->arch.vgic.enabled) - reg = GICD_CTLR_ENABLE_SS_G1; - reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS; - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1); - vgic_update_state(vcpu->kvm); - return true; - } - return false; -} - -/* - * As this implementation does not provide compatibility - * with GICv2 (ARE==1), we report zero CPUs in bits [5..7]. - * Also LPIs and MBIs are not supported, so we set the respective bits to 0. - * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs). - */ -#define INTERRUPT_ID_BITS 10 -static bool handle_mmio_typer(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg; - - reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1; - - reg |= (INTERRUPT_ID_BITS - 1) << 19; - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - - return false; -} - -static bool handle_mmio_iidr(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg; - - reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - - return false; -} - -static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) - return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id, - ACCESS_WRITE_SETBIT); - - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) - return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id, - ACCESS_WRITE_CLEARBIT); - - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) - return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); - - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) - return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); - - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_set_active_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) - return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); - - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_clear_active_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) - return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); - - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 *reg; - - if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) { - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; - } - - reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority, - vcpu->vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - return false; -} - -static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 *reg; - - if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) { - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; - } - - reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, - vcpu->vcpu_id, offset >> 1); - - return vgic_handle_cfg_reg(reg, mmio, offset); -} - -/* - * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word) - * when we store the target MPIDR written by the guest. - */ -static u32 compress_mpidr(unsigned long mpidr) -{ - u32 ret; - - ret = MPIDR_AFFINITY_LEVEL(mpidr, 0); - ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8; - ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16; - ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24; - - return ret; -} - -static unsigned long uncompress_mpidr(u32 value) -{ - unsigned long mpidr; - - mpidr = ((value >> 0) & 0xFF) << MPIDR_LEVEL_SHIFT(0); - mpidr |= ((value >> 8) & 0xFF) << MPIDR_LEVEL_SHIFT(1); - mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2); - mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3); - - return mpidr; -} - -/* - * Lookup the given MPIDR value to get the vcpu_id (if there is one) - * and store that in the irq_spi_cpu[] array. - * This limits the number of VCPUs to 255 for now, extending the data - * type (or storing kvm_vcpu pointers) should lift the limit. - * Store the original MPIDR value in an extra array to support read-as-written. - * Unallocated MPIDRs are translated to a special value and caught - * before any array accesses. - */ -static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm *kvm = vcpu->kvm; - struct vgic_dist *dist = &kvm->arch.vgic; - int spi; - u32 reg; - int vcpu_id; - unsigned long *bmap, mpidr; - - /* - * The upper 32 bits of each 64 bit register are zero, - * as we don't support Aff3. - */ - if ((offset & 4)) { - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; - } - - /* This region only covers SPIs, so no handling of private IRQs here. */ - spi = offset / 8; - - /* get the stored MPIDR for this IRQ */ - mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]); - reg = mpidr; - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - - if (!mmio->is_write) - return false; - - /* - * Now clear the currently assigned vCPU from the map, making room - * for the new one to be written below - */ - vcpu = kvm_mpidr_to_vcpu(kvm, mpidr); - if (likely(vcpu)) { - vcpu_id = vcpu->vcpu_id; - bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]); - __clear_bit(spi, bmap); - } - - dist->irq_spi_mpidr[spi] = compress_mpidr(reg); - vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK); - - /* - * The spec says that non-existent MPIDR values should not be - * forwarded to any existent (v)CPU, but should be able to become - * pending anyway. We simply keep the irq_spi_target[] array empty, so - * the interrupt will never be injected. - * irq_spi_cpu[irq] gets a magic value in this case. - */ - if (likely(vcpu)) { - vcpu_id = vcpu->vcpu_id; - dist->irq_spi_cpu[spi] = vcpu_id; - bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]); - __set_bit(spi, bmap); - } else { - dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED; - } - - vgic_update_state(kvm); - - return true; -} - -/* - * We should be careful about promising too much when a guest reads - * this register. Don't claim to be like any hardware implementation, - * but just report the GIC as version 3 - which is what a Linux guest - * would check. - */ -static bool handle_mmio_idregs(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 reg = 0; - - switch (offset + GICD_IDREGS) { - case GICD_PIDR2: - reg = 0x3b; - break; - } - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - - return false; -} - -static const struct vgic_io_range vgic_v3_dist_ranges[] = { - { - .base = GICD_CTLR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_ctlr, - }, - { - .base = GICD_TYPER, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_typer, - }, - { - .base = GICD_IIDR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_iidr, - }, - { - /* this register is optional, it is RAZ/WI if not implemented */ - .base = GICD_STATUSR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this write only register is WI when TYPER.MBIS=0 */ - .base = GICD_SETSPI_NSR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this write only register is WI when TYPER.MBIS=0 */ - .base = GICD_CLRSPI_NSR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this is RAZ/WI when DS=1 */ - .base = GICD_SETSPI_SR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this is RAZ/WI when DS=1 */ - .base = GICD_CLRSPI_SR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GICD_IGROUPR, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_rao_wi, - }, - { - .base = GICD_ISENABLER, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_enable_reg_dist, - }, - { - .base = GICD_ICENABLER, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_enable_reg_dist, - }, - { - .base = GICD_ISPENDR, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_pending_reg_dist, - }, - { - .base = GICD_ICPENDR, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_pending_reg_dist, - }, - { - .base = GICD_ISACTIVER, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_active_reg_dist, - }, - { - .base = GICD_ICACTIVER, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_active_reg_dist, - }, - { - .base = GICD_IPRIORITYR, - .len = 0x400, - .bits_per_irq = 8, - .handle_mmio = handle_mmio_priority_reg_dist, - }, - { - /* TARGETSRn is RES0 when ARE=1 */ - .base = GICD_ITARGETSR, - .len = 0x400, - .bits_per_irq = 8, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GICD_ICFGR, - .len = 0x100, - .bits_per_irq = 2, - .handle_mmio = handle_mmio_cfg_reg_dist, - }, - { - /* this is RAZ/WI when DS=1 */ - .base = GICD_IGRPMODR, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this is RAZ/WI when DS=1 */ - .base = GICD_NSACR, - .len = 0x100, - .bits_per_irq = 2, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this is RAZ/WI when ARE=1 */ - .base = GICD_SGIR, - .len = 0x04, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this is RAZ/WI when ARE=1 */ - .base = GICD_CPENDSGIR, - .len = 0x10, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this is RAZ/WI when ARE=1 */ - .base = GICD_SPENDSGIR, - .len = 0x10, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GICD_IROUTER + 0x100, - .len = 0x1ee0, - .bits_per_irq = 64, - .handle_mmio = handle_mmio_route_reg, - }, - { - .base = GICD_IDREGS, - .len = 0x30, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_idregs, - }, - {}, -}; - -static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - /* since we don't support LPIs, this register is zero for now */ - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 reg; - u64 mpidr; - struct kvm_vcpu *redist_vcpu = mmio->private; - int target_vcpu_id = redist_vcpu->vcpu_id; - - /* the upper 32 bits contain the affinity value */ - if ((offset & ~3) == 4) { - mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu); - reg = compress_mpidr(mpidr); - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - return false; - } - - reg = redist_vcpu->vcpu_id << 8; - if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1) - reg |= GICR_TYPER_LAST; - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, - redist_vcpu->vcpu_id, - ACCESS_WRITE_SETBIT); -} - -static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, - redist_vcpu->vcpu_id, - ACCESS_WRITE_CLEARBIT); -} - -static bool handle_mmio_set_active_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset, - redist_vcpu->vcpu_id); -} - -static bool handle_mmio_clear_active_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset, - redist_vcpu->vcpu_id); -} - -static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset, - redist_vcpu->vcpu_id); -} - -static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset, - redist_vcpu->vcpu_id); -} - -static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - u32 *reg; - - reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority, - redist_vcpu->vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - return false; -} - -static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, - redist_vcpu->vcpu_id, offset >> 1); - - return vgic_handle_cfg_reg(reg, mmio, offset); -} - -#define SGI_base(x) ((x) + SZ_64K) - -static const struct vgic_io_range vgic_redist_ranges[] = { - { - .base = GICR_CTLR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_ctlr_redist, - }, - { - .base = GICR_TYPER, - .len = 0x08, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_typer_redist, - }, - { - .base = GICR_IIDR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_iidr, - }, - { - .base = GICR_WAKER, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GICR_IDREGS, - .len = 0x30, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_idregs, - }, - { - .base = SGI_base(GICR_IGROUPR0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_rao_wi, - }, - { - .base = SGI_base(GICR_ISENABLER0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_enable_reg_redist, - }, - { - .base = SGI_base(GICR_ICENABLER0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_enable_reg_redist, - }, - { - .base = SGI_base(GICR_ISPENDR0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_pending_reg_redist, - }, - { - .base = SGI_base(GICR_ICPENDR0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_pending_reg_redist, - }, - { - .base = SGI_base(GICR_ISACTIVER0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_active_reg_redist, - }, - { - .base = SGI_base(GICR_ICACTIVER0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_active_reg_redist, - }, - { - .base = SGI_base(GICR_IPRIORITYR0), - .len = 0x20, - .bits_per_irq = 8, - .handle_mmio = handle_mmio_priority_reg_redist, - }, - { - .base = SGI_base(GICR_ICFGR0), - .len = 0x08, - .bits_per_irq = 2, - .handle_mmio = handle_mmio_cfg_reg_redist, - }, - { - .base = SGI_base(GICR_IGRPMODR0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = SGI_base(GICR_NSACR), - .len = 0x04, - .handle_mmio = handle_mmio_raz_wi, - }, - {}, -}; - -static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq) -{ - if (vgic_queue_irq(vcpu, 0, irq)) { - vgic_dist_irq_clear_pending(vcpu, irq); - vgic_cpu_irq_clear(vcpu, irq); - return true; - } - - return false; -} - -static int vgic_v3_map_resources(struct kvm *kvm, - const struct vgic_params *params) -{ - int ret = 0; - struct vgic_dist *dist = &kvm->arch.vgic; - gpa_t rdbase = dist->vgic_redist_base; - struct vgic_io_device *iodevs = NULL; - int i; - - if (!irqchip_in_kernel(kvm)) - return 0; - - mutex_lock(&kvm->lock); - - if (vgic_ready(kvm)) - goto out; - - if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || - IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) { - kvm_err("Need to set vgic distributor addresses first\n"); - ret = -ENXIO; - goto out; - } - - /* - * For a VGICv3 we require the userland to explicitly initialize - * the VGIC before we need to use it. - */ - if (!vgic_initialized(kvm)) { - ret = -EBUSY; - goto out; - } - - ret = vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base, - GIC_V3_DIST_SIZE, vgic_v3_dist_ranges, - -1, &dist->dist_iodev); - if (ret) - goto out; - - iodevs = kcalloc(dist->nr_cpus, sizeof(iodevs[0]), GFP_KERNEL); - if (!iodevs) { - ret = -ENOMEM; - goto out_unregister; - } - - for (i = 0; i < dist->nr_cpus; i++) { - ret = vgic_register_kvm_io_dev(kvm, rdbase, - SZ_128K, vgic_redist_ranges, - i, &iodevs[i]); - if (ret) - goto out_unregister; - rdbase += GIC_V3_REDIST_SIZE; - } - - dist->redist_iodevs = iodevs; - dist->ready = true; - goto out; - -out_unregister: - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev); - if (iodevs) { - for (i = 0; i < dist->nr_cpus; i++) { - if (iodevs[i].dev.ops) - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, - &iodevs[i].dev); - } - } - -out: - if (ret) - kvm_vgic_destroy(kvm); - mutex_unlock(&kvm->lock); - return ret; -} - -static int vgic_v3_init_model(struct kvm *kvm) -{ - int i; - u32 mpidr; - struct vgic_dist *dist = &kvm->arch.vgic; - int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS; - - dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]), - GFP_KERNEL); - - if (!dist->irq_spi_mpidr) - return -ENOMEM; - - /* Initialize the target VCPUs for each IRQ to VCPU 0 */ - mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0))); - for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) { - dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0; - dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr; - vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1); - } - - return 0; -} - -/* GICv3 does not keep track of SGI sources anymore. */ -static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source) -{ -} - -void vgic_v3_init_emulation(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - dist->vm_ops.queue_sgi = vgic_v3_queue_sgi; - dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source; - dist->vm_ops.init_model = vgic_v3_init_model; - dist->vm_ops.map_resources = vgic_v3_map_resources; - - kvm->arch.max_vcpus = KVM_MAX_VCPUS; -} - -/* - * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI - * generation register ICC_SGI1R_EL1) with a given VCPU. - * If the VCPU's MPIDR matches, return the level0 affinity, otherwise - * return -1. - */ -static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) -{ - unsigned long affinity; - int level0; - - /* - * Split the current VCPU's MPIDR into affinity level 0 and the - * rest as this is what we have to compare against. - */ - affinity = kvm_vcpu_get_mpidr_aff(vcpu); - level0 = MPIDR_AFFINITY_LEVEL(affinity, 0); - affinity &= ~MPIDR_LEVEL_MASK; - - /* bail out if the upper three levels don't match */ - if (sgi_aff != affinity) - return -1; - - /* Is this VCPU's bit set in the mask ? */ - if (!(sgi_cpu_mask & BIT(level0))) - return -1; - - return level0; -} - -#define SGI_AFFINITY_LEVEL(reg, level) \ - ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \ - >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) - -/** - * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs - * @vcpu: The VCPU requesting a SGI - * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU - * - * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register. - * This will trap in sys_regs.c and call this function. - * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the - * target processors as well as a bitmask of 16 Aff0 CPUs. - * If the interrupt routing mode bit is not set, we iterate over all VCPUs to - * check for matching ones. If this bit is set, we signal all, but not the - * calling VCPU. - */ -void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu *c_vcpu; - struct vgic_dist *dist = &kvm->arch.vgic; - u16 target_cpus; - u64 mpidr; - int sgi, c; - int vcpu_id = vcpu->vcpu_id; - bool broadcast; - int updated = 0; - - sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; - broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); - target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT; - mpidr = SGI_AFFINITY_LEVEL(reg, 3); - mpidr |= SGI_AFFINITY_LEVEL(reg, 2); - mpidr |= SGI_AFFINITY_LEVEL(reg, 1); - - /* - * We take the dist lock here, because we come from the sysregs - * code path and not from the MMIO one (which already takes the lock). - */ - spin_lock(&dist->lock); - - /* - * We iterate over all VCPUs to find the MPIDRs matching the request. - * If we have handled one CPU, we clear it's bit to detect early - * if we are already finished. This avoids iterating through all - * VCPUs when most of the times we just signal a single VCPU. - */ - kvm_for_each_vcpu(c, c_vcpu, kvm) { - - /* Exit early if we have dealt with all requested CPUs */ - if (!broadcast && target_cpus == 0) - break; - - /* Don't signal the calling VCPU */ - if (broadcast && c == vcpu_id) - continue; - - if (!broadcast) { - int level0; - - level0 = match_mpidr(mpidr, target_cpus, c_vcpu); - if (level0 == -1) - continue; - - /* remove this matching VCPU from the mask */ - target_cpus &= ~BIT(level0); - } - - /* Flag the SGI as pending */ - vgic_dist_irq_set_pending(c_vcpu, sgi); - updated = 1; - kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c); - } - if (updated) - vgic_update_state(vcpu->kvm); - spin_unlock(&dist->lock); - if (updated) - vgic_kick_vcpus(vcpu->kvm); -} - -static int vgic_v3_create(struct kvm_device *dev, u32 type) -{ - return kvm_vgic_create(dev->kvm, type); -} - -static void vgic_v3_destroy(struct kvm_device *dev) -{ - kfree(dev); -} - -static int vgic_v3_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - return -ENXIO; - } - - return -ENXIO; -} - -static int vgic_v3_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - return -ENXIO; - } - - return -ENXIO; -} - -static int vgic_v3_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - case KVM_VGIC_V2_ADDR_TYPE_CPU: - return -ENXIO; - case KVM_VGIC_V3_ADDR_TYPE_DIST: - case KVM_VGIC_V3_ADDR_TYPE_REDIST: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - return -ENXIO; - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: - return 0; - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - } - } - return -ENXIO; -} - -struct kvm_device_ops kvm_arm_vgic_v3_ops = { - .name = "kvm-arm-vgic-v3", - .create = vgic_v3_create, - .destroy = vgic_v3_destroy, - .set_attr = vgic_v3_set_attr, - .get_attr = vgic_v3_get_attr, - .has_attr = vgic_v3_has_attr, -}; diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c deleted file mode 100644 index 75b02fa86436..000000000000 --- a/virt/kvm/arm/vgic-v3.c +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Copyright (C) 2013 ARM Limited, All Rights Reserved. - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -static u32 ich_vtr_el2; - -static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr) -{ - struct vgic_lr lr_desc; - u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr]; - - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) - lr_desc.irq = val & ICH_LR_VIRTUAL_ID_MASK; - else - lr_desc.irq = val & GICH_LR_VIRTUALID; - - lr_desc.source = 0; - if (lr_desc.irq <= 15 && - vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) - lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7; - - lr_desc.state = 0; - - if (val & ICH_LR_PENDING_BIT) - lr_desc.state |= LR_STATE_PENDING; - if (val & ICH_LR_ACTIVE_BIT) - lr_desc.state |= LR_STATE_ACTIVE; - if (val & ICH_LR_EOI) - lr_desc.state |= LR_EOI_INT; - if (val & ICH_LR_HW) { - lr_desc.state |= LR_HW; - lr_desc.hwirq = (val >> ICH_LR_PHYS_ID_SHIFT) & GENMASK(9, 0); - } - - return lr_desc; -} - -static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr, - struct vgic_lr lr_desc) -{ - u64 lr_val; - - lr_val = lr_desc.irq; - - /* - * Currently all guest IRQs are Group1, as Group0 would result - * in a FIQ in the guest, which it wouldn't expect. - * Eventually we want to make this configurable, so we may revisit - * this in the future. - */ - switch (vcpu->kvm->arch.vgic.vgic_model) { - case KVM_DEV_TYPE_ARM_VGIC_V3: - lr_val |= ICH_LR_GROUP; - break; - case KVM_DEV_TYPE_ARM_VGIC_V2: - if (lr_desc.irq < VGIC_NR_SGIS) - lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT; - break; - default: - BUG(); - } - - if (lr_desc.state & LR_STATE_PENDING) - lr_val |= ICH_LR_PENDING_BIT; - if (lr_desc.state & LR_STATE_ACTIVE) - lr_val |= ICH_LR_ACTIVE_BIT; - if (lr_desc.state & LR_EOI_INT) - lr_val |= ICH_LR_EOI; - if (lr_desc.state & LR_HW) { - lr_val |= ICH_LR_HW; - lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT; - } - - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = lr_val; - - if (!(lr_desc.state & LR_STATE_MASK)) - vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr); - else - vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr &= ~(1U << lr); -} - -static u64 vgic_v3_get_elrsr(const struct kvm_vcpu *vcpu) -{ - return vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr; -} - -static u64 vgic_v3_get_eisr(const struct kvm_vcpu *vcpu) -{ - return vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr; -} - -static void vgic_v3_clear_eisr(struct kvm_vcpu *vcpu) -{ - vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr = 0; -} - -static u32 vgic_v3_get_interrupt_status(const struct kvm_vcpu *vcpu) -{ - u32 misr = vcpu->arch.vgic_cpu.vgic_v3.vgic_misr; - u32 ret = 0; - - if (misr & ICH_MISR_EOI) - ret |= INT_STATUS_EOI; - if (misr & ICH_MISR_U) - ret |= INT_STATUS_UNDERFLOW; - - return ret; -} - -static void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr; - - vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT; - vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; - vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; - vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; -} - -static void vgic_v3_enable_underflow(struct kvm_vcpu *vcpu) -{ - vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr |= ICH_HCR_UIE; -} - -static void vgic_v3_disable_underflow(struct kvm_vcpu *vcpu) -{ - vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr &= ~ICH_HCR_UIE; -} - -static void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - u32 vmcr; - - vmcr = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK; - vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; - vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; - vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; - - vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr; -} - -static void vgic_v3_enable(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; - - /* - * By forcing VMCR to zero, the GIC will restore the binary - * points to their reset values. Anything else resets to zero - * anyway. - */ - vgic_v3->vgic_vmcr = 0; - vgic_v3->vgic_elrsr = ~0; - - /* - * If we are emulating a GICv3, we do it in an non-GICv2-compatible - * way, so we force SRE to 1 to demonstrate this to the guest. - * This goes with the spec allowing the value to be RAO/WI. - */ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) - vgic_v3->vgic_sre = ICC_SRE_EL1_SRE; - else - vgic_v3->vgic_sre = 0; - - /* Get the show on the road... */ - vgic_v3->vgic_hcr = ICH_HCR_EN; -} - -static const struct vgic_ops vgic_v3_ops = { - .get_lr = vgic_v3_get_lr, - .set_lr = vgic_v3_set_lr, - .get_elrsr = vgic_v3_get_elrsr, - .get_eisr = vgic_v3_get_eisr, - .clear_eisr = vgic_v3_clear_eisr, - .get_interrupt_status = vgic_v3_get_interrupt_status, - .enable_underflow = vgic_v3_enable_underflow, - .disable_underflow = vgic_v3_disable_underflow, - .get_vmcr = vgic_v3_get_vmcr, - .set_vmcr = vgic_v3_set_vmcr, - .enable = vgic_v3_enable, -}; - -static struct vgic_params vgic_v3_params; - -static void vgic_cpu_init_lrs(void *params) -{ - kvm_call_hyp(__vgic_v3_init_lrs); -} - -/** - * vgic_v3_probe - probe for a GICv3 compatible interrupt controller - * @gic_kvm_info: pointer to the GIC description - * @ops: address of a pointer to the GICv3 operations - * @params: address of a pointer to HW-specific parameters - * - * Returns 0 if a GICv3 has been found, with the low level operations - * in *ops and the HW parameters in *params. Returns an error code - * otherwise. - */ -int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, - const struct vgic_ops **ops, - const struct vgic_params **params) -{ - int ret = 0; - struct vgic_params *vgic = &vgic_v3_params; - const struct resource *vcpu_res = &gic_kvm_info->vcpu; - - vgic->maint_irq = gic_kvm_info->maint_irq; - - ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); - - /* - * The ListRegs field is 5 bits, but there is a architectural - * maximum of 16 list registers. Just ignore bit 4... - */ - vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1; - vgic->can_emulate_gicv2 = false; - - if (!vcpu_res->start) { - kvm_info("GICv3: no GICV resource entry\n"); - vgic->vcpu_base = 0; - } else if (!PAGE_ALIGNED(vcpu_res->start)) { - pr_warn("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)vcpu_res->start); - vgic->vcpu_base = 0; - } else if (!PAGE_ALIGNED(resource_size(vcpu_res))) { - pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n", - (unsigned long long)resource_size(vcpu_res), - PAGE_SIZE); - } else { - vgic->vcpu_base = vcpu_res->start; - vgic->can_emulate_gicv2 = true; - kvm_register_device_ops(&kvm_arm_vgic_v2_ops, - KVM_DEV_TYPE_ARM_VGIC_V2); - } - if (vgic->vcpu_base == 0) - kvm_info("disabling GICv2 emulation\n"); - kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3); - - vgic->vctrl_base = NULL; - vgic->type = VGIC_V3; - vgic->max_gic_vcpus = VGIC_V3_MAX_CPUS; - - kvm_info("GICV base=0x%llx, IRQ=%d\n", - vgic->vcpu_base, vgic->maint_irq); - - on_each_cpu(vgic_cpu_init_lrs, vgic, 1); - - *ops = &vgic_v3_ops; - *params = vgic; - - return ret; -} diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c deleted file mode 100644 index 67cb5e948be2..000000000000 --- a/virt/kvm/arm/vgic.c +++ /dev/null @@ -1,2417 +0,0 @@ -/* - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include "trace.h" - -/* - * How the whole thing works (courtesy of Christoffer Dall): - * - * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if - * something is pending on the CPU interface. - * - Interrupts that are pending on the distributor are stored on the - * vgic.irq_pending vgic bitmap (this bitmap is updated by both user land - * ioctls and guest mmio ops, and other in-kernel peripherals such as the - * arch. timers). - * - Every time the bitmap changes, the irq_pending_on_cpu oracle is - * recalculated - * - To calculate the oracle, we need info for each cpu from - * compute_pending_for_cpu, which considers: - * - PPI: dist->irq_pending & dist->irq_enable - * - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target - * - irq_spi_target is a 'formatted' version of the GICD_ITARGETSRn - * registers, stored on each vcpu. We only keep one bit of - * information per interrupt, making sure that only one vcpu can - * accept the interrupt. - * - If any of the above state changes, we must recalculate the oracle. - * - The same is true when injecting an interrupt, except that we only - * consider a single interrupt at a time. The irq_spi_cpu array - * contains the target CPU for each SPI. - * - * The handling of level interrupts adds some extra complexity. We - * need to track when the interrupt has been EOIed, so we can sample - * the 'line' again. This is achieved as such: - * - * - When a level interrupt is moved onto a vcpu, the corresponding - * bit in irq_queued is set. As long as this bit is set, the line - * will be ignored for further interrupts. The interrupt is injected - * into the vcpu with the GICH_LR_EOI bit set (generate a - * maintenance interrupt on EOI). - * - When the interrupt is EOIed, the maintenance interrupt fires, - * and clears the corresponding bit in irq_queued. This allows the - * interrupt line to be sampled again. - * - Note that level-triggered interrupts can also be set to pending from - * writes to GICD_ISPENDRn and lowering the external input line does not - * cause the interrupt to become inactive in such a situation. - * Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become - * inactive as long as the external input line is held high. - * - * - * Initialization rules: there are multiple stages to the vgic - * initialization, both for the distributor and the CPU interfaces. - * - * Distributor: - * - * - kvm_vgic_early_init(): initialization of static data that doesn't - * depend on any sizing information or emulation type. No allocation - * is allowed there. - * - * - vgic_init(): allocation and initialization of the generic data - * structures that depend on sizing information (number of CPUs, - * number of interrupts). Also initializes the vcpu specific data - * structures. Can be executed lazily for GICv2. - * [to be renamed to kvm_vgic_init??] - * - * CPU Interface: - * - * - kvm_vgic_cpu_early_init(): initialization of static data that - * doesn't depend on any sizing information or emulation type. No - * allocation is allowed there. - */ - -#include "vgic.h" - -static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); -static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu); -static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr); -static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc); -static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu); -static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu, - int virt_irq); -static int compute_pending_for_cpu(struct kvm_vcpu *vcpu); - -static const struct vgic_ops *vgic_ops; -static const struct vgic_params *vgic; - -static void add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source) -{ - vcpu->kvm->arch.vgic.vm_ops.add_sgi_source(vcpu, irq, source); -} - -static bool queue_sgi(struct kvm_vcpu *vcpu, int irq) -{ - return vcpu->kvm->arch.vgic.vm_ops.queue_sgi(vcpu, irq); -} - -int kvm_vgic_map_resources(struct kvm *kvm) -{ - return kvm->arch.vgic.vm_ops.map_resources(kvm, vgic); -} - -/* - * struct vgic_bitmap contains a bitmap made of unsigned longs, but - * extracts u32s out of them. - * - * This does not work on 64-bit BE systems, because the bitmap access - * will store two consecutive 32-bit words with the higher-addressed - * register's bits at the lower index and the lower-addressed register's - * bits at the higher index. - * - * Therefore, swizzle the register index when accessing the 32-bit word - * registers to access the right register's value. - */ -#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 64 -#define REG_OFFSET_SWIZZLE 1 -#else -#define REG_OFFSET_SWIZZLE 0 -#endif - -static int vgic_init_bitmap(struct vgic_bitmap *b, int nr_cpus, int nr_irqs) -{ - int nr_longs; - - nr_longs = nr_cpus + BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS); - - b->private = kzalloc(sizeof(unsigned long) * nr_longs, GFP_KERNEL); - if (!b->private) - return -ENOMEM; - - b->shared = b->private + nr_cpus; - - return 0; -} - -static void vgic_free_bitmap(struct vgic_bitmap *b) -{ - kfree(b->private); - b->private = NULL; - b->shared = NULL; -} - -/* - * Call this function to convert a u64 value to an unsigned long * bitmask - * in a way that works on both 32-bit and 64-bit LE and BE platforms. - * - * Warning: Calling this function may modify *val. - */ -static unsigned long *u64_to_bitmask(u64 *val) -{ -#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32 - *val = (*val >> 32) | (*val << 32); -#endif - return (unsigned long *)val; -} - -u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset) -{ - offset >>= 2; - if (!offset) - return (u32 *)(x->private + cpuid) + REG_OFFSET_SWIZZLE; - else - return (u32 *)(x->shared) + ((offset - 1) ^ REG_OFFSET_SWIZZLE); -} - -static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x, - int cpuid, int irq) -{ - if (irq < VGIC_NR_PRIVATE_IRQS) - return test_bit(irq, x->private + cpuid); - - return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared); -} - -void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, - int irq, int val) -{ - unsigned long *reg; - - if (irq < VGIC_NR_PRIVATE_IRQS) { - reg = x->private + cpuid; - } else { - reg = x->shared; - irq -= VGIC_NR_PRIVATE_IRQS; - } - - if (val) - set_bit(irq, reg); - else - clear_bit(irq, reg); -} - -static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid) -{ - return x->private + cpuid; -} - -unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x) -{ - return x->shared; -} - -static int vgic_init_bytemap(struct vgic_bytemap *x, int nr_cpus, int nr_irqs) -{ - int size; - - size = nr_cpus * VGIC_NR_PRIVATE_IRQS; - size += nr_irqs - VGIC_NR_PRIVATE_IRQS; - - x->private = kzalloc(size, GFP_KERNEL); - if (!x->private) - return -ENOMEM; - - x->shared = x->private + nr_cpus * VGIC_NR_PRIVATE_IRQS / sizeof(u32); - return 0; -} - -static void vgic_free_bytemap(struct vgic_bytemap *b) -{ - kfree(b->private); - b->private = NULL; - b->shared = NULL; -} - -u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset) -{ - u32 *reg; - - if (offset < VGIC_NR_PRIVATE_IRQS) { - reg = x->private; - offset += cpuid * VGIC_NR_PRIVATE_IRQS; - } else { - reg = x->shared; - offset -= VGIC_NR_PRIVATE_IRQS; - } - - return reg + (offset / sizeof(u32)); -} - -#define VGIC_CFG_LEVEL 0 -#define VGIC_CFG_EDGE 1 - -static bool vgic_irq_is_edge(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int irq_val; - - irq_val = vgic_bitmap_get_irq_val(&dist->irq_cfg, vcpu->vcpu_id, irq); - return irq_val == VGIC_CFG_EDGE; -} - -static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq); -} - -static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq); -} - -static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq); -} - -static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 1); -} - -static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0); -} - -static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1); -} - -static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0); -} - -static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_level, vcpu->vcpu_id, irq); -} - -static void vgic_dist_irq_set_level(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 1); -} - -static void vgic_dist_irq_clear_level(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 0); -} - -static int vgic_dist_irq_soft_pend(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq); -} - -static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0); - if (!vgic_dist_irq_get_level(vcpu, irq)) { - vgic_dist_irq_clear_pending(vcpu, irq); - if (!compute_pending_for_cpu(vcpu)) - clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); - } -} - -static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq); -} - -void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1); -} - -void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 0); -} - -static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq) -{ - if (irq < VGIC_NR_PRIVATE_IRQS) - set_bit(irq, vcpu->arch.vgic_cpu.pending_percpu); - else - set_bit(irq - VGIC_NR_PRIVATE_IRQS, - vcpu->arch.vgic_cpu.pending_shared); -} - -void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq) -{ - if (irq < VGIC_NR_PRIVATE_IRQS) - clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu); - else - clear_bit(irq - VGIC_NR_PRIVATE_IRQS, - vcpu->arch.vgic_cpu.pending_shared); -} - -static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq) -{ - return !vgic_irq_is_queued(vcpu, irq); -} - -/** - * vgic_reg_access - access vgic register - * @mmio: pointer to the data describing the mmio access - * @reg: pointer to the virtual backing of vgic distributor data - * @offset: least significant 2 bits used for word offset - * @mode: ACCESS_ mode (see defines above) - * - * Helper to make vgic register access easier using one of the access - * modes defined for vgic register access - * (read,raz,write-ignored,setbit,clearbit,write) - */ -void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg, - phys_addr_t offset, int mode) -{ - int word_offset = (offset & 3) * 8; - u32 mask = (1UL << (mmio->len * 8)) - 1; - u32 regval; - - /* - * Any alignment fault should have been delivered to the guest - * directly (ARM ARM B3.12.7 "Prioritization of aborts"). - */ - - if (reg) { - regval = *reg; - } else { - BUG_ON(mode != (ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED)); - regval = 0; - } - - if (mmio->is_write) { - u32 data = mmio_data_read(mmio, mask) << word_offset; - switch (ACCESS_WRITE_MASK(mode)) { - case ACCESS_WRITE_IGNORED: - return; - - case ACCESS_WRITE_SETBIT: - regval |= data; - break; - - case ACCESS_WRITE_CLEARBIT: - regval &= ~data; - break; - - case ACCESS_WRITE_VALUE: - regval = (regval & ~(mask << word_offset)) | data; - break; - } - *reg = regval; - } else { - switch (ACCESS_READ_MASK(mode)) { - case ACCESS_READ_RAZ: - regval = 0; - /* fall through */ - - case ACCESS_READ_VALUE: - mmio_data_write(mmio, mask, regval >> word_offset); - } - } -} - -bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id, int access) -{ - u32 *reg; - int mode = ACCESS_READ_VALUE | access; - struct kvm_vcpu *target_vcpu = kvm_get_vcpu(kvm, vcpu_id); - - reg = vgic_bitmap_get_reg(&kvm->arch.vgic.irq_enabled, vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, mode); - if (mmio->is_write) { - if (access & ACCESS_WRITE_CLEARBIT) { - if (offset < 4) /* Force SGI enabled */ - *reg |= 0xffff; - vgic_retire_disabled_irqs(target_vcpu); - } - vgic_update_state(kvm); - return true; - } - - return false; -} - -bool vgic_handle_set_pending_reg(struct kvm *kvm, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id) -{ - u32 *reg, orig; - u32 level_mask; - int mode = ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT; - struct vgic_dist *dist = &kvm->arch.vgic; - - reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu_id, offset); - level_mask = (~(*reg)); - - /* Mark both level and edge triggered irqs as pending */ - reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset); - orig = *reg; - vgic_reg_access(mmio, reg, offset, mode); - - if (mmio->is_write) { - /* Set the soft-pending flag only for level-triggered irqs */ - reg = vgic_bitmap_get_reg(&dist->irq_soft_pend, - vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, mode); - *reg &= level_mask; - - /* Ignore writes to SGIs */ - if (offset < 2) { - *reg &= ~0xffff; - *reg |= orig & 0xffff; - } - - vgic_update_state(kvm); - return true; - } - - return false; -} - -bool vgic_handle_clear_pending_reg(struct kvm *kvm, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id) -{ - u32 *level_active; - u32 *reg, orig; - int mode = ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT; - struct vgic_dist *dist = &kvm->arch.vgic; - - reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset); - orig = *reg; - vgic_reg_access(mmio, reg, offset, mode); - if (mmio->is_write) { - /* Re-set level triggered level-active interrupts */ - level_active = vgic_bitmap_get_reg(&dist->irq_level, - vcpu_id, offset); - reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset); - *reg |= *level_active; - - /* Ignore writes to SGIs */ - if (offset < 2) { - *reg &= ~0xffff; - *reg |= orig & 0xffff; - } - - /* Clear soft-pending flags */ - reg = vgic_bitmap_get_reg(&dist->irq_soft_pend, - vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, mode); - - vgic_update_state(kvm); - return true; - } - return false; -} - -bool vgic_handle_set_active_reg(struct kvm *kvm, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id) -{ - u32 *reg; - struct vgic_dist *dist = &kvm->arch.vgic; - - reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); - - if (mmio->is_write) { - vgic_update_state(kvm); - return true; - } - - return false; -} - -bool vgic_handle_clear_active_reg(struct kvm *kvm, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id) -{ - u32 *reg; - struct vgic_dist *dist = &kvm->arch.vgic; - - reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); - - if (mmio->is_write) { - vgic_update_state(kvm); - return true; - } - - return false; -} - -static u32 vgic_cfg_expand(u16 val) -{ - u32 res = 0; - int i; - - /* - * Turn a 16bit value like abcd...mnop into a 32bit word - * a0b0c0d0...m0n0o0p0, which is what the HW cfg register is. - */ - for (i = 0; i < 16; i++) - res |= ((val >> i) & VGIC_CFG_EDGE) << (2 * i + 1); - - return res; -} - -static u16 vgic_cfg_compress(u32 val) -{ - u16 res = 0; - int i; - - /* - * Turn a 32bit word a0b0c0d0...m0n0o0p0 into 16bit value like - * abcd...mnop which is what we really care about. - */ - for (i = 0; i < 16; i++) - res |= ((val >> (i * 2 + 1)) & VGIC_CFG_EDGE) << i; - - return res; -} - -/* - * The distributor uses 2 bits per IRQ for the CFG register, but the - * LSB is always 0. As such, we only keep the upper bit, and use the - * two above functions to compress/expand the bits - */ -bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 val; - - if (offset & 4) - val = *reg >> 16; - else - val = *reg & 0xffff; - - val = vgic_cfg_expand(val); - vgic_reg_access(mmio, &val, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - /* Ignore writes to read-only SGI and PPI bits */ - if (offset < 8) - return false; - - val = vgic_cfg_compress(val); - if (offset & 4) { - *reg &= 0xffff; - *reg |= val << 16; - } else { - *reg &= 0xffff << 16; - *reg |= val; - } - } - - return false; -} - -/** - * vgic_unqueue_irqs - move pending/active IRQs from LRs to the distributor - * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs - * - * Move any IRQs that have already been assigned to LRs back to the - * emulated distributor state so that the complete emulated state can be read - * from the main emulation structures without investigating the LRs. - */ -void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) -{ - u64 elrsr = vgic_get_elrsr(vcpu); - unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); - int i; - - for_each_clear_bit(i, elrsr_ptr, vgic->nr_lr) { - struct vgic_lr lr = vgic_get_lr(vcpu, i); - - /* - * There are three options for the state bits: - * - * 01: pending - * 10: active - * 11: pending and active - */ - BUG_ON(!(lr.state & LR_STATE_MASK)); - - /* Reestablish SGI source for pending and active IRQs */ - if (lr.irq < VGIC_NR_SGIS) - add_sgi_source(vcpu, lr.irq, lr.source); - - /* - * If the LR holds an active (10) or a pending and active (11) - * interrupt then move the active state to the - * distributor tracking bit. - */ - if (lr.state & LR_STATE_ACTIVE) - vgic_irq_set_active(vcpu, lr.irq); - - /* - * Reestablish the pending state on the distributor and the - * CPU interface and mark the LR as free for other use. - */ - vgic_retire_lr(i, vcpu); - - /* Finally update the VGIC state. */ - vgic_update_state(vcpu->kvm); - } -} - -const -struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges, - int len, gpa_t offset) -{ - while (ranges->len) { - if (offset >= ranges->base && - (offset + len) <= (ranges->base + ranges->len)) - return ranges; - ranges++; - } - - return NULL; -} - -static bool vgic_validate_access(const struct vgic_dist *dist, - const struct vgic_io_range *range, - unsigned long offset) -{ - int irq; - - if (!range->bits_per_irq) - return true; /* Not an irq-based access */ - - irq = offset * 8 / range->bits_per_irq; - if (irq >= dist->nr_irqs) - return false; - - return true; -} - -/* - * Call the respective handler function for the given range. - * We split up any 64 bit accesses into two consecutive 32 bit - * handler calls and merge the result afterwards. - * We do this in a little endian fashion regardless of the host's - * or guest's endianness, because the GIC is always LE and the rest of - * the code (vgic_reg_access) also puts it in a LE fashion already. - * At this point we have already identified the handle function, so - * range points to that one entry and offset is relative to this. - */ -static bool call_range_handler(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - unsigned long offset, - const struct vgic_io_range *range) -{ - struct kvm_exit_mmio mmio32; - bool ret; - - if (likely(mmio->len <= 4)) - return range->handle_mmio(vcpu, mmio, offset); - - /* - * Any access bigger than 4 bytes (that we currently handle in KVM) - * is actually 8 bytes long, caused by a 64-bit access - */ - - mmio32.len = 4; - mmio32.is_write = mmio->is_write; - mmio32.private = mmio->private; - - mmio32.phys_addr = mmio->phys_addr + 4; - mmio32.data = &((u32 *)mmio->data)[1]; - ret = range->handle_mmio(vcpu, &mmio32, offset + 4); - - mmio32.phys_addr = mmio->phys_addr; - mmio32.data = &((u32 *)mmio->data)[0]; - ret |= range->handle_mmio(vcpu, &mmio32, offset); - - return ret; -} - -/** - * vgic_handle_mmio_access - handle an in-kernel MMIO access - * This is called by the read/write KVM IO device wrappers below. - * @vcpu: pointer to the vcpu performing the access - * @this: pointer to the KVM IO device in charge - * @addr: guest physical address of the access - * @len: size of the access - * @val: pointer to the data region - * @is_write: read or write access - * - * returns true if the MMIO access could be performed - */ -static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, gpa_t addr, - int len, void *val, bool is_write) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct vgic_io_device *iodev = container_of(this, - struct vgic_io_device, dev); - const struct vgic_io_range *range; - struct kvm_exit_mmio mmio; - bool updated_state; - gpa_t offset; - - offset = addr - iodev->addr; - range = vgic_find_range(iodev->reg_ranges, len, offset); - if (unlikely(!range || !range->handle_mmio)) { - pr_warn("Unhandled access %d %08llx %d\n", is_write, addr, len); - return -ENXIO; - } - - mmio.phys_addr = addr; - mmio.len = len; - mmio.is_write = is_write; - mmio.data = val; - mmio.private = iodev->redist_vcpu; - - spin_lock(&dist->lock); - offset -= range->base; - if (vgic_validate_access(dist, range, offset)) { - updated_state = call_range_handler(vcpu, &mmio, offset, range); - } else { - if (!is_write) - memset(val, 0, len); - updated_state = false; - } - spin_unlock(&dist->lock); - - if (updated_state) - vgic_kick_vcpus(vcpu->kvm); - - return 0; -} - -static int vgic_handle_mmio_read(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, - gpa_t addr, int len, void *val) -{ - return vgic_handle_mmio_access(vcpu, this, addr, len, val, false); -} - -static int vgic_handle_mmio_write(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, - gpa_t addr, int len, const void *val) -{ - return vgic_handle_mmio_access(vcpu, this, addr, len, (void *)val, - true); -} - -static struct kvm_io_device_ops vgic_io_ops = { - .read = vgic_handle_mmio_read, - .write = vgic_handle_mmio_write, -}; - -/** - * vgic_register_kvm_io_dev - register VGIC register frame on the KVM I/O bus - * @kvm: The VM structure pointer - * @base: The (guest) base address for the register frame - * @len: Length of the register frame window - * @ranges: Describing the handler functions for each register - * @redist_vcpu_id: The VCPU ID to pass on to the handlers on call - * @iodev: Points to memory to be passed on to the handler - * - * @iodev stores the parameters of this function to be usable by the handler - * respectively the dispatcher function (since the KVM I/O bus framework lacks - * an opaque parameter). Initialization is done in this function, but the - * reference should be valid and unique for the whole VGIC lifetime. - * If the register frame is not mapped for a specific VCPU, pass -1 to - * @redist_vcpu_id. - */ -int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len, - const struct vgic_io_range *ranges, - int redist_vcpu_id, - struct vgic_io_device *iodev) -{ - struct kvm_vcpu *vcpu = NULL; - int ret; - - if (redist_vcpu_id >= 0) - vcpu = kvm_get_vcpu(kvm, redist_vcpu_id); - - iodev->addr = base; - iodev->len = len; - iodev->reg_ranges = ranges; - iodev->redist_vcpu = vcpu; - - kvm_iodevice_init(&iodev->dev, &vgic_io_ops); - - mutex_lock(&kvm->slots_lock); - - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, base, len, - &iodev->dev); - mutex_unlock(&kvm->slots_lock); - - /* Mark the iodev as invalid if registration fails. */ - if (ret) - iodev->dev.ops = NULL; - - return ret; -} - -static int vgic_nr_shared_irqs(struct vgic_dist *dist) -{ - return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS; -} - -static int compute_active_for_cpu(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - unsigned long *active, *enabled, *act_percpu, *act_shared; - unsigned long active_private, active_shared; - int nr_shared = vgic_nr_shared_irqs(dist); - int vcpu_id; - - vcpu_id = vcpu->vcpu_id; - act_percpu = vcpu->arch.vgic_cpu.active_percpu; - act_shared = vcpu->arch.vgic_cpu.active_shared; - - active = vgic_bitmap_get_cpu_map(&dist->irq_active, vcpu_id); - enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id); - bitmap_and(act_percpu, active, enabled, VGIC_NR_PRIVATE_IRQS); - - active = vgic_bitmap_get_shared_map(&dist->irq_active); - enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled); - bitmap_and(act_shared, active, enabled, nr_shared); - bitmap_and(act_shared, act_shared, - vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]), - nr_shared); - - active_private = find_first_bit(act_percpu, VGIC_NR_PRIVATE_IRQS); - active_shared = find_first_bit(act_shared, nr_shared); - - return (active_private < VGIC_NR_PRIVATE_IRQS || - active_shared < nr_shared); -} - -static int compute_pending_for_cpu(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - unsigned long *pending, *enabled, *pend_percpu, *pend_shared; - unsigned long pending_private, pending_shared; - int nr_shared = vgic_nr_shared_irqs(dist); - int vcpu_id; - - vcpu_id = vcpu->vcpu_id; - pend_percpu = vcpu->arch.vgic_cpu.pending_percpu; - pend_shared = vcpu->arch.vgic_cpu.pending_shared; - - if (!dist->enabled) { - bitmap_zero(pend_percpu, VGIC_NR_PRIVATE_IRQS); - bitmap_zero(pend_shared, nr_shared); - return 0; - } - - pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id); - enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id); - bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS); - - pending = vgic_bitmap_get_shared_map(&dist->irq_pending); - enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled); - bitmap_and(pend_shared, pending, enabled, nr_shared); - bitmap_and(pend_shared, pend_shared, - vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]), - nr_shared); - - pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS); - pending_shared = find_first_bit(pend_shared, nr_shared); - return (pending_private < VGIC_NR_PRIVATE_IRQS || - pending_shared < vgic_nr_shared_irqs(dist)); -} - -/* - * Update the interrupt state and determine which CPUs have pending - * or active interrupts. Must be called with distributor lock held. - */ -void vgic_update_state(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int c; - - kvm_for_each_vcpu(c, vcpu, kvm) { - if (compute_pending_for_cpu(vcpu)) - set_bit(c, dist->irq_pending_on_cpu); - - if (compute_active_for_cpu(vcpu)) - set_bit(c, dist->irq_active_on_cpu); - else - clear_bit(c, dist->irq_active_on_cpu); - } -} - -static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr) -{ - return vgic_ops->get_lr(vcpu, lr); -} - -static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, - struct vgic_lr vlr) -{ - vgic_ops->set_lr(vcpu, lr, vlr); -} - -static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu) -{ - return vgic_ops->get_elrsr(vcpu); -} - -static inline u64 vgic_get_eisr(struct kvm_vcpu *vcpu) -{ - return vgic_ops->get_eisr(vcpu); -} - -static inline void vgic_clear_eisr(struct kvm_vcpu *vcpu) -{ - vgic_ops->clear_eisr(vcpu); -} - -static inline u32 vgic_get_interrupt_status(struct kvm_vcpu *vcpu) -{ - return vgic_ops->get_interrupt_status(vcpu); -} - -static inline void vgic_enable_underflow(struct kvm_vcpu *vcpu) -{ - vgic_ops->enable_underflow(vcpu); -} - -static inline void vgic_disable_underflow(struct kvm_vcpu *vcpu) -{ - vgic_ops->disable_underflow(vcpu); -} - -void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ - vgic_ops->get_vmcr(vcpu, vmcr); -} - -void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ - vgic_ops->set_vmcr(vcpu, vmcr); -} - -static inline void vgic_enable(struct kvm_vcpu *vcpu) -{ - vgic_ops->enable(vcpu); -} - -static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu) -{ - struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr); - - vgic_irq_clear_queued(vcpu, vlr.irq); - - /* - * We must transfer the pending state back to the distributor before - * retiring the LR, otherwise we may loose edge-triggered interrupts. - */ - if (vlr.state & LR_STATE_PENDING) { - vgic_dist_irq_set_pending(vcpu, vlr.irq); - vlr.hwirq = 0; - } - - vlr.state = 0; - vgic_set_lr(vcpu, lr_nr, vlr); -} - -static bool dist_active_irq(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu); -} - -bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq) -{ - int i; - - for (i = 0; i < vgic->nr_lr; i++) { - struct vgic_lr vlr = vgic_get_lr(vcpu, i); - - if (vlr.irq == virt_irq && vlr.state & LR_STATE_ACTIVE) - return true; - } - - return vgic_irq_is_active(vcpu, virt_irq); -} - -/* - * An interrupt may have been disabled after being made pending on the - * CPU interface (the classic case is a timer running while we're - * rebooting the guest - the interrupt would kick as soon as the CPU - * interface gets enabled, with deadly consequences). - * - * The solution is to examine already active LRs, and check the - * interrupt is still enabled. If not, just retire it. - */ -static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) -{ - u64 elrsr = vgic_get_elrsr(vcpu); - unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); - int lr; - - for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) { - struct vgic_lr vlr = vgic_get_lr(vcpu, lr); - - if (!vgic_irq_is_enabled(vcpu, vlr.irq)) - vgic_retire_lr(lr, vcpu); - } -} - -static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq, - int lr_nr, struct vgic_lr vlr) -{ - if (vgic_irq_is_active(vcpu, irq)) { - vlr.state |= LR_STATE_ACTIVE; - kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state); - vgic_irq_clear_active(vcpu, irq); - vgic_update_state(vcpu->kvm); - } else { - WARN_ON(!vgic_dist_irq_is_pending(vcpu, irq)); - vlr.state |= LR_STATE_PENDING; - kvm_debug("Set pending: 0x%x\n", vlr.state); - } - - if (!vgic_irq_is_edge(vcpu, irq)) - vlr.state |= LR_EOI_INT; - - if (vlr.irq >= VGIC_NR_SGIS) { - struct irq_phys_map *map; - map = vgic_irq_map_search(vcpu, irq); - - if (map) { - vlr.hwirq = map->phys_irq; - vlr.state |= LR_HW; - vlr.state &= ~LR_EOI_INT; - - /* - * Make sure we're not going to sample this - * again, as a HW-backed interrupt cannot be - * in the PENDING_ACTIVE stage. - */ - vgic_irq_set_queued(vcpu, irq); - } - } - - vgic_set_lr(vcpu, lr_nr, vlr); -} - -/* - * Queue an interrupt to a CPU virtual interface. Return true on success, - * or false if it wasn't possible to queue it. - * sgi_source must be zero for any non-SGI interrupts. - */ -bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - u64 elrsr = vgic_get_elrsr(vcpu); - unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); - struct vgic_lr vlr; - int lr; - - /* Sanitize the input... */ - BUG_ON(sgi_source_id & ~7); - BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS); - BUG_ON(irq >= dist->nr_irqs); - - kvm_debug("Queue IRQ%d\n", irq); - - /* Do we have an active interrupt for the same CPUID? */ - for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) { - vlr = vgic_get_lr(vcpu, lr); - if (vlr.irq == irq && vlr.source == sgi_source_id) { - kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq); - vgic_queue_irq_to_lr(vcpu, irq, lr, vlr); - return true; - } - } - - /* Try to use another LR for this interrupt */ - lr = find_first_bit(elrsr_ptr, vgic->nr_lr); - if (lr >= vgic->nr_lr) - return false; - - kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id); - - vlr.irq = irq; - vlr.source = sgi_source_id; - vlr.state = 0; - vgic_queue_irq_to_lr(vcpu, irq, lr, vlr); - - return true; -} - -static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq) -{ - if (!vgic_can_sample_irq(vcpu, irq)) - return true; /* level interrupt, already queued */ - - if (vgic_queue_irq(vcpu, 0, irq)) { - if (vgic_irq_is_edge(vcpu, irq)) { - vgic_dist_irq_clear_pending(vcpu, irq); - vgic_cpu_irq_clear(vcpu, irq); - } else { - vgic_irq_set_queued(vcpu, irq); - } - - return true; - } - - return false; -} - -/* - * Fill the list registers with pending interrupts before running the - * guest. - */ -static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - unsigned long *pa_percpu, *pa_shared; - int i, vcpu_id; - int overflow = 0; - int nr_shared = vgic_nr_shared_irqs(dist); - - vcpu_id = vcpu->vcpu_id; - - pa_percpu = vcpu->arch.vgic_cpu.pend_act_percpu; - pa_shared = vcpu->arch.vgic_cpu.pend_act_shared; - - bitmap_or(pa_percpu, vgic_cpu->pending_percpu, vgic_cpu->active_percpu, - VGIC_NR_PRIVATE_IRQS); - bitmap_or(pa_shared, vgic_cpu->pending_shared, vgic_cpu->active_shared, - nr_shared); - /* - * We may not have any pending interrupt, or the interrupts - * may have been serviced from another vcpu. In all cases, - * move along. - */ - if (!kvm_vgic_vcpu_pending_irq(vcpu) && !dist_active_irq(vcpu)) - goto epilog; - - /* SGIs */ - for_each_set_bit(i, pa_percpu, VGIC_NR_SGIS) { - if (!queue_sgi(vcpu, i)) - overflow = 1; - } - - /* PPIs */ - for_each_set_bit_from(i, pa_percpu, VGIC_NR_PRIVATE_IRQS) { - if (!vgic_queue_hwirq(vcpu, i)) - overflow = 1; - } - - /* SPIs */ - for_each_set_bit(i, pa_shared, nr_shared) { - if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS)) - overflow = 1; - } - - - - -epilog: - if (overflow) { - vgic_enable_underflow(vcpu); - } else { - vgic_disable_underflow(vcpu); - /* - * We're about to run this VCPU, and we've consumed - * everything the distributor had in store for - * us. Claim we don't have anything pending. We'll - * adjust that if needed while exiting. - */ - clear_bit(vcpu_id, dist->irq_pending_on_cpu); - } -} - -static int process_queued_irq(struct kvm_vcpu *vcpu, - int lr, struct vgic_lr vlr) -{ - int pending = 0; - - /* - * If the IRQ was EOIed (called from vgic_process_maintenance) or it - * went from active to non-active (called from vgic_sync_hwirq) it was - * also ACKed and we we therefore assume we can clear the soft pending - * state (should it had been set) for this interrupt. - * - * Note: if the IRQ soft pending state was set after the IRQ was - * acked, it actually shouldn't be cleared, but we have no way of - * knowing that unless we start trapping ACKs when the soft-pending - * state is set. - */ - vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq); - - /* - * Tell the gic to start sampling this interrupt again. - */ - vgic_irq_clear_queued(vcpu, vlr.irq); - - /* Any additional pending interrupt? */ - if (vgic_irq_is_edge(vcpu, vlr.irq)) { - BUG_ON(!(vlr.state & LR_HW)); - pending = vgic_dist_irq_is_pending(vcpu, vlr.irq); - } else { - if (vgic_dist_irq_get_level(vcpu, vlr.irq)) { - vgic_cpu_irq_set(vcpu, vlr.irq); - pending = 1; - } else { - vgic_dist_irq_clear_pending(vcpu, vlr.irq); - vgic_cpu_irq_clear(vcpu, vlr.irq); - } - } - - /* - * Despite being EOIed, the LR may not have - * been marked as empty. - */ - vlr.state = 0; - vlr.hwirq = 0; - vgic_set_lr(vcpu, lr, vlr); - - return pending; -} - -static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) -{ - u32 status = vgic_get_interrupt_status(vcpu); - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct kvm *kvm = vcpu->kvm; - int level_pending = 0; - - kvm_debug("STATUS = %08x\n", status); - - if (status & INT_STATUS_EOI) { - /* - * Some level interrupts have been EOIed. Clear their - * active bit. - */ - u64 eisr = vgic_get_eisr(vcpu); - unsigned long *eisr_ptr = u64_to_bitmask(&eisr); - int lr; - - for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) { - struct vgic_lr vlr = vgic_get_lr(vcpu, lr); - - WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq)); - WARN_ON(vlr.state & LR_STATE_MASK); - - - /* - * kvm_notify_acked_irq calls kvm_set_irq() - * to reset the IRQ level, which grabs the dist->lock - * so we call this before taking the dist->lock. - */ - kvm_notify_acked_irq(kvm, 0, - vlr.irq - VGIC_NR_PRIVATE_IRQS); - - spin_lock(&dist->lock); - level_pending |= process_queued_irq(vcpu, lr, vlr); - spin_unlock(&dist->lock); - } - } - - if (status & INT_STATUS_UNDERFLOW) - vgic_disable_underflow(vcpu); - - /* - * In the next iterations of the vcpu loop, if we sync the vgic state - * after flushing it, but before entering the guest (this happens for - * pending signals and vmid rollovers), then make sure we don't pick - * up any old maintenance interrupts here. - */ - vgic_clear_eisr(vcpu); - - return level_pending; -} - -/* - * Save the physical active state, and reset it to inactive. - * - * Return true if there's a pending forwarded interrupt to queue. - */ -static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - bool level_pending; - - if (!(vlr.state & LR_HW)) - return false; - - if (vlr.state & LR_STATE_ACTIVE) - return false; - - spin_lock(&dist->lock); - level_pending = process_queued_irq(vcpu, lr, vlr); - spin_unlock(&dist->lock); - return level_pending; -} - -/* Sync back the VGIC state after a guest run */ -static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - u64 elrsr; - unsigned long *elrsr_ptr; - int lr, pending; - bool level_pending; - - level_pending = vgic_process_maintenance(vcpu); - - /* Deal with HW interrupts, and clear mappings for empty LRs */ - for (lr = 0; lr < vgic->nr_lr; lr++) { - struct vgic_lr vlr = vgic_get_lr(vcpu, lr); - - level_pending |= vgic_sync_hwirq(vcpu, lr, vlr); - BUG_ON(vlr.irq >= dist->nr_irqs); - } - - /* Check if we still have something up our sleeve... */ - elrsr = vgic_get_elrsr(vcpu); - elrsr_ptr = u64_to_bitmask(&elrsr); - pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr); - if (level_pending || pending < vgic->nr_lr) - set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); -} - -void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - if (!irqchip_in_kernel(vcpu->kvm)) - return; - - spin_lock(&dist->lock); - __kvm_vgic_flush_hwstate(vcpu); - spin_unlock(&dist->lock); -} - -void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) -{ - if (!irqchip_in_kernel(vcpu->kvm)) - return; - - __kvm_vgic_sync_hwstate(vcpu); -} - -int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - if (!irqchip_in_kernel(vcpu->kvm)) - return 0; - - return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); -} - -void vgic_kick_vcpus(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int c; - - /* - * We've injected an interrupt, time to find out who deserves - * a good kick... - */ - kvm_for_each_vcpu(c, vcpu, kvm) { - if (kvm_vgic_vcpu_pending_irq(vcpu)) - kvm_vcpu_kick(vcpu); - } -} - -static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level) -{ - int edge_triggered = vgic_irq_is_edge(vcpu, irq); - - /* - * Only inject an interrupt if: - * - edge triggered and we have a rising edge - * - level triggered and we change level - */ - if (edge_triggered) { - int state = vgic_dist_irq_is_pending(vcpu, irq); - return level > state; - } else { - int state = vgic_dist_irq_get_level(vcpu, irq); - return level != state; - } -} - -static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, - unsigned int irq_num, bool level) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int edge_triggered, level_triggered; - int enabled; - bool ret = true, can_inject = true; - - trace_vgic_update_irq_pending(cpuid, irq_num, level); - - if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020)) - return -EINVAL; - - spin_lock(&dist->lock); - - vcpu = kvm_get_vcpu(kvm, cpuid); - edge_triggered = vgic_irq_is_edge(vcpu, irq_num); - level_triggered = !edge_triggered; - - if (!vgic_validate_injection(vcpu, irq_num, level)) { - ret = false; - goto out; - } - - if (irq_num >= VGIC_NR_PRIVATE_IRQS) { - cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS]; - if (cpuid == VCPU_NOT_ALLOCATED) { - /* Pretend we use CPU0, and prevent injection */ - cpuid = 0; - can_inject = false; - } - vcpu = kvm_get_vcpu(kvm, cpuid); - } - - kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid); - - if (level) { - if (level_triggered) - vgic_dist_irq_set_level(vcpu, irq_num); - vgic_dist_irq_set_pending(vcpu, irq_num); - } else { - if (level_triggered) { - vgic_dist_irq_clear_level(vcpu, irq_num); - if (!vgic_dist_irq_soft_pend(vcpu, irq_num)) { - vgic_dist_irq_clear_pending(vcpu, irq_num); - vgic_cpu_irq_clear(vcpu, irq_num); - if (!compute_pending_for_cpu(vcpu)) - clear_bit(cpuid, dist->irq_pending_on_cpu); - } - } - - ret = false; - goto out; - } - - enabled = vgic_irq_is_enabled(vcpu, irq_num); - - if (!enabled || !can_inject) { - ret = false; - goto out; - } - - if (!vgic_can_sample_irq(vcpu, irq_num)) { - /* - * Level interrupt in progress, will be picked up - * when EOId. - */ - ret = false; - goto out; - } - - if (level) { - vgic_cpu_irq_set(vcpu, irq_num); - set_bit(cpuid, dist->irq_pending_on_cpu); - } - -out: - spin_unlock(&dist->lock); - - if (ret) { - /* kick the specified vcpu */ - kvm_vcpu_kick(kvm_get_vcpu(kvm, cpuid)); - } - - return 0; -} - -static int vgic_lazy_init(struct kvm *kvm) -{ - int ret = 0; - - if (unlikely(!vgic_initialized(kvm))) { - /* - * We only provide the automatic initialization of the VGIC - * for the legacy case of a GICv2. Any other type must - * be explicitly initialized once setup with the respective - * KVM device call. - */ - if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) - return -EBUSY; - - mutex_lock(&kvm->lock); - ret = vgic_init(kvm); - mutex_unlock(&kvm->lock); - } - - return ret; -} - -/** - * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic - * @kvm: The VM structure pointer - * @cpuid: The CPU for PPIs - * @irq_num: The IRQ number that is assigned to the device. This IRQ - * must not be mapped to a HW interrupt. - * @level: Edge-triggered: true: to trigger the interrupt - * false: to ignore the call - * Level-sensitive true: raise the input signal - * false: lower the input signal - * - * The GIC is not concerned with devices being active-LOW or active-HIGH for - * level-sensitive interrupts. You can think of the level parameter as 1 - * being HIGH and 0 being LOW and all devices being active-HIGH. - */ -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, - bool level) -{ - struct irq_phys_map *map; - int ret; - - ret = vgic_lazy_init(kvm); - if (ret) - return ret; - - map = vgic_irq_map_search(kvm_get_vcpu(kvm, cpuid), irq_num); - if (map) - return -EINVAL; - - return vgic_update_irq_pending(kvm, cpuid, irq_num, level); -} - -/** - * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic - * @kvm: The VM structure pointer - * @cpuid: The CPU for PPIs - * @virt_irq: The virtual IRQ to be injected - * @level: Edge-triggered: true: to trigger the interrupt - * false: to ignore the call - * Level-sensitive true: raise the input signal - * false: lower the input signal - * - * The GIC is not concerned with devices being active-LOW or active-HIGH for - * level-sensitive interrupts. You can think of the level parameter as 1 - * being HIGH and 0 being LOW and all devices being active-HIGH. - */ -int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, - unsigned int virt_irq, bool level) -{ - int ret; - - ret = vgic_lazy_init(kvm); - if (ret) - return ret; - - return vgic_update_irq_pending(kvm, cpuid, virt_irq, level); -} - -static irqreturn_t vgic_maintenance_handler(int irq, void *data) -{ - /* - * We cannot rely on the vgic maintenance interrupt to be - * delivered synchronously. This means we can only use it to - * exit the VM, and we perform the handling of EOIed - * interrupts on the exit path (see vgic_process_maintenance). - */ - return IRQ_HANDLED; -} - -static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu, - int virt_irq) -{ - if (virt_irq < VGIC_NR_PRIVATE_IRQS) - return &vcpu->arch.vgic_cpu.irq_phys_map_list; - else - return &vcpu->kvm->arch.vgic.irq_phys_map_list; -} - -/** - * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ - * @vcpu: The VCPU pointer - * @virt_irq: The virtual IRQ number for the guest - * @phys_irq: The hardware IRQ number of the host - * - * Establish a mapping between a guest visible irq (@virt_irq) and a - * hardware irq (@phys_irq). On injection, @virt_irq will be associated with - * the physical interrupt represented by @phys_irq. This mapping can be - * established multiple times as long as the parameters are the same. - * - * Returns 0 on success or an error value otherwise. - */ -int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq); - struct irq_phys_map *map; - struct irq_phys_map_entry *entry; - int ret = 0; - - /* Create a new mapping */ - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - spin_lock(&dist->irq_phys_map_lock); - - /* Try to match an existing mapping */ - map = vgic_irq_map_search(vcpu, virt_irq); - if (map) { - /* Make sure this mapping matches */ - if (map->phys_irq != phys_irq) - ret = -EINVAL; - - /* Found an existing, valid mapping */ - goto out; - } - - map = &entry->map; - map->virt_irq = virt_irq; - map->phys_irq = phys_irq; - - list_add_tail_rcu(&entry->entry, root); - -out: - spin_unlock(&dist->irq_phys_map_lock); - /* If we've found a hit in the existing list, free the useless - * entry */ - if (ret || map != &entry->map) - kfree(entry); - return ret; -} - -static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu, - int virt_irq) -{ - struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq); - struct irq_phys_map_entry *entry; - struct irq_phys_map *map; - - rcu_read_lock(); - - list_for_each_entry_rcu(entry, root, entry) { - map = &entry->map; - if (map->virt_irq == virt_irq) { - rcu_read_unlock(); - return map; - } - } - - rcu_read_unlock(); - - return NULL; -} - -static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu) -{ - struct irq_phys_map_entry *entry; - - entry = container_of(rcu, struct irq_phys_map_entry, rcu); - kfree(entry); -} - -/** - * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping - * @vcpu: The VCPU pointer - * @virt_irq: The virtual IRQ number to be unmapped - * - * Remove an existing mapping between virtual and physical interrupts. - */ -int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct irq_phys_map_entry *entry; - struct list_head *root; - - root = vgic_get_irq_phys_map_list(vcpu, virt_irq); - - spin_lock(&dist->irq_phys_map_lock); - - list_for_each_entry(entry, root, entry) { - if (entry->map.virt_irq == virt_irq) { - list_del_rcu(&entry->entry); - call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu); - break; - } - } - - spin_unlock(&dist->irq_phys_map_lock); - - return 0; -} - -static void vgic_destroy_irq_phys_map(struct kvm *kvm, struct list_head *root) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct irq_phys_map_entry *entry; - - spin_lock(&dist->irq_phys_map_lock); - - list_for_each_entry(entry, root, entry) { - list_del_rcu(&entry->entry); - call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu); - } - - spin_unlock(&dist->irq_phys_map_lock); -} - -void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - kfree(vgic_cpu->pending_shared); - kfree(vgic_cpu->active_shared); - kfree(vgic_cpu->pend_act_shared); - vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list); - vgic_cpu->pending_shared = NULL; - vgic_cpu->active_shared = NULL; - vgic_cpu->pend_act_shared = NULL; -} - -static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - int nr_longs = BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS); - int sz = nr_longs * sizeof(unsigned long); - vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL); - vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL); - vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL); - - if (!vgic_cpu->pending_shared - || !vgic_cpu->active_shared - || !vgic_cpu->pend_act_shared) { - kvm_vgic_vcpu_destroy(vcpu); - return -ENOMEM; - } - - return 0; -} - -/** - * kvm_vgic_vcpu_early_init - Earliest possible per-vcpu vgic init stage - * - * No memory allocation should be performed here, only static init. - */ -void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - INIT_LIST_HEAD(&vgic_cpu->irq_phys_map_list); -} - -/** - * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW - * - * The host's GIC naturally limits the maximum amount of VCPUs a guest - * can use. - */ -int kvm_vgic_get_max_vcpus(void) -{ - return vgic->max_gic_vcpus; -} - -void kvm_vgic_destroy(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int i; - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vgic_vcpu_destroy(vcpu); - - vgic_free_bitmap(&dist->irq_enabled); - vgic_free_bitmap(&dist->irq_level); - vgic_free_bitmap(&dist->irq_pending); - vgic_free_bitmap(&dist->irq_soft_pend); - vgic_free_bitmap(&dist->irq_queued); - vgic_free_bitmap(&dist->irq_cfg); - vgic_free_bytemap(&dist->irq_priority); - if (dist->irq_spi_target) { - for (i = 0; i < dist->nr_cpus; i++) - vgic_free_bitmap(&dist->irq_spi_target[i]); - } - kfree(dist->irq_sgi_sources); - kfree(dist->irq_spi_cpu); - kfree(dist->irq_spi_mpidr); - kfree(dist->irq_spi_target); - kfree(dist->irq_pending_on_cpu); - kfree(dist->irq_active_on_cpu); - vgic_destroy_irq_phys_map(kvm, &dist->irq_phys_map_list); - dist->irq_sgi_sources = NULL; - dist->irq_spi_cpu = NULL; - dist->irq_spi_target = NULL; - dist->irq_pending_on_cpu = NULL; - dist->irq_active_on_cpu = NULL; - dist->nr_cpus = 0; -} - -/* - * Allocate and initialize the various data structures. Must be called - * with kvm->lock held! - */ -int vgic_init(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int nr_cpus, nr_irqs; - int ret, i, vcpu_id; - - if (vgic_initialized(kvm)) - return 0; - - nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus); - if (!nr_cpus) /* No vcpus? Can't be good... */ - return -ENODEV; - - /* - * If nobody configured the number of interrupts, use the - * legacy one. - */ - if (!dist->nr_irqs) - dist->nr_irqs = VGIC_NR_IRQS_LEGACY; - - nr_irqs = dist->nr_irqs; - - ret = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs); - ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs); - ret |= vgic_init_bitmap(&dist->irq_pending, nr_cpus, nr_irqs); - ret |= vgic_init_bitmap(&dist->irq_soft_pend, nr_cpus, nr_irqs); - ret |= vgic_init_bitmap(&dist->irq_queued, nr_cpus, nr_irqs); - ret |= vgic_init_bitmap(&dist->irq_active, nr_cpus, nr_irqs); - ret |= vgic_init_bitmap(&dist->irq_cfg, nr_cpus, nr_irqs); - ret |= vgic_init_bytemap(&dist->irq_priority, nr_cpus, nr_irqs); - - if (ret) - goto out; - - dist->irq_sgi_sources = kzalloc(nr_cpus * VGIC_NR_SGIS, GFP_KERNEL); - dist->irq_spi_cpu = kzalloc(nr_irqs - VGIC_NR_PRIVATE_IRQS, GFP_KERNEL); - dist->irq_spi_target = kzalloc(sizeof(*dist->irq_spi_target) * nr_cpus, - GFP_KERNEL); - dist->irq_pending_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long), - GFP_KERNEL); - dist->irq_active_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long), - GFP_KERNEL); - if (!dist->irq_sgi_sources || - !dist->irq_spi_cpu || - !dist->irq_spi_target || - !dist->irq_pending_on_cpu || - !dist->irq_active_on_cpu) { - ret = -ENOMEM; - goto out; - } - - for (i = 0; i < nr_cpus; i++) - ret |= vgic_init_bitmap(&dist->irq_spi_target[i], - nr_cpus, nr_irqs); - - if (ret) - goto out; - - ret = kvm->arch.vgic.vm_ops.init_model(kvm); - if (ret) - goto out; - - kvm_for_each_vcpu(vcpu_id, vcpu, kvm) { - ret = vgic_vcpu_init_maps(vcpu, nr_irqs); - if (ret) { - kvm_err("VGIC: Failed to allocate vcpu memory\n"); - break; - } - - /* - * Enable and configure all SGIs to be edge-triggere and - * configure all PPIs as level-triggered. - */ - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - if (i < VGIC_NR_SGIS) { - /* SGIs */ - vgic_bitmap_set_irq_val(&dist->irq_enabled, - vcpu->vcpu_id, i, 1); - vgic_bitmap_set_irq_val(&dist->irq_cfg, - vcpu->vcpu_id, i, - VGIC_CFG_EDGE); - } else if (i < VGIC_NR_PRIVATE_IRQS) { - /* PPIs */ - vgic_bitmap_set_irq_val(&dist->irq_cfg, - vcpu->vcpu_id, i, - VGIC_CFG_LEVEL); - } - } - - vgic_enable(vcpu); - } - -out: - if (ret) - kvm_vgic_destroy(kvm); - - return ret; -} - -static int init_vgic_model(struct kvm *kvm, int type) -{ - switch (type) { - case KVM_DEV_TYPE_ARM_VGIC_V2: - vgic_v2_init_emulation(kvm); - break; -#ifdef CONFIG_KVM_ARM_VGIC_V3 - case KVM_DEV_TYPE_ARM_VGIC_V3: - vgic_v3_init_emulation(kvm); - break; -#endif - default: - return -ENODEV; - } - - if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) - return -E2BIG; - - return 0; -} - -/** - * kvm_vgic_early_init - Earliest possible vgic initialization stage - * - * No memory allocation should be performed here, only static init. - */ -void kvm_vgic_early_init(struct kvm *kvm) -{ - spin_lock_init(&kvm->arch.vgic.lock); - spin_lock_init(&kvm->arch.vgic.irq_phys_map_lock); - INIT_LIST_HEAD(&kvm->arch.vgic.irq_phys_map_list); -} - -int kvm_vgic_create(struct kvm *kvm, u32 type) -{ - int i, vcpu_lock_idx = -1, ret; - struct kvm_vcpu *vcpu; - - mutex_lock(&kvm->lock); - - if (irqchip_in_kernel(kvm)) { - ret = -EEXIST; - goto out; - } - - /* - * This function is also called by the KVM_CREATE_IRQCHIP handler, - * which had no chance yet to check the availability of the GICv2 - * emulation. So check this here again. KVM_CREATE_DEVICE does - * the proper checks already. - */ - if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && !vgic->can_emulate_gicv2) { - ret = -ENODEV; - goto out; - } - - /* - * Any time a vcpu is run, vcpu_load is called which tries to grab the - * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure - * that no other VCPUs are run while we create the vgic. - */ - ret = -EBUSY; - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!mutex_trylock(&vcpu->mutex)) - goto out_unlock; - vcpu_lock_idx = i; - } - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->arch.has_run_once) - goto out_unlock; - } - ret = 0; - - ret = init_vgic_model(kvm, type); - if (ret) - goto out_unlock; - - kvm->arch.vgic.in_kernel = true; - kvm->arch.vgic.vgic_model = type; - kvm->arch.vgic.vctrl_base = vgic->vctrl_base; - kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; - kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF; - -out_unlock: - for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { - vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); - mutex_unlock(&vcpu->mutex); - } - -out: - mutex_unlock(&kvm->lock); - return ret; -} - -static int vgic_ioaddr_overlap(struct kvm *kvm) -{ - phys_addr_t dist = kvm->arch.vgic.vgic_dist_base; - phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base; - - if (IS_VGIC_ADDR_UNDEF(dist) || IS_VGIC_ADDR_UNDEF(cpu)) - return 0; - if ((dist <= cpu && dist + KVM_VGIC_V2_DIST_SIZE > cpu) || - (cpu <= dist && cpu + KVM_VGIC_V2_CPU_SIZE > dist)) - return -EBUSY; - return 0; -} - -static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t size) -{ - int ret; - - if (addr & ~KVM_PHYS_MASK) - return -E2BIG; - - if (addr & (SZ_4K - 1)) - return -EINVAL; - - if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) - return -EEXIST; - if (addr + size < addr) - return -EINVAL; - - *ioaddr = addr; - ret = vgic_ioaddr_overlap(kvm); - if (ret) - *ioaddr = VGIC_ADDR_UNDEF; - - return ret; -} - -/** - * kvm_vgic_addr - set or get vgic VM base addresses - * @kvm: pointer to the vm struct - * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX - * @addr: pointer to address value - * @write: if true set the address in the VM address space, if false read the - * address - * - * Set or get the vgic base addresses for the distributor and the virtual CPU - * interface in the VM physical address space. These addresses are properties - * of the emulated core/SoC and therefore user space initially knows this - * information. - */ -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) -{ - int r = 0; - struct vgic_dist *vgic = &kvm->arch.vgic; - int type_needed; - phys_addr_t *addr_ptr, block_size; - phys_addr_t alignment; - - mutex_lock(&kvm->lock); - switch (type) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - type_needed = KVM_DEV_TYPE_ARM_VGIC_V2; - addr_ptr = &vgic->vgic_dist_base; - block_size = KVM_VGIC_V2_DIST_SIZE; - alignment = SZ_4K; - break; - case KVM_VGIC_V2_ADDR_TYPE_CPU: - type_needed = KVM_DEV_TYPE_ARM_VGIC_V2; - addr_ptr = &vgic->vgic_cpu_base; - block_size = KVM_VGIC_V2_CPU_SIZE; - alignment = SZ_4K; - break; -#ifdef CONFIG_KVM_ARM_VGIC_V3 - case KVM_VGIC_V3_ADDR_TYPE_DIST: - type_needed = KVM_DEV_TYPE_ARM_VGIC_V3; - addr_ptr = &vgic->vgic_dist_base; - block_size = KVM_VGIC_V3_DIST_SIZE; - alignment = SZ_64K; - break; - case KVM_VGIC_V3_ADDR_TYPE_REDIST: - type_needed = KVM_DEV_TYPE_ARM_VGIC_V3; - addr_ptr = &vgic->vgic_redist_base; - block_size = KVM_VGIC_V3_REDIST_SIZE; - alignment = SZ_64K; - break; -#endif - default: - r = -ENODEV; - goto out; - } - - if (vgic->vgic_model != type_needed) { - r = -ENODEV; - goto out; - } - - if (write) { - if (!IS_ALIGNED(*addr, alignment)) - r = -EINVAL; - else - r = vgic_ioaddr_assign(kvm, addr_ptr, *addr, - block_size); - } else { - *addr = *addr_ptr; - } - -out: - mutex_unlock(&kvm->lock); - return r; -} - -int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr) -{ - int r; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - if (copy_from_user(&addr, uaddr, sizeof(addr))) - return -EFAULT; - - r = kvm_vgic_addr(dev->kvm, type, &addr, true); - return (r == -ENODEV) ? -ENXIO : r; - } - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 val; - int ret = 0; - - if (get_user(val, uaddr)) - return -EFAULT; - - /* - * We require: - * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs - * - at most 1024 interrupts - * - a multiple of 32 interrupts - */ - if (val < (VGIC_NR_PRIVATE_IRQS + 32) || - val > VGIC_MAX_IRQS || - (val & 31)) - return -EINVAL; - - mutex_lock(&dev->kvm->lock); - - if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_irqs) - ret = -EBUSY; - else - dev->kvm->arch.vgic.nr_irqs = val; - - mutex_unlock(&dev->kvm->lock); - - return ret; - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: { - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - r = vgic_init(dev->kvm); - return r; - } - break; - } - } - - return -ENXIO; -} - -int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr) -{ - int r = -ENXIO; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - r = kvm_vgic_addr(dev->kvm, type, &addr, false); - if (r) - return (r == -ENODEV) ? -ENXIO : r; - - if (copy_to_user(uaddr, &addr, sizeof(addr))) - return -EFAULT; - break; - } - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - - r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr); - break; - } - - } - - return r; -} - -int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset) -{ - if (vgic_find_range(ranges, 4, offset)) - return 0; - else - return -ENXIO; -} - -static int vgic_starting_cpu(unsigned int cpu) -{ - enable_percpu_irq(vgic->maint_irq, 0); - return 0; -} - -static int vgic_dying_cpu(unsigned int cpu) -{ - disable_percpu_irq(vgic->maint_irq); - return 0; -} - -static int kvm_vgic_probe(void) -{ - const struct gic_kvm_info *gic_kvm_info; - int ret; - - gic_kvm_info = gic_get_kvm_info(); - if (!gic_kvm_info) - return -ENODEV; - - switch (gic_kvm_info->type) { - case GIC_V2: - ret = vgic_v2_probe(gic_kvm_info, &vgic_ops, &vgic); - break; - case GIC_V3: - ret = vgic_v3_probe(gic_kvm_info, &vgic_ops, &vgic); - break; - default: - ret = -ENODEV; - } - - return ret; -} - -int kvm_vgic_hyp_init(void) -{ - int ret; - - ret = kvm_vgic_probe(); - if (ret) { - kvm_err("error: KVM vGIC probing failed\n"); - return ret; - } - - ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler, - "vgic", kvm_get_running_vcpus()); - if (ret) { - kvm_err("Cannot register interrupt %d\n", vgic->maint_irq); - return ret; - } - - cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_STARTING, - "AP_KVM_ARM_VGIC_STARTING", vgic_starting_cpu, - vgic_dying_cpu); - return 0; -} - -int kvm_irq_map_gsi(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *entries, - int gsi) -{ - return 0; -} - -int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) -{ - return pin; -} - -int kvm_set_irq(struct kvm *kvm, int irq_source_id, - u32 irq, int level, bool line_status) -{ - unsigned int spi = irq + VGIC_NR_PRIVATE_IRQS; - - trace_kvm_set_irq(irq, level, irq_source_id); - - BUG_ON(!vgic_initialized(kvm)); - - return kvm_vgic_inject_irq(kvm, 0, spi, level); -} - -/* MSI not implemented yet */ -int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, - int level, bool line_status) -{ - return 0; -} diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h deleted file mode 100644 index 0df74cbb6200..000000000000 --- a/virt/kvm/arm/vgic.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (C) 2012-2014 ARM Ltd. - * Author: Marc Zyngier - * - * Derived from virt/kvm/arm/vgic.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#ifndef __KVM_VGIC_H__ -#define __KVM_VGIC_H__ - -#include - -#define VGIC_ADDR_UNDEF (-1) -#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) - -#define PRODUCT_ID_KVM 0x4b /* ASCII code K */ -#define IMPLEMENTER_ARM 0x43b - -#define ACCESS_READ_VALUE (1 << 0) -#define ACCESS_READ_RAZ (0 << 0) -#define ACCESS_READ_MASK(x) ((x) & (1 << 0)) -#define ACCESS_WRITE_IGNORED (0 << 1) -#define ACCESS_WRITE_SETBIT (1 << 1) -#define ACCESS_WRITE_CLEARBIT (2 << 1) -#define ACCESS_WRITE_VALUE (3 << 1) -#define ACCESS_WRITE_MASK(x) ((x) & (3 << 1)) - -#define VCPU_NOT_ALLOCATED ((u8)-1) - -unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x); - -void vgic_update_state(struct kvm *kvm); -int vgic_init_common_maps(struct kvm *kvm); - -u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset); -u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset); - -void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq); -void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq); -void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq); -void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, - int irq, int val); - -void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); - -bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq); -void vgic_unqueue_irqs(struct kvm_vcpu *vcpu); - -struct kvm_exit_mmio { - phys_addr_t phys_addr; - void *data; - u32 len; - bool is_write; - void *private; -}; - -void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg, - phys_addr_t offset, int mode); -bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, - phys_addr_t offset); - -static inline -u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask) -{ - return le32_to_cpu(*((u32 *)mmio->data)) & mask; -} - -static inline -void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value) -{ - *((u32 *)mmio->data) = cpu_to_le32(value) & mask; -} - -struct vgic_io_range { - phys_addr_t base; - unsigned long len; - int bits_per_irq; - bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, - phys_addr_t offset); -}; - -int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len, - const struct vgic_io_range *ranges, - int redist_id, - struct vgic_io_device *iodev); - -static inline bool is_in_range(phys_addr_t addr, unsigned long len, - phys_addr_t baseaddr, unsigned long size) -{ - return (addr >= baseaddr) && (addr + len <= baseaddr + size); -} - -const -struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges, - int len, gpa_t offset); - -bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id, int access); - -bool vgic_handle_set_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id); - -bool vgic_handle_clear_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id); - -bool vgic_handle_set_active_reg(struct kvm *kvm, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id); - -bool vgic_handle_clear_active_reg(struct kvm *kvm, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id); - -bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio, - phys_addr_t offset); - -void vgic_kick_vcpus(struct kvm *kvm); - -int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset); -int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr); -int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr); - -int vgic_init(struct kvm *kvm); -void vgic_v2_init_emulation(struct kvm *kvm); -void vgic_v3_init_emulation(struct kvm *kvm); - -#endif diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index 2c7f0d5a62ea..1e30ce08700d 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -157,6 +157,9 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); int i; + INIT_LIST_HEAD(&dist->lpi_list_head); + spin_lock_init(&dist->lpi_list_lock); + dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); if (!dist->spis) return -ENOMEM; @@ -177,6 +180,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) spin_lock_init(&irq->irq_lock); irq->vcpu = NULL; irq->target_vcpu = vcpu0; + kref_init(&irq->refcount); if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) irq->targets = 0; else @@ -211,6 +215,7 @@ static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) irq->vcpu = NULL; irq->target_vcpu = vcpu; irq->targets = 1U << vcpu->vcpu_id; + kref_init(&irq->refcount); if (vgic_irq_is_sgi(i)) { /* SGIs */ irq->enabled = 1; @@ -253,6 +258,9 @@ int vgic_init(struct kvm *kvm) if (ret) goto out; + if (vgic_has_its(kvm)) + dist->msis_require_devid = true; + kvm_for_each_vcpu(i, vcpu, kvm) kvm_vgic_vcpu_init(vcpu); @@ -271,7 +279,6 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm) dist->initialized = false; kfree(dist->spis); - kfree(dist->redist_iodevs); dist->nr_spis = 0; mutex_unlock(&kvm->lock); diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c new file mode 100644 index 000000000000..07411cf967b9 --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -0,0 +1,1500 @@ +/* + * GICv3 ITS emulation + * + * Copyright (C) 2015,2016 ARM Ltd. + * Author: Andre Przywara + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "vgic.h" +#include "vgic-mmio.h" + +/* + * Creates a new (reference to a) struct vgic_irq for a given LPI. + * If this LPI is already mapped on another ITS, we increase its refcount + * and return a pointer to the existing structure. + * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq. + * This function returns a pointer to the _unlocked_ structure. + */ +static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq; + + /* In this case there is no put, since we keep the reference. */ + if (irq) + return irq; + + irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL); + if (!irq) + return NULL; + + INIT_LIST_HEAD(&irq->lpi_list); + INIT_LIST_HEAD(&irq->ap_list); + spin_lock_init(&irq->irq_lock); + + irq->config = VGIC_CONFIG_EDGE; + kref_init(&irq->refcount); + irq->intid = intid; + + spin_lock(&dist->lpi_list_lock); + + /* + * There could be a race with another vgic_add_lpi(), so we need to + * check that we don't add a second list entry with the same LPI. + */ + list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) { + if (oldirq->intid != intid) + continue; + + /* Someone was faster with adding this LPI, lets use that. */ + kfree(irq); + irq = oldirq; + + /* + * This increases the refcount, the caller is expected to + * call vgic_put_irq() on the returned pointer once it's + * finished with the IRQ. + */ + vgic_get_irq_kref(irq); + + goto out_unlock; + } + + list_add_tail(&irq->lpi_list, &dist->lpi_list_head); + dist->lpi_list_count++; + +out_unlock: + spin_unlock(&dist->lpi_list_lock); + + return irq; +} + +struct its_device { + struct list_head dev_list; + + /* the head for the list of ITTEs */ + struct list_head itt_head; + u32 device_id; +}; + +#define COLLECTION_NOT_MAPPED ((u32)~0) + +struct its_collection { + struct list_head coll_list; + + u32 collection_id; + u32 target_addr; +}; + +#define its_is_collection_mapped(coll) ((coll) && \ + ((coll)->target_addr != COLLECTION_NOT_MAPPED)) + +struct its_itte { + struct list_head itte_list; + + struct vgic_irq *irq; + struct its_collection *collection; + u32 lpi; + u32 event_id; +}; + +/* + * Find and returns a device in the device table for an ITS. + * Must be called with the its_lock mutex held. + */ +static struct its_device *find_its_device(struct vgic_its *its, u32 device_id) +{ + struct its_device *device; + + list_for_each_entry(device, &its->device_list, dev_list) + if (device_id == device->device_id) + return device; + + return NULL; +} + +/* + * Find and returns an interrupt translation table entry (ITTE) for a given + * Device ID/Event ID pair on an ITS. + * Must be called with the its_lock mutex held. + */ +static struct its_itte *find_itte(struct vgic_its *its, u32 device_id, + u32 event_id) +{ + struct its_device *device; + struct its_itte *itte; + + device = find_its_device(its, device_id); + if (device == NULL) + return NULL; + + list_for_each_entry(itte, &device->itt_head, itte_list) + if (itte->event_id == event_id) + return itte; + + return NULL; +} + +/* To be used as an iterator this macro misses the enclosing parentheses */ +#define for_each_lpi_its(dev, itte, its) \ + list_for_each_entry(dev, &(its)->device_list, dev_list) \ + list_for_each_entry(itte, &(dev)->itt_head, itte_list) + +/* + * We only implement 48 bits of PA at the moment, although the ITS + * supports more. Let's be restrictive here. + */ +#define BASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) +#define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) +#define PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) +#define PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) + +#define GIC_LPI_OFFSET 8192 + +/* + * Finds and returns a collection in the ITS collection table. + * Must be called with the its_lock mutex held. + */ +static struct its_collection *find_collection(struct vgic_its *its, int coll_id) +{ + struct its_collection *collection; + + list_for_each_entry(collection, &its->collection_list, coll_list) { + if (coll_id == collection->collection_id) + return collection; + } + + return NULL; +} + +#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED) +#define LPI_PROP_PRIORITY(p) ((p) & 0xfc) + +/* + * Reads the configuration data for a given LPI from guest memory and + * updates the fields in struct vgic_irq. + * If filter_vcpu is not NULL, applies only if the IRQ is targeting this + * VCPU. Unconditionally applies if filter_vcpu is NULL. + */ +static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, + struct kvm_vcpu *filter_vcpu) +{ + u64 propbase = PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); + u8 prop; + int ret; + + ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET, + &prop, 1); + + if (ret) + return ret; + + spin_lock(&irq->irq_lock); + + if (!filter_vcpu || filter_vcpu == irq->target_vcpu) { + irq->priority = LPI_PROP_PRIORITY(prop); + irq->enabled = LPI_PROP_ENABLE_BIT(prop); + + vgic_queue_irq_unlock(kvm, irq); + } else { + spin_unlock(&irq->irq_lock); + } + + return 0; +} + +/* + * Create a snapshot of the current LPI list, so that we can enumerate all + * LPIs without holding any lock. + * Returns the array length and puts the kmalloc'ed array into intid_ptr. + */ +static int vgic_copy_lpi_list(struct kvm *kvm, u32 **intid_ptr) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + u32 *intids; + int irq_count = dist->lpi_list_count, i = 0; + + /* + * We use the current value of the list length, which may change + * after the kmalloc. We don't care, because the guest shouldn't + * change anything while the command handling is still running, + * and in the worst case we would miss a new IRQ, which one wouldn't + * expect to be covered by this command anyway. + */ + intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL); + if (!intids) + return -ENOMEM; + + spin_lock(&dist->lpi_list_lock); + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + /* We don't need to "get" the IRQ, as we hold the list lock. */ + intids[i] = irq->intid; + if (++i == irq_count) + break; + } + spin_unlock(&dist->lpi_list_lock); + + *intid_ptr = intids; + return irq_count; +} + +/* + * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI + * is targeting) to the VGIC's view, which deals with target VCPUs. + * Needs to be called whenever either the collection for a LPIs has + * changed or the collection itself got retargeted. + */ +static void update_affinity_itte(struct kvm *kvm, struct its_itte *itte) +{ + struct kvm_vcpu *vcpu; + + if (!its_is_collection_mapped(itte->collection)) + return; + + vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr); + + spin_lock(&itte->irq->irq_lock); + itte->irq->target_vcpu = vcpu; + spin_unlock(&itte->irq->irq_lock); +} + +/* + * Updates the target VCPU for every LPI targeting this collection. + * Must be called with the its_lock mutex held. + */ +static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its, + struct its_collection *coll) +{ + struct its_device *device; + struct its_itte *itte; + + for_each_lpi_its(device, itte, its) { + if (!itte->collection || coll != itte->collection) + continue; + + update_affinity_itte(kvm, itte); + } +} + +static u32 max_lpis_propbaser(u64 propbaser) +{ + int nr_idbits = (propbaser & 0x1f) + 1; + + return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS); +} + +/* + * Scan the whole LPI pending table and sync the pending bit in there + * with our own data structures. This relies on the LPI being + * mapped before. + */ +static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) +{ + gpa_t pendbase = PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + struct vgic_irq *irq; + int last_byte_offset = -1; + int ret = 0; + u32 *intids; + int nr_irqs, i; + + nr_irqs = vgic_copy_lpi_list(vcpu->kvm, &intids); + if (nr_irqs < 0) + return nr_irqs; + + for (i = 0; i < nr_irqs; i++) { + int byte_offset, bit_nr; + u8 pendmask; + + byte_offset = intids[i] / BITS_PER_BYTE; + bit_nr = intids[i] % BITS_PER_BYTE; + + /* + * For contiguously allocated LPIs chances are we just read + * this very same byte in the last iteration. Reuse that. + */ + if (byte_offset != last_byte_offset) { + ret = kvm_read_guest(vcpu->kvm, pendbase + byte_offset, + &pendmask, 1); + if (ret) { + kfree(intids); + return ret; + } + last_byte_offset = byte_offset; + } + + irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); + spin_lock(&irq->irq_lock); + irq->pending = pendmask & (1U << bit_nr); + vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_put_irq(vcpu->kvm, irq); + } + + kfree(intids); + + return ret; +} + +static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u32 reg = 0; + + mutex_lock(&its->cmd_lock); + if (its->creadr == its->cwriter) + reg |= GITS_CTLR_QUIESCENT; + if (its->enabled) + reg |= GITS_CTLR_ENABLE; + mutex_unlock(&its->cmd_lock); + + return reg; +} + +static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + its->enabled = !!(val & GITS_CTLR_ENABLE); +} + +static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u64 reg = GITS_TYPER_PLPIS; + + /* + * We use linear CPU numbers for redistributor addressing, + * so GITS_TYPER.PTA is 0. + * Also we force all PROPBASER registers to be the same, so + * CommonLPIAff is 0 as well. + * To avoid memory waste in the guest, we keep the number of IDBits and + * DevBits low - as least for the time being. + */ + reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT; + reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT; + + return extract_bytes(reg, addr & 7, len); +} + +static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); +} + +static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + switch (addr & 0xffff) { + case GITS_PIDR0: + return 0x92; /* part number, bits[7:0] */ + case GITS_PIDR1: + return 0xb4; /* part number, bits[11:8] */ + case GITS_PIDR2: + return GIC_PIDR2_ARCH_GICv3 | 0x0b; + case GITS_PIDR4: + return 0x40; /* This is a 64K software visible page */ + /* The following are the ID registers for (any) GIC. */ + case GITS_CIDR0: + return 0x0d; + case GITS_CIDR1: + return 0xf0; + case GITS_CIDR2: + return 0x05; + case GITS_CIDR3: + return 0xb1; + } + + return 0; +} + +/* + * Find the target VCPU and the LPI number for a given devid/eventid pair + * and make this IRQ pending, possibly injecting it. + * Must be called with the its_lock mutex held. + */ +static void vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid) +{ + struct its_itte *itte; + + if (!its->enabled) + return; + + itte = find_itte(its, devid, eventid); + /* Triggering an unmapped IRQ gets silently dropped. */ + if (itte && its_is_collection_mapped(itte->collection)) { + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr); + if (vcpu && vcpu->arch.vgic_cpu.lpis_enabled) { + spin_lock(&itte->irq->irq_lock); + itte->irq->pending = true; + vgic_queue_irq_unlock(kvm, itte->irq); + } + } +} + +/* + * Queries the KVM IO bus framework to get the ITS pointer from the given + * doorbell address. + * We then call vgic_its_trigger_msi() with the decoded data. + */ +int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + u64 address; + struct kvm_io_device *kvm_io_dev; + struct vgic_io_device *iodev; + + if (!vgic_has_its(kvm)) + return -ENODEV; + + if (!(msi->flags & KVM_MSI_VALID_DEVID)) + return -EINVAL; + + address = (u64)msi->address_hi << 32 | msi->address_lo; + + kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address); + if (!kvm_io_dev) + return -ENODEV; + + iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); + + mutex_lock(&iodev->its->its_lock); + vgic_its_trigger_msi(kvm, iodev->its, msi->devid, msi->data); + mutex_unlock(&iodev->its->its_lock); + + return 0; +} + +/* Requires the its_lock to be held. */ +static void its_free_itte(struct kvm *kvm, struct its_itte *itte) +{ + list_del(&itte->itte_list); + + /* This put matches the get in vgic_add_lpi. */ + vgic_put_irq(kvm, itte->irq); + + kfree(itte); +} + +static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size) +{ + return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1); +} + +#define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8) +#define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32) +#define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32) +#define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32) +#define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16) +#define its_cmd_get_target_addr(cmd) its_cmd_mask_field(cmd, 2, 16, 32) +#define its_cmd_get_validbit(cmd) its_cmd_mask_field(cmd, 2, 63, 1) + +/* + * The DISCARD command frees an Interrupt Translation Table Entry (ITTE). + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_itte *itte; + + + itte = find_itte(its, device_id, event_id); + if (itte && itte->collection) { + /* + * Though the spec talks about removing the pending state, we + * don't bother here since we clear the ITTE anyway and the + * pending state is a property of the ITTE struct. + */ + its_free_itte(kvm, itte); + return 0; + } + + return E_ITS_DISCARD_UNMAPPED_INTERRUPT; +} + +/* + * The MOVI command moves an ITTE to a different collection. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + u32 coll_id = its_cmd_get_collection(its_cmd); + struct kvm_vcpu *vcpu; + struct its_itte *itte; + struct its_collection *collection; + + itte = find_itte(its, device_id, event_id); + if (!itte) + return E_ITS_MOVI_UNMAPPED_INTERRUPT; + + if (!its_is_collection_mapped(itte->collection)) + return E_ITS_MOVI_UNMAPPED_COLLECTION; + + collection = find_collection(its, coll_id); + if (!its_is_collection_mapped(collection)) + return E_ITS_MOVI_UNMAPPED_COLLECTION; + + itte->collection = collection; + vcpu = kvm_get_vcpu(kvm, collection->target_addr); + + spin_lock(&itte->irq->irq_lock); + itte->irq->target_vcpu = vcpu; + spin_unlock(&itte->irq->irq_lock); + + return 0; +} + +/* + * Check whether an ID can be stored into the corresponding guest table. + * For a direct table this is pretty easy, but gets a bit nasty for + * indirect tables. We check whether the resulting guest physical address + * is actually valid (covered by a memslot and guest accessbible). + * For this we have to read the respective first level entry. + */ +static bool vgic_its_check_id(struct vgic_its *its, u64 baser, int id) +{ + int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + int index; + u64 indirect_ptr; + gfn_t gfn; + + if (!(baser & GITS_BASER_INDIRECT)) { + phys_addr_t addr; + + if (id >= (l1_tbl_size / GITS_BASER_ENTRY_SIZE(baser))) + return false; + + addr = BASER_ADDRESS(baser) + id * GITS_BASER_ENTRY_SIZE(baser); + gfn = addr >> PAGE_SHIFT; + + return kvm_is_visible_gfn(its->dev->kvm, gfn); + } + + /* calculate and check the index into the 1st level */ + index = id / (SZ_64K / GITS_BASER_ENTRY_SIZE(baser)); + if (index >= (l1_tbl_size / sizeof(u64))) + return false; + + /* Each 1st level entry is represented by a 64-bit value. */ + if (kvm_read_guest(its->dev->kvm, + BASER_ADDRESS(baser) + index * sizeof(indirect_ptr), + &indirect_ptr, sizeof(indirect_ptr))) + return false; + + indirect_ptr = le64_to_cpu(indirect_ptr); + + /* check the valid bit of the first level entry */ + if (!(indirect_ptr & BIT_ULL(63))) + return false; + + /* + * Mask the guest physical address and calculate the frame number. + * Any address beyond our supported 48 bits of PA will be caught + * by the actual check in the final step. + */ + indirect_ptr &= GENMASK_ULL(51, 16); + + /* Find the address of the actual entry */ + index = id % (SZ_64K / GITS_BASER_ENTRY_SIZE(baser)); + indirect_ptr += index * GITS_BASER_ENTRY_SIZE(baser); + gfn = indirect_ptr >> PAGE_SHIFT; + + return kvm_is_visible_gfn(its->dev->kvm, gfn); +} + +static int vgic_its_alloc_collection(struct vgic_its *its, + struct its_collection **colp, + u32 coll_id) +{ + struct its_collection *collection; + + if (!vgic_its_check_id(its, its->baser_coll_table, coll_id)) + return E_ITS_MAPC_COLLECTION_OOR; + + collection = kzalloc(sizeof(*collection), GFP_KERNEL); + + collection->collection_id = coll_id; + collection->target_addr = COLLECTION_NOT_MAPPED; + + list_add_tail(&collection->coll_list, &its->collection_list); + *colp = collection; + + return 0; +} + +static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id) +{ + struct its_collection *collection; + struct its_device *device; + struct its_itte *itte; + + /* + * Clearing the mapping for that collection ID removes the + * entry from the list. If there wasn't any before, we can + * go home early. + */ + collection = find_collection(its, coll_id); + if (!collection) + return; + + for_each_lpi_its(device, itte, its) + if (itte->collection && + itte->collection->collection_id == coll_id) + itte->collection = NULL; + + list_del(&collection->coll_list); + kfree(collection); +} + +/* + * The MAPTI and MAPI commands map LPIs to ITTEs. + * Must be called with its_lock mutex held. + */ +static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + u32 coll_id = its_cmd_get_collection(its_cmd); + struct its_itte *itte; + struct its_device *device; + struct its_collection *collection, *new_coll = NULL; + int lpi_nr; + + device = find_its_device(its, device_id); + if (!device) + return E_ITS_MAPTI_UNMAPPED_DEVICE; + + if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) + lpi_nr = its_cmd_get_physical_id(its_cmd); + else + lpi_nr = event_id; + if (lpi_nr < GIC_LPI_OFFSET || + lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) + return E_ITS_MAPTI_PHYSICALID_OOR; + + collection = find_collection(its, coll_id); + if (!collection) { + int ret = vgic_its_alloc_collection(its, &collection, coll_id); + if (ret) + return ret; + new_coll = collection; + } + + itte = find_itte(its, device_id, event_id); + if (!itte) { + itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL); + if (!itte) { + if (new_coll) + vgic_its_free_collection(its, coll_id); + return -ENOMEM; + } + + itte->event_id = event_id; + list_add_tail(&itte->itte_list, &device->itt_head); + } + + itte->collection = collection; + itte->lpi = lpi_nr; + itte->irq = vgic_add_lpi(kvm, lpi_nr); + update_affinity_itte(kvm, itte); + + /* + * We "cache" the configuration table entries in out struct vgic_irq's. + * However we only have those structs for mapped IRQs, so we read in + * the respective config data from memory here upon mapping the LPI. + */ + update_lpi_config(kvm, itte->irq, NULL); + + return 0; +} + +/* Requires the its_lock to be held. */ +static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device) +{ + struct its_itte *itte, *temp; + + /* + * The spec says that unmapping a device with still valid + * ITTEs associated is UNPREDICTABLE. We remove all ITTEs, + * since we cannot leave the memory unreferenced. + */ + list_for_each_entry_safe(itte, temp, &device->itt_head, itte_list) + its_free_itte(kvm, itte); + + list_del(&device->dev_list); + kfree(device); +} + +/* + * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs). + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + bool valid = its_cmd_get_validbit(its_cmd); + struct its_device *device; + + if (!vgic_its_check_id(its, its->baser_device_table, device_id)) + return E_ITS_MAPD_DEVICE_OOR; + + device = find_its_device(its, device_id); + + /* + * The spec says that calling MAPD on an already mapped device + * invalidates all cached data for this device. We implement this + * by removing the mapping and re-establishing it. + */ + if (device) + vgic_its_unmap_device(kvm, device); + + /* + * The spec does not say whether unmapping a not-mapped device + * is an error, so we are done in any case. + */ + if (!valid) + return 0; + + device = kzalloc(sizeof(struct its_device), GFP_KERNEL); + if (!device) + return -ENOMEM; + + device->device_id = device_id; + INIT_LIST_HEAD(&device->itt_head); + + list_add_tail(&device->dev_list, &its->device_list); + + return 0; +} + +/* + * The MAPC command maps collection IDs to redistributors. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u16 coll_id; + u32 target_addr; + struct its_collection *collection; + bool valid; + + valid = its_cmd_get_validbit(its_cmd); + coll_id = its_cmd_get_collection(its_cmd); + target_addr = its_cmd_get_target_addr(its_cmd); + + if (target_addr >= atomic_read(&kvm->online_vcpus)) + return E_ITS_MAPC_PROCNUM_OOR; + + if (!valid) { + vgic_its_free_collection(its, coll_id); + } else { + collection = find_collection(its, coll_id); + + if (!collection) { + int ret; + + ret = vgic_its_alloc_collection(its, &collection, + coll_id); + if (ret) + return ret; + collection->target_addr = target_addr; + } else { + collection->target_addr = target_addr; + update_affinity_collection(kvm, its, collection); + } + } + + return 0; +} + +/* + * The CLEAR command removes the pending state for a particular LPI. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_itte *itte; + + + itte = find_itte(its, device_id, event_id); + if (!itte) + return E_ITS_CLEAR_UNMAPPED_INTERRUPT; + + itte->irq->pending = false; + + return 0; +} + +/* + * The INV command syncs the configuration bits from the memory table. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_itte *itte; + + + itte = find_itte(its, device_id, event_id); + if (!itte) + return E_ITS_INV_UNMAPPED_INTERRUPT; + + return update_lpi_config(kvm, itte->irq, NULL); +} + +/* + * The INVALL command requests flushing of all IRQ data in this collection. + * Find the VCPU mapped to that collection, then iterate over the VM's list + * of mapped LPIs and update the configuration for each IRQ which targets + * the specified vcpu. The configuration will be read from the in-memory + * configuration table. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 coll_id = its_cmd_get_collection(its_cmd); + struct its_collection *collection; + struct kvm_vcpu *vcpu; + struct vgic_irq *irq; + u32 *intids; + int irq_count, i; + + collection = find_collection(its, coll_id); + if (!its_is_collection_mapped(collection)) + return E_ITS_INVALL_UNMAPPED_COLLECTION; + + vcpu = kvm_get_vcpu(kvm, collection->target_addr); + + irq_count = vgic_copy_lpi_list(kvm, &intids); + if (irq_count < 0) + return irq_count; + + for (i = 0; i < irq_count; i++) { + irq = vgic_get_irq(kvm, NULL, intids[i]); + if (!irq) + continue; + update_lpi_config(kvm, irq, vcpu); + vgic_put_irq(kvm, irq); + } + + kfree(intids); + + return 0; +} + +/* + * The MOVALL command moves the pending state of all IRQs targeting one + * redistributor to another. We don't hold the pending state in the VCPUs, + * but in the IRQs instead, so there is really not much to do for us here. + * However the spec says that no IRQ must target the old redistributor + * afterwards, so we make sure that no LPI is using the associated target_vcpu. + * This command affects all LPIs in the system that target that redistributor. + */ +static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + u32 target1_addr = its_cmd_get_target_addr(its_cmd); + u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32); + struct kvm_vcpu *vcpu1, *vcpu2; + struct vgic_irq *irq; + + if (target1_addr >= atomic_read(&kvm->online_vcpus) || + target2_addr >= atomic_read(&kvm->online_vcpus)) + return E_ITS_MOVALL_PROCNUM_OOR; + + if (target1_addr == target2_addr) + return 0; + + vcpu1 = kvm_get_vcpu(kvm, target1_addr); + vcpu2 = kvm_get_vcpu(kvm, target2_addr); + + spin_lock(&dist->lpi_list_lock); + + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + spin_lock(&irq->irq_lock); + + if (irq->target_vcpu == vcpu1) + irq->target_vcpu = vcpu2; + + spin_unlock(&irq->irq_lock); + } + + spin_unlock(&dist->lpi_list_lock); + + return 0; +} + +/* + * The INT command injects the LPI associated with that DevID/EvID pair. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 msi_data = its_cmd_get_id(its_cmd); + u64 msi_devid = its_cmd_get_deviceid(its_cmd); + + vgic_its_trigger_msi(kvm, its, msi_devid, msi_data); + + return 0; +} + +/* + * This function is called with the its_cmd lock held, but the ITS data + * structure lock dropped. + */ +static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + int ret = -ENODEV; + + mutex_lock(&its->its_lock); + switch (its_cmd_get_command(its_cmd)) { + case GITS_CMD_MAPD: + ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd); + break; + case GITS_CMD_MAPC: + ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd); + break; + case GITS_CMD_MAPI: + ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); + break; + case GITS_CMD_MAPTI: + ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); + break; + case GITS_CMD_MOVI: + ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd); + break; + case GITS_CMD_DISCARD: + ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd); + break; + case GITS_CMD_CLEAR: + ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd); + break; + case GITS_CMD_MOVALL: + ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd); + break; + case GITS_CMD_INT: + ret = vgic_its_cmd_handle_int(kvm, its, its_cmd); + break; + case GITS_CMD_INV: + ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd); + break; + case GITS_CMD_INVALL: + ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd); + break; + case GITS_CMD_SYNC: + /* we ignore this command: we are in sync all of the time */ + ret = 0; + break; + } + mutex_unlock(&its->its_lock); + + return ret; +} + +static u64 vgic_sanitise_its_baser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK, + GITS_BASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK, + GITS_BASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK, + GITS_BASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + /* Bits 15:12 contain bits 51:48 of the PA, which we don't support. */ + reg &= ~GENMASK_ULL(15, 12); + + /* We support only one (ITS) page size: 64K */ + reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; + + return reg; +} + +static u64 vgic_sanitise_its_cbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK, + GITS_CBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK, + GITS_CBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK, + GITS_CBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + /* + * Sanitise the physical address to be 64k aligned. + * Also limit the physical addresses to 48 bits. + */ + reg &= ~(GENMASK_ULL(51, 48) | GENMASK_ULL(15, 12)); + + return reg; +} + +static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->cbaser, addr & 7, len); +} + +static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + /* When GITS_CTLR.Enable is 1, this register is RO. */ + if (its->enabled) + return; + + mutex_lock(&its->cmd_lock); + its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val); + its->cbaser = vgic_sanitise_its_cbaser(its->cbaser); + its->creadr = 0; + /* + * CWRITER is architecturally UNKNOWN on reset, but we need to reset + * it to CREADR to make sure we start with an empty command buffer. + */ + its->cwriter = its->creadr; + mutex_unlock(&its->cmd_lock); +} + +#define ITS_CMD_BUFFER_SIZE(baser) ((((baser) & 0xff) + 1) << 12) +#define ITS_CMD_SIZE 32 +#define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5)) + +/* + * By writing to CWRITER the guest announces new commands to be processed. + * To avoid any races in the first place, we take the its_cmd lock, which + * protects our ring buffer variables, so that there is only one user + * per ITS handling commands at a given time. + */ +static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + gpa_t cbaser; + u64 cmd_buf[4]; + u32 reg; + + if (!its) + return; + + mutex_lock(&its->cmd_lock); + + reg = update_64bit_reg(its->cwriter, addr & 7, len, val); + reg = ITS_CMD_OFFSET(reg); + if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { + mutex_unlock(&its->cmd_lock); + return; + } + + its->cwriter = reg; + cbaser = CBASER_ADDRESS(its->cbaser); + + while (its->cwriter != its->creadr) { + int ret = kvm_read_guest(kvm, cbaser + its->creadr, + cmd_buf, ITS_CMD_SIZE); + /* + * If kvm_read_guest() fails, this could be due to the guest + * programming a bogus value in CBASER or something else going + * wrong from which we cannot easily recover. + * According to section 6.3.2 in the GICv3 spec we can just + * ignore that command then. + */ + if (!ret) + vgic_its_handle_command(kvm, its, cmd_buf); + + its->creadr += ITS_CMD_SIZE; + if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser)) + its->creadr = 0; + } + + mutex_unlock(&its->cmd_lock); +} + +static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->cwriter, addr & 0x7, len); +} + +static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->creadr, addr & 0x7, len); +} + +#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7) +static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u64 reg; + + switch (BASER_INDEX(addr)) { + case 0: + reg = its->baser_device_table; + break; + case 1: + reg = its->baser_coll_table; + break; + default: + reg = 0; + break; + } + + return extract_bytes(reg, addr & 7, len); +} + +#define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56)) +static void vgic_mmio_write_its_baser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u64 entry_size, device_type; + u64 reg, *regptr, clearbits = 0; + + /* When GITS_CTLR.Enable is 1, we ignore write accesses. */ + if (its->enabled) + return; + + switch (BASER_INDEX(addr)) { + case 0: + regptr = &its->baser_device_table; + entry_size = 8; + device_type = GITS_BASER_TYPE_DEVICE; + break; + case 1: + regptr = &its->baser_coll_table; + entry_size = 8; + device_type = GITS_BASER_TYPE_COLLECTION; + clearbits = GITS_BASER_INDIRECT; + break; + default: + return; + } + + reg = update_64bit_reg(*regptr, addr & 7, len, val); + reg &= ~GITS_BASER_RO_MASK; + reg &= ~clearbits; + + reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT; + reg |= device_type << GITS_BASER_TYPE_SHIFT; + reg = vgic_sanitise_its_baser(reg); + + *regptr = reg; +} + +#define REGISTER_ITS_DESC(off, rd, wr, length, acc) \ +{ \ + .reg_offset = off, \ + .len = length, \ + .access_flags = acc, \ + .its_read = rd, \ + .its_write = wr, \ +} + +static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, unsigned long val) +{ + /* Ignore */ +} + +static struct vgic_register_region its_registers[] = { + REGISTER_ITS_DESC(GITS_CTLR, + vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4, + VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_IIDR, + vgic_mmio_read_its_iidr, its_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_TYPER, + vgic_mmio_read_its_typer, its_mmio_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_CBASER, + vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_CWRITER, + vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_CREADR, + vgic_mmio_read_its_creadr, its_mmio_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_BASER, + vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_IDREGS_BASE, + vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30, + VGIC_ACCESS_32bit), +}; + +/* This is called on setting the LPI enable bit in the redistributor. */ +void vgic_enable_lpis(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ)) + its_sync_lpi_pending_table(vcpu); +} + +static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its) +{ + struct vgic_io_device *iodev = &its->iodev; + int ret; + + if (its->initialized) + return 0; + + if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) + return -ENXIO; + + iodev->regions = its_registers; + iodev->nr_regions = ARRAY_SIZE(its_registers); + kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops); + + iodev->base_addr = its->vgic_its_base; + iodev->iodev_type = IODEV_ITS; + iodev->its = its; + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr, + KVM_VGIC_V3_ITS_SIZE, &iodev->dev); + mutex_unlock(&kvm->slots_lock); + + if (!ret) + its->initialized = true; + + return ret; +} + +#define INITIAL_BASER_VALUE \ + (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) | \ + ((8ULL - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | \ + GITS_BASER_PAGE_SIZE_64K) + +#define INITIAL_PROPBASER_VALUE \ + (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)) + +static int vgic_its_create(struct kvm_device *dev, u32 type) +{ + struct vgic_its *its; + + if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) + return -ENODEV; + + its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL); + if (!its) + return -ENOMEM; + + mutex_init(&its->its_lock); + mutex_init(&its->cmd_lock); + + its->vgic_its_base = VGIC_ADDR_UNDEF; + + INIT_LIST_HEAD(&its->device_list); + INIT_LIST_HEAD(&its->collection_list); + + dev->kvm->arch.vgic.has_its = true; + its->initialized = false; + its->enabled = false; + its->dev = dev; + + its->baser_device_table = INITIAL_BASER_VALUE | + ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT); + its->baser_coll_table = INITIAL_BASER_VALUE | + ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT); + dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE; + + dev->private = its; + + return 0; +} + +static void vgic_its_destroy(struct kvm_device *kvm_dev) +{ + struct kvm *kvm = kvm_dev->kvm; + struct vgic_its *its = kvm_dev->private; + struct its_device *dev; + struct its_itte *itte; + struct list_head *dev_cur, *dev_temp; + struct list_head *cur, *temp; + + /* + * We may end up here without the lists ever having been initialized. + * Check this and bail out early to avoid dereferencing a NULL pointer. + */ + if (!its->device_list.next) + return; + + mutex_lock(&its->its_lock); + list_for_each_safe(dev_cur, dev_temp, &its->device_list) { + dev = container_of(dev_cur, struct its_device, dev_list); + list_for_each_safe(cur, temp, &dev->itt_head) { + itte = (container_of(cur, struct its_itte, itte_list)); + its_free_itte(kvm, itte); + } + list_del(dev_cur); + kfree(dev); + } + + list_for_each_safe(cur, temp, &its->collection_list) { + list_del(cur); + kfree(container_of(cur, struct its_collection, coll_list)); + } + mutex_unlock(&its->its_lock); + + kfree(its); +} + +static int vgic_its_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_ITS_ADDR_TYPE: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + } + break; + } + return -ENXIO; +} + +static int vgic_its_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct vgic_its *its = dev->private; + int ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + u64 addr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (its->initialized) + return -EBUSY; + + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base, + addr, SZ_64K); + if (ret) + return ret; + + its->vgic_its_base = addr; + + return 0; + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return vgic_its_init_its(dev->kvm, its); + } + break; + } + return -ENXIO; +} + +static int vgic_its_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + struct vgic_its *its = dev->private; + u64 addr = its->vgic_its_base; + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + break; + default: + return -ENXIO; + } + } + + return 0; +} + +static struct kvm_device_ops kvm_arm_vgic_its_ops = { + .name = "kvm-arm-vgic-its", + .create = vgic_its_create, + .destroy = vgic_its_destroy, + .set_attr = vgic_its_set_attr, + .get_attr = vgic_its_get_attr, + .has_attr = vgic_its_has_attr, +}; + +int kvm_vgic_register_its_device(void) +{ + return kvm_register_device_ops(&kvm_arm_vgic_its_ops, + KVM_DEV_TYPE_ARM_VGIC_ITS); +} diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 0130c4b147b7..1813f93b5cde 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -21,8 +21,8 @@ /* common helpers */ -static int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment) +int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment) { if (addr & ~KVM_PHYS_MASK) return -E2BIG; @@ -210,20 +210,27 @@ static void vgic_destroy(struct kvm_device *dev) kfree(dev); } -void kvm_register_vgic_device(unsigned long type) +int kvm_register_vgic_device(unsigned long type) { + int ret = -ENODEV; + switch (type) { case KVM_DEV_TYPE_ARM_VGIC_V2: - kvm_register_device_ops(&kvm_arm_vgic_v2_ops, - KVM_DEV_TYPE_ARM_VGIC_V2); + ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops, + KVM_DEV_TYPE_ARM_VGIC_V2); break; #ifdef CONFIG_KVM_ARM_VGIC_V3 case KVM_DEV_TYPE_ARM_VGIC_V3: - kvm_register_device_ops(&kvm_arm_vgic_v3_ops, - KVM_DEV_TYPE_ARM_VGIC_V3); + ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, + KVM_DEV_TYPE_ARM_VGIC_V3); + if (ret) + break; + ret = kvm_vgic_register_its_device(); break; #endif } + + return ret; } /** vgic_attr_regs_access: allows user space to read/write VGIC registers @@ -428,4 +435,3 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = { }; #endif /* CONFIG_KVM_ARM_VGIC_V3 */ - diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index a21393637e4b..b44b359cbbad 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -102,6 +102,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, irq->source |= 1U << source_vcpu->vcpu_id; vgic_queue_irq_unlock(source_vcpu->kvm, irq); + vgic_put_irq(source_vcpu->kvm, irq); } } @@ -116,6 +117,8 @@ static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); val |= (u64)irq->targets << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); } return val; @@ -143,6 +146,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu, irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target); spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } @@ -157,6 +161,8 @@ static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); val |= (u64)irq->source << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); } return val; } @@ -178,6 +184,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, irq->pending = false; spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } @@ -201,6 +208,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, } else { spin_unlock(&irq->irq_lock); } + vgic_put_irq(vcpu->kvm, irq); } } @@ -429,6 +437,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, struct vgic_io_device dev = { .regions = vgic_v2_cpu_registers, .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers), + .iodev_type = IODEV_CPUIF, }; return vgic_uaccess(vcpu, &dev, is_write, offset, val); @@ -440,6 +449,7 @@ int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, struct vgic_io_device dev = { .regions = vgic_v2_dist_registers, .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers), + .iodev_type = IODEV_DIST, }; return vgic_uaccess(vcpu, &dev, is_write, offset, val); diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index a0c515a412a7..ff668e0dd586 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -23,12 +23,35 @@ #include "vgic-mmio.h" /* extract @num bytes at @offset bytes offset in data */ -static unsigned long extract_bytes(unsigned long data, unsigned int offset, - unsigned int num) +unsigned long extract_bytes(unsigned long data, unsigned int offset, + unsigned int num) { return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0); } +/* allows updates of any half of a 64-bit register (or the whole thing) */ +u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, + unsigned long val) +{ + int lower = (offset & 4) * 8; + int upper = lower + 8 * len - 1; + + reg &= ~GENMASK_ULL(upper, lower); + val &= GENMASK_ULL(len * 8 - 1, 0); + + return reg | ((u64)val << lower); +} + +bool vgic_has_its(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) + return false; + + return dist->has_its; +} + static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { @@ -43,7 +66,12 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, case GICD_TYPER: value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; value = (value >> 5) - 1; - value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19; + if (vgic_has_its(vcpu->kvm)) { + value |= (INTERRUPT_ID_BITS_ITS - 1) << 19; + value |= GICD_TYPER_LPIS; + } else { + value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19; + } break; case GICD_IIDR: value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); @@ -80,15 +108,17 @@ static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu, { int intid = VGIC_ADDR_TO_INTID(addr, 64); struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid); + unsigned long ret = 0; if (!irq) return 0; /* The upper word is RAZ for us. */ - if (addr & 4) - return 0; + if (!(addr & 4)) + ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len); - return extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len); + vgic_put_irq(vcpu->kvm, irq); + return ret; } static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, @@ -96,15 +126,17 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, unsigned long val) { int intid = VGIC_ADDR_TO_INTID(addr, 64); - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid); - - if (!irq) - return; + struct vgic_irq *irq; /* The upper word is WI for us since we don't implement Aff3. */ if (addr & 4) return; + irq = vgic_get_irq(vcpu->kvm, NULL, intid); + + if (!irq) + return; + spin_lock(&irq->irq_lock); /* We only care about and preserve Aff0, Aff1 and Aff2. */ @@ -112,6 +144,32 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr); spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); +} + +static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0; +} + + +static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + bool was_enabled = vgic_cpu->lpis_enabled; + + if (!vgic_has_its(vcpu->kvm)) + return; + + vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS; + + if (!was_enabled && vgic_cpu->lpis_enabled) + vgic_enable_lpis(vcpu); } static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, @@ -125,6 +183,8 @@ static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, value |= ((target_vcpu_id & 0xffff) << 8); if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1) value |= GICR_TYPER_LAST; + if (vgic_has_its(vcpu->kvm)) + value |= GICR_TYPER_PLPIS; return extract_bytes(value, addr & 7, len); } @@ -147,6 +207,142 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu, return 0; } +/* We want to avoid outer shareable. */ +u64 vgic_sanitise_shareability(u64 field) +{ + switch (field) { + case GIC_BASER_OuterShareable: + return GIC_BASER_InnerShareable; + default: + return field; + } +} + +/* Avoid any inner non-cacheable mapping. */ +u64 vgic_sanitise_inner_cacheability(u64 field) +{ + switch (field) { + case GIC_BASER_CACHE_nCnB: + case GIC_BASER_CACHE_nC: + return GIC_BASER_CACHE_RaWb; + default: + return field; + } +} + +/* Non-cacheable or same-as-inner are OK. */ +u64 vgic_sanitise_outer_cacheability(u64 field) +{ + switch (field) { + case GIC_BASER_CACHE_SameAsInner: + case GIC_BASER_CACHE_nC: + return field; + default: + return GIC_BASER_CACHE_nC; + } +} + +u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, + u64 (*sanitise_fn)(u64)) +{ + u64 field = (reg & field_mask) >> field_shift; + + field = sanitise_fn(field) << field_shift; + return (reg & ~field_mask) | field; +} + +#define PROPBASER_RES0_MASK \ + (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5)) +#define PENDBASER_RES0_MASK \ + (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) | \ + GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0)) + +static u64 vgic_sanitise_pendbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK, + GICR_PENDBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK, + GICR_PENDBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK, + GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + reg &= ~PENDBASER_RES0_MASK; + reg &= ~GENMASK_ULL(51, 48); + + return reg; +} + +static u64 vgic_sanitise_propbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK, + GICR_PROPBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK, + GICR_PROPBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK, + GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + reg &= ~PROPBASER_RES0_MASK; + reg &= ~GENMASK_ULL(51, 48); + return reg; +} + +static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + return extract_bytes(dist->propbaser, addr & 7, len); +} + +static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 propbaser = dist->propbaser; + + /* Storing a value with LPIs already enabled is undefined */ + if (vgic_cpu->lpis_enabled) + return; + + propbaser = update_64bit_reg(propbaser, addr & 4, len, val); + propbaser = vgic_sanitise_propbaser(propbaser); + + dist->propbaser = propbaser; +} + +static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + return extract_bytes(vgic_cpu->pendbaser, addr & 7, len); +} + +static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 pendbaser = vgic_cpu->pendbaser; + + /* Storing a value with LPIs already enabled is undefined */ + if (vgic_cpu->lpis_enabled) + return; + + pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val); + pendbaser = vgic_sanitise_pendbaser(pendbaser); + + vgic_cpu->pendbaser = pendbaser; +} + /* * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the * redistributors, while SPIs are covered by registers in the distributor @@ -218,7 +414,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = { static const struct vgic_register_region vgic_v3_rdbase_registers[] = { REGISTER_DESC_WITH_LENGTH(GICR_CTLR, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_IIDR, vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4, @@ -227,10 +423,10 @@ static const struct vgic_register_region vgic_v3_rdbase_registers[] = { vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER, - vgic_mmio_read_raz, vgic_mmio_write_wi, 8, + vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER, - vgic_mmio_read_raz, vgic_mmio_write_wi, 8, + vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_IDREGS, vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, @@ -285,24 +481,18 @@ unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev) int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address) { - int nr_vcpus = atomic_read(&kvm->online_vcpus); struct kvm_vcpu *vcpu; - struct vgic_io_device *devices; int c, ret = 0; - devices = kmalloc(sizeof(struct vgic_io_device) * nr_vcpus * 2, - GFP_KERNEL); - if (!devices) - return -ENOMEM; - kvm_for_each_vcpu(c, vcpu, kvm) { gpa_t rd_base = redist_base_address + c * SZ_64K * 2; gpa_t sgi_base = rd_base + SZ_64K; - struct vgic_io_device *rd_dev = &devices[c * 2]; - struct vgic_io_device *sgi_dev = &devices[c * 2 + 1]; + struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; + struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev; kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops); rd_dev->base_addr = rd_base; + rd_dev->iodev_type = IODEV_REDIST; rd_dev->regions = vgic_v3_rdbase_registers; rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers); rd_dev->redist_vcpu = vcpu; @@ -317,6 +507,7 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address) kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops); sgi_dev->base_addr = sgi_base; + sgi_dev->iodev_type = IODEV_REDIST; sgi_dev->regions = vgic_v3_sgibase_registers; sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers); sgi_dev->redist_vcpu = vcpu; @@ -335,14 +526,15 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address) if (ret) { /* The current c failed, so we start with the previous one. */ for (c--; c >= 0; c--) { + struct vgic_cpu *vgic_cpu; + + vcpu = kvm_get_vcpu(kvm, c); + vgic_cpu = &vcpu->arch.vgic_cpu; kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, - &devices[c * 2].dev); + &vgic_cpu->rd_iodev.dev); kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, - &devices[c * 2 + 1].dev); + &vgic_cpu->sgi_iodev.dev); } - kfree(devices); - } else { - kvm->arch.vgic.redist_iodevs = devices; } return ret; @@ -451,5 +643,6 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) irq->pending = true; vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_put_irq(vcpu->kvm, irq); } } diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 9f6fab74dce7..3bad3c5ed431 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -56,6 +56,8 @@ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, if (irq->enabled) value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); } return value; @@ -74,6 +76,8 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, spin_lock(&irq->irq_lock); irq->enabled = true; vgic_queue_irq_unlock(vcpu->kvm, irq); + + vgic_put_irq(vcpu->kvm, irq); } } @@ -92,6 +96,7 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, irq->enabled = false; spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } @@ -108,6 +113,8 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, if (irq->pending) value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); } return value; @@ -129,6 +136,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, irq->soft_pending = true; vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_put_irq(vcpu->kvm, irq); } } @@ -152,6 +160,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, } spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } @@ -168,6 +177,8 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, if (irq->active) value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); } return value; @@ -242,6 +253,7 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); vgic_mmio_change_active(vcpu, irq, false); + vgic_put_irq(vcpu->kvm, irq); } vgic_change_active_finish(vcpu, intid); } @@ -257,6 +269,7 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); vgic_mmio_change_active(vcpu, irq, true); + vgic_put_irq(vcpu->kvm, irq); } vgic_change_active_finish(vcpu, intid); } @@ -272,6 +285,8 @@ unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); val |= (u64)irq->priority << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); } return val; @@ -298,6 +313,8 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, /* Narrow the priority range to what we actually support */ irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); spin_unlock(&irq->irq_lock); + + vgic_put_irq(vcpu->kvm, irq); } } @@ -313,6 +330,8 @@ unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, if (irq->config == VGIC_CONFIG_EDGE) value |= (2U << (i * 2)); + + vgic_put_irq(vcpu->kvm, irq); } return value; @@ -326,7 +345,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, int i; for (i = 0; i < len * 4; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq; /* * The configuration cannot be changed for SGIs in general, @@ -337,14 +356,18 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, if (intid + i < VGIC_NR_PRIVATE_IRQS) continue; + irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); spin_lock(&irq->irq_lock); + if (test_bit(i * 2 + 1, &val)) { irq->config = VGIC_CONFIG_EDGE; } else { irq->config = VGIC_CONFIG_LEVEL; irq->pending = irq->line_level | irq->soft_pending; } + spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } @@ -450,8 +473,7 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, { struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); const struct vgic_register_region *region; - struct kvm_vcpu *r_vcpu; - unsigned long data; + unsigned long data = 0; region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, addr - iodev->base_addr); @@ -460,8 +482,21 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, return 0; } - r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; - data = region->read(r_vcpu, addr, len); + switch (iodev->iodev_type) { + case IODEV_CPUIF: + data = region->read(vcpu, addr, len); + break; + case IODEV_DIST: + data = region->read(vcpu, addr, len); + break; + case IODEV_REDIST: + data = region->read(iodev->redist_vcpu, addr, len); + break; + case IODEV_ITS: + data = region->its_read(vcpu->kvm, iodev->its, addr, len); + break; + } + vgic_data_host_to_mmio_bus(val, len, data); return 0; } @@ -471,7 +506,6 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, { struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); const struct vgic_register_region *region; - struct kvm_vcpu *r_vcpu; unsigned long data = vgic_data_mmio_bus_to_host(val, len); region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, @@ -482,8 +516,21 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, if (!check_region(region, addr, len)) return 0; - r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; - region->write(r_vcpu, addr, len, data); + switch (iodev->iodev_type) { + case IODEV_CPUIF: + region->write(vcpu, addr, len, data); + break; + case IODEV_DIST: + region->write(vcpu, addr, len, data); + break; + case IODEV_REDIST: + region->write(iodev->redist_vcpu, addr, len, data); + break; + case IODEV_ITS: + region->its_write(vcpu->kvm, iodev->its, addr, len, data); + break; + } + return 0; } @@ -513,6 +560,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, } io_device->base_addr = dist_base_address; + io_device->iodev_type = IODEV_DIST; io_device->redist_vcpu = NULL; mutex_lock(&kvm->slots_lock); diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 850901482aec..0b3ecf9d100e 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -21,10 +21,19 @@ struct vgic_register_region { unsigned int len; unsigned int bits_per_irq; unsigned int access_flags; - unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len); - void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, - unsigned long val); + union { + unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len); + unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len); + }; + union { + void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + void (*its_write)(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val); + }; }; extern struct kvm_io_device_ops kvm_io_gic_ops; @@ -87,6 +96,12 @@ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len); void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, unsigned long data); +unsigned long extract_bytes(unsigned long data, unsigned int offset, + unsigned int num); + +u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, + unsigned long val); + unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len); @@ -147,4 +162,12 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); +#ifdef CONFIG_KVM_ARM_VGIC_V3 +u64 vgic_sanitise_outer_cacheability(u64 reg); +u64 vgic_sanitise_inner_cacheability(u64 reg); +u64 vgic_sanitise_shareability(u64 reg); +u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, + u64 (*sanitise_fn)(u64)); +#endif + #endif diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index e31405ee5515..0bf6709d1006 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -124,6 +124,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) } spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } @@ -332,20 +333,25 @@ int vgic_v2_probe(const struct gic_kvm_info *info) vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR); kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1; + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (ret) { + kvm_err("Cannot register GICv2 KVM device\n"); + iounmap(kvm_vgic_global_state.vctrl_base); + return ret; + } + ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base, kvm_vgic_global_state.vctrl_base + resource_size(&info->vctrl), info->vctrl.start); - if (ret) { kvm_err("Cannot map VCTRL into hyp\n"); + kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); iounmap(kvm_vgic_global_state.vctrl_base); return ret; } kvm_vgic_global_state.can_emulate_gicv2 = true; - kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); - kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.type = VGIC_V2; kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 346b4ad12b49..0506543df38a 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -81,6 +81,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) else intid = val & GICH_LR_VIRTUALID; irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + if (!irq) /* An LPI could have been unmapped. */ + continue; spin_lock(&irq->irq_lock); @@ -113,6 +115,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) } spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } @@ -190,6 +193,11 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; } +#define INITIAL_PENDBASER_VALUE \ + (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) + void vgic_v3_enable(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; @@ -207,10 +215,12 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) * way, so we force SRE to 1 to demonstrate this to the guest. * This goes with the spec allowing the value to be RAO/WI. */ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { vgic_v3->vgic_sre = ICC_SRE_EL1_SRE; - else + vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE; + } else { vgic_v3->vgic_sre = 0; + } /* Get the show on the road... */ vgic_v3->vgic_hcr = ICH_HCR_EN; @@ -296,6 +306,7 @@ int vgic_v3_map_resources(struct kvm *kvm) int vgic_v3_probe(const struct gic_kvm_info *info) { u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); + int ret; /* * The ListRegs field is 5 bits, but there is a architectural @@ -319,12 +330,22 @@ int vgic_v3_probe(const struct gic_kvm_info *info) } else { kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.can_emulate_gicv2 = true; - kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (ret) { + kvm_err("Cannot register GICv2 KVM device.\n"); + return ret; + } kvm_info("vgic-v2@%llx\n", info->vcpu.start); } + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); + if (ret) { + kvm_err("Cannot register GICv3 KVM device.\n"); + kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); + return ret; + } + if (kvm_vgic_global_state.vcpu_base == 0) kvm_info("disabling GICv2 emulation\n"); - kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); kvm_vgic_global_state.vctrl_base = NULL; kvm_vgic_global_state.type = VGIC_V3; diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 69b61abefa19..39f3358c6d91 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -33,10 +33,17 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state; /* * Locking order is always: - * vgic_cpu->ap_list_lock - * vgic_irq->irq_lock + * its->cmd_lock (mutex) + * its->its_lock (mutex) + * vgic_cpu->ap_list_lock + * kvm->lpi_list_lock + * vgic_irq->irq_lock * - * (that is, always take the ap_list_lock before the struct vgic_irq lock). + * If you need to take multiple locks, always take the upper lock first, + * then the lower ones, e.g. first take the its_lock, then the irq_lock. + * If you are already holding a lock and need to take a higher one, you + * have to drop the lower ranking lock first and re-aquire it after having + * taken the upper one. * * When taking more than one ap_list_lock at the same time, always take the * lowest numbered VCPU's ap_list_lock first, so: @@ -45,6 +52,41 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state; * spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); */ +/* + * Iterate over the VM's list of mapped LPIs to find the one with a + * matching interrupt ID and return a reference to the IRQ structure. + */ +static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq = NULL; + + spin_lock(&dist->lpi_list_lock); + + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + if (irq->intid != intid) + continue; + + /* + * This increases the refcount, the caller is expected to + * call vgic_put_irq() later once it's finished with the IRQ. + */ + vgic_get_irq_kref(irq); + goto out_unlock; + } + irq = NULL; + +out_unlock: + spin_unlock(&dist->lpi_list_lock); + + return irq; +} + +/* + * This looks up the virtual interrupt ID to get the corresponding + * struct vgic_irq. It also increases the refcount, so any caller is expected + * to call vgic_put_irq() once it's finished with this IRQ. + */ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid) { @@ -56,14 +98,43 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, if (intid <= VGIC_MAX_SPI) return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; - /* LPIs are not yet covered */ + /* LPIs */ if (intid >= VGIC_MIN_LPI) - return NULL; + return vgic_get_lpi(kvm, intid); WARN(1, "Looking up struct vgic_irq for reserved INTID"); return NULL; } +/* + * We can't do anything in here, because we lack the kvm pointer to + * lock and remove the item from the lpi_list. So we keep this function + * empty and use the return value of kref_put() to trigger the freeing. + */ +static void vgic_irq_release(struct kref *ref) +{ +} + +void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) +{ + struct vgic_dist *dist; + + if (irq->intid < VGIC_MIN_LPI) + return; + + if (!kref_put(&irq->refcount, vgic_irq_release)) + return; + + dist = &kvm->arch.vgic; + + spin_lock(&dist->lpi_list_lock); + list_del(&irq->lpi_list); + dist->lpi_list_count--; + spin_unlock(&dist->lpi_list_lock); + + kfree(irq); +} + /** * kvm_vgic_target_oracle - compute the target vcpu for an irq * @@ -236,6 +307,11 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq) goto retry; } + /* + * Grab a reference to the irq to reflect the fact that it is + * now in the ap_list. + */ + vgic_get_irq_kref(irq); list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); irq->vcpu = vcpu; @@ -269,14 +345,17 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, if (!irq) return -EINVAL; - if (irq->hw != mapped_irq) + if (irq->hw != mapped_irq) { + vgic_put_irq(kvm, irq); return -EINVAL; + } spin_lock(&irq->irq_lock); if (!vgic_validate_injection(irq, level)) { /* Nothing to see here, move along... */ spin_unlock(&irq->irq_lock); + vgic_put_irq(kvm, irq); return 0; } @@ -288,6 +367,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, } vgic_queue_irq_unlock(kvm, irq); + vgic_put_irq(kvm, irq); return 0; } @@ -330,25 +410,28 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq) irq->hwintid = phys_irq; spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); return 0; } int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); - - BUG_ON(!irq); + struct vgic_irq *irq; if (!vgic_initialized(vcpu->kvm)) return -EAGAIN; + irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); + BUG_ON(!irq); + spin_lock(&irq->irq_lock); irq->hw = false; irq->hwintid = 0; spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); return 0; } @@ -386,6 +469,15 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) list_del(&irq->ap_list); irq->vcpu = NULL; spin_unlock(&irq->irq_lock); + + /* + * This vgic_put_irq call matches the + * vgic_get_irq_kref in vgic_queue_irq_unlock, + * where we added the LPI to the ap_list. As + * we remove the irq from the list, we drop + * also drop the refcount. + */ + vgic_put_irq(vcpu->kvm, irq); continue; } @@ -614,6 +706,15 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq) spin_lock(&irq->irq_lock); map_is_active = irq->hw && irq->active; spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); return map_is_active; } + +int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + if (vgic_has_its(kvm)) + return vgic_its_inject_msi(kvm, msi); + else + return -ENODEV; +} diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 7b300ca370b7..1d8e21d5c13f 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -25,6 +25,7 @@ #define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) #define INTERRUPT_ID_BITS_SPIS 10 +#define INTERRUPT_ID_BITS_ITS 16 #define VGIC_PRI_BITS 5 #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) @@ -38,9 +39,13 @@ struct vgic_vmcr { struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid); +void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq); void vgic_kick_vcpus(struct kvm *kvm); +int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment); + void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu); void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); @@ -59,6 +64,14 @@ int vgic_v2_map_resources(struct kvm *kvm); int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, enum vgic_type); +static inline void vgic_get_irq_kref(struct vgic_irq *irq) +{ + if (irq->intid < VGIC_MIN_LPI) + return; + + kref_get(&irq->refcount); +} + #ifdef CONFIG_KVM_ARM_VGIC_V3 void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu); void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); @@ -71,6 +84,10 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); +bool vgic_has_its(struct kvm *kvm); +int kvm_vgic_register_its_device(void); +void vgic_enable_lpis(struct kvm_vcpu *vcpu); +int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); #else static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) { @@ -122,9 +139,28 @@ static inline int vgic_register_redist_iodevs(struct kvm *kvm, { return -ENODEV; } + +static inline bool vgic_has_its(struct kvm *kvm) +{ + return false; +} + +static inline int kvm_vgic_register_its_device(void) +{ + return -ENODEV; +} + +static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu) +{ +} + +static inline int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + return -ENODEV; +} #endif -void kvm_register_vgic_device(unsigned long type); +int kvm_register_vgic_device(unsigned long type); int vgic_lazy_init(struct kvm *kvm); int vgic_init(struct kvm *kvm); diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 8db197bb6c7a..df99e9c3b64d 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -135,7 +135,8 @@ void kvm_free_irq_routing(struct kvm *kvm) free_irq_routing_table(rt); } -static int setup_routing_entry(struct kvm_irq_routing_table *rt, +static int setup_routing_entry(struct kvm *kvm, + struct kvm_irq_routing_table *rt, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { @@ -154,7 +155,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, e->gsi = ue->gsi; e->type = ue->type; - r = kvm_set_routing_entry(e, ue); + r = kvm_set_routing_entry(kvm, e, ue); if (r) goto out; if (e->type == KVM_IRQ_ROUTING_IRQCHIP) @@ -211,7 +212,7 @@ int kvm_set_irq_routing(struct kvm *kvm, kfree(e); goto out; } - r = setup_routing_entry(new, e, ue); + r = setup_routing_entry(kvm, new, e, ue); if (r) { kfree(e); goto out; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2e791367c576..cc081ccfcaa3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1444,6 +1444,52 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) return true; } +static int hva_to_pfn_remapped(struct vm_area_struct *vma, + unsigned long addr, bool *async, + bool write_fault, kvm_pfn_t *p_pfn) +{ + unsigned long pfn; + int r; + + r = follow_pfn(vma, addr, &pfn); + if (r) { + /* + * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does + * not call the fault handler, so do it here. + */ + bool unlocked = false; + r = fixup_user_fault(current, current->mm, addr, + (write_fault ? FAULT_FLAG_WRITE : 0), + &unlocked); + if (unlocked) + return -EAGAIN; + if (r) + return r; + + r = follow_pfn(vma, addr, &pfn); + if (r) + return r; + + } + + + /* + * Get a reference here because callers of *hva_to_pfn* and + * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the + * returned pfn. This is only needed if the VMA has VM_MIXEDMAP + * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will + * simply do nothing for reserved pfns. + * + * Whoever called remap_pfn_range is also going to call e.g. + * unmap_mapping_range before the underlying pages are freed, + * causing a call to our MMU notifier. + */ + kvm_get_pfn(pfn); + + *p_pfn = pfn; + return 0; +} + /* * Pin guest page in memory and return its pfn. * @addr: host virtual address which maps memory to the guest @@ -1463,7 +1509,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, { struct vm_area_struct *vma; kvm_pfn_t pfn = 0; - int npages; + int npages, r; /* we can do it either atomically or asynchronously, not both */ BUG_ON(atomic && async); @@ -1485,14 +1531,17 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, goto exit; } +retry: vma = find_vma_intersection(current->mm, addr, addr + 1); if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; - else if ((vma->vm_flags & VM_PFNMAP)) { - pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + - vma->vm_pgoff; - BUG_ON(!kvm_is_reserved_pfn(pfn)); + else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { + r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn); + if (r == -EAGAIN) + goto retry; + if (r < 0) + pfn = KVM_PFN_ERR_FAULT; } else { if (async && vma_is_valid(vma, write_fault)) *async = true; @@ -2348,9 +2397,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (id >= KVM_MAX_VCPU_ID) return -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus == KVM_MAX_VCPUS) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + + kvm->created_vcpus++; + mutex_unlock(&kvm->lock); + vcpu = kvm_arch_vcpu_create(kvm, id); - if (IS_ERR(vcpu)) - return PTR_ERR(vcpu); + if (IS_ERR(vcpu)) { + r = PTR_ERR(vcpu); + goto vcpu_decrement; + } preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); @@ -2359,14 +2419,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto vcpu_destroy; mutex_lock(&kvm->lock); - if (!kvm_vcpu_compatible(vcpu)) { - r = -EINVAL; - goto unlock_vcpu_destroy; - } - if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { - r = -EINVAL; - goto unlock_vcpu_destroy; - } if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; goto unlock_vcpu_destroy; @@ -2399,6 +2451,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) mutex_unlock(&kvm->lock); vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); +vcpu_decrement: + mutex_lock(&kvm->lock); + kvm->created_vcpus--; + mutex_unlock(&kvm->lock); return r; } @@ -3487,6 +3543,30 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, return r; } +struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, + gpa_t addr) +{ + struct kvm_io_bus *bus; + int dev_idx, srcu_idx; + struct kvm_io_device *iodev = NULL; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + + dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); + if (dev_idx < 0) + goto out_unlock; + + iodev = bus->range[dev_idx].dev; + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + + return iodev; +} +EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); + static int kvm_debugfs_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt)