From 15f0eb3d6a62b22b27449329f04ee9aa0d218c6a Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Wed, 15 May 2019 01:42:42 +0200 Subject: [PATCH 01/83] s390/cio: Update SCSW if it points to the end of the chain Per the POPs [1], when processing an interrupt the SCSW.CPA field of an IRB generally points to 8 bytes after the last CCW that was executed (there are exceptions, but this is the most common behavior). In the case of an error, this points us to the first un-executed CCW in the chain. But in the case of normal I/O, the address points beyond the end of the chain. While the guest generally only cares about this when possibly restarting a channel program after error recovery, we should convert the address even in the good scenario so that we provide a consistent, valid, response upon I/O completion. [1] Figure 16-6 in SA22-7832-11. The footnotes in that table also state that this is true even if the resulting address is invalid or protected, but moving to the end of the guest chain should not be a surprise. Signed-off-by: Eric Farman Message-Id: <20190514234248.36203-2-farman@linux.ibm.com> Reviewed-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 0e79799e9a71..6e48b66ae31a 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -886,7 +886,11 @@ void cp_update_scsw(struct channel_program *cp, union scsw *scsw) */ list_for_each_entry(chain, &cp->ccwchain_list, next) { ccw_head = (u32)(u64)chain->ch_ccw; - if (is_cpa_within_range(cpa, ccw_head, chain->ch_len)) { + /* + * On successful execution, cpa points just beyond the end + * of the chain. + */ + if (is_cpa_within_range(cpa, ccw_head, chain->ch_len + 1)) { /* * (cpa - ccw_head) is the offset value of the host * physical ccw to its chain head. 
From 4e31d6aecfbbe63cf09b8fe48572d20d2062c406 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Wed, 15 May 2019 01:42:43 +0200 Subject: [PATCH 02/83] s390/cio: Set vfio-ccw FSM state before ioeventfd Otherwise, the guest can believe it's okay to start another I/O and bump into the non-idle state. This results in a cc=2 (with the asynchronous CSCH/HSCH code) returned to the guest, which is unfortunate since everything is otherwise working normally. Signed-off-by: Eric Farman Reviewed-by: Pierre Morel Message-Id: <20190514234248.36203-3-farman@linux.ibm.com> Reviewed-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index ee8767f5845a..66a66ac1f3d1 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -95,11 +95,11 @@ static void vfio_ccw_sch_io_todo(struct work_struct *work) memcpy(private->io_region->irb_area, irb, sizeof(*irb)); mutex_unlock(&private->io_mutex); - if (private->io_trigger) - eventfd_signal(private->io_trigger, 1); - if (private->mdev && is_final) private->state = VFIO_CCW_STATE_IDLE; + + if (private->io_trigger) + eventfd_signal(private->io_trigger, 1); } /* From e4f3f18b12324e2c140b97f93e3259eee9696d11 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Wed, 15 May 2019 01:42:44 +0200 Subject: [PATCH 03/83] s390/cio: Split pfn_array_alloc_pin into pieces The pfn_array_alloc_pin routine is doing too much. Today, it does the alloc of the pfn_array struct and its member arrays, builds the iova address lists out of a contiguous piece of guest memory, and asks vfio to pin the resulting pages. 
Let's effectively revert a significant portion of commit 5c1cfb1c3948 ("vfio: ccw: refactor and improve pfn_array_alloc_pin()") such that we break pfn_array_alloc_pin() into its component pieces, and have one routine that allocates/populates the pfn_array structs, and another that actually pins the memory. In the future, we will be able to handle scenarios where pinning memory isn't actually appropriate. Signed-off-by: Eric Farman Message-Id: <20190514234248.36203-4-farman@linux.ibm.com> Reviewed-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 64 ++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 6e48b66ae31a..e33265fb80b0 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -50,28 +50,25 @@ struct ccwchain { }; /* - * pfn_array_alloc_pin() - alloc memory for PFNs, then pin user pages in memory + * pfn_array_alloc() - alloc memory for PFNs * @pa: pfn_array on which to perform the operation - * @mdev: the mediated device to perform pin/unpin operations * @iova: target guest physical address * @len: number of bytes that should be pinned from @iova * - * Attempt to allocate memory for PFNs, and pin user pages in memory. + * Attempt to allocate memory for PFNs. * * Usage of pfn_array: * We expect (pa_nr == 0) and (pa_iova_pfn == NULL), any field in * this structure will be filled in by this function. * * Returns: - * Number of pages pinned on success. - * If @pa->pa_nr is not 0, or @pa->pa_iova_pfn is not NULL initially, - * returns -EINVAL. - * If no pages were pinned, returns -errno. 
+ * 0 if PFNs are allocated + * -EINVAL if pa->pa_nr is not initially zero, or pa->pa_iova_pfn is not NULL + * -ENOMEM if alloc failed */ -static int pfn_array_alloc_pin(struct pfn_array *pa, struct device *mdev, - u64 iova, unsigned int len) +static int pfn_array_alloc(struct pfn_array *pa, u64 iova, unsigned int len) { - int i, ret = 0; + int i; if (!len) return 0; @@ -97,6 +94,22 @@ static int pfn_array_alloc_pin(struct pfn_array *pa, struct device *mdev, for (i = 1; i < pa->pa_nr; i++) pa->pa_iova_pfn[i] = pa->pa_iova_pfn[i - 1] + 1; + return 0; +} + +/* + * pfn_array_pin() - Pin user pages in memory + * @pa: pfn_array on which to perform the operation + * @mdev: the mediated device to perform pin operations + * + * Returns number of pages pinned upon success. + * If the pin request partially succeeds, or fails completely, + * all pages are left unpinned and a negative error value is returned. + */ +static int pfn_array_pin(struct pfn_array *pa, struct device *mdev) +{ + int ret = 0; + ret = vfio_pin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr, IOMMU_READ | IOMMU_WRITE, pa->pa_pfn); @@ -112,8 +125,6 @@ static int pfn_array_alloc_pin(struct pfn_array *pa, struct device *mdev, err_out: pa->pa_nr = 0; - kfree(pa->pa_iova_pfn); - pa->pa_iova_pfn = NULL; return ret; } @@ -121,7 +132,9 @@ static int pfn_array_alloc_pin(struct pfn_array *pa, struct device *mdev, /* Unpin the pages before releasing the memory. 
*/ static void pfn_array_unpin_free(struct pfn_array *pa, struct device *mdev) { - vfio_unpin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr); + /* Only unpin if any pages were pinned to begin with */ + if (pa->pa_nr) + vfio_unpin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr); pa->pa_nr = 0; kfree(pa->pa_iova_pfn); } @@ -209,10 +222,16 @@ static long copy_from_iova(struct device *mdev, int i, ret; unsigned long l, m; - ret = pfn_array_alloc_pin(&pa, mdev, iova, n); - if (ret <= 0) + ret = pfn_array_alloc(&pa, iova, n); + if (ret < 0) return ret; + ret = pfn_array_pin(&pa, mdev); + if (ret < 0) { + pfn_array_unpin_free(&pa, mdev); + return ret; + } + l = n; for (i = 0; i < pa.pa_nr; i++) { from = pa.pa_pfn[i] << PAGE_SHIFT; @@ -560,7 +579,11 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, if (ret) goto out_init; - ret = pfn_array_alloc_pin(pat->pat_pa, cp->mdev, ccw->cda, ccw->count); + ret = pfn_array_alloc(pat->pat_pa, ccw->cda, ccw->count); + if (ret < 0) + goto out_unpin; + + ret = pfn_array_pin(pat->pat_pa, cp->mdev); if (ret < 0) goto out_unpin; @@ -590,6 +613,7 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, { struct ccw1 *ccw; struct pfn_array_table *pat; + struct pfn_array *pa; unsigned long *idaws; u64 idaw_iova; unsigned int idaw_nr, idaw_len; @@ -628,9 +652,13 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, for (i = 0; i < idaw_nr; i++) { idaw_iova = *(idaws + i); + pa = pat->pat_pa + i; - ret = pfn_array_alloc_pin(pat->pat_pa + i, cp->mdev, - idaw_iova, 1); + ret = pfn_array_alloc(pa, idaw_iova, 1); + if (ret < 0) + goto out_free_idaws; + + ret = pfn_array_pin(pa, cp->mdev); if (ret < 0) goto out_free_idaws; } From c34a12e6a3df3aae8cb6f47baf0d14ef0fbecf7f Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Wed, 15 May 2019 01:42:45 +0200 Subject: [PATCH 04/83] s390/cio: Initialize the host addresses in pfn_array Let's initialize the host address to something that is invalid, rather than letting it default to zero. 
This just makes it easier to notice when a pin operation has failed or been skipped. Signed-off-by: Eric Farman Message-Id: <20190514234248.36203-5-farman@linux.ibm.com> Reviewed-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index e33265fb80b0..086faf2dacd3 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -91,8 +91,11 @@ static int pfn_array_alloc(struct pfn_array *pa, u64 iova, unsigned int len) pa->pa_pfn = pa->pa_iova_pfn + pa->pa_nr; pa->pa_iova_pfn[0] = pa->pa_iova >> PAGE_SHIFT; - for (i = 1; i < pa->pa_nr; i++) + pa->pa_pfn[0] = -1ULL; + for (i = 1; i < pa->pa_nr; i++) { pa->pa_iova_pfn[i] = pa->pa_iova_pfn[i - 1] + 1; + pa->pa_pfn[i] = -1ULL; + } return 0; } From 5d87fbf70fb4f4be695a3052aa8b2883be026ce7 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 16 May 2019 18:14:01 +0200 Subject: [PATCH 05/83] s390/cio: Don't pin vfio pages for empty transfers The skip flag of a CCW offers the possibility of data not being transferred, but is only meaningful for certain commands. Specifically, it is only applicable for a read, read backward, sense, or sense ID CCW and will be ignored for any other command code (SA22-7832-11 page 15-64, and figure 15-30 on page 15-75). (A sense ID is xE4, while a sense is x04 with possible modifiers in the upper four bits. So we will cover the whole "family" of sense CCWs.) For those scenarios, since there is no requirement for the target address to be valid, we should skip the call to vfio_pin_pages() and rely on the IDAL address we have allocated/built for the channel program. The fact that the individual IDAWs within the IDAL are invalid is fine, since they aren't actually checked in these cases. 
Set pa_nr to zero when skipping the pfn_array_pin() call, since it is defined as the number of pages pinned and is used to determine whether to call vfio_unpin_pages() upon cleanup. The pfn_array_pin() routine returns the number of pages that were pinned, but now might be skipped for some CCWs. Thus we need to calculate the expected number of pages ourselves such that we are guaranteed to allocate a reasonable number of IDAWs, which will provide a valid address in CCW.CDA regardless of whether the IDAWs are filled in with pinned/translated addresses or not. Signed-off-by: Eric Farman Message-Id: <20190516161403.79053-2-farman@linux.ibm.com> Acked-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 55 ++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 086faf2dacd3..0467838aed23 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -294,6 +294,10 @@ static long copy_ccw_from_iova(struct channel_program *cp, /* * Helpers to operate ccwchain. */ +#define ccw_is_read(_ccw) (((_ccw)->cmd_code & 0x03) == 0x02) +#define ccw_is_read_backward(_ccw) (((_ccw)->cmd_code & 0x0F) == 0x0C) +#define ccw_is_sense(_ccw) (((_ccw)->cmd_code & 0x0F) == CCW_CMD_BASIC_SENSE) + #define ccw_is_test(_ccw) (((_ccw)->cmd_code & 0x0F) == 0) #define ccw_is_noop(_ccw) ((_ccw)->cmd_code == CCW_CMD_NOOP) @@ -301,10 +305,39 @@ static long copy_ccw_from_iova(struct channel_program *cp, #define ccw_is_tic(_ccw) ((_ccw)->cmd_code == CCW_CMD_TIC) #define ccw_is_idal(_ccw) ((_ccw)->flags & CCW_FLAG_IDA) - +#define ccw_is_skip(_ccw) ((_ccw)->flags & CCW_FLAG_SKIP) #define ccw_is_chain(_ccw) ((_ccw)->flags & (CCW_FLAG_CC | CCW_FLAG_DC)) +/* + * ccw_does_data_transfer() + * + * Determine whether a CCW will move any data, such that the guest pages + * would need to be pinned before performing the I/O. + * + * Returns 1 if yes, 0 if no. 
+ */ +static inline int ccw_does_data_transfer(struct ccw1 *ccw) +{ + /* If the skip flag is off, then data will be transferred */ + if (!ccw_is_skip(ccw)) + return 1; + + /* + * If the skip flag is on, it is only meaningful if the command + * code is a read, read backward, sense, or sense ID. In those + * cases, no data will be transferred. + */ + if (ccw_is_read(ccw) || ccw_is_read_backward(ccw)) + return 0; + + if (ccw_is_sense(ccw)) + return 0; + + /* The skip flag is on, but it is ignored for this command code. */ + return 1; +} + /* * is_cpa_within_range() * @@ -559,6 +592,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, struct pfn_array_table *pat; unsigned long *idaws; int ret; + int idaw_nr = 1; ccw = chain->ch_ccw + idx; @@ -570,6 +604,8 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, */ ccw->flags |= CCW_FLAG_IDA; return 0; + } else { + idaw_nr = idal_nr_words((void *)(u64)ccw->cda, ccw->count); } /* @@ -586,12 +622,16 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, if (ret < 0) goto out_unpin; - ret = pfn_array_pin(pat->pat_pa, cp->mdev); - if (ret < 0) - goto out_unpin; + if (ccw_does_data_transfer(ccw)) { + ret = pfn_array_pin(pat->pat_pa, cp->mdev); + if (ret < 0) + goto out_unpin; + } else { + pat->pat_pa->pa_nr = 0; + } /* Translate this direct ccw to a idal ccw. 
*/ - idaws = kcalloc(ret, sizeof(*idaws), GFP_DMA | GFP_KERNEL); + idaws = kcalloc(idaw_nr, sizeof(*idaws), GFP_DMA | GFP_KERNEL); if (!idaws) { ret = -ENOMEM; goto out_unpin; @@ -661,6 +701,11 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, if (ret < 0) goto out_free_idaws; + if (!ccw_does_data_transfer(ccw)) { + pa->pa_nr = 0; + continue; + } + ret = pfn_array_pin(pa, cp->mdev); if (ret < 0) goto out_free_idaws; From 453eac312445ecf9027d9bd49781b62b04140960 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 16 May 2019 18:14:02 +0200 Subject: [PATCH 06/83] s390/cio: Allow zero-length CCWs in vfio-ccw It is possible that a guest might issue a CCW with a length of zero, and will expect a particular response. Consider this chain: Address Format-1 CCW -------- ----------------- 0 33110EC0 346022CC 33177468 1 33110EC8 CF200000 3318300C CCW[0] moves a little more than two pages, but also has the Suppress Length Indication (SLI) bit set to handle the expectation that considerably less data will be moved. CCW[1] also has the SLI bit set, and has a length of zero. Once vfio-ccw does its magic, the kernel issues a start subchannel on behalf of the guest with this: Address Format-1 CCW -------- ----------------- 0 021EDED0 346422CC 021F0000 1 021EDED8 CF240000 3318300C Both CCWs were converted to an IDAL and have the corresponding flags set (which is by design), but only the address of the first data address is converted to something the host is aware of. The second CCW still has the address used by the guest, which happens to be (A) (probably) an invalid address for the host, and (B) an invalid IDAW address (doubleword boundary, etc.). While the I/O fails, it doesn't fail correctly. In this example, we would receive a program check for an invalid IDAW address, instead of a unit check for an invalid command. 
To fix this, revert commit 4cebc5d6a6ff ("vfio: ccw: validate the count field of a ccw before pinning") and allow the individual fetch routines to process them like anything else. We'll make a slight adjustment to our allocation of the pfn_array (for direct CCWs) or IDAL (for IDAL CCWs) memory, so that we have room for at least one address even though no guest memory will be pinned and thus the IDAW will not be populated with a host address. Signed-off-by: Eric Farman Message-Id: <20190516161403.79053-3-farman@linux.ibm.com> Acked-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 0467838aed23..c77c9b4cd2a8 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -70,9 +70,6 @@ static int pfn_array_alloc(struct pfn_array *pa, u64 iova, unsigned int len) { int i; - if (!len) - return 0; - if (pa->pa_nr || pa->pa_iova_pfn) return -EINVAL; @@ -319,6 +316,10 @@ static long copy_ccw_from_iova(struct channel_program *cp, */ static inline int ccw_does_data_transfer(struct ccw1 *ccw) { + /* If the count field is zero, then no data will be transferred */ + if (ccw->count == 0) + return 0; + /* If the skip flag is off, then data will be transferred */ if (!ccw_is_skip(ccw)) return 1; @@ -405,8 +406,6 @@ static void ccwchain_cda_free(struct ccwchain *chain, int idx) if (ccw_is_test(ccw) || ccw_is_noop(ccw) || ccw_is_tic(ccw)) return; - if (!ccw->count) - return; kfree((void *)(u64)ccw->cda); } @@ -592,19 +591,13 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, struct pfn_array_table *pat; unsigned long *idaws; int ret; + int bytes = 1; int idaw_nr = 1; ccw = chain->ch_ccw + idx; - if (!ccw->count) { - /* - * We just want the translation result of any direct ccw - * to be an IDA ccw, so let's add the IDA flag for it. 
- * Although the flag will be ignored by firmware. - */ - ccw->flags |= CCW_FLAG_IDA; - return 0; - } else { + if (ccw->count) { + bytes = ccw->count; idaw_nr = idal_nr_words((void *)(u64)ccw->cda, ccw->count); } @@ -618,7 +611,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, if (ret) goto out_init; - ret = pfn_array_alloc(pat->pat_pa, ccw->cda, ccw->count); + ret = pfn_array_alloc(pat->pat_pa, ccw->cda, bytes); if (ret < 0) goto out_unpin; @@ -661,17 +654,18 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, u64 idaw_iova; unsigned int idaw_nr, idaw_len; int i, ret; + int bytes = 1; ccw = chain->ch_ccw + idx; - if (!ccw->count) - return 0; + if (ccw->count) + bytes = ccw->count; /* Calculate size of idaws. */ ret = copy_from_iova(cp->mdev, &idaw_iova, ccw->cda, sizeof(idaw_iova)); if (ret) return ret; - idaw_nr = idal_nr_words((void *)(idaw_iova), ccw->count); + idaw_nr = idal_nr_words((void *)(idaw_iova), bytes); idaw_len = idaw_nr * sizeof(*idaws); /* Pin data page(s) in memory. */ From 9b6e57e5a51696171de990b3c41bd53d4b8ab8ac Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 16 May 2019 18:14:03 +0200 Subject: [PATCH 07/83] s390/cio: Remove vfio-ccw checks of command codes If the CCW being processed is a No-Operation, then by definition no data is being transferred. Let's fold those checks into the normal CCW processors, rather than skipping out early. Likewise, if the CCW being processed is a "test" (a category defined here as an opcode that contains zero in the lowest four bits) then no special processing is necessary as far as vfio-ccw is concerned. These command codes have not been valid since the S/370 days, meaning they are invalid in the same way as one that ends in an eight [1] or an otherwise valid command code that is undefined for the device type in question. Considering that, let's just process "test" CCWs like any other CCW, and send everything to the hardware. 
[1] POPS states that a x08 is a TIC CCW, and that having any high-order bits enabled is invalid for format-1 CCWs. For format-0 CCWs, the high-order bits are ignored. Signed-off-by: Eric Farman Message-Id: <20190516161403.79053-4-farman@linux.ibm.com> Acked-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index c77c9b4cd2a8..f73cfcfdd032 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -295,8 +295,6 @@ static long copy_ccw_from_iova(struct channel_program *cp, #define ccw_is_read_backward(_ccw) (((_ccw)->cmd_code & 0x0F) == 0x0C) #define ccw_is_sense(_ccw) (((_ccw)->cmd_code & 0x0F) == CCW_CMD_BASIC_SENSE) -#define ccw_is_test(_ccw) (((_ccw)->cmd_code & 0x0F) == 0) - #define ccw_is_noop(_ccw) ((_ccw)->cmd_code == CCW_CMD_NOOP) #define ccw_is_tic(_ccw) ((_ccw)->cmd_code == CCW_CMD_TIC) @@ -320,6 +318,10 @@ static inline int ccw_does_data_transfer(struct ccw1 *ccw) if (ccw->count == 0) return 0; + /* If the command is a NOP, then no data will be transferred */ + if (ccw_is_noop(ccw)) + return 0; + /* If the skip flag is off, then data will be transferred */ if (!ccw_is_skip(ccw)) return 1; @@ -404,7 +406,7 @@ static void ccwchain_cda_free(struct ccwchain *chain, int idx) { struct ccw1 *ccw = chain->ch_ccw + idx; - if (ccw_is_test(ccw) || ccw_is_noop(ccw) || ccw_is_tic(ccw)) + if (ccw_is_tic(ccw)) return; kfree((void *)(u64)ccw->cda); @@ -730,9 +732,6 @@ static int ccwchain_fetch_one(struct ccwchain *chain, { struct ccw1 *ccw = chain->ch_ccw + idx; - if (ccw_is_test(ccw) || ccw_is_noop(ccw)) - return 0; - if (ccw_is_tic(ccw)) return ccwchain_fetch_tic(chain, idx, cp); From a646ef398e72a2ac40bea974808ffcf1bea4e7f4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 17 May 2019 12:50:43 +0200 Subject: [PATCH 08/83] s390/jump_label: replace stop_machine with 
smp_call_function The use of stop_machine to replace the mask bits of the jump label branch is a very heavy-weight operation. This is in fact not necessary, the mask of the branch can simply be updated, followed by a signal processor to all the other CPUs to force them to pick up the modified instruction. Signed-off-by: Martin Schwidefsky [heiko.carstens@de.ibm.com]: Change jump_label_make_nop() so we get brcl 0,offset instead of brcl 0,0. This makes sure that only the mask part of the instruction gets changed when updated. Signed-off-by: Heiko Carstens --- arch/s390/kernel/jump_label.c | 18 +++++------------- arch/s390/mm/maccess.c | 9 +++++---- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index 3f10b56bd5a3..e193630a7d2a 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -22,9 +22,9 @@ struct insn_args { static void jump_label_make_nop(struct jump_entry *entry, struct insn *insn) { - /* brcl 0,0 */ + /* brcl 0,offset */ insn->opcode = 0xc004; - insn->offset = 0; + insn->offset = (jump_entry_target(entry) - jump_entry_code(entry)) >> 1; } static void jump_label_make_branch(struct jump_entry *entry, struct insn *insn) @@ -77,23 +77,15 @@ static void __jump_label_transform(struct jump_entry *entry, s390_kernel_write(code, &new, sizeof(new)); } -static int __sm_arch_jump_label_transform(void *data) +static void __jump_label_sync(void *dummy) { - struct insn_args *args = data; - - __jump_label_transform(args->entry, args->type, 0); - return 0; } void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - struct insn_args args; - - args.entry = entry; - args.type = type; - - stop_machine_cpuslocked(__sm_arch_jump_label_transform, &args, NULL); + __jump_label_transform(entry, type, 0); + smp_call_function(__jump_label_sync, NULL, 1); } void arch_jump_label_transform_static(struct jump_entry *entry, diff --git 
a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 818deeb1ebc3..1864a8bb9622 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -52,21 +52,22 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz * Therefore we have a read-modify-write sequence: the function reads eight * bytes from destination at an eight byte boundary, modifies the bytes * requested and writes the result back in a loop. - * - * Note: this means that this function may not be called concurrently on - * several cpus with overlapping words, since this may potentially - * cause data corruption. */ +static DEFINE_SPINLOCK(s390_kernel_write_lock); + void notrace s390_kernel_write(void *dst, const void *src, size_t size) { + unsigned long flags; long copied; + spin_lock_irqsave(&s390_kernel_write_lock, flags); while (size) { copied = s390_kernel_write_odd(dst, src, size); dst += copied; src += copied; size -= copied; } + spin_unlock_irqrestore(&s390_kernel_write_lock, flags); } static int __memcpy_real(void *dest, void *src, size_t count) From 31885a8dad16a319a394d82807c3b708882b50a1 Mon Sep 17 00:00:00 2001 From: xiaolinkui Date: Fri, 17 May 2019 15:15:17 +0800 Subject: [PATCH 09/83] s390/idal: use struct_size() in kmalloc() Use the new struct_size() helper to keep code simple. 
Signed-off-by: xiaolinkui Signed-off-by: Heiko Carstens --- arch/s390/include/asm/idals.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h index 15578fd762f6..6fb7aced104a 100644 --- a/arch/s390/include/asm/idals.h +++ b/arch/s390/include/asm/idals.h @@ -122,8 +122,7 @@ idal_buffer_alloc(size_t size, int page_order) nr_ptrs = (size + IDA_BLOCK_SIZE - 1) >> IDA_SIZE_LOG; nr_chunks = (4096 << page_order) >> IDA_SIZE_LOG; - ib = kmalloc(sizeof(struct idal_buffer) + nr_ptrs*sizeof(void *), - GFP_DMA | GFP_KERNEL); + ib = kmalloc(struct_size(ib, data, nr_ptrs), GFP_DMA | GFP_KERNEL); if (ib == NULL) return ERR_PTR(-ENOMEM); ib->size = size; From e888f7419dff260202e586421b44cb526a600cc2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 May 2019 16:54:24 +0900 Subject: [PATCH 10/83] s390: do not pass $(LINUXINCLUDE) to gen_opcode_table.c I guess HOSTCFLAGS_gen_opcode_table.o was blindly copied from HOSTCFLAGS_gen_facilities.o. The reason for adding $(LINUXINCLUDE) to HOSTCFLAGS_gen_facilities.o is that gen_facilities.c references some CONFIG options. (Kbuild does not cater to this for host tools automatically.) On the other hand, gen_opcode_table.c does not reference CONFIG options at all. So, there is no good reason to pass $(LINUXINCLUDE). 
Signed-off-by: Masahiro Yamada Signed-off-by: Heiko Carstens --- arch/s390/tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile index 2342b84b3386..4ff6a2124522 100644 --- a/arch/s390/tools/Makefile +++ b/arch/s390/tools/Makefile @@ -15,7 +15,7 @@ hostprogs-y += gen_facilities hostprogs-y += gen_opcode_table HOSTCFLAGS_gen_facilities.o += -Wall $(LINUXINCLUDE) -HOSTCFLAGS_gen_opcode_table.o += -Wall $(LINUXINCLUDE) +HOSTCFLAGS_gen_opcode_table.o += -Wall # Ensure output directory exists _dummy := $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') From f1090b61a76f4af523418a4ff3de4324ae72ec47 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 May 2019 16:54:25 +0900 Subject: [PATCH 11/83] s390: drop unneeded -Wall addition from tools Makefile The top level Makefile adds -Wall globally for all host tools: KBUILD_HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 \ I see two "-Wall" added for compiling these tools. Of course, it is allowed to pass the same option multiple times, but we do not need to do so. 
Signed-off-by: Masahiro Yamada Signed-off-by: Heiko Carstens --- arch/s390/tools/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile index 4ff6a2124522..8fb66c99840a 100644 --- a/arch/s390/tools/Makefile +++ b/arch/s390/tools/Makefile @@ -14,8 +14,7 @@ kapi: $(kapi-hdrs-y) hostprogs-y += gen_facilities hostprogs-y += gen_opcode_table -HOSTCFLAGS_gen_facilities.o += -Wall $(LINUXINCLUDE) -HOSTCFLAGS_gen_opcode_table.o += -Wall +HOSTCFLAGS_gen_facilities.o += $(LINUXINCLUDE) # Ensure output directory exists _dummy := $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') From c3bce92531ac4f4ed18008545d54b15f2166ddb4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 May 2019 16:54:26 +0900 Subject: [PATCH 12/83] s390: drop redundant directory creation from tools Makefile As you can see in scripts/Kbuild.include, the filechk creates the parent directory of the target as needed. This Makefile does not need to explicitly create the directory. Signed-off-by: Masahiro Yamada Signed-off-by: Heiko Carstens --- arch/s390/tools/Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile index 8fb66c99840a..4864ea5e6ceb 100644 --- a/arch/s390/tools/Makefile +++ b/arch/s390/tools/Makefile @@ -16,9 +16,6 @@ hostprogs-y += gen_opcode_table HOSTCFLAGS_gen_facilities.o += $(LINUXINCLUDE) -# Ensure output directory exists -_dummy := $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') - filechk_facility-defs.h = $(obj)/gen_facilities filechk_dis-defs.h = \ From 10077c9f2dae1afabab2808a0326ecf3e8e5a82c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 May 2019 16:54:27 +0900 Subject: [PATCH 13/83] s390: drop meaningless 'targets' from tools Makefile 'targets' should be specified to include .*.cmd files to evaluate if_changed or friends. Here, facility-defs.h and dis-defs.h are generated by filechk. 
Because filechk does not generate a .*.cmd file, the 'targets' addition is meaningless. The filechk correctly updates the target when its content is changed. Signed-off-by: Masahiro Yamada Signed-off-by: Heiko Carstens --- arch/s390/tools/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile index 4864ea5e6ceb..b5e35e8f999a 100644 --- a/arch/s390/tools/Makefile +++ b/arch/s390/tools/Makefile @@ -6,7 +6,6 @@ kapi := arch/$(ARCH)/include/generated/asm kapi-hdrs-y := $(kapi)/facility-defs.h $(kapi)/dis-defs.h -targets += $(addprefix ../../../,$(kapi-hdrs-y)) PHONY += kapi kapi: $(kapi-hdrs-y) From dbe1c16be381cb52c80fd7b40fcd05ae538d751b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 31 May 2019 11:46:51 +0900 Subject: [PATCH 14/83] s390/purgatory: update .gitignore Since commit 4c0f032d4963 ("s390/purgatory: Omit use of bin2c"), kexec-purgatory.c is not generated. purgatory and purgatory.lds are generated files, so they should be ignored by git. Signed-off-by: Masahiro Yamada Signed-off-by: Heiko Carstens --- arch/s390/purgatory/.gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/purgatory/.gitignore b/arch/s390/purgatory/.gitignore index e9e66f178a6d..04a03433c720 100644 --- a/arch/s390/purgatory/.gitignore +++ b/arch/s390/purgatory/.gitignore @@ -1,2 +1,3 @@ -kexec-purgatory.c +purgatory +purgatory.lds purgatory.ro From 8b96d9712abcfdf6b5061610f1d187e23413eb0f Mon Sep 17 00:00:00 2001 From: "Enrico Weigelt, metux IT consult" Date: Wed, 6 Mar 2019 21:32:01 +0100 Subject: [PATCH 15/83] s390/Kconfig: pedantic cleanups Formatting of Kconfig files doesn't look so pretty, so just take a damp cloth and clean it up. 
Signed-off-by: Enrico Weigelt, metux IT consult Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 2 +- drivers/s390/block/Kconfig | 2 +- drivers/s390/char/Kconfig | 15 +++++++-------- drivers/s390/net/Kconfig | 8 ++++---- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 109243fdb6ec..e2e154051b07 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -763,7 +763,7 @@ config PCI_NR_FUNCTIONS This allows you to specify the maximum number of PCI functions which this kernel will support. -endif # PCI +endif # PCI config HAS_IOMEM def_bool PCI diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig index 9ac7574e3cfb..a8682f69effc 100644 --- a/drivers/s390/block/Kconfig +++ b/drivers/s390/block/Kconfig @@ -38,7 +38,7 @@ config DASD_PROFILE depends on DASD help Enable this option if you want to see profiling information - in /proc/dasd/statistics. + in /proc/dasd/statistics. config DASD_ECKD def_tristate y diff --git a/drivers/s390/char/Kconfig b/drivers/s390/char/Kconfig index ab0b243a947d..e2c0c60760b0 100644 --- a/drivers/s390/char/Kconfig +++ b/drivers/s390/char/Kconfig @@ -91,14 +91,14 @@ config SCLP_ASYNC need this feature and intend to run your kernel in LPAR. config SCLP_ASYNC_ID - string "Component ID for Call Home" - depends on SCLP_ASYNC - default "000000000" - help - The Component ID for Call Home is used to identify the correct - problem reporting queue the call home records should be sent to. + string "Component ID for Call Home" + depends on SCLP_ASYNC + default "000000000" + help + The Component ID for Call Home is used to identify the correct + problem reporting queue the call home records should be sent to. - If your are unsure, please use the default value "000000000". + If your are unsure, please use the default value "000000000". 
config HMC_DRV def_tristate m @@ -205,4 +205,3 @@ config S390_VMUR depends on S390 help Character device driver for z/VM reader, puncher and printer. - diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig index 7c5a25ddf832..ced896d1534a 100644 --- a/drivers/s390/net/Kconfig +++ b/drivers/s390/net/Kconfig @@ -7,10 +7,10 @@ config LCS prompt "Lan Channel Station Interface" depends on CCW && NETDEVICES && (ETHERNET || FDDI) help - Select this option if you want to use LCS networking on IBM System z. - This device driver supports FDDI (IEEE 802.7) and Ethernet. - To compile as a module, choose M. The module name is lcs. - If you do not know what it is, it's safe to choose Y. + Select this option if you want to use LCS networking on IBM System z. + This device driver supports FDDI (IEEE 802.7) and Ethernet. + To compile as a module, choose M. The module name is lcs. + If you do not know what it is, it's safe to choose Y. config CTCM def_tristate m From 567b722347239484d0b9ab0b42aeb24c1fe6b4e4 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 4 Apr 2019 02:19:56 -0400 Subject: [PATCH 16/83] s390/mm: mmap base does not depend on ADDR_NO_RANDOMIZE personality randomize_stack_top() checks for current task flag PF_RANDOMIZE in order to use stack randomization and PF_RANDOMIZE is set when ADDR_NO_RANDOMIZE is unset, so no need to check for ADDR_NO_RANDOMIZE in stack_maxrandom_size. 
[heiko.carstens@de.ibm.com]: See also commit 01578e36163c ("x86/elf: Remove the unnecessary ADDR_NO_RANDOMIZE checks") Signed-off-by: Alexandre Ghiti Signed-off-by: Heiko Carstens --- arch/s390/mm/mmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 687f2a4d3459..cbc718ba6d78 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -24,8 +24,6 @@ static unsigned long stack_maxrandom_size(void) { if (!(current->flags & PF_RANDOMIZE)) return 0; - if (current->personality & ADDR_NO_RANDOMIZE) - return 0; return STACK_RND_MASK << PAGE_SHIFT; } From fc20f0c1d7d4c67cab0788c3920aa82a48f43cfe Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 14 Feb 2019 16:41:53 +0100 Subject: [PATCH 17/83] s390/disassembler: update opcode table Sync with binutils and add a couple of missing instructions. Signed-off-by: Martin Schwidefsky Signed-off-by: Heiko Carstens --- arch/s390/kernel/dis.c | 5 ++-- arch/s390/tools/opcodes.txt | 51 ++++++++++++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index b2c68fbf2634..7abe6ae261b4 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -242,6 +242,7 @@ static const unsigned char formats[][6] = { [INSTR_RRF_U0FF] = { F_24, U4_16, F_28, 0, 0, 0 }, [INSTR_RRF_U0RF] = { R_24, U4_16, F_28, 0, 0, 0 }, [INSTR_RRF_U0RR] = { R_24, R_28, U4_16, 0, 0, 0 }, + [INSTR_RRF_URR] = { R_24, R_28, U8_16, 0, 0, 0 }, [INSTR_RRF_UUFF] = { F_24, U4_16, F_28, U4_20, 0, 0 }, [INSTR_RRF_UUFR] = { F_24, U4_16, R_28, U4_20, 0, 0 }, [INSTR_RRF_UURF] = { R_24, U4_16, F_28, U4_20, 0, 0 }, @@ -306,7 +307,7 @@ static const unsigned char formats[][6] = { [INSTR_VRI_VVV0UU2] = { V_8, V_12, V_16, U8_28, U4_24, 0 }, [INSTR_VRR_0V] = { V_12, 0, 0, 0, 0, 0 }, [INSTR_VRR_0VV0U] = { V_12, V_16, U4_24, 0, 0, 0 }, - [INSTR_VRR_RV0U] = { R_8, V_12, U4_24, 0, 0, 0 }, + [INSTR_VRR_RV0UU] = { R_8, V_12, U4_24, U4_28, 0, 0 
}, [INSTR_VRR_VRR] = { V_8, R_12, R_16, 0, 0, 0 }, [INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 }, [INSTR_VRR_VV0U] = { V_8, V_12, U4_32, 0, 0, 0 }, @@ -326,10 +327,8 @@ static const unsigned char formats[][6] = { [INSTR_VRS_RVRDU] = { R_8, V_12, D_20, B_16, U4_32, 0 }, [INSTR_VRS_VRRD] = { V_8, R_12, D_20, B_16, 0, 0 }, [INSTR_VRS_VRRDU] = { V_8, R_12, D_20, B_16, U4_32, 0 }, - [INSTR_VRS_VVRD] = { V_8, V_12, D_20, B_16, 0, 0 }, [INSTR_VRS_VVRDU] = { V_8, V_12, D_20, B_16, U4_32, 0 }, [INSTR_VRV_VVXRDU] = { V_8, D_20, VX_12, B_16, U4_32, 0 }, - [INSTR_VRX_VRRD] = { V_8, D_20, X_12, B_16, 0, 0 }, [INSTR_VRX_VRRDU] = { V_8, D_20, X_12, B_16, U4_32, 0 }, [INSTR_VRX_VV] = { V_8, V_12, 0, 0, 0, 0 }, [INSTR_VSI_URDV] = { V_32, D_20, B_16, U8_8, 0, 0 }, diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt index 64638b764d1c..46d8ed96cf06 100644 --- a/arch/s390/tools/opcodes.txt +++ b/arch/s390/tools/opcodes.txt @@ -520,6 +520,9 @@ b92e km RRE_RR b92f kmc RRE_RR b930 cgfr RRE_RR b931 clgfr RRE_RR +b938 sortl RRE_RR +b939 dfltcc RRF_R0RR2 +b93a kdsa RRE_RR b93c ppno RRE_RR b93e kimd RRE_RR b93f klmd RRE_RR @@ -538,8 +541,16 @@ b95a cxlgtr RRF_UUFR b95b cxlftr RRF_UUFR b960 cgrt RRF_U0RR b961 clgrt RRF_U0RR +b964 nngrk RRF_R0RR2 +b965 ocgrk RRF_R0RR2 +b966 nogrk RRF_R0RR2 +b967 nxgrk RRF_R0RR2 b972 crt RRF_U0RR b973 clrt RRF_U0RR +b974 nnrk RRF_R0RR2 +b975 ocrk RRF_R0RR2 +b976 nork RRF_R0RR2 +b977 nxrk RRF_R0RR2 b980 ngr RRE_RR b981 ogr RRE_RR b982 xgr RRE_RR @@ -573,6 +584,7 @@ b99f ssair RRE_R0 b9a0 clp RRF_U0RR b9a1 tpei RRE_RR b9a2 ptf RRE_R0 +b9a4 uvc RRF_URR b9aa lptea RRF_RURR2 b9ab essa RRF_U0RR b9ac irbm RRE_RR @@ -585,6 +597,7 @@ b9b3 cu42 RRE_RR b9bd trtre RRF_U0RR b9be srstu RRE_RR b9bf trte RRF_U0RR +b9c0 selhhhr RRF_RURR b9c8 ahhhr RRF_R0RR2 b9c9 shhhr RRF_R0RR2 b9ca alhhhr RRF_R0RR2 @@ -594,6 +607,9 @@ b9cf clhhr RRE_RR b9d0 pcistg RRE_RR b9d2 pcilg RRE_RR b9d3 rpcit RRE_RR +b9d4 pcistgi RRE_RR +b9d5 pciwb RRE_00 +b9d6 pcilgi RRE_RR b9d8 ahhlr 
RRF_R0RR2 b9d9 shhlr RRF_R0RR2 b9da alhhlr RRF_R0RR2 @@ -601,9 +617,11 @@ b9db slhhlr RRF_R0RR2 b9dd chlr RRE_RR b9df clhlr RRE_RR b9e0 locfhr RRF_U0RR -b9e1 popcnt RRE_RR +b9e1 popcnt RRF_U0RR b9e2 locgr RRF_U0RR +b9e3 selgr RRF_RURR b9e4 ngrk RRF_R0RR2 +b9e5 ncgrk RRF_R0RR2 b9e6 ogrk RRF_R0RR2 b9e7 xgrk RRF_R0RR2 b9e8 agrk RRF_R0RR2 @@ -612,8 +630,10 @@ b9ea algrk RRF_R0RR2 b9eb slgrk RRF_R0RR2 b9ec mgrk RRF_R0RR2 b9ed msgrkc RRF_R0RR2 +b9f0 selr RRF_RURR b9f2 locr RRF_U0RR b9f4 nrk RRF_R0RR2 +b9f5 ncrk RRF_R0RR2 b9f6 ork RRF_R0RR2 b9f7 xrk RRF_R0RR2 b9f8 ark RRF_R0RR2 @@ -822,6 +842,7 @@ e3d4 stpcifc RXY_RRRD e500 lasp SSE_RDRD e501 tprot SSE_RDRD e502 strag SSE_RDRD +e50a mvcrl SSE_RDRD e50e mvcsk SSE_RDRD e50f mvcdk SSE_RDRD e544 mvhhi SIL_RDI @@ -835,6 +856,18 @@ e55c chsi SIL_RDI e55d clfhsi SIL_RDU e560 tbegin SIL_RDU e561 tbeginc SIL_RDU +e601 vlebrh VRX_VRRDU +e602 vlebrg VRX_VRRDU +e603 vlebrf VRX_VRRDU +e604 vllebrz VRX_VRRDU +e605 vlbrrep VRX_VRRDU +e606 vlbr VRX_VRRDU +e607 vler VRX_VRRDU +e609 vstebrh VRX_VRRDU +e60a vstebrg VRX_VRRDU +e60b vstebrf VRX_VRRDU +e60e vstbr VRX_VRRDU +e60f vster VRX_VRRDU e634 vpkz VSI_URDV e635 vlrl VSI_URDV e637 vlrlr VRS_RRDV @@ -842,8 +875,8 @@ e63c vupkz VSI_URDV e63d vstrl VSI_URDV e63f vstrlr VRS_RRDV e649 vlip VRI_V0UU2 -e650 vcvb VRR_RV0U -e652 vcvbg VRR_RV0U +e650 vcvb VRR_RV0UU +e652 vcvbg VRR_RV0UU e658 vcvd VRI_VR0UU e659 vsrp VRI_VVUUU2 e65a vcvdg VRI_VR0UU @@ -863,13 +896,13 @@ e702 vleg VRX_VRRDU e703 vlef VRX_VRRDU e704 vllez VRX_VRRDU e705 vlrep VRX_VRRDU -e706 vl VRX_VRRD +e706 vl VRX_VRRDU e707 vlbb VRX_VRRDU e708 vsteb VRX_VRRDU e709 vsteh VRX_VRRDU e70a vsteg VRX_VRRDU e70b vstef VRX_VRRDU -e70e vst VRX_VRRD +e70e vst VRX_VRRDU e712 vgeg VRV_VVXRDU e713 vgef VRV_VVXRDU e71a vsceg VRV_VVXRDU @@ -879,11 +912,11 @@ e722 vlvg VRS_VRRDU e727 lcbb RXE_RRRDU e730 vesl VRS_VVRDU e733 verll VRS_VVRDU -e736 vlm VRS_VVRD +e736 vlm VRS_VVRDU e737 vll VRS_VRRD e738 vesrl VRS_VVRDU e73a vesra VRS_VVRDU -e73e vstm 
VRS_VVRD +e73e vstm VRS_VVRDU e73f vstl VRS_VRRD e740 vleib VRI_V0IU e741 vleih VRI_V0IU @@ -932,7 +965,10 @@ e781 vfene VRR_VVV0U0U e782 vfae VRR_VVV0U0U e784 vpdi VRR_VVV0U e785 vbperm VRR_VVV +e786 vsld VRI_VVV0U +e787 vsrd VRI_VVV0U e78a vstrc VRR_VVVUU0V +e78b vstrs VRR_VVVUU0V e78c vperm VRR_VVV0V e78d vsel VRR_VVV0V e78e vfms VRR_VVVU0UV @@ -1060,6 +1096,7 @@ eb9b stamy RSY_AARD ebc0 tp RSL_R0RD ebd0 pcistb RSY_RRRD ebd1 sic RSY_RRRD +ebd4 pcistbi RSY_RRRD ebdc srak RSY_RRRD ebdd slak RSY_RRRD ebde srlk RSY_RRRD From 67626fadd26977aca76d3540b80ce99233399cdf Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 3 Jun 2019 14:25:18 +0200 Subject: [PATCH 18/83] s390: enforce CONFIG_SMP There never have been distributions that shipped with CONFIG_SMP=n for s390. In addition the kernel currently doesn't even compile with CONFIG_SMP=n for s390. Most likely it wouldn't even work, even if we fix the compile error, since nobody tests it, since there is no use case that I can think of. Therefore simply enforce CONFIG_SMP and get rid of some more or less unused code.
Reviewed-by: Christian Borntraeger Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 25 ++----------------------- arch/s390/include/asm/ctl_reg.h | 9 ++------- arch/s390/include/asm/percpu.h | 2 +- arch/s390/include/asm/smp.h | 30 ------------------------------ arch/s390/include/asm/spinlock.h | 4 ---- arch/s390/include/asm/tlbflush.h | 17 ----------------- arch/s390/kernel/Makefile | 2 +- arch/s390/kernel/dumpstack.c | 2 -- arch/s390/kernel/entry.S | 4 ---- arch/s390/kernel/setup.c | 2 -- arch/s390/kernel/swsusp.S | 2 -- arch/s390/lib/Makefile | 3 +-- 12 files changed, 7 insertions(+), 95 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e2e154051b07..bdf3b5fdea53 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -30,7 +30,7 @@ config GENERIC_BUG_RELATIVE_POINTERS def_bool y config GENERIC_LOCKBREAK - def_bool y if SMP && PREEMPT + def_bool y if PREEMPT config PGSTE def_bool y if KVM @@ -113,7 +113,6 @@ config S390 select DYNAMIC_FTRACE if FUNCTION_TRACER select GENERIC_CLOCKEVENTS select GENERIC_CPU_AUTOPROBE - select GENERIC_CPU_DEVICES if !SMP select GENERIC_CPU_VULNERABILITIES select GENERIC_FIND_FIRST_BIT select GENERIC_SMP_IDLE_THREAD @@ -399,27 +398,10 @@ config SYSVIPC_COMPAT config SMP def_bool y - prompt "Symmetric multi-processing support" - ---help--- - This enables support for systems with more than one CPU. If you have - a system with only one CPU, like most personal computers, say N. If - you have a system with more than one CPU, say Y. - - If you say N here, the kernel will run on uni- and multiprocessor - machines, but will use only one CPU of a multiprocessor machine. If - you say Y here, the kernel will run on many, but not all, - uniprocessor machines. On a uniprocessor machine, the kernel - will run faster if you say N here. - - See also the SMP-HOWTO available at - . - - Even if you don't know what to do here, say Y. 
config NR_CPUS int "Maximum number of CPUs (2-512)" range 2 512 - depends on SMP default "64" help This allows you to specify the maximum number of CPUs which this @@ -432,7 +414,6 @@ config NR_CPUS config HOTPLUG_CPU def_bool y prompt "Support for hot-pluggable CPUs" - depends on SMP help Say Y here to be able to turn CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu/cpu#. @@ -448,7 +429,7 @@ config NODES_SPAN_OTHER_NODES config NUMA bool "NUMA support" - depends on SMP && SCHED_TOPOLOGY + depends on SCHED_TOPOLOGY default n help Enable NUMA support @@ -523,7 +504,6 @@ config SCHED_DRAWER config SCHED_TOPOLOGY def_bool y prompt "Topology scheduler support" - depends on SMP select SCHED_SMT select SCHED_MC select SCHED_BOOK @@ -829,7 +809,6 @@ menu "Dump support" config CRASH_DUMP bool "kernel crash dumps" - depends on SMP select KEXEC help Generate crash dump after being started by kexec. diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h index 4600453536c2..a778a0825835 100644 --- a/arch/s390/include/asm/ctl_reg.h +++ b/arch/s390/include/asm/ctl_reg.h @@ -112,13 +112,8 @@ union ctlreg2 { }; }; -#ifdef CONFIG_SMP -# define ctl_set_bit(cr, bit) smp_ctl_set_bit(cr, bit) -# define ctl_clear_bit(cr, bit) smp_ctl_clear_bit(cr, bit) -#else -# define ctl_set_bit(cr, bit) __ctl_set_bit(cr, bit) -# define ctl_clear_bit(cr, bit) __ctl_clear_bit(cr, bit) -#endif +#define ctl_set_bit(cr, bit) smp_ctl_set_bit(cr, bit) +#define ctl_clear_bit(cr, bit) smp_ctl_clear_bit(cr, bit) #endif /* __ASSEMBLY__ */ #endif /* __ASM_CTL_REG_H */ diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h index 0095ddb58ff6..50b4ce8cddfd 100644 --- a/arch/s390/include/asm/percpu.h +++ b/arch/s390/include/asm/percpu.h @@ -16,7 +16,7 @@ * per cpu area, use weak definitions to force the compiler to * generate external references. 
*/ -#if defined(CONFIG_SMP) && defined(MODULE) +#if defined(MODULE) #define ARCH_NEEDS_WEAK_PER_CPU #endif diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 3907ead27ffa..30ba1a3f88de 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -9,9 +9,6 @@ #define __ASM_SMP_H #include - -#ifdef CONFIG_SMP - #include #define raw_smp_processor_id() (S390_lowcore.cpu_nr) @@ -40,33 +37,6 @@ extern int smp_cpu_get_polarization(int cpu); extern void smp_fill_possible_mask(void); extern void smp_detect_cpus(void); -#else /* CONFIG_SMP */ - -#define smp_cpu_mtid 0 - -static inline void smp_call_ipl_cpu(void (*func)(void *), void *data) -{ - func(data); -} - -static inline void smp_call_online_cpu(void (*func)(void *), void *data) -{ - func(data); -} - -static inline void smp_emergency_stop(void) -{ -} - -static inline int smp_find_processor_id(u16 address) { return 0; } -static inline int smp_store_status(int cpu) { return 0; } -static inline int smp_vcpu_scheduled(int cpu) { return 1; } -static inline void smp_yield_cpu(int cpu) { } -static inline void smp_fill_possible_mask(void) { } -static inline void smp_detect_cpus(void) { } - -#endif /* CONFIG_SMP */ - static inline void smp_stop_cpu(void) { u16 pcpu = stap(); diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 0a29588aa00b..c02bff33f6c7 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -20,11 +20,7 @@ extern int spin_retry; -#ifndef CONFIG_SMP -static inline bool arch_vcpu_is_preempted(int cpu) { return false; } -#else bool arch_vcpu_is_preempted(int cpu); -#endif #define vcpu_is_preempted arch_vcpu_is_preempted diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 8c840f0904f3..82703e03f35d 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -32,7 +32,6 @@ static inline void __tlb_flush_idte(unsigned long asce) : : "a" (opt), "a" 
(asce) : "cc"); } -#ifdef CONFIG_SMP void smp_ptlb_all(void); /* @@ -83,22 +82,6 @@ static inline void __tlb_flush_kernel(void) else __tlb_flush_global(); } -#else -#define __tlb_flush_global() __tlb_flush_local() - -/* - * Flush TLB entries for a specific ASCE on all CPUs. - */ -static inline void __tlb_flush_mm(struct mm_struct *mm) -{ - __tlb_flush_local(); -} - -static inline void __tlb_flush_kernel(void) -{ - __tlb_flush_local(); -} -#endif static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) { diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index b0478d01a0c5..0f255b54b051 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -53,6 +53,7 @@ obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o +obj-y += smp.o extra-y += head64.o vmlinux.lds @@ -60,7 +61,6 @@ obj-$(CONFIG_SYSFS) += nospec-sysfs.o CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o obj-$(CONFIG_HIBERNATION) += suspend.o swsusp.o obj-$(CONFIG_AUDIT) += audit.o diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 9e87b68be21c..ac06c3949ab3 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -199,9 +199,7 @@ void die(struct pt_regs *regs, const char *str) #ifdef CONFIG_PREEMPT pr_cont("PREEMPT "); #endif -#ifdef CONFIG_SMP pr_cont("SMP "); -#endif if (debug_pagealloc_enabled()) pr_cont("DEBUG_PAGEALLOC"); pr_cont("\n"); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 3f4d272577d3..270d1d145761 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -986,14 +986,12 @@ ENTRY(psw_idle) stg %r3,__SF_EMPTY(%r15) larl 
%r1,.Lpsw_idle_lpsw+4 stg %r1,__SF_EMPTY+8(%r15) -#ifdef CONFIG_SMP larl %r1,smp_cpu_mtid llgf %r1,0(%r1) ltgr %r1,%r1 jz .Lpsw_idle_stcctm .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+16(%r15) .Lpsw_idle_stcctm: -#endif oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT BPON STCK __CLOCK_IDLE_ENTER(%r2) @@ -1468,7 +1466,6 @@ ENDPROC(cleanup_critical) mvc __CLOCK_IDLE_ENTER(8,%r2),__CLOCK_IDLE_EXIT(%r2) mvc __TIMER_IDLE_ENTER(8,%r2),__TIMER_IDLE_EXIT(%r2) 1: # calculate idle cycles -#ifdef CONFIG_SMP clg %r9,BASED(.Lcleanup_idle_insn) jl 3f larl %r1,smp_cpu_mtid @@ -1486,7 +1483,6 @@ ENDPROC(cleanup_critical) la %r3,8(%r3) la %r4,8(%r4) brct %r1,2b -#endif 3: # account system time going idle lg %r9,__LC_STEAL_TIMER alg %r9,__CLOCK_IDLE_ENTER(%r2) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index f8544d517430..2b94b0ad3588 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -461,11 +461,9 @@ static void __init setup_lowcore_dat_off(void) mem_assign_absolute(S390_lowcore.restart_source, lc->restart_source); mem_assign_absolute(S390_lowcore.restart_psw, lc->restart_psw); -#ifdef CONFIG_SMP lc->spinlock_lockval = arch_spin_lockval(0); lc->spinlock_index = 0; arch_spin_lock_setup(0); -#endif lc->br_r1_trampoline = 0x07f1; /* br %r1 */ set_prefix((u32)(unsigned long) lc); diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S index 19a3c427801a..a7baf0b5f818 100644 --- a/arch/s390/kernel/swsusp.S +++ b/arch/s390/kernel/swsusp.S @@ -162,7 +162,6 @@ ENTRY(swsusp_arch_resume) larl %r1,__swsusp_reset_dma lg %r1,0(%r1) BASR_EX %r14,%r1 -#ifdef CONFIG_SMP larl %r1,smp_cpu_mt_shift icm %r1,15,0(%r1) jz smt_done @@ -172,7 +171,6 @@ smt_loop: brc 8,smt_done /* accepted */ brc 2,smt_loop /* busy, try again */ smt_done: -#endif larl %r1,.Lnew_pgm_check_psw lpswe 0(%r1) pgm_check_entry: diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 5418d10dc2a8..a1ec63abfb95 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ 
-3,9 +3,8 @@ # Makefile for s390-specific library files.. # -lib-y += delay.o string.o uaccess.o find.o +lib-y += delay.o string.o uaccess.o find.o spinlock.o obj-y += mem.o xor.o -lib-$(CONFIG_SMP) += spinlock.o lib-$(CONFIG_KPROBES) += probes.o lib-$(CONFIG_UPROBES) += probes.o From 3e8eb22faee179798530b6a3d2639629fcf9d580 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 3 Jun 2019 15:22:25 +0200 Subject: [PATCH 19/83] s390: enforce CONFIG_HOTPLUG_CPU x86 and powerpc (partially) enforce already CONFIG_HOTPLUG_CPU. On s390 it is enabled on all distributions by default since ages. The only exception is our zfcpdump kernel. However to simplify testing, enforce HOTPLUG_CPU. This was suggested by Paul McKenney, since his rcutorture test environments for CONFIG_SMP=y only support HOTPLUG_CPU=y. Suggested-by: Paul E. McKenney Acked-by: Christian Borntraeger Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 5 ----- arch/s390/include/asm/smp.h | 5 ----- arch/s390/kernel/smp.c | 19 ------------------- 3 files changed, 29 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bdf3b5fdea53..66be2d813951 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -413,11 +413,6 @@ config NR_CPUS config HOTPLUG_CPU def_bool y - prompt "Support for hot-pluggable CPUs" - help - Say Y here to be able to turn CPUs off and on. CPUs - can be controlled through /sys/devices/system/cpu/cpu#. - Say N if you want to disable CPU hotplug. # Some NUMA nodes have memory ranges that span # other nodes. 
Even though a pfn is valid and diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 30ba1a3f88de..b157a81fb977 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -53,14 +53,9 @@ static inline int smp_get_base_cpu(int cpu) return cpu - (cpu % (smp_cpu_mtid + 1)); } -#ifdef CONFIG_HOTPLUG_CPU extern int smp_rescan_cpus(void); extern void __noreturn cpu_die(void); extern void __cpu_die(unsigned int cpu); extern int __cpu_disable(void); -#else -static inline int smp_rescan_cpus(void) { return 0; } -static inline void cpu_die(void) { } -#endif #endif /* __ASM_SMP_H */ diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 35fafa2b91a8..f00955940694 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -232,8 +232,6 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) return -ENOMEM; } -#ifdef CONFIG_HOTPLUG_CPU - static void pcpu_free_lowcore(struct pcpu *pcpu) { unsigned long async_stack, nodat_stack, lowcore; @@ -253,8 +251,6 @@ static void pcpu_free_lowcore(struct pcpu *pcpu) free_pages(lowcore, LC_ORDER); } -#endif /* CONFIG_HOTPLUG_CPU */ - static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) { struct lowcore *lc = pcpu->lowcore; @@ -895,8 +891,6 @@ static int __init _setup_possible_cpus(char *s) } early_param("possible_cpus", _setup_possible_cpus); -#ifdef CONFIG_HOTPLUG_CPU - int __cpu_disable(void) { unsigned long cregs[16]; @@ -937,8 +931,6 @@ void __noreturn cpu_die(void) for (;;) ; } -#endif /* CONFIG_HOTPLUG_CPU */ - void __init smp_fill_possible_mask(void) { unsigned int possible, sclp_max, cpu; @@ -996,7 +988,6 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } -#ifdef CONFIG_HOTPLUG_CPU static ssize_t cpu_configure_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1073,7 +1064,6 @@ static ssize_t cpu_configure_store(struct device *dev, return rc ? 
rc : count; } static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store); -#endif /* CONFIG_HOTPLUG_CPU */ static ssize_t show_cpu_address(struct device *dev, struct device_attribute *attr, char *buf) @@ -1083,9 +1073,7 @@ static ssize_t show_cpu_address(struct device *dev, static DEVICE_ATTR(address, 0444, show_cpu_address, NULL); static struct attribute *cpu_common_attrs[] = { -#ifdef CONFIG_HOTPLUG_CPU &dev_attr_configure.attr, -#endif &dev_attr_address.attr, NULL, }; @@ -1144,15 +1132,11 @@ static int smp_add_present_cpu(int cpu) out_topology: sysfs_remove_group(&s->kobj, &cpu_common_attr_group); out_cpu: -#ifdef CONFIG_HOTPLUG_CPU unregister_cpu(c); -#endif out: return rc; } -#ifdef CONFIG_HOTPLUG_CPU - int __ref smp_rescan_cpus(void) { struct sclp_core_info *info; @@ -1188,17 +1172,14 @@ static ssize_t __ref rescan_store(struct device *dev, return rc ? rc : count; } static DEVICE_ATTR_WO(rescan); -#endif /* CONFIG_HOTPLUG_CPU */ static int __init s390_smp_init(void) { int cpu, rc = 0; -#ifdef CONFIG_HOTPLUG_CPU rc = device_create_file(cpu_subsys.dev_root, &dev_attr_rescan); if (rc) return rc; -#endif for_each_present_cpu(cpu) { rc = smp_add_present_cpu(cpu); if (rc) From 10400c401754b6bc79839335c9a927a9f352639f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 4 Jun 2019 09:58:57 +0200 Subject: [PATCH 20/83] s390/configs: remove useless UEVENT_HELPER_PATH Remove the CONFIG_UEVENT_HELPER_PATH because: 1. It is disabled since commit 1be01d4a5714 ("driver: base: Disable CONFIG_UEVENT_HELPER by default") as its dependency (UEVENT_HELPER) was made default to 'n', 2. It is not recommended (help message: "This should not be used today [...] creates a high system load") and was kept only for ancient userland, 3. Certain userland specifically requests it to be disabled (systemd README: "Legacy hotplug slows down the system and confuses udev"). 
Signed-off-by: Krzysztof Kozlowski Acked-by: Geert Uytterhoeven Signed-off-by: Heiko Carstens --- arch/s390/configs/defconfig | 1 - arch/s390/configs/zfcpdump_defconfig | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index c59b922cb6c5..18cff2e4607d 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -94,7 +94,6 @@ CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y CONFIG_BPF_JIT=y -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 7dc7f58c4287..d92bab844b73 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -24,7 +24,6 @@ CONFIG_CRASH_DUMP=y # CONFIG_SECCOMP is not set CONFIG_NET=y # CONFIG_IUCV is not set -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_RAM=y # CONFIG_BLK_DEV_XPRAM is not set From bae0aae2f8f971e95182deab11f56a79018ba89b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 4 Jun 2019 17:29:47 +0900 Subject: [PATCH 21/83] s390: fix unrecognized __aligned() in uapi header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __aligned() is a shorthand that is only available in the kernel space because it is defined in include/linux/compiler_attributes.h, which is not exported to the user space. Detected by compile-testing exported headers. 
./usr/include/asm/runtime_instr.h:60:37: error: expected declaration specifiers or ‘...’ before numeric constant } __attribute__((packed)) __aligned(8); ^ Signed-off-by: Masahiro Yamada Signed-off-by: Heiko Carstens --- arch/s390/include/uapi/asm/runtime_instr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/uapi/asm/runtime_instr.h b/arch/s390/include/uapi/asm/runtime_instr.h index 45c9ec984e6b..455da46e3193 100644 --- a/arch/s390/include/uapi/asm/runtime_instr.h +++ b/arch/s390/include/uapi/asm/runtime_instr.h @@ -57,7 +57,7 @@ struct runtime_instr_cb { __u64 sf; __u64 rsic; __u64 reserved8; -} __packed __aligned(8); +} __attribute__((__packed__, __aligned__(8))); static inline void load_runtime_instr_cb(struct runtime_instr_cb *cb) { From e1ab11012e1cc981b43219580e010a20ec8df7d8 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Wed, 5 Jun 2019 07:19:54 +0200 Subject: [PATCH 22/83] s390/zcrypt: support special flagged EP11 cprbs Within an EP11 cprb there exists a byte field flags. Bit 0x20 of this field indicates a special cprb. A special cprb triggers special handling in the firmware below the OS layer. However, a special cprb also needs to have the S bit in GPR0 set when NQAP is called. This was not the case for EP11 cprbs and this patch now introduces the code to support this. 
Signed-off-by: Harald Freudenberger Signed-off-by: Heiko Carstens --- drivers/s390/crypto/zcrypt_msgtype6.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c index 0cbcc238ef98..12fe9deb265e 100644 --- a/drivers/s390/crypto/zcrypt_msgtype6.c +++ b/drivers/s390/crypto/zcrypt_msgtype6.c @@ -567,6 +567,10 @@ static int xcrb_msg_to_type6_ep11cprb_msgx(struct ap_message *ap_msg, payload_hdr = (struct pld_hdr *)((&(msg->pld_lenfmt))+lfmt); *fcode = payload_hdr->func_val & 0xFFFF; + /* enable special processing based on the cprbs flags special bit */ + if (msg->cprbx.flags & 0x20) + ap_msg->special = 1; + return 0; } From 34c636a0c15729ce77ab7b4aa968587098b76577 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 4 Jun 2019 13:51:36 +0200 Subject: [PATCH 23/83] s390/cio: fix kdoc for tiqdio_thinint_handler Add missing parameter description to fix the following warning: drivers/s390/cio/qdio_thinint.c:183: warning: Function parameter or member 'floating' not described in 'tiqdio_thinint_handler' Signed-off-by: Sebastian Ott Signed-off-by: Heiko Carstens --- drivers/s390/cio/qdio_thinint.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c index 28d59ac2204c..b84ac7ae8a3e 100644 --- a/drivers/s390/cio/qdio_thinint.c +++ b/drivers/s390/cio/qdio_thinint.c @@ -178,6 +178,7 @@ static inline void tiqdio_call_inq_handlers(struct qdio_irq *irq) /** * tiqdio_thinint_handler - thin interrupt handler for qdio * @airq: pointer to adapter interrupt descriptor + * @floating: flag to recognize floating vs. 
directed interrupts (unused) */ static void tiqdio_thinint_handler(struct airq_struct *airq, bool floating) { From b0bb8fbd49af94c946837c4e15d84ff9ff6f0796 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Jun 2019 13:10:51 +0200 Subject: [PATCH 24/83] s390/boot: disable address-of-packed-member warning Get rid of gcc9 warnings like this: arch/s390/boot/ipl_report.c: In function 'find_bootdata_space': arch/s390/boot/ipl_report.c:42:26: warning: taking address of packed member of 'struct ipl_rb_components' may result in an unaligned pointer value [-Waddress-of-packed-member] 42 | for_each_rb_entry(comp, comps) | ^~~~~ This is effectively the s390 variant of commit 20c6c1890455 ("x86/boot: Disable the address-of-packed-member compiler warning"). Reviewed-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- arch/s390/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index de8521fc9de5..e48013cf50a2 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -30,6 +30,7 @@ KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float KBUILD_CFLAGS_DECOMPRESSOR += -fno-asynchronous-unwind-tables KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-option,-ffreestanding) +KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,)) UTS_MACHINE := s390x From 6887560c039f13c21d7fa6df363c4db0f3e12fa2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 6 Jun 2019 16:37:56 +0200 Subject: [PATCH 25/83] s390/jump_label: remove unused structure definition Signed-off-by: Heiko Carstens --- arch/s390/kernel/jump_label.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index e193630a7d2a..ab584e8e3527 100644 
--- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -15,11 +15,6 @@ struct insn { s32 offset; } __packed; -struct insn_args { - struct jump_entry *entry; - enum jump_label_type type; -}; - static void jump_label_make_nop(struct jump_entry *entry, struct insn *insn) { /* brcl 0,offset */ From 04310324c6f482921c071444833e70fe861b73d9 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 3 Jun 2019 07:47:04 +0200 Subject: [PATCH 26/83] s390/qdio: handle PENDING state for QEBSM devices When a CQ-enabled device uses QEBSM for SBAL state inspection, get_buf_states() can return the PENDING state for an Output Queue. get_outbound_buffer_frontier() isn't prepared for this, and any PENDING buffer will permanently stall all further completion processing on this Queue. This isn't a concern for non-QEBSM devices, as get_buf_states() for such devices will manually turn PENDING buffers into EMPTY ones. Fixes: 104ea556ee7f ("qdio: support asynchronous delivery of storage blocks") Signed-off-by: Julian Wiedmann Signed-off-by: Heiko Carstens --- drivers/s390/cio/qdio_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 7b7620de2acd..730c4e68094b 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -736,6 +736,7 @@ static int get_outbound_buffer_frontier(struct qdio_q *q, unsigned int start) switch (state) { case SLSB_P_OUTPUT_EMPTY: + case SLSB_P_OUTPUT_PENDING: /* the adapter got it */ DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "out empty:%1d %02x", q->nr, count); From dc3988f40fdf7a51bd5480c3383372f463e4dfa9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 8 Jun 2019 23:27:15 -0300 Subject: [PATCH 27/83] docs: Debugging390.txt: convert table to ascii artwork The first bit/value table inside the document is very hard to read and won't fit ReST format. Also, some columns aren't properly aligned. 
Convert it to a nice ascii artwork table which makes it easier to read as plain text and is compatible with ReST format parser on Sphinx. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Heiko Carstens --- Documentation/s390/Debugging390.txt | 210 ++++++++++++++++------------ 1 file changed, 120 insertions(+), 90 deletions(-) diff --git a/Documentation/s390/Debugging390.txt b/Documentation/s390/Debugging390.txt index 5ae7f868a007..c35804c238ad 100644 --- a/Documentation/s390/Debugging390.txt +++ b/Documentation/s390/Debugging390.txt @@ -78,96 +78,126 @@ e.g. switching address translation off requires that you have a logical=physical mapping for the address you are currently running at. - Bit Value -s/390 z/Architecture -0 0 Reserved ( must be 0 ) otherwise specification exception occurs. - -1 1 Program Event Recording 1 PER enabled, - PER is used to facilitate debugging e.g. single stepping. - -2-4 2-4 Reserved ( must be 0 ). - -5 5 Dynamic address translation 1=DAT on. - -6 6 Input/Output interrupt Mask - -7 7 External interrupt Mask used primarily for interprocessor - signalling and clock interrupts. - -8-11 8-11 PSW Key used for complex memory protection mechanism - (not used under linux) - -12 12 1 on s/390 0 on z/Architecture - -13 13 Machine Check Mask 1=enable machine check interrupts - -14 14 Wait State. Set this to 1 to stop the processor except for - interrupts and give time to other LPARS. Used in CPU idle in - the kernel to increase overall usage of processor resources. - -15 15 Problem state ( if set to 1 certain instructions are disabled ) - all linux user programs run with this bit 1 - ( useful info for debugging under VM ). - -16-17 16-17 Address Space Control - - 00 Primary Space Mode: - The register CR1 contains the primary address-space control ele- - ment (PASCE), which points to the primary space region/segment - table origin. 
- - 01 Access register mode - - 10 Secondary Space Mode: - The register CR7 contains the secondary address-space control - element (SASCE), which points to the secondary space region or - segment table origin. - - 11 Home Space Mode: - The register CR13 contains the home space address-space control - element (HASCE), which points to the home space region/segment - table origin. - - See "Address Spaces on Linux for s/390 & z/Architecture" below - for more information about address space usage in Linux. - -18-19 18-19 Condition codes (CC) - -20 20 Fixed point overflow mask if 1=FPU exceptions for this event - occur ( normally 0 ) - -21 21 Decimal overflow mask if 1=FPU exceptions for this event occur - ( normally 0 ) - -22 22 Exponent underflow mask if 1=FPU exceptions for this event occur - ( normally 0 ) - -23 23 Significance Mask if 1=FPU exceptions for this event occur - ( normally 0 ) - -24-31 24-30 Reserved Must be 0. - - 31 Extended Addressing Mode - 32 Basic Addressing Mode - Used to set addressing mode - PSW 31 PSW 32 - 0 0 24 bit - 0 1 31 bit - 1 1 64 bit - -32 1=31 bit addressing mode 0=24 bit addressing mode (for backward - compatibility), linux always runs with this bit set to 1 - -33-64 Instruction address. - 33-63 Reserved must be 0 - 64-127 Address - In 24 bits mode bits 64-103=0 bits 104-127 Address - In 31 bits mode bits 64-96=0 bits 97-127 Address - Note: unlike 31 bit mode on s/390 bit 96 must be zero - when loading the address with LPSWE otherwise a - specification exception occurs, LPSW is fully backward - compatible. - ++-------------------------+-------------------------------------------------+ +| Bit | | ++--------+----------------+ Value | +| s/390 | z/Architecture | | ++========+================+=================================================+ +| 0 | 0 | Reserved (must be 0) otherwise specification | +| | | exception occurs. 
| ++--------+----------------+-------------------------------------------------+ +| 1 | 1 | Program Event Recording 1 PER enabled, | +| | | PER is used to facilitate debugging e.g. | +| | | single stepping. | ++--------+----------------+-------------------------------------------------+ +| 2-4 | 2-4 | Reserved (must be 0). | ++--------+----------------+-------------------------------------------------+ +| 5 | 5 | Dynamic address translation 1=DAT on. | ++--------+----------------+-------------------------------------------------+ +| 6 | 6 | Input/Output interrupt Mask | ++--------+----------------+-------------------------------------------------+ +| 7 | 7 | External interrupt Mask used primarily for | +| | | interprocessor signalling and clock interrupts. | ++--------+----------------+-------------------------------------------------+ +| 8-11 | 8-11 | PSW Key used for complex memory protection | +| | | mechanism (not used under linux) | ++--------+----------------+-------------------------------------------------+ +| 12 | 12 | 1 on s/390 0 on z/Architecture | ++--------+----------------+-------------------------------------------------+ +| 13 | 13 | Machine Check Mask 1=enable machine check | +| | | interrupts | ++--------+----------------+-------------------------------------------------+ +| 14 | 14 | Wait State. Set this to 1 to stop the processor | +| | | except for interrupts and give time to other | +| | | LPARS. Used in CPU idle in the kernel to | +| | | increase overall usage of processor resources. | ++--------+----------------+-------------------------------------------------+ +| 15 | 15 | Problem state (if set to 1 certain instructions | +| | | are disabled). 
All linux user programs run with | +| | | this bit 1 (useful info for debugging under VM).| ++--------+----------------+-------------------------------------------------+ +| 16-17 | 16-17 | Address Space Control | +| | | | +| | | 00 Primary Space Mode: | +| | | | +| | | The register CR1 contains the primary | +| | | address-space control element (PASCE), which | +| | | points to the primary space region/segment | +| | | table origin. | +| | | | +| | | 01 Access register mode | +| | | | +| | | 10 Secondary Space Mode: | +| | | | +| | | The register CR7 contains the secondary | +| | | address-space control element (SASCE), which | +| | | points to the secondary space region or | +| | | segment table origin. | +| | | | +| | | 11 Home Space Mode: | +| | | | +| | | The register CR13 contains the home space | +| | | address-space control element (HASCE), which | +| | | points to the home space region/segment | +| | | table origin. | +| | | | +| | | See "Address Spaces on Linux for s/390 & | +| | | z/Architecture" below for more information | +| | | about address space usage in Linux. 
| ++--------+----------------+-------------------------------------------------+ +| 18-19 | 18-19 | Condition codes (CC) | ++--------+----------------+-------------------------------------------------+ +| 20 | 20 | Fixed point overflow mask if 1=FPU exceptions | +| | | for this event occur (normally 0) | ++--------+----------------+-------------------------------------------------+ +| 21 | 21 | Decimal overflow mask if 1=FPU exceptions for | +| | | this event occur (normally 0) | ++--------+----------------+-------------------------------------------------+ +| 22 | 22 | Exponent underflow mask if 1=FPU exceptions | +| | | for this event occur (normally 0) | ++--------+----------------+-------------------------------------------------+ +| 23 | 23 | Significance Mask if 1=FPU exceptions for this | +| | | event occur (normally 0) | ++--------+----------------+-------------------------------------------------+ +| 24-31 | 24-30 | Reserved Must be 0. | +| +----------------+-------------------------------------------------+ +| | 31 | Extended Addressing Mode | +| +----------------+-------------------------------------------------+ +| | 32 | Basic Addressing Mode | +| | | | +| | | Used to set addressing mode | +| | | | +| | | +---------+----------+----------+ | +| | | | PSW 31 | PSW 32 | | | +| | | +---------+----------+----------+ | +| | | | 0 | 0 | 24 bit | | +| | | +---------+----------+----------+ | +| | | | 0 | 1 | 31 bit | | +| | | +---------+----------+----------+ | +| | | | 1 | 1 | 64 bit | | +| | | +---------+----------+----------+ | ++--------+----------------+-------------------------------------------------+ +| 32 | | 1=31 bit addressing mode 0=24 bit addressing | +| | | mode (for backward compatibility), linux | +| | | always runs with this bit set to 1 | ++--------+----------------+-------------------------------------------------+ +| 33-64 | | Instruction address. 
| +| +----------------+-------------------------------------------------+ +| | 33-63 | Reserved must be 0 | +| +----------------+-------------------------------------------------+ +| | 64-127 | Address | +| | | | +| | | - In 24 bits mode bits 64-103=0 bits 104-127 | +| | | Address | +| | | - In 31 bits mode bits 64-96=0 bits 97-127 | +| | | Address | +| | | | +| | | Note: | +| | | unlike 31 bit mode on s/390 bit 96 must be | +| | | zero when loading the address with LPSWE | +| | | otherwise a specification exception occurs, | +| | | LPSW is fully backward compatible. | ++--------+----------------+-------------------------------------------------+ Prefix Page(s) -------------- From 8b4a503d659b32cae8266aeb306f7fd6717e6a53 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 8 Jun 2019 23:27:16 -0300 Subject: [PATCH 28/83] docs: s390: convert docs to ReST and rename to *.rst Convert all text files with s390 documentation to ReST format. Tried to preserve as much as possible the original document format. Still, some of the files required some work in order for it to be visible on both plain text and after being converted to html. The conversion is actually: - add blank lines and indentation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. 
Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Heiko Carstens --- .../admin-guide/kernel-parameters.txt | 4 +- Documentation/driver-api/s390-drivers.rst | 4 +- Documentation/s390/{3270.txt => 3270.rst} | 85 +- Documentation/s390/{cds.txt => cds.rst} | 354 ++- .../s390/{CommonIO => common_io.rst} | 49 +- Documentation/s390/{DASD => dasd.rst} | 33 +- .../{Debugging390.txt => debugging390.rst} | 2391 ++++++++++------- .../{driver-model.txt => driver-model.rst} | 179 +- Documentation/s390/index.rst | 30 + .../s390/{monreader.txt => monreader.rst} | 85 +- Documentation/s390/{qeth.txt => qeth.rst} | 36 +- Documentation/s390/s390dbf.rst | 803 ++++++ Documentation/s390/s390dbf.txt | 667 ----- Documentation/s390/text_files.rst | 11 + .../s390/{vfio-ap.txt => vfio-ap.rst} | 487 ++-- .../s390/{vfio-ccw.txt => vfio-ccw.rst} | 90 +- .../s390/{zfcpdump.txt => zfcpdump.rst} | 2 + MAINTAINERS | 4 +- arch/s390/Kconfig | 4 +- arch/s390/include/asm/debug.h | 4 +- drivers/s390/char/zcore.c | 2 +- 21 files changed, 3090 insertions(+), 2234 deletions(-) rename Documentation/s390/{3270.txt => 3270.rst} (90%) rename Documentation/s390/{cds.txt => cds.rst} (64%) rename Documentation/s390/{CommonIO => common_io.rst} (87%) rename Documentation/s390/{DASD => dasd.rst} (92%) rename Documentation/s390/{Debugging390.txt => debugging390.rst} (53%) rename Documentation/s390/{driver-model.txt => driver-model.rst} (73%) create mode 100644 Documentation/s390/index.rst rename Documentation/s390/{monreader.txt => monreader.rst} (81%) rename Documentation/s390/{qeth.txt => qeth.rst} (62%) create mode 100644 Documentation/s390/s390dbf.rst delete mode 100644 Documentation/s390/s390dbf.txt create mode 100644 Documentation/s390/text_files.rst rename Documentation/s390/{vfio-ap.txt => vfio-ap.rst} (72%) rename Documentation/s390/{vfio-ccw.txt => vfio-ccw.rst} (89%) rename Documentation/s390/{zfcpdump.txt => zfcpdump.rst} (97%) diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..b9b0623be925 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -478,7 +478,7 @@ others). ccw_timeout_log [S390] - See Documentation/s390/CommonIO for details. + See Documentation/s390/common_io.rst for details. cgroup_disable= [KNL] Disable a particular controller Format: {name of the controller(s) to disable} @@ -516,7 +516,7 @@ /selinux/checkreqprot. cio_ignore= [S390] - See Documentation/s390/CommonIO for details. + See Documentation/s390/common_io.rst for details. clk_ignore_unused [CLK] Prevents the clock framework from automatically gating diff --git a/Documentation/driver-api/s390-drivers.rst b/Documentation/driver-api/s390-drivers.rst index 30e6aa7e160b..5158577bc29b 100644 --- a/Documentation/driver-api/s390-drivers.rst +++ b/Documentation/driver-api/s390-drivers.rst @@ -27,7 +27,7 @@ not strictly considered I/O devices. They are considered here as well, although they are not the focus of this document. Some additional information can also be found in the kernel source under -Documentation/s390/driver-model.txt. +Documentation/s390/driver-model.rst. The css bus =========== @@ -38,7 +38,7 @@ into several categories: * Standard I/O subchannels, for use by the system. They have a child device on the ccw bus and are described below. * I/O subchannels bound to the vfio-ccw driver. See - Documentation/s390/vfio-ccw.txt. + Documentation/s390/vfio-ccw.rst. * Message subchannels. No Linux driver currently exists. * CHSC subchannels (at most one). The chsc subchannel driver can be used to send asynchronous chsc commands. 
diff --git a/Documentation/s390/3270.txt b/Documentation/s390/3270.rst similarity index 90% rename from Documentation/s390/3270.txt rename to Documentation/s390/3270.rst index 7c715de99774..e09e77954238 100644 --- a/Documentation/s390/3270.txt +++ b/Documentation/s390/3270.rst @@ -1,13 +1,17 @@ +=============================== IBM 3270 Display System support +=============================== This file describes the driver that supports local channel attachment of IBM 3270 devices. It consists of three sections: + * Introduction * Installation * Operation -INTRODUCTION. +Introduction +============ This paper describes installing and operating 3270 devices under Linux/390. A 3270 device is a block-mode rows-and-columns terminal of @@ -17,12 +21,12 @@ twenty and thirty years ago. You may have 3270s in-house and not know it. If you're using the VM-ESA operating system, define a 3270 to your virtual machine by using the command "DEF GRAF " This paper presumes you will be -defining four 3270s with the CP/CMS commands +defining four 3270s with the CP/CMS commands: - DEF GRAF 620 - DEF GRAF 621 - DEF GRAF 622 - DEF GRAF 623 + - DEF GRAF 620 + - DEF GRAF 621 + - DEF GRAF 622 + - DEF GRAF 623 Your network connection from VM-ESA allows you to use x3270, tn3270, or another 3270 emulator, started from an xterm window on your PC or @@ -34,7 +38,8 @@ This paper covers installation of the driver and operation of a dialed-in x3270. -INSTALLATION. +Installation +============ You install the driver by installing a patch, doing a kernel build, and running the configuration script (config3270.sh, in this directory). @@ -59,13 +64,15 @@ Use #CP TERM CONMODE 3270 to change it to 3270. If you generate only at boot time to a 3270 if it is a 3215. In brief, these are the steps: + 1. Install the tub3270 patch - 2. (If a module) add a line to a file in /etc/modprobe.d/*.conf + 2. (If a module) add a line to a file in `/etc/modprobe.d/*.conf` 3. (If VM) define devices with DEF GRAF 4. Reboot 5. 
Configure To test that everything works, assuming VM and x3270, + 1. Bring up an x3270 window. 2. Use the DIAL command in that window. 3. You should immediately see a Linux login screen. @@ -74,7 +81,8 @@ Here are the installation steps in detail: 1. The 3270 driver is a part of the official Linux kernel source. Build a tree with the kernel source and any necessary - patches. Then do + patches. Then do:: + make oldconfig (If you wish to disable 3215 console support, edit .config; change CONFIG_TN3215's value to "n"; @@ -84,20 +92,22 @@ Here are the installation steps in detail: make modules_install 2. (Perform this step only if you have configured tub3270 as a - module.) Add a line to a file /etc/modprobe.d/*.conf to automatically + module.) Add a line to a file `/etc/modprobe.d/*.conf` to automatically load the driver when it's needed. With this line added, you will see login prompts appear on your 3270s as soon as boot is complete (or with emulated 3270s, as soon as you dial into your vm guest using the command "DIAL "). Since the line-mode major number is - 227, the line to add should be: + 227, the line to add should be:: + alias char-major-227 tub3270 3. Define graphic devices to your vm guest machine, if you haven't already. Define them before you reboot (reipl): - DEFINE GRAF 620 - DEFINE GRAF 621 - DEFINE GRAF 622 - DEFINE GRAF 623 + + - DEFINE GRAF 620 + - DEFINE GRAF 621 + - DEFINE GRAF 622 + - DEFINE GRAF 623 4. Reboot. The reboot process scans hardware devices, including 3270s, and this enables the tub3270 driver once loaded to respond @@ -107,21 +117,23 @@ Here are the installation steps in detail: 5. Run the 3270 configuration script config3270. It is distributed in this same directory, Documentation/s390, as - config3270.sh. Inspect the output script it produces, + config3270.sh. Inspect the output script it produces, /tmp/mkdev3270, and then run that script. 
This will create the necessary character special device files and make the necessary changes to /etc/inittab. Then notify /sbin/init that /etc/inittab has changed, by issuing - the telinit command with the q operand: + the telinit command with the q operand:: + cd Documentation/s390 sh config3270.sh sh /tmp/mkdev3270 telinit q - This should be sufficient for your first time. If your 3270 + This should be sufficient for your first time. If your 3270 configuration has changed and you're reusing config3270, you - should follow these steps: + should follow these steps:: + Change 3270 configuration Reboot Run config3270 and /tmp/mkdev3270 @@ -132,8 +144,10 @@ Here are the testing steps in detail: 1. Bring up an x3270 window, or use an actual hardware 3278 or 3279, or use the 3270 emulator of your choice. You would be running the emulator on your PC or workstation. You would use - the command, for example, + the command, for example:: + x3270 vm-esa-domain-name & + if you wanted a 3278 Model 4 with 43 rows of 80 columns, the default model number. The driver does not take advantage of extended attributes. @@ -144,7 +158,8 @@ Here are the testing steps in detail: 2. Use the DIAL command instead of the LOGIN command to connect to one of the virtual 3270s you defined with the DEF GRAF - commands: + commands:: + dial my-vm-guest-name 3. You should immediately see a login prompt from your @@ -171,14 +186,17 @@ Here are the testing steps in detail: Wrong major number? Wrong minor number? There's your problem! - D. Do you get the message + D. Do you get the message:: + "HCPDIA047E my-vm-guest-name 0620 does not exist"? + If so, you must issue the command "DEF GRAF 620" from your VM 3215 console and then reboot the system. OPERATION. +========== The driver defines three areas on the 3270 screen: the log area, the input area, and the status area. @@ -203,8 +221,10 @@ which indicates no scrolling will occur. 
(If you hit ENTER with "Linux Running" and nothing typed, the application receives a newline.) You may change the scrolling timeout value. For example, the following -command line: +command line:: + echo scrolltime=60 > /proc/tty/driver/tty3270 + changes the scrolling timeout value to 60 sec. Set scrolltime to 0 if you wish to prevent scrolling entirely. @@ -228,7 +248,8 @@ cause an EOF also by typing "^D" and hitting ENTER. No PF key is preassigned to cause a job suspension, but you may cause a job suspension by typing "^Z" and hitting ENTER. You may wish to assign this function to a PF key. To make PF7 cause job suspension, -execute the command: +execute the command:: + echo pf7=^z > /proc/tty/driver/tty3270 If the input you type does not end with the two characters "^n", the @@ -243,8 +264,10 @@ command is entered into the stack only when the input area is not made invisible (such as for password entry) and it is not identical to the current top entry. PF10 rotates backward through the command stack; PF11 rotates forward. You may assign the backward function to any PF -key (or PA key, for that matter), say, PA3, with the command: +key (or PA key, for that matter), say, PA3, with the command:: + echo -e pa3=\\033k > /proc/tty/driver/tty3270 + This assigns the string ESC-k to PA3. Similarly, the string ESC-j performs the forward function. (Rationale: In bash with vi-mode line editing, ESC-k and ESC-j retrieve backward and forward history. @@ -252,15 +275,19 @@ Suggestions welcome.) Is a stack size of twenty commands not to your liking? Change it on the fly. To change to saving the last 100 commands, execute the -command: +command:: + echo recallsize=100 > /proc/tty/driver/tty3270 Have a command you issue frequently? Assign it to a PF or PA key! 
Use -the command - echo pf24="mkdir foobar; cd foobar" > /proc/tty/driver/tty3270 +the command:: + + echo pf24="mkdir foobar; cd foobar" > /proc/tty/driver/tty3270 + to execute the commands mkdir foobar and cd foobar immediately when you hit PF24. Want to see the command line first, before you execute it? -Use the -n option of the echo command: +Use the -n option of the echo command:: + echo -n pf24="mkdir foo; cd foo" > /proc/tty/driver/tty3270 diff --git a/Documentation/s390/cds.txt b/Documentation/s390/cds.rst similarity index 64% rename from Documentation/s390/cds.txt rename to Documentation/s390/cds.rst index 480a78ef5a1e..7006d8209d2e 100644 --- a/Documentation/s390/cds.txt +++ b/Documentation/s390/cds.rst @@ -1,14 +1,18 @@ +=========================== Linux for S/390 and zSeries +=========================== Common Device Support (CDS) Device Driver I/O Support Routines -Authors : Ingo Adlung - Cornelia Huck +Authors: + - Ingo Adlung + - Cornelia Huck Copyright, IBM Corp. 1999-2002 Introduction +============ This document describes the common device support routines for Linux/390. Different than other hardware architectures, ESA/390 has defined a unified @@ -27,18 +31,20 @@ Operation manual (IBM Form. No. SA22-7201). In order to build common device support for ESA/390 I/O interfaces, a functional layer was introduced that provides generic I/O access methods to -the hardware. +the hardware. -The common device support layer comprises the I/O support routines defined -below. Some of them implement common Linux device driver interfaces, while +The common device support layer comprises the I/O support routines defined +below. Some of them implement common Linux device driver interfaces, while some of them are ESA/390 platform specific. Note: -In order to write a driver for S/390, you also need to look into the interface -described in Documentation/s390/driver-model.txt. 
+ In order to write a driver for S/390, you also need to look into the interface + described in Documentation/s390/driver-model.rst. Note for porting drivers from 2.4: + The major changes are: + * The functions use a ccw_device instead of an irq (subchannel). * All drivers must define a ccw_driver (see driver-model.txt) and the associated functions. @@ -57,19 +63,16 @@ The major changes are: ccw_device_get_ciw() get commands from extended sense data. -ccw_device_start() -ccw_device_start_timeout() -ccw_device_start_key() -ccw_device_start_key_timeout() +ccw_device_start(), ccw_device_start_timeout(), ccw_device_start_key(), ccw_device_start_key_timeout() initiate an I/O request. ccw_device_resume() resume channel program execution. -ccw_device_halt() +ccw_device_halt() terminate the current I/O request processed on the device. -do_IRQ() +do_IRQ() generic interrupt routine. This function is called by the interrupt entry routine whenever an I/O interrupt is presented to the system. The do_IRQ() routine determines the interrupt status and calls the device specific @@ -82,12 +85,15 @@ first level interrupt handler only and does not comprise a device driver callable interface. Instead, the functional description of do_IO() also describes the input to the device specific interrupt handler. -Note: All explanations apply also to the 64 bit architecture s390x. +Note: + All explanations apply also to the 64 bit architecture s390x. Common Device Support (CDS) for Linux/390 Device Drivers +======================================================== General Information +------------------- The following chapters describe the I/O related interface routines the Linux/390 common device support (CDS) provides to allow for device specific @@ -101,6 +107,7 @@ can be found in the architecture specific C header file linux/arch/s390/include/asm/irq.h. 
Overview of CDS interface concepts +---------------------------------- Different to other hardware platforms, the ESA/390 architecture doesn't define interrupt lines managed by a specific interrupt controller and bus systems @@ -126,7 +133,7 @@ has to call every single device driver registered on this IRQ in order to determine the device driver owning the device that raised the interrupt. Up to kernel 2.4, Linux/390 used to provide interfaces via the IRQ (subchannel). -For internal use of the common I/O layer, these are still there. However, +For internal use of the common I/O layer, these are still there. However, device drivers should use the new calling interface via the ccw_device only. During its startup the Linux/390 system checks for peripheral devices. Each @@ -134,7 +141,7 @@ of those devices is uniquely defined by a so called subchannel by the ESA/390 channel subsystem. While the subchannel numbers are system generated, each subchannel also takes a user defined attribute, the so called device number. Both subchannel number and device number cannot exceed 65535. During sysfs -initialisation, the information about control unit type and device types that +initialisation, the information about control unit type and device types that imply specific I/O commands (channel command words - CCWs) in order to operate the device are gathered. Device drivers can retrieve this set of hardware information during their initialization step to recognize the devices they @@ -164,18 +171,26 @@ get_ciw() - get command information word This call enables a device driver to get information about supported commands from the extended SenseID data. -struct ciw * -ccw_device_get_ciw(struct ccw_device *cdev, __u32 cmd); +:: -cdev - The ccw_device for which the command is to be retrieved. -cmd - The command type to be retrieved. 
+ struct ciw * + ccw_device_get_ciw(struct ccw_device *cdev, __u32 cmd); + +==== ======================================================== +cdev The ccw_device for which the command is to be retrieved. +cmd The command type to be retrieved. +==== ======================================================== ccw_device_get_ciw() returns: -NULL - No extended data available, invalid device or command not found. -!NULL - The command requested. +===== ================================================================ + NULL No extended data available, invalid device or command not found. +!NULL The command requested. +===== ================================================================ -ccw_device_start() - Initiate I/O Request +:: + + ccw_device_start() - Initiate I/O Request The ccw_device_start() routines is the I/O request front-end processor. All device driver I/O requests must be issued using this routine. A device driver @@ -186,93 +201,105 @@ This description also covers the status information passed to the device driver's interrupt handler as this is related to the rules (flags) defined with the associated I/O request when calling ccw_device_start(). -int ccw_device_start(struct ccw_device *cdev, - struct ccw1 *cpa, - unsigned long intparm, - __u8 lpm, - unsigned long flags); -int ccw_device_start_timeout(struct ccw_device *cdev, - struct ccw1 *cpa, - unsigned long intparm, - __u8 lpm, - unsigned long flags, - int expires); -int ccw_device_start_key(struct ccw_device *cdev, - struct ccw1 *cpa, - unsigned long intparm, - __u8 lpm, - __u8 key, - unsigned long flags); -int ccw_device_start_key_timeout(struct ccw_device *cdev, - struct ccw1 *cpa, - unsigned long intparm, - __u8 lpm, - __u8 key, - unsigned long flags, - int expires); +:: -cdev : ccw_device the I/O is destined for -cpa : logical start address of channel program -user_intparm : user specific interrupt information; will be presented - back to the device driver's interrupt handler. 
Allows a - device driver to associate the interrupt with a - particular I/O request. -lpm : defines the channel path to be used for a specific I/O - request. A value of 0 will make cio use the opm. -key : the storage key to use for the I/O (useful for operating on a - storage with a storage key != default key) -flag : defines the action to be performed for I/O processing -expires : timeout value in jiffies. The common I/O layer will terminate - the running program after this and call the interrupt handler - with ERR_PTR(-ETIMEDOUT) as irb. + int ccw_device_start(struct ccw_device *cdev, + struct ccw1 *cpa, + unsigned long intparm, + __u8 lpm, + unsigned long flags); + int ccw_device_start_timeout(struct ccw_device *cdev, + struct ccw1 *cpa, + unsigned long intparm, + __u8 lpm, + unsigned long flags, + int expires); + int ccw_device_start_key(struct ccw_device *cdev, + struct ccw1 *cpa, + unsigned long intparm, + __u8 lpm, + __u8 key, + unsigned long flags); + int ccw_device_start_key_timeout(struct ccw_device *cdev, + struct ccw1 *cpa, + unsigned long intparm, + __u8 lpm, + __u8 key, + unsigned long flags, + int expires); -Possible flag values are : +============= ============================================================= +cdev ccw_device the I/O is destined for +cpa logical start address of channel program +user_intparm user specific interrupt information; will be presented + back to the device driver's interrupt handler. Allows a + device driver to associate the interrupt with a + particular I/O request. +lpm defines the channel path to be used for a specific I/O + request. A value of 0 will make cio use the opm. +key the storage key to use for the I/O (useful for operating on a + storage with a storage key != default key) +flag defines the action to be performed for I/O processing +expires timeout value in jiffies. The common I/O layer will terminate + the running program after this and call the interrupt handler + with ERR_PTR(-ETIMEDOUT) as irb. 
+============= ============================================================= -DOIO_ALLOW_SUSPEND - channel program may become suspended -DOIO_DENY_PREFETCH - don't allow for CCW prefetch; usually - this implies the channel program might - become modified -DOIO_SUPPRESS_INTER - don't call the handler on intermediate status +Possible flag values are: -The cpa parameter points to the first format 1 CCW of a channel program : +========================= ============================================= +DOIO_ALLOW_SUSPEND channel program may become suspended +DOIO_DENY_PREFETCH don't allow for CCW prefetch; usually + this implies the channel program might + become modified +DOIO_SUPPRESS_INTER don't call the handler on intermediate status +========================= ============================================= -struct ccw1 { - __u8 cmd_code;/* command code */ - __u8 flags; /* flags, like IDA addressing, etc. */ - __u16 count; /* byte count */ - __u32 cda; /* data address */ -} __attribute__ ((packed,aligned(8))); +The cpa parameter points to the first format 1 CCW of a channel program:: -with the following CCW flags values defined : + struct ccw1 { + __u8 cmd_code;/* command code */ + __u8 flags; /* flags, like IDA addressing, etc. 
*/ + __u16 count; /* byte count */ + __u32 cda; /* data address */ + } __attribute__ ((packed,aligned(8))); -CCW_FLAG_DC - data chaining -CCW_FLAG_CC - command chaining -CCW_FLAG_SLI - suppress incorrect length -CCW_FLAG_SKIP - skip -CCW_FLAG_PCI - PCI -CCW_FLAG_IDA - indirect addressing -CCW_FLAG_SUSPEND - suspend +with the following CCW flags values defined: + +=================== ========================= +CCW_FLAG_DC data chaining +CCW_FLAG_CC command chaining +CCW_FLAG_SLI suppress incorrect length +CCW_FLAG_SKIP skip +CCW_FLAG_PCI PCI +CCW_FLAG_IDA indirect addressing +CCW_FLAG_SUSPEND suspend +=================== ========================= Via ccw_device_set_options(), the device driver may specify the following options for the device: -DOIO_EARLY_NOTIFICATION - allow for early interrupt notification -DOIO_REPORT_ALL - report all interrupt conditions +========================= ====================================== +DOIO_EARLY_NOTIFICATION allow for early interrupt notification +DOIO_REPORT_ALL report all interrupt conditions +========================= ====================================== -The ccw_device_start() function returns : +The ccw_device_start() function returns: - 0 - successful completion or request successfully initiated --EBUSY - The device is currently processing a previous I/O request, or there is - a status pending at the device. --ENODEV - cdev is invalid, the device is not operational or the ccw_device is - not online. +======== ====================================================================== + 0 successful completion or request successfully initiated + -EBUSY The device is currently processing a previous I/O request, or there is + a status pending at the device. +-ENODEV cdev is invalid, the device is not operational or the ccw_device is + not online. 
+======== ====================================================================== When the I/O request completes, the CDS first level interrupt handler will accumulate the status in a struct irb and then call the device interrupt handler. -The intparm field will contain the value the device driver has associated with a -particular I/O request. If a pending device status was recognized, +The intparm field will contain the value the device driver has associated with a +particular I/O request. If a pending device status was recognized, intparm will be set to 0 (zero). This may happen during I/O initiation or delayed by an alert status notification. In any case this status is not related to the current (last) I/O request. In case of a delayed status notification no special @@ -282,9 +309,11 @@ never started, even though ccw_device_start() returned with successful completio The irb may contain an error value, and the device driver should check for this first: --ETIMEDOUT: the common I/O layer terminated the request after the specified - timeout value --EIO: the common I/O layer terminated the request due to an error state +========== ================================================================= +-ETIMEDOUT the common I/O layer terminated the request after the specified + timeout value +-EIO the common I/O layer terminated the request due to an error state +========== ================================================================= If the concurrent sense flag in the extended status word (esw) in the irb is set, the field erw.scnt in the esw describes the number of device specific @@ -294,6 +323,7 @@ sensing by the device driver itself is required. 
The device interrupt handler can use the following definitions to investigate the primary unit check source coded in sense byte 0 : +======================= ==== SNS0_CMD_REJECT 0x80 SNS0_INTERVENTION_REQ 0x40 SNS0_BUS_OUT_CHECK 0x20 @@ -301,36 +331,41 @@ SNS0_EQUIPMENT_CHECK 0x10 SNS0_DATA_CHECK 0x08 SNS0_OVERRUN 0x04 SNS0_INCOMPL_DOMAIN 0x01 +======================= ==== Depending on the device status, multiple of those values may be set together. Please refer to the device specific documentation for details. The irb->scsw.cstat field provides the (accumulated) subchannel status : -SCHN_STAT_PCI - program controlled interrupt -SCHN_STAT_INCORR_LEN - incorrect length -SCHN_STAT_PROG_CHECK - program check -SCHN_STAT_PROT_CHECK - protection check -SCHN_STAT_CHN_DATA_CHK - channel data check -SCHN_STAT_CHN_CTRL_CHK - channel control check -SCHN_STAT_INTF_CTRL_CHK - interface control check -SCHN_STAT_CHAIN_CHECK - chaining check +========================= ============================ +SCHN_STAT_PCI program controlled interrupt +SCHN_STAT_INCORR_LEN incorrect length +SCHN_STAT_PROG_CHECK program check +SCHN_STAT_PROT_CHECK protection check +SCHN_STAT_CHN_DATA_CHK channel data check +SCHN_STAT_CHN_CTRL_CHK channel control check +SCHN_STAT_INTF_CTRL_CHK interface control check +SCHN_STAT_CHAIN_CHECK chaining check +========================= ============================ The irb->scsw.dstat field provides the (accumulated) device status : -DEV_STAT_ATTENTION - attention -DEV_STAT_STAT_MOD - status modifier -DEV_STAT_CU_END - control unit end -DEV_STAT_BUSY - busy -DEV_STAT_CHN_END - channel end -DEV_STAT_DEV_END - device end -DEV_STAT_UNIT_CHECK - unit check -DEV_STAT_UNIT_EXCEP - unit exception +===================== ================= +DEV_STAT_ATTENTION attention +DEV_STAT_STAT_MOD status modifier +DEV_STAT_CU_END control unit end +DEV_STAT_BUSY busy +DEV_STAT_CHN_END channel end +DEV_STAT_DEV_END device end +DEV_STAT_UNIT_CHECK unit check +DEV_STAT_UNIT_EXCEP unit 
exception +===================== ================= Please see the ESA/390 Principles of Operation manual for details on the individual flag meanings. -Usage Notes : +Usage Notes: ccw_device_start() must be called disabled and with the ccw device lock held. @@ -374,32 +409,39 @@ secondary status without error (alert status) is presented, this indicates successful completion for all overlapping ccw_device_start() requests that have been issued since the last secondary (final) status. -Channel programs that intend to set the suspend flag on a channel command word -(CCW) must start the I/O operation with the DOIO_ALLOW_SUSPEND option or the -suspend flag will cause a channel program check. At the time the channel program -becomes suspended an intermediate interrupt will be generated by the channel +Channel programs that intend to set the suspend flag on a channel command word +(CCW) must start the I/O operation with the DOIO_ALLOW_SUSPEND option or the +suspend flag will cause a channel program check. At the time the channel program +becomes suspended an intermediate interrupt will be generated by the channel subsystem. -ccw_device_resume() - Resume Channel Program Execution +ccw_device_resume() - Resume Channel Program Execution -If a device driver chooses to suspend the current channel program execution by -setting the CCW suspend flag on a particular CCW, the channel program execution -is suspended. In order to resume channel program execution the CIO layer -provides the ccw_device_resume() routine. +If a device driver chooses to suspend the current channel program execution by +setting the CCW suspend flag on a particular CCW, the channel program execution +is suspended. In order to resume channel program execution the CIO layer +provides the ccw_device_resume() routine. 
-int ccw_device_resume(struct ccw_device *cdev); +:: -cdev - ccw_device the resume operation is requested for + int ccw_device_resume(struct ccw_device *cdev); + +==== ================================================ +cdev ccw_device the resume operation is requested for +==== ================================================ The ccw_device_resume() function returns: - 0 - suspended channel program is resumed --EBUSY - status pending --ENODEV - cdev invalid or not-operational subchannel --EINVAL - resume function not applicable --ENOTCONN - there is no I/O request pending for completion +========= ============================================== + 0 suspended channel program is resumed + -EBUSY status pending + -ENODEV cdev invalid or not-operational subchannel + -EINVAL resume function not applicable +-ENOTCONN there is no I/O request pending for completion +========= ============================================== Usage Notes: + Please have a look at the ccw_device_start() usage notes for more details on suspended channel programs. @@ -412,22 +454,28 @@ command is provided. ccw_device_halt() must be called disabled and with the ccw device lock held. -int ccw_device_halt(struct ccw_device *cdev, - unsigned long intparm); +:: -cdev : ccw_device the halt operation is requested for -intparm : interruption parameter; value is only used if no I/O - is outstanding, otherwise the intparm associated with - the I/O request is returned + int ccw_device_halt(struct ccw_device *cdev, + unsigned long intparm); -The ccw_device_halt() function returns : +======= ===================================================== +cdev ccw_device the halt operation is requested for +intparm interruption parameter; value is only used if no I/O + is outstanding, otherwise the intparm associated with + the I/O request is returned +======= ===================================================== - 0 - request successfully initiated --EBUSY - the device is currently busy, or status pending. 
--ENODEV - cdev invalid. --EINVAL - The device is not operational or the ccw device is not online. +The ccw_device_halt() function returns: -Usage Notes : +======= ============================================================== + 0 request successfully initiated +-EBUSY the device is currently busy, or status pending. +-ENODEV cdev invalid. +-EINVAL The device is not operational or the ccw device is not online. +======= ============================================================== + +Usage Notes: A device driver may write a never-ending channel program by writing a channel program that at its end loops back to its beginning by means of a transfer in @@ -438,25 +486,34 @@ can then perform an appropriate action. Prior to interrupt of an outstanding read to a network device (with or without PCI flag) a ccw_device_halt() is required to end the pending operation. -ccw_device_clear() - Terminage I/O Request Processing +:: + + ccw_device_clear() - Terminate I/O Request Processing In order to terminate all I/O processing at the subchannel, the clear subchannel (CSCH) command is used. It can be issued via ccw_device_clear(). ccw_device_clear() must be called disabled and with the ccw device lock held. -int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm); +:: -cdev: ccw_device the clear operation is requested for -intparm: interruption parameter (see ccw_device_halt()) + int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm); + +======= =============================================== +cdev ccw_device the clear operation is requested for +intparm interruption parameter (see ccw_device_halt()) +======= =============================================== The ccw_device_clear() function returns: - 0 - request successfully initiated --ENODEV - cdev invalid --EINVAL - The device is not operational or the ccw device is not online.
+======= ============================================================== + 0 request successfully initiated +-ENODEV cdev invalid +-EINVAL The device is not operational or the ccw device is not online. +======= ============================================================== Miscellaneous Support Routines +------------------------------ This chapter describes various routines to be used in a Linux/390 device driver programming environment. @@ -466,7 +523,8 @@ get_ccwdev_lock() Get the address of the device specific lock. This is then used in spin_lock() / spin_unlock() calls. +:: -__u8 ccw_device_get_path_mask(struct ccw_device *cdev); + __u8 ccw_device_get_path_mask(struct ccw_device *cdev); Get the mask of the path currently available for cdev. diff --git a/Documentation/s390/CommonIO b/Documentation/s390/common_io.rst similarity index 87% rename from Documentation/s390/CommonIO rename to Documentation/s390/common_io.rst index 6e0f63f343b4..846485681ce7 100644 --- a/Documentation/s390/CommonIO +++ b/Documentation/s390/common_io.rst @@ -1,5 +1,9 @@ -S/390 common I/O-Layer - command line parameters, procfs and debugfs entries -============================================================================ +====================== +S/390 common I/O-Layer +====================== + +command line parameters, procfs and debugfs entries +=================================================== Command line parameters ----------------------- @@ -13,7 +17,7 @@ Command line parameters device := {all | [!]ipldev | [!]condev | [!] | [!]-} The given devices will be ignored by the common I/O-layer; no detection - and device sensing will be done on any of those devices. The subchannel to + and device sensing will be done on any of those devices. The subchannel to which the device in question is attached will be treated as if no device was attached. 
@@ -28,14 +32,20 @@ Command line parameters keywords can be used to refer to the CCW based boot device and CCW console device respectively (these are probably useful only when combined with the '!' operator). The '!' operator will cause the I/O-layer to _not_ ignore a device. - The command line is parsed from left to right. + The command line + is parsed from left to right. + + For example:: - For example, cio_ignore=0.0.0023-0.0.0042,0.0.4711 + will ignore all devices ranging from 0.0.0023 to 0.0.0042 and the device 0.0.4711, if detected. - As another example, + + As another example:: + cio_ignore=all,!0.0.4711,!0.0.fd00-0.0.fd02 + will ignore all devices but 0.0.4711, 0.0.fd00, 0.0.fd01, 0.0.fd02. By default, no devices are ignored. @@ -48,40 +58,45 @@ Command line parameters Lists the ranges of devices (by bus id) which are ignored by common I/O. - You can un-ignore certain or all devices by piping to /proc/cio_ignore. - "free all" will un-ignore all ignored devices, + You can un-ignore certain or all devices by piping to /proc/cio_ignore. + "free all" will un-ignore all ignored devices, "free , , ..." will un-ignore the specified devices. For example, if devices 0.0.0023 to 0.0.0042 and 0.0.4711 are ignored, + - echo free 0.0.0030-0.0.0032 > /proc/cio_ignore will un-ignore devices 0.0.0030 to 0.0.0032 and will leave devices 0.0.0023 to 0.0.002f, 0.0.0033 to 0.0.0042 and 0.0.4711 ignored; - echo free 0.0.0041 > /proc/cio_ignore will furthermore un-ignore device 0.0.0041; - - echo free all > /proc/cio_ignore will un-ignore all remaining ignored + - echo free all > /proc/cio_ignore will un-ignore all remaining ignored devices. - When a device is un-ignored, device recognition and sensing is performed and + When a device is un-ignored, device recognition and sensing is performed and the device driver will be notified if possible, so the device will become available to the system. Note that un-ignoring is performed asynchronously. 
- You can also add ranges of devices to be ignored by piping to + You can also add ranges of devices to be ignored by piping to /proc/cio_ignore; "add , , ..." will ignore the specified devices. Note: While already known devices can be added to the list of devices to be - ignored, there will be no effect on then. However, if such a device + ignored, there will be no effect on them. However, if such a device disappears and then reappears, it will then be ignored. To make known devices go away, you need the "purge" command (see below). - For example, + For example:: + "echo add 0.0.a000-0.0.accc, 0.0.af00-0.0.afff > /proc/cio_ignore" + will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored devices. - You can remove already known but now ignored devices via + You can remove already known but now ignored devices via:: + "echo purge > /proc/cio_ignore" + All devices ignored but still registered and not online (= not in use) will be deregistered and thus removed from the system. @@ -115,11 +130,11 @@ debugfs entries Various debug messages from the common I/O-layer. - /sys/kernel/debug/s390dbf/cio_trace/hex_ascii - Logs the calling of functions in the common I/O-layer and, if applicable, + Logs the calling of functions in the common I/O-layer and, if applicable, which subchannel they were called for, as well as dumps of some data structures (like irb in an error case). - The level of logging can be changed to be more or less verbose by piping to + The level of logging can be changed to be more or less verbose by piping to /sys/kernel/debug/s390dbf/cio_*/level a number between 0 and 6; see the - documentation on the S/390 debug feature (Documentation/s390/s390dbf.txt) + documentation on the S/390 debug feature (Documentation/s390/s390dbf.rst) for details.
diff --git a/Documentation/s390/DASD b/Documentation/s390/dasd.rst similarity index 92% rename from Documentation/s390/DASD rename to Documentation/s390/dasd.rst index 9963f1e9c98a..9e22247285c8 100644 --- a/Documentation/s390/DASD +++ b/Documentation/s390/dasd.rst @@ -1,4 +1,6 @@ +================== DASD device driver +================== S/390's disk devices (DASDs) are managed by Linux via the DASD device driver. It is valid for all types of DASDs and represents them to @@ -14,14 +16,14 @@ parameters are to be given in hexadecimal notation without a leading If you supply kernel parameters the different instances are processed in order of appearance and a minor number is reserved for any device covered by the supplied range up to 64 volumes. Additional DASDs are -ignored. If you do not supply the 'dasd=' kernel parameter at all, the +ignored. If you do not supply the 'dasd=' kernel parameter at all, the DASD driver registers all supported DASDs of your system to a minor number in ascending order of the subchannel number. The driver currently supports ECKD-devices and there are stubs for support of the FBA and CKD architectures. For the FBA architecture only some smart data structures are missing to make the support -complete. +complete. We performed our testing on 3380 and 3390 type disks of different sizes, under VM and on the bare hardware (LPAR), using internal disks of the multiprise as well as a RAMAC virtual array. Disks exported by @@ -34,19 +36,22 @@ accessibility of the DASD from other OSs. In a later stage we will provide support of partitions, maybe VTOC oriented or using a kind of partition table in the label record. -USAGE +Usage +===== -Low-level format (?CKD only) For using an ECKD-DASD as a Linux harddisk you have to low-level format the tracks by issuing the BLKDASDFORMAT-ioctl on that device. This will erase any data on that volume including IBM volume -labels, VTOCs etc. The ioctl may take a 'struct format_data *' or -'NULL' as an argument. 
-typedef struct { +labels, VTOCs etc. The ioctl may take a `struct format_data *` or +'NULL' as an argument:: + + typedef struct { int start_unit; int stop_unit; int blksize; -} format_data_t; + } format_data_t; + When a NULL argument is passed to the BLKDASDFORMAT ioctl the whole disk is formatted to a blocksize of 1024 bytes. Otherwise start_unit and stop_unit are the first and last track to be formatted. If @@ -56,17 +61,23 @@ up to the last track. blksize can be any power of two between 512 and 1kB blocks anyway and you gain approx. 50% of capacity increasing your blksize from 512 byte to 1kB. --Make a filesystem +Make a filesystem +================= + Then you can mk??fs the filesystem of your choice on that volume or partition. For reasons of sanity you should build your filesystem on -the partition /dev/dd?1 instead of the whole volume. You only lose 3kB +the partition /dev/dd?1 instead of the whole volume. You only lose 3kB but may be sure that you can reuse your data after introduction of a real partition table. 
-BUGS: +Bugs +==== + - Performance sometimes is rather low because we don't fully exploit clustering -TODO-List: +TODO-List +========= + - Add IBM'S Disk layout to genhd - Enhance driver to use more than one major number - Enable usage as a module diff --git a/Documentation/s390/Debugging390.txt b/Documentation/s390/debugging390.rst similarity index 53% rename from Documentation/s390/Debugging390.txt rename to Documentation/s390/debugging390.rst index c35804c238ad..d49305fd5e1a 100644 --- a/Documentation/s390/Debugging390.txt +++ b/Documentation/s390/debugging390.rst @@ -1,9 +1,12 @@ +============================================= +Debugging on Linux for s/390 & z/Architecture +============================================= - Debugging on Linux for s/390 & z/Architecture - by - Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) - Copyright (C) 2000-2001 IBM Deutschland Entwicklung GmbH, IBM Corporation - Best viewed with fixed width fonts +Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) + +Copyright (C) 2000-2001 IBM Deutschland Entwicklung GmbH, IBM Corporation + +.. Best viewed with fixed width fonts Overview of Document: ===================== @@ -17,32 +20,32 @@ It is intended like the Enterprise Systems Architecture/390 Reference Summary to be printed out & used as a quick cheat sheet self help style reference when problems occur. -Contents -======== -Register Set -Address Spaces on Intel Linux -Address Spaces on Linux for s/390 & z/Architecture -The Linux for s/390 & z/Architecture Kernel Task Structure -Register Usage & Stackframes on Linux for s/390 & z/Architecture -A sample program with comments -Compiling programs for debugging on Linux for s/390 & z/Architecture -Debugging under VM -s/390 & z/Architecture IO Overview -Debugging IO on s/390 & z/Architecture under VM -GDB on s/390 & z/Architecture -Stack chaining in gdb by hand -Examining core dumps -ldd -Debugging modules -The proc file system -SysRq -References -Special Thanks +.. 
Contents + ======== + Register Set + Address Spaces on Intel Linux + Address Spaces on Linux for s/390 & z/Architecture + The Linux for s/390 & z/Architecture Kernel Task Structure + Register Usage & Stackframes on Linux for s/390 & z/Architecture + A sample program with comments + Compiling programs for debugging on Linux for s/390 & z/Architecture + Debugging under VM + s/390 & z/Architecture IO Overview + Debugging IO on s/390 & z/Architecture under VM + GDB on s/390 & z/Architecture + Stack chaining in gdb by hand + Examining core dumps + ldd + Debugging modules + The proc file system + SysRq + References + Special Thanks Register Set ============ The current architectures have the following registers. - + 16 General propose registers, 32 bit on s/390 and 64 bit on z/Architecture, r0-r15 (or gpr0-gpr15), used for arithmetic and addressing. @@ -59,20 +62,22 @@ Access register 0 (and access register 1 on z/Architecture, which needs a 64 bit pointer) is currently used by the pthread library as a pointer to the current running threads private area. -16 64 bit floating point registers (fp0-fp15 ) IEEE & HFP floating -point format compliant on G5 upwards & a Floating point control reg (FPC) -4 64 bit registers (fp0,fp2,fp4 & fp6) HFP only on older machines. +16 64-bit floating point registers (fp0-fp15 ) IEEE & HFP floating +point format compliant on G5 upwards & a Floating point control reg (FPC) + +4 64-bit registers (fp0,fp2,fp4 & fp6) HFP only on older machines. + Note: -Linux (currently) always uses IEEE & emulates G5 IEEE format on older machines, -( provided the kernel is configured for this ). + Linux (currently) always uses IEEE & emulates G5 IEEE format on older + machines, ( provided the kernel is configured for this ). 
The PSW is the most important register on the machine it -is 64 bit on s/390 & 128 bit on z/Architecture & serves the roles of +is 64 bit on s/390 & 128 bit on z/Architecture & serves the roles of a program counter (pc), condition code register,memory space designator. In IBM standard notation I am counting bit 0 as the MSB. It has several advantages over a normal program counter -in that you can change address translation & program counter +in that you can change address translation & program counter in a single instruction. To change address translation, e.g. switching address translation off requires that you have a logical=physical mapping for the address you are @@ -206,14 +211,18 @@ It exists between the real addresses 0-4096 on s/390 and between 0-8192 on z/Architecture and is exchanged with one page on s/390 or two pages on z/Architecture in absolute storage by the set prefix instruction during Linux startup. + This page is mapped to a different prefix for each processor in an SMP configuration (assuming the OS designer is sane of course). + Bytes 0-512 (200 hex) on s/390 and 0-512, 4096-4544, 4604-5119 currently on z/Architecture are used by the processor itself for holding such information as exception indications and entry points for exceptions. + Bytes after 0xc00 hex are used by linux for per processor globals on s/390 and z/Architecture (there is a gap on z/Architecture currently between 0xc00 and 0x1000, too, which is used by Linux). + The closest thing to this on traditional architectures is the interrupt vector table. This is a good thing & does simplify some of the kernel coding however it means that we now cannot catch stray NULL pointers in the @@ -225,27 +234,29 @@ Address Spaces on Intel Linux ============================= The traditional Intel Linux is approximately mapped as follows forgive -the ascii art. 
-0xFFFFFFFF 4GB Himem ***************** - * * - * Kernel Space * - * * - ***************** **************** -User Space Himem * User Stack * * * -(typically 0xC0000000 3GB ) ***************** * * - * Shared Libs * * Next Process * - ***************** * to * - * * <== * Run * <== - * User Program * * * - * Data BSS * * * - * Text * * * - * Sections * * * -0x00000000 ***************** **************** +the ascii art:: + + 0xFFFFFFFF 4GB Himem ***************** + * * + * Kernel Space * + * * + ***************** **************** + User Space Himem * User Stack * * * + (typically 0xC0000000 3GB ) ***************** * * + * Shared Libs * * Next Process * + ***************** * to * + * * <== * Run * <== + * User Program * * * + * Data BSS * * * + * Text * * * + * Sections * * * + 0x00000000 ***************** **************** Now it is easy to see that on Intel it is quite easy to recognise a kernel address as being one greater than user space himem (in this case 0xC0000000), and addresses of less than this are the ones in the current running program on this processor (if an smp box). + If using the virtual machine ( VM ) as a debugger it is quite difficult to know which user process is running as the address space you are looking at could be from any process in the run queue. @@ -256,6 +267,7 @@ of Real Address=Virtual Address-User Space Himem. This means that on Intel the kernel linux can typically only address Himem=0xFFFFFFFF-0xC0000000=1GB & this is all the RAM these machines can typically use. + They can lower User Himem to 2GB or lower & thus be able to use 2GB of RAM however this shrinks the maximum size of User Space from 3GB to 2GB they have a no win limit of 4GB unless @@ -264,31 +276,31 @@ they go to 64 Bit. On 390 our limitations & strengths make us slightly different. 
For backward compatibility we are only allowed use 31 bits (2GB) -of our 32 bit addresses, however, we use entirely separate address +of our 32 bit addresses, however, we use entirely separate address spaces for the user & kernel. This means we can support 2GB of non Extended RAM on s/390, & more -with the Extended memory management swap device & +with the Extended memory management swap device & currently 4TB of physical memory currently on z/Architecture. Address Spaces on Linux for s/390 & z/Architecture ================================================== -Our addressing scheme is basically as follows: +Our addressing scheme is basically as follows:: - Primary Space Home Space -Himem 0x7fffffff 2GB on s/390 ***************** **************** -currently 0x3ffffffffff (2^42)-1 * User Stack * * * -on z/Architecture. ***************** * * - * Shared Libs * * * - ***************** * * - * * * Kernel * - * User Program * * * - * Data BSS * * * - * Text * * * - * Sections * * * -0x00000000 ***************** **************** + Primary Space Home Space + Himem 0x7fffffff 2GB on s/390 ***************** **************** + currently 0x3ffffffffff (2^42)-1 * User Stack * * * + on z/Architecture. ***************** * * + * Shared Libs * * * + ***************** * * + * * * Kernel * + * User Program * * * + * Data BSS * * * + * Text * * * + * Sections * * * + 0x00000000 ***************** **************** This also means that we need to look at the PSW problem state bit and the addressing mode to decide whether we are looking at user or kernel space. @@ -304,20 +316,25 @@ instruction on a user space address is performed. 
When also looking at the ASCE control registers, this means: User space: + - runs in primary or access register mode - cr1 contains the user asce - cr7 contains the user asce - cr13 contains the kernel asce Kernel space: + - runs in home space mode - cr1 contains the user or kernel asce - -> the kernel asce is loaded when a uaccess requires primary or - secondary address mode + + - the kernel asce is loaded when a uaccess requires primary or + secondary address mode + - cr7 contains the user or kernel asce, (changed with set_fs()) - cr13 contains the kernel asce In case of uaccess the kernel changes to: + - primary space mode in case of a uaccess (copy_to_user) and uses e.g. the mvcp instruction to access user space. However the kernel will stay in home space mode if the mvcos instruction is available @@ -337,41 +354,44 @@ Virtual Addresses on s/390 & z/Architecture A virtual address on s/390 is made up of 3 parts The SX (segment index, roughly corresponding to the PGD & PMD in Linux terminology) being bits 1-11. + The PX (page index, corresponding to the page table entry (pte) in Linux terminology) being bits 12-19. + The remaining bits BX (the byte index are the offset in the page ) i.e. bits 20 to 31. On z/Architecture in linux we currently make up an address from 4 parts. -The region index bits (RX) 0-32 we currently use bits 22-32 -The segment index (SX) being bits 33-43 -The page index (PX) being bits 44-51 -The byte index (BX) being bits 52-63 + +- The region index bits (RX) 0-32 we currently use bits 22-32 +- The segment index (SX) being bits 33-43 +- The page index (PX) being bits 44-51 +- The byte index (BX) being bits 52-63 Notes: -1) s/390 has no PMD so the PMD is really the PGD also. -A lot of this stuff is defined in pgtable.h. + 1) s/390 has no PMD so the PMD is really the PGD also. + A lot of this stuff is defined in pgtable.h. 
-2) Also seeing as s/390's page indexes are only 1k in size -(bits 12-19 x 4 bytes per pte ) we use 1 ( page 4k ) -to make the best use of memory by updating 4 segment indices -entries each time we mess with a PMD & use offsets -0,1024,2048 & 3072 in this page as for our segment indexes. -On z/Architecture our page indexes are now 2k in size -( bits 12-19 x 8 bytes per pte ) we do a similar trick -but only mess with 2 segment indices each time we mess with -a PMD. + 2) Also seeing as s/390's page indexes are only 1k in size + (bits 12-19 x 4 bytes per pte ) we use 1 ( page 4k ) + to make the best use of memory by updating 4 segment indices + entries each time we mess with a PMD & use offsets + 0,1024,2048 & 3072 in this page as for our segment indexes. + On z/Architecture our page indexes are now 2k in size + ( bits 12-19 x 8 bytes per pte ) we do a similar trick + but only mess with 2 segment indices each time we mess with + a PMD. + + 3) As z/Architecture supports up to a massive 5-level page table lookup we + can only use 3 currently on Linux ( as this is all the generic kernel + currently supports ) however this may change in future + this allows us to access ( according to my sums ) + 4TB of virtual storage per process i.e. + 4096*512(PTES)*1024(PMDS)*2048(PGD) = 4398046511104 bytes, + enough for another 2 or 3 of years I think :-). + to do this we use a region-third-table designation type in + our address space control registers. -3) As z/Architecture supports up to a massive 5-level page table lookup we -can only use 3 currently on Linux ( as this is all the generic kernel -currently supports ) however this may change in future -this allows us to access ( according to my sums ) -4TB of virtual storage per process i.e. -4096*512(PTES)*1024(PMDS)*2048(PGD) = 4398046511104 bytes, -enough for another 2 or 3 of years I think :-). -to do this we use a region-third-table designation type in -our address space control registers. 
- The Linux for s/390 & z/Architecture Kernel Task Structure ========================================================== @@ -382,42 +402,43 @@ the __LC_KERNEL_STACK variable in the spare prefix area for this cpu (which we use for per-processor globals). The kernel stack pointer is intimately tied with the task structure for -each processor as follows. +each processor as follows:: - s/390 - ************************ - * 1 page kernel stack * - * ( 4K ) * - ************************ - * 1 page task_struct * - * ( 4K ) * -8K aligned ************************ + s/390 + ************************ + * 1 page kernel stack * + * ( 4K ) * + ************************ + * 1 page task_struct * + * ( 4K ) * + 8K aligned ************************ - z/Architecture - ************************ - * 2 page kernel stack * - * ( 8K ) * - ************************ - * 2 page task_struct * - * ( 8K ) * -16K aligned ************************ + z/Architecture + ************************ + * 2 page kernel stack * + * ( 8K ) * + ************************ + * 2 page task_struct * + * ( 8K ) * + 16K aligned ************************ What this means is that we don't need to dedicate any register or global variable to point to the current running process & can retrieve it with the -following very simple construct for s/390 & one very similar for z/Architecture. +following very simple construct for s/390 & one very similar for +z/Architecture:: -static inline struct task_struct * get_current(void) -{ - struct task_struct *current; - __asm__("lhi %0,-8192\n\t" - "nr %0,15" - : "=r" (current) ); - return current; -} + static inline struct task_struct * get_current(void) + { + struct task_struct *current; + __asm__("lhi %0,-8192\n\t" + "nr %0,15" + : "=r" (current) ); + return current; + } i.e. just anding the current kernel stack pointer with the mask -8192. 
Thankfully because Linux doesn't have support for nested IO interrupts -& our devices have large buffers can survive interrupts being shut for +& our devices have large buffers can survive interrupts being shut for short amounts of time we don't need a separate stack for interrupts. @@ -428,7 +449,7 @@ Register Usage & Stackframes on Linux for s/390 & z/Architecture Overview: --------- This is the code that gcc produces at the top & the bottom of -each function. It usually is fairly consistent & similar from +each function. It usually is fairly consistent & similar from function to function & if you know its layout you can probably make some headway in finding the ultimate cause of a problem after a crash without a source level debugger. @@ -443,87 +464,95 @@ didn't have to maintain compatibility with older linkage formats. Glossary: --------- alloca: -This is a built in compiler function for runtime allocation -of extra space on the callers stack which is obviously freed -up on function exit ( e.g. the caller may choose to allocate nothing -of a buffer of 4k if required for temporary purposes ), it generates -very efficient code ( a few cycles ) when compared to alternatives -like malloc. + This is a built in compiler function for runtime allocation + of extra space on the callers stack which is obviously freed + up on function exit ( e.g. the caller may choose to allocate nothing + of a buffer of 4k if required for temporary purposes ), it generates + very efficient code ( a few cycles ) when compared to alternatives + like malloc. -automatics: These are local variables on the stack, -i.e they aren't in registers & they aren't static. +automatics: + These are local variables on the stack, i.e they aren't in registers & + they aren't static. back-chain: -This is a pointer to the stack pointer before entering a -framed functions ( see frameless function ) prologue got by -dereferencing the address of the current stack pointer, - i.e. 
got by accessing the 32 bit value at the stack pointers -current location. + This is a pointer to the stack pointer before entering a + framed functions ( see frameless function ) prologue got by + dereferencing the address of the current stack pointer, + i.e. got by accessing the 32 bit value at the stack pointers + current location. base-pointer: -This is a pointer to the back of the literal pool which -is an area just behind each procedure used to store constants -in each function. + This is a pointer to the back of the literal pool which + is an area just behind each procedure used to store constants + in each function. -call-clobbered: The caller probably needs to save these registers if there -is something of value in them, on the stack or elsewhere before making a -call to another procedure so that it can restore it later. +call-clobbered: + The caller probably needs to save these registers if there + is something of value in them, on the stack or elsewhere before making a + call to another procedure so that it can restore it later. epilogue: -The code generated by the compiler to return to the caller. + The code generated by the compiler to return to the caller. -frameless-function -A frameless function in Linux for s390 & z/Architecture is one which doesn't -need more than the register save area (96 bytes on s/390, 160 on z/Architecture) -given to it by the caller. -A frameless function never: -1) Sets up a back chain. -2) Calls alloca. -3) Calls other normal functions -4) Has automatics. +frameless-function: + A frameless function in Linux for s390 & z/Architecture is one which doesn't + need more than the register save area (96 bytes on s/390, 160 on z/Architecture) + given to it by the caller. + + A frameless function never: + + 1) Sets up a back chain. + 2) Calls alloca. + 3) Calls other normal functions + 4) Has automatics. 
GOT-pointer: -This is a pointer to the global-offset-table in ELF -( Executable Linkable Format, Linux'es most common executable format ), -all globals & shared library objects are found using this pointer. + This is a pointer to the global-offset-table in ELF + ( Executable Linkable Format, Linux'es most common executable format ), + all globals & shared library objects are found using this pointer. lazy-binding -ELF shared libraries are typically only loaded when routines in the shared -library are actually first called at runtime. This is lazy binding. + ELF shared libraries are typically only loaded when routines in the shared + library are actually first called at runtime. This is lazy binding. procedure-linkage-table -This is a table found from the GOT which contains pointers to routines -in other shared libraries which can't be called to by easier means. + This is a table found from the GOT which contains pointers to routines + in other shared libraries which can't be called to by easier means. prologue: -The code generated by the compiler to set up the stack frame. + The code generated by the compiler to set up the stack frame. outgoing-args: -This is extra area allocated on the stack of the calling function if the -parameters for the callee's cannot all be put in registers, the same -area can be reused by each function the caller calls. + This is extra area allocated on the stack of the calling function if the + parameters for the callee's cannot all be put in registers, the same + area can be reused by each function the caller calls. routine-descriptor: -A COFF executable format based concept of a procedure reference -actually being 8 bytes or more as opposed to a simple pointer to the routine. -This is typically defined as follows -Routine Descriptor offset 0=Pointer to Function -Routine Descriptor offset 4=Pointer to Table of Contents -The table of contents/TOC is roughly equivalent to a GOT pointer. -& it means that shared libraries etc. 
can be shared between several -environments each with their own TOC. + A COFF executable format based concept of a procedure reference + actually being 8 bytes or more as opposed to a simple pointer to the routine. + This is typically defined as follows: - -static-chain: This is used in nested functions a concept adopted from pascal -by gcc not used in ansi C or C++ ( although quite useful ), basically it -is a pointer used to reference local variables of enclosing functions. -You might come across this stuff once or twice in your lifetime. + - Routine Descriptor offset 0=Pointer to Function + - Routine Descriptor offset 4=Pointer to Table of Contents -e.g. -The function below should return 11 though gcc may get upset & toss warnings -about unused variables. -int FunctionA(int a) -{ + The table of contents/TOC is roughly equivalent to a GOT pointer. + & it means that shared libraries etc. can be shared between several + environments each with their own TOC. + +static-chain: + This is used in nested functions a concept adopted from pascal + by gcc not used in ansi C or C++ ( although quite useful ), basically it + is a pointer used to reference local variables of enclosing functions. + You might come across this stuff once or twice in your lifetime. + + e.g. 
+ + The function below should return 11 though gcc may get upset & toss warnings + about unused variables:: + + int FunctionA(int a) + { int b; FunctionC(int c) { @@ -531,19 +560,21 @@ int FunctionA(int a) } FunctionC(10); return(b); -} + } s/390 & z/Architecture Register usage ===================================== + +======== ========================================== =============== r0 used by syscalls/assembly call-clobbered -r1 used by syscalls/assembly call-clobbered +r1 used by syscalls/assembly call-clobbered r2 argument 0 / return value 0 call-clobbered r3 argument 1 / return value 1 (if long long) call-clobbered r4 argument 2 call-clobbered r5 argument 3 call-clobbered -r6 argument 4 saved -r7 pointer-to arguments 5 to ... saved +r6 argument 4 saved +r7 pointer-to arguments 5 to ... saved r8 this & that saved r9 this & that saved r10 static-chain ( if nested function ) saved @@ -557,65 +588,74 @@ f0 argument 0 / return value ( float/double ) call-clobbered f2 argument 1 call-clobbered f4 z/Architecture argument 2 saved f6 z/Architecture argument 3 saved +======== ========================================== =============== + The remaining floating points f1,f3,f5 f7-f15 are call-clobbered. Notes: ------ 1) The only requirement is that registers which are used -by the callee are saved, e.g. the compiler is perfectly -capable of using r11 for purposes other than a frame a -frame pointer if a frame pointer is not needed. -2) In functions with variable arguments e.g. printf the calling procedure -is identical to one without variable arguments & the same number of -parameters. However, the prologue of this function is somewhat more -hairy owing to it having to move these parameters to the stack to -get va_start, va_arg & va_end to work. + by the callee are saved, e.g. the compiler is perfectly + capable of using r11 for purposes other than a frame a + frame pointer if a frame pointer is not needed. +2) In functions with variable arguments e.g. 
printf the calling procedure + is identical to one without variable arguments & the same number of + parameters. However, the prologue of this function is somewhat more + hairy owing to it having to move these parameters to the stack to + get va_start, va_arg & va_end to work. 3) Access registers are currently unused by gcc but are used in -the kernel. Possibilities exist to use them at the moment for -temporary storage but it isn't recommended. + the kernel. Possibilities exist to use them at the moment for + temporary storage but it isn't recommended. 4) Only 4 of the floating point registers are used for -parameter passing as older machines such as G3 only have only 4 -& it keeps the stack frame compatible with other compilers. -However with IEEE floating point emulation under linux on the -older machines you are free to use the other 12. -5) A long long or double parameter cannot be have the -first 4 bytes in a register & the second four bytes in the -outgoing args area. It must be purely in the outgoing args -area if crossing this boundary. + parameter passing as older machines such as G3 only have 4 + & it keeps the stack frame compatible with other compilers. + However with IEEE floating point emulation under linux on the + older machines you are free to use the other 12. +5) A long long or double parameter cannot have the + first 4 bytes in a register & the second four bytes in the + outgoing args area. It must be purely in the outgoing args + area if crossing this boundary. 6) Floating point parameters are mixed with outgoing args -on the outgoing args area in the order the are passed in as parameters. -7) Floating point arguments 2 & 3 are saved in the outgoing args area for -z/Architecture + on the outgoing args area in the order they are passed in as parameters.
+7) Floating point arguments 2 & 3 are saved in the outgoing args area for + z/Architecture Stack Frame Layout ------------------ + +========= ============== ====================================================== s/390 z/Architecture -0 0 back chain ( a 0 here signifies end of back chain ) -4 8 eos ( end of stack, not used on Linux for S390 used in other linkage formats ) -8 16 glue used in other s/390 linkage formats for saved routine descriptors etc. -12 24 glue used in other s/390 linkage formats for saved routine descriptors etc. -16 32 scratch area -20 40 scratch area -24 48 saved r6 of caller function -28 56 saved r7 of caller function -32 64 saved r8 of caller function -36 72 saved r9 of caller function -40 80 saved r10 of caller function -44 88 saved r11 of caller function -48 96 saved r12 of caller function -52 104 saved r13 of caller function -56 112 saved r14 of caller function -60 120 saved r15 of caller function -64 128 saved f4 of caller function -72 132 saved f6 of caller function -80 undefined -96 160 outgoing args passed from caller to callee -96+x 160+x possible stack alignment ( 8 bytes desirable ) -96+x+y 160+x+y alloca space of caller ( if used ) -96+x+y+z 160+x+y+z automatics of caller ( if used ) -0 back-chain +========= ============== ====================================================== +0 0 back chain ( a 0 here signifies end of back chain ) +4 8 eos ( end of stack, not used on Linux for S390 used + in other linkage formats ) +8 16 glue used in other s/390 linkage formats for saved + routine descriptors etc. +12 24 glue used in other s/390 linkage formats for saved + routine descriptors etc. 
+16 32 scratch area +20 40 scratch area +24 48 saved r6 of caller function +28 56 saved r7 of caller function +32 64 saved r8 of caller function +36 72 saved r9 of caller function +40 80 saved r10 of caller function +44 88 saved r11 of caller function +48 96 saved r12 of caller function +52 104 saved r13 of caller function +56 112 saved r14 of caller function +60 120 saved r15 of caller function +64 128 saved f4 of caller function +72 132 saved f6 of caller function +80 undefined +96 160 outgoing args passed from caller to callee +96+x 160+x possible stack alignment ( 8 bytes desirable ) +96+x+y 160+x+y alloca space of caller ( if used ) +96+x+y+z 160+x+y+z automatics of caller ( if used ) +0 back-chain +========= ============== ====================================================== A sample program with comments. =============================== @@ -623,82 +663,86 @@ A sample program with comments. Comments on the function test ----------------------------- 1) It didn't need to set up a pointer to the constant pool gpr13 as it is not -used ( :-( ). + used ( :-( ). 2) This is a frameless function & no stack is bought. 3) The compiler was clever enough to recognise that it could return the -value in r2 as well as use it for the passed in parameter ( :-) ). -4) The basr ( branch relative & save ) trick works as follows the instruction -has a special case with r0,r0 with some instruction operands is understood as -the literal value 0, some risc architectures also do this ). 
So now -we are branching to the next address & the address new program counter is -in r13,so now we subtract the size of the function prologue we have executed -+ the size of the literal pool to get to the top of the literal pool -0040037c int test(int b) -{ # Function prologue below - 40037c: 90 de f0 34 stm %r13,%r14,52(%r15) # Save registers r13 & r14 - 400380: 0d d0 basr %r13,%r0 # Set up pointer to constant pool using - 400382: a7 da ff fa ahi %r13,-6 # basr trick - return(5+b); - # Huge main program - 400386: a7 2a 00 05 ahi %r2,5 # add 5 to r2 + value in r2 as well as use it for the passed in parameter ( :-) ). +4) The basr ( branch relative & save ) trick works as follows the instruction + has a special case with r0,r0 with some instruction operands is understood as + the literal value 0, some risc architectures also do this ). So now + we are branching to the next address & the address new program counter is + in r13,so now we subtract the size of the function prologue we have executed + the size of the literal pool to get to the top of the literal pool:: - # Function epilogue below - 40038a: 98 de f0 34 lm %r13,%r14,52(%r15) # restore registers r13 & 14 - 40038e: 07 fe br %r14 # return -} + + 0040037c int test(int b) + { # Function prologue below + 40037c: 90 de f0 34 stm %r13,%r14,52(%r15) # Save registers r13 & r14 + 400380: 0d d0 basr %r13,%r0 # Set up pointer to constant pool using + 400382: a7 da ff fa ahi %r13,-6 # basr trick + return(5+b); + # Huge main program + 400386: a7 2a 00 05 ahi %r2,5 # add 5 to r2 + + # Function epilogue below + 40038a: 98 de f0 34 lm %r13,%r14,52(%r15) # restore registers r13 & 14 + 40038e: 07 fe br %r14 # return + } Comments on the function main ----------------------------- -1) The compiler did this function optimally ( 8-) ) +1) The compiler did this function optimally ( 8-) ):: -Literal pool for main. 
-400390: ff ff ff ec .long 0xffffffec -main(int argc,char *argv[]) -{ # Function prologue below - 400394: 90 bf f0 2c stm %r11,%r15,44(%r15) # Save necessary registers - 400398: 18 0f lr %r0,%r15 # copy stack pointer to r0 - 40039a: a7 fa ff a0 ahi %r15,-96 # Make area for callee saving - 40039e: 0d d0 basr %r13,%r0 # Set up r13 to point to - 4003a0: a7 da ff f0 ahi %r13,-16 # literal pool - 4003a4: 50 00 f0 00 st %r0,0(%r15) # Save backchain + Literal pool for main. + 400390: ff ff ff ec .long 0xffffffec + main(int argc,char *argv[]) + { # Function prologue below + 400394: 90 bf f0 2c stm %r11,%r15,44(%r15) # Save necessary registers + 400398: 18 0f lr %r0,%r15 # copy stack pointer to r0 + 40039a: a7 fa ff a0 ahi %r15,-96 # Make area for callee saving + 40039e: 0d d0 basr %r13,%r0 # Set up r13 to point to + 4003a0: a7 da ff f0 ahi %r13,-16 # literal pool + 4003a4: 50 00 f0 00 st %r0,0(%r15) # Save backchain return(test(5)); # Main Program Below - 4003a8: 58 e0 d0 00 l %r14,0(%r13) # load relative address of test from - # literal pool - 4003ac: a7 28 00 05 lhi %r2,5 # Set first parameter to 5 - 4003b0: 4d ee d0 00 bas %r14,0(%r14,%r13) # jump to test setting r14 as return + 4003a8: 58 e0 d0 00 l %r14,0(%r13) # load relative address of test from + # literal pool + 4003ac: a7 28 00 05 lhi %r2,5 # Set first parameter to 5 + 4003b0: 4d ee d0 00 bas %r14,0(%r14,%r13) # jump to test setting r14 as return # address using branch & save instruction. # Function Epilogue below - 4003b4: 98 bf f0 8c lm %r11,%r15,140(%r15)# Restore necessary registers. - 4003b8: 07 fe br %r14 # return to do program exit -} + 4003b4: 98 bf f0 8c lm %r11,%r15,140(%r15)# Restore necessary registers. 
+ 4003b8: 07 fe br %r14 # return to do program exit + } Compiler updates ---------------- -main(int argc,char *argv[]) -{ - 4004fc: 90 7f f0 1c stm %r7,%r15,28(%r15) - 400500: a7 d5 00 04 bras %r13,400508 - 400504: 00 40 04 f4 .long 0x004004f4 - # compiler now puts constant pool in code to so it saves an instruction - 400508: 18 0f lr %r0,%r15 - 40050a: a7 fa ff a0 ahi %r15,-96 - 40050e: 50 00 f0 00 st %r0,0(%r15) +:: + + main(int argc,char *argv[]) + { + 4004fc: 90 7f f0 1c stm %r7,%r15,28(%r15) + 400500: a7 d5 00 04 bras %r13,400508 + 400504: 00 40 04 f4 .long 0x004004f4 + # compiler now puts constant pool in code to so it saves an instruction + 400508: 18 0f lr %r0,%r15 + 40050a: a7 fa ff a0 ahi %r15,-96 + 40050e: 50 00 f0 00 st %r0,0(%r15) return(test(5)); - 400512: 58 10 d0 00 l %r1,0(%r13) - 400516: a7 28 00 05 lhi %r2,5 - 40051a: 0d e1 basr %r14,%r1 - # compiler adds 1 extra instruction to epilogue this is done to - # avoid processor pipeline stalls owing to data dependencies on g5 & - # above as register 14 in the old code was needed directly after being loaded - # by the lm %r11,%r15,140(%r15) for the br %14. - 40051c: 58 40 f0 98 l %r4,152(%r15) - 400520: 98 7f f0 7c lm %r7,%r15,124(%r15) - 400524: 07 f4 br %r4 -} + 400512: 58 10 d0 00 l %r1,0(%r13) + 400516: a7 28 00 05 lhi %r2,5 + 40051a: 0d e1 basr %r14,%r1 + # compiler adds 1 extra instruction to epilogue this is done to + # avoid processor pipeline stalls owing to data dependencies on g5 & + # above as register 14 in the old code was needed directly after being loaded + # by the lm %r11,%r15,140(%r15) for the br %14. + 40051c: 58 40 f0 98 l %r4,152(%r15) + 400520: 98 7f f0 7c lm %r7,%r15,124(%r15) + 400524: 07 f4 br %r4 + } Hartmut ( our compiler developer ) also has been threatening to take out the @@ -709,38 +753,39 @@ have been warned. 
-------------------------------------- If you understand the stuff above you'll understand the stuff -below too so I'll avoid repeating myself & just say that +below too so I'll avoid repeating myself & just say that some of the instructions have g's on the end of them to indicate -they are 64 bit & the stack offsets are a bigger, +they are 64 bit & the stack offsets are a bigger, the only other difference you'll find between 32 & 64 bit is that -we now use f4 & f6 for floating point arguments on 64 bit. -00000000800005b0 : -int test(int b) -{ +we now use f4 & f6 for floating point arguments on 64 bit:: + + 00000000800005b0 : + int test(int b) + { return(5+b); - 800005b0: a7 2a 00 05 ahi %r2,5 - 800005b4: b9 14 00 22 lgfr %r2,%r2 # downcast to integer - 800005b8: 07 fe br %r14 - 800005ba: 07 07 bcr 0,%r7 + 800005b0: a7 2a 00 05 ahi %r2,5 + 800005b4: b9 14 00 22 lgfr %r2,%r2 # downcast to integer + 800005b8: 07 fe br %r14 + 800005ba: 07 07 bcr 0,%r7 -} + } -00000000800005bc
: -main(int argc,char *argv[]) -{ - 800005bc: eb bf f0 58 00 24 stmg %r11,%r15,88(%r15) - 800005c2: b9 04 00 1f lgr %r1,%r15 - 800005c6: a7 fb ff 60 aghi %r15,-160 - 800005ca: e3 10 f0 00 00 24 stg %r1,0(%r15) + 00000000800005bc
: + main(int argc,char *argv[]) + { + 800005bc: eb bf f0 58 00 24 stmg %r11,%r15,88(%r15) + 800005c2: b9 04 00 1f lgr %r1,%r15 + 800005c6: a7 fb ff 60 aghi %r15,-160 + 800005ca: e3 10 f0 00 00 24 stg %r1,0(%r15) return(test(5)); - 800005d0: a7 29 00 05 lghi %r2,5 - # brasl allows jumps > 64k & is overkill here bras would do fune - 800005d4: c0 e5 ff ff ff ee brasl %r14,800005b0 - 800005da: e3 40 f1 10 00 04 lg %r4,272(%r15) - 800005e0: eb bf f0 f8 00 04 lmg %r11,%r15,248(%r15) - 800005e6: 07 f4 br %r4 -} + 800005d0: a7 29 00 05 lghi %r2,5 + # brasl allows jumps > 64k & is overkill here bras would do fine + 800005d4: c0 e5 ff ff ff ee brasl %r14,800005b0 + 800005da: e3 40 f1 10 00 04 lg %r4,272(%r15) + 800005e0: eb bf f0 f8 00 04 lmg %r11,%r15,248(%r15) + 800005e6: 07 f4 br %r4 + } @@ -749,15 +794,15 @@ Compiling programs for debugging on Linux for s/390 & z/Architecture -gdwarf-2 now works it should be considered the default debugging format for s/390 & z/Architecture as it is more reliable for debugging shared libraries, normal -g debugging works much better now -Thanks to the IBM java compiler developers bug reports. +Thanks to the IBM java compiler developers bug reports. -This is typically done adding/appending the flags -g or -gdwarf-2 to the +This is typically done adding/appending the flags -g or -gdwarf-2 to the CFLAGS & LDFLAGS variables Makefile of the program concerned. If using gdb & you would like accurate displays of registers & - stack traces compile without optimisation i.e make sure +stack traces compile without optimisation i.e make sure that there is no -O2 or similar on the CFLAGS line of the Makefile & -the emitted gcc commands, obviously this will produce worse code +the emitted gcc commands, obviously this will produce worse code ( not advisable for shipment ) but it is an aid to the debugging process.
This aids debugging because the compiler will copy parameters passed in @@ -766,7 +811,7 @@ parameters will work, however some larger programs which use inline functions will not compile without optimisation. Debugging with optimisation has since much improved after fixing -some bugs, please make sure you are using gdb-5.0 or later developed +some bugs, please make sure you are using gdb-5.0 or later developed after Nov'2000. @@ -779,7 +824,7 @@ Notes Addresses & values in the VM debugger are always hex never decimal Address ranges are of the format - or . -For example, the address range 0x2000 to 0x3000 can be described as 2000-3000 +For example, the address range 0x2000 to 0x3000 can be described as 2000-3000 or 2000.1000 The VM Debugger is case insensitive. @@ -798,27 +843,31 @@ operands are nibble (half byte aligned). So if you have an objdump listing by hand, it is quite easy to follow, and if you don't have an objdump listing keep a copy of the s/390 Reference Summary or alternatively the s/390 principles of operation next to you. -e.g. even I can guess that +e.g. even I can guess that 0001AFF8' LR 180F CC 0 -is a ( load register ) lr r0,r15 +is a ( load register ) lr r0,r15 Also it is very easy to tell the length of a 390 instruction from the 2 most significant bits in the instruction (not that this info is really useful except if you are trying to make sense of a hexdump of code). Here is a table + +======================= ================== Bits Instruction Length ------------------------------------------- +======================= ================== 00 2 Bytes 01 4 Bytes 10 4 Bytes 11 6 Bytes +======================= ================== The debugger also displays other useful info on the same line such as the addresses being operated on destination addresses of branches & condition codes. -e.g. 
-00019736' AHI A7DAFF0E CC 1 -000198BA' BRC A7840004 -> 000198C2' CC 0 -000198CE' STM 900EF068 >> 0FA95E78 CC 2 +e.g.:: + + 00019736' AHI A7DAFF0E CC 1 + 000198BA' BRC A7840004 -> 000198C2' CC 0 + 000198CE' STM 900EF068 >> 0FA95E78 CC 2 @@ -826,54 +875,79 @@ Useful VM debugger commands --------------------------- I suppose I'd better mention this before I start -to list the current active traces do -Q TR +to list the current active traces do:: + + Q TR + there can be a maximum of 255 of these per set ( more about trace sets later ). -To stop traces issue a -TR END. -To delete a particular breakpoint issue -TR DEL + +To stop traces issue a:: + + TR END. + +To delete a particular breakpoint issue:: + + TR DEL The PA1 key drops to CP mode so you can issue debugger commands, -Doing alt c (on my 3270 console at least ) clears the screen. +Doing alt c (on my 3270 console at least ) clears the screen. + hitting b comes back to the running operating system from cp mode ( in our case linux ). + It is typically useful to add shortcuts to your profile.exec file if you have one ( this is roughly equivalent to autoexec.bat in DOS ). -file here are a few from mine. -/* this gives me command history on issuing f12 */ -set pf12 retrieve -/* this continues */ -set pf8 imm b -/* goes to trace set a */ -set pf1 imm tr goto a -/* goes to trace set b */ -set pf2 imm tr goto b -/* goes to trace set c */ -set pf3 imm tr goto c +file here are a few from mine:: + + /* this gives me command history on issuing f12 */ + set pf12 retrieve + /* this continues */ + set pf8 imm b + /* goes to trace set a */ + set pf1 imm tr goto a + /* goes to trace set b */ + set pf2 imm tr goto b + /* goes to trace set c */ + set pf3 imm tr goto c Instruction Tracing ------------------- -Setting a simple breakpoint -TR I PSWA
-To debug a particular function try -TR I R -TR I on its own will single step. -TR I DATA will trace for particular mnemonics -e.g. -TR I DATA 4D R 0197BC.4000 +Setting a simple breakpoint:: + + TR I PSWA
+ +To debug a particular function try:: + + TR I R + TR I on its own will single step. + TR I DATA will trace for particular mnemonics + +e.g.:: + + TR I DATA 4D R 0197BC.4000 + will trace for BAS'es ( opcode 4D ) in the range 0197BC.4000 + if you were inclined you could add traces for all branch instructions & -suffix them with the run prefix so you would have a backtrace on screen -when a program crashes. -TR BR will trace branches into or out of an address. -e.g. -TR BR INTO 0 is often quite useful if a program is getting awkward & deciding +suffix them with the run prefix so you would have a backtrace on screen +when a program crashes:: + + TR BR will trace branches into or out of an address. + +e.g.:: + + TR BR INTO 0 + +is often quite useful if a program is getting awkward & deciding to branch to 0 & crashing as this will stop at the address before in jumps to 0. -TR I R
RUN cmd d g + +:: + + TR I R
RUN cmd d g + single steps a range of addresses but stays running & displays the gprs on each step. @@ -881,93 +955,129 @@ displays the gprs on each step. Displaying & modifying Registers -------------------------------- -D G will display all the gprs -Adding a extra G to all the commands is necessary to access the full 64 bit +D G + will display all the gprs + +Adding a extra G to all the commands is necessary to access the full 64 bit content in VM on z/Architecture. Obviously this isn't required for access registers as these are still 32 bit. -e.g. DGG instead of DG -D X will display all the control registers -D AR will display all the access registers -D AR4-7 will display access registers 4 to 7 -CPU ALL D G will display the GRPS of all CPUS in the configuration -D PSW will display the current PSW -st PSW 2000 will put the value 2000 into the PSW & -cause crash your machine. -D PREFIX displays the prefix offset + +e.g. + +DGG + instead of DG + +D X + will display all the control registers +D AR + will display all the access registers +D AR4-7 + will display access registers 4 to 7 +CPU ALL D G + will display the GRPS of all CPUS in the configuration +D PSW + will display the current PSW +st PSW 2000 + will put the value 2000 into the PSW & cause crash your machine. +D PREFIX + displays the prefix offset Displaying Memory ----------------- -To display memory mapped using the current PSW's mapping try -D +To display memory mapped using the current PSW's mapping try:: + + D + To make VM display a message each time it hits a particular address and -continue try -D I will disassemble/display a range of instructions. -ST addr 32 bit word will store a 32 bit aligned address -D T will display the EBCDIC in an address (if you are that way inclined) -D R will display real addresses ( without DAT ) but with prefixing. +continue try: + +D I + will disassemble/display a range of instructions. 
+ +ST addr 32 bit word + will store a 32 bit aligned address +D T + will display the EBCDIC in an address (if you are that way inclined) +D R + will display real addresses ( without DAT ) but with prefixing. + There are other complex options to display if you need to get at say home space but are in primary space the easiest thing to do is to temporarily modify the PSW to the other addressing mode, display the stuff & then restore it. - + Hints ----- If you want to issue a debugger command without halting your virtual machine -with the PA1 key try prefixing the command with #CP e.g. -#cp tr i pswa 2000 +with the PA1 key try prefixing the command with #CP e.g.:: + + #cp tr i pswa 2000 + also suffixing most debugger commands with RUN will cause them not to stop just display the mnemonic at the current instruction on the console. + If you have several breakpoints you want to put into your program & you get fed up of cross referencing with System.map you can do the following trick for several symbols. -grep do_signal System.map -which emits the following among other things -0001f4e0 T do_signal -now you can do -TR I PSWA 0001f4e0 cmd msg * do_signal +:: + + grep do_signal System.map + +which emits the following among other things:: + + 0001f4e0 T do_signal + +now you can do:: + + TR I PSWA 0001f4e0 cmd msg * do_signal + This sends a message to your own console each time do_signal is entered. ( As an aside I wrote a perl script once which automatically generated a REXX script with breakpoints on every kernel procedure, this isn't a good idea because there are thousands of these routines & VM can only set 255 breakpoints -at a time so you nearly had to spend as long pruning the file down as you would +at a time so you nearly had to spend as long pruning the file down as you would entering the msgs by hand), however, the trick might be useful for a single object file. 
In the 3270 terminal emulator x3270 there is a very useful option in the file menu called "Save Screen In File" - this is very good for keeping a copy of traces. -From CMS help will give you online help on a particular command. -e.g. -HELP DISPLAY +From CMS help will give you online help on a particular command. +e.g.:: + + HELP DISPLAY Also CP has a file called profile.exec which automatically gets called on startup of CMS ( like autoexec.bat ), keeping on a DOS analogy session CP has a feature similar to doskey, it may be useful for you to -use profile.exec to define some keystrokes. -e.g. +use profile.exec to define some keystrokes. + SET PF9 IMM B -This does a single step in VM on pressing F8. + This does a single step in VM on pressing F8. + SET PF10 ^ -This sets up the ^ key. -which can be used for ^c (ctrl-c),^z (ctrl-z) which can't be typed directly -into some 3270 consoles. + This sets up the ^ key. + which can be used for ^c (ctrl-c),^z (ctrl-z) which can't be typed + directly into some 3270 consoles. + SET PF11 ^- -This types the starting keystrokes for a sysrq see SysRq below. + This types the starting keystrokes for a sysrq see SysRq below. SET PF12 RETRIEVE -This retrieves command history on pressing F12. + This retrieves command history on pressing F12. Sometimes in VM the display is set up to scroll automatically this can be very annoying if there are messages you wish to look at to stop this do + TERM MORE 255 255 -This will nearly stop automatic screen updates, however it will -cause a denial of service if lots of messages go to the 3270 console, -so it would be foolish to use this as the default on a production machine. - + This will nearly stop automatic screen updates, however it will + cause a denial of service if lots of messages go to the 3270 console, + so it would be foolish to use this as the default on a production machine. 
+ Tracing particular processes ---------------------------- @@ -976,69 +1086,116 @@ very seldom collide with text segments of user programs ( thanks Martin ), this simplifies debugging the kernel. However it is quite common for user processes to have addresses which collide this can make debugging a particular process under VM painful under normal -circumstances as the process may change when doing a -TR I R
<address range>. +circumstances as the process may change when doing a:: + + TR I R <address range>
. + Thankfully after reading VM's online help I figured out how to debug I particular process. Your first problem is to find the STD ( segment table designation ) of the program you wish to debug. There are several ways you can do this here are a few -1) objdump --syms <program to be debugged> | grep main -To get the address of main in the program. -tr i pswa <address of main>
+ +Run:: + + objdump --syms <program to be debugged> | grep main + +To get the address of main in the program. Then:: + + tr i pswa <address of main>
+ Start the program, if VM drops to CP on what looks like the entry point of the main function this is most likely the process you wish to debug. Now do a D X13 or D XG13 on z/Architecture. + +On 31 bit the STD is bits 1-19 ( the STO segment table origin ) & 25-31 ( the STL segment table length ) of CR13. -now type -TR I R STD <CR13's value> 0.7fffffff -e.g. -TR I R STD 8F32E1FF 0.7fffffff -Another very useful variation is -TR STORE INTO STD <CR13's value> <address range>
+ +now type:: + + TR I R STD <CR13's value> 0.7fffffff + +e.g.:: + + TR I R STD 8F32E1FF 0.7fffffff + +Another very useful variation is:: + + TR STORE INTO STD <CR13's value> <address range>
+ for finding out when a particular variable changes. -An alternative way of finding the STD of a currently running process +An alternative way of finding the STD of a currently running process is to do the following, ( this method is more complex but could be quite convenient if you aren't updating the kernel much & so your kernel structures will stay constant for a reasonable period of time ). -grep task /proc//status -from this you should see something like -task: 0f160000 ksp: 0f161de8 pt_regs: 0f161f68 +:: + + grep task /proc//status + +from this you should see something like:: + + task: 0f160000 ksp: 0f161de8 pt_regs: 0f161f68 + This now gives you a pointer to the task structure. -Now make CC:="s390-gcc -g" kernel/sched.s + +Now make:: + + CC:="s390-gcc -g" kernel/sched.s + To get the task_struct stabinfo. + ( task_struct is defined in include/linux/sched.h ). + Now we want to look at task->active_mm->pgd + on my machine the active_mm in the task structure stab is active_mm:(4,12),672,32 + its offset is 672/8=84=0x54 + the pgd member in the mm_struct stab is pgd:(4,6)=*(29,5),96,32 so its offset is 96/8=12=0xc -so we'll -hexdump -s 0xf160054 /dev/mem | more +so we'll:: + + hexdump -s 0xf160054 /dev/mem | more + i.e. task_struct+active_mm offset -to look at the active_mm member -f160054 0fee cc60 0019 e334 0000 0000 0000 0011 -hexdump -s 0x0feecc6c /dev/mem | more -i.e. active_mm+pgd offset -feecc6c 0f2c 0000 0000 0001 0000 0001 0000 0010 +to look at the active_mm member:: + + f160054 0fee cc60 0019 e334 0000 0000 0000 0011 + +:: + + hexdump -s 0x0feecc6c /dev/mem | more + +i.e. active_mm+pgd offset:: + + feecc6c 0f2c 0000 0000 0001 0000 0001 0000 0010 + we get something like -now do -TR I R STD 0.7fffffff +now do:: + + TR I R STD 0.7fffffff + i.e. the 0x7f is added because the pgd only gives the page table origin & we need to set the low bits to the maximum possible segment table length. 
-TR I R STD 0f2c007f 0.7fffffff -on z/Architecture you'll probably need to do -TR I R STD 0.ffffffffffffffff + +:: + + TR I R STD 0f2c007f 0.7fffffff + +on z/Architecture you'll probably need to do:: + + TR I R STD 0.ffffffffffffffff + to set the TableType to 0x1 & the Table length to 3. @@ -1051,40 +1208,51 @@ You can restart linux & trace these using the tr prog trace option. -The most common ones you will normally be tracing for is -1=operation exception -2=privileged operation exception -4=protection exception -5=addressing exception -6=specification exception -10=segment translation exception -11=page translation exception +The most common ones you will normally be tracing for is: + +- 1=operation exception +- 2=privileged operation exception +- 4=protection exception +- 5=addressing exception +- 6=specification exception +- 10=segment translation exception +- 11=page translation exception The full list of these is on page 22 of the current s/390 Reference Summary. e.g. + tr prog 10 will trace segment translation exceptions. + tr prog on its own will trace all program interruption codes. Trace Sets ---------- On starting VM you are initially in the INITIAL trace set. You can do a Q TR to verify this. -If you have a complex tracing situation where you wish to wait for instance +If you have a complex tracing situation where you wish to wait for instance till a driver is open before you start tracing IO, but know in your heart that you are going to have to make several runs through the code till you -have a clue whats going on. +have a clue whats going on. 
+ +What you can do is:: + + TR I PSWA -What you can do is -TR I PSWA hit b to continue till breakpoint + reach the breakpoint -now do your -TR GOTO B -TR IO 7c08-7c09 inst int run + +now do your:: + + TR GOTO B + TR IO 7c08-7c09 inst int run + or whatever the IO channels you wish to trace are & hit b -To got back to the initial trace set do -TR GOTO INITIAL +To go back to the initial trace set do:: + + TR GOTO INITIAL + & the TR I PSWA will be the only active breakpoint again. @@ -1093,11 +1261,14 @@ Tracing linux syscalls under VM Syscalls are implemented on Linux for S390 by the Supervisor call instruction (SVC). There 256 possibilities of these as the instruction is made up of a 0xA opcode and the second byte being the syscall number. They are traced using the -simple command: -TR SVC +simple command:: + + TR SVC + the syscalls are defined in linux/arch/s390/include/asm/unistd.h -e.g. to trace all file opens just do -TR SVC 5 ( as this is the syscall number of open ) +e.g. to trace all file opens just do:: + + TR SVC 5 ( as this is the syscall number of open ) SMP Specific commands @@ -1105,33 +1276,51 @@ SMP Specific commands To find out how many cpus you have Q CPUS displays all the CPU's available to your virtual machine To find the cpu that the current cpu VM debugger commands are being directed at -do Q CPU to change the current cpu VM debugger commands are being directed at do -CPU +do Q CPU to change the current cpu VM debugger commands are being directed at +do:: + + CPU On a SMP guest issue a command to all CPUs try prefixing the command with cpu -all. To issue a command to a particular cpu try cpu e.g. -CPU 01 TR I R 2000.3000 +all. To issue a command to a particular cpu try cpu e.g.:: + + CPU 01 TR I R 2000.3000 + If you are running on a guest with several cpus & you have a IO related problem & cannot follow the flow of code but you know it isn't smp related. -from the bash prompt issue -shutdown -h now or halt.
-do a Q CPUS to find out how many cpus you have -detach each one of them from cp except cpu 0 -by issuing a -DETACH CPU 01-(number of cpus in configuration) + +from the bash prompt issue:: + + shutdown -h now or halt. + +do a:: + + Q CPUS + +to find out how many cpus you have detach each one of them from cp except +cpu 0 by issuing a:: + + DETACH CPU 01-(number of cpus in configuration) + & boot linux again. -TR SIGP will trace inter processor signal processor instructions. -DEFINE CPU 01-(number in configuration) -will get your guests cpus back. + +TR SIGP + will trace inter processor signal processor instructions. + +DEFINE CPU 01-(number in configuration) + will get your guests cpus back. Help for displaying ascii textstrings ------------------------------------- On the very latest VM Nucleus'es VM can now display ascii -( thanks Neale for the hint ) by doing -D TX. -e.g. -D TX0.100 +( thanks Neale for the hint ) by doing:: + + D TX. + +e.g.:: + + D TX0.100 Alternatively ============= @@ -1143,66 +1332,85 @@ to your xterm if you are debugging from a linuxbox. This is quite useful when looking at a parameter passed in as a text string under VM ( unless you are good at decoding ASCII in your head ). -e.g. consider tracing an open syscall -TR SVC 5 -We have stopped at a breakpoint -000151B0' SVC 0A05 -> 0001909A' CC 0 +e.g. consider tracing an open syscall:: + + TR SVC 5 + +We have stopped at a breakpoint:: + + 000151B0' SVC 0A05 -> 0001909A' CC 0 D 20.8 to check the SVC old psw in the prefix area and see was it from userspace (for the layout of the prefix area consult the "Fixed Storage Locations" chapter of the s/390 Reference Summary if you have it available). 
-V00000020 070C2000 800151B2 + +:: + + V00000020 070C2000 800151B2 + The problem state bit wasn't set & it's also too early in the boot sequence -for it to be a userspace SVC if it was we would have to temporarily switch the +for it to be a userspace SVC if it was we would have to temporarily switch the psw to user space addressing so we could get at the first parameter of the open in gpr2. -Next do a -D G2 -GPR 2 = 00014CB4 -Now display what gpr2 is pointing to -D 00014CB4.20 -V00014CB4 2F646576 2F636F6E 736F6C65 00001BF5 -V00014CC4 FC00014C B4001001 E0001000 B8070707 + +Next do a:: + + D G2 + GPR 2 = 00014CB4 + +Now display what gpr2 is pointing to:: + + D 00014CB4.20 + V00014CB4 2F646576 2F636F6E 736F6C65 00001BF5 + V00014CC4 FC00014C B4001001 E0001000 B8070707 + Now copy the text till the first 00 hex ( which is the end of the string -to an xterm & do hex2ascii on it. -hex2ascii 2F646576 2F636F6E 736F6C65 00 -outputs -Decoded Hex:=/ d e v / c o n s o l e 0x00 +to an xterm & do hex2ascii on it:: + + hex2ascii 2F646576 2F636F6E 736F6C65 00 + +outputs:: + + Decoded Hex:=/ d e v / c o n s o l e 0x00 + We were opening the console device, You can compile the code below yourself for practice :-), -/* - * hex2ascii.c - * a useful little tool for converting a hexadecimal command line to ascii - * - * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) - * (C) 2000 IBM Deutschland Entwicklung GmbH, IBM Corporation. 
- */ -#include -int main(int argc,char *argv[]) -{ - int cnt1,cnt2,len,toggle=0; - int startcnt=1; - unsigned char c,hex; - - if(argc>1&&(strcmp(argv[1],"-a")==0)) - startcnt=2; - printf("Decoded Hex:="); - for(cnt1=startcnt;cnt1 + + int main(int argc,char *argv[]) { - len=strlen(argv[cnt1]); - for(cnt2=0;cnt21&&(strcmp(argv[1],"-a")==0)) + startcnt=2; + printf("Decoded Hex:="); + for(cnt1=startcnt;cnt1='0'&&c<='9') + len=strlen(argv[cnt1]); + for(cnt2=0;cnt2='0'&&c<='9') c=c-'0'; - if(c>='A'&&c<='F') + if(c>='A'&&c<='F') c=c-'A'+10; - if(c>='a'&&c<='f') + if(c>='a'&&c<='f') c=c-'a'+10; - switch(toggle) - { + switch(toggle) + { case 0: hex=c<<4; toggle=1; @@ -1224,11 +1432,11 @@ int main(int argc,char *argv[]) } toggle=0; break; - } + } + } } + printf("\n"); } - printf("\n"); -} @@ -1248,48 +1456,58 @@ should be able to sniff further back if you follow the following tricks. 1) A kernel address should be easy to recognise since it is in primary space & the problem state bit isn't set & also The Hi bit of the address is set. -2) Another backchain should also be easy to recognise since it is an +2) Another backchain should also be easy to recognise since it is an address pointing to another address approximately 100 bytes or 0x70 hex behind the current stackpointer. Here is some practice. + boot the kernel & hit PA1 at some random time -d g to display the gprs, this should display something like -GPR 0 = 00000001 00156018 0014359C 00000000 -GPR 4 = 00000001 001B8888 000003E0 00000000 -GPR 8 = 00100080 00100084 00000000 000FE000 -GPR 12 = 00010400 8001B2DC 8001B36A 000FFED8 + +d g to display the gprs, this should display something like:: + + GPR 0 = 00000001 00156018 0014359C 00000000 + GPR 4 = 00000001 001B8888 000003E0 00000000 + GPR 8 = 00100080 00100084 00000000 000FE000 + GPR 12 = 00010400 8001B2DC 8001B36A 000FFED8 + Note that GPR14 is a return address but as we are real men we are going to trace the stack. -display 0x40 bytes after the stack pointer. 
+display 0x40 bytes after the stack pointer:: -V000FFED8 000FFF38 8001B838 80014C8E 000FFF38 -V000FFEE8 00000000 00000000 000003E0 00000000 -V000FFEF8 00100080 00100084 00000000 000FE000 -V000FFF08 00010400 8001B2DC 8001B36A 000FFED8 + V000FFED8 000FFF38 8001B838 80014C8E 000FFF38 + V000FFEE8 00000000 00000000 000003E0 00000000 + V000FFEF8 00100080 00100084 00000000 000FE000 + V000FFF08 00010400 8001B2DC 8001B36A 000FFED8 Ah now look at whats in sp+56 (sp+0x38) this is 8001B36A our saved r14 if you look above at our stackframe & also agrees with GPR14. -now backchain -d 000FFF38.40 -we now are taking the contents of SP to get our first backchain. +now backchain:: -V000FFF38 000FFFA0 00000000 00014995 00147094 -V000FFF48 00147090 001470A0 000003E0 00000000 -V000FFF58 00100080 00100084 00000000 001BF1D0 -V000FFF68 00010400 800149BA 80014CA6 000FFF38 + d 000FFF38.40 + +we now are taking the contents of SP to get our first backchain:: + + V000FFF38 000FFFA0 00000000 00014995 00147094 + V000FFF48 00147090 001470A0 000003E0 00000000 + V000FFF58 00100080 00100084 00000000 001BF1D0 + V000FFF68 00010400 800149BA 80014CA6 000FFF38 This displays a 2nd return address of 80014CA6 -now do d 000FFFA0.40 for our 3rd backchain +now do:: -V000FFFA0 04B52002 0001107F 00000000 00000000 -V000FFFB0 00000000 00000000 FF000000 0001107F -V000FFFC0 00000000 00000000 00000000 00000000 -V000FFFD0 00010400 80010802 8001085A 000FFFA0 + d 000FFFA0.40 + +for our 3rd backchain:: + + V000FFFA0 04B52002 0001107F 00000000 00000000 + V000FFFB0 00000000 00000000 FF000000 0001107F + V000FFFC0 00000000 00000000 00000000 00000000 + V000FFFD0 00010400 80010802 8001085A 000FFFA0 our 3rd return address is 8001085A @@ -1297,23 +1515,35 @@ our 3rd return address is 8001085A as the 04B52002 looks suspiciously like rubbish it is fair to assume that the kernel entry routines for the sake of optimisation don't set up a backchain. -now look at System.map to see if the addresses make any sense. 
+now look at System.map to see if the addresses make any sense:: + + grep -i 0001b3 System.map + +outputs among other things:: + + 0001b304 T cpu_idle -grep -i 0001b3 System.map -outputs among other things -0001b304 T cpu_idle so 8001B36A is cpu_idle+0x66 ( quiet the cpu is asleep, don't wake it ) +:: + + grep -i 00014 System.map + +produces among other things:: + + 00014a78 T start_kernel -grep -i 00014 System.map -produces among other things -00014a78 T start_kernel so 0014CA6 is start_kernel+some hex number I can't add in my head. -grep -i 00108 System.map -this produces -00010800 T _stext +:: + + grep -i 00108 System.map + +this produces:: + + 00010800 T _stext + so 8001085A is _stext+0x5a Congrats you've done your first backchain. @@ -1337,47 +1567,49 @@ system might be choking with around 64. Here is some of the common IO terminology: Subchannel: -This is the logical number most IO commands use to talk to an IO device. There -can be up to 0x10000 (65536) of these in a configuration, typically there are a -few hundred. Under VM for simplicity they are allocated contiguously, however -on the native hardware they are not. They typically stay consistent between -boots provided no new hardware is inserted or removed. -Under Linux for s390 we use these as IRQ's and also when issuing an IO command -(CLEAR SUBCHANNEL, HALT SUBCHANNEL, MODIFY SUBCHANNEL, RESUME SUBCHANNEL, -START SUBCHANNEL, STORE SUBCHANNEL and TEST SUBCHANNEL). We use this as the ID -of the device we wish to talk to. The most important of these instructions are -START SUBCHANNEL (to start IO), TEST SUBCHANNEL (to check whether the IO -completed successfully) and HALT SUBCHANNEL (to kill IO). A subchannel can have -up to 8 channel paths to a device, this offers redundancy if one is not -available. + This is the logical number most IO commands use to talk to an IO device. There + can be up to 0x10000 (65536) of these in a configuration, typically there are a + few hundred. 
Under VM for simplicity they are allocated contiguously, however + on the native hardware they are not. They typically stay consistent between + boots provided no new hardware is inserted or removed. + + Under Linux for s390 we use these as IRQ's and also when issuing an IO command + (CLEAR SUBCHANNEL, HALT SUBCHANNEL, MODIFY SUBCHANNEL, RESUME SUBCHANNEL, + START SUBCHANNEL, STORE SUBCHANNEL and TEST SUBCHANNEL). We use this as the ID + of the device we wish to talk to. The most important of these instructions are + START SUBCHANNEL (to start IO), TEST SUBCHANNEL (to check whether the IO + completed successfully) and HALT SUBCHANNEL (to kill IO). A subchannel can have + up to 8 channel paths to a device, this offers redundancy if one is not + available. Device Number: -This number remains static and is closely tied to the hardware. There are 65536 -of these, made up of a CHPID (Channel Path ID, the most significant 8 bits) and -another lsb 8 bits. These remain static even if more devices are inserted or -removed from the hardware. There is a 1 to 1 mapping between subchannels and -device numbers, provided devices aren't inserted or removed. + This number remains static and is closely tied to the hardware. There are 65536 + of these, made up of a CHPID (Channel Path ID, the most significant 8 bits) and + another lsb 8 bits. These remain static even if more devices are inserted or + removed from the hardware. There is a 1 to 1 mapping between subchannels and + device numbers, provided devices aren't inserted or removed. Channel Control Words: -CCWs are linked lists of instructions initially pointed to by an operation -request block (ORB), which is initially given to Start Subchannel (SSCH) -command along with the subchannel number for the IO subsystem to process -while the CPU continues executing normal code. -CCWs come in two flavours, Format 0 (24 bit for backward compatibility) and -Format 1 (31 bit). 
These are typically used to issue read and write (and many -other) instructions. They consist of a length field and an absolute address -field. -Each IO typically gets 1 or 2 interrupts, one for channel end (primary status) -when the channel is idle, and the second for device end (secondary status). -Sometimes you get both concurrently. You check how the IO went on by issuing a -TEST SUBCHANNEL at each interrupt, from which you receive an Interruption -response block (IRB). If you get channel and device end status in the IRB -without channel checks etc. your IO probably went okay. If you didn't you -probably need to examine the IRB, extended status word etc. -If an error occurs, more sophisticated control units have a facility known as -concurrent sense. This means that if an error occurs Extended sense information -will be presented in the Extended status word in the IRB. If not you have to -issue a subsequent SENSE CCW command after the test subchannel. + CCWs are linked lists of instructions initially pointed to by an operation + request block (ORB), which is initially given to Start Subchannel (SSCH) + command along with the subchannel number for the IO subsystem to process + while the CPU continues executing normal code. + CCWs come in two flavours, Format 0 (24 bit for backward compatibility) and + Format 1 (31 bit). These are typically used to issue read and write (and many + other) instructions. They consist of a length field and an absolute address + field. + + Each IO typically gets 1 or 2 interrupts, one for channel end (primary status) + when the channel is idle, and the second for device end (secondary status). + Sometimes you get both concurrently. You check how the IO went on by issuing a + TEST SUBCHANNEL at each interrupt, from which you receive an Interruption + response block (IRB). If you get channel and device end status in the IRB + without channel checks etc. your IO probably went okay. 
If you didn't you + probably need to examine the IRB, extended status word etc. + If an error occurs, more sophisticated control units have a facility known as + concurrent sense. This means that if an error occurs Extended sense information + will be presented in the Extended status word in the IRB. If not you have to + issue a subsequent SENSE CCW command after the test subchannel. TPI (Test pending interrupt) can also be used for polled IO, but in @@ -1388,58 +1620,62 @@ Store Subchannel and Modify Subchannel can be used to examine and modify operating characteristics of a subchannel (e.g. channel paths). Other IO related Terms: -Sysplex: S390's Clustering Technology -QDIO: S390's new high speed IO architecture to support devices such as gigabit -ethernet, this architecture is also designed to be forward compatible with -upcoming 64 bit machines. + +Sysplex: + S390's Clustering Technology +QDIO: + S390's new high speed IO architecture to support devices such as gigabit + ethernet, this architecture is also designed to be forward compatible with + upcoming 64 bit machines. -General Concepts +General Concepts +---------------- Input Output Processors (IOP's) are responsible for communicating between the mainframe CPU's & the channel & relieve the mainframe CPU's from the -burden of communicating with IO devices directly, this allows the CPU's to -concentrate on data processing. +burden of communicating with IO devices directly, this allows the CPU's to +concentrate on data processing. -IOP's can use one or more links ( known as channel paths ) to talk to each +IOP's can use one or more links ( known as channel paths ) to talk to each IO device. It first checks for path availability & chooses an available one, then starts ( & sometimes terminates IO ). There are two types of channel path: ESCON & the Parallel IO interface. 
IO devices are attached to control units, control units provide the -logic to interface the channel paths & channel path IO protocols to +logic to interface the channel paths & channel path IO protocols to the IO devices, they can be integrated with the devices or housed separately -& often talk to several similar devices ( typical examples would be raid -controllers or a control unit which connects to 1000 3270 terminals ). +& often talk to several similar devices ( typical examples would be raid +controllers or a control unit which connects to 1000 3270 terminals ):: - +---------------------------------------------------------------+ - | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ | - | | CPU | | CPU | | CPU | | CPU | | Main | | Expanded | | - | | | | | | | | | | Memory | | Storage | | - | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ | - |---------------------------------------------------------------+ - | IOP | IOP | IOP | - |--------------------------------------------------------------- - | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | - ---------------------------------------------------------------- - || || - || Bus & Tag Channel Path || ESCON - || ====================== || Channel - || || || || Path - +----------+ +----------+ +----------+ - | | | | | | - | CU | | CU | | CU | - | | | | | | - +----------+ +----------+ +----------+ - | | | | | -+----------+ +----------+ +----------+ +----------+ +----------+ -|I/O Device| |I/O Device| |I/O Device| |I/O Device| |I/O Device| -+----------+ +----------+ +----------+ +----------+ +----------+ - CPU = Central Processing Unit - C = Channel - IOP = IP Processor - CU = Control Unit + +---------------------------------------------------------------+ + | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ | + | | CPU | | CPU | | CPU | | CPU | | Main | | Expanded | | + | | | | | | | | | | Memory | | Storage | | + | +-----+ +-----+ +-----+ +-----+ +----------+ 
+----------+ | + |---------------------------------------------------------------+ + | IOP | IOP | IOP | + |--------------------------------------------------------------- + | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | + ---------------------------------------------------------------- + || || + || Bus & Tag Channel Path || ESCON + || ====================== || Channel + || || || || Path + +----------+ +----------+ +----------+ + | | | | | | + | CU | | CU | | CU | + | | | | | | + +----------+ +----------+ +----------+ + | | | | | + +----------+ +----------+ +----------+ +----------+ +----------+ + |I/O Device| |I/O Device| |I/O Device| |I/O Device| |I/O Device| + +----------+ +----------+ +----------+ +----------+ +----------+ + CPU = Central Processing Unit + C = Channel + IOP = IP Processor + CU = Control Unit The 390 IO systems come in 2 flavours the current 390 machines support both @@ -1447,7 +1683,7 @@ The Older 360 & 370 Interface,sometimes called the Parallel I/O interface, sometimes called Bus-and Tag & sometimes Original Equipment Manufacturers Interface (OEMI). -This byte wide Parallel channel path/bus has parity & data on the "Bus" cable +This byte wide Parallel channel path/bus has parity & data on the "Bus" cable and control lines on the "Tag" cable. These can operate in byte multiplex mode for sharing between several slow devices or burst mode and monopolize the channel for the whole burst. Up to 256 devices can be addressed on one of these @@ -1459,13 +1695,13 @@ support only transfer rates of 3.0, 2.0 & 1.0 MB/sec. One of these paths can be daisy chained to up to 8 control units. -ESCON if fibre optic it is also called FICON +ESCON if fibre optic it is also called FICON Was introduced by IBM in 1990. Has 2 fibre optic cables and uses either leds or lasers for communication at a signaling rate of up to 200 megabits/sec. 
As 10bits are transferred for every 8 bits info this drops to 160 megabits/sec and to 18.6 Megabytes/sec once control info and CRC are added. ESCON only operates in burst mode. - + ESCONs typical max cable length is 3km for the led version and 20km for the laser version known as XDF (extended distance facility). This can be further extended by using an ESCON director which triples the above mentioned ranges. @@ -1489,31 +1725,29 @@ Debugging IO on s/390 & z/Architecture under VM Now we are ready to go on with IO tracing commands under VM -A few self explanatory queries: -Q OSA -Q CTC -Q DISK ( This command is CMS specific ) -Q DASD +A few self explanatory queries:: + Q OSA + Q CTC + Q DISK ( This command is CMS specific ) + Q DASD +Q OSA on my machine returns:: - - - -Q OSA on my machine returns -OSA 7C08 ON OSA 7C08 SUBCHANNEL = 0000 -OSA 7C09 ON OSA 7C09 SUBCHANNEL = 0001 -OSA 7C14 ON OSA 7C14 SUBCHANNEL = 0002 -OSA 7C15 ON OSA 7C15 SUBCHANNEL = 0003 + OSA 7C08 ON OSA 7C08 SUBCHANNEL = 0000 + OSA 7C09 ON OSA 7C09 SUBCHANNEL = 0001 + OSA 7C14 ON OSA 7C14 SUBCHANNEL = 0002 + OSA 7C15 ON OSA 7C15 SUBCHANNEL = 0003 If you have a guest with certain privileges you may be able to see devices which don't belong to you. To avoid this, add the option V. -e.g. -Q V OSA +e.g.:: + + Q V OSA Now using the device numbers returned by this command we will Trace the io starting up on the first device 7c08 & 7c09 -In our simplest case we can trace the +In our simplest case we can trace the start subchannels like TR SSCH 7C08-7C09 or the halt subchannels @@ -1524,34 +1758,47 @@ A good trick is tracing all the IO's and CCWS and spooling them into the reader of another VM guest so he can ftp the logfile back to his own machine. I'll do a small bit of this and give you a look at the output. 
-1) Spool stdout to VM reader -SP PRT TO (another vm guest ) or * for the local vm guest -2) Fill the reader with the trace -TR IO 7c08-7c09 INST INT CCW PRT RUN -3) Start up linux -i 00c -4) Finish the trace -TR END -5) close the reader -C PRT -6) list reader contents -RDRLIST -7) copy it to linux4's minidisk -RECEIVE / LOG TXT A1 ( replace +1) Spool stdout to VM reader:: + + SP PRT TO (another vm guest ) or * for the local vm guest + +2) Fill the reader with the trace:: + + TR IO 7c08-7c09 INST INT CCW PRT RUN + +3) Start up linux:: + + i 00c +4) Finish the trace:: + + TR END + +5) close the reader:: + + C PRT + +6) list reader contents:: + + RDRLIST + +7) copy it to linux4's minidisk:: + + RECEIVE / LOG TXT A1 ( replace + 8) filel & press F11 to look at it -You should see something like: +You should see something like:: -00020942' SSCH B2334000 0048813C CC 0 SCH 0000 DEV 7C08 - CPA 000FFDF0 PARM 00E2C9C4 KEY 0 FPI C0 LPM 80 - CCW 000FFDF0 E4200100 00487FE8 0000 E4240100 ........ - IDAL 43D8AFE8 - IDAL 0FB76000 -00020B0A' I/O DEV 7C08 -> 000197BC' SCH 0000 PARM 00E2C9C4 -00021628' TSCH B2354000 >> 00488164 CC 0 SCH 0000 DEV 7C08 - CCWA 000FFDF8 DEV STS 0C SCH STS 00 CNT 00EC - KEY 0 FPI C0 CC 0 CTLS 4007 -00022238' STSCH B2344000 >> 00488108 CC 0 SCH 0000 DEV 7C08 + 00020942' SSCH B2334000 0048813C CC 0 SCH 0000 DEV 7C08 + CPA 000FFDF0 PARM 00E2C9C4 KEY 0 FPI C0 LPM 80 + CCW 000FFDF0 E4200100 00487FE8 0000 E4240100 ........ + IDAL 43D8AFE8 + IDAL 0FB76000 + 00020B0A' I/O DEV 7C08 -> 000197BC' SCH 0000 PARM 00E2C9C4 + 00021628' TSCH B2354000 >> 00488164 CC 0 SCH 0000 DEV 7C08 + CCWA 000FFDF8 DEV STS 0C SCH STS 00 CNT 00EC + KEY 0 FPI C0 CC 0 CTLS 4007 + 00022238' STSCH B2344000 >> 00488108 CC 0 SCH 0000 DEV 7C08 If you don't like messing up your readed ( because you possibly booted from it ) you can alternatively spool it to another readers guest. 
@@ -1563,43 +1810,58 @@ These commands are listed only because they have been of use to me in the past & may be of use to you too. For more complete info on each of the commands use type HELP from CMS. -detaching devices -DET -ATT -attach a device to guest * for your own guest -READY cause VM to issue a fake interrupt. -The VARY command is normally only available to VM administrators. -VARY ON PATH TO -VARY OFF PATH FROM +detaching devices:: + + DET + ATT + +attach a device to guest * for your own guest + +READY + cause VM to issue a fake interrupt. + +The VARY command is normally only available to VM administrators:: + + VARY ON PATH TO + VARY OFF PATH FROM + This is used to switch on or off channel paths to devices. Q CHPID -This displays state of devices using this channel path + This displays state of devices using this channel path + D SCHIB -This displays the subchannel information SCHIB block for the device. -this I believe is also only available to administrators. + This displays the subchannel information SCHIB block for the device. + this I believe is also only available to administrators. + DEFINE CTC -defines a virtual CTC channel to channel connection -2 need to be defined on each guest for the CTC driver to use. + defines a virtual CTC channel to channel connection + 2 need to be defined on each guest for the CTC driver to use. + COUPLE devno userid remote devno -Joins a local virtual device to a remote virtual device -( commonly used for the CTC driver ). + Joins a local virtual device to a remote virtual device + ( commonly used for the CTC driver ). + +Building a VM ramdisk under CMS which linux can use:: + + def vfb- -Building a VM ramdisk under CMS which linux can use -def vfb- blocksize is commonly 4096 for linux. 
-Formatting it -format (blksize -Sharing a disk between multiple guests -LINK userid devno1 devno2 mode password +Formatting it:: + + format (blksize + +Sharing a disk between multiple guests:: + + LINK userid devno1 devno2 mode password GDB on S390 =========== -N.B. if compiling for debugging gdb works better without optimisation +N.B. if compiling for debugging gdb works better without optimisation ( see Compiling programs for debugging ) invocation @@ -1609,113 +1871,169 @@ gdb Online help ----------- help: gives help on commands -e.g. -help -help display + +e.g.:: + + help + help display + Note gdb's online help is very good use it. Assembly -------- -info registers: displays registers other than floating point. -info all-registers: displays floating points as well. -disassemble: disassembles -e.g. -disassemble without parameters will disassemble the current function -disassemble $pc $pc+10 +info registers: + displays registers other than floating point. + +info all-registers: + displays floating points as well. + +disassemble: + disassembles + +e.g.:: + + disassemble without parameters will disassemble the current function + disassemble $pc $pc+10 Viewing & modifying variables ----------------------------- -print or p: displays variable or register +print or p: + displays variable or register + e.g. p/x $sp will display the stack pointer -display: prints variable or register each time program stops -e.g. -display/x $pc will display the program counter -display argc +display: + prints variable or register each time program stops -undisplay : undo's display's +e.g.:: -info breakpoints: shows all current breakpoints + display/x $pc will display the program counter + display argc -info stack: shows stack back trace (if this doesn't work too well, I'll show -you the stacktrace by hand below). +undisplay: + undo's display's -info locals: displays local variables. +info breakpoints: + shows all current breakpoints -info args: display current procedure arguments. 
+info stack: + shows stack back trace (if this doesn't work too well, I'll show + you the stacktrace by hand below). -set args: will set argc & argv each time the victim program is invoked. +info locals: + displays local variables. -set =value -set argc=100 -set $pc=0 +info args: + display current procedure arguments. + +set args: + will set argc & argv each time the victim program is invoked + +e.g.:: + + set =value + set argc=100 + set $pc=0 Modifying execution ------------------- -step: steps n lines of sourcecode -step steps 1 line. -step 100 steps 100 lines of code. +step: + steps n lines of sourcecode -next: like step except this will not step into subroutines +step + steps 1 line. -stepi: steps a single machine code instruction. -e.g. stepi 100 +step 100 + steps 100 lines of code. -nexti: steps a single machine code instruction but will not step into -subroutines. +next: + like step except this will not step into subroutines -finish: will run until exit of the current routine +stepi: + steps a single machine code instruction. -run: (re)starts a program +e.g.:: -cont: continues a program + stepi 100 -quit: exits gdb. +nexti: + steps a single machine code instruction but will not step into + subroutines. + +finish: + will run until exit of the current routine + +run: + (re)starts a program + +cont: + continues a program + +quit: + exits gdb. breakpoints ------------ break -sets a breakpoint -e.g. + sets a breakpoint -break main +e.g.:: -break *$pc - -break *0x400618 + break main + break *$pc + break *0x400618 Here's a really useful one for large programs + rbr -Set a breakpoint for all functions matching REGEXP -e.g. -rbr 390 + Set a breakpoint for all functions matching REGEXP + +e.g.:: + + rbr 390 + will set a breakpoint with all functions with 390 in their name. info breakpoints -lists all breakpoints + lists all breakpoints + +delete: + delete breakpoint by number or delete them all -delete: delete breakpoint by number or delete them all e.g. 
-delete 1 will delete the first breakpoint -delete will delete them all -watch: This will set a watchpoint ( usually hardware assisted ), +delete 1 + will delete the first breakpoint + + +delete + will delete them all + +watch: + This will set a watchpoint ( usually hardware assisted ), + This will watch a variable till it changes + e.g. -watch cnt, will watch the variable cnt till it changes. + +watch cnt + will watch the variable cnt till it changes. + As an aside unfortunately gdb's, architecture independent watchpoint code is inconsistent & not very good, watchpoints usually work but not always. -info watchpoints: Display currently active watchpoints +info watchpoints: + Display currently active watchpoints condition: ( another useful one ) -Specify breakpoint number N to break only if COND is true. -Usage is `condition N COND', where N is an integer and COND is an + Specify breakpoint number N to break only if COND is true. + +Usage is `condition N COND`, where N is an integer and COND is an expression to be evaluated whenever breakpoint N is reached. @@ -1723,41 +2041,51 @@ expression to be evaluated whenever breakpoint N is reached. User defined functions/macros ----------------------------- define: ( Note this is very very useful,simple & powerful ) + usage define end -examples which you should consider putting into .gdbinit in your home directory -define d -stepi -disassemble $pc $pc+10 -end +examples which you should consider putting into .gdbinit in your home +directory:: -define e -nexti -disassemble $pc $pc+10 -end + define d + stepi + disassemble $pc $pc+10 + end + define e + nexti + disassemble $pc $pc+10 + end Other hard to classify stuff ---------------------------- signal n: -sends the victim program a signal. -e.g. signal 3 will send a SIGQUIT. + sends the victim program a signal. + +e.g. `signal 3` will send a SIGQUIT. info signals: -what gdb does when the victim receives certain signals. + what gdb does when the victim receives certain signals. 
list: -e.g. -list lists current function source -list 1,10 list first 10 lines of current file. + +e.g.: + +list + lists current function source +list 1,10 + list first 10 lines of current file. + list test.c:1,10 directory: -Adds directories to be searched for source if gdb cannot find the source. -(note it is a bit sensitive about slashes) -e.g. To add the root of the filesystem to the searchpath do -directory // + Adds directories to be searched for source if gdb cannot find the source. + (note it is a bit sensitive about slashes) + +e.g. To add the root of the filesystem to the searchpath do:: + + directory // call @@ -1765,153 +2093,205 @@ This calls a function in the victim program, this is pretty powerful e.g. (gdb) call printf("hello world") outputs: -$1 = 11 +$1 = 11 You might now be thinking that the line above didn't work, something extra had to be done. (gdb) call fflush(stdout) hello world$2 = 0 -As an aside the debugger also calls malloc & free under the hood +As an aside the debugger also calls malloc & free under the hood to make space for the "hello world" string. hints ----- -1) command completion works just like bash -( if you are a bad typist like me this really helps ) +1) command completion works just like bash + ( if you are a bad typist like me this really helps ) + e.g. hit br & cursor up & down :-). 2) if you have a debugging problem that takes a few steps to recreate put the steps into a file called .gdbinit in your current working directory -if you have defined a few extra useful user defined commands put these in +if you have defined a few extra useful user defined commands put these in your home directory & they will be read each time gdb is launched. -A typical .gdbinit file might be. 
-break main -run -break runtime_exception -cont +A typical .gdbinit file might be:: + + break main + run + break runtime_exception + cont stack chaining in gdb by hand ----------------------------- -This is done using a the same trick described for VM -p/x (*($sp+56))&0x7fffffff get the first backchain. +This is done using the same trick described for VM:: + + p/x (*($sp+56))&0x7fffffff + +get the first backchain. For z/Architecture Replace 56 with 112 & ignore the &0x7fffffff in the macros below & do nasty casts to longs like the following as gdb unfortunately deals with printed arguments as ints which messes up everything. -i.e. here is a 3rd backchain dereference -p/x *(long *)(***(long ***)$sp+112) + +i.e. here is a 3rd backchain dereference:: + + p/x *(long *)(***(long ***)$sp+112) -this outputs -$5 = 0x528f18 +this outputs:: + + $5 = 0x528f18 + on my machine. -Now you can use -info symbol (*($sp+56))&0x7fffffff -you might see something like. -rl_getc + 36 in section .text telling you what is located at address 0x528f18 -Now do. -p/x (*(*$sp+56))&0x7fffffff -This outputs -$6 = 0x528ed0 -Now do. -info symbol (*(*$sp+56))&0x7fffffff -rl_read_key + 180 in section .text -now do -p/x (*(**$sp+56))&0x7fffffff + +Now you can use:: + + info symbol (*($sp+56))&0x7fffffff + +you might see something like:: + + rl_getc + 36 in section .text + +telling you what is located at address 0x528f18 +Now do:: + + p/x (*(*$sp+56))&0x7fffffff + +This outputs:: + + $6 = 0x528ed0 + +Now do:: + + info symbol (*(*$sp+56))&0x7fffffff + rl_read_key + 180 in section .text + +now do:: + + p/x (*(**$sp+56))&0x7fffffff + & so on. Disassembling instructions without debug info --------------------------------------------- gdb typically complains if there is a lack of debugging -symbols in the disassemble command with +symbols in the disassemble command with "No function contains specified address." To get around -this do -x/xi
-e.g. -x/20xi 0x400730 +this do:: + + x/xi
+ +e.g.:: + + x/20xi 0x400730 -Note: Remember gdb has history just like bash you don't need to retype the -whole line just use the up & down arrows. +Note: + Remember gdb has history just like bash you don't need to retype the + whole line just use the up & down arrows. For more info ------------- -From your linuxbox do -man gdb or info gdb. +From your linuxbox do:: + + man gdb + +or:: + + info gdb core dumps ---------- -What a core dump ?, + +What is a core dump ? +^^^^^^^^^^^^^^^^^^^^^ + A core dump is a file generated by the kernel (if allowed) which contains the registers and all active pages of the program which has crashed. + From this file gdb will allow you to look at the registers, stack trace and memory of the program as if it just crashed on your system. It is usually called core and created in the current working directory. + This is very useful in that a customer can mail a core dump to a technical support department and the technical support department can reconstruct what happened. Provided they have an identical copy of this program with debugging symbols compiled in and the source base of this build is available. + In short it is far more useful than something like a crash log could ever hope to be. -Why have I never seen one ?. -Probably because you haven't used the command -ulimit -c unlimited in bash -to allow core dumps, now do -ulimit -a +Why have I never seen one ? +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Probably because you haven't used the following command in bash:: + + ulimit -c unlimited + +to allow core dumps, now do:: + + ulimit -a + to verify that the limit was accepted. A sample core dump -To create this I'm going to do -ulimit -c unlimited -gdb -to launch gdb (my victim app. ) now be bad & do the following from another -telnet/xterm session to the same machine -ps -aux | grep gdb -kill -SIGSEGV -or alternatively use killall -SIGSEGV gdb if you have the killall command. -Now look at the core dump.
-./gdb core -Displays the following -GNU gdb 4.18 -Copyright 1998 Free Software Foundation, Inc. -GDB is free software, covered by the GNU General Public License, and you are -welcome to change it and/or distribute copies of it under certain conditions. -Type "show copying" to see the conditions. -There is absolutely no warranty for GDB. Type "show warranty" for details. -This GDB was configured as "s390-ibm-linux"... -Core was generated by `./gdb'. -Program terminated with signal 11, Segmentation fault. -Reading symbols from /usr/lib/libncurses.so.4...done. -Reading symbols from /lib/libm.so.6...done. -Reading symbols from /lib/libc.so.6...done. -Reading symbols from /lib/ld-linux.so.2...done. -#0 0x40126d1a in read () from /lib/libc.so.6 -Setting up the environment for debugging gdb. -Breakpoint 1 at 0x4dc6f8: file utils.c, line 471. -Breakpoint 2 at 0x4d87a4: file top.c, line 2609. -(top-gdb) info stack -#0 0x40126d1a in read () from /lib/libc.so.6 -#1 0x528f26 in rl_getc (stream=0x7ffffde8) at input.c:402 -#2 0x528ed0 in rl_read_key () at input.c:381 -#3 0x5167e6 in readline_internal_char () at readline.c:454 -#4 0x5168ee in readline_internal_charloop () at readline.c:507 -#5 0x51692c in readline_internal () at readline.c:521 -#6 0x5164fe in readline (prompt=0x7ffff810) - at readline.c:349 -#7 0x4d7a8a in command_line_input (prompt=0x564420 "(gdb) ", repeat=1, - annotation_suffix=0x4d6b44 "prompt") at top.c:2091 -#8 0x4d6cf0 in command_loop () at top.c:1345 -#9 0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635 + To create this I'm going to do:: + + ulimit -c unlimited + gdb + +to launch gdb (my victim app. ) now be bad & do the following from another +telnet/xterm session to the same machine:: + + ps -aux | grep gdb + kill -SIGSEGV + +or alternatively use `killall -SIGSEGV gdb` if you have the killall command. + +Now look at the core dump:: + + ./gdb core + +Displays the following:: + + GNU gdb 4.18 + Copyright 1998 Free Software Foundation, Inc. 
+ GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "s390-ibm-linux"... + Core was generated by `./gdb'. + Program terminated with signal 11, Segmentation fault. + Reading symbols from /usr/lib/libncurses.so.4...done. + Reading symbols from /lib/libm.so.6...done. + Reading symbols from /lib/libc.so.6...done. + Reading symbols from /lib/ld-linux.so.2...done. + #0 0x40126d1a in read () from /lib/libc.so.6 + Setting up the environment for debugging gdb. + Breakpoint 1 at 0x4dc6f8: file utils.c, line 471. + Breakpoint 2 at 0x4d87a4: file top.c, line 2609. + (top-gdb) info stack + #0 0x40126d1a in read () from /lib/libc.so.6 + #1 0x528f26 in rl_getc (stream=0x7ffffde8) at input.c:402 + #2 0x528ed0 in rl_read_key () at input.c:381 + #3 0x5167e6 in readline_internal_char () at readline.c:454 + #4 0x5168ee in readline_internal_charloop () at readline.c:507 + #5 0x51692c in readline_internal () at readline.c:521 + #6 0x5164fe in readline (prompt=0x7ffff810) + at readline.c:349 + #7 0x4d7a8a in command_line_input (prompt=0x564420 "(gdb) ", repeat=1, + annotation_suffix=0x4d6b44 "prompt") at top.c:2091 + #8 0x4d6cf0 in command_loop () at top.c:1345 + #9 0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635 LDD @@ -1919,27 +2299,32 @@ LDD This is a program which lists the shared libraries which a library needs, Note you also get the relocations of the shared library text segments which help when using objdump --source. -e.g. 
- ldd ./gdb -outputs -libncurses.so.4 => /usr/lib/libncurses.so.4 (0x40018000) -libm.so.6 => /lib/libm.so.6 (0x4005e000) -libc.so.6 => /lib/libc.so.6 (0x40084000) -/lib/ld-linux.so.2 => /lib/ld-linux.so.2 (0x40000000) + +e.g.:: + + ldd ./gdb + +outputs:: + + libncurses.so.4 => /usr/lib/libncurses.so.4 (0x40018000) + libm.so.6 => /lib/libm.so.6 (0x4005e000) + libc.so.6 => /lib/libc.so.6 (0x40084000) + /lib/ld-linux.so.2 => /lib/ld-linux.so.2 (0x40000000) Debugging shared libraries ========================== Most programs use shared libraries, however it can be very painful -when you single step instruction into a function like printf for the +when you single step instruction into a function like printf for the first time & you end up in functions like _dl_runtime_resolve this is -the ld.so doing lazy binding, lazy binding is a concept in ELF where -shared library functions are not loaded into memory unless they are +the ld.so doing lazy binding, lazy binding is a concept in ELF where +shared library functions are not loaded into memory unless they are actually used, great for saving memory but a pain to debug. -To get around this either relink the program -static or exit gdb type -export LD_BIND_NOW=true this will stop lazy binding & restart the gdb'ing + +To get around this either relink the program -static or exit gdb type +export LD_BIND_NOW=true this will stop lazy binding & restart the gdb'ing the program in question. - + Debugging modules @@ -1955,106 +2340,127 @@ It is a filesystem created by the kernel with files which are created on demand by the kernel if read, or can be used to modify kernel parameters, it is a powerful concept. -e.g. 
+e.g.:: + + cat /proc/sys/net/ipv4/ip_forward + +On my machine outputs:: + + 0 + +telling me ip_forwarding is not on to switch it on I can do:: + + echo 1 > /proc/sys/net/ipv4/ip_forward + +cat it again:: + + cat /proc/sys/net/ipv4/ip_forward + +On my machine now outputs:: + + 1 -cat /proc/sys/net/ipv4/ip_forward -On my machine outputs -0 -telling me ip_forwarding is not on to switch it on I can do -echo 1 > /proc/sys/net/ipv4/ip_forward -cat it again -cat /proc/sys/net/ipv4/ip_forward -On my machine now outputs -1 IP forwarding is on. + There is a lot of useful info in here best found by going in and having a look around, so I'll take you through some entries I consider important. All the processes running on the machine have their own entry defined by /proc/ -So lets have a look at the init process -cd /proc/1 -cat cmdline -emits -init [2] +So lets have a look at the init process:: + + cd /proc/1 + cat cmdline + +emits:: + + init [2] + +:: + + cd /proc/1/fd -cd /proc/1/fd This contains numerical entries of all the open files, -some of these you can cat e.g. stdout (2) +some of these you can cat e.g. 
stdout (2):: -cat /proc/29/maps -on my machine emits + cat /proc/29/maps -00400000-00478000 r-xp 00000000 5f:00 4103 /bin/bash -00478000-0047e000 rw-p 00077000 5f:00 4103 /bin/bash -0047e000-00492000 rwxp 00000000 00:00 0 -40000000-40015000 r-xp 00000000 5f:00 14382 /lib/ld-2.1.2.so -40015000-40016000 rw-p 00014000 5f:00 14382 /lib/ld-2.1.2.so -40016000-40017000 rwxp 00000000 00:00 0 -40017000-40018000 rw-p 00000000 00:00 0 -40018000-4001b000 r-xp 00000000 5f:00 14435 /lib/libtermcap.so.2.0.8 -4001b000-4001c000 rw-p 00002000 5f:00 14435 /lib/libtermcap.so.2.0.8 -4001c000-4010d000 r-xp 00000000 5f:00 14387 /lib/libc-2.1.2.so -4010d000-40111000 rw-p 000f0000 5f:00 14387 /lib/libc-2.1.2.so -40111000-40114000 rw-p 00000000 00:00 0 -40114000-4011e000 r-xp 00000000 5f:00 14408 /lib/libnss_files-2.1.2.so -4011e000-4011f000 rw-p 00009000 5f:00 14408 /lib/libnss_files-2.1.2.so -7fffd000-80000000 rwxp ffffe000 00:00 0 +on my machine emits:: + + 00400000-00478000 r-xp 00000000 5f:00 4103 /bin/bash + 00478000-0047e000 rw-p 00077000 5f:00 4103 /bin/bash + 0047e000-00492000 rwxp 00000000 00:00 0 + 40000000-40015000 r-xp 00000000 5f:00 14382 /lib/ld-2.1.2.so + 40015000-40016000 rw-p 00014000 5f:00 14382 /lib/ld-2.1.2.so + 40016000-40017000 rwxp 00000000 00:00 0 + 40017000-40018000 rw-p 00000000 00:00 0 + 40018000-4001b000 r-xp 00000000 5f:00 14435 /lib/libtermcap.so.2.0.8 + 4001b000-4001c000 rw-p 00002000 5f:00 14435 /lib/libtermcap.so.2.0.8 + 4001c000-4010d000 r-xp 00000000 5f:00 14387 /lib/libc-2.1.2.so + 4010d000-40111000 rw-p 000f0000 5f:00 14387 /lib/libc-2.1.2.so + 40111000-40114000 rw-p 00000000 00:00 0 + 40114000-4011e000 r-xp 00000000 5f:00 14408 /lib/libnss_files-2.1.2.so + 4011e000-4011f000 rw-p 00009000 5f:00 14408 /lib/libnss_files-2.1.2.so + 7fffd000-80000000 rwxp ffffe000 00:00 0 Showing us the shared libraries init uses where they are in memory & memory access permissions for each virtual memory area. /proc/1/cwd is a softlink to the current working directory. 
-/proc/1/root is the root of the filesystem for this process. + +/proc/1/root is the root of the filesystem for this process. /proc/1/mem is the current running processes memory which you can read & write to like a file. + strace uses this sometimes as it is a bit faster than the rather inefficient ptrace interface for peeking at DATA. +:: -cat status + cat status -Name: init -State: S (sleeping) -Pid: 1 -PPid: 0 -Uid: 0 0 0 0 -Gid: 0 0 0 0 -Groups: -VmSize: 408 kB -VmLck: 0 kB -VmRSS: 208 kB -VmData: 24 kB -VmStk: 8 kB -VmExe: 368 kB -VmLib: 0 kB -SigPnd: 0000000000000000 -SigBlk: 0000000000000000 -SigIgn: 7fffffffd7f0d8fc -SigCgt: 00000000280b2603 -CapInh: 00000000fffffeff -CapPrm: 00000000ffffffff -CapEff: 00000000fffffeff + Name: init + State: S (sleeping) + Pid: 1 + PPid: 0 + Uid: 0 0 0 0 + Gid: 0 0 0 0 + Groups: + VmSize: 408 kB + VmLck: 0 kB + VmRSS: 208 kB + VmData: 24 kB + VmStk: 8 kB + VmExe: 368 kB + VmLib: 0 kB + SigPnd: 0000000000000000 + SigBlk: 0000000000000000 + SigIgn: 7fffffffd7f0d8fc + SigCgt: 00000000280b2603 + CapInh: 00000000fffffeff + CapPrm: 00000000ffffffff + CapEff: 00000000fffffeff + + User PSW: 070de000 80414146 + task: 004b6000 tss: 004b62d8 ksp: 004b7ca8 pt_regs: 004b7f68 + User GPRS: + 00000400 00000000 0000000b 7ffffa90 + 00000000 00000000 00000000 0045d9f4 + 0045cafc 7ffffa90 7fffff18 0045cb08 + 00010400 804039e8 80403af8 7ffff8b0 + User ACRS: + 00000000 00000000 00000000 00000000 + 00000001 00000000 00000000 00000000 + 00000000 00000000 00000000 00000000 + 00000000 00000000 00000000 00000000 + Kernel BackChain CallChain BackChain CallChain + 004b7ca8 8002bd0c 004b7d18 8002b92c + 004b7db8 8005cd50 004b7e38 8005d12a + 004b7f08 80019114 -User PSW: 070de000 80414146 -task: 004b6000 tss: 004b62d8 ksp: 004b7ca8 pt_regs: 004b7f68 -User GPRS: -00000400 00000000 0000000b 7ffffa90 -00000000 00000000 00000000 0045d9f4 -0045cafc 7ffffa90 7fffff18 0045cb08 -00010400 804039e8 80403af8 7ffff8b0 -User ACRS: -00000000 00000000 00000000 00000000 
-00000001 00000000 00000000 00000000 -00000000 00000000 00000000 00000000 -00000000 00000000 00000000 00000000 -Kernel BackChain CallChain BackChain CallChain - 004b7ca8 8002bd0c 004b7d18 8002b92c - 004b7db8 8005cd50 004b7e38 8005d12a - 004b7f08 80019114 Showing among other things memory usage & status of some signals & the processes'es registers from the kernel task_structure as well as a backchain which may be useful if a process crashes @@ -2067,11 +2473,16 @@ debug feature Some of our drivers now support a "debug feature" in /proc/s390dbf see s390dbf.txt in the linux/Documentation directory for more info. -e.g. -to switch on the lcs "debug feature" -echo 5 > /proc/s390dbf/lcs/level -& then after the error occurred. -cat /proc/s390dbf/lcs/sprintf >/logfile + +e.g. +to switch on the lcs "debug feature":: + + echo 5 > /proc/s390dbf/lcs/level + +& then after the error occurred:: + + cat /proc/s390dbf/lcs/sprintf >/logfile + the logfile now contains some information which may help tech support resolve a problem in the field. @@ -2083,35 +2494,50 @@ ifconfig is a quite useful command it gives the current state of network drivers. If you suspect your network device driver is dead -one way to check is type -ifconfig +one way to check is type:: + + ifconfig + e.g. 
tr0 -You should see something like -tr0 Link encap:16/4 Mbps Token Ring (New) HWaddr 00:04:AC:20:8E:48 - inet addr:9.164.185.132 Bcast:9.164.191.255 Mask:255.255.224.0 - UP BROADCAST RUNNING MULTICAST MTU:2000 Metric:1 - RX packets:246134 errors:0 dropped:0 overruns:0 frame:0 - TX packets:5 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:100 + +You should see something like:: + + ifconfig tr0 + tr0 Link encap:16/4 Mbps Token Ring (New) HWaddr 00:04:AC:20:8E:48 + inet addr:9.164.185.132 Bcast:9.164.191.255 Mask:255.255.224.0 + UP BROADCAST RUNNING MULTICAST MTU:2000 Metric:1 + RX packets:246134 errors:0 dropped:0 overruns:0 frame:0 + TX packets:5 errors:0 dropped:0 overruns:0 carrier:0 + collisions:0 txqueuelen:100 if the device doesn't say up -try -/etc/rc.d/init.d/network start +try:: + + /etc/rc.d/init.d/network start + ( this starts the network stack & hopefully calls ifconfig tr0 up ). ifconfig looks at the output of /proc/net/dev and presents it in a more presentable form. + Now ping the device from a machine in the same subnet. + if the RX packets count & TX packets counts don't increment you probably have problems. -next -cat /proc/net/arp + +next:: + + cat /proc/net/arp + Do you see any hardware addresses in the cache if not you may have problems. -Next try -ping -c 5 i.e. the Bcast field above in the output of +Next try:: + + ping -c 5 + +i.e. the Bcast field above in the output of ifconfig. Do you see any replies from machines other than the local machine if not you may have problems. also if the TX packets count in ifconfig -hasn't incremented either you have serious problems in your driver -(e.g. the txbusy field of the network device being stuck on ) +hasn't incremented either you have serious problems in your driver +(e.g. the txbusy field of the network device being stuck on ) or you may have multiple network devices connected. @@ -2119,28 +2545,43 @@ chandev ------- There is a new device layer for channel devices, some drivers e.g. 
lcs are registered with this layer. + If the device uses the channel device layer you'll be -able to find what interrupts it uses & the current state +able to find what interrupts it uses & the current state of the device. + See the manpage chandev.8 &type cat /proc/chandev for more info. SysRq ===== This is now supported by linux for s/390 & z/Architecture. -To enable it do compile the kernel with -Kernel Hacking -> Magic SysRq Key Enabled -echo "1" > /proc/sys/kernel/sysrq -also type -echo "8" >/proc/sys/kernel/printk + +To enable it do compile the kernel with:: + + Kernel Hacking -> Magic SysRq Key Enabled + +Then:: + + echo "1" > /proc/sys/kernel/sysrq + +also type:: + + echo "8" >/proc/sys/kernel/printk + To make printk output go to console. -On 390 all commands are prefixed with -^- -e.g. -^-t will show tasks. -^-? or some unknown command will display help. + +On 390 all commands are prefixed with:: + + ^- + +e.g.:: + + ^-t will show tasks. + ^-? or some unknown command will display help. + The sysrq key reading is very picky ( I have to type the keys in an - xterm session & paste them into the x3270 console ) +xterm session & paste them into the x3270 console ) & it may be wise to predefine the keys as described in the VM hints above This is particularly useful for syncing disks unmounting & rebooting @@ -2150,19 +2591,19 @@ Read Documentation/admin-guide/sysrq.rst for more info References: =========== -Enterprise Systems Architecture Reference Summary -Enterprise Systems Architecture Principles of Operation -Hartmut Penners s390 stack frame sheet. -IBM Mainframe Channel Attachment a technology brief from a CISCO webpage -Various bits of man & info pages of Linux. -Linux & GDB source. -Various info & man pages. -CMS Help on tracing commands. 
-Linux for s/390 Elf Application Binary Interface -Linux for z/Series Elf Application Binary Interface ( Both Highly Recommended ) -z/Architecture Principles of Operation SA22-7832-00 -Enterprise Systems Architecture/390 Reference Summary SA22-7209-01 & the -Enterprise Systems Architecture/390 Principles of Operation SA22-7201-05 +- Enterprise Systems Architecture Reference Summary +- Enterprise Systems Architecture Principles of Operation +- Hartmut Penners s390 stack frame sheet. +- IBM Mainframe Channel Attachment a technology brief from a CISCO webpage +- Various bits of man & info pages of Linux. +- Linux & GDB source. +- Various info & man pages. +- CMS Help on tracing commands. +- Linux for s/390 Elf Application Binary Interface +- Linux for z/Series Elf Application Binary Interface ( Both Highly Recommended ) +- z/Architecture Principles of Operation SA22-7832-00 +- Enterprise Systems Architecture/390 Reference Summary SA22-7209-01 & the +- Enterprise Systems Architecture/390 Principles of Operation SA22-7201-05 Special Thanks ============== diff --git a/Documentation/s390/driver-model.txt b/Documentation/s390/driver-model.rst similarity index 73% rename from Documentation/s390/driver-model.txt rename to Documentation/s390/driver-model.rst index ed265cf54cde..ad4bc2dbea43 100644 --- a/Documentation/s390/driver-model.txt +++ b/Documentation/s390/driver-model.rst @@ -1,5 +1,6 @@ +============================= S/390 driver model interfaces ------------------------------ +============================= 1. CCW devices -------------- @@ -7,13 +8,13 @@ S/390 driver model interfaces All devices which can be addressed by means of ccws are called 'CCW devices' - even if they aren't actually driven by ccws. 
-All ccw devices are accessed via a subchannel, this is reflected in the -structures under devices/: +All ccw devices are accessed via a subchannel, this is reflected in the +structures under devices/:: -devices/ + devices/ - system/ - css0/ - - 0.0.0000/0.0.0815/ + - 0.0.0000/0.0.0815/ - 0.0.0001/0.0.4711/ - 0.0.0002/ - 0.1.0000/0.1.1234/ @@ -35,14 +36,18 @@ be found under bus/ccw/devices/. All ccw devices export some data via sysfs. -cutype: The control unit type / model. +cutype: + The control unit type / model. -devtype: The device type / model, if applicable. +devtype: + The device type / model, if applicable. -availability: Can be 'good' or 'boxed'; 'no path' or 'no device' for +availability: + Can be 'good' or 'boxed'; 'no path' or 'no device' for disconnected devices. -online: An interface to set the device online and offline. +online: + An interface to set the device online and offline. In the special case of the device being disconnected (see the notify function under 1.2), piping 0 to online will forcibly delete the device. @@ -52,9 +57,11 @@ The device drivers can add entries to export per-device data and interfaces. There is also some data exported on a per-subchannel basis (see under bus/css/devices/): -chpids: Via which chpids the device is connected. +chpids: + Via which chpids the device is connected. -pimpampom: The path installed, path available and path operational masks. +pimpampom: + The path installed, path available and path operational masks. There also might be additional data, for example for block devices. @@ -74,77 +81,93 @@ b. After a. has been performed, if necessary, the device is finally brought up ------------------------------------ The basic struct ccw_device and struct ccw_driver data structures can be found -under include/asm/ccwdev.h. 
+under include/asm/ccwdev.h:: -struct ccw_device { - spinlock_t *ccwlock; - struct ccw_device_private *private; - struct ccw_device_id id; + struct ccw_device { + spinlock_t *ccwlock; + struct ccw_device_private *private; + struct ccw_device_id id; - struct ccw_driver *drv; - struct device dev; + struct ccw_driver *drv; + struct device dev; int online; void (*handler) (struct ccw_device *dev, unsigned long intparm, - struct irb *irb); -}; + struct irb *irb); + }; -struct ccw_driver { - struct module *owner; - struct ccw_device_id *ids; - int (*probe) (struct ccw_device *); + struct ccw_driver { + struct module *owner; + struct ccw_device_id *ids; + int (*probe) (struct ccw_device *); int (*remove) (struct ccw_device *); int (*set_online) (struct ccw_device *); int (*set_offline) (struct ccw_device *); int (*notify) (struct ccw_device *, int); struct device_driver driver; char *name; -}; + }; The 'private' field contains data needed for internal i/o operation only, and is not available to the device driver. Each driver should declare in a MODULE_DEVICE_TABLE into which CU types/models and/or device types/models it is interested. This information can later be found -in the struct ccw_device_id fields: +in the struct ccw_device_id fields:: -struct ccw_device_id { - __u16 match_flags; + struct ccw_device_id { + __u16 match_flags; - __u16 cu_type; - __u16 dev_type; - __u8 cu_model; - __u8 dev_model; + __u16 cu_type; + __u16 dev_type; + __u8 cu_model; + __u8 dev_model; unsigned long driver_info; -}; + }; The functions in ccw_driver should be used in the following way: -probe: This function is called by the device layer for each device the driver + +probe: + This function is called by the device layer for each device the driver is interested in. The driver should only allocate private structures to put in dev->driver_data and create attributes (if needed). Also, the interrupt handler (see below) should be set here. 
-int (*probe) (struct ccw_device *cdev); +:: -Parameters: cdev - the device to be probed. + int (*probe) (struct ccw_device *cdev); + +Parameters: + cdev + - the device to be probed. -remove: This function is called by the device layer upon removal of the driver, +remove: + This function is called by the device layer upon removal of the driver, the device or the module. The driver should perform cleanups here. -int (*remove) (struct ccw_device *cdev); +:: -Parameters: cdev - the device to be removed. + int (*remove) (struct ccw_device *cdev); + +Parameters: + cdev + - the device to be removed. -set_online: This function is called by the common I/O layer when the device is +set_online: + This function is called by the common I/O layer when the device is activated via the 'online' attribute. The driver should finally setup and activate the device here. -int (*set_online) (struct ccw_device *); +:: -Parameters: cdev - the device to be activated. The common layer has + int (*set_online) (struct ccw_device *); + +Parameters: + cdev + - the device to be activated. The common layer has verified that the device is not already online. @@ -152,15 +175,22 @@ set_offline: This function is called by the common I/O layer when the device is de-activated via the 'online' attribute. The driver should shut down the device, but not de-allocate its private data. -int (*set_offline) (struct ccw_device *); +:: -Parameters: cdev - the device to be deactivated. The common layer has + int (*set_offline) (struct ccw_device *); + +Parameters: + cdev + - the device to be deactivated. The common layer has verified that the device is online. -notify: This function is called by the common I/O layer for some state changes +notify: + This function is called by the common I/O layer for some state changes of the device. + Signalled to the driver are: + * In online state, device detached (CIO_GONE) or last path gone (CIO_NO_PATH). 
The driver must return !0 to keep the device; for return code 0, the device will be deleted as usual (also when no @@ -173,32 +203,40 @@ notify: This function is called by the common I/O layer for some state changes return code of the notify function the device driver signals if it wants the device back: !0 for keeping, 0 to make the device being removed and re-registered. - -int (*notify) (struct ccw_device *, int); -Parameters: cdev - the device whose state changed. - event - the event that happened. This can be one of CIO_GONE, - CIO_NO_PATH or CIO_OPER. +:: + + int (*notify) (struct ccw_device *, int); + +Parameters: + cdev + - the device whose state changed. + + event + - the event that happened. This can be one of CIO_GONE, + CIO_NO_PATH or CIO_OPER. The handler field of the struct ccw_device is meant to be set to the interrupt -handler for the device. In order to accommodate drivers which use several +handler for the device. In order to accommodate drivers which use several distinct handlers (e.g. multi subchannel devices), this is a member of ccw_device instead of ccw_driver. The handler is registered with the common layer during set_online() processing before the driver is called, and is deregistered during set_offline() after the -driver has been called. Also, after registering / before deregistering, path +driver has been called. Also, after registering / before deregistering, path grouping resp. disbanding of the path group (if applicable) are performed. -void (*handler) (struct ccw_device *dev, unsigned long intparm, struct irb *irb); +:: -Parameters: dev - the device the handler is called for + void (*handler) (struct ccw_device *dev, unsigned long intparm, struct irb *irb); + +Parameters: dev - the device the handler is called for intparm - the intparm which allows the device driver to identify - the i/o the interrupt is associated with, or to recognize - the interrupt as unsolicited. 
- irb - interruption response block which contains the accumulated - status. + the i/o the interrupt is associated with, or to recognize + the interrupt as unsolicited. + irb - interruption response block which contains the accumulated + status. -The device driver is called from the common ccw_device layer and can retrieve +The device driver is called from the common ccw_device layer and can retrieve information about the interrupt from the irb parameter. @@ -237,23 +275,27 @@ only the logical state and not the physical state, since we cannot track the latter consistently due to lacking machine support (we don't need to be aware of it anyway). -status - Can be 'online' or 'offline'. +status + - Can be 'online' or 'offline'. Piping 'on' or 'off' sets the chpid logically online/offline. Piping 'on' to an online chpid triggers path reprobing for all devices the chpid connects to. This can be used to force the kernel to re-use a channel path the user knows to be online, but the machine hasn't created a machine check for. -type - The physical type of the channel path. +type + - The physical type of the channel path. -shared - Whether the channel path is shared. +shared + - Whether the channel path is shared. -cmg - The channel measurement group. +cmg + - The channel measurement group. 3. System devices ----------------- -3.1 xpram +3.1 xpram --------- xpram shows up under devices/system/ as 'xpram'. @@ -279,9 +321,8 @@ Netiucv connections show up under devices/iucv/ as "netiucv". The interfa number is assigned sequentially to the connections defined via the 'connection' attribute. -user - shows the connection partner. - -buffer - maximum buffer size. - Pipe to it to change buffer size. - +user + - shows the connection partner. +buffer + - maximum buffer size. Pipe to it to change buffer size. 
diff --git a/Documentation/s390/index.rst b/Documentation/s390/index.rst new file mode 100644 index 000000000000..1a914da2a07b --- /dev/null +++ b/Documentation/s390/index.rst @@ -0,0 +1,30 @@ +:orphan: + +================= +s390 Architecture +================= + +.. toctree:: + :maxdepth: 1 + + cds + 3270 + debugging390 + driver-model + monreader + qeth + s390dbf + vfio-ap + vfio-ccw + zfcpdump + dasd + common_io + + text_files + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/s390/monreader.txt b/Documentation/s390/monreader.rst similarity index 81% rename from Documentation/s390/monreader.txt rename to Documentation/s390/monreader.rst index d3729585fdb0..1e857575c113 100644 --- a/Documentation/s390/monreader.txt +++ b/Documentation/s390/monreader.rst @@ -1,24 +1,26 @@ +================================================= +Linux API for read access to z/VM Monitor Records +================================================= Date : 2004-Nov-26 + Author: Gerald Schaefer (geraldsc@de.ibm.com) - Linux API for read access to z/VM Monitor Records - ================================================= Description =========== This item delivers a new Linux API in the form of a misc char device that is usable from user space and allows read access to the z/VM Monitor Records -collected by the *MONITOR System Service of z/VM. +collected by the `*MONITOR` System Service of z/VM. User Requirements ================= The z/VM guest on which you want to access this API needs to be configured in -order to allow IUCV connections to the *MONITOR service, i.e. it needs the -IUCV *MONITOR statement in its user entry. If the monitor DCSS to be used is +order to allow IUCV connections to the `*MONITOR` service, i.e. it needs the +IUCV `*MONITOR` statement in its user entry. If the monitor DCSS to be used is restricted (likely), you also need the NAMESAVE statement. 
This item will use the IUCV device driver to access the z/VM services, so you need a kernel with IUCV support. You also need z/VM version 4.4 or 5.1. @@ -50,7 +52,9 @@ Your guest virtual storage has to end below the starting address of the DCSS and you have to specify the "mem=" kernel parameter in your parmfile with a value greater than the ending address of the DCSS. -Example: DEF STOR 140M +Example:: + + DEF STOR 140M This defines 140MB storage size for your guest, the parameter "mem=160M" is added to the parmfile. @@ -66,24 +70,27 @@ kernel, the kernel parameter "monreader.mondcss=" can be specified in the parmfile. The default name for the DCSS is "MONDCSS" if none is specified. In case that -there are other users already connected to the *MONITOR service (e.g. +there are other users already connected to the `*MONITOR` service (e.g. Performance Toolkit), the monitor DCSS is already defined and you have to use the same DCSS. The CP command Q MONITOR (Class E privileged) shows the name of the monitor DCSS, if already defined, and the users connected to the -*MONITOR service. +`*MONITOR` service. Refer to the "z/VM Performance" book (SC24-6109-00) on how to create a monitor DCSS if your z/VM doesn't have one already, you need Class E privileges to define and save a DCSS. Example: -------- -modprobe monreader mondcss=MYDCSS + +:: + + modprobe monreader mondcss=MYDCSS This loads the module and sets the DCSS name to "MYDCSS". NOTE: ----- -This API provides no interface to control the *MONITOR service, e.g. specify +This API provides no interface to control the `*MONITOR` service, e.g. specify which data should be collected. This can be done by the CP command MONITOR (Class E privileged), see "CP Command and Utility Reference". @@ -98,6 +105,7 @@ If your distribution does not support udev, a device node will not be created automatically and you have to create it manually after loading the module. Therefore you need to know the major and minor numbers of the device. 
These numbers can be found in /sys/class/misc/monreader/dev. + Typing cat /sys/class/misc/monreader/dev will give an output of the form :. The device node can be created via the mknod command, enter mknod c , where is the name of the device node @@ -105,10 +113,13 @@ to be created. Example: -------- -# modprobe monreader -# cat /sys/class/misc/monreader/dev -10:63 -# mknod /dev/monreader c 10 63 + +:: + + # modprobe monreader + # cat /sys/class/misc/monreader/dev + 10:63 + # mknod /dev/monreader c 10 63 This loads the module with the default monitor DCSS (MONDCSS) and creates a device node. @@ -133,20 +144,21 @@ last byte of data. The start address is needed to handle "end-of-frame" records correctly (domain 1, record 13), i.e. it can be used to determine the record start offset relative to a 4K page (frame) boundary. -See "Appendix A: *MONITOR" in the "z/VM Performance" document for a description +See "Appendix A: `*MONITOR`" in the "z/VM Performance" document for a description of the monitor control element layout. The layout of the monitor records can be found here (z/VM 5.1): http://www.vm.ibm.com/pubs/mon510/index.html -The layout of the data stream provided by the monreader device is as follows: -... -<0 byte read> - \ - | -... |- data set - | - / -<0 byte read> -... +The layout of the data stream provided by the monreader device is as follows:: + + ... + <0 byte read> + \ + | + ... |- data set + | + / + <0 byte read> + ... There may be more than one combination of MCE and corresponding record set within one data set and the end of each data set is indicated by a successful @@ -165,15 +177,19 @@ As with most char devices, error conditions are indicated by returning a negative value for the number of bytes read. 
In this case, the errno variable indicates the error condition: -EIO: reply failed, read data is invalid and the application +EIO: + reply failed, read data is invalid and the application should discard the data read since the last successful read with 0 size. -EFAULT: copy_to_user failed, read data is invalid and the application should - discard the data read since the last successful read with 0 size. -EAGAIN: occurs on a non-blocking read if there is no data available at the - moment. There is no data missing or corrupted, just try again or rather - use polling for non-blocking reads. -EOVERFLOW: message limit reached, the data read since the last successful - read with 0 size is valid but subsequent records may be missing. +EFAULT: + copy_to_user failed, read data is invalid and the application should + discard the data read since the last successful read with 0 size. +EAGAIN: + occurs on a non-blocking read if there is no data available at the + moment. There is no data missing or corrupted, just try again or rather + use polling for non-blocking reads. +EOVERFLOW: + message limit reached, the data read since the last successful + read with 0 size is valid but subsequent records may be missing. In the last case (EOVERFLOW) there may be missing data, in the first two cases (EIO, EFAULT) there will be missing data. It's up to the application if it will @@ -183,7 +199,7 @@ Open: ----- Only one user is allowed to open the char device. If it is already in use, the open function will fail (return a negative value) and set errno to EBUSY. -The open function may also fail if an IUCV connection to the *MONITOR service +The open function may also fail if an IUCV connection to the `*MONITOR` service cannot be established. In this case errno will be set to EIO and an error message with an IPUSER SEVER code will be printed into syslog. The IPUSER SEVER codes are described in the "z/VM Performance" book, Appendix A. 
@@ -194,4 +210,3 @@ As soon as the device is opened, incoming messages will be accepted and they will account for the message limit, i.e. opening the device without reading from it will provoke the "message limit reached" error (EOVERFLOW error code) eventually. - diff --git a/Documentation/s390/qeth.txt b/Documentation/s390/qeth.rst similarity index 62% rename from Documentation/s390/qeth.txt rename to Documentation/s390/qeth.rst index aa06fcf5f8c2..f02fdaa68de0 100644 --- a/Documentation/s390/qeth.txt +++ b/Documentation/s390/qeth.rst @@ -1,8 +1,12 @@ +============================= IBM s390 QDIO Ethernet Driver +============================= OSA and HiperSockets Bridge Port Support +======================================== Uevents +------- To generate the events the device must be assigned a role of either a primary or a secondary Bridge Port. For more information, see @@ -13,12 +17,15 @@ of some configured Bridge Port device on the channel changes, a udev event with ACTION=CHANGE is emitted on behalf of the corresponding ccwgroup device. The event has the following attributes: -BRIDGEPORT=statechange - indicates that the Bridge Port device changed +BRIDGEPORT=statechange + indicates that the Bridge Port device changed its state. -ROLE={primary|secondary|none} - the role assigned to the port. +ROLE={primary|secondary|none} + the role assigned to the port. -STATE={active|standby|inactive} - the newly assumed state of the port. +STATE={active|standby|inactive} + the newly assumed state of the port. When run on HiperSockets Bridge Capable Port hardware with host address notifications enabled, a udev event with ACTION=CHANGE is emitted. @@ -26,25 +33,32 @@ It is emitted on behalf of the corresponding ccwgroup device when a host or a VLAN is registered or unregistered on the network served by the device. 
The event has the following attributes: -BRIDGEDHOST={reset|register|deregister|abort} - host address +BRIDGEDHOST={reset|register|deregister|abort} + host address notifications are started afresh, a new host or VLAN is registered or deregistered on the Bridge Port HiperSockets channel, or address notifications are aborted. -VLAN=numeric-vlan-id - VLAN ID on which the event occurred. Not included +VLAN=numeric-vlan-id + VLAN ID on which the event occurred. Not included if no VLAN is involved in the event. -MAC=xx:xx:xx:xx:xx:xx - MAC address of the host that is being registered +MAC=xx:xx:xx:xx:xx:xx + MAC address of the host that is being registered or deregistered from the HiperSockets channel. Not reported if the event reports the creation or destruction of a VLAN. -NTOK_BUSID=x.y.zzzz - device bus ID (CSSID, SSID and device number). +NTOK_BUSID=x.y.zzzz + device bus ID (CSSID, SSID and device number). -NTOK_IID=xx - device IID. +NTOK_IID=xx + device IID. -NTOK_CHPID=xx - device CHPID. +NTOK_CHPID=xx + device CHPID. -NTOK_CHID=xxxx - device channel ID. +NTOK_CHID=xxxx + device channel ID. -Note that the NTOK_* attributes refer to devices other than the one +Note that the `NTOK_*` attributes refer to devices other than the one connected to the system on which the OS is running. diff --git a/Documentation/s390/s390dbf.rst b/Documentation/s390/s390dbf.rst new file mode 100644 index 000000000000..ec2a1faa414b --- /dev/null +++ b/Documentation/s390/s390dbf.rst @@ -0,0 +1,803 @@ +================== +S390 Debug Feature +================== + +files: + - arch/s390/kernel/debug.c + - arch/s390/include/asm/debug.h + +Description: +------------ +The goal of this feature is to provide a kernel debug logging API +where log records can be stored efficiently in memory, where each component +(e.g. device drivers) can have one separate debug log. +One purpose of this is to inspect the debug logs after a production system crash +in order to analyze the reason for the crash. 
+ +If the system still runs but only a subcomponent which uses dbf fails, +it is possible to look at the debug logs on a live system via the Linux +debugfs filesystem. + +The debug feature may also be very useful for kernel and driver development. + +Design: +------- +Kernel components (e.g. device drivers) can register themselves at the debug +feature with the function call debug_register(). This function initializes a +debug log for the caller. For each debug log exists a number of debug areas +where exactly one is active at one time. Each debug area consists of contiguous +pages in memory. In the debug areas there are stored debug entries (log records) +which are written by event- and exception-calls. + +An event-call writes the specified debug entry to the active debug +area and updates the log pointer for the active area. If the end +of the active debug area is reached, a wrap around is done (ring buffer) +and the next debug entry will be written at the beginning of the active +debug area. + +An exception-call writes the specified debug entry to the log and +switches to the next debug area. This is done in order to be sure +that the records which describe the origin of the exception are not +overwritten when a wrap around for the current area occurs. + +The debug areas themselves are also ordered in form of a ring buffer. +When an exception is thrown in the last debug area, the following debug +entries are then written again in the very first area. + +There are three versions for the event- and exception-calls: One for +logging raw data, one for text and one for numbers. + +Each debug entry contains the following data: + +- Timestamp +- Cpu-Number of calling task +- Level of debug entry (0...6) +- Return Address to caller +- Flag, if entry is an exception or not + +The debug logs can be inspected in a live system through entries in +the debugfs-filesystem.
Under the toplevel directory "s390dbf" there is +a directory for each registered component, which is named like the +corresponding component. The debugfs normally should be mounted to +/sys/kernel/debug therefore the debug feature can be accessed under +/sys/kernel/debug/s390dbf. + +The content of the directories are files which represent different views +to the debug log. Each component can decide which views should be +used through registering them with the function debug_register_view(). +Predefined views for hex/ascii, sprintf and raw binary data are provided. +It is also possible to define other views. The content of +a view can be inspected simply by reading the corresponding debugfs file. + +All debug logs have an actual debug level (range from 0 to 6). +The default level is 3. Event and Exception functions have a 'level' +parameter. Only debug entries with a level that is lower or equal +than the actual level are written to the log. This means, when +writing events, high priority log entries should have a low level +value whereas low priority entries should have a high one. +The actual debug level can be changed with the help of the debugfs-filesystem +through writing a number string "x" to the 'level' debugfs file which is +provided for every debug log. Debugging can be switched off completely +by using "-" on the 'level' debugfs file. + +Example:: + + > echo "-" > /sys/kernel/debug/s390dbf/dasd/level + +It is also possible to deactivate the debug feature globally for every +debug log. You can change the behavior using 2 sysctl parameters in +/proc/sys/s390dbf: + +There are currently 2 possible triggers, which stop the debug feature +globally. The first possibility is to use the "debug_active" sysctl. If +set to 1 the debug feature is running. If "debug_active" is set to 0 the +debug feature is turned off. + +The second trigger which stops the debug feature is a kernel oops. 
+That prevents the debug feature from overwriting debug information that +happened before the oops. After an oops you can reactivate the debug feature +by piping 1 to /proc/sys/s390dbf/debug_active. Nevertheless, it's not +suggested to use an oopsed kernel in a production environment. + +If you want to disallow the deactivation of the debug feature, you can use +the "debug_stoppable" sysctl. If you set "debug_stoppable" to 0 the debug +feature cannot be stopped. If the debug feature is already stopped, it +will stay deactivated. + +---------------------------------------------------------------------------- + +Kernel Interfaces: +------------------ + +:: + + debug_info_t *debug_register(char *name, int pages, int nr_areas, + int buf_size); + +Parameter: + name: + Name of debug log (e.g. used for debugfs entry) + pages: + Number of pages, which will be allocated per area + nr_areas: + Number of debug areas + buf_size: + Size of data area in each debug entry + +Return Value: + Handle for generated debug area + + NULL if register failed + +Description: Allocates memory for a debug log + Must not be called within an interrupt handler + +---------------------------------------------------------------------------- + +:: + + debug_info_t *debug_register_mode(char *name, int pages, int nr_areas, + int buf_size, mode_t mode, uid_t uid, + gid_t gid); + +Parameter: + name: + Name of debug log (e.g. used for debugfs entry) + pages: + Number of pages, which will be allocated per area + nr_areas: + Number of debug areas + buf_size: + Size of data area in each debug entry + mode: + File mode for debugfs files. E.g. S_IRWXUGO + uid: + User ID for debugfs files. Currently only 0 is + supported. + gid: + Group ID for debugfs files. Currently only 0 is + supported.
+ +Return Value: + Handle for generated debug area + + NULL if register failed + +Description: + Allocates memory for a debug log + Must not be called within an interrupt handler + +--------------------------------------------------------------------------- + +:: + + void debug_unregister (debug_info_t * id); + +Parameter: + id: + handle for debug log + +Return Value: + none + +Description: + frees memory for a debug log and removes all registered debug + views. + + Must not be called within an interrupt handler + +--------------------------------------------------------------------------- + +:: + + void debug_set_level (debug_info_t * id, int new_level); + +Parameter: id: handle for debug log + new_level: new debug level + +Return Value: + none + +Description: + Sets new actual debug level if new_level is valid. + +--------------------------------------------------------------------------- + +:: + + bool debug_level_enabled (debug_info_t * id, int level); + +Parameter: + id: + handle for debug log + level: + debug level + +Return Value: + True if level is less or equal to the current debug level. + +Description: + Returns true if debug events for the specified level would be + logged. Otherwise returns false. + +--------------------------------------------------------------------------- + +:: + + void debug_stop_all(void); + +Parameter: + none + +Return Value: + none + +Description: + stops the debug feature if stopping is allowed. Currently + used in case of a kernel oops. 
+ +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_event (debug_info_t* id, int level, void* data, + int length); + +Parameter: + id: + handle for debug log + level: + debug level + data: + pointer to data for debug entry + length: + length of data in bytes + +Return Value: + Address of written debug entry + +Description: + writes debug entry to active debug area (if level <= actual + debug level) + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_int_event (debug_info_t * id, int level, + unsigned int data); + debug_entry_t* debug_long_event(debug_info_t * id, int level, + unsigned long data); + +Parameter: + id: + handle for debug log + level: + debug level + data: + integer value for debug entry + +Return Value: + Address of written debug entry + +Description: + writes debug entry to active debug area (if level <= actual + debug level) + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_text_event (debug_info_t * id, int level, + const char* data); + +Parameter: + id: + handle for debug log + level: + debug level + data: + string for debug entry + +Return Value: + Address of written debug entry + +Description: + writes debug entry in ascii format to active debug area + (if level <= actual debug level) + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_sprintf_event (debug_info_t * id, int level, + char* string,...); + +Parameter: + id: + handle for debug log + level: + debug level + string: + format string for debug entry + ...: + varargs used as in sprintf() + +Return Value: Address of written debug entry + +Description: + writes debug entry with format string and varargs (longs) to + active debug area (if level $<=$ actual debug level). + floats and long long datatypes cannot be used as varargs. 
+ +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_exception (debug_info_t* id, int level, void* data, + int length); + +Parameter: + id: + handle for debug log + level: + debug level + data: + pointer to data for debug entry + length: + length of data in bytes + +Return Value: + Address of written debug entry + +Description: + writes debug entry to active debug area (if level <= actual + debug level) and switches to next debug area + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_int_exception (debug_info_t * id, int level, + unsigned int data); + debug_entry_t* debug_long_exception(debug_info_t * id, int level, + unsigned long data); + +Parameter: id: handle for debug log + level: debug level + data: integer value for debug entry + +Return Value: Address of written debug entry + +Description: writes debug entry to active debug area (if level <= actual + debug level) and switches to next debug area + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_text_exception (debug_info_t * id, int level, + const char* data); + +Parameter: id: handle for debug log + level: debug level + data: string for debug entry + +Return Value: Address of written debug entry + +Description: writes debug entry in ascii format to active debug area + (if level <= actual debug level) and switches to next debug + area + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_sprintf_exception (debug_info_t * id, int level, + char* string,...); + +Parameter: id: handle for debug log + level: debug level + string: format string for debug entry + ...: varargs used as in sprintf() + +Return Value: Address of written debug entry + +Description: writes debug entry with format string and varargs (longs) to + active debug area (if level $<=$ actual debug level) and + 
switches to next debug area. + floats and long long datatypes cannot be used as varargs. + +--------------------------------------------------------------------------- + +:: + + int debug_register_view (debug_info_t * id, struct debug_view *view); + +Parameter: id: handle for debug log + view: pointer to debug view struct + +Return Value: 0 : ok + < 0: Error + +Description: registers new debug view and creates debugfs dir entry + +--------------------------------------------------------------------------- + +:: + + int debug_unregister_view (debug_info_t * id, struct debug_view *view); + +Parameter: id: handle for debug log + view: pointer to debug view struct + +Return Value: 0 : ok + < 0: Error + +Description: unregisters debug view and removes debugfs dir entry + + + +Predefined views: +----------------- + +extern struct debug_view debug_hex_ascii_view; + +extern struct debug_view debug_raw_view; + +extern struct debug_view debug_sprintf_view; + +Examples +-------- + +:: + + /* + * hex_ascii- + raw-view Example + */ + + #include + #include + + static debug_info_t* debug_info; + + static int init(void) + { + /* register 4 debug areas with one page each and 4 byte data field */ + + debug_info = debug_register ("test", 1, 4, 4 ); + debug_register_view(debug_info,&debug_hex_ascii_view); + debug_register_view(debug_info,&debug_raw_view); + + debug_text_event(debug_info, 4 , "one "); + debug_int_exception(debug_info, 4, 4711); + debug_event(debug_info, 3, &debug_info, 4); + + return 0; + } + + static void cleanup(void) + { + debug_unregister (debug_info); + } + + module_init(init); + module_exit(cleanup); + +--------------------------------------------------------------------------- + +:: + + /* + * sprintf-view Example + */ + + #include + #include + + static debug_info_t* debug_info; + + static int init(void) + { + /* register 4 debug areas with one page each and data field for */ + /* format string pointer + 2 varargs (= 3 * sizeof(long)) */ + + debug_info = 
debug_register ("test", 1, 4, sizeof(long) * 3); + debug_register_view(debug_info,&debug_sprintf_view); + + debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__); + debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info); + + return 0; + } + + static void cleanup(void) + { + debug_unregister (debug_info); + } + + module_init(init); + module_exit(cleanup); + +Debugfs Interface +----------------- +Views to the debug logs can be investigated through reading the corresponding +debugfs-files: + +Example:: + + > ls /sys/kernel/debug/s390dbf/dasd + flush hex_ascii level pages raw + > cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s + 00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | .... + 00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE + 00 00974733272:682213 2 - 02 0006adf6 07 ea 4a 90 | .... + 00 00974733272:682281 1 * 02 0006ab08 41 4c 4c 43 | EXCP + 01 00974733272:682284 2 - 02 0006ab16 45 43 4b 44 | ECKD + 01 00974733272:682287 2 - 02 0006ab28 00 00 00 04 | .... + 01 00974733272:682289 2 - 02 0006ab3e 00 00 00 20 | ... + 01 00974733272:682297 2 - 02 0006ad7e 07 ea 4a 90 | .... + 01 00974733272:684384 2 - 00 0006ade6 46 52 45 45 | FREE + 01 00974733272:684388 2 - 00 0006adf6 07 ea 4a 90 | .... + +See section about predefined views for explanation of the above output! + +Changing the debug level +------------------------ + +Example:: + + + > cat /sys/kernel/debug/s390dbf/dasd/level + 3 + > echo "5" > /sys/kernel/debug/s390dbf/dasd/level + > cat /sys/kernel/debug/s390dbf/dasd/level + 5 + +Flushing debug areas +-------------------- +Debug areas can be flushed with piping the number of the desired +area (0...n) to the debugfs file "flush". When using "-" all debug areas +are flushed. + +Examples: + +1. Flush debug area 0:: + + > echo "0" > /sys/kernel/debug/s390dbf/dasd/flush + +2. 
Flush all debug areas:: + + > echo "-" > /sys/kernel/debug/s390dbf/dasd/flush + +Changing the size of debug areas +------------------------------------ +It is possible to change the size of debug areas through piping +the number of pages to the debugfs file "pages". The resize request will +also flush the debug areas. + +Example: + +Define 4 pages for the debug areas of debug feature "dasd":: + + > echo "4" > /sys/kernel/debug/s390dbf/dasd/pages + +Stopping the debug feature +-------------------------- +Example: + +1. Check if stopping is allowed:: + + > cat /proc/sys/s390dbf/debug_stoppable + +2. Stop debug feature:: + + > echo 0 > /proc/sys/s390dbf/debug_active + +lcrash Interface +---------------- +It is planned that the dump analysis tool lcrash gets an additional command +'s390dbf' to display all the debug logs. With this tool it will be possible +to investigate the debug logs on a live system and with a memory dump after +a system crash. + +Investigating raw memory +------------------------ +One last possibility to investigate the debug logs at a live +system and after a system crash is to look at the raw memory +under VM or at the Service Element. +It is possible to find the anchor of the debug-logs through +the 'debug_area_first' symbol in the System map. Then one has +to follow the correct pointers of the data-structures defined +in debug.h and find the debug-areas in memory. +Normally modules which use the debug feature will also have +a global variable with the pointer to the debug-logs. Following +this pointer it will also be possible to find the debug logs in +memory. + +For this method it is recommended to use '16 * x + 4' byte (x = 0..n) +for the length of the data field in debug_register() in +order to see the debug entries well formatted. + + +Predefined Views +---------------- + +There are three predefined views: hex_ascii, raw and sprintf. +The hex_ascii view shows the data field in hex and ascii representation +(e.g. '45 43 4b 44 | ECKD'). 
+The raw view returns a bytestream as the debug areas are stored in memory. + +The sprintf view formats the debug entries in the same way as the sprintf +function would do. The sprintf event/exception functions write to the +debug entry a pointer to the format string (size = sizeof(long)) +and for each vararg a long value. So e.g. for a debug entry with a format +string plus two varargs one would need to allocate a (3 * sizeof(long)) +byte data area in the debug_register() function. + +IMPORTANT: + Using "%s" in sprintf event functions is dangerous. You can only + use "%s" in the sprintf event functions, if the memory for the passed string + is available as long as the debug feature exists. The reason behind this is + that due to performance considerations only a pointer to the string is stored + in the debug feature. If you log a string that is freed afterwards, you will + get an OOPS when inspecting the debug feature, because then the debug feature + will access the already freed memory. + +NOTE: + If using the sprintf view do NOT use other event/exception functions + than the sprintf-event and -exception functions. + +The format of the hex_ascii and sprintf view is as follows: + +- Number of area +- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated + Universal Time (UTC), January 1, 1970) +- level of debug entry +- Exception flag (* = Exception) +- Cpu-Number of calling task +- Return Address to caller +- data field + +The format of the raw view is: + +- Header as described in debug.h +- datafield + +A typical line of the hex_ascii view will look like the following (first line +is only for explanation and will not be displayed when 'cating' the view): + +area time level exception cpu caller data (hex + ascii) +-------------------------------------------------------------------------- +00 00964419409:440690 1 - 00 88023fe + + +Defining views +-------------- + +Views are specified with the 'debug_view' structure. 
There are defined +callback functions which are used for reading and writing the debugfs files:: + + struct debug_view { + char name[DEBUG_MAX_PROCF_LEN]; + debug_prolog_proc_t* prolog_proc; + debug_header_proc_t* header_proc; + debug_format_proc_t* format_proc; + debug_input_proc_t* input_proc; + void* private_data; + }; + +where:: + + typedef int (debug_header_proc_t) (debug_info_t* id, + struct debug_view* view, + int area, + debug_entry_t* entry, + char* out_buf); + + typedef int (debug_format_proc_t) (debug_info_t* id, + struct debug_view* view, char* out_buf, + const char* in_buf); + typedef int (debug_prolog_proc_t) (debug_info_t* id, + struct debug_view* view, + char* out_buf); + typedef int (debug_input_proc_t) (debug_info_t* id, + struct debug_view* view, + struct file* file, const char* user_buf, + size_t in_buf_size, loff_t* offset); + + +The "private_data" member can be used as pointer to view specific data. +It is not used by the debug feature itself. + +The output when reading a debugfs file is structured like this:: + + "prolog_proc output" + + "header_proc output 1" "format_proc output 1" + "header_proc output 2" "format_proc output 2" + "header_proc output 3" "format_proc output 3" + ... + +When a view is read from the debugfs, the Debug Feature calls the +'prolog_proc' once for writing the prolog. +Then 'header_proc' and 'format_proc' are called for each +existing debug entry. + +The input_proc can be used to implement functionality when it is written to +the view (e.g. like with 'echo "0" > /sys/kernel/debug/s390dbf/dasd/level). + +For header_proc there can be used the default function +debug_dflt_header_fn() which is defined in debug.h. +and which produces the same header output as the predefined views. +E.g:: + + 00 00964419409:440761 2 - 00 88023ec + +In order to see how to use the callback functions check the implementation +of the default views! 
+ +Example:: + + #include + + #define UNKNOWNSTR "data: %08x" + + const char* messages[] = + {"This error...........\n", + "That error...........\n", + "Problem..............\n", + "Something went wrong.\n", + "Everything ok........\n", + NULL + }; + + static int debug_test_format_fn( + debug_info_t * id, struct debug_view *view, + char *out_buf, const char *in_buf + ) + { + int i, rc = 0; + + if(id->buf_size >= 4) { + int msg_nr = *((int*)in_buf); + if(msg_nr < sizeof(messages)/sizeof(char*) - 1) + rc += sprintf(out_buf, "%s", messages[msg_nr]); + else + rc += sprintf(out_buf, UNKNOWNSTR, msg_nr); + } + out: + return rc; + } + + struct debug_view debug_test_view = { + "myview", /* name of view */ + NULL, /* no prolog */ + &debug_dflt_header_fn, /* default header for each entry */ + &debug_test_format_fn, /* our own format function */ + NULL, /* no input function */ + NULL /* no private data */ + }; + +test: +===== + +:: + + debug_info_t *debug_info; + ... + debug_info = debug_register ("test", 0, 4, 4 ); + debug_register_view(debug_info, &debug_test_view); + for(i = 0; i < 10; i ++) debug_int_event(debug_info, 1, i); + + > cat /sys/kernel/debug/s390dbf/test/myview + 00 00964419734:611402 1 - 00 88042ca This error........... + 00 00964419734:611405 1 - 00 88042ca That error........... + 00 00964419734:611408 1 - 00 88042ca Problem.............. + 00 00964419734:611411 1 - 00 88042ca Something went wrong. + 00 00964419734:611414 1 - 00 88042ca Everything ok........ 
+ 00 00964419734:611417 1 - 00 88042ca data: 00000005 + 00 00964419734:611419 1 - 00 88042ca data: 00000006 + 00 00964419734:611422 1 - 00 88042ca data: 00000007 + 00 00964419734:611425 1 - 00 88042ca data: 00000008 + 00 00964419734:611428 1 - 00 88042ca data: 00000009 diff --git a/Documentation/s390/s390dbf.txt b/Documentation/s390/s390dbf.txt deleted file mode 100644 index 61329fd62e89..000000000000 --- a/Documentation/s390/s390dbf.txt +++ /dev/null @@ -1,667 +0,0 @@ -S390 Debug Feature -================== - -files: arch/s390/kernel/debug.c - arch/s390/include/asm/debug.h - -Description: ------------- -The goal of this feature is to provide a kernel debug logging API -where log records can be stored efficiently in memory, where each component -(e.g. device drivers) can have one separate debug log. -One purpose of this is to inspect the debug logs after a production system crash -in order to analyze the reason for the crash. -If the system still runs but only a subcomponent which uses dbf fails, -it is possible to look at the debug logs on a live system via the Linux -debugfs filesystem. -The debug feature may also very useful for kernel and driver development. - -Design: -------- -Kernel components (e.g. device drivers) can register themselves at the debug -feature with the function call debug_register(). This function initializes a -debug log for the caller. For each debug log exists a number of debug areas -where exactly one is active at one time. Each debug area consists of contiguous -pages in memory. In the debug areas there are stored debug entries (log records) -which are written by event- and exception-calls. - -An event-call writes the specified debug entry to the active debug -area and updates the log pointer for the active area. If the end -of the active debug area is reached, a wrap around is done (ring buffer) -and the next debug entry will be written at the beginning of the active -debug area. 
- -An exception-call writes the specified debug entry to the log and -switches to the next debug area. This is done in order to be sure -that the records which describe the origin of the exception are not -overwritten when a wrap around for the current area occurs. - -The debug areas themselves are also ordered in form of a ring buffer. -When an exception is thrown in the last debug area, the following debug -entries are then written again in the very first area. - -There are three versions for the event- and exception-calls: One for -logging raw data, one for text and one for numbers. - -Each debug entry contains the following data: - -- Timestamp -- Cpu-Number of calling task -- Level of debug entry (0...6) -- Return Address to caller -- Flag, if entry is an exception or not - -The debug logs can be inspected in a live system through entries in -the debugfs-filesystem. Under the toplevel directory "s390dbf" there is -a directory for each registered component, which is named like the -corresponding component. The debugfs normally should be mounted to -/sys/kernel/debug therefore the debug feature can be accessed under -/sys/kernel/debug/s390dbf. - -The content of the directories are files which represent different views -to the debug log. Each component can decide which views should be -used through registering them with the function debug_register_view(). -Predefined views for hex/ascii, sprintf and raw binary data are provided. -It is also possible to define other views. The content of -a view can be inspected simply by reading the corresponding debugfs file. - -All debug logs have an actual debug level (range from 0 to 6). -The default level is 3. Event and Exception functions have a 'level' -parameter. Only debug entries with a level that is lower or equal -than the actual level are written to the log. This means, when -writing events, high priority log entries should have a low level -value whereas low priority entries should have a high one. 
-The actual debug level can be changed with the help of the debugfs-filesystem -through writing a number string "x" to the 'level' debugfs file which is -provided for every debug log. Debugging can be switched off completely -by using "-" on the 'level' debugfs file. - -Example: - -> echo "-" > /sys/kernel/debug/s390dbf/dasd/level - -It is also possible to deactivate the debug feature globally for every -debug log. You can change the behavior using 2 sysctl parameters in -/proc/sys/s390dbf: -There are currently 2 possible triggers, which stop the debug feature -globally. The first possibility is to use the "debug_active" sysctl. If -set to 1 the debug feature is running. If "debug_active" is set to 0 the -debug feature is turned off. -The second trigger which stops the debug feature is a kernel oops. -That prevents the debug feature from overwriting debug information that -happened before the oops. After an oops you can reactivate the debug feature -by piping 1 to /proc/sys/s390dbf/debug_active. Nevertheless, its not -suggested to use an oopsed kernel in a production environment. -If you want to disallow the deactivation of the debug feature, you can use -the "debug_stoppable" sysctl. If you set "debug_stoppable" to 0 the debug -feature cannot be stopped. If the debug feature is already stopped, it -will stay deactivated. - -Kernel Interfaces: ------------------- - ----------------------------------------------------------------------------- -debug_info_t *debug_register(char *name, int pages, int nr_areas, - int buf_size); - -Parameter: name: Name of debug log (e.g. 
used for debugfs entry) - pages: number of pages, which will be allocated per area - nr_areas: number of debug areas - buf_size: size of data area in each debug entry - -Return Value: Handle for generated debug area - NULL if register failed - -Description: Allocates memory for a debug log - Must not be called within an interrupt handler - ----------------------------------------------------------------------------- -debug_info_t *debug_register_mode(char *name, int pages, int nr_areas, - int buf_size, mode_t mode, uid_t uid, - gid_t gid); - -Parameter: name: Name of debug log (e.g. used for debugfs entry) - pages: Number of pages, which will be allocated per area - nr_areas: Number of debug areas - buf_size: Size of data area in each debug entry - mode: File mode for debugfs files. E.g. S_IRWXUGO - uid: User ID for debugfs files. Currently only 0 is - supported. - gid: Group ID for debugfs files. Currently only 0 is - supported. - -Return Value: Handle for generated debug area - NULL if register failed - -Description: Allocates memory for a debug log - Must not be called within an interrupt handler - ---------------------------------------------------------------------------- -void debug_unregister (debug_info_t * id); - -Parameter: id: handle for debug log - -Return Value: none - -Description: frees memory for a debug log and removes all registered debug - views. - Must not be called within an interrupt handler - ---------------------------------------------------------------------------- -void debug_set_level (debug_info_t * id, int new_level); - -Parameter: id: handle for debug log - new_level: new debug level - -Return Value: none - -Description: Sets new actual debug level if new_level is valid. 
- ---------------------------------------------------------------------------- -bool debug_level_enabled (debug_info_t * id, int level); - -Parameter: id: handle for debug log - level: debug level - -Return Value: True if level is less or equal to the current debug level. - -Description: Returns true if debug events for the specified level would be - logged. Otherwise returns false. ---------------------------------------------------------------------------- -void debug_stop_all(void); - -Parameter: none - -Return Value: none - -Description: stops the debug feature if stopping is allowed. Currently - used in case of a kernel oops. - ---------------------------------------------------------------------------- -debug_entry_t* debug_event (debug_info_t* id, int level, void* data, - int length); - -Parameter: id: handle for debug log - level: debug level - data: pointer to data for debug entry - length: length of data in bytes - -Return Value: Address of written debug entry - -Description: writes debug entry to active debug area (if level <= actual - debug level) - ---------------------------------------------------------------------------- -debug_entry_t* debug_int_event (debug_info_t * id, int level, - unsigned int data); -debug_entry_t* debug_long_event(debug_info_t * id, int level, - unsigned long data); - -Parameter: id: handle for debug log - level: debug level - data: integer value for debug entry - -Return Value: Address of written debug entry - -Description: writes debug entry to active debug area (if level <= actual - debug level) - ---------------------------------------------------------------------------- -debug_entry_t* debug_text_event (debug_info_t * id, int level, - const char* data); - -Parameter: id: handle for debug log - level: debug level - data: string for debug entry - -Return Value: Address of written debug entry - -Description: writes debug entry in ascii format to active debug area - (if level <= actual debug level) - 
---------------------------------------------------------------------------- -debug_entry_t* debug_sprintf_event (debug_info_t * id, int level, - char* string,...); - -Parameter: id: handle for debug log - level: debug level - string: format string for debug entry - ...: varargs used as in sprintf() - -Return Value: Address of written debug entry - -Description: writes debug entry with format string and varargs (longs) to - active debug area (if level $<=$ actual debug level). - floats and long long datatypes cannot be used as varargs. - ---------------------------------------------------------------------------- - -debug_entry_t* debug_exception (debug_info_t* id, int level, void* data, - int length); - -Parameter: id: handle for debug log - level: debug level - data: pointer to data for debug entry - length: length of data in bytes - -Return Value: Address of written debug entry - -Description: writes debug entry to active debug area (if level <= actual - debug level) and switches to next debug area - ---------------------------------------------------------------------------- -debug_entry_t* debug_int_exception (debug_info_t * id, int level, - unsigned int data); -debug_entry_t* debug_long_exception(debug_info_t * id, int level, - unsigned long data); - -Parameter: id: handle for debug log - level: debug level - data: integer value for debug entry - -Return Value: Address of written debug entry - -Description: writes debug entry to active debug area (if level <= actual - debug level) and switches to next debug area - ---------------------------------------------------------------------------- -debug_entry_t* debug_text_exception (debug_info_t * id, int level, - const char* data); - -Parameter: id: handle for debug log - level: debug level - data: string for debug entry - -Return Value: Address of written debug entry - -Description: writes debug entry in ascii format to active debug area - (if level <= actual debug level) and switches to next debug - area - 
---------------------------------------------------------------------------- -debug_entry_t* debug_sprintf_exception (debug_info_t * id, int level, - char* string,...); - -Parameter: id: handle for debug log - level: debug level - string: format string for debug entry - ...: varargs used as in sprintf() - -Return Value: Address of written debug entry - -Description: writes debug entry with format string and varargs (longs) to - active debug area (if level $<=$ actual debug level) and - switches to next debug area. - floats and long long datatypes cannot be used as varargs. - ---------------------------------------------------------------------------- - -int debug_register_view (debug_info_t * id, struct debug_view *view); - -Parameter: id: handle for debug log - view: pointer to debug view struct - -Return Value: 0 : ok - < 0: Error - -Description: registers new debug view and creates debugfs dir entry - ---------------------------------------------------------------------------- -int debug_unregister_view (debug_info_t * id, struct debug_view *view); - -Parameter: id: handle for debug log - view: pointer to debug view struct - -Return Value: 0 : ok - < 0: Error - -Description: unregisters debug view and removes debugfs dir entry - - - -Predefined views: ------------------ - -extern struct debug_view debug_hex_ascii_view; -extern struct debug_view debug_raw_view; -extern struct debug_view debug_sprintf_view; - -Examples --------- - -/* - * hex_ascii- + raw-view Example - */ - -#include -#include - -static debug_info_t* debug_info; - -static int init(void) -{ - /* register 4 debug areas with one page each and 4 byte data field */ - - debug_info = debug_register ("test", 1, 4, 4 ); - debug_register_view(debug_info,&debug_hex_ascii_view); - debug_register_view(debug_info,&debug_raw_view); - - debug_text_event(debug_info, 4 , "one "); - debug_int_exception(debug_info, 4, 4711); - debug_event(debug_info, 3, &debug_info, 4); - - return 0; -} - -static void cleanup(void) 
-{ - debug_unregister (debug_info); -} - -module_init(init); -module_exit(cleanup); - ---------------------------------------------------------------------------- - -/* - * sprintf-view Example - */ - -#include -#include - -static debug_info_t* debug_info; - -static int init(void) -{ - /* register 4 debug areas with one page each and data field for */ - /* format string pointer + 2 varargs (= 3 * sizeof(long)) */ - - debug_info = debug_register ("test", 1, 4, sizeof(long) * 3); - debug_register_view(debug_info,&debug_sprintf_view); - - debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__); - debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info); - - return 0; -} - -static void cleanup(void) -{ - debug_unregister (debug_info); -} - -module_init(init); -module_exit(cleanup); - - - -Debugfs Interface ----------------- -Views to the debug logs can be investigated through reading the corresponding -debugfs-files: - -Example: - -> ls /sys/kernel/debug/s390dbf/dasd -flush hex_ascii level pages raw -> cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s -00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | .... -00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE -00 00974733272:682213 2 - 02 0006adf6 07 ea 4a 90 | .... -00 00974733272:682281 1 * 02 0006ab08 41 4c 4c 43 | EXCP -01 00974733272:682284 2 - 02 0006ab16 45 43 4b 44 | ECKD -01 00974733272:682287 2 - 02 0006ab28 00 00 00 04 | .... -01 00974733272:682289 2 - 02 0006ab3e 00 00 00 20 | ... -01 00974733272:682297 2 - 02 0006ad7e 07 ea 4a 90 | .... -01 00974733272:684384 2 - 00 0006ade6 46 52 45 45 | FREE -01 00974733272:684388 2 - 00 0006adf6 07 ea 4a 90 | .... - -See section about predefined views for explanation of the above output! 
- -Changing the debug level ------------------------- - -Example: - - -> cat /sys/kernel/debug/s390dbf/dasd/level -3 -> echo "5" > /sys/kernel/debug/s390dbf/dasd/level -> cat /sys/kernel/debug/s390dbf/dasd/level -5 - -Flushing debug areas --------------------- -Debug areas can be flushed with piping the number of the desired -area (0...n) to the debugfs file "flush". When using "-" all debug areas -are flushed. - -Examples: - -1. Flush debug area 0: -> echo "0" > /sys/kernel/debug/s390dbf/dasd/flush - -2. Flush all debug areas: -> echo "-" > /sys/kernel/debug/s390dbf/dasd/flush - -Changing the size of debug areas ------------------------------------- -It is possible the change the size of debug areas through piping -the number of pages to the debugfs file "pages". The resize request will -also flush the debug areas. - -Example: - -Define 4 pages for the debug areas of debug feature "dasd": -> echo "4" > /sys/kernel/debug/s390dbf/dasd/pages - -Stooping the debug feature --------------------------- -Example: - -1. Check if stopping is allowed -> cat /proc/sys/s390dbf/debug_stoppable -2. Stop debug feature -> echo 0 > /proc/sys/s390dbf/debug_active - -lcrash Interface ----------------- -It is planned that the dump analysis tool lcrash gets an additional command -'s390dbf' to display all the debug logs. With this tool it will be possible -to investigate the debug logs on a live system and with a memory dump after -a system crash. - -Investigating raw memory ------------------------- -One last possibility to investigate the debug logs at a live -system and after a system crash is to look at the raw memory -under VM or at the Service Element. -It is possible to find the anker of the debug-logs through -the 'debug_area_first' symbol in the System map. Then one has -to follow the correct pointers of the data-structures defined -in debug.h and find the debug-areas in memory. 
-Normally modules which use the debug feature will also have -a global variable with the pointer to the debug-logs. Following -this pointer it will also be possible to find the debug logs in -memory. - -For this method it is recommended to use '16 * x + 4' byte (x = 0..n) -for the length of the data field in debug_register() in -order to see the debug entries well formatted. - - -Predefined Views ----------------- - -There are three predefined views: hex_ascii, raw and sprintf. -The hex_ascii view shows the data field in hex and ascii representation -(e.g. '45 43 4b 44 | ECKD'). -The raw view returns a bytestream as the debug areas are stored in memory. - -The sprintf view formats the debug entries in the same way as the sprintf -function would do. The sprintf event/exception functions write to the -debug entry a pointer to the format string (size = sizeof(long)) -and for each vararg a long value. So e.g. for a debug entry with a format -string plus two varargs one would need to allocate a (3 * sizeof(long)) -byte data area in the debug_register() function. - -IMPORTANT: Using "%s" in sprintf event functions is dangerous. You can only -use "%s" in the sprintf event functions, if the memory for the passed string is -available as long as the debug feature exists. The reason behind this is that -due to performance considerations only a pointer to the string is stored in -the debug feature. If you log a string that is freed afterwards, you will get -an OOPS when inspecting the debug feature, because then the debug feature will -access the already freed memory. - -NOTE: If using the sprintf view do NOT use other event/exception functions -than the sprintf-event and -exception functions. 
- -The format of the hex_ascii and sprintf view is as follows: -- Number of area -- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated - Universal Time (UTC), January 1, 1970) -- level of debug entry -- Exception flag (* = Exception) -- Cpu-Number of calling task -- Return Address to caller -- data field - -The format of the raw view is: -- Header as described in debug.h -- datafield - -A typical line of the hex_ascii view will look like the following (first line -is only for explanation and will not be displayed when 'cating' the view): - -area time level exception cpu caller data (hex + ascii) --------------------------------------------------------------------------- -00 00964419409:440690 1 - 00 88023fe - - -Defining views --------------- - -Views are specified with the 'debug_view' structure. There are defined -callback functions which are used for reading and writing the debugfs files: - -struct debug_view { - char name[DEBUG_MAX_PROCF_LEN]; - debug_prolog_proc_t* prolog_proc; - debug_header_proc_t* header_proc; - debug_format_proc_t* format_proc; - debug_input_proc_t* input_proc; - void* private_data; -}; - -where - -typedef int (debug_header_proc_t) (debug_info_t* id, - struct debug_view* view, - int area, - debug_entry_t* entry, - char* out_buf); - -typedef int (debug_format_proc_t) (debug_info_t* id, - struct debug_view* view, char* out_buf, - const char* in_buf); -typedef int (debug_prolog_proc_t) (debug_info_t* id, - struct debug_view* view, - char* out_buf); -typedef int (debug_input_proc_t) (debug_info_t* id, - struct debug_view* view, - struct file* file, const char* user_buf, - size_t in_buf_size, loff_t* offset); - - -The "private_data" member can be used as pointer to view specific data. -It is not used by the debug feature itself. 
- -The output when reading a debugfs file is structured like this: - -"prolog_proc output" - -"header_proc output 1" "format_proc output 1" -"header_proc output 2" "format_proc output 2" -"header_proc output 3" "format_proc output 3" -... - -When a view is read from the debugfs, the Debug Feature calls the -'prolog_proc' once for writing the prolog. -Then 'header_proc' and 'format_proc' are called for each -existing debug entry. - -The input_proc can be used to implement functionality when it is written to -the view (e.g. like with 'echo "0" > /sys/kernel/debug/s390dbf/dasd/level). - -For header_proc there can be used the default function -debug_dflt_header_fn() which is defined in debug.h. -and which produces the same header output as the predefined views. -E.g: -00 00964419409:440761 2 - 00 88023ec - -In order to see how to use the callback functions check the implementation -of the default views! - -Example - -#include - -#define UNKNOWNSTR "data: %08x" - -const char* messages[] = -{"This error...........\n", - "That error...........\n", - "Problem..............\n", - "Something went wrong.\n", - "Everything ok........\n", - NULL -}; - -static int debug_test_format_fn( - debug_info_t * id, struct debug_view *view, - char *out_buf, const char *in_buf -) -{ - int i, rc = 0; - - if(id->buf_size >= 4) { - int msg_nr = *((int*)in_buf); - if(msg_nr < sizeof(messages)/sizeof(char*) - 1) - rc += sprintf(out_buf, "%s", messages[msg_nr]); - else - rc += sprintf(out_buf, UNKNOWNSTR, msg_nr); - } - out: - return rc; -} - -struct debug_view debug_test_view = { - "myview", /* name of view */ - NULL, /* no prolog */ - &debug_dflt_header_fn, /* default header for each entry */ - &debug_test_format_fn, /* our own format function */ - NULL, /* no input function */ - NULL /* no private data */ -}; - -===== -test: -===== -debug_info_t *debug_info; -... 
-debug_info = debug_register ("test", 0, 4, 4 )); -debug_register_view(debug_info, &debug_test_view); -for(i = 0; i < 10; i ++) debug_int_event(debug_info, 1, i); - -> cat /sys/kernel/debug/s390dbf/test/myview -00 00964419734:611402 1 - 00 88042ca This error........... -00 00964419734:611405 1 - 00 88042ca That error........... -00 00964419734:611408 1 - 00 88042ca Problem.............. -00 00964419734:611411 1 - 00 88042ca Something went wrong. -00 00964419734:611414 1 - 00 88042ca Everything ok........ -00 00964419734:611417 1 - 00 88042ca data: 00000005 -00 00964419734:611419 1 - 00 88042ca data: 00000006 -00 00964419734:611422 1 - 00 88042ca data: 00000007 -00 00964419734:611425 1 - 00 88042ca data: 00000008 -00 00964419734:611428 1 - 00 88042ca data: 00000009 diff --git a/Documentation/s390/text_files.rst b/Documentation/s390/text_files.rst new file mode 100644 index 000000000000..c94d05d4fa17 --- /dev/null +++ b/Documentation/s390/text_files.rst @@ -0,0 +1,11 @@ +ibm 3270 changelog +------------------ + +.. include:: 3270.ChangeLog + :literal: + +ibm 3270 config3270.sh +---------------------- + +.. literalinclude:: config3270.sh + :language: shell diff --git a/Documentation/s390/vfio-ap.txt b/Documentation/s390/vfio-ap.rst similarity index 72% rename from Documentation/s390/vfio-ap.txt rename to Documentation/s390/vfio-ap.rst index 65167cfe4485..b5c51f7c748d 100644 --- a/Documentation/s390/vfio-ap.txt +++ b/Documentation/s390/vfio-ap.rst @@ -1,4 +1,9 @@ -Introduction: +=============================== +Adjunct Processor (AP) facility +=============================== + + +Introduction ============ The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards. @@ -11,7 +16,7 @@ framework. This implementation relies considerably on the s390 virtualization facilities which do most of the hard work of providing direct access to AP devices. 
-AP Architectural Overview: +AP Architectural Overview ========================= To facilitate the comprehension of the design, let's start with some definitions: @@ -31,13 +36,13 @@ definitions: in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and creates a sysfs device for each assigned adapter. For example, if AP adapters 4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following - sysfs device entries: + sysfs device entries:: /sys/devices/ap/card04 /sys/devices/ap/card0a Symbolic links to these devices will also be created in the AP bus devices - sub-directory: + sub-directory:: /sys/bus/ap/devices/[card04] /sys/bus/ap/devices/[card04] @@ -84,7 +89,7 @@ definitions: the cross product of the AP adapter and usage domain numbers detected when the AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the - following sysfs entries: + following sysfs entries:: /sys/devices/ap/card04/04.0006 /sys/devices/ap/card04/04.0047 @@ -92,7 +97,7 @@ definitions: /sys/devices/ap/card0a/0a.0047 The following symbolic links to these devices will be created in the AP bus - devices subdirectory: + devices subdirectory:: /sys/bus/ap/devices/[04.0006] /sys/bus/ap/devices/[04.0047] @@ -112,7 +117,7 @@ definitions: domain that is not one of the usage domains, but the modified domain must be one of the control domains. -AP and SIE: +AP and SIE ========== Let's now take a look at how AP instructions executed on a guest are interpreted by the hardware. @@ -153,7 +158,7 @@ and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs (1,5), (1,6), The APQNs can provide secure key functionality - i.e., a private key is stored on the adapter card for each of its domains - so each APQN must be assigned to -at most one guest or to the linux host. 
+at most one guest or to the linux host:: Example 1: Valid configuration: ------------------------------ @@ -181,8 +186,8 @@ at most one guest or to the linux host. This is an invalid configuration because both guests have access to APQN (1,6). -The Design: -=========== +The Design +========== The design introduces three new objects: 1. AP matrix device @@ -205,43 +210,43 @@ The VFIO AP (vfio_ap) device driver serves the following purposes: Reserve APQNs for exclusive use of KVM guests --------------------------------------------- The following block diagram illustrates the mechanism by which APQNs are -reserved: +reserved:: - +------------------+ - 7 remove | | - +--------------------> cex4queue driver | - | | | - | +------------------+ - | - | - | +------------------+ +-----------------+ - | 5 register driver | | 3 create | | - | +----------------> Device core +----------> matrix device | - | | | | | | - | | +--------^---------+ +-----------------+ - | | | - | | +-------------------+ - | | +-----------------------------------+ | - | | | 4 register AP driver | | 2 register device - | | | | | -+--------+---+-v---+ +--------+-------+-+ -| | | | -| ap_bus +--------------------- > vfio_ap driver | -| | 8 probe | | -+--------^---------+ +--^--^------------+ -6 edit | | | - apmask | +-----------------------------+ | 9 mdev create - aqmask | | 1 modprobe | -+--------+-----+---+ +----------------+-+ +------------------+ -| | | |8 create | mediated | -| admin | | VFIO device core |---------> matrix | -| + | | | device | -+------+-+---------+ +--------^---------+ +--------^---------+ - | | | | - | | 9 create vfio_ap-passthrough | | - | +------------------------------+ | - +-------------------------------------------------------------+ - 10 assign adapter/domain/control domain + +------------------+ + 7 remove | | + +--------------------> cex4queue driver | + | | | + | +------------------+ + | + | + | +------------------+ +----------------+ + | 5 register driver | | 3 create 
| | + | +----------------> Device core +----------> matrix device | + | | | | | | + | | +--------^---------+ +----------------+ + | | | + | | +-------------------+ + | | +-----------------------------------+ | + | | | 4 register AP driver | | 2 register device + | | | | | + +--------+---+-v---+ +--------+-------+-+ + | | | | + | ap_bus +--------------------- > vfio_ap driver | + | | 8 probe | | + +--------^---------+ +--^--^------------+ + 6 edit | | | + apmask | +-----------------------------+ | 9 mdev create + aqmask | | 1 modprobe | + +--------+-----+---+ +----------------+-+ +----------------+ + | | | |8 create | mediated | + | admin | | VFIO device core |---------> matrix | + | + | | | device | + +------+-+---------+ +--------^---------+ +--------^-------+ + | | | | + | | 9 create vfio_ap-passthrough | | + | +------------------------------+ | + +-------------------------------------------------------------+ + 10 assign adapter/domain/control domain The process for reserving an AP queue for use by a KVM guest is: @@ -250,7 +255,7 @@ The process for reserving an AP queue for use by a KVM guest is: device with the device core. This will serve as the parent device for all mediated matrix devices used to configure an AP matrix for a guest. 3. The /sys/devices/vfio_ap/matrix device is created by the device core -4 The vfio_ap device driver will register with the AP bus for AP queue devices +4. The vfio_ap device driver will register with the AP bus for AP queue devices of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap driver's probe and remove callback interfaces. Devices older than CEX4 queues are not supported to simplify the implementation by not needlessly @@ -266,13 +271,14 @@ The process for reserving an AP queue for use by a KVM guest is: it. 9. 
The administrator creates a passthrough type mediated matrix device to be used by a guest -10 The administrator assigns the adapters, usage domains and control domains - to be exclusively used by a guest. +10. The administrator assigns the adapters, usage domains and control domains + to be exclusively used by a guest. Set up the VFIO mediated device interfaces ------------------------------------------ The VFIO AP device driver utilizes the common interface of the VFIO mediated device core driver to: + * Register an AP mediated bus driver to add a mediated matrix device to and remove it from a VFIO group. * Create and destroy a mediated matrix device @@ -280,25 +286,25 @@ device core driver to: * Add a mediated matrix device to and remove it from an IOMMU group The following high-level block diagram shows the main components and interfaces -of the VFIO AP mediated matrix device driver: +of the VFIO AP mediated matrix device driver:: - +-------------+ - | | - | +---------+ | mdev_register_driver() +--------------+ - | | Mdev | +<-----------------------+ | - | | bus | | | vfio_mdev.ko | - | | driver | +----------------------->+ |<-> VFIO user - | +---------+ | probe()/remove() +--------------+ APIs - | | - | MDEV CORE | - | MODULE | - | mdev.ko | - | +---------+ | mdev_register_device() +--------------+ - | |Physical | +<-----------------------+ | - | | device | | | vfio_ap.ko |<-> matrix - | |interface| +----------------------->+ | device - | +---------+ | callback +--------------+ - +-------------+ + +-------------+ + | | + | +---------+ | mdev_register_driver() +--------------+ + | | Mdev | +<-----------------------+ | + | | bus | | | vfio_mdev.ko | + | | driver | +----------------------->+ |<-> VFIO user + | +---------+ | probe()/remove() +--------------+ APIs + | | + | MDEV CORE | + | MODULE | + | mdev.ko | + | +---------+ | mdev_register_device() +--------------+ + | |Physical | +<-----------------------+ | + | | device | | | vfio_ap.ko |<-> matrix + | 
|interface| +----------------------->+ | device + | +---------+ | callback +--------------+ + +-------------+ During initialization of the vfio_ap module, the matrix device is registered with an 'mdev_parent_ops' structure that provides the sysfs attribute @@ -306,7 +312,8 @@ structures, mdev functions and callback interfaces for managing the mediated matrix device. * sysfs attribute structures: - * supported_type_groups + + supported_type_groups The VFIO mediated device framework supports creation of user-defined mediated device types. These mediated device types are specified via the 'supported_type_groups' structure when a device is registered @@ -318,61 +325,72 @@ matrix device. The VFIO AP device driver will register one mediated device type for passthrough devices: + /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough + Only the read-only attributes required by the VFIO mdev framework will - be provided: - ... name - ... device_api - ... available_instances - ... device_api - Where: - * name: specifies the name of the mediated device type - * device_api: the mediated device type's API - * available_instances: the number of mediated matrix passthrough devices - that can be created - * device_api: specifies the VFIO API - * mdev_attr_groups + be provided:: + + ... name + ... device_api + ... available_instances + ... device_api + + Where: + + * name: + specifies the name of the mediated device type + * device_api: + the mediated device type's API + * available_instances: + the number of mediated matrix passthrough devices + that can be created + * device_api: + specifies the VFIO API + mdev_attr_groups This attribute group identifies the user-defined sysfs attributes of the mediated device. When a device is registered with the VFIO mediated device framework, the sysfs attribute files identified in the 'mdev_attr_groups' structure will be created in the mediated matrix device's directory. 
The sysfs attributes for a mediated matrix device are: - * assign_adapter: - * unassign_adapter: + + assign_adapter / unassign_adapter: Write-only attributes for assigning/unassigning an AP adapter to/from the mediated matrix device. To assign/unassign an adapter, the APID of the adapter is echoed to the respective attribute file. - * assign_domain: - * unassign_domain: + assign_domain / unassign_domain: Write-only attributes for assigning/unassigning an AP usage domain to/from the mediated matrix device. To assign/unassign a domain, the domain number of the the usage domain is echoed to the respective attribute file. - * matrix: + matrix: A read-only file for displaying the APQNs derived from the cross product of the adapter and domain numbers assigned to the mediated matrix device. - * assign_control_domain: - * unassign_control_domain: + assign_control_domain / unassign_control_domain: Write-only attributes for assigning/unassigning an AP control domain to/from the mediated matrix device. To assign/unassign a control domain, the ID of the domain to be assigned/unassigned is echoed to the respective attribute file. - * control_domains: + control_domains: A read-only file for displaying the control domain numbers assigned to the mediated matrix device. * functions: - * create: + + create: allocates the ap_matrix_mdev structure used by the vfio_ap driver to: + * Store the reference to the KVM structure for the guest using the mdev * Store the AP matrix configuration for the adapters, domains, and control domains assigned via the corresponding sysfs attributes files - * remove: + + remove: deallocates the mediated matrix device's ap_matrix_mdev structure. This will be allowed only if a running guest is not using the mdev. * callback interfaces - * open: + + open: The vfio_ap driver uses this callback to register a VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix device. 
The open is invoked when QEMU connects the VFIO iommu group @@ -380,16 +398,17 @@ matrix device. to configure the KVM guest is provided via this callback. The KVM structure, is used to configure the guest's access to the AP matrix defined via the mediated matrix device's sysfs attribute files. - * release: + release: unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix device and deconfigures the guest's AP matrix. -Configure the APM, AQM and ADM in the CRYCB: +Configure the APM, AQM and ADM in the CRYCB ------------------------------------------- Configuring the AP matrix for a KVM guest will be performed when the VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier function is called when QEMU connects to KVM. The guest's AP matrix is configured via it's CRYCB by: + * Setting the bits in the APM corresponding to the APIDs assigned to the mediated matrix device via its 'assign_adapter' interface. * Setting the bits in the AQM corresponding to the domains assigned to the @@ -418,12 +437,12 @@ available to a KVM guest via the following CPU model features: Note: If the user chooses to specify a CPU model different than the 'host' model to QEMU, the CPU model features and facilities need to be turned on -explicitly; for example: +explicitly; for example:: /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on A guest can be precluded from using AP features/facilities by turning them off -explicitly; for example: +explicitly; for example:: /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off @@ -435,7 +454,7 @@ the APFT facility is not installed on the guest, then the probe of device drivers will fail since only type 10 and newer devices can be configured for guest use. -Example: +Example ======= Let's now provide an example to illustrate how KVM guests may be given access to AP facilities. 
For this example, we will show how to configure @@ -444,30 +463,36 @@ look like this: Guest1 ------ +=========== ===== ============ CARD.DOMAIN TYPE MODE ------------------------------- +=========== ===== ============ 05 CEX5C CCA-Coproc 05.0004 CEX5C CCA-Coproc 05.00ab CEX5C CCA-Coproc 06 CEX5A Accelerator 06.0004 CEX5A Accelerator 06.00ab CEX5C CCA-Coproc +=========== ===== ============ Guest2 ------ +=========== ===== ============ CARD.DOMAIN TYPE MODE ------------------------------- +=========== ===== ============ 05 CEX5A Accelerator 05.0047 CEX5A Accelerator 05.00ff CEX5A Accelerator +=========== ===== ============ Guest2 ------ +=========== ===== ============ CARD.DOMAIN TYPE MODE ------------------------------- +=========== ===== ============ 06 CEX5A Accelerator 06.0047 CEX5A Accelerator 06.00ff CEX5A Accelerator +=========== ===== ============ These are the steps: @@ -492,25 +517,26 @@ These are the steps: * VFIO_MDEV_DEVICE * KVM - If using make menuconfig select the following to build the vfio_ap module: - -> Device Drivers - -> IOMMU Hardware Support - select S390 AP IOMMU Support - -> VFIO Non-Privileged userspace driver framework - -> Mediated device driver frramework - -> VFIO driver for Mediated devices - -> I/O subsystem - -> VFIO support for AP devices + If using make menuconfig select the following to build the vfio_ap module:: + + -> Device Drivers + -> IOMMU Hardware Support + select S390 AP IOMMU Support + -> VFIO Non-Privileged userspace driver framework + -> Mediated device driver framework + -> VFIO driver for Mediated devices + -> I/O subsystem + -> VFIO support for AP devices 2. Secure the AP queues to be used by the three guests so that the host can not access them. To secure them, there are two sysfs files that specify bitmasks marking a subset of the APQN range as 'usable by the default AP queue device drivers' or 'not usable by the default device drivers' and thus available for use by the vfio_ap device driver'.
The location of the sysfs - files containing the masks are: + files containing the masks are:: - /sys/bus/ap/apmask - /sys/bus/ap/aqmask + /sys/bus/ap/apmask + /sys/bus/ap/aqmask The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs (APID). Each bit in the mask, from left to right (i.e., from most significant @@ -526,7 +552,7 @@ These are the steps: queue device drivers; otherwise, the APQI is usable by the vfio_ap device driver. - Take, for example, the following mask: + Take, for example, the following mask:: 0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff @@ -548,68 +574,70 @@ These are the steps: respective sysfs mask file in one of two formats: * An absolute hex string starting with 0x - like "0x12345678" - sets - the mask. If the given string is shorter than the mask, it is padded - with 0s on the right; for example, specifying a mask value of 0x41 is - the same as specifying: + the mask. If the given string is shorter than the mask, it is padded + with 0s on the right; for example, specifying a mask value of 0x41 is + the same as specifying:: - 0x4100000000000000000000000000000000000000000000000000000000000000 + 0x4100000000000000000000000000000000000000000000000000000000000000 - Keep in mind that the mask reads from left to right (i.e., most - significant to least significant bit in big endian order), so the mask - above identifies device numbers 1 and 7 (01000001). + Keep in mind that the mask reads from left to right (i.e., most + significant to least significant bit in big endian order), so the mask + above identifies device numbers 1 and 7 (01000001). - If the string is longer than the mask, the operation is terminated with - an error (EINVAL). + If the string is longer than the mask, the operation is terminated with + an error (EINVAL). * Individual bits in the mask can be switched on and off by specifying - each bit number to be switched in a comma separated list. 
Each bit - number string must be prepended with a ('+') or minus ('-') to indicate - the corresponding bit is to be switched on ('+') or off ('-'). Some - valid values are: + each bit number to be switched in a comma separated list. Each bit + number string must be prepended with a ('+') or minus ('-') to indicate + the corresponding bit is to be switched on ('+') or off ('-'). Some + valid values are: - "+0" switches bit 0 on - "-13" switches bit 13 off - "+0x41" switches bit 65 on - "-0xff" switches bit 255 off + - "+0" switches bit 0 on + - "-13" switches bit 13 off + - "+0x41" switches bit 65 on + - "-0xff" switches bit 255 off - The following example: - +0,-6,+0x47,-0xf0 + The following example: - Switches bits 0 and 71 (0x47) on - Switches bits 6 and 240 (0xf0) off + +0,-6,+0x47,-0xf0 - Note that the bits not specified in the list remain as they were before - the operation. + Switches bits 0 and 71 (0x47) on + + Switches bits 6 and 240 (0xf0) off + + Note that the bits not specified in the list remain as they were before + the operation. 2. 
The masks can also be changed at boot time via parameters on the kernel command line like this: - ap.apmask=0xffff ap.aqmask=0x40 + ap.apmask=0xffff ap.aqmask=0x40 - This would create the following masks: + This would create the following masks:: - apmask: - 0xffff000000000000000000000000000000000000000000000000000000000000 + apmask: + 0xffff000000000000000000000000000000000000000000000000000000000000 - aqmask: - 0x4000000000000000000000000000000000000000000000000000000000000000 + aqmask: + 0x4000000000000000000000000000000000000000000000000000000000000000 - Resulting in these two pools: + Resulting in these two pools:: - default drivers pool: adapter 0-15, domain 1 - alternate drivers pool: adapter 16-255, domains 0, 2-255 + default drivers pool: adapter 0-15, domain 1 + alternate drivers pool: adapter 16-255, domains 0, 2-255 - Securing the APQNs for our example: - ---------------------------------- +Securing the APQNs for our example +---------------------------------- To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047, 06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding - APQNs can either be removed from the default masks: + APQNs can either be removed from the default masks:: echo -5,-6 > /sys/bus/ap/apmask echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask - Or the masks can be set as follows: + Or the masks can be set as follows:: echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \ > apmask @@ -620,19 +648,19 @@ These are the steps: This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The sysfs directory for the vfio_ap device driver will now contain symbolic links - to the AP queue devices bound to it: + to the AP queue devices bound to it:: - /sys/bus/ap - ... [drivers] - ...... [vfio_ap] - ......... [05.0004] - ......... [05.0047] - ......... [05.00ab] - ......... [05.00ff] - ......... 
[06.0004] - ......... [06.0047] - ......... [06.00ab] - ......... [06.00ff] + /sys/bus/ap + ... [drivers] + ...... [vfio_ap] + ......... [05.0004] + ......... [05.0047] + ......... [05.00ab] + ......... [05.00ff] + ......... [06.0004] + ......... [06.0047] + ......... [06.00ab] + ......... [06.00ff] Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later) can be bound to the vfio_ap device driver. The reason for this is to @@ -645,96 +673,96 @@ These are the steps: queue device can be read from the parent card's sysfs directory. For example, to see the hardware type of the queue 05.0004: - cat /sys/bus/ap/devices/card05/hwtype + cat /sys/bus/ap/devices/card05/hwtype The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the vfio_ap device driver. 3. Create the mediated devices needed to configure the AP matrixes for the three guests and to provide an interface to the vfio_ap driver for - use by the guests: + use by the guests:: - /sys/devices/vfio_ap/matrix/ - --- [mdev_supported_types] - ------ [vfio_ap-passthrough] (passthrough mediated matrix device type) - --------- create - --------- [devices] + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] (passthrough mediated matrix device type) + --------- create + --------- [devices] - To create the mediated devices for the three guests: + To create the mediated devices for the three guests:: uuidgen > create uuidgen > create uuidgen > create - or + or - echo $uuid1 > create - echo $uuid2 > create - echo $uuid3 > create + echo $uuid1 > create + echo $uuid2 > create + echo $uuid3 > create This will create three mediated devices in the [devices] subdirectory named after the UUID written to the create attribute file. 
We call them $uuid1, - $uuid2 and $uuid3 and this is the sysfs directory structure after creation: + $uuid2 and $uuid3 and this is the sysfs directory structure after creation:: - /sys/devices/vfio_ap/matrix/ - --- [mdev_supported_types] - ------ [vfio_ap-passthrough] - --------- [devices] - ------------ [$uuid1] - --------------- assign_adapter - --------------- assign_control_domain - --------------- assign_domain - --------------- matrix - --------------- unassign_adapter - --------------- unassign_control_domain - --------------- unassign_domain + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] + --------- [devices] + ------------ [$uuid1] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + --------------- unassign_control_domain + --------------- unassign_domain - ------------ [$uuid2] - --------------- assign_adapter - --------------- assign_control_domain - --------------- assign_domain - --------------- matrix - --------------- unassign_adapter - ----------------unassign_control_domain - ----------------unassign_domain + ------------ [$uuid2] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + ----------------unassign_control_domain + ----------------unassign_domain - ------------ [$uuid3] - --------------- assign_adapter - --------------- assign_control_domain - --------------- assign_domain - --------------- matrix - --------------- unassign_adapter - ----------------unassign_control_domain - ----------------unassign_domain + ------------ [$uuid3] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + ----------------unassign_control_domain + ----------------unassign_domain 4. 
The administrator now needs to configure the matrixes for the mediated devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3). - This is how the matrix is configured for Guest1: + This is how the matrix is configured for Guest1:: echo 5 > assign_adapter echo 6 > assign_adapter echo 4 > assign_domain echo 0xab > assign_domain - Control domains can similarly be assigned using the assign_control_domain - sysfs file. + Control domains can similarly be assigned using the assign_control_domain + sysfs file. - If a mistake is made configuring an adapter, domain or control domain, - you can use the unassign_xxx files to unassign the adapter, domain or - control domain. + If a mistake is made configuring an adapter, domain or control domain, + you can use the unassign_xxx files to unassign the adapter, domain or + control domain. - To display the matrix configuration for Guest1: + To display the matrix configuration for Guest1:: - cat matrix + cat matrix - This is how the matrix is configured for Guest2: + This is how the matrix is configured for Guest2:: echo 5 > assign_adapter echo 0x47 > assign_domain echo 0xff > assign_domain - This is how the matrix is configured for Guest3: + This is how the matrix is configured for Guest3:: echo 6 > assign_adapter echo 0x47 > assign_domain @@ -783,24 +811,24 @@ These are the steps: configured for the system. If a control domain number higher than the maximum is specified, the operation will terminate with an error (ENODEV). -5. Start Guest1: +5. Start Guest1:: - /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ - -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ... + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ... -7. Start Guest2: +6. Start Guest2:: - /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ - -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ...
+ /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ... -7. Start Guest3: +7. Start Guest3:: - /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ - -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ... + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ... When the guest is shut down, the mediated matrix devices may be removed. -Using our example again, to remove the mediated matrix device $uuid1: +Using our example again, to remove the mediated matrix device $uuid1:: /sys/devices/vfio_ap/matrix/ --- [mdev_supported_types] @@ -809,18 +837,19 @@ Using our example again, to remove the mediated matrix device $uuid1: ------------ [$uuid1] --------------- remove +:: echo 1 > remove - This will remove all of the mdev matrix device's sysfs structures including - the mdev device itself. To recreate and reconfigure the mdev matrix device, - all of the steps starting with step 3 will have to be performed again. Note - that the remove will fail if a guest using the mdev is still running. +This will remove all of the mdev matrix device's sysfs structures including +the mdev device itself. To recreate and reconfigure the mdev matrix device, +all of the steps starting with step 3 will have to be performed again. Note +that the remove will fail if a guest using the mdev is still running. - It is not necessary to remove an mdev matrix device, but one may want to - remove it if no guest will use it during the remaining lifetime of the linux - host. If the mdev matrix device is removed, one may want to also reconfigure - the pool of adapters and queues reserved for use by the default drivers. +It is not necessary to remove an mdev matrix device, but one may want to +remove it if no guest will use it during the remaining lifetime of the linux +host. 
If the mdev matrix device is removed, one may want to also reconfigure +the pool of adapters and queues reserved for use by the default drivers. Limitations =========== diff --git a/Documentation/s390/vfio-ccw.txt b/Documentation/s390/vfio-ccw.rst similarity index 89% rename from Documentation/s390/vfio-ccw.txt rename to Documentation/s390/vfio-ccw.rst index 2be11ad864ff..1f6d0b56d53e 100644 --- a/Documentation/s390/vfio-ccw.txt +++ b/Documentation/s390/vfio-ccw.rst @@ -1,3 +1,4 @@ +================================== vfio-ccw: the basic infrastructure ================================== @@ -11,9 +12,11 @@ virtual machine, while vfio is the means. Different than other hardware architectures, s390 has defined a unified I/O access method, which is so called Channel I/O. It has its own access patterns: + - Channel programs run asynchronously on a separate (co)processor. - The channel subsystem will access any memory designated by the caller in the channel program directly, i.e. there is no iommu involved. + Thus when we introduce vfio support for these devices, we realize it with a mediated device (mdev) implementation. The vfio mdev will be added to an iommu group, so as to make itself able to be managed by the @@ -24,6 +27,7 @@ to perform I/O instructions. This document does not intend to explain the s390 I/O architecture in every detail. More information/reference could be found here: + - A good start to know Channel I/O in general: https://en.wikipedia.org/wiki/Channel_I/O - s390 architecture: @@ -80,6 +84,7 @@ until interrupted. The I/O completion result is received by the interrupt handler in the form of interrupt response block (IRB). Back to vfio-ccw, in short: + - ORBs and channel programs are built in guest kernel (with guest physical addresses). - ORBs and channel programs are passed to the host kernel. @@ -106,6 +111,7 @@ it gets sent to hardware. 
Within this implementation, we have two drivers for two types of devices: + - The vfio_ccw driver for the physical subchannel device. This is an I/O subchannel driver for the real subchannel device. It realizes a group of callbacks and registers to the mdev framework as a @@ -137,7 +143,7 @@ devices: vfio_pin_pages and a vfio_unpin_pages interfaces from the vfio iommu backend for the physical devices to pin and unpin pages by demand. -Below is a high Level block diagram. +Below is a high Level block diagram:: +-------------+ | | @@ -158,6 +164,7 @@ Below is a high Level block diagram. +-------------+ The process of how these work together. + 1. vfio_ccw.ko drives the physical I/O subchannel, and registers the physical device (with callbacks) to mdev framework. When vfio_ccw probing the subchannel device, it registers device @@ -178,17 +185,17 @@ vfio-ccw I/O region An I/O region is used to accept channel program request from user space and store I/O interrupt result for user space to retrieve. The -definition of the region is: +definition of the region is:: -struct ccw_io_region { -#define ORB_AREA_SIZE 12 - __u8 orb_area[ORB_AREA_SIZE]; -#define SCSW_AREA_SIZE 12 - __u8 scsw_area[SCSW_AREA_SIZE]; -#define IRB_AREA_SIZE 96 - __u8 irb_area[IRB_AREA_SIZE]; - __u32 ret_code; -} __packed; + struct ccw_io_region { + #define ORB_AREA_SIZE 12 + __u8 orb_area[ORB_AREA_SIZE]; + #define SCSW_AREA_SIZE 12 + __u8 scsw_area[SCSW_AREA_SIZE]; + #define IRB_AREA_SIZE 96 + __u8 irb_area[IRB_AREA_SIZE]; + __u32 ret_code; + } __packed; While starting an I/O request, orb_area should be filled with the guest ORB, and scsw_area should be filled with the SCSW of the Virtual @@ -205,7 +212,7 @@ vfio-ccw follows what vfio-pci did on the s390 platform and uses vfio-iommu-type1 as the vfio iommu backend. * CCW translation APIs - A group of APIs (start with 'cp_') to do CCW translation. The CCWs + A group of APIs (start with `cp_`) to do CCW translation. 
The CCWs passed in by a user space program are organized with their guest physical memory addresses. These APIs will copy the CCWs into kernel space, and assemble a runnable kernel channel program by updating the @@ -217,12 +224,14 @@ vfio-iommu-type1 as the vfio iommu backend. This driver utilizes the CCW translation APIs and introduces vfio_ccw, which is the driver for the I/O subchannel devices you want to pass through. - vfio_ccw implements the following vfio ioctls: + vfio_ccw implements the following vfio ioctls:: + VFIO_DEVICE_GET_INFO VFIO_DEVICE_GET_IRQ_INFO VFIO_DEVICE_GET_REGION_INFO VFIO_DEVICE_RESET VFIO_DEVICE_SET_IRQS + This provides an I/O region, so that the user space program can pass a channel program to the kernel, to do further CCW translation before issuing them to a real device. @@ -236,32 +245,49 @@ bit more detail how an I/O request triggered by the QEMU guest will be handled (without error handling). Explanation: -Q1-Q7: QEMU side process. -K1-K5: Kernel side process. -Q1. Get I/O region info during initialization. -Q2. Setup event notifier and handler to handle I/O completion. +- Q1-Q7: QEMU side process. +- K1-K6: Kernel side process. + +Q1. + Get I/O region info during initialization. + +Q2. + Setup event notifier and handler to handle I/O completion. ... ... -Q3. Intercept a ssch instruction. -Q4. Write the guest channel program and ORB to the I/O region. - K1. Copy from guest to kernel. - K2. Translate the guest channel program to a host kernel space - channel program, which becomes runnable for a real device. - K3. With the necessary information contained in the orb passed in - by QEMU, issue the ccwchain to the device. - K4. Return the ssch CC code. -Q5. Return the CC code to the guest. +Q3. + Intercept a ssch instruction. +Q4. + Write the guest channel program and ORB to the I/O region. + + K1. + Copy from guest to kernel. + K2.
+ Translate the guest channel program to a host kernel space + channel program, which becomes runnable for a real device. + K3. + With the necessary information contained in the orb passed in + by QEMU, issue the ccwchain to the device. + K4. + Return the ssch CC code. +Q5. + Return the CC code to the guest. ... ... - K5. Interrupt handler gets the I/O result and write the result to - the I/O region. - K6. Signal QEMU to retrieve the result. -Q6. Get the signal and event handler reads out the result from the I/O + K5. + Interrupt handler gets the I/O result and write the result to + the I/O region. + K6. + Signal QEMU to retrieve the result. + +Q6. + Get the signal and event handler reads out the result from the I/O region. -Q7. Update the irb for the guest. +Q7. + Update the irb for the guest. Limitations ----------- @@ -295,6 +321,6 @@ Reference 1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832) 2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204) 3. https://en.wikipedia.org/wiki/Channel_I/O -4. Documentation/s390/cds.txt +4. Documentation/s390/cds.rst 5. Documentation/vfio.txt 6. Documentation/vfio-mediated-device.txt diff --git a/Documentation/s390/zfcpdump.txt b/Documentation/s390/zfcpdump.rst similarity index 97% rename from Documentation/s390/zfcpdump.txt rename to Documentation/s390/zfcpdump.rst index b064aa59714d..54e8e7caf7e7 100644 --- a/Documentation/s390/zfcpdump.txt +++ b/Documentation/s390/zfcpdump.rst @@ -1,4 +1,6 @@ +================================== The s390 SCSI dump tool (zfcpdump) +================================== System z machines (z900 or higher) provide hardware support for creating system dumps on SCSI disks. 
The dump process is initiated by booting a dump tool, which diff --git a/MAINTAINERS b/MAINTAINERS index a6954776a37e..0e904873fb0a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13703,7 +13703,7 @@ L: linux-s390@vger.kernel.org L: kvm@vger.kernel.org S: Supported F: drivers/s390/cio/vfio_ccw* -F: Documentation/s390/vfio-ccw.txt +F: Documentation/s390/vfio-ccw.rst F: include/uapi/linux/vfio_ccw.h S390 ZCRYPT DRIVER @@ -13723,7 +13723,7 @@ S: Supported F: drivers/s390/crypto/vfio_ap_drv.c F: drivers/s390/crypto/vfio_ap_private.h F: drivers/s390/crypto/vfio_ap_ops.c -F: Documentation/s390/vfio-ap.txt +F: Documentation/s390/vfio-ap.rst S390 ZFCP DRIVER M: Steffen Maier diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 66be2d813951..65522d6956ca 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -810,9 +810,9 @@ config CRASH_DUMP Crash dump kernels are loaded in the main kernel with kexec-tools into a specially reserved region and then later executed after a crash by kdump/kexec. - Refer to for more details on this. + Refer to for more details on this. This option also enables s390 zfcpdump. - See also + See also endmenu diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index c305d39f5016..b94783f71322 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -152,7 +152,7 @@ static inline debug_entry_t *debug_text_event(debug_info_t *id, int level, /* * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are - * stored in the s390dbf. See Documentation/s390/s390dbf.txt for more details! + * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details! */ extern debug_entry_t * __debug_sprintf_event(debug_info_t *id, int level, char *string, ...) @@ -210,7 +210,7 @@ static inline debug_entry_t *debug_text_exception(debug_info_t *id, int level, /* * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are - * stored in the s390dbf. 
See Documentation/s390/s390dbf.txt for more details! + * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details! */ extern debug_entry_t * __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index 405a60538630..08f812475f5e 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -4,7 +4,7 @@ * dumps on SCSI disks (zfcpdump). The "zcore/mem" debugfs file shows the same * dump format as s390 standalone dumps. * - * For more information please refer to Documentation/s390/zfcpdump.txt + * For more information please refer to Documentation/s390/zfcpdump.rst * * Copyright IBM Corp. 2003, 2008 * Author(s): Michael Holzheu From a20aa857e0c207c27d4b2c98af7d97539faf2cc5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 8 Jun 2019 23:27:17 -0300 Subject: [PATCH 29/83] s390: include/asm/debug.h add kerneldoc markups Instead of keeping the documentation inside s390dbf.rst, move it to arch/s390/include/asm/debug.h, using standard kernel-doc markups. Keeping the documentation close to the code helps to keep it updated. It also makes it easier to document other stuff inside debug.h, as all it needs is to add kernel-doc markups inside it, as the file will already be included in the produced documentation. - Those were converted to kerneldoc using this script specially designed to parse this file, and manually edited: Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Heiko Carstens --- Documentation/s390/s390dbf.rst | 672 +-------------------------------- arch/s390/include/asm/debug.h | 231 ++++++++++++ 2 files changed, 232 insertions(+), 671 deletions(-) diff --git a/Documentation/s390/s390dbf.rst b/Documentation/s390/s390dbf.rst index ec2a1faa414b..d2595b548879 100644 --- a/Documentation/s390/s390dbf.rst +++ b/Documentation/s390/s390dbf.rst @@ -104,684 +104,14 @@ the "debug_stoppable" sysctl. 
If you set "debug_stoppable" to 0 the debug feature cannot be stopped. If the debug feature is already stopped, it will stay deactivated. ----------------------------------------------------------------------------- - Kernel Interfaces: ------------------ -:: - - debug_info_t *debug_register(char *name, int pages, int nr_areas, - int buf_size); - -Parameter: - name: - Name of debug log (e.g. used for debugfs entry) - pages: - Number of pages, which will be allocated per area - nr_areas: - Number of debug areas - buf_size: - Size of data area in each debug entry - -Return Value: - Handle for generated debug area - - NULL if register failed - -Description: Allocates memory for a debug log - Must not be called within an interrupt handler - ----------------------------------------------------------------------------- - -:: - - debug_info_t *debug_register_mode(char *name, int pages, int nr_areas, - int buf_size, mode_t mode, uid_t uid, - gid_t gid); - -Parameter: - name: - Name of debug log (e.g. used for debugfs entry) - pages: - Number of pages, which will be allocated per area - nr_areas: - Number of debug areas - buf_size: - Size of data area in each debug entry - mode: - File mode for debugfs files. E.g. S_IRWXUGO - uid: - User ID for debugfs files. Currently only 0 is - supported. - gid: - Group ID for debugfs files. Currently only 0 is - supported. - -Return Value: - Handle for generated debug area - - NULL if register failed - -Description: - Allocates memory for a debug log - Must not be called within an interrupt handler - ---------------------------------------------------------------------------- - -:: - - void debug_unregister (debug_info_t * id); - -Parameter: - id: - handle for debug log - -Return Value: - none - -Description: - frees memory for a debug log and removes all registered debug - views. 
- - Must not be called within an interrupt handler - ---------------------------------------------------------------------------- - -:: - - void debug_set_level (debug_info_t * id, int new_level); - -Parameter: id: handle for debug log - new_level: new debug level - -Return Value: - none - -Description: - Sets new actual debug level if new_level is valid. - ---------------------------------------------------------------------------- - -:: - - bool debug_level_enabled (debug_info_t * id, int level); - -Parameter: - id: - handle for debug log - level: - debug level - -Return Value: - True if level is less or equal to the current debug level. - -Description: - Returns true if debug events for the specified level would be - logged. Otherwise returns false. - ---------------------------------------------------------------------------- - -:: - - void debug_stop_all(void); - -Parameter: - none - -Return Value: - none - -Description: - stops the debug feature if stopping is allowed. Currently - used in case of a kernel oops. 
- ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_event (debug_info_t* id, int level, void* data, - int length); - -Parameter: - id: - handle for debug log - level: - debug level - data: - pointer to data for debug entry - length: - length of data in bytes - -Return Value: - Address of written debug entry - -Description: - writes debug entry to active debug area (if level <= actual - debug level) - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_int_event (debug_info_t * id, int level, - unsigned int data); - debug_entry_t* debug_long_event(debug_info_t * id, int level, - unsigned long data); - -Parameter: - id: - handle for debug log - level: - debug level - data: - integer value for debug entry - -Return Value: - Address of written debug entry - -Description: - writes debug entry to active debug area (if level <= actual - debug level) - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_text_event (debug_info_t * id, int level, - const char* data); - -Parameter: - id: - handle for debug log - level: - debug level - data: - string for debug entry - -Return Value: - Address of written debug entry - -Description: - writes debug entry in ascii format to active debug area - (if level <= actual debug level) - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_sprintf_event (debug_info_t * id, int level, - char* string,...); - -Parameter: - id: - handle for debug log - level: - debug level - string: - format string for debug entry - ...: - varargs used as in sprintf() - -Return Value: Address of written debug entry - -Description: - writes debug entry with format string and varargs (longs) to - active debug area (if level $<=$ actual debug level). - floats and long long datatypes cannot be used as varargs. 
- ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_exception (debug_info_t* id, int level, void* data, - int length); - -Parameter: - id: - handle for debug log - level: - debug level - data: - pointer to data for debug entry - length: - length of data in bytes - -Return Value: - Address of written debug entry - -Description: - writes debug entry to active debug area (if level <= actual - debug level) and switches to next debug area - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_int_exception (debug_info_t * id, int level, - unsigned int data); - debug_entry_t* debug_long_exception(debug_info_t * id, int level, - unsigned long data); - -Parameter: id: handle for debug log - level: debug level - data: integer value for debug entry - -Return Value: Address of written debug entry - -Description: writes debug entry to active debug area (if level <= actual - debug level) and switches to next debug area - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_text_exception (debug_info_t * id, int level, - const char* data); - -Parameter: id: handle for debug log - level: debug level - data: string for debug entry - -Return Value: Address of written debug entry - -Description: writes debug entry in ascii format to active debug area - (if level <= actual debug level) and switches to next debug - area - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_sprintf_exception (debug_info_t * id, int level, - char* string,...); - -Parameter: id: handle for debug log - level: debug level - string: format string for debug entry - ...: varargs used as in sprintf() - -Return Value: Address of written debug entry - -Description: writes debug entry with format string and varargs (longs) to - active debug area (if level $<=$ actual debug level) and - 
switches to next debug area. - floats and long long datatypes cannot be used as varargs. - ---------------------------------------------------------------------------- - -:: - - int debug_register_view (debug_info_t * id, struct debug_view *view); - -Parameter: id: handle for debug log - view: pointer to debug view struct - -Return Value: 0 : ok - < 0: Error - -Description: registers new debug view and creates debugfs dir entry - ---------------------------------------------------------------------------- - -:: - - int debug_unregister_view (debug_info_t * id, struct debug_view *view); - -Parameter: id: handle for debug log - view: pointer to debug view struct - -Return Value: 0 : ok - < 0: Error - -Description: unregisters debug view and removes debugfs dir entry - - +.. kernel-doc:: arch/s390/include/asm/debug.h Predefined views: ----------------- -extern struct debug_view debug_hex_ascii_view; - -extern struct debug_view debug_raw_view; - -extern struct debug_view debug_sprintf_view; - -Examples --------- - -:: - - /* - * hex_ascii- + raw-view Example - */ - - #include - #include - - static debug_info_t* debug_info; - - static int init(void) - { - /* register 4 debug areas with one page each and 4 byte data field */ - - debug_info = debug_register ("test", 1, 4, 4 ); - debug_register_view(debug_info,&debug_hex_ascii_view); - debug_register_view(debug_info,&debug_raw_view); - - debug_text_event(debug_info, 4 , "one "); - debug_int_exception(debug_info, 4, 4711); - debug_event(debug_info, 3, &debug_info, 4); - - return 0; - } - - static void cleanup(void) - { - debug_unregister (debug_info); - } - - module_init(init); - module_exit(cleanup); - ---------------------------------------------------------------------------- - -:: - - /* - * sprintf-view Example - */ - - #include - #include - - static debug_info_t* debug_info; - - static int init(void) - { - /* register 4 debug areas with one page each and data field for */ - /* format string pointer + 2 varargs (= 3 * 
sizeof(long)) */ - - debug_info = debug_register ("test", 1, 4, sizeof(long) * 3); - debug_register_view(debug_info,&debug_sprintf_view); - - debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__); - debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info); - - return 0; - } - - static void cleanup(void) - { - debug_unregister (debug_info); - } - - module_init(init); - module_exit(cleanup); - -Debugfs Interface ------------------ -Views to the debug logs can be investigated through reading the corresponding -debugfs-files: - -Example:: - - > ls /sys/kernel/debug/s390dbf/dasd - flush hex_ascii level pages raw - > cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s - 00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | .... - 00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE - 00 00974733272:682213 2 - 02 0006adf6 07 ea 4a 90 | .... - 00 00974733272:682281 1 * 02 0006ab08 41 4c 4c 43 | EXCP - 01 00974733272:682284 2 - 02 0006ab16 45 43 4b 44 | ECKD - 01 00974733272:682287 2 - 02 0006ab28 00 00 00 04 | .... - 01 00974733272:682289 2 - 02 0006ab3e 00 00 00 20 | ... - 01 00974733272:682297 2 - 02 0006ad7e 07 ea 4a 90 | .... - 01 00974733272:684384 2 - 00 0006ade6 46 52 45 45 | FREE - 01 00974733272:684388 2 - 00 0006adf6 07 ea 4a 90 | .... - -See section about predefined views for explanation of the above output! - -Changing the debug level ------------------------- - -Example:: - - - > cat /sys/kernel/debug/s390dbf/dasd/level - 3 - > echo "5" > /sys/kernel/debug/s390dbf/dasd/level - > cat /sys/kernel/debug/s390dbf/dasd/level - 5 - -Flushing debug areas --------------------- -Debug areas can be flushed with piping the number of the desired -area (0...n) to the debugfs file "flush". When using "-" all debug areas -are flushed. - -Examples: - -1. Flush debug area 0:: - - > echo "0" > /sys/kernel/debug/s390dbf/dasd/flush - -2. 
Flush all debug areas:: - - > echo "-" > /sys/kernel/debug/s390dbf/dasd/flush - -Changing the size of debug areas ------------------------------------- -It is possible the change the size of debug areas through piping -the number of pages to the debugfs file "pages". The resize request will -also flush the debug areas. - -Example: - -Define 4 pages for the debug areas of debug feature "dasd":: - - > echo "4" > /sys/kernel/debug/s390dbf/dasd/pages - -Stooping the debug feature --------------------------- -Example: - -1. Check if stopping is allowed:: - - > cat /proc/sys/s390dbf/debug_stoppable - -2. Stop debug feature:: - - > echo 0 > /proc/sys/s390dbf/debug_active - -lcrash Interface ----------------- -It is planned that the dump analysis tool lcrash gets an additional command -'s390dbf' to display all the debug logs. With this tool it will be possible -to investigate the debug logs on a live system and with a memory dump after -a system crash. - -Investigating raw memory ------------------------- -One last possibility to investigate the debug logs at a live -system and after a system crash is to look at the raw memory -under VM or at the Service Element. -It is possible to find the anker of the debug-logs through -the 'debug_area_first' symbol in the System map. Then one has -to follow the correct pointers of the data-structures defined -in debug.h and find the debug-areas in memory. -Normally modules which use the debug feature will also have -a global variable with the pointer to the debug-logs. Following -this pointer it will also be possible to find the debug logs in -memory. - -For this method it is recommended to use '16 * x + 4' byte (x = 0..n) -for the length of the data field in debug_register() in -order to see the debug entries well formatted. - - -Predefined Views ----------------- - -There are three predefined views: hex_ascii, raw and sprintf. -The hex_ascii view shows the data field in hex and ascii representation -(e.g. '45 43 4b 44 | ECKD'). 
-The raw view returns a bytestream as the debug areas are stored in memory. - -The sprintf view formats the debug entries in the same way as the sprintf -function would do. The sprintf event/exception functions write to the -debug entry a pointer to the format string (size = sizeof(long)) -and for each vararg a long value. So e.g. for a debug entry with a format -string plus two varargs one would need to allocate a (3 * sizeof(long)) -byte data area in the debug_register() function. - -IMPORTANT: - Using "%s" in sprintf event functions is dangerous. You can only - use "%s" in the sprintf event functions, if the memory for the passed string - is available as long as the debug feature exists. The reason behind this is - that due to performance considerations only a pointer to the string is stored - in the debug feature. If you log a string that is freed afterwards, you will - get an OOPS when inspecting the debug feature, because then the debug feature - will access the already freed memory. - -NOTE: - If using the sprintf view do NOT use other event/exception functions - than the sprintf-event and -exception functions. - -The format of the hex_ascii and sprintf view is as follows: - -- Number of area -- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated - Universal Time (UTC), January 1, 1970) -- level of debug entry -- Exception flag (* = Exception) -- Cpu-Number of calling task -- Return Address to caller -- data field - -The format of the raw view is: - -- Header as described in debug.h -- datafield - -A typical line of the hex_ascii view will look like the following (first line -is only for explanation and will not be displayed when 'cating' the view): - -area time level exception cpu caller data (hex + ascii) --------------------------------------------------------------------------- -00 00964419409:440690 1 - 00 88023fe - - -Defining views --------------- - -Views are specified with the 'debug_view' structure. 
There are defined -callback functions which are used for reading and writing the debugfs files:: - - struct debug_view { - char name[DEBUG_MAX_PROCF_LEN]; - debug_prolog_proc_t* prolog_proc; - debug_header_proc_t* header_proc; - debug_format_proc_t* format_proc; - debug_input_proc_t* input_proc; - void* private_data; - }; - -where:: - - typedef int (debug_header_proc_t) (debug_info_t* id, - struct debug_view* view, - int area, - debug_entry_t* entry, - char* out_buf); - - typedef int (debug_format_proc_t) (debug_info_t* id, - struct debug_view* view, char* out_buf, - const char* in_buf); - typedef int (debug_prolog_proc_t) (debug_info_t* id, - struct debug_view* view, - char* out_buf); - typedef int (debug_input_proc_t) (debug_info_t* id, - struct debug_view* view, - struct file* file, const char* user_buf, - size_t in_buf_size, loff_t* offset); - - -The "private_data" member can be used as pointer to view specific data. -It is not used by the debug feature itself. - -The output when reading a debugfs file is structured like this:: - - "prolog_proc output" - - "header_proc output 1" "format_proc output 1" - "header_proc output 2" "format_proc output 2" - "header_proc output 3" "format_proc output 3" - ... - -When a view is read from the debugfs, the Debug Feature calls the -'prolog_proc' once for writing the prolog. -Then 'header_proc' and 'format_proc' are called for each -existing debug entry. - -The input_proc can be used to implement functionality when it is written to -the view (e.g. like with 'echo "0" > /sys/kernel/debug/s390dbf/dasd/level). - -For header_proc there can be used the default function -debug_dflt_header_fn() which is defined in debug.h. -and which produces the same header output as the predefined views. -E.g:: - - 00 00964419409:440761 2 - 00 88023ec - -In order to see how to use the callback functions check the implementation -of the default views! 
- -Example:: - - #include - - #define UNKNOWNSTR "data: %08x" - - const char* messages[] = - {"This error...........\n", - "That error...........\n", - "Problem..............\n", - "Something went wrong.\n", - "Everything ok........\n", - NULL - }; - - static int debug_test_format_fn( - debug_info_t * id, struct debug_view *view, - char *out_buf, const char *in_buf - ) - { - int i, rc = 0; - - if(id->buf_size >= 4) { - int msg_nr = *((int*)in_buf); - if(msg_nr < sizeof(messages)/sizeof(char*) - 1) - rc += sprintf(out_buf, "%s", messages[msg_nr]); - else - rc += sprintf(out_buf, UNKNOWNSTR, msg_nr); - } - out: - return rc; - } - - struct debug_view debug_test_view = { - "myview", /* name of view */ - NULL, /* no prolog */ - &debug_dflt_header_fn, /* default header for each entry */ - &debug_test_format_fn, /* our own format function */ - NULL, /* no input function */ - NULL /* no private data */ - }; - -test: -===== - :: debug_info_t *debug_info; diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index b94783f71322..02c36eedd780 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -95,25 +95,106 @@ debug_entry_t *debug_exception_common(debug_info_t *id, int level, /* Debug Feature API: */ +/** + * debug_register() - allocates memory for a debug log. + * + * @name: Name of debug log (e.g. used for debugfs entry) + * @pages: Number of pages, which will be allocated per area + * @nr_areas: Number of debug areas + * @buf_size: Size of data area in each debug entry + * + * Return: + * - Handler for generated debug area + * - %NULL if register failed + * + * Must not be called within an interrupt handler. + */ debug_info_t *debug_register(const char *name, int pages, int nr_areas, int buf_size); +/** + * debug_register_mode() - allocates memory for a debug log. + * + * @name: Name of debug log (e.g. 
used for debugfs entry) + * @pages: Number of pages, which will be allocated per area + * @nr_areas: Number of debug areas + * @buf_size: Size of data area in each debug entry + * @mode: File mode for debugfs files. E.g. S_IRWXUGO + * @uid: User ID for debugfs files. Currently only 0 is supported. + * @gid: Group ID for debugfs files. Currently only 0 is supported. + * + * Return: + * - Handler for generated debug area + * - %NULL if register failed + * + * Must not be called within an interrupt handler + */ debug_info_t *debug_register_mode(const char *name, int pages, int nr_areas, int buf_size, umode_t mode, uid_t uid, gid_t gid); +/** + * debug_unregister() - frees memory for a debug log and removes all + * registered debug + * views. + * + * @id: handle for debug log + * + * Return: + * none + * + * Must not be called within an interrupt handler + */ void debug_unregister(debug_info_t *id); +/** + * debug_set_level() - Sets new actual debug level if new_level is valid. + * + * @id: handle for debug log + * @new_level: new debug level + * + * Return: + * none + */ void debug_set_level(debug_info_t *id, int new_level); void debug_set_critical(void); + +/** + * debug_stop_all() - stops the debug feature if stopping is allowed. + * + * Return: + * - none + */ void debug_stop_all(void); +/** + * debug_level_enabled() - Returns true if debug events for the specified + * level would be logged. Otherwise returns false. + * + * @id: handle for debug log + * @level: debug level + * + * Return: + * - %true if level is less or equal to the current debug level. 
+ */ static inline bool debug_level_enabled(debug_info_t *id, int level) { return level <= id->level; } +/** + * debug_event() - writes debug entry to active debug area + * (if level <= actual debug level) + * + * @id: handle for debug log + * @level: debug level + * @data: pointer to data for debug entry + * @length: length of data in bytes + * + * Return: + * - Address of written debug entry + */ static inline debug_entry_t *debug_event(debug_info_t *id, int level, void *data, int length) { @@ -122,6 +203,18 @@ static inline debug_entry_t *debug_event(debug_info_t *id, int level, return debug_event_common(id, level, data, length); } +/** + * debug_int_event() - writes debug entry to active debug area + * (if level <= actual debug level) + * + * @id: handle for debug log + * @level: debug level + * @tag: integer value for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_int_event(debug_info_t *id, int level, unsigned int tag) { @@ -132,6 +225,18 @@ static inline debug_entry_t *debug_int_event(debug_info_t *id, int level, return debug_event_common(id, level, &t, sizeof(unsigned int)); } +/** + * debug_long_event() - writes debug entry to active debug area + * (if level <= actual debug level) + * + * @id: handle for debug log + * @level: debug level + * @tag: integer value for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_long_event(debug_info_t *id, int level, unsigned long tag) { @@ -142,6 +247,18 @@ static inline debug_entry_t *debug_long_event(debug_info_t *id, int level, return debug_event_common(id, level, &t, sizeof(unsigned long)); } +/** + * debug_text_event() - writes debug entry in ascii format to active + * debug area (if level <= actual debug level) + * + * @id: handle for debug log + * @level: debug level + * @txt: string for debug entry + * + * Return: + * - Address of written debug entry + * - 
%NULL if error + */ static inline debug_entry_t *debug_text_event(debug_info_t *id, int level, const char *txt) { @@ -158,6 +275,22 @@ extern debug_entry_t * __debug_sprintf_event(debug_info_t *id, int level, char *string, ...) __attribute__ ((format(printf, 3, 4))); +/** + * debug_sprintf_event() - writes debug entry with format string + * and varargs (longs) to active debug area + * (if level $<=$ actual debug level). + * + * @_id: handle for debug log + * @_level: debug level + * @_fmt: format string for debug entry + * @...: varargs used as in sprintf() + * + * Return: + * - Address of written debug entry + * - %NULL if error + * + * floats and long long datatypes cannot be used as varargs. + */ #define debug_sprintf_event(_id, _level, _fmt, ...) \ ({ \ debug_entry_t *__ret; \ @@ -172,6 +305,20 @@ __debug_sprintf_event(debug_info_t *id, int level, char *string, ...) __ret; \ }) +/** + * debug_exception() - writes debug entry to active debug area + * (if level <= actual debug level) and switches + * to next debug area + * + * @id: handle for debug log + * @level: debug level + * @data: pointer to data for debug entry + * @length: length of data in bytes + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_exception(debug_info_t *id, int level, void *data, int length) { @@ -180,6 +327,19 @@ static inline debug_entry_t *debug_exception(debug_info_t *id, int level, return debug_exception_common(id, level, data, length); } +/** + * debug_int_exception() - writes debug entry to active debug area + * (if level <= actual debug level) + * and switches to next debug area + * + * @id: handle for debug log + * @level: debug level + * @tag: integer value for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_int_exception(debug_info_t *id, int level, unsigned int tag) { @@ -190,6 +350,19 @@ static inline debug_entry_t 
*debug_int_exception(debug_info_t *id, int level, return debug_exception_common(id, level, &t, sizeof(unsigned int)); } +/** + * debug_long_exception() - writes debug entry to active debug area + * (if level <= actual debug level) + * and switches to next debug area + * + * @id: handle for debug log + * @level: debug level + * @tag: integer value for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_long_exception (debug_info_t *id, int level, unsigned long tag) { @@ -200,6 +373,20 @@ static inline debug_entry_t *debug_long_exception (debug_info_t *id, int level, return debug_exception_common(id, level, &t, sizeof(unsigned long)); } +/** + * debug_text_exception() - writes debug entry in ascii format to active + * debug area (if level <= actual debug level) + * and switches to next debug + * area + * + * @id: handle for debug log + * @level: debug level + * @txt: string for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_text_exception(debug_info_t *id, int level, const char *txt) { @@ -216,6 +403,24 @@ extern debug_entry_t * __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) __attribute__ ((format(printf, 3, 4))); + +/** + * debug_sprintf_exception() - writes debug entry with format string and + * varargs (longs) to active debug area + * (if level $<=$ actual debug level) + * and switches to next debug area. + * + * @_id: handle for debug log + * @_level: debug level + * @_fmt: format string for debug entry + * @...: varargs used as in sprintf() + * + * Return: + * - Address of written debug entry + * - %NULL if error + * + * floats and long long datatypes cannot be used as varargs. + */ #define debug_sprintf_exception(_id, _level, _fmt, ...) \ ({ \ debug_entry_t *__ret; \ @@ -230,7 +435,33 @@ __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) 
__ret; \ }) +/** + * debug_register_view() - registers new debug view and creates debugfs + * dir entry + * + * @id: handle for debug log + * @view: pointer to debug view struct + * + * Return: + * - 0 : ok + * - < 0: Error + */ int debug_register_view(debug_info_t *id, struct debug_view *view); + +/** + * debug_unregister_view() + * + * @id: handle for debug log + * @view: pointer to debug view struct + * + * Return: + * - 0 : ok + * - < 0: Error + * + * + * unregisters debug view and removes debugfs dir entry + */ + int debug_unregister_view(debug_info_t *id, struct debug_view *view); /* From eec0a43ddd3667bec3d1dfccb75df69ba6fce331 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Sun, 9 Jun 2019 14:37:57 +0200 Subject: [PATCH 30/83] RAID/s390: remove invalid 'r' inline asm operand modifier gcc silently ignores unsupported inline asm operand modifiers, effectively turning '%r0' into '%0', but upcoming clang 9 complains about them: lib/raid6/s390vx8.c:63:16: error: invalid operand in inline asm: 'VLM $2,$3,0,${1:r}' asm volatile ("VLM %2,%3,0,%r1" ^ Clean up what looks like a typo: the 'r' inline asm operand modifier usage. 
Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- lib/raid6/s390vx.uc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/raid6/s390vx.uc b/lib/raid6/s390vx.uc index 914ebe98fc21..9e597e1f91a4 100644 --- a/lib/raid6/s390vx.uc +++ b/lib/raid6/s390vx.uc @@ -60,7 +60,7 @@ static inline void LOAD_DATA(int x, u8 *ptr) typedef struct { u8 _[16 * $#]; } addrtype; register addrtype *__ptr asm("1") = (addrtype *) ptr; - asm volatile ("VLM %2,%3,0,%r1" + asm volatile ("VLM %2,%3,0,%1" : : "m" (*__ptr), "a" (__ptr), "i" (x), "i" (x + $# - 1)); } From 2980ba6ae8ca558ff06155f17cf7139256e7d9ac Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 10 Jun 2019 17:22:38 +0200 Subject: [PATCH 31/83] s390/kdump: get rid of compile warning Move the CONFIG_CRASH_DUMP ifdef to get rid of this: arch/s390/kernel/machine_kexec.c:146:22: warning: 'do_start_kdump' defined but not used [-Wunused-function] Signed-off-by: Heiko Carstens --- arch/s390/kernel/machine_kexec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 8a1ae140c5e2..444a19125a81 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -141,7 +141,6 @@ static noinline void __machine_kdump(void *image) */ store_status(__do_machine_kdump, image); } -#endif static unsigned long do_start_kdump(unsigned long addr) { @@ -155,6 +154,8 @@ static unsigned long do_start_kdump(unsigned long addr) return rc; } +#endif /* CONFIG_CRASH_DUMP */ + /* * Check if kdump checksums are valid: We call purgatory with parameter "0" */ From 58443b676b9eef9f79ea1fe85ccfdffd4dc3bc6f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 12 Jun 2019 15:33:03 +0200 Subject: [PATCH 32/83] s390/pkey: Use -ENODEV instead of -EOPNOTSUPP systemd-modules-load.service automatically tries to load the pkey module on systems that have MSA. Pkey also requires the MSA3 facility and a bunch of subfunctions. 
Failing with -EOPNOTSUPP makes "systemd-modules-load.service" fail on any system that does not have all needed subfunctions. For example, when running under QEMU TCG (but also on systems where protected keys are disabled via the HMC). Let's use -ENODEV, so systemd-modules-load.service properly ignores failing to load the pkey module because of missing HW functionality. While at it, also convert the -EOPNOTSUPP in pkey_clr2protkey() to -ENODEV. Reviewed-by: Cornelia Huck Reviewed-by: Harald Freudenberger Signed-off-by: David Hildenbrand Signed-off-by: Heiko Carstens --- drivers/s390/crypto/pkey_api.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index 45eb0c14b880..7f418d2d8cdf 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -690,7 +690,7 @@ int pkey_clr2protkey(u32 keytype, */ if (!cpacf_test_func(&pckmo_functions, fc)) { DEBUG_ERR("%s pckmo functions not available\n", __func__); - return -EOPNOTSUPP; + return -ENODEV; } /* prepare param block */ @@ -1695,15 +1695,15 @@ static int __init pkey_init(void) * are able to work with protected keys. 
*/ if (!cpacf_query(CPACF_PCKMO, &pckmo_functions)) - return -EOPNOTSUPP; + return -ENODEV; /* check for kmc instructions available */ if (!cpacf_query(CPACF_KMC, &kmc_functions)) - return -EOPNOTSUPP; + return -ENODEV; if (!cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_128) || !cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_192) || !cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_256)) - return -EOPNOTSUPP; + return -ENODEV; pkey_debug_init(); From 1c0908fcdaeb35ff200241280518fbe356f11e57 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 12 Jun 2019 15:33:04 +0200 Subject: [PATCH 33/83] s390/crypto: ghash: Use -ENODEV instead of -EOPNOTSUPP Let's use the error value that is typically used if HW support is not available when trying to load a module - this is also what systemd's systemd-modules-load.service expects. Reviewed-by: Cornelia Huck Reviewed-by: Harald Freudenberger Signed-off-by: David Hildenbrand Signed-off-by: Heiko Carstens --- arch/s390/crypto/ghash_s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c index 86aed30fad3a..eeeb6a7737a4 100644 --- a/arch/s390/crypto/ghash_s390.c +++ b/arch/s390/crypto/ghash_s390.c @@ -137,7 +137,7 @@ static struct shash_alg ghash_alg = { static int __init ghash_mod_init(void) { if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_GHASH)) - return -EOPNOTSUPP; + return -ENODEV; return crypto_register_shash(&ghash_alg); } From ba6a98fe79b416541d11d6d4a89ba8d86b5409a2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 12 Jun 2019 15:33:05 +0200 Subject: [PATCH 34/83] s390/crypto: prng: Use -ENODEV instead of -EOPNOTSUPP Let's use the error value that is typically used if HW support is not available when trying to load a module - this is also what systemd's systemd-modules-load.service expects. 
Reviewed-by: Cornelia Huck Reviewed-by: Harald Freudenberger Signed-off-by: David Hildenbrand Signed-off-by: Heiko Carstens --- arch/s390/crypto/prng.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 12cca467af7d..d977643fa627 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -824,7 +824,7 @@ static int __init prng_init(void) /* check if the CPU has a PRNG */ if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) - return -EOPNOTSUPP; + return -ENODEV; /* check if TRNG subfunction is available */ if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) @@ -837,7 +837,7 @@ static int __init prng_init(void) if (prng_mode == PRNG_MODE_SHA512) { pr_err("The prng module cannot " "start in SHA-512 mode\n"); - return -EOPNOTSUPP; + return -ENODEV; } prng_mode = PRNG_MODE_TDES; } else From 45488c48e49b6ded9850bb0293668a92f96293c2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 12 Jun 2019 15:33:06 +0200 Subject: [PATCH 35/83] s390/crypto: sha: Use -ENODEV instead of -EOPNOTSUPP Let's use the error value that is typically used if HW support is not available when trying to load a module - this is also what systemd's systemd-modules-load.service expects. 
Reviewed-by: Cornelia Huck Reviewed-by: Harald Freudenberger Signed-off-by: David Hildenbrand Signed-off-by: Heiko Carstens --- arch/s390/crypto/sha1_s390.c | 2 +- arch/s390/crypto/sha256_s390.c | 2 +- arch/s390/crypto/sha512_s390.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c index 009572e8276d..7c15542d3685 100644 --- a/arch/s390/crypto/sha1_s390.c +++ b/arch/s390/crypto/sha1_s390.c @@ -86,7 +86,7 @@ static struct shash_alg alg = { static int __init sha1_s390_init(void) { if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1)) - return -EOPNOTSUPP; + return -ENODEV; return crypto_register_shash(&alg); } diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c index 62833a1d8724..af7505148f80 100644 --- a/arch/s390/crypto/sha256_s390.c +++ b/arch/s390/crypto/sha256_s390.c @@ -117,7 +117,7 @@ static int __init sha256_s390_init(void) int ret; if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) - return -EOPNOTSUPP; + return -ENODEV; ret = crypto_register_shash(&sha256_alg); if (ret < 0) goto out; diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c index be589c340d15..ad29db085a18 100644 --- a/arch/s390/crypto/sha512_s390.c +++ b/arch/s390/crypto/sha512_s390.c @@ -127,7 +127,7 @@ static int __init init(void) int ret; if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512)) - return -EOPNOTSUPP; + return -ENODEV; if ((ret = crypto_register_shash(&sha512_alg)) < 0) goto out; if ((ret = crypto_register_shash(&sha384_alg)) < 0) From 64e1f0c531d1072cd97939bf0d8df42b26713543 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Thu, 13 Sep 2018 18:57:16 +0200 Subject: [PATCH 36/83] s390/mm: force swiotlb for protected virtualization On s390, protected virtualization guests have to use bounced I/O buffers. That requires some plumbing. 
Let us make sure that any device that correctly uses the DMA API with direct ops is spared from the problems that a hypervisor attempting I/O to a non-shared page would bring. Signed-off-by: Halil Pasic Reviewed-by: Claudio Imbrenda Reviewed-by: Michael Mueller Tested-by: Michael Mueller Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 4 +++ arch/s390/include/asm/mem_encrypt.h | 17 +++++++++++ arch/s390/mm/init.c | 47 +++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 arch/s390/include/asm/mem_encrypt.h diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 65522d6956ca..35bb76491600 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +config ARCH_HAS_MEM_ENCRYPT + def_bool y + config MMU def_bool y @@ -186,6 +189,7 @@ config S390 select VIRT_CPU_ACCOUNTING select ARCH_HAS_SCALED_CPUTIME select HAVE_NMI + select SWIOTLB config SCHED_OMIT_FRAME_POINTER diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..3eb018508190 --- /dev/null +++ b/arch/s390/include/asm/mem_encrypt.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef S390_MEM_ENCRYPT_H__ +#define S390_MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#define sme_me_mask 0ULL + +static inline bool sme_active(void) { return false; } +extern bool sev_active(void); + +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + +#endif /* __ASSEMBLY__ */ + +#endif /* S390_MEM_ENCRYPT_H__ */ diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 14d1eae9fe43..f0bee6af3960 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +44,8 @@ #include #include #include +#include +#include pgd_t 
swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir); @@ -128,6 +132,47 @@ void mark_rodata_ro(void) pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); } +int set_memory_encrypted(unsigned long addr, int numpages) +{ + int i; + + /* make specified pages unshared, (swiotlb, dma_free) */ + for (i = 0; i < numpages; ++i) { + uv_remove_shared(addr); + addr += PAGE_SIZE; + } + return 0; +} + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + int i; + /* make specified pages shared (swiotlb, dma_alloca) */ + for (i = 0; i < numpages; ++i) { + uv_set_shared(addr); + addr += PAGE_SIZE; + } + return 0; +} + +/* are we a protected virtualization guest? */ +bool sev_active(void) +{ + return is_prot_virt_guest(); +} + +/* protected virtualization */ +static void pv_init(void) +{ + if (!is_prot_virt_guest()) + return; + + /* make sure bounce buffers are shared */ + swiotlb_init(1); + swiotlb_update_mem_attributes(); + swiotlb_force = SWIOTLB_FORCE; +} + void __init mem_init(void) { cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); @@ -136,6 +181,8 @@ void __init mem_init(void) set_max_mapnr(max_low_pfn); high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + pv_init(); + /* Setup guest page hinting */ cmma_init(); From bb99332a2b558e1f28b4c5011f9ea3b46f1c8806 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Tue, 2 Apr 2019 18:47:29 +0200 Subject: [PATCH 37/83] s390/cio: introduce DMA pools to cio To support protected virtualization cio will need to make sure the memory used for communication with the hypervisor is DMA memory. Let us introduce one global pool for cio. Our DMA pools are implemented as a gen_pool backed with DMA pages. The idea is to avoid each allocation effectively wasting a page, as we typically allocate much less than PAGE_SIZE. 
Signed-off-by: Halil Pasic Reviewed-by: Sebastian Ott Reviewed-by: Cornelia Huck Reviewed-by: Michael Mueller Tested-by: Michael Mueller Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 + arch/s390/include/asm/cio.h | 11 +++ drivers/s390/cio/css.c | 133 ++++++++++++++++++++++++++++++++++-- 3 files changed, 141 insertions(+), 4 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 35bb76491600..fdb4246265a5 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -190,6 +190,7 @@ config S390 select ARCH_HAS_SCALED_CPUTIME select HAVE_NMI select SWIOTLB + select GENERIC_ALLOCATOR config SCHED_OMIT_FRAME_POINTER diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 1727180e8ca1..58e7db912c30 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -7,6 +7,7 @@ #include #include +#include #include #define LPM_ANYPATH 0xff @@ -328,6 +329,16 @@ static inline u8 pathmask_to_pos(u8 mask) void channel_subsystem_reinit(void); extern void css_schedule_reprobe(void); +extern void *cio_dma_zalloc(size_t size); +extern void cio_dma_free(void *cpu_addr, size_t size); +extern struct device *cio_get_dma_css_dev(void); + +void *cio_gp_dma_zalloc(struct gen_pool *gp_dma, struct device *dma_dev, + size_t size); +void cio_gp_dma_free(struct gen_pool *gp_dma, void *cpu_addr, size_t size); +void cio_gp_dma_destroy(struct gen_pool *gp_dma, struct device *dma_dev); +struct gen_pool *cio_gp_dma_create(struct device *dma_dev, int nr_pages); + /* Function from drivers/s390/cio/chsc.c */ int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index aea502922646..7b1a440a1f8e 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include @@ -224,6 +226,12 @@ struct subchannel *css_alloc_subchannel(struct 
subchannel_id schid, INIT_WORK(&sch->todo_work, css_sch_todo); sch->dev.release = &css_subchannel_release; device_initialize(&sch->dev); + /* + * The physical addresses of some the dma structures that can + * belong to a subchannel need to fit 31 bit width (e.g. ccw). + */ + sch->dev.coherent_dma_mask = DMA_BIT_MASK(31); + sch->dev.dma_mask = &sch->dev.coherent_dma_mask; return sch; err: @@ -899,6 +907,13 @@ static int __init setup_css(int nr) dev_set_name(&css->device, "css%x", nr); css->device.groups = cssdev_attr_groups; css->device.release = channel_subsystem_release; + /* + * We currently allocate notifier bits with this (using + * css->device as the device argument with the DMA API) + * and are fine with 64 bit addresses. + */ + css->device.coherent_dma_mask = DMA_BIT_MASK(64); + css->device.dma_mask = &css->device.coherent_dma_mask; mutex_init(&css->mutex); css->cssid = chsc_get_cssid(nr); @@ -1018,6 +1033,111 @@ static struct notifier_block css_power_notifier = { .notifier_call = css_power_event, }; +#define CIO_DMA_GFP (GFP_KERNEL | __GFP_ZERO) +static struct gen_pool *cio_dma_pool; + +/* Currently cio supports only a single css */ +struct device *cio_get_dma_css_dev(void) +{ + return &channel_subsystems[0]->device; +} + +struct gen_pool *cio_gp_dma_create(struct device *dma_dev, int nr_pages) +{ + struct gen_pool *gp_dma; + void *cpu_addr; + dma_addr_t dma_addr; + int i; + + gp_dma = gen_pool_create(3, -1); + if (!gp_dma) + return NULL; + for (i = 0; i < nr_pages; ++i) { + cpu_addr = dma_alloc_coherent(dma_dev, PAGE_SIZE, &dma_addr, + CIO_DMA_GFP); + if (!cpu_addr) + return gp_dma; + gen_pool_add_virt(gp_dma, (unsigned long) cpu_addr, + dma_addr, PAGE_SIZE, -1); + } + return gp_dma; +} + +static void __gp_dma_free_dma(struct gen_pool *pool, + struct gen_pool_chunk *chunk, void *data) +{ + size_t chunk_size = chunk->end_addr - chunk->start_addr + 1; + + dma_free_coherent((struct device *) data, chunk_size, + (void *) chunk->start_addr, + (dma_addr_t) 
chunk->phys_addr); +} + +void cio_gp_dma_destroy(struct gen_pool *gp_dma, struct device *dma_dev) +{ + if (!gp_dma) + return; + /* this is quite ugly but no better idea */ + gen_pool_for_each_chunk(gp_dma, __gp_dma_free_dma, dma_dev); + gen_pool_destroy(gp_dma); +} + +static int cio_dma_pool_init(void) +{ + /* No need to free up the resources: compiled in */ + cio_dma_pool = cio_gp_dma_create(cio_get_dma_css_dev(), 1); + if (!cio_dma_pool) + return -ENOMEM; + return 0; +} + +void *cio_gp_dma_zalloc(struct gen_pool *gp_dma, struct device *dma_dev, + size_t size) +{ + dma_addr_t dma_addr; + unsigned long addr; + size_t chunk_size; + + if (!gp_dma) + return NULL; + addr = gen_pool_alloc(gp_dma, size); + while (!addr) { + chunk_size = round_up(size, PAGE_SIZE); + addr = (unsigned long) dma_alloc_coherent(dma_dev, + chunk_size, &dma_addr, CIO_DMA_GFP); + if (!addr) + return NULL; + gen_pool_add_virt(gp_dma, addr, dma_addr, chunk_size, -1); + addr = gen_pool_alloc(gp_dma, size); + } + return (void *) addr; +} + +void cio_gp_dma_free(struct gen_pool *gp_dma, void *cpu_addr, size_t size) +{ + if (!cpu_addr) + return; + memset(cpu_addr, 0, size); + gen_pool_free(gp_dma, (unsigned long) cpu_addr, size); +} + +/* + * Allocate dma memory from the css global pool. Intended for memory not + * specific to any single device within the css. The allocated memory + * is not guaranteed to be 31-bit addressable. + * + * Caution: Not suitable for early stuff like console. + */ +void *cio_dma_zalloc(size_t size) +{ + return cio_gp_dma_zalloc(cio_dma_pool, cio_get_dma_css_dev(), size); +} + +void cio_dma_free(void *cpu_addr, size_t size) +{ + cio_gp_dma_free(cio_dma_pool, cpu_addr, size); +} + /* * Now that the driver core is running, we can setup our channel subsystem. * The struct subchannel's are created during probing. 
@@ -1059,16 +1179,21 @@ static int __init css_bus_init(void) if (ret) goto out_unregister; ret = register_pm_notifier(&css_power_notifier); - if (ret) { - unregister_reboot_notifier(&css_reboot_notifier); - goto out_unregister; - } + if (ret) + goto out_unregister_rn; + ret = cio_dma_pool_init(); + if (ret) + goto out_unregister_pmn; css_init_done = 1; /* Enable default isc for I/O subchannels. */ isc_register(IO_SCH_ISC); return 0; +out_unregister_pmn: + unregister_pm_notifier(&css_power_notifier); +out_unregister_rn: + unregister_reboot_notifier(&css_reboot_notifier); out_unregister: while (i-- > 0) { struct channel_subsystem *css = channel_subsystems[i]; From 37db8985b2116c89a3cbaf87083a02f83afaba5b Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Tue, 26 Mar 2019 12:41:09 +0100 Subject: [PATCH 38/83] s390/cio: add basic protected virtualization support As virtio-ccw devices are channel devices, we need to use the dma area within the common I/O layer for any communication with the hypervisor. Note that we do not need to use that area for control blocks directly referenced by instructions, e.g. the orb. It handles neither QDIO in the common code, nor any device type specific stuff (like channel programs constructed by the DASD driver). An interesting side effect is that virtio structures are now going to get allocated in 31 bit addressable storage. 
Signed-off-by: Halil Pasic Reviewed-by: Sebastian Ott Reviewed-by: Cornelia Huck Reviewed-by: Michael Mueller Tested-by: Michael Mueller Signed-off-by: Heiko Carstens --- arch/s390/include/asm/ccwdev.h | 4 ++ drivers/s390/cio/ccwreq.c | 9 +++-- drivers/s390/cio/device.c | 68 ++++++++++++++++++++++++++------ drivers/s390/cio/device_fsm.c | 49 +++++++++++++---------- drivers/s390/cio/device_id.c | 20 +++++----- drivers/s390/cio/device_ops.c | 21 +++++++++- drivers/s390/cio/device_pgid.c | 22 ++++++----- drivers/s390/cio/device_status.c | 24 +++++------ drivers/s390/cio/io_sch.h | 20 +++++++--- drivers/s390/virtio/virtio_ccw.c | 10 ----- 10 files changed, 164 insertions(+), 83 deletions(-) diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h index a29dd430fb40..865ce1cb86d5 100644 --- a/arch/s390/include/asm/ccwdev.h +++ b/arch/s390/include/asm/ccwdev.h @@ -226,6 +226,10 @@ extern int ccw_device_enable_console(struct ccw_device *); extern void ccw_device_wait_idle(struct ccw_device *); extern int ccw_device_force_console(struct ccw_device *); +extern void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size); +extern void ccw_device_dma_free(struct ccw_device *cdev, + void *cpu_addr, size_t size); + int ccw_device_siosl(struct ccw_device *); extern void ccw_device_get_schid(struct ccw_device *, struct subchannel_id *); diff --git a/drivers/s390/cio/ccwreq.c b/drivers/s390/cio/ccwreq.c index 603268a33ea1..73582a0a2622 100644 --- a/drivers/s390/cio/ccwreq.c +++ b/drivers/s390/cio/ccwreq.c @@ -63,7 +63,7 @@ static void ccwreq_stop(struct ccw_device *cdev, int rc) return; req->done = 1; ccw_device_set_timeout(cdev, 0); - memset(&cdev->private->irb, 0, sizeof(struct irb)); + memset(&cdev->private->dma_area->irb, 0, sizeof(struct irb)); if (rc && rc != -ENODEV && req->drc) rc = req->drc; req->callback(cdev, req->data, rc); @@ -86,7 +86,7 @@ static void ccwreq_do(struct ccw_device *cdev) continue; } /* Perform start function. 
*/ - memset(&cdev->private->irb, 0, sizeof(struct irb)); + memset(&cdev->private->dma_area->irb, 0, sizeof(struct irb)); rc = cio_start(sch, cp, (u8) req->mask); if (rc == 0) { /* I/O started successfully. */ @@ -169,7 +169,7 @@ int ccw_request_cancel(struct ccw_device *cdev) */ static enum io_status ccwreq_status(struct ccw_device *cdev, struct irb *lcirb) { - struct irb *irb = &cdev->private->irb; + struct irb *irb = &cdev->private->dma_area->irb; struct cmd_scsw *scsw = &irb->scsw.cmd; enum uc_todo todo; @@ -187,7 +187,8 @@ static enum io_status ccwreq_status(struct ccw_device *cdev, struct irb *lcirb) CIO_TRACE_EVENT(2, "sensedata"); CIO_HEX_EVENT(2, &cdev->private->dev_id, sizeof(struct ccw_dev_id)); - CIO_HEX_EVENT(2, &cdev->private->irb.ecw, SENSE_MAX_COUNT); + CIO_HEX_EVENT(2, &cdev->private->dma_area->irb.ecw, + SENSE_MAX_COUNT); /* Check for command reject. */ if (irb->ecw[0] & SNS0_CMD_REJECT) return IO_REJECTED; diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 1540229a37bb..9985b7484a6b 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -687,6 +688,9 @@ ccw_device_release(struct device *dev) struct ccw_device *cdev; cdev = to_ccwdev(dev); + cio_gp_dma_free(cdev->private->dma_pool, cdev->private->dma_area, + sizeof(*cdev->private->dma_area)); + cio_gp_dma_destroy(cdev->private->dma_pool, &cdev->dev); /* Release reference of parent subchannel. 
*/ put_device(cdev->dev.parent); kfree(cdev->private); @@ -696,15 +700,33 @@ ccw_device_release(struct device *dev) static struct ccw_device * io_subchannel_allocate_dev(struct subchannel *sch) { struct ccw_device *cdev; + struct gen_pool *dma_pool; cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); - if (cdev) { - cdev->private = kzalloc(sizeof(struct ccw_device_private), - GFP_KERNEL | GFP_DMA); - if (cdev->private) - return cdev; - } + if (!cdev) + goto err_cdev; + cdev->private = kzalloc(sizeof(struct ccw_device_private), + GFP_KERNEL | GFP_DMA); + if (!cdev->private) + goto err_priv; + cdev->dev.coherent_dma_mask = sch->dev.coherent_dma_mask; + cdev->dev.dma_mask = &cdev->dev.coherent_dma_mask; + dma_pool = cio_gp_dma_create(&cdev->dev, 1); + if (!dma_pool) + goto err_dma_pool; + cdev->private->dma_pool = dma_pool; + cdev->private->dma_area = cio_gp_dma_zalloc(dma_pool, &cdev->dev, + sizeof(*cdev->private->dma_area)); + if (!cdev->private->dma_area) + goto err_dma_area; + return cdev; +err_dma_area: + cio_gp_dma_destroy(dma_pool, &cdev->dev); +err_dma_pool: + kfree(cdev->private); +err_priv: kfree(cdev); +err_cdev: return ERR_PTR(-ENOMEM); } @@ -884,7 +906,7 @@ io_subchannel_recog_done(struct ccw_device *cdev) wake_up(&ccw_device_init_wq); break; case DEV_STATE_OFFLINE: - /* + /* * We can't register the device in interrupt context so * we schedule a work item. 
*/ @@ -1062,6 +1084,14 @@ static int io_subchannel_probe(struct subchannel *sch) if (!io_priv) goto out_schedule; + io_priv->dma_area = dma_alloc_coherent(&sch->dev, + sizeof(*io_priv->dma_area), + &io_priv->dma_area_dma, GFP_KERNEL); + if (!io_priv->dma_area) { + kfree(io_priv); + goto out_schedule; + } + set_io_private(sch, io_priv); css_schedule_eval(sch->schid); return 0; @@ -1088,6 +1118,8 @@ static int io_subchannel_remove(struct subchannel *sch) set_io_private(sch, NULL); spin_unlock_irq(sch->lock); out_free: + dma_free_coherent(&sch->dev, sizeof(*io_priv->dma_area), + io_priv->dma_area, io_priv->dma_area_dma); kfree(io_priv); sysfs_remove_group(&sch->dev.kobj, &io_subchannel_attr_group); return 0; @@ -1593,13 +1625,19 @@ struct ccw_device * __init ccw_device_create_console(struct ccw_driver *drv) return ERR_CAST(sch); io_priv = kzalloc(sizeof(*io_priv), GFP_KERNEL | GFP_DMA); - if (!io_priv) { - put_device(&sch->dev); - return ERR_PTR(-ENOMEM); - } + if (!io_priv) + goto err_priv; + io_priv->dma_area = dma_alloc_coherent(&sch->dev, + sizeof(*io_priv->dma_area), + &io_priv->dma_area_dma, GFP_KERNEL); + if (!io_priv->dma_area) + goto err_dma_area; set_io_private(sch, io_priv); cdev = io_subchannel_create_ccwdev(sch); if (IS_ERR(cdev)) { + dma_free_coherent(&sch->dev, sizeof(*io_priv->dma_area), + io_priv->dma_area, io_priv->dma_area_dma); + set_io_private(sch, NULL); put_device(&sch->dev); kfree(io_priv); return cdev; @@ -1607,6 +1645,12 @@ struct ccw_device * __init ccw_device_create_console(struct ccw_driver *drv) cdev->drv = drv; ccw_device_set_int_class(cdev); return cdev; + +err_dma_area: + kfree(io_priv); +err_priv: + put_device(&sch->dev); + return ERR_PTR(-ENOMEM); } void __init ccw_device_destroy_console(struct ccw_device *cdev) @@ -1617,6 +1661,8 @@ void __init ccw_device_destroy_console(struct ccw_device *cdev) set_io_private(sch, NULL); put_device(&sch->dev); put_device(&cdev->dev); + dma_free_coherent(&sch->dev, sizeof(*io_priv->dma_area), + 
io_priv->dma_area, io_priv->dma_area_dma); kfree(io_priv); } diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index 9169af7dbb43..8fc267324ebb 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -67,8 +67,10 @@ static void ccw_timeout_log(struct ccw_device *cdev) sizeof(struct tcw), 0); } else { printk(KERN_WARNING "cio: orb indicates command mode\n"); - if ((void *)(addr_t)orb->cmd.cpa == &private->sense_ccw || - (void *)(addr_t)orb->cmd.cpa == cdev->private->iccws) + if ((void *)(addr_t)orb->cmd.cpa == + &private->dma_area->sense_ccw || + (void *)(addr_t)orb->cmd.cpa == + cdev->private->dma_area->iccws) printk(KERN_WARNING "cio: last channel program " "(intern):\n"); else @@ -143,18 +145,22 @@ ccw_device_cancel_halt_clear(struct ccw_device *cdev) void ccw_device_update_sense_data(struct ccw_device *cdev) { memset(&cdev->id, 0, sizeof(cdev->id)); - cdev->id.cu_type = cdev->private->senseid.cu_type; - cdev->id.cu_model = cdev->private->senseid.cu_model; - cdev->id.dev_type = cdev->private->senseid.dev_type; - cdev->id.dev_model = cdev->private->senseid.dev_model; + cdev->id.cu_type = cdev->private->dma_area->senseid.cu_type; + cdev->id.cu_model = cdev->private->dma_area->senseid.cu_model; + cdev->id.dev_type = cdev->private->dma_area->senseid.dev_type; + cdev->id.dev_model = cdev->private->dma_area->senseid.dev_model; } int ccw_device_test_sense_data(struct ccw_device *cdev) { - return cdev->id.cu_type == cdev->private->senseid.cu_type && - cdev->id.cu_model == cdev->private->senseid.cu_model && - cdev->id.dev_type == cdev->private->senseid.dev_type && - cdev->id.dev_model == cdev->private->senseid.dev_model; + return cdev->id.cu_type == + cdev->private->dma_area->senseid.cu_type && + cdev->id.cu_model == + cdev->private->dma_area->senseid.cu_model && + cdev->id.dev_type == + cdev->private->dma_area->senseid.dev_type && + cdev->id.dev_model == + cdev->private->dma_area->senseid.dev_model; } /* @@ -342,7 +348,7 
@@ ccw_device_done(struct ccw_device *cdev, int state) cio_disable_subchannel(sch); /* Reset device status. */ - memset(&cdev->private->irb, 0, sizeof(struct irb)); + memset(&cdev->private->dma_area->irb, 0, sizeof(struct irb)); cdev->private->state = state; @@ -509,13 +515,14 @@ void ccw_device_verify_done(struct ccw_device *cdev, int err) ccw_device_done(cdev, DEV_STATE_ONLINE); /* Deliver fake irb to device driver, if needed. */ if (cdev->private->flags.fake_irb) { - create_fake_irb(&cdev->private->irb, + create_fake_irb(&cdev->private->dma_area->irb, cdev->private->flags.fake_irb); cdev->private->flags.fake_irb = 0; if (cdev->handler) cdev->handler(cdev, cdev->private->intparm, - &cdev->private->irb); - memset(&cdev->private->irb, 0, sizeof(struct irb)); + &cdev->private->dma_area->irb); + memset(&cdev->private->dma_area->irb, 0, + sizeof(struct irb)); } ccw_device_report_path_events(cdev); ccw_device_handle_broken_paths(cdev); @@ -672,7 +679,8 @@ ccw_device_online_verify(struct ccw_device *cdev, enum dev_event dev_event) if (scsw_actl(&sch->schib.scsw) != 0 || (scsw_stctl(&sch->schib.scsw) & SCSW_STCTL_STATUS_PEND) || - (scsw_stctl(&cdev->private->irb.scsw) & SCSW_STCTL_STATUS_PEND)) { + (scsw_stctl(&cdev->private->dma_area->irb.scsw) & + SCSW_STCTL_STATUS_PEND)) { /* * No final status yet or final status not yet delivered * to the device driver. 
Can't do path verification now, @@ -719,7 +727,7 @@ static int ccw_device_call_handler(struct ccw_device *cdev) * - fast notification was requested (primary status) * - unsolicited interrupts */ - stctl = scsw_stctl(&cdev->private->irb.scsw); + stctl = scsw_stctl(&cdev->private->dma_area->irb.scsw); ending_status = (stctl & SCSW_STCTL_SEC_STATUS) || (stctl == (SCSW_STCTL_ALERT_STATUS | SCSW_STCTL_STATUS_PEND)) || (stctl == SCSW_STCTL_STATUS_PEND); @@ -735,9 +743,9 @@ static int ccw_device_call_handler(struct ccw_device *cdev) if (cdev->handler) cdev->handler(cdev, cdev->private->intparm, - &cdev->private->irb); + &cdev->private->dma_area->irb); - memset(&cdev->private->irb, 0, sizeof(struct irb)); + memset(&cdev->private->dma_area->irb, 0, sizeof(struct irb)); return 1; } @@ -759,7 +767,8 @@ ccw_device_irq(struct ccw_device *cdev, enum dev_event dev_event) /* Unit check but no sense data. Need basic sense. */ if (ccw_device_do_sense(cdev, irb) != 0) goto call_handler_unsol; - memcpy(&cdev->private->irb, irb, sizeof(struct irb)); + memcpy(&cdev->private->dma_area->irb, irb, + sizeof(struct irb)); cdev->private->state = DEV_STATE_W4SENSE; cdev->private->intparm = 0; return; @@ -842,7 +851,7 @@ ccw_device_w4sense(struct ccw_device *cdev, enum dev_event dev_event) if (scsw_fctl(&irb->scsw) & (SCSW_FCTL_CLEAR_FUNC | SCSW_FCTL_HALT_FUNC)) { cdev->private->flags.dosense = 0; - memset(&cdev->private->irb, 0, sizeof(struct irb)); + memset(&cdev->private->dma_area->irb, 0, sizeof(struct irb)); ccw_device_accumulate_irb(cdev, irb); goto call_handler; } diff --git a/drivers/s390/cio/device_id.c b/drivers/s390/cio/device_id.c index f6df83a9dfbb..740996d0dc8c 100644 --- a/drivers/s390/cio/device_id.c +++ b/drivers/s390/cio/device_id.c @@ -99,7 +99,7 @@ static int diag210_to_senseid(struct senseid *senseid, struct diag210 *diag) static int diag210_get_dev_info(struct ccw_device *cdev) { struct ccw_dev_id *dev_id = &cdev->private->dev_id; - struct senseid *senseid = 
&cdev->private->senseid; + struct senseid *senseid = &cdev->private->dma_area->senseid; struct diag210 diag_data; int rc; @@ -134,8 +134,10 @@ static int diag210_get_dev_info(struct ccw_device *cdev) static void snsid_init(struct ccw_device *cdev) { cdev->private->flags.esid = 0; - memset(&cdev->private->senseid, 0, sizeof(cdev->private->senseid)); - cdev->private->senseid.cu_type = 0xffff; + + memset(&cdev->private->dma_area->senseid, 0, + sizeof(cdev->private->dma_area->senseid)); + cdev->private->dma_area->senseid.cu_type = 0xffff; } /* @@ -143,16 +145,16 @@ static void snsid_init(struct ccw_device *cdev) */ static int snsid_check(struct ccw_device *cdev, void *data) { - struct cmd_scsw *scsw = &cdev->private->irb.scsw.cmd; + struct cmd_scsw *scsw = &cdev->private->dma_area->irb.scsw.cmd; int len = sizeof(struct senseid) - scsw->count; /* Check for incomplete SENSE ID data. */ if (len < SENSE_ID_MIN_LEN) goto out_restart; - if (cdev->private->senseid.cu_type == 0xffff) + if (cdev->private->dma_area->senseid.cu_type == 0xffff) goto out_restart; /* Check for incompatible SENSE ID data. */ - if (cdev->private->senseid.reserved != 0xff) + if (cdev->private->dma_area->senseid.reserved != 0xff) return -EOPNOTSUPP; /* Check for extended-identification information. 
*/ if (len > SENSE_ID_BASIC_LEN) @@ -170,7 +172,7 @@ static int snsid_check(struct ccw_device *cdev, void *data) static void snsid_callback(struct ccw_device *cdev, void *data, int rc) { struct ccw_dev_id *id = &cdev->private->dev_id; - struct senseid *senseid = &cdev->private->senseid; + struct senseid *senseid = &cdev->private->dma_area->senseid; int vm = 0; if (rc && MACHINE_IS_VM) { @@ -200,7 +202,7 @@ void ccw_device_sense_id_start(struct ccw_device *cdev) { struct subchannel *sch = to_subchannel(cdev->dev.parent); struct ccw_request *req = &cdev->private->req; - struct ccw1 *cp = cdev->private->iccws; + struct ccw1 *cp = cdev->private->dma_area->iccws; CIO_TRACE_EVENT(4, "snsid"); CIO_HEX_EVENT(4, &cdev->private->dev_id, sizeof(cdev->private->dev_id)); @@ -208,7 +210,7 @@ void ccw_device_sense_id_start(struct ccw_device *cdev) snsid_init(cdev); /* Channel program setup. */ cp->cmd_code = CCW_CMD_SENSE_ID; - cp->cda = (u32) (addr_t) &cdev->private->senseid; + cp->cda = (u32) (addr_t) &cdev->private->dma_area->senseid; cp->count = sizeof(struct senseid); cp->flags = CCW_FLAG_SLI; /* Request setup. */ diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c index 4435ae0b3027..d722458c5928 100644 --- a/drivers/s390/cio/device_ops.c +++ b/drivers/s390/cio/device_ops.c @@ -429,8 +429,8 @@ struct ciw *ccw_device_get_ciw(struct ccw_device *cdev, __u32 ct) if (cdev->private->flags.esid == 0) return NULL; for (ciw_cnt = 0; ciw_cnt < MAX_CIWS; ciw_cnt++) - if (cdev->private->senseid.ciw[ciw_cnt].ct == ct) - return cdev->private->senseid.ciw + ciw_cnt; + if (cdev->private->dma_area->senseid.ciw[ciw_cnt].ct == ct) + return cdev->private->dma_area->senseid.ciw + ciw_cnt; return NULL; } @@ -699,6 +699,23 @@ void ccw_device_get_schid(struct ccw_device *cdev, struct subchannel_id *schid) } EXPORT_SYMBOL_GPL(ccw_device_get_schid); +/* + * Allocate zeroed dma coherent 31 bit addressable memory using + * the subchannels dma pool. 
Maximal size of allocation supported + * is PAGE_SIZE. + */ +void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size) +{ + return cio_gp_dma_zalloc(cdev->private->dma_pool, &cdev->dev, size); +} +EXPORT_SYMBOL(ccw_device_dma_zalloc); + +void ccw_device_dma_free(struct ccw_device *cdev, void *cpu_addr, size_t size) +{ + cio_gp_dma_free(cdev->private->dma_pool, cpu_addr, size); +} +EXPORT_SYMBOL(ccw_device_dma_free); + EXPORT_SYMBOL(ccw_device_set_options_mask); EXPORT_SYMBOL(ccw_device_set_options); EXPORT_SYMBOL(ccw_device_clear_options); diff --git a/drivers/s390/cio/device_pgid.c b/drivers/s390/cio/device_pgid.c index d30a3babf176..767a85635a0f 100644 --- a/drivers/s390/cio/device_pgid.c +++ b/drivers/s390/cio/device_pgid.c @@ -57,7 +57,7 @@ static void verify_done(struct ccw_device *cdev, int rc) static void nop_build_cp(struct ccw_device *cdev) { struct ccw_request *req = &cdev->private->req; - struct ccw1 *cp = cdev->private->iccws; + struct ccw1 *cp = cdev->private->dma_area->iccws; cp->cmd_code = CCW_CMD_NOOP; cp->cda = 0; @@ -134,9 +134,9 @@ static void nop_callback(struct ccw_device *cdev, void *data, int rc) static void spid_build_cp(struct ccw_device *cdev, u8 fn) { struct ccw_request *req = &cdev->private->req; - struct ccw1 *cp = cdev->private->iccws; + struct ccw1 *cp = cdev->private->dma_area->iccws; int i = pathmask_to_pos(req->lpm); - struct pgid *pgid = &cdev->private->pgid[i]; + struct pgid *pgid = &cdev->private->dma_area->pgid[i]; pgid->inf.fc = fn; cp->cmd_code = CCW_CMD_SET_PGID; @@ -300,7 +300,7 @@ static int pgid_cmp(struct pgid *p1, struct pgid *p2) static void pgid_analyze(struct ccw_device *cdev, struct pgid **p, int *mismatch, u8 *reserved, u8 *reset) { - struct pgid *pgid = &cdev->private->pgid[0]; + struct pgid *pgid = &cdev->private->dma_area->pgid[0]; struct pgid *first = NULL; int lpm; int i; @@ -342,7 +342,7 @@ static u8 pgid_to_donepm(struct ccw_device *cdev) lpm = 0x80 >> i; if ((cdev->private->pgid_valid_mask & lpm) == 
0) continue; - pgid = &cdev->private->pgid[i]; + pgid = &cdev->private->dma_area->pgid[i]; if (sch->opm & lpm) { if (pgid->inf.ps.state1 != SNID_STATE1_GROUPED) continue; @@ -368,7 +368,8 @@ static void pgid_fill(struct ccw_device *cdev, struct pgid *pgid) int i; for (i = 0; i < 8; i++) - memcpy(&cdev->private->pgid[i], pgid, sizeof(struct pgid)); + memcpy(&cdev->private->dma_area->pgid[i], pgid, + sizeof(struct pgid)); } /* @@ -435,12 +436,12 @@ static void snid_done(struct ccw_device *cdev, int rc) static void snid_build_cp(struct ccw_device *cdev) { struct ccw_request *req = &cdev->private->req; - struct ccw1 *cp = cdev->private->iccws; + struct ccw1 *cp = cdev->private->dma_area->iccws; int i = pathmask_to_pos(req->lpm); /* Channel program setup. */ cp->cmd_code = CCW_CMD_SENSE_PGID; - cp->cda = (u32) (addr_t) &cdev->private->pgid[i]; + cp->cda = (u32) (addr_t) &cdev->private->dma_area->pgid[i]; cp->count = sizeof(struct pgid); cp->flags = CCW_FLAG_SLI; req->cp = cp; @@ -516,7 +517,8 @@ static void verify_start(struct ccw_device *cdev) sch->lpm = sch->schib.pmcw.pam; /* Initialize PGID data. 
*/ - memset(cdev->private->pgid, 0, sizeof(cdev->private->pgid)); + memset(cdev->private->dma_area->pgid, 0, + sizeof(cdev->private->dma_area->pgid)); cdev->private->pgid_valid_mask = 0; cdev->private->pgid_todo_mask = sch->schib.pmcw.pam; cdev->private->path_notoper_mask = 0; @@ -626,7 +628,7 @@ struct stlck_data { static void stlck_build_cp(struct ccw_device *cdev, void *buf1, void *buf2) { struct ccw_request *req = &cdev->private->req; - struct ccw1 *cp = cdev->private->iccws; + struct ccw1 *cp = cdev->private->dma_area->iccws; cp[0].cmd_code = CCW_CMD_STLCK; cp[0].cda = (u32) (addr_t) buf1; diff --git a/drivers/s390/cio/device_status.c b/drivers/s390/cio/device_status.c index 7d5c7892b2c4..0bd8f2642732 100644 --- a/drivers/s390/cio/device_status.c +++ b/drivers/s390/cio/device_status.c @@ -79,15 +79,15 @@ ccw_device_accumulate_ecw(struct ccw_device *cdev, struct irb *irb) * are condition that have to be met for the extended control * bit to have meaning. Sick. */ - cdev->private->irb.scsw.cmd.ectl = 0; + cdev->private->dma_area->irb.scsw.cmd.ectl = 0; if ((irb->scsw.cmd.stctl & SCSW_STCTL_ALERT_STATUS) && !(irb->scsw.cmd.stctl & SCSW_STCTL_INTER_STATUS)) - cdev->private->irb.scsw.cmd.ectl = irb->scsw.cmd.ectl; + cdev->private->dma_area->irb.scsw.cmd.ectl = irb->scsw.cmd.ectl; /* Check if extended control word is valid. */ - if (!cdev->private->irb.scsw.cmd.ectl) + if (!cdev->private->dma_area->irb.scsw.cmd.ectl) return; /* Copy concurrent sense / model dependent information. */ - memcpy (&cdev->private->irb.ecw, irb->ecw, sizeof (irb->ecw)); + memcpy(&cdev->private->dma_area->irb.ecw, irb->ecw, sizeof(irb->ecw)); } /* @@ -118,7 +118,7 @@ ccw_device_accumulate_esw(struct ccw_device *cdev, struct irb *irb) if (!ccw_device_accumulate_esw_valid(irb)) return; - cdev_irb = &cdev->private->irb; + cdev_irb = &cdev->private->dma_area->irb; /* Copy last path used mask. 
*/ cdev_irb->esw.esw1.lpum = irb->esw.esw1.lpum; @@ -210,7 +210,7 @@ ccw_device_accumulate_irb(struct ccw_device *cdev, struct irb *irb) ccw_device_path_notoper(cdev); /* No irb accumulation for transport mode irbs. */ if (scsw_is_tm(&irb->scsw)) { - memcpy(&cdev->private->irb, irb, sizeof(struct irb)); + memcpy(&cdev->private->dma_area->irb, irb, sizeof(struct irb)); return; } /* @@ -219,7 +219,7 @@ ccw_device_accumulate_irb(struct ccw_device *cdev, struct irb *irb) if (!scsw_is_solicited(&irb->scsw)) return; - cdev_irb = &cdev->private->irb; + cdev_irb = &cdev->private->dma_area->irb; /* * If the clear function had been performed, all formerly pending @@ -227,7 +227,7 @@ ccw_device_accumulate_irb(struct ccw_device *cdev, struct irb *irb) * intermediate accumulated status to the device driver. */ if (irb->scsw.cmd.fctl & SCSW_FCTL_CLEAR_FUNC) - memset(&cdev->private->irb, 0, sizeof(struct irb)); + memset(&cdev->private->dma_area->irb, 0, sizeof(struct irb)); /* Copy bits which are valid only for the start function. */ if (irb->scsw.cmd.fctl & SCSW_FCTL_START_FUNC) { @@ -329,9 +329,9 @@ ccw_device_do_sense(struct ccw_device *cdev, struct irb *irb) /* * We have ending status but no sense information. Do a basic sense. */ - sense_ccw = &to_io_private(sch)->sense_ccw; + sense_ccw = &to_io_private(sch)->dma_area->sense_ccw; sense_ccw->cmd_code = CCW_CMD_BASIC_SENSE; - sense_ccw->cda = (__u32) __pa(cdev->private->irb.ecw); + sense_ccw->cda = (__u32) __pa(cdev->private->dma_area->irb.ecw); sense_ccw->count = SENSE_MAX_COUNT; sense_ccw->flags = CCW_FLAG_SLI; @@ -364,7 +364,7 @@ ccw_device_accumulate_basic_sense(struct ccw_device *cdev, struct irb *irb) if (!(irb->scsw.cmd.dstat & DEV_STAT_UNIT_CHECK) && (irb->scsw.cmd.dstat & DEV_STAT_CHN_END)) { - cdev->private->irb.esw.esw0.erw.cons = 1; + cdev->private->dma_area->irb.esw.esw0.erw.cons = 1; cdev->private->flags.dosense = 0; } /* Check if path verification is required. 
*/ @@ -386,7 +386,7 @@ ccw_device_accumulate_and_sense(struct ccw_device *cdev, struct irb *irb) /* Check for basic sense. */ if (cdev->private->flags.dosense && !(irb->scsw.cmd.dstat & DEV_STAT_UNIT_CHECK)) { - cdev->private->irb.esw.esw0.erw.cons = 1; + cdev->private->dma_area->irb.esw.esw0.erw.cons = 1; cdev->private->flags.dosense = 0; return 0; } diff --git a/drivers/s390/cio/io_sch.h b/drivers/s390/cio/io_sch.h index 90e4e3a7841b..c03b4a19974e 100644 --- a/drivers/s390/cio/io_sch.h +++ b/drivers/s390/cio/io_sch.h @@ -9,15 +9,20 @@ #include "css.h" #include "orb.h" +struct io_subchannel_dma_area { + struct ccw1 sense_ccw; /* static ccw for sense command */ +}; + struct io_subchannel_private { union orb orb; /* operation request block */ - struct ccw1 sense_ccw; /* static ccw for sense command */ struct ccw_device *cdev;/* pointer to the child ccw device */ struct { unsigned int suspend:1; /* allow suspend */ unsigned int prefetch:1;/* deny prefetch */ unsigned int inter:1; /* suppress intermediate interrupts */ } __packed options; + struct io_subchannel_dma_area *dma_area; + dma_addr_t dma_area_dma; } __aligned(8); #define to_io_private(n) ((struct io_subchannel_private *) \ @@ -115,6 +120,13 @@ enum cdev_todo { #define FAKE_CMD_IRB 1 #define FAKE_TM_IRB 2 +struct ccw_device_dma_area { + struct senseid senseid; /* SenseID info */ + struct ccw1 iccws[2]; /* ccws for SNID/SID/SPGID commands */ + struct irb irb; /* device status */ + struct pgid pgid[8]; /* path group IDs per chpid*/ +}; + struct ccw_device_private { struct ccw_device *cdev; struct subchannel *sch; @@ -156,11 +168,7 @@ struct ccw_device_private { } __attribute__((packed)) flags; unsigned long intparm; /* user interruption parameter */ struct qdio_irq *qdio_data; - struct irb irb; /* device status */ int async_kill_io_rc; - struct senseid senseid; /* SenseID info */ - struct pgid pgid[8]; /* path group IDs per chpid*/ - struct ccw1 iccws[2]; /* ccws for SNID/SID/SPGID commands */ struct 
work_struct todo_work; enum cdev_todo todo; wait_queue_head_t wait_q; @@ -169,6 +177,8 @@ struct ccw_device_private { struct list_head cmb_list; /* list of measured devices */ u64 cmb_start_time; /* clock value of cmb reset */ void *cmb_wait; /* deferred cmb enable/disable */ + struct gen_pool *dma_pool; + struct ccw_device_dma_area *dma_area; enum interruption_class int_class; }; diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 6a3076881321..f995798bb025 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -66,7 +66,6 @@ struct virtio_ccw_device { bool device_lost; unsigned int config_ready; void *airq_info; - u64 dma_mask; }; struct vq_info_block_legacy { @@ -1255,16 +1254,7 @@ static int virtio_ccw_online(struct ccw_device *cdev) ret = -ENOMEM; goto out_free; } - vcdev->vdev.dev.parent = &cdev->dev; - cdev->dev.dma_mask = &vcdev->dma_mask; - /* we are fine with common virtio infrastructure using 64 bit DMA */ - ret = dma_set_mask_and_coherent(&cdev->dev, DMA_BIT_MASK(64)); - if (ret) { - dev_warn(&cdev->dev, "Failed to enable 64-bit DMA.\n"); - goto out_free; - } - vcdev->config_block = kzalloc(sizeof(*vcdev->config_block), GFP_DMA | GFP_KERNEL); if (!vcdev->config_block) { From b50623e5db802e41736f3305cb54c03bc7f0e30a Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Thu, 13 Sep 2018 18:57:16 +0200 Subject: [PATCH 39/83] s390/airq: use DMA memory for adapter interrupts Protected virtualization guests have to use shared pages for airq notifier bit vectors, because the hypervisor needs to write these bits. Let us make sure we allocate DMA memory for the notifier bit vectors by replacing the kmem_cache with a dma_pool and kzalloc() with cio_dma_zalloc(). 
Signed-off-by: Halil Pasic Reviewed-by: Sebastian Ott Reviewed-by: Michael Mueller Tested-by: Michael Mueller Signed-off-by: Heiko Carstens --- arch/s390/include/asm/airq.h | 2 ++ drivers/s390/cio/airq.c | 37 ++++++++++++++++++++++-------------- drivers/s390/cio/cio.h | 2 ++ drivers/s390/cio/css.c | 1 + 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index c10d2ee2dfda..01936fdfaddb 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -11,6 +11,7 @@ #define _ASM_S390_AIRQ_H #include +#include struct airq_struct { struct hlist_node list; /* Handler queueing. */ @@ -29,6 +30,7 @@ void unregister_adapter_interrupt(struct airq_struct *airq); /* Adapter interrupt bit vector */ struct airq_iv { unsigned long *vector; /* Adapter interrupt bit vector */ + dma_addr_t vector_dma; /* Adapter interrupt bit vector dma */ unsigned long *avail; /* Allocation bit mask for the bit vector */ unsigned long *bitlock; /* Lock bit mask for the bit vector */ unsigned long *ptr; /* Pointer associated with each bit */ diff --git a/drivers/s390/cio/airq.c b/drivers/s390/cio/airq.c index 4534afc63591..427b2e24a8ce 100644 --- a/drivers/s390/cio/airq.c +++ b/drivers/s390/cio/airq.c @@ -16,9 +16,11 @@ #include #include #include +#include #include #include +#include #include "cio.h" #include "cio_debug.h" @@ -27,7 +29,7 @@ static DEFINE_SPINLOCK(airq_lists_lock); static struct hlist_head airq_lists[MAX_ISC+1]; -static struct kmem_cache *airq_iv_cache; +static struct dma_pool *airq_iv_cache; /** * register_adapter_interrupt() - register adapter interrupt handler @@ -115,6 +117,11 @@ void __init init_airq_interrupts(void) setup_irq(THIN_INTERRUPT, &airq_interrupt); } +static inline unsigned long iv_size(unsigned long bits) +{ + return BITS_TO_LONGS(bits) * sizeof(unsigned long); +} + /** * airq_iv_create - create an interrupt vector * @bits: number of bits in the interrupt vector @@ -132,17 
+139,19 @@ struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags) goto out; iv->bits = bits; iv->flags = flags; - size = BITS_TO_LONGS(bits) * sizeof(unsigned long); + size = iv_size(bits); if (flags & AIRQ_IV_CACHELINE) { - if ((cache_line_size() * BITS_PER_BYTE) < bits) + if ((cache_line_size() * BITS_PER_BYTE) < bits + || !airq_iv_cache) goto out_free; - iv->vector = kmem_cache_zalloc(airq_iv_cache, GFP_KERNEL); + iv->vector = dma_pool_zalloc(airq_iv_cache, GFP_KERNEL, + &iv->vector_dma); if (!iv->vector) goto out_free; } else { - iv->vector = kzalloc(size, GFP_KERNEL); + iv->vector = cio_dma_zalloc(size); if (!iv->vector) goto out_free; } @@ -178,10 +187,10 @@ struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags) kfree(iv->ptr); kfree(iv->bitlock); kfree(iv->avail); - if (iv->flags & AIRQ_IV_CACHELINE) - kmem_cache_free(airq_iv_cache, iv->vector); + if (iv->flags & AIRQ_IV_CACHELINE && iv->vector) + dma_pool_free(airq_iv_cache, iv->vector, iv->vector_dma); else - kfree(iv->vector); + cio_dma_free(iv->vector, size); kfree(iv); out: return NULL; @@ -198,9 +207,9 @@ void airq_iv_release(struct airq_iv *iv) kfree(iv->ptr); kfree(iv->bitlock); if (iv->flags & AIRQ_IV_CACHELINE) - kmem_cache_free(airq_iv_cache, iv->vector); + dma_pool_free(airq_iv_cache, iv->vector, iv->vector_dma); else - kfree(iv->vector); + cio_dma_free(iv->vector, iv_size(iv->bits)); kfree(iv->avail); kfree(iv); } @@ -295,12 +304,12 @@ unsigned long airq_iv_scan(struct airq_iv *iv, unsigned long start, } EXPORT_SYMBOL(airq_iv_scan); -static int __init airq_init(void) +int __init airq_init(void) { - airq_iv_cache = kmem_cache_create("airq_iv_cache", cache_line_size(), - cache_line_size(), 0, NULL); + airq_iv_cache = dma_pool_create("airq_iv_cache", cio_get_dma_css_dev(), + cache_line_size(), + cache_line_size(), PAGE_SIZE); if (!airq_iv_cache) return -ENOMEM; return 0; } -subsys_initcall(airq_init); diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h 
index 06a91743335a..4d6c7d16416e 100644 --- a/drivers/s390/cio/cio.h +++ b/drivers/s390/cio/cio.h @@ -135,6 +135,8 @@ extern int cio_commit_config(struct subchannel *sch); int cio_tm_start_key(struct subchannel *sch, struct tcw *tcw, u8 lpm, u8 key); int cio_tm_intrg(struct subchannel *sch); +extern int __init airq_init(void); + /* Use with care. */ #ifdef CONFIG_CCW_CONSOLE extern struct subchannel *cio_probe_console(void); diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 7b1a440a1f8e..7159933d9d3e 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -1184,6 +1184,7 @@ static int __init css_bus_init(void) ret = cio_dma_pool_init(); if (ret) goto out_unregister_pmn; + airq_init(); css_init_done = 1; /* Enable default isc for I/O subchannels. */ From 01b3fb1ea00d5b2af77f41da69dd9dc859c12748 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Thu, 23 May 2019 16:50:07 +0200 Subject: [PATCH 40/83] virtio/s390: use cacheline aligned airq bit vectors The flag AIRQ_IV_CACHELINE was recently added to airq_iv_create(). Let us use it! We actually wanted the vector to span a cacheline all along. 
Signed-off-by: Halil Pasic Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Reviewed-by: Michael Mueller Tested-by: Michael Mueller Signed-off-by: Heiko Carstens --- drivers/s390/virtio/virtio_ccw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index f995798bb025..1da7430f94c8 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -216,7 +216,8 @@ static struct airq_info *new_airq_info(void) if (!info) return NULL; rwlock_init(&info->lock); - info->aiv = airq_iv_create(VIRTIO_IV_BITS, AIRQ_IV_ALLOC | AIRQ_IV_PTR); + info->aiv = airq_iv_create(VIRTIO_IV_BITS, AIRQ_IV_ALLOC | AIRQ_IV_PTR + | AIRQ_IV_CACHELINE); if (!info->aiv) { kfree(info); return NULL; From 22a4a639b9cebff4568f32202e96d6f286251b72 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Mon, 3 Dec 2018 17:18:07 +0100 Subject: [PATCH 41/83] virtio/s390: add indirection to indicators access This will come in handy soon when we pull out the indicators from virtio_ccw_device to a memory area that is shared with the hypervisor (in particular for protected virtualization guests). 
Signed-off-by: Halil Pasic Reviewed-by: Pierre Morel Reviewed-by: Cornelia Huck Reviewed-by: Michael Mueller Tested-by: Michael Mueller Signed-off-by: Heiko Carstens --- drivers/s390/virtio/virtio_ccw.c | 40 ++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 1da7430f94c8..e96a8cc56ec2 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -68,6 +68,16 @@ struct virtio_ccw_device { void *airq_info; }; +static inline unsigned long *indicators(struct virtio_ccw_device *vcdev) +{ + return &vcdev->indicators; +} + +static inline unsigned long *indicators2(struct virtio_ccw_device *vcdev) +{ + return &vcdev->indicators2; +} + struct vq_info_block_legacy { __u64 queue; __u32 align; @@ -338,17 +348,17 @@ static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev, ccw->cda = (__u32)(unsigned long) thinint_area; } else { /* payload is the address of the indicators */ - indicatorp = kmalloc(sizeof(&vcdev->indicators), + indicatorp = kmalloc(sizeof(indicators(vcdev)), GFP_DMA | GFP_KERNEL); if (!indicatorp) return; *indicatorp = 0; ccw->cmd_code = CCW_CMD_SET_IND; - ccw->count = sizeof(&vcdev->indicators); + ccw->count = sizeof(indicators(vcdev)); ccw->cda = (__u32)(unsigned long) indicatorp; } /* Deregister indicators from host. */ - vcdev->indicators = 0; + *indicators(vcdev) = 0; ccw->flags = 0; ret = ccw_io_helper(vcdev, ccw, vcdev->is_thinint ? @@ -657,10 +667,10 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, * We need a data area under 2G to communicate. Our payload is * the address of the indicators. 
*/ - indicatorp = kmalloc(sizeof(&vcdev->indicators), GFP_DMA | GFP_KERNEL); + indicatorp = kmalloc(sizeof(indicators(vcdev)), GFP_DMA | GFP_KERNEL); if (!indicatorp) goto out; - *indicatorp = (unsigned long) &vcdev->indicators; + *indicatorp = (unsigned long) indicators(vcdev); if (vcdev->is_thinint) { ret = virtio_ccw_register_adapter_ind(vcdev, vqs, nvqs, ccw); if (ret) @@ -669,21 +679,21 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, } if (!vcdev->is_thinint) { /* Register queue indicators with host. */ - vcdev->indicators = 0; + *indicators(vcdev) = 0; ccw->cmd_code = CCW_CMD_SET_IND; ccw->flags = 0; - ccw->count = sizeof(&vcdev->indicators); + ccw->count = sizeof(indicators(vcdev)); ccw->cda = (__u32)(unsigned long) indicatorp; ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND); if (ret) goto out; } /* Register indicators2 with host for config changes */ - *indicatorp = (unsigned long) &vcdev->indicators2; - vcdev->indicators2 = 0; + *indicatorp = (unsigned long) indicators2(vcdev); + *indicators2(vcdev) = 0; ccw->cmd_code = CCW_CMD_SET_CONF_IND; ccw->flags = 0; - ccw->count = sizeof(&vcdev->indicators2); + ccw->count = sizeof(indicators2(vcdev)); ccw->cda = (__u32)(unsigned long) indicatorp; ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_CONF_IND); if (ret) @@ -1093,17 +1103,17 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev, vcdev->err = -EIO; } virtio_ccw_check_activity(vcdev, activity); - for_each_set_bit(i, &vcdev->indicators, - sizeof(vcdev->indicators) * BITS_PER_BYTE) { + for_each_set_bit(i, indicators(vcdev), + sizeof(*indicators(vcdev)) * BITS_PER_BYTE) { /* The bit clear must happen before the vring kick. 
*/ - clear_bit(i, &vcdev->indicators); + clear_bit(i, indicators(vcdev)); barrier(); vq = virtio_ccw_vq_by_ind(vcdev, i); vring_interrupt(0, vq); } - if (test_bit(0, &vcdev->indicators2)) { + if (test_bit(0, indicators2(vcdev))) { virtio_config_changed(&vcdev->vdev); - clear_bit(0, &vcdev->indicators2); + clear_bit(0, indicators2(vcdev)); } } From 48720ba56891570e3b750b271d80efb631478630 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Mon, 1 Oct 2018 19:01:58 +0200 Subject: [PATCH 42/83] virtio/s390: use DMA memory for ccw I/O and classic notifiers Previously, virtio-ccw could get away with not using the DMA API for the pieces of memory it does ccw I/O with. With protected virtualization this has to change, since the hypervisor needs to read and sometimes also write these pieces of memory. The hypervisor is supposed to poke the classic notifiers, if these are used, out of band with regard to ccw I/O. So these need to be allocated as DMA memory (which is shared memory for protected virtualization guests). Let us factor out everything from struct virtio_ccw_device that needs to be DMA memory in a satellite that is allocated as such. Note: The control blocks of I/O instructions do not need to be shared. These are marshalled by the ultravisor. 
Signed-off-by: Halil Pasic Reviewed-by: Pierre Morel Reviewed-by: Cornelia Huck Reviewed-by: Michael Mueller Tested-by: Michael Mueller Signed-off-by: Heiko Carstens --- drivers/s390/virtio/virtio_ccw.c | 169 ++++++++++++++++--------------- 1 file changed, 89 insertions(+), 80 deletions(-) diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index e96a8cc56ec2..800252955a2f 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -46,9 +46,15 @@ struct vq_config_block { #define VIRTIO_CCW_CONFIG_SIZE 0x100 /* same as PCI config space size, should be enough for all drivers */ +struct vcdev_dma_area { + unsigned long indicators; + unsigned long indicators2; + struct vq_config_block config_block; + __u8 status; +}; + struct virtio_ccw_device { struct virtio_device vdev; - __u8 *status; __u8 config[VIRTIO_CCW_CONFIG_SIZE]; struct ccw_device *cdev; __u32 curr_io; @@ -58,24 +64,22 @@ struct virtio_ccw_device { spinlock_t lock; struct mutex io_lock; /* Serializes I/O requests */ struct list_head virtqueues; - unsigned long indicators; - unsigned long indicators2; - struct vq_config_block *config_block; bool is_thinint; bool going_away; bool device_lost; unsigned int config_ready; void *airq_info; + struct vcdev_dma_area *dma_area; }; static inline unsigned long *indicators(struct virtio_ccw_device *vcdev) { - return &vcdev->indicators; + return &vcdev->dma_area->indicators; } static inline unsigned long *indicators2(struct virtio_ccw_device *vcdev) { - return &vcdev->indicators2; + return &vcdev->dma_area->indicators2; } struct vq_info_block_legacy { @@ -336,8 +340,8 @@ static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev, struct airq_info *airq_info = vcdev->airq_info; if (vcdev->is_thinint) { - thinint_area = kzalloc(sizeof(*thinint_area), - GFP_DMA | GFP_KERNEL); + thinint_area = ccw_device_dma_zalloc(vcdev->cdev, + sizeof(*thinint_area)); if (!thinint_area) return; 
thinint_area->summary_indicator = @@ -348,8 +352,8 @@ static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev, ccw->cda = (__u32)(unsigned long) thinint_area; } else { /* payload is the address of the indicators */ - indicatorp = kmalloc(sizeof(indicators(vcdev)), - GFP_DMA | GFP_KERNEL); + indicatorp = ccw_device_dma_zalloc(vcdev->cdev, + sizeof(indicators(vcdev))); if (!indicatorp) return; *indicatorp = 0; @@ -369,8 +373,8 @@ static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev, "Failed to deregister indicators (%d)\n", ret); else if (vcdev->is_thinint) virtio_ccw_drop_indicators(vcdev); - kfree(indicatorp); - kfree(thinint_area); + ccw_device_dma_free(vcdev->cdev, indicatorp, sizeof(indicators(vcdev))); + ccw_device_dma_free(vcdev->cdev, thinint_area, sizeof(*thinint_area)); } static inline long __do_kvm_notify(struct subchannel_id schid, @@ -417,15 +421,15 @@ static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev, { int ret; - vcdev->config_block->index = index; + vcdev->dma_area->config_block.index = index; ccw->cmd_code = CCW_CMD_READ_VQ_CONF; ccw->flags = 0; ccw->count = sizeof(struct vq_config_block); - ccw->cda = (__u32)(unsigned long)(vcdev->config_block); + ccw->cda = (__u32)(unsigned long)(&vcdev->dma_area->config_block); ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_READ_VQ_CONF); if (ret) return ret; - return vcdev->config_block->num ?: -ENOENT; + return vcdev->dma_area->config_block.num ?: -ENOENT; } static void virtio_ccw_del_vq(struct virtqueue *vq, struct ccw1 *ccw) @@ -470,7 +474,8 @@ static void virtio_ccw_del_vq(struct virtqueue *vq, struct ccw1 *ccw) ret, index); vring_del_virtqueue(vq); - kfree(info->info_block); + ccw_device_dma_free(vcdev->cdev, info->info_block, + sizeof(*info->info_block)); kfree(info); } @@ -480,7 +485,7 @@ static void virtio_ccw_del_vqs(struct virtio_device *vdev) struct ccw1 *ccw; struct virtio_ccw_device *vcdev = to_vc_device(vdev); - ccw = kzalloc(sizeof(*ccw), GFP_DMA | 
GFP_KERNEL); + ccw = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw)); if (!ccw) return; @@ -489,7 +494,7 @@ static void virtio_ccw_del_vqs(struct virtio_device *vdev) list_for_each_entry_safe(vq, n, &vdev->vqs, list) virtio_ccw_del_vq(vq, ccw); - kfree(ccw); + ccw_device_dma_free(vcdev->cdev, ccw, sizeof(*ccw)); } static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, @@ -512,8 +517,8 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, err = -ENOMEM; goto out_err; } - info->info_block = kzalloc(sizeof(*info->info_block), - GFP_DMA | GFP_KERNEL); + info->info_block = ccw_device_dma_zalloc(vcdev->cdev, + sizeof(*info->info_block)); if (!info->info_block) { dev_warn(&vcdev->cdev->dev, "no info block\n"); err = -ENOMEM; @@ -577,7 +582,8 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, if (vq) vring_del_virtqueue(vq); if (info) { - kfree(info->info_block); + ccw_device_dma_free(vcdev->cdev, info->info_block, + sizeof(*info->info_block)); } kfree(info); return ERR_PTR(err); @@ -591,7 +597,8 @@ static int virtio_ccw_register_adapter_ind(struct virtio_ccw_device *vcdev, struct virtio_thinint_area *thinint_area = NULL; struct airq_info *info; - thinint_area = kzalloc(sizeof(*thinint_area), GFP_DMA | GFP_KERNEL); + thinint_area = ccw_device_dma_zalloc(vcdev->cdev, + sizeof(*thinint_area)); if (!thinint_area) { ret = -ENOMEM; goto out; @@ -627,7 +634,7 @@ static int virtio_ccw_register_adapter_ind(struct virtio_ccw_device *vcdev, virtio_ccw_drop_indicators(vcdev); } out: - kfree(thinint_area); + ccw_device_dma_free(vcdev->cdev, thinint_area, sizeof(*thinint_area)); return ret; } @@ -643,7 +650,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, int ret, i, queue_idx = 0; struct ccw1 *ccw; - ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL); + ccw = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw)); if (!ccw) return -ENOMEM; @@ -667,7 +674,8 @@ static int virtio_ccw_find_vqs(struct 
virtio_device *vdev, unsigned nvqs, * We need a data area under 2G to communicate. Our payload is * the address of the indicators. */ - indicatorp = kmalloc(sizeof(indicators(vcdev)), GFP_DMA | GFP_KERNEL); + indicatorp = ccw_device_dma_zalloc(vcdev->cdev, + sizeof(indicators(vcdev))); if (!indicatorp) goto out; *indicatorp = (unsigned long) indicators(vcdev); @@ -699,12 +707,16 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, if (ret) goto out; - kfree(indicatorp); - kfree(ccw); + if (indicatorp) + ccw_device_dma_free(vcdev->cdev, indicatorp, + sizeof(indicators(vcdev))); + ccw_device_dma_free(vcdev->cdev, ccw, sizeof(*ccw)); return 0; out: - kfree(indicatorp); - kfree(ccw); + if (indicatorp) + ccw_device_dma_free(vcdev->cdev, indicatorp, + sizeof(indicators(vcdev))); + ccw_device_dma_free(vcdev->cdev, ccw, sizeof(*ccw)); virtio_ccw_del_vqs(vdev); return ret; } @@ -714,12 +726,12 @@ static void virtio_ccw_reset(struct virtio_device *vdev) struct virtio_ccw_device *vcdev = to_vc_device(vdev); struct ccw1 *ccw; - ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL); + ccw = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw)); if (!ccw) return; /* Zero status bits. */ - *vcdev->status = 0; + vcdev->dma_area->status = 0; /* Send a reset ccw on device. 
*/ ccw->cmd_code = CCW_CMD_VDEV_RESET; @@ -727,7 +739,7 @@ static void virtio_ccw_reset(struct virtio_device *vdev) ccw->count = 0; ccw->cda = 0; ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_RESET); - kfree(ccw); + ccw_device_dma_free(vcdev->cdev, ccw, sizeof(*ccw)); } static u64 virtio_ccw_get_features(struct virtio_device *vdev) @@ -738,11 +750,11 @@ static u64 virtio_ccw_get_features(struct virtio_device *vdev) u64 rc; struct ccw1 *ccw; - ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL); + ccw = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw)); if (!ccw) return 0; - features = kzalloc(sizeof(*features), GFP_DMA | GFP_KERNEL); + features = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*features)); if (!features) { rc = 0; goto out_free; @@ -775,8 +787,8 @@ static u64 virtio_ccw_get_features(struct virtio_device *vdev) rc |= (u64)le32_to_cpu(features->features) << 32; out_free: - kfree(features); - kfree(ccw); + ccw_device_dma_free(vcdev->cdev, features, sizeof(*features)); + ccw_device_dma_free(vcdev->cdev, ccw, sizeof(*ccw)); return rc; } @@ -801,11 +813,11 @@ static int virtio_ccw_finalize_features(struct virtio_device *vdev) return -EINVAL; } - ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL); + ccw = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw)); if (!ccw) return -ENOMEM; - features = kzalloc(sizeof(*features), GFP_DMA | GFP_KERNEL); + features = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*features)); if (!features) { ret = -ENOMEM; goto out_free; @@ -840,8 +852,8 @@ static int virtio_ccw_finalize_features(struct virtio_device *vdev) ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_WRITE_FEAT); out_free: - kfree(features); - kfree(ccw); + ccw_device_dma_free(vcdev->cdev, features, sizeof(*features)); + ccw_device_dma_free(vcdev->cdev, ccw, sizeof(*ccw)); return ret; } @@ -855,11 +867,12 @@ static void virtio_ccw_get_config(struct virtio_device *vdev, void *config_area; unsigned long flags; - ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL); + ccw = 
ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw)); if (!ccw) return; - config_area = kzalloc(VIRTIO_CCW_CONFIG_SIZE, GFP_DMA | GFP_KERNEL); + config_area = ccw_device_dma_zalloc(vcdev->cdev, + VIRTIO_CCW_CONFIG_SIZE); if (!config_area) goto out_free; @@ -881,8 +894,8 @@ static void virtio_ccw_get_config(struct virtio_device *vdev, memcpy(buf, config_area + offset, len); out_free: - kfree(config_area); - kfree(ccw); + ccw_device_dma_free(vcdev->cdev, config_area, VIRTIO_CCW_CONFIG_SIZE); + ccw_device_dma_free(vcdev->cdev, ccw, sizeof(*ccw)); } static void virtio_ccw_set_config(struct virtio_device *vdev, @@ -894,11 +907,12 @@ static void virtio_ccw_set_config(struct virtio_device *vdev, void *config_area; unsigned long flags; - ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL); + ccw = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw)); if (!ccw) return; - config_area = kzalloc(VIRTIO_CCW_CONFIG_SIZE, GFP_DMA | GFP_KERNEL); + config_area = ccw_device_dma_zalloc(vcdev->cdev, + VIRTIO_CCW_CONFIG_SIZE); if (!config_area) goto out_free; @@ -917,61 +931,61 @@ static void virtio_ccw_set_config(struct virtio_device *vdev, ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_WRITE_CONFIG); out_free: - kfree(config_area); - kfree(ccw); + ccw_device_dma_free(vcdev->cdev, config_area, VIRTIO_CCW_CONFIG_SIZE); + ccw_device_dma_free(vcdev->cdev, ccw, sizeof(*ccw)); } static u8 virtio_ccw_get_status(struct virtio_device *vdev) { struct virtio_ccw_device *vcdev = to_vc_device(vdev); - u8 old_status = *vcdev->status; + u8 old_status = vcdev->dma_area->status; struct ccw1 *ccw; if (vcdev->revision < 1) - return *vcdev->status; + return vcdev->dma_area->status; - ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL); + ccw = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw)); if (!ccw) return old_status; ccw->cmd_code = CCW_CMD_READ_STATUS; ccw->flags = 0; - ccw->count = sizeof(*vcdev->status); - ccw->cda = (__u32)(unsigned long)vcdev->status; + ccw->count = sizeof(vcdev->dma_area->status); + ccw->cda 
= (__u32)(unsigned long)&vcdev->dma_area->status; ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_READ_STATUS); /* * If the channel program failed (should only happen if the device * was hotunplugged, and then we clean up via the machine check - * handler anyway), vcdev->status was not overwritten and we just + * handler anyway), vcdev->dma_area->status was not overwritten and we just * return the old status, which is fine. */ - kfree(ccw); + ccw_device_dma_free(vcdev->cdev, ccw, sizeof(*ccw)); - return *vcdev->status; + return vcdev->dma_area->status; } static void virtio_ccw_set_status(struct virtio_device *vdev, u8 status) { struct virtio_ccw_device *vcdev = to_vc_device(vdev); - u8 old_status = *vcdev->status; + u8 old_status = vcdev->dma_area->status; struct ccw1 *ccw; int ret; - ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL); + ccw = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw)); if (!ccw) return; /* Write the status to the host. */ - *vcdev->status = status; + vcdev->dma_area->status = status; ccw->cmd_code = CCW_CMD_WRITE_STATUS; ccw->flags = 0; ccw->count = sizeof(status); - ccw->cda = (__u32)(unsigned long)vcdev->status; + ccw->cda = (__u32)(unsigned long)&vcdev->dma_area->status; ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_WRITE_STATUS); /* Write failed? We assume status is unchanged. 
*/ if (ret) - *vcdev->status = old_status; - kfree(ccw); + vcdev->dma_area->status = old_status; + ccw_device_dma_free(vcdev->cdev, ccw, sizeof(*ccw)); } static const char *virtio_ccw_bus_name(struct virtio_device *vdev) @@ -1004,8 +1018,8 @@ static void virtio_ccw_release_dev(struct device *_d) struct virtio_device *dev = dev_to_virtio(_d); struct virtio_ccw_device *vcdev = to_vc_device(dev); - kfree(vcdev->status); - kfree(vcdev->config_block); + ccw_device_dma_free(vcdev->cdev, vcdev->dma_area, + sizeof(*vcdev->dma_area)); kfree(vcdev); } @@ -1213,12 +1227,12 @@ static int virtio_ccw_set_transport_rev(struct virtio_ccw_device *vcdev) struct ccw1 *ccw; int ret; - ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL); + ccw = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw)); if (!ccw) return -ENOMEM; - rev = kzalloc(sizeof(*rev), GFP_DMA | GFP_KERNEL); + rev = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*rev)); if (!rev) { - kfree(ccw); + ccw_device_dma_free(vcdev->cdev, ccw, sizeof(*ccw)); return -ENOMEM; } @@ -1248,8 +1262,8 @@ static int virtio_ccw_set_transport_rev(struct virtio_ccw_device *vcdev) } } while (ret == -EOPNOTSUPP); - kfree(ccw); - kfree(rev); + ccw_device_dma_free(vcdev->cdev, ccw, sizeof(*ccw)); + ccw_device_dma_free(vcdev->cdev, rev, sizeof(*rev)); return ret; } @@ -1266,14 +1280,10 @@ static int virtio_ccw_online(struct ccw_device *cdev) goto out_free; } vcdev->vdev.dev.parent = &cdev->dev; - vcdev->config_block = kzalloc(sizeof(*vcdev->config_block), - GFP_DMA | GFP_KERNEL); - if (!vcdev->config_block) { - ret = -ENOMEM; - goto out_free; - } - vcdev->status = kzalloc(sizeof(*vcdev->status), GFP_DMA | GFP_KERNEL); - if (!vcdev->status) { + vcdev->cdev = cdev; + vcdev->dma_area = ccw_device_dma_zalloc(vcdev->cdev, + sizeof(*vcdev->dma_area)); + if (!vcdev->dma_area) { ret = -ENOMEM; goto out_free; } @@ -1282,7 +1292,6 @@ static int virtio_ccw_online(struct ccw_device *cdev) vcdev->vdev.dev.release = virtio_ccw_release_dev; vcdev->vdev.config = 
&virtio_ccw_config_ops; - vcdev->cdev = cdev; init_waitqueue_head(&vcdev->wait_q); INIT_LIST_HEAD(&vcdev->virtqueues); spin_lock_init(&vcdev->lock); @@ -1313,8 +1322,8 @@ static int virtio_ccw_online(struct ccw_device *cdev) return ret; out_free: if (vcdev) { - kfree(vcdev->status); - kfree(vcdev->config_block); + ccw_device_dma_free(vcdev->cdev, vcdev->dma_area, + sizeof(*vcdev->dma_area)); } kfree(vcdev); return ret; From 39c7dcb158924f84e04f4c2433d164eee845a732 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Tue, 26 Mar 2019 19:03:47 +0100 Subject: [PATCH 43/83] virtio/s390: make airq summary indicators DMA The hypervisor needs to interact with the summary indicators, so these need to be DMA memory as well (at least for protected virtualization guests). Signed-off-by: Halil Pasic Reviewed-by: Cornelia Huck Reviewed-by: Michael Mueller Tested-by: Michael Mueller Signed-off-by: Heiko Carstens --- drivers/s390/virtio/virtio_ccw.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 800252955a2f..1a55e5942d36 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -140,11 +140,17 @@ static int virtio_ccw_use_airq = 1; struct airq_info { rwlock_t lock; - u8 summary_indicator; + u8 summary_indicator_idx; struct airq_struct airq; struct airq_iv *aiv; }; static struct airq_info *airq_areas[MAX_AIRQ_AREAS]; +static u8 *summary_indicators; + +static inline u8 *get_summary_indicator(struct airq_info *info) +{ + return summary_indicators + info->summary_indicator_idx; +} #define CCW_CMD_SET_VQ 0x13 #define CCW_CMD_VDEV_RESET 0x33 @@ -209,7 +215,7 @@ static void virtio_airq_handler(struct airq_struct *airq, bool floating) break; vring_interrupt(0, (void *)airq_iv_get_ptr(info->aiv, ai)); } - info->summary_indicator = 0; + *(get_summary_indicator(info)) = 0; smp_wmb(); /* Walk through indicators field, summary indicator not 
active. */ for (ai = 0;;) { @@ -221,7 +227,7 @@ static void virtio_airq_handler(struct airq_struct *airq, bool floating) read_unlock(&info->lock); } -static struct airq_info *new_airq_info(void) +static struct airq_info *new_airq_info(int index) { struct airq_info *info; int rc; @@ -237,7 +243,8 @@ static struct airq_info *new_airq_info(void) return NULL; } info->airq.handler = virtio_airq_handler; - info->airq.lsi_ptr = &info->summary_indicator; + info->summary_indicator_idx = index; + info->airq.lsi_ptr = get_summary_indicator(info); info->airq.lsi_mask = 0xff; info->airq.isc = VIRTIO_AIRQ_ISC; rc = register_adapter_interrupt(&info->airq); @@ -259,7 +266,7 @@ static unsigned long get_airq_indicator(struct virtqueue *vqs[], int nvqs, for (i = 0; i < MAX_AIRQ_AREAS && !indicator_addr; i++) { if (!airq_areas[i]) - airq_areas[i] = new_airq_info(); + airq_areas[i] = new_airq_info(i); info = airq_areas[i]; if (!info) return 0; @@ -345,7 +352,7 @@ static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev, if (!thinint_area) return; thinint_area->summary_indicator = - (unsigned long) &airq_info->summary_indicator; + (unsigned long) get_summary_indicator(airq_info); thinint_area->isc = VIRTIO_AIRQ_ISC; ccw->cmd_code = CCW_CMD_SET_IND_ADAPTER; ccw->count = sizeof(*thinint_area); @@ -613,7 +620,7 @@ static int virtio_ccw_register_adapter_ind(struct virtio_ccw_device *vcdev, } info = vcdev->airq_info; thinint_area->summary_indicator = - (unsigned long) &info->summary_indicator; + (unsigned long) get_summary_indicator(info); thinint_area->isc = VIRTIO_AIRQ_ISC; ccw->cmd_code = CCW_CMD_SET_IND_ADAPTER; ccw->flags = CCW_FLAG_SLI; @@ -1493,8 +1500,17 @@ static void __init no_auto_parse(void) static int __init virtio_ccw_init(void) { + int rc; + /* parse no_auto string before we do anything further */ no_auto_parse(); - return ccw_driver_register(&virtio_ccw_driver); + + summary_indicators = cio_dma_zalloc(MAX_AIRQ_AREAS); + if (!summary_indicators) + return -ENOMEM; 
+ rc = ccw_driver_register(&virtio_ccw_driver); + if (rc) + cio_dma_free(summary_indicators, MAX_AIRQ_AREAS); + return rc; } device_initcall(virtio_ccw_init); From b4e3133b65987f349a1cba96169c4485909c91ad Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 14 Jun 2019 13:02:16 +0200 Subject: [PATCH 44/83] s390/traps: simplify data exception handler Simplify conditions and remove unnecessary variable in data exception handler. Signed-off-by: Vasily Gorbik Reviewed-by: Heiko Carstens Reviewed-by: Hendrik Brueckner Signed-off-by: Heiko Carstens --- arch/s390/kernel/traps.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 82e81a9f7112..4736b6ec0ad2 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -229,17 +229,11 @@ void vector_exception(struct pt_regs *regs) void data_exception(struct pt_regs *regs) { - int signal = 0; - save_fpu_regs(); if (current->thread.fpu.fpc & FPC_DXC_MASK) - signal = SIGFPE; - else - signal = SIGILL; - if (signal == SIGFPE) do_fp_trap(regs, current->thread.fpu.fpc); - else if (signal) - do_trap(regs, signal, ILL_ILLOPN, "data exception"); + else + do_trap(regs, SIGILL, ILL_ILLOPN, "data exception"); } void space_switch_exception(struct pt_regs *regs) From 7928260539f3a13b5b23a3fa0a7c0e4f5255940b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 8 Jun 2019 11:39:05 +0200 Subject: [PATCH 45/83] processor: remove spin_cpu_yield spin_cpu_yield is unused, therefore remove it. 
Acked-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Signed-off-by: Heiko Carstens --- arch/powerpc/include/asm/processor.h | 2 -- include/linux/processor.h | 9 --------- 2 files changed, 11 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index ef573fe9873e..a9993e7a443b 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -346,8 +346,6 @@ static inline unsigned long __pack_fe01(unsigned int fpmode) #define spin_cpu_relax() barrier() -#define spin_cpu_yield() spin_cpu_relax() - #define spin_end() HMT_medium() #define spin_until_cond(cond) \ diff --git a/include/linux/processor.h b/include/linux/processor.h index dbc952eec869..dc78bdc7079a 100644 --- a/include/linux/processor.h +++ b/include/linux/processor.h @@ -32,15 +32,6 @@ #define spin_cpu_relax() cpu_relax() #endif -/* - * spin_cpu_yield may be called to yield (undirected) to the hypervisor if - * necessary. This should be used if the wait is expected to take longer - * than context switch overhead, but we can't sleep or do a directed yield. - */ -#ifndef spin_cpu_yield -#define spin_cpu_yield() cpu_relax_yield() -#endif - #ifndef spin_end #define spin_end() #endif From 38f2c691a4b3e89d476f8e8350d1ca299974b89d Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 17 May 2019 12:50:42 +0200 Subject: [PATCH 46/83] s390: improve wait logic of stop_machine The stop_machine loop, which advances the state machine and waits for all affected CPUs to check in, calls cpu_relax_yield in a tight loop until the last missing CPUs have acknowledged the state transition. On a virtual system where not all logical CPUs are backed by real CPUs all the time it can take a while for all CPUs to check in. With the current definition of cpu_relax_yield a diagnose 0x44 is done which tells the hypervisor to schedule *some* other CPU.
That can be any CPU and not necessarily one of the CPUs that need to run in order to advance the state machine. This can lead to a pretty bad diagnose 0x44 storm until the last missing CPU finally checked-in. Replace the undirected cpu_relax_yield based on diagnose 0x44 with a directed yield. Each CPU in the wait loop will pick up the next CPU in the cpumask of stop_machine. The diagnose 0x9c is used to tell the hypervisor to run this next CPU instead of the current one. If there is only a limited number of real CPUs backing the virtual CPUs we end up with the real CPUs passed around in a round-robin fashion. [heiko.carstens@de.ibm.com]: Use cpumask_next_wrap as suggested by Peter Zijlstra. Signed-off-by: Martin Schwidefsky Acked-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Signed-off-by: Heiko Carstens --- arch/s390/include/asm/processor.h | 3 ++- arch/s390/kernel/processor.c | 17 ++++++++++++----- arch/s390/kernel/smp.c | 2 +- include/linux/sched.h | 2 +- kernel/stop_machine.c | 14 +++++++++----- 5 files changed, 25 insertions(+), 13 deletions(-) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index b0fcbc37b637..445ce9ee4404 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -36,6 +36,7 @@ #ifndef __ASSEMBLY__ +#include #include #include #include @@ -225,7 +226,7 @@ static __no_kasan_or_inline unsigned short stap(void) * Give up the time slice of the virtual PU. 
*/ #define cpu_relax_yield cpu_relax_yield -void cpu_relax_yield(void); +void cpu_relax_yield(const struct cpumask *cpumask); #define cpu_relax() barrier() diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 5de13307b703..4cdaefec1b7c 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -31,6 +31,7 @@ struct cpu_info { }; static DEFINE_PER_CPU(struct cpu_info, cpu_info); +static DEFINE_PER_CPU(int, cpu_relax_retry); static bool machine_has_cpu_mhz; @@ -58,13 +59,19 @@ void s390_update_cpu_mhz(void) on_each_cpu(update_cpu_mhz, NULL, 0); } -void notrace cpu_relax_yield(void) +void notrace cpu_relax_yield(const struct cpumask *cpumask) { - if (!smp_cpu_mtid && MACHINE_HAS_DIAG44) { - diag_stat_inc(DIAG_STAT_X044); - asm volatile("diag 0,0,0x44"); + int cpu, this_cpu; + + this_cpu = smp_processor_id(); + if (__this_cpu_inc_return(cpu_relax_retry) >= spin_retry) { + __this_cpu_write(cpu_relax_retry, 0); + cpu = cpumask_next_wrap(this_cpu, cpumask, this_cpu, false); + if (cpu >= nr_cpu_ids) + return; + if (arch_vcpu_is_preempted(cpu)) + smp_yield_cpu(cpu); } - barrier(); } EXPORT_SYMBOL(cpu_relax_yield); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index f00955940694..44974654cbd0 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -414,7 +414,7 @@ void smp_yield_cpu(int cpu) diag_stat_inc_norecursion(DIAG_STAT_X09C); asm volatile("diag %0,0,0x9c" : : "d" (pcpu_devices[cpu].address)); - } else if (MACHINE_HAS_DIAG44) { + } else if (MACHINE_HAS_DIAG44 && !smp_cpu_mtid) { diag_stat_inc_norecursion(DIAG_STAT_X044); asm volatile("diag 0,0,0x44"); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 11837410690f..1f9f3160da7e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1519,7 +1519,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma #endif #ifndef cpu_relax_yield -#define cpu_relax_yield() cpu_relax() +#define 
cpu_relax_yield(cpumask) cpu_relax() #endif extern int yield_to(struct task_struct *p, bool preempt); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2b5a6754646f..b8b0c5ff8da9 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -183,6 +183,7 @@ static int multi_cpu_stop(void *data) struct multi_stop_data *msdata = data; enum multi_stop_state curstate = MULTI_STOP_NONE; int cpu = smp_processor_id(), err = 0; + const struct cpumask *cpumask; unsigned long flags; bool is_active; @@ -192,15 +193,18 @@ static int multi_cpu_stop(void *data) */ local_save_flags(flags); - if (!msdata->active_cpus) - is_active = cpu == cpumask_first(cpu_online_mask); - else - is_active = cpumask_test_cpu(cpu, msdata->active_cpus); + if (!msdata->active_cpus) { + cpumask = cpu_online_mask; + is_active = cpu == cpumask_first(cpumask); + } else { + cpumask = msdata->active_cpus; + is_active = cpumask_test_cpu(cpu, cpumask); + } /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. */ - cpu_relax_yield(); + cpu_relax_yield(cpumask); if (msdata->state != curstate) { curstate = msdata->state; switch (curstate) { From 4ecf0a43e729a7e641d800c294faabe87378fc05 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 8 Jun 2019 12:13:57 +0200 Subject: [PATCH 47/83] processor: get rid of cpu_relax_yield stop_machine is the only user left of cpu_relax_yield. Given that it now has special semantics which are tied to stop_machine introduce a weak stop_machine_yield function which architectures can override, and get rid of the generic cpu_relax_yield implementation. 
Acked-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Signed-off-by: Heiko Carstens --- arch/s390/include/asm/processor.h | 6 ------ arch/s390/kernel/processor.c | 4 ++-- include/linux/sched.h | 4 ---- include/linux/stop_machine.h | 1 + kernel/stop_machine.c | 7 ++++++- 5 files changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 445ce9ee4404..14883b1562e0 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -222,12 +222,6 @@ static __no_kasan_or_inline unsigned short stap(void) return cpu_address; } -/* - * Give up the time slice of the virtual PU. - */ -#define cpu_relax_yield cpu_relax_yield -void cpu_relax_yield(const struct cpumask *cpumask); - #define cpu_relax() barrier() #define ECAG_CACHE_ATTRIBUTE 0 diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 4cdaefec1b7c..6ebc2117c66c 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -7,6 +7,7 @@ #define KMSG_COMPONENT "cpu" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include #include #include #include @@ -59,7 +60,7 @@ void s390_update_cpu_mhz(void) on_each_cpu(update_cpu_mhz, NULL, 0); } -void notrace cpu_relax_yield(const struct cpumask *cpumask) +void notrace stop_machine_yield(const struct cpumask *cpumask) { int cpu, this_cpu; @@ -73,7 +74,6 @@ void notrace cpu_relax_yield(const struct cpumask *cpumask) smp_yield_cpu(cpu); } } -EXPORT_SYMBOL(cpu_relax_yield); /* * cpu_init - initializes state that is per-CPU. 
diff --git a/include/linux/sched.h b/include/linux/sched.h index 1f9f3160da7e..911675416b05 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1518,10 +1518,6 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma } #endif -#ifndef cpu_relax_yield -#define cpu_relax_yield(cpumask) cpu_relax() -#endif - extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 6d3635c86dbe..f9a0c6189852 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -36,6 +36,7 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); void stop_machine_park(int cpu); void stop_machine_unpark(int cpu); +void stop_machine_yield(const struct cpumask *cpumask); #else /* CONFIG_SMP */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b8b0c5ff8da9..b4f83f7bdf86 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -177,6 +177,11 @@ static void ack_state(struct multi_stop_data *msdata) set_state(msdata, msdata->state + 1); } +void __weak stop_machine_yield(const struct cpumask *cpumask) +{ + cpu_relax(); +} + /* This is the cpu_stop function which stops the CPU. */ static int multi_cpu_stop(void *data) { @@ -204,7 +209,7 @@ static int multi_cpu_stop(void *data) /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. 
*/ - cpu_relax_yield(cpumask); + stop_machine_yield(cpumask); if (msdata->state != curstate) { curstate = msdata->state; switch (curstate) { From 39c00378e337c869b0c9cd35e108fc0c8671d644 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 2 Jan 2019 08:11:40 +0100 Subject: [PATCH 48/83] Update default configuration Signed-off-by: Martin Schwidefsky Signed-off-by: Heiko Carstens --- arch/s390/configs/debug_defconfig | 2 ++ arch/s390/configs/defconfig | 7 ++++--- arch/s390/configs/performance_defconfig | 2 ++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index b0920b35f87b..a6dc01a22048 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -88,6 +88,7 @@ CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y CONFIG_CHSC_SCH=y CONFIG_VFIO_AP=m +CONFIG_VFIO_CCW=m CONFIG_CRASH_DUMP=y CONFIG_BINFMT_MISC=m CONFIG_HIBERNATION=y @@ -498,6 +499,7 @@ CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y CONFIG_S390_AP_IOMMU=y +CONFIG_S390_CCW_IOMMU=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 18cff2e4607d..697a40a8af3e 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -39,7 +39,7 @@ CONFIG_NR_CPUS=256 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_VERIFY_SIG=y +# CONFIG_RELOCATABLE is not set CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y @@ -53,7 +53,6 @@ CONFIG_MODULE_UNLOAD=y CONFIG_BLK_DEV_INTEGRITY=y CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y -CONFIG_DEFAULT_DEADLINE=y CONFIG_BINFMT_MISC=m CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y @@ -94,13 +93,13 @@ CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y CONFIG_BPF_JIT=y +CONFIG_UEVENT_HELPER=y CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y CONFIG_VIRTIO_BLK=y 
CONFIG_SCSI=y -# CONFIG_SCSI_MQ_DEFAULT is not set CONFIG_BLK_DEV_SD=y CONFIG_CHR_DEV_ST=y CONFIG_BLK_DEV_SR=y @@ -161,6 +160,7 @@ CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y # CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_AUTHENC=m CONFIG_CRYPTO_TEST=m @@ -173,6 +173,7 @@ CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_XTS=m +CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 09aa5cb14873..e4bc40073003 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -86,6 +86,7 @@ CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y CONFIG_CHSC_SCH=y CONFIG_VFIO_AP=m +CONFIG_VFIO_CCW=m CONFIG_CRASH_DUMP=y CONFIG_BINFMT_MISC=m CONFIG_HIBERNATION=y @@ -495,6 +496,7 @@ CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y CONFIG_S390_AP_IOMMU=y +CONFIG_S390_CCW_IOMMU=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y From 812271b91006e59407ea30ee72341d92b240f667 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 6 Jun 2019 22:28:23 +0200 Subject: [PATCH 49/83] s390/cio: Squash cp_free() and cp_unpin_free() The routine cp_free() does nothing but call cp_unpin_free(), and while most places call cp_free() there is one caller of cp_unpin_free() used when the cp is guaranteed to have not been marked initialized. This seems like a dubious way to make a distinction, so let's combine these routines and make cp_free() do all the work. 
Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20190606202831.44135-2-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 36 +++++++++++++++------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index f73cfcfdd032..47cd7f94f42f 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -412,23 +412,6 @@ static void ccwchain_cda_free(struct ccwchain *chain, int idx) kfree((void *)(u64)ccw->cda); } -/* Unpin the pages then free the memory resources. */ -static void cp_unpin_free(struct channel_program *cp) -{ - struct ccwchain *chain, *temp; - int i; - - cp->initialized = false; - list_for_each_entry_safe(chain, temp, &cp->ccwchain_list, next) { - for (i = 0; i < chain->ch_len; i++) { - pfn_array_table_unpin_free(chain->ch_pat + i, - cp->mdev); - ccwchain_cda_free(chain, i); - } - ccwchain_free(chain); - } -} - /** * ccwchain_calc_length - calculate the length of the ccw chain. * @iova: guest physical address of the target ccw chain @@ -796,7 +779,7 @@ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) /* Now loop for its TICs. */ ret = ccwchain_loop_tic(chain, cp); if (ret) - cp_unpin_free(cp); + cp_free(cp); /* It is safe to force: if not set but idals used * ccwchain_calc_length returns an error. 
*/ @@ -819,8 +802,21 @@ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) */ void cp_free(struct channel_program *cp) { - if (cp->initialized) - cp_unpin_free(cp); + struct ccwchain *chain, *temp; + int i; + + if (!cp->initialized) + return; + + cp->initialized = false; + list_for_each_entry_safe(chain, temp, &cp->ccwchain_list, next) { + for (i = 0; i < chain->ch_len; i++) { + pfn_array_table_unpin_free(chain->ch_pat + i, + cp->mdev); + ccwchain_cda_free(chain, i); + } + ccwchain_free(chain); + } } /** From e64bd68946bec40d9dfe6ea5d22733119908e6f3 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 6 Jun 2019 22:28:24 +0200 Subject: [PATCH 50/83] s390/cio: Refactor the routine that handles TIC CCWs Extract the "does the target of this TIC already exist?" check from ccwchain_handle_tic(), so that it's easier to refactor that function into one that cp_init() is able to use. Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20190606202831.44135-3-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 47cd7f94f42f..628daf1a8f9a 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -502,10 +502,6 @@ static int ccwchain_handle_tic(struct ccw1 *tic, struct channel_program *cp) struct ccwchain *chain; int len, ret; - /* May transfer to an existing chain. */ - if (tic_target_chain_exists(tic, cp)) - return 0; - /* Get chain length. */ len = ccwchain_calc_length(tic->cda, cp); if (len < 0) @@ -540,6 +536,10 @@ static int ccwchain_loop_tic(struct ccwchain *chain, struct channel_program *cp) if (!ccw_is_tic(tic)) continue; + /* May transfer to an existing chain. 
*/ + if (tic_target_chain_exists(tic, cp)) + continue; + ret = ccwchain_handle_tic(tic, cp); if (ret) return ret; From 363fe5f7aee0c2001af4cc4d7dd028cfa2eee64a Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 6 Jun 2019 22:28:25 +0200 Subject: [PATCH 51/83] s390/cio: Generalize the TIC handler Refactor ccwchain_handle_tic() into a routine that handles a channel program address (which itself is a CCW pointer), rather than a CCW pointer that is only a TIC CCW. This will make it easier to reuse this code for other CCW commands. Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20190606202831.44135-4-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 628daf1a8f9a..52735cdb0270 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -497,13 +497,13 @@ static int tic_target_chain_exists(struct ccw1 *tic, struct channel_program *cp) static int ccwchain_loop_tic(struct ccwchain *chain, struct channel_program *cp); -static int ccwchain_handle_tic(struct ccw1 *tic, struct channel_program *cp) +static int ccwchain_handle_ccw(u32 cda, struct channel_program *cp) { struct ccwchain *chain; int len, ret; /* Get chain length. */ - len = ccwchain_calc_length(tic->cda, cp); + len = ccwchain_calc_length(cda, cp); if (len < 0) return len; @@ -511,10 +511,10 @@ static int ccwchain_handle_tic(struct ccw1 *tic, struct channel_program *cp) chain = ccwchain_alloc(cp, len); if (!chain) return -ENOMEM; - chain->ch_iova = tic->cda; + chain->ch_iova = cda; /* Copy the new chain from user. 
*/ - ret = copy_ccw_from_iova(cp, chain->ch_ccw, tic->cda, len); + ret = copy_ccw_from_iova(cp, chain->ch_ccw, cda, len); if (ret) { ccwchain_free(chain); return ret; @@ -540,7 +540,8 @@ static int ccwchain_loop_tic(struct ccwchain *chain, struct channel_program *cp) if (tic_target_chain_exists(tic, cp)) continue; - ret = ccwchain_handle_tic(tic, cp); + /* Build a ccwchain for the next segment */ + ret = ccwchain_handle_ccw(tic->cda, cp); if (ret) return ret; } From 99afcb05d973f7f74c0c4b8a8c5f6f87c8427aa3 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 6 Jun 2019 22:28:26 +0200 Subject: [PATCH 52/83] s390/cio: Use generalized CCW handler in cp_init() It is now pretty apparent that ccwchain_handle_ccw() (nee ccwchain_handle_tic()) does everything that cp_init() wants to do. Let's remove that duplicated code from cp_init() and let ccwchain_handle_ccw() handle it itself. Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20190606202831.44135-5-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 52735cdb0270..5b98bea433b7 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -744,9 +744,7 @@ static int ccwchain_fetch_one(struct ccwchain *chain, */ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) { - u64 iova = orb->cmd.cpa; - struct ccwchain *chain; - int len, ret; + int ret; /* * XXX: @@ -759,28 +757,11 @@ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) memcpy(&cp->orb, orb, sizeof(*orb)); cp->mdev = mdev; - /* Get chain length. */ - len = ccwchain_calc_length(iova, cp); - if (len < 0) - return len; - - /* Alloc mem for the head chain. 
*/ - chain = ccwchain_alloc(cp, len); - if (!chain) - return -ENOMEM; - chain->ch_iova = iova; - - /* Copy the head chain from guest. */ - ret = copy_ccw_from_iova(cp, chain->ch_ccw, iova, len); - if (ret) { - ccwchain_free(chain); - return ret; - } - - /* Now loop for its TICs. */ - ret = ccwchain_loop_tic(chain, cp); + /* Build a ccwchain for the first CCW segment */ + ret = ccwchain_handle_ccw(orb->cmd.cpa, cp); if (ret) cp_free(cp); + /* It is safe to force: if not set but idals used * ccwchain_calc_length returns an error. */ From cc06ee983cffc28fbd9c31908137f7b3c097f5a7 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 6 Jun 2019 22:28:27 +0200 Subject: [PATCH 53/83] vfio-ccw: Rearrange pfn_array and pfn_array_table arrays While processing a channel program, we currently have two nested arrays that carry a slightly different structure. The direct CCW path creates this: ccwchain->pfn_array_table[1]->pfn_array[#pages] while an IDA CCW creates: ccwchain->pfn_array_table[#idaws]->pfn_array[1] The distinction appears to state that each pfn_array_table entry points to an array of contiguous pages, represented by a pfn_array, um, array. Since the direct-addressed scenario can ONLY represent contiguous pages, it makes the intermediate array necessary but difficult to recognize. Meanwhile, since an IDAL can contain non-contiguous pages and there is no logic in vfio-ccw to detect adjacent IDAWs, it is the second array that is necessary but appearing to be superfluous. I am not aware of any documentation that states the pfn_array[] needs to be of contiguous pages; it is just what the code does today. I don't see any reason for this either, let's just flip the IDA codepath around so that it generates: ch_pat->pfn_array_table[1]->pfn_array[#idaws] This will bring it in line with the direct-addressed codepath, so that we can understand the behavior of this memory regardless of what type of CCW is being processed. 
And it means the casual observer does not need to know/care whether the pfn_array[] represents contiguous pages or not. NB: The existing vfio-ccw code only supports 4K-block Format-2 IDAs, so that "#pages" == "#idaws" in this area. This means that we will have difficulty with this overlap in terminology if support for Format-1 or 2K-block Format-2 IDAs is ever added. I don't think that this patch changes our ability to make that distinction. Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20190606202831.44135-6-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 5b98bea433b7..86a0e76ef2b5 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -635,7 +635,6 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, { struct ccw1 *ccw; struct pfn_array_table *pat; - struct pfn_array *pa; unsigned long *idaws; u64 idaw_iova; unsigned int idaw_nr, idaw_len; @@ -656,10 +655,14 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, /* Pin data page(s) in memory. */ pat = chain->ch_pat + idx; - ret = pfn_array_table_init(pat, idaw_nr); + ret = pfn_array_table_init(pat, 1); if (ret) goto out_init; + ret = pfn_array_alloc(pat->pat_pa, idaw_iova, bytes); + if (ret) + goto out_unpin; + /* Translate idal ccw to use new allocated idaws. 
*/ idaws = kzalloc(idaw_len, GFP_DMA | GFP_KERNEL); if (!idaws) { @@ -673,22 +676,15 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, ccw->cda = virt_to_phys(idaws); - for (i = 0; i < idaw_nr; i++) { - idaw_iova = *(idaws + i); - pa = pat->pat_pa + i; + for (i = 0; i < idaw_nr; i++) + pat->pat_pa->pa_iova_pfn[i] = idaws[i] >> PAGE_SHIFT; - ret = pfn_array_alloc(pa, idaw_iova, 1); - if (ret < 0) - goto out_free_idaws; - - if (!ccw_does_data_transfer(ccw)) { - pa->pa_nr = 0; - continue; - } - - ret = pfn_array_pin(pa, cp->mdev); + if (ccw_does_data_transfer(ccw)) { + ret = pfn_array_pin(pat->pat_pa, cp->mdev); if (ret < 0) goto out_free_idaws; + } else { + pat->pat_pa->pa_nr = 0; } pfn_array_table_idal_create_words(pat, idaws); From 8aabf0edae4a6cc82042785079075aebf7cd5c79 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 6 Jun 2019 22:28:28 +0200 Subject: [PATCH 54/83] vfio-ccw: Adjust the first IDAW outside of the nested loops Now that pfn_array_table[] is always an array of 1, it seems silly to check for the very first entry in an array in the middle of two nested loops, since we know it'll only ever happen once. Let's move this outside the loops to simplify things, even though the "k" variable is still necessary. 
Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20190606202831.44135-7-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 86a0e76ef2b5..ab9f8f0d1b44 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -201,11 +201,12 @@ static inline void pfn_array_table_idal_create_words( pa = pat->pat_pa + i; for (j = 0; j < pa->pa_nr; j++) { idaws[k] = pa->pa_pfn[j] << PAGE_SHIFT; - if (k == 0) - idaws[k] += pa->pa_iova & (PAGE_SIZE - 1); k++; } } + + /* Adjust the first IDAW, since it may not start on a page boundary */ + idaws[0] += pat->pat_pa->pa_iova & (PAGE_SIZE - 1); } From e7eaf91b0aad276b164277dd6d20cdf3ee1c77e6 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 6 Jun 2019 22:28:29 +0200 Subject: [PATCH 55/83] vfio-ccw: Remove pfn_array_table Now that both CCW codepaths build this nested array: ccwchain->pfn_array_table[1]->pfn_array[#idaws/#pages] We can collapse this into simply: ccwchain->pfn_array[#idaws/#pages] Let's do that, so that we don't have to continually navigate two nested arrays when the first array always has a count of one. Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20190606202831.44135-8-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 118 +++++++++------------------------ 1 file changed, 33 insertions(+), 85 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index ab9f8f0d1b44..76ffcc823944 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -33,11 +33,6 @@ struct pfn_array { int pa_nr; }; -struct pfn_array_table { - struct pfn_array *pat_pa; - int pat_nr; -}; - struct ccwchain { struct list_head next; struct ccw1 *ch_ccw; @@ -46,7 +41,7 @@ struct ccwchain { /* Count of the valid ccws in chain. 
*/ int ch_len; /* Pinned PAGEs for the original data. */ - struct pfn_array_table *ch_pat; + struct pfn_array *ch_pa; }; /* @@ -139,55 +134,23 @@ static void pfn_array_unpin_free(struct pfn_array *pa, struct device *mdev) kfree(pa->pa_iova_pfn); } -static int pfn_array_table_init(struct pfn_array_table *pat, int nr) -{ - pat->pat_pa = kcalloc(nr, sizeof(*pat->pat_pa), GFP_KERNEL); - if (unlikely(ZERO_OR_NULL_PTR(pat->pat_pa))) { - pat->pat_nr = 0; - return -ENOMEM; - } - - pat->pat_nr = nr; - - return 0; -} - -static void pfn_array_table_unpin_free(struct pfn_array_table *pat, - struct device *mdev) +static bool pfn_array_iova_pinned(struct pfn_array *pa, unsigned long iova) { + unsigned long iova_pfn = iova >> PAGE_SHIFT; int i; - for (i = 0; i < pat->pat_nr; i++) - pfn_array_unpin_free(pat->pat_pa + i, mdev); - - if (pat->pat_nr) { - kfree(pat->pat_pa); - pat->pat_pa = NULL; - pat->pat_nr = 0; - } -} - -static bool pfn_array_table_iova_pinned(struct pfn_array_table *pat, - unsigned long iova) -{ - struct pfn_array *pa = pat->pat_pa; - unsigned long iova_pfn = iova >> PAGE_SHIFT; - int i, j; - - for (i = 0; i < pat->pat_nr; i++, pa++) - for (j = 0; j < pa->pa_nr; j++) - if (pa->pa_iova_pfn[j] == iova_pfn) - return true; + for (i = 0; i < pa->pa_nr; i++) + if (pa->pa_iova_pfn[i] == iova_pfn) + return true; return false; } -/* Create the list idal words for a pfn_array_table. */ -static inline void pfn_array_table_idal_create_words( - struct pfn_array_table *pat, +/* Create the list of IDAL words for a pfn_array. */ +static inline void pfn_array_idal_create_words( + struct pfn_array *pa, unsigned long *idaws) { - struct pfn_array *pa; - int i, j, k; + int i; /* * Idal words (execept the first one) rely on the memory being 4k @@ -196,17 +159,12 @@ static inline void pfn_array_table_idal_create_words( * there will be no problem here to simply use the phys to create an * idaw. 
*/ - k = 0; - for (i = 0; i < pat->pat_nr; i++) { - pa = pat->pat_pa + i; - for (j = 0; j < pa->pa_nr; j++) { - idaws[k] = pa->pa_pfn[j] << PAGE_SHIFT; - k++; - } - } + + for (i = 0; i < pa->pa_nr; i++) + idaws[i] = pa->pa_pfn[i] << PAGE_SHIFT; /* Adjust the first IDAW, since it may not start on a page boundary */ - idaws[0] += pat->pat_pa->pa_iova & (PAGE_SIZE - 1); + idaws[0] += pa->pa_iova & (PAGE_SIZE - 1); } @@ -378,7 +336,7 @@ static struct ccwchain *ccwchain_alloc(struct channel_program *cp, int len) /* Make ccw address aligned to 8. */ size = ((sizeof(*chain) + 7L) & -8L) + sizeof(*chain->ch_ccw) * len + - sizeof(*chain->ch_pat) * len; + sizeof(*chain->ch_pa) * len; chain = kzalloc(size, GFP_DMA | GFP_KERNEL); if (!chain) return NULL; @@ -387,7 +345,7 @@ static struct ccwchain *ccwchain_alloc(struct channel_program *cp, int len) chain->ch_ccw = (struct ccw1 *)data; data = (u8 *)(chain->ch_ccw) + sizeof(*chain->ch_ccw) * len; - chain->ch_pat = (struct pfn_array_table *)data; + chain->ch_pa = (struct pfn_array *)data; chain->ch_len = len; @@ -575,7 +533,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, struct channel_program *cp) { struct ccw1 *ccw; - struct pfn_array_table *pat; + struct pfn_array *pa; unsigned long *idaws; int ret; int bytes = 1; @@ -593,21 +551,17 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, * The number of pages actually is the count of the idaws which will be * needed when translating a direct ccw to a idal ccw. */ - pat = chain->ch_pat + idx; - ret = pfn_array_table_init(pat, 1); - if (ret) - goto out_init; - - ret = pfn_array_alloc(pat->pat_pa, ccw->cda, bytes); + pa = chain->ch_pa + idx; + ret = pfn_array_alloc(pa, ccw->cda, bytes); if (ret < 0) goto out_unpin; if (ccw_does_data_transfer(ccw)) { - ret = pfn_array_pin(pat->pat_pa, cp->mdev); + ret = pfn_array_pin(pa, cp->mdev); if (ret < 0) goto out_unpin; } else { - pat->pat_pa->pa_nr = 0; + pa->pa_nr = 0; } /* Translate this direct ccw to a idal ccw. 
*/ @@ -619,12 +573,12 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, ccw->cda = (__u32) virt_to_phys(idaws); ccw->flags |= CCW_FLAG_IDA; - pfn_array_table_idal_create_words(pat, idaws); + pfn_array_idal_create_words(pa, idaws); return 0; out_unpin: - pfn_array_table_unpin_free(pat, cp->mdev); + pfn_array_unpin_free(pa, cp->mdev); out_init: ccw->cda = 0; return ret; @@ -635,7 +589,7 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, struct channel_program *cp) { struct ccw1 *ccw; - struct pfn_array_table *pat; + struct pfn_array *pa; unsigned long *idaws; u64 idaw_iova; unsigned int idaw_nr, idaw_len; @@ -655,15 +609,11 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, idaw_len = idaw_nr * sizeof(*idaws); /* Pin data page(s) in memory. */ - pat = chain->ch_pat + idx; - ret = pfn_array_table_init(pat, 1); + pa = chain->ch_pa + idx; + ret = pfn_array_alloc(pa, idaw_iova, bytes); if (ret) goto out_init; - ret = pfn_array_alloc(pat->pat_pa, idaw_iova, bytes); - if (ret) - goto out_unpin; - /* Translate idal ccw to use new allocated idaws. 
*/ idaws = kzalloc(idaw_len, GFP_DMA | GFP_KERNEL); if (!idaws) { @@ -678,24 +628,24 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, ccw->cda = virt_to_phys(idaws); for (i = 0; i < idaw_nr; i++) - pat->pat_pa->pa_iova_pfn[i] = idaws[i] >> PAGE_SHIFT; + pa->pa_iova_pfn[i] = idaws[i] >> PAGE_SHIFT; if (ccw_does_data_transfer(ccw)) { - ret = pfn_array_pin(pat->pat_pa, cp->mdev); + ret = pfn_array_pin(pa, cp->mdev); if (ret < 0) goto out_free_idaws; } else { - pat->pat_pa->pa_nr = 0; + pa->pa_nr = 0; } - pfn_array_table_idal_create_words(pat, idaws); + pfn_array_idal_create_words(pa, idaws); return 0; out_free_idaws: kfree(idaws); out_unpin: - pfn_array_table_unpin_free(pat, cp->mdev); + pfn_array_unpin_free(pa, cp->mdev); out_init: ccw->cda = 0; return ret; @@ -790,8 +740,7 @@ void cp_free(struct channel_program *cp) cp->initialized = false; list_for_each_entry_safe(chain, temp, &cp->ccwchain_list, next) { for (i = 0; i < chain->ch_len; i++) { - pfn_array_table_unpin_free(chain->ch_pat + i, - cp->mdev); + pfn_array_unpin_free(chain->ch_pa + i, cp->mdev); ccwchain_cda_free(chain, i); } ccwchain_free(chain); @@ -967,8 +916,7 @@ bool cp_iova_pinned(struct channel_program *cp, u64 iova) list_for_each_entry(chain, &cp->ccwchain_list, next) { for (i = 0; i < chain->ch_len; i++) - if (pfn_array_table_iova_pinned(chain->ch_pat + i, - iova)) + if (pfn_array_iova_pinned(chain->ch_pa + i, iova)) return true; } From e8573b39a81b9933bb8b3fffcc7533b27d82231d Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 6 Jun 2019 22:28:30 +0200 Subject: [PATCH 56/83] vfio-ccw: Rearrange IDAL allocation in direct CCW This is purely deck furniture, to help understand the merge of the direct and indirect handlers. 
Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20190606202831.44135-9-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 76ffcc823944..8205d0b527fc 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -537,13 +537,21 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, unsigned long *idaws; int ret; int bytes = 1; - int idaw_nr = 1; + int idaw_nr; ccw = chain->ch_ccw + idx; - if (ccw->count) { + if (ccw->count) bytes = ccw->count; - idaw_nr = idal_nr_words((void *)(u64)ccw->cda, ccw->count); + + /* Calculate size of IDAL */ + idaw_nr = idal_nr_words((void *)(u64)ccw->cda, bytes); + + /* Allocate an IDAL from host storage */ + idaws = kcalloc(idaw_nr, sizeof(*idaws), GFP_DMA | GFP_KERNEL); + if (!idaws) { + ret = -ENOMEM; + goto out_init; } /* @@ -554,7 +562,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, pa = chain->ch_pa + idx; ret = pfn_array_alloc(pa, ccw->cda, bytes); if (ret < 0) - goto out_unpin; + goto out_free_idaws; if (ccw_does_data_transfer(ccw)) { ret = pfn_array_pin(pa, cp->mdev); @@ -564,21 +572,18 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, pa->pa_nr = 0; } - /* Translate this direct ccw to a idal ccw. 
*/ - idaws = kcalloc(idaw_nr, sizeof(*idaws), GFP_DMA | GFP_KERNEL); - if (!idaws) { - ret = -ENOMEM; - goto out_unpin; - } ccw->cda = (__u32) virt_to_phys(idaws); ccw->flags |= CCW_FLAG_IDA; + /* Populate the IDAL with pinned/translated addresses from pfn */ pfn_array_idal_create_words(pa, idaws); return 0; out_unpin: pfn_array_unpin_free(pa, cp->mdev); +out_free_idaws: + kfree(idaws); out_init: ccw->cda = 0; return ret; From 01aa26c672c0eb771de4aaa2a8ccf6055778887b Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 6 Jun 2019 22:28:31 +0200 Subject: [PATCH 57/83] s390/cio: Combine direct and indirect CCW paths With both the direct-addressed and indirect-addressed CCW paths simplified to this point, the amount of shared code between them is (hopefully) more easily visible. Move the processing of IDA-specific bits into the direct-addressed path, and add some useful commentary of what the individual pieces are doing. This allows us to remove the entire ccwchain_fetch_idal() routine and maintain a single function for any non-TIC CCW. 
Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20190606202831.44135-10-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 115 +++++++++++---------------------- 1 file changed, 39 insertions(+), 76 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 8205d0b527fc..90d86e1354c1 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -534,10 +534,12 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, { struct ccw1 *ccw; struct pfn_array *pa; + u64 iova; unsigned long *idaws; int ret; int bytes = 1; - int idaw_nr; + int idaw_nr, idal_len; + int i; ccw = chain->ch_ccw + idx; @@ -545,7 +547,17 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, bytes = ccw->count; /* Calculate size of IDAL */ - idaw_nr = idal_nr_words((void *)(u64)ccw->cda, bytes); + if (ccw_is_idal(ccw)) { + /* Read first IDAW to see if it's 4K-aligned or not. */ + /* All subsequent IDAws will be 4K-aligned. */ + ret = copy_from_iova(cp->mdev, &iova, ccw->cda, sizeof(iova)); + if (ret) + return ret; + } else { + iova = ccw->cda; + } + idaw_nr = idal_nr_words((void *)iova, bytes); + idal_len = idaw_nr * sizeof(*idaws); /* Allocate an IDAL from host storage */ idaws = kcalloc(idaw_nr, sizeof(*idaws), GFP_DMA | GFP_KERNEL); @@ -555,15 +567,36 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, } /* - * Pin data page(s) in memory. - * The number of pages actually is the count of the idaws which will be - * needed when translating a direct ccw to a idal ccw. + * Allocate an array of pfn's for pages to pin/translate. + * The number of pages is actually the count of the idaws + * required for the data transfer, since we only only support + * 4K IDAWs today. 
*/ pa = chain->ch_pa + idx; - ret = pfn_array_alloc(pa, ccw->cda, bytes); + ret = pfn_array_alloc(pa, iova, bytes); if (ret < 0) goto out_free_idaws; + if (ccw_is_idal(ccw)) { + /* Copy guest IDAL into host IDAL */ + ret = copy_from_iova(cp->mdev, idaws, ccw->cda, idal_len); + if (ret) + goto out_unpin; + + /* + * Copy guest IDAWs into pfn_array, in case the memory they + * occupy is not contiguous. + */ + for (i = 0; i < idaw_nr; i++) + pa->pa_iova_pfn[i] = idaws[i] >> PAGE_SHIFT; + } else { + /* + * No action is required here; the iova addresses in pfn_array + * were initialized sequentially in pfn_array_alloc() beginning + * with the contents of ccw->cda. + */ + } + if (ccw_does_data_transfer(ccw)) { ret = pfn_array_pin(pa, cp->mdev); if (ret < 0) @@ -589,73 +622,6 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, return ret; } -static int ccwchain_fetch_idal(struct ccwchain *chain, - int idx, - struct channel_program *cp) -{ - struct ccw1 *ccw; - struct pfn_array *pa; - unsigned long *idaws; - u64 idaw_iova; - unsigned int idaw_nr, idaw_len; - int i, ret; - int bytes = 1; - - ccw = chain->ch_ccw + idx; - - if (ccw->count) - bytes = ccw->count; - - /* Calculate size of idaws. */ - ret = copy_from_iova(cp->mdev, &idaw_iova, ccw->cda, sizeof(idaw_iova)); - if (ret) - return ret; - idaw_nr = idal_nr_words((void *)(idaw_iova), bytes); - idaw_len = idaw_nr * sizeof(*idaws); - - /* Pin data page(s) in memory. */ - pa = chain->ch_pa + idx; - ret = pfn_array_alloc(pa, idaw_iova, bytes); - if (ret) - goto out_init; - - /* Translate idal ccw to use new allocated idaws. 
*/ - idaws = kzalloc(idaw_len, GFP_DMA | GFP_KERNEL); - if (!idaws) { - ret = -ENOMEM; - goto out_unpin; - } - - ret = copy_from_iova(cp->mdev, idaws, ccw->cda, idaw_len); - if (ret) - goto out_free_idaws; - - ccw->cda = virt_to_phys(idaws); - - for (i = 0; i < idaw_nr; i++) - pa->pa_iova_pfn[i] = idaws[i] >> PAGE_SHIFT; - - if (ccw_does_data_transfer(ccw)) { - ret = pfn_array_pin(pa, cp->mdev); - if (ret < 0) - goto out_free_idaws; - } else { - pa->pa_nr = 0; - } - - pfn_array_idal_create_words(pa, idaws); - - return 0; - -out_free_idaws: - kfree(idaws); -out_unpin: - pfn_array_unpin_free(pa, cp->mdev); -out_init: - ccw->cda = 0; - return ret; -} - /* * Fetch one ccw. * To reduce memory copy, we'll pin the cda page in memory, @@ -671,9 +637,6 @@ static int ccwchain_fetch_one(struct ccwchain *chain, if (ccw_is_tic(ccw)) return ccwchain_fetch_tic(chain, idx, cp); - if (ccw_is_idal(ccw)) - return ccwchain_fetch_idal(chain, idx, cp); - return ccwchain_fetch_direct(chain, idx, cp); } From d1523a8f4b8beca90e6ada5ad41faa9776575287 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 17 Jun 2019 14:02:40 +0200 Subject: [PATCH 58/83] s390: replace defconfig with performance_defconfig Replace defconfig with performance_defconfig. defconfig had some more or less random debug options enabled, where nobody knows why anymore. Just remove the old defconfig and replace it with performance_defconfig, which reduces the number of configs to maintain. A config with debugging options enabled is debug_defconfig which is supposed to be rather close to performance_defconfig except that it has debug options enabled. 
Acked-by: Vasily Gorbik Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/configs/defconfig | 608 +++++++++++++++++---- arch/s390/configs/performance_defconfig | 680 ------------------------ 2 files changed, 517 insertions(+), 771 deletions(-) delete mode 100644 arch/s390/configs/performance_defconfig diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 697a40a8af3e..e4bc40073003 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -1,21 +1,22 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_USELIB=y CONFIG_AUDIT=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y -# CONFIG_CPU_ISOLATION is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y -CONFIG_CGROUPS=y +CONFIG_NUMA_BALANCING=y +# CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y -CONFIG_CGROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_FREEZER=y @@ -26,96 +27,402 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y -CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_AUTOGROUP=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y # CONFIG_SYSFS_SYSCALL is not set +CONFIG_CHECKPOINT_RESTORE=y CONFIG_BPF_SYSCALL=y CONFIG_USERFAULTFD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y +CONFIG_OPROFILE=m +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG=y +CONFIG_MODULE_SIG_SHA256=y +CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BLK_DEV_THROTTLING=y +CONFIG_BLK_WBT=y +CONFIG_BLK_WBT_SQ=y +CONFIG_PARTITION_ADVANCED=y +CONFIG_IBM_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y 
+CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_CFQ_GROUP_IOSCHED=y +CONFIG_DEFAULT_DEADLINE=y CONFIG_LIVEPATCH=y -CONFIG_NR_CPUS=256 +CONFIG_TUNE_ZEC12=y +CONFIG_NR_CPUS=512 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y -# CONFIG_RELOCATABLE is not set -CONFIG_CRASH_DUMP=y -CONFIG_HIBERNATION=y -CONFIG_PM_DEBUG=y -CONFIG_CMM=m -CONFIG_OPROFILE=y -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -CONFIG_STATIC_KEYS_SELFTEST=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_BLK_DEV_INTEGRITY=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_IBM_PARTITION=y -CONFIG_BINFMT_MISC=m +CONFIG_KEXEC_VERIFY_SIG=y +CONFIG_EXPOLINE=y +CONFIG_EXPOLINE_AUTO=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y +CONFIG_MEM_SOFT_DIRTY=y CONFIG_ZSWAP=y CONFIG_ZBUD=m CONFIG_ZSMALLOC=m CONFIG_ZSMALLOC_STAT=y +CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y +CONFIG_PCI=y +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_S390=y +CONFIG_CHSC_SCH=y +CONFIG_VFIO_AP=m +CONFIG_VFIO_CCW=m +CONFIG_CRASH_DUMP=y +CONFIG_BINFMT_MISC=m +CONFIG_HIBERNATION=y +CONFIG_PM_DEBUG=y CONFIG_NET=y CONFIG_PACKET=y +CONFIG_PACKET_DIAG=m CONFIG_UNIX=y -CONFIG_NET_KEY=y +CONFIG_UNIX_DIAG=m +CONFIG_XFRM_USER=m +CONFIG_NET_KEY=m +CONFIG_SMC=m +CONFIG_SMC_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_MODE_TRANSPORT=m +CONFIG_INET_XFRM_MODE_TUNNEL=m +CONFIG_INET_XFRM_MODE_BEET=m +CONFIG_INET_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_SCALABLE=m 
+CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_INET6_XFRM_MODE_TRANSPORT=m +CONFIG_INET6_XFRM_MODE_TUNNEL=m +CONFIG_INET6_XFRM_MODE_BEET=m +CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=m +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_TABLES=m +CONFIG_NFT_CT=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_NAT=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NETFILTER_XT_SET=m +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m 
+CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +CONFIG_IP_SET=m +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_SED=m 
+CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_PE_SIP=m +CONFIG_NF_CONNTRACK_IPV4=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_CHAIN_ROUTE_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NFT_CHAIN_NAT_IPV4=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +CONFIG_IP_NF_SECURITY=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +CONFIG_NF_CONNTRACK_IPV6=m +CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_CHAIN_ROUTE_IPV6=m +CONFIG_NFT_CHAIN_NAT_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_NF_TABLES_BRIDGE=y +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m CONFIG_L2TP=m CONFIG_L2TP_DEBUGFS=m -CONFIG_VLAN_8021Q=y +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_BRIDGE=m +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y CONFIG_NET_SCHED=y CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m CONFIG_NET_SCH_SFQ=m CONFIG_NET_SCH_TEQL=m CONFIG_NET_SCH_TBF=m CONFIG_NET_SCH_GRED=m CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=m 
+CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +CONFIG_NET_CLS_BASIC=m CONFIG_NET_CLS_TCINDEX=m CONFIG_NET_CLS_ROUTE4=m CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y CONFIG_CLS_U32_MARK=y CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=y +CONFIG_NET_CLS_BPF=m CONFIG_NET_CLS_ACT=y -CONFIG_NET_ACT_POLICE=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_DNS_RESOLVER=y +CONFIG_OPENVSWITCH=m +CONFIG_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y -CONFIG_UEVENT_HELPER=y +CONFIG_NET_PKTGEN=m CONFIG_DEVTMPFS=y +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=0 +CONFIG_CONNECTOR=y +CONFIG_ZRAM=m CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=32768 CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_NVME=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_GENWQE=m +CONFIG_RAID_ATTRS=m CONFIG_SCSI=y CONFIG_BLK_DEV_SD=y -CONFIG_CHR_DEV_ST=y -CONFIG_BLK_DEV_SR=y -CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_ST=m +CONFIG_CHR_DEV_OSST=m +CONFIG_BLK_DEV_SR=m CONFIG_CHR_DEV_SG=y +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SRP_ATTRS=m +CONFIG_ISCSI_TCP=m +CONFIG_SCSI_DEBUG=m CONFIG_ZFCP=y -CONFIG_SCSI_VIRTIO=y +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +CONFIG_SCSI_OSD_INITIATOR=m +CONFIG_SCSI_OSD_ULD=m CONFIG_MD=y +CONFIG_BLK_DEV_MD=y CONFIG_MD_LINEAR=m CONFIG_MD_MULTIPATH=m -CONFIG_BLK_DEV_DM=y +CONFIG_MD_FAULTY=m +CONFIG_BLK_DEV_DM=m CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m 
+CONFIG_DM_THIN_PROVISIONING=m CONFIG_DM_MIRROR=m CONFIG_DM_LOG_USERSPACE=m CONFIG_DM_RAID=m @@ -123,73 +430,216 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_QL=m CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_DELAY=m CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m CONFIG_DM_VERITY=m CONFIG_DM_SWITCH=m CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=m CONFIG_EQUALIZER=m +CONFIG_IFB=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_VXLAN=m CONFIG_TUN=m -CONFIG_VIRTIO_NET=y -# CONFIG_NET_VENDOR_ALACRITECH is not set -# CONFIG_NET_VENDOR_AURORA is not set -# CONFIG_NET_VENDOR_CORTINA is not set -# CONFIG_NET_VENDOR_SOLARFLARE is not set -# CONFIG_NET_VENDOR_SOCIONEXT is not set -# CONFIG_NET_VENDOR_SYNOPSYS is not set -# CONFIG_INPUT is not set +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +# CONFIG_NET_VENDOR_ARC is not set +# CONFIG_NET_VENDOR_CHELSIO is not set +# CONFIG_NET_VENDOR_INTEL is not set +# CONFIG_NET_VENDOR_MARVELL is not set +CONFIG_MLX4_EN=m +CONFIG_MLX5_CORE=m +CONFIG_MLX5_CORE_EN=y +# CONFIG_NET_VENDOR_NATSEMI is not set +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_MPPE=m +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_ISM=m +CONFIG_INPUT_EVDEV=y +# CONFIG_INPUT_KEYBOARD is not set +# CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set -# CONFIG_VT is not set -CONFIG_DEVKMEM=y +CONFIG_LEGACY_PTY_COUNT=0 +CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m -CONFIG_VIRTIO_BALLOON=y +CONFIG_HANGCHECK_TIMER=m +CONFIG_TN3270_FS=y +# CONFIG_HWMON is not set +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_NOWAYOUT=y +CONFIG_SOFT_WATCHDOG=m +CONFIG_DIAG288_WATCHDOG=m +CONFIG_DRM=y +CONFIG_DRM_VIRTIO_GPU=y +CONFIG_FRAMEBUFFER_CONSOLE=y +# CONFIG_HID is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_ACCESS=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_VFIO=m +CONFIG_VFIO_PCI=m +CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_DEVICE=m 
+CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_INPUT=y +CONFIG_S390_AP_IOMMU=y +CONFIG_S390_CCW_IOMMU=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y +CONFIG_JBD2_DEBUG=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +CONFIG_JFS_STATISTICS=y CONFIG_XFS_FS=y CONFIG_XFS_QUOTA=y CONFIG_XFS_POSIX_ACL=y CONFIG_XFS_RT=y +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m CONFIG_BTRFS_FS=y CONFIG_BTRFS_FS_POSIX_ACL=y +CONFIG_NILFS2_FS=m +CONFIG_FS_DAX=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FS_ENCRYPTION=y CONFIG_FANOTIFY=y +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=y +CONFIG_CUSE=m +CONFIG_OVERLAY_FS=m +CONFIG_FSCACHE=m +CONFIG_CACHEFILES=m +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_NTFS_FS=m +CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y -# CONFIG_NETWORK_FILESYSTEMS is not set -CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" +CONFIG_CONFIGFS_FS=m +CONFIG_ECRYPT_FS=m +CONFIG_CRAMFS=m +CONFIG_SQUASHFS=m +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_ROMFS_FS=m +CONFIG_NFS_FS=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFSD=m +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_V4_SECURITY_LABEL=y +CONFIG_CIFS=m +CONFIG_CIFS_STATS=y +CONFIG_CIFS_STATS2=y +CONFIG_CIFS_WEAK_PW_HASH=y +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +# CONFIG_CIFS_DEBUG is not set +CONFIG_CIFS_DFS_UPCALL=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_GDB_SCRIPTS=y +# CONFIG_ENABLE_MUST_CHECK is 
not set +CONFIG_FRAME_WARN=1024 +CONFIG_UNUSED_SYMBOLS=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_RCU_TORTURE_TEST=m +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +CONFIG_LATENCYTOP=y +CONFIG_SCHED_TRACER=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_STACK_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_HIST_TRIGGERS=y +CONFIG_LKDTM=m +CONFIG_PERCPU_TEST=m +CONFIG_ATOMIC64_SELFTEST=y +CONFIG_TEST_BPF=m +CONFIG_BUG_ON_DATA_CORRUPTION=y +CONFIG_S390_PTDUMP=y +CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y +CONFIG_ENCRYPTED_KEYS=m +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=0 +CONFIG_SECURITY_SELINUX_DISABLE=y +CONFIG_INTEGRITY_SIGNATURE=y +CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y +CONFIG_IMA=y +CONFIG_IMA_DEFAULT_HASH_SHA256=y +CONFIG_IMA_WRITE_POLICY=y +CONFIG_IMA_APPRAISE=y +CONFIG_CRYPTO_FIPS=y +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_USER=m +# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set +CONFIG_CRYPTO_PCRYPT=m CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_AUTHENC=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m -CONFIG_CRYPTO_GCM=m -CONFIG_CRYPTO_CBC=y -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_CTS=m +CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_XTS=m -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_CMAC=m +CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD128=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_SHA512=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m @@ -199,16 +649,16 @@ CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m 
CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SM4=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_842=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m CONFIG_CRYPTO_ANSI_CPRNG=m CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_ZCRYPT=m CONFIG_PKEY=m CONFIG_CRYPTO_PAES_S390=m @@ -217,38 +667,14 @@ CONFIG_CRYPTO_SHA256_S390=m CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m +CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_CRC32_S390=y CONFIG_CRC7=m -# CONFIG_XZ_DEC_X86 is not set -# CONFIG_XZ_DEC_POWERPC is not set -# CONFIG_XZ_DEC_IA64 is not set -# CONFIG_XZ_DEC_ARM is not set -# CONFIG_XZ_DEC_ARMTHUMB is not set -# CONFIG_XZ_DEC_SPARC is not set -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_INFO_DWARF4=y -CONFIG_GDB_SCRIPTS=y -CONFIG_UNUSED_SYMBOLS=y -CONFIG_DEBUG_SECTION_MISMATCH=y -CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_PAGEALLOC=y -CONFIG_DETECT_HUNG_TASK=y -CONFIG_PANIC_ON_OOPS=y -CONFIG_PROVE_LOCKING=y -CONFIG_LOCK_STAT=y -CONFIG_DEBUG_LOCKDEP=y -CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_LIST=y -CONFIG_DEBUG_SG=y -CONFIG_DEBUG_NOTIFIERS=y -CONFIG_RCU_CPU_STALL_TIMEOUT=60 -CONFIG_LATENCYTOP=y -CONFIG_SCHED_TRACER=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y -CONFIG_STACK_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FUNCTION_PROFILER=y -# CONFIG_RUNTIME_TESTING_MENU is not set -CONFIG_S390_PTDUMP=y +CONFIG_CRC8=m +CONFIG_CORDIC=m +CONFIG_CMM=m +CONFIG_APPLDATA_BASE=y +CONFIG_KVM=m +CONFIG_KVM_S390_UCONTROL=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_VSOCK=m diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig deleted file mode 100644 index e4bc40073003..000000000000 --- a/arch/s390/configs/performance_defconfig +++ /dev/null @@ -1,680 +0,0 @@ -CONFIG_SYSVIPC=y -CONFIG_POSIX_MQUEUE=y -CONFIG_AUDIT=y -CONFIG_NO_HZ_IDLE=y 
-CONFIG_HIGH_RES_TIMERS=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_XACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_NUMA_BALANCING=y -# CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set -CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y -CONFIG_BLK_CGROUP=y -CONFIG_CFS_BANDWIDTH=y -CONFIG_RT_GROUP_SCHED=y -CONFIG_CGROUP_PIDS=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_HUGETLB=y -CONFIG_CPUSETS=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_PERF=y -CONFIG_NAMESPACES=y -CONFIG_USER_NS=y -CONFIG_SCHED_AUTOGROUP=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_EXPERT=y -# CONFIG_SYSFS_SYSCALL is not set -CONFIG_CHECKPOINT_RESTORE=y -CONFIG_BPF_SYSCALL=y -CONFIG_USERFAULTFD=y -# CONFIG_COMPAT_BRK is not set -CONFIG_PROFILING=y -CONFIG_OPROFILE=m -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -CONFIG_MODULES=y -CONFIG_MODULE_FORCE_LOAD=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG=y -CONFIG_MODULE_SIG_SHA256=y -CONFIG_BLK_DEV_INTEGRITY=y -CONFIG_BLK_DEV_THROTTLING=y -CONFIG_BLK_WBT=y -CONFIG_BLK_WBT_SQ=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_IBM_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -CONFIG_UNIXWARE_DISKLABEL=y -CONFIG_CFQ_GROUP_IOSCHED=y -CONFIG_DEFAULT_DEADLINE=y -CONFIG_LIVEPATCH=y -CONFIG_TUNE_ZEC12=y -CONFIG_NR_CPUS=512 -CONFIG_NUMA=y -CONFIG_HZ_100=y -CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_VERIFY_SIG=y -CONFIG_EXPOLINE=y -CONFIG_EXPOLINE_AUTO=y -CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTREMOVE=y -CONFIG_KSM=y -CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y -CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -CONFIG_ZBUD=m -CONFIG_ZSMALLOC=m -CONFIG_ZSMALLOC_STAT=y -CONFIG_DEFERRED_STRUCT_PAGE_INIT=y -CONFIG_IDLE_PAGE_TRACKING=y -CONFIG_PCI=y -CONFIG_HOTPLUG_PCI=y -CONFIG_HOTPLUG_PCI_S390=y -CONFIG_CHSC_SCH=y -CONFIG_VFIO_AP=m 
-CONFIG_VFIO_CCW=m -CONFIG_CRASH_DUMP=y -CONFIG_BINFMT_MISC=m -CONFIG_HIBERNATION=y -CONFIG_PM_DEBUG=y -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_PACKET_DIAG=m -CONFIG_UNIX=y -CONFIG_UNIX_DIAG=m -CONFIG_XFRM_USER=m -CONFIG_NET_KEY=m -CONFIG_SMC=m -CONFIG_SMC_DIAG=m -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -CONFIG_NET_IPIP=m -CONFIG_NET_IPGRE_DEMUX=m -CONFIG_NET_IPGRE=m -CONFIG_NET_IPGRE_BROADCAST=y -CONFIG_IP_MROUTE=y -CONFIG_IP_MROUTE_MULTIPLE_TABLES=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -CONFIG_NET_IPVTI=m -CONFIG_INET_AH=m -CONFIG_INET_ESP=m -CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m -CONFIG_INET_DIAG=m -CONFIG_INET_UDP_DIAG=m -CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_HSTCP=m -CONFIG_TCP_CONG_HYBLA=m -CONFIG_TCP_CONG_SCALABLE=m -CONFIG_TCP_CONG_LP=m -CONFIG_TCP_CONG_VENO=m -CONFIG_TCP_CONG_YEAH=m -CONFIG_TCP_CONG_ILLINOIS=m -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_INET6_AH=m -CONFIG_INET6_ESP=m -CONFIG_INET6_IPCOMP=m -CONFIG_IPV6_MIP6=m -CONFIG_INET6_XFRM_MODE_TRANSPORT=m -CONFIG_INET6_XFRM_MODE_TUNNEL=m -CONFIG_INET6_XFRM_MODE_BEET=m -CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m -CONFIG_IPV6_VTI=m -CONFIG_IPV6_SIT=m -CONFIG_IPV6_GRE=m -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_SUBTREES=y -CONFIG_NETFILTER=y -CONFIG_NF_CONNTRACK=m -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_TIMEOUT=y -CONFIG_NF_CONNTRACK_TIMESTAMP=y -CONFIG_NF_CONNTRACK_AMANDA=m -CONFIG_NF_CONNTRACK_FTP=m -CONFIG_NF_CONNTRACK_H323=m -CONFIG_NF_CONNTRACK_IRC=m -CONFIG_NF_CONNTRACK_NETBIOS_NS=m -CONFIG_NF_CONNTRACK_SNMP=m -CONFIG_NF_CONNTRACK_PPTP=m -CONFIG_NF_CONNTRACK_SANE=m -CONFIG_NF_CONNTRACK_SIP=m -CONFIG_NF_CONNTRACK_TFTP=m -CONFIG_NF_CT_NETLINK=m -CONFIG_NF_CT_NETLINK_TIMEOUT=m -CONFIG_NF_TABLES=m -CONFIG_NFT_CT=m -CONFIG_NFT_COUNTER=m 
-CONFIG_NFT_LOG=m -CONFIG_NFT_LIMIT=m -CONFIG_NFT_NAT=m -CONFIG_NFT_COMPAT=m -CONFIG_NFT_HASH=m -CONFIG_NETFILTER_XT_SET=m -CONFIG_NETFILTER_XT_TARGET_AUDIT=m -CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m -CONFIG_NETFILTER_XT_TARGET_CONNMARK=m -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m -CONFIG_NETFILTER_XT_TARGET_CT=m -CONFIG_NETFILTER_XT_TARGET_DSCP=m -CONFIG_NETFILTER_XT_TARGET_HMARK=m -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m -CONFIG_NETFILTER_XT_TARGET_LOG=m -CONFIG_NETFILTER_XT_TARGET_MARK=m -CONFIG_NETFILTER_XT_TARGET_NFLOG=m -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m -CONFIG_NETFILTER_XT_TARGET_TEE=m -CONFIG_NETFILTER_XT_TARGET_TPROXY=m -CONFIG_NETFILTER_XT_TARGET_TRACE=m -CONFIG_NETFILTER_XT_TARGET_SECMARK=m -CONFIG_NETFILTER_XT_TARGET_TCPMSS=m -CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m -CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m -CONFIG_NETFILTER_XT_MATCH_BPF=m -CONFIG_NETFILTER_XT_MATCH_CLUSTER=m -CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m -CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m -CONFIG_NETFILTER_XT_MATCH_CONNMARK=m -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m -CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m -CONFIG_NETFILTER_XT_MATCH_DSCP=m -CONFIG_NETFILTER_XT_MATCH_ESP=m -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m -CONFIG_NETFILTER_XT_MATCH_HELPER=m -CONFIG_NETFILTER_XT_MATCH_IPRANGE=m -CONFIG_NETFILTER_XT_MATCH_IPVS=m -CONFIG_NETFILTER_XT_MATCH_LENGTH=m -CONFIG_NETFILTER_XT_MATCH_LIMIT=m -CONFIG_NETFILTER_XT_MATCH_MAC=m -CONFIG_NETFILTER_XT_MATCH_MARK=m -CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m -CONFIG_NETFILTER_XT_MATCH_NFACCT=m -CONFIG_NETFILTER_XT_MATCH_OSF=m -CONFIG_NETFILTER_XT_MATCH_OWNER=m -CONFIG_NETFILTER_XT_MATCH_POLICY=m -CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m -CONFIG_NETFILTER_XT_MATCH_QUOTA=m -CONFIG_NETFILTER_XT_MATCH_RATEEST=m -CONFIG_NETFILTER_XT_MATCH_REALM=m 
-CONFIG_NETFILTER_XT_MATCH_RECENT=m -CONFIG_NETFILTER_XT_MATCH_STATE=m -CONFIG_NETFILTER_XT_MATCH_STATISTIC=m -CONFIG_NETFILTER_XT_MATCH_STRING=m -CONFIG_NETFILTER_XT_MATCH_TCPMSS=m -CONFIG_NETFILTER_XT_MATCH_TIME=m -CONFIG_NETFILTER_XT_MATCH_U32=m -CONFIG_IP_SET=m -CONFIG_IP_SET_BITMAP_IP=m -CONFIG_IP_SET_BITMAP_IPMAC=m -CONFIG_IP_SET_BITMAP_PORT=m -CONFIG_IP_SET_HASH_IP=m -CONFIG_IP_SET_HASH_IPPORT=m -CONFIG_IP_SET_HASH_IPPORTIP=m -CONFIG_IP_SET_HASH_IPPORTNET=m -CONFIG_IP_SET_HASH_NETPORTNET=m -CONFIG_IP_SET_HASH_NET=m -CONFIG_IP_SET_HASH_NETNET=m -CONFIG_IP_SET_HASH_NETPORT=m -CONFIG_IP_SET_HASH_NETIFACE=m -CONFIG_IP_SET_LIST_SET=m -CONFIG_IP_VS=m -CONFIG_IP_VS_PROTO_TCP=y -CONFIG_IP_VS_PROTO_UDP=y -CONFIG_IP_VS_PROTO_ESP=y -CONFIG_IP_VS_PROTO_AH=y -CONFIG_IP_VS_RR=m -CONFIG_IP_VS_WRR=m -CONFIG_IP_VS_LC=m -CONFIG_IP_VS_WLC=m -CONFIG_IP_VS_LBLC=m -CONFIG_IP_VS_LBLCR=m -CONFIG_IP_VS_DH=m -CONFIG_IP_VS_SH=m -CONFIG_IP_VS_SED=m -CONFIG_IP_VS_NQ=m -CONFIG_IP_VS_FTP=m -CONFIG_IP_VS_PE_SIP=m -CONFIG_NF_CONNTRACK_IPV4=m -CONFIG_NF_TABLES_IPV4=y -CONFIG_NFT_CHAIN_ROUTE_IPV4=m -CONFIG_NF_TABLES_ARP=y -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_IP_NF_IPTABLES=m -CONFIG_IP_NF_MATCH_AH=m -CONFIG_IP_NF_MATCH_ECN=m -CONFIG_IP_NF_MATCH_RPFILTER=m -CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m -CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m -CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_TTL=m -CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m -CONFIG_IP_NF_ARPFILTER=m -CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NF_CONNTRACK_IPV6=m -CONFIG_NF_TABLES_IPV6=y -CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_IP6_NF_IPTABLES=m -CONFIG_IP6_NF_MATCH_AH=m -CONFIG_IP6_NF_MATCH_EUI64=m -CONFIG_IP6_NF_MATCH_FRAG=m -CONFIG_IP6_NF_MATCH_OPTS=m -CONFIG_IP6_NF_MATCH_HL=m -CONFIG_IP6_NF_MATCH_IPV6HEADER=m -CONFIG_IP6_NF_MATCH_MH=m -CONFIG_IP6_NF_MATCH_RPFILTER=m 
-CONFIG_IP6_NF_MATCH_RT=m -CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m -CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_IP6_NF_RAW=m -CONFIG_IP6_NF_SECURITY=m -CONFIG_IP6_NF_NAT=m -CONFIG_IP6_NF_TARGET_MASQUERADE=m -CONFIG_NF_TABLES_BRIDGE=y -CONFIG_RDS=m -CONFIG_RDS_RDMA=m -CONFIG_RDS_TCP=m -CONFIG_L2TP=m -CONFIG_L2TP_DEBUGFS=m -CONFIG_L2TP_V3=y -CONFIG_L2TP_IP=m -CONFIG_L2TP_ETH=m -CONFIG_BRIDGE=m -CONFIG_VLAN_8021Q=m -CONFIG_VLAN_8021Q_GVRP=y -CONFIG_NET_SCHED=y -CONFIG_NET_SCH_CBQ=m -CONFIG_NET_SCH_HTB=m -CONFIG_NET_SCH_HFSC=m -CONFIG_NET_SCH_PRIO=m -CONFIG_NET_SCH_MULTIQ=m -CONFIG_NET_SCH_RED=m -CONFIG_NET_SCH_SFB=m -CONFIG_NET_SCH_SFQ=m -CONFIG_NET_SCH_TEQL=m -CONFIG_NET_SCH_TBF=m -CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m -CONFIG_NET_SCH_NETEM=m -CONFIG_NET_SCH_DRR=m -CONFIG_NET_SCH_MQPRIO=m -CONFIG_NET_SCH_CHOKE=m -CONFIG_NET_SCH_QFQ=m -CONFIG_NET_SCH_CODEL=m -CONFIG_NET_SCH_FQ_CODEL=m -CONFIG_NET_SCH_INGRESS=m -CONFIG_NET_SCH_PLUG=m -CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m -CONFIG_NET_CLS_ROUTE4=m -CONFIG_NET_CLS_FW=m -CONFIG_NET_CLS_U32=m -CONFIG_CLS_U32_PERF=y -CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m -CONFIG_NET_CLS_FLOW=m -CONFIG_NET_CLS_CGROUP=y -CONFIG_NET_CLS_BPF=m -CONFIG_NET_CLS_ACT=y -CONFIG_NET_ACT_POLICE=m -CONFIG_NET_ACT_GACT=m -CONFIG_GACT_PROB=y -CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_IPT=m -CONFIG_NET_ACT_NAT=m -CONFIG_NET_ACT_PEDIT=m -CONFIG_NET_ACT_SIMP=m -CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_ACT_CSUM=m -CONFIG_DNS_RESOLVER=y -CONFIG_OPENVSWITCH=m -CONFIG_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS=m -CONFIG_NETLINK_DIAG=m -CONFIG_CGROUP_NET_PRIO=y -CONFIG_BPF_JIT=y -CONFIG_NET_PKTGEN=m -CONFIG_DEVTMPFS=y -CONFIG_DMA_CMA=y -CONFIG_CMA_SIZE_MBYTES=0 -CONFIG_CONNECTOR=y -CONFIG_ZRAM=m -CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_CRYPTOLOOP=m -CONFIG_BLK_DEV_DRBD=m -CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=32768 -CONFIG_VIRTIO_BLK=y -CONFIG_BLK_DEV_RBD=m 
-CONFIG_BLK_DEV_NVME=m -CONFIG_ENCLOSURE_SERVICES=m -CONFIG_GENWQE=m -CONFIG_RAID_ATTRS=m -CONFIG_SCSI=y -CONFIG_BLK_DEV_SD=y -CONFIG_CHR_DEV_ST=m -CONFIG_CHR_DEV_OSST=m -CONFIG_BLK_DEV_SR=m -CONFIG_CHR_DEV_SG=y -CONFIG_CHR_DEV_SCH=m -CONFIG_SCSI_ENCLOSURE=m -CONFIG_SCSI_CONSTANTS=y -CONFIG_SCSI_LOGGING=y -CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=y -CONFIG_SCSI_SAS_LIBSAS=m -CONFIG_SCSI_SRP_ATTRS=m -CONFIG_ISCSI_TCP=m -CONFIG_SCSI_DEBUG=m -CONFIG_ZFCP=y -CONFIG_SCSI_VIRTIO=m -CONFIG_SCSI_DH=y -CONFIG_SCSI_DH_RDAC=m -CONFIG_SCSI_DH_HP_SW=m -CONFIG_SCSI_DH_EMC=m -CONFIG_SCSI_DH_ALUA=m -CONFIG_SCSI_OSD_INITIATOR=m -CONFIG_SCSI_OSD_ULD=m -CONFIG_MD=y -CONFIG_BLK_DEV_MD=y -CONFIG_MD_LINEAR=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m -CONFIG_BLK_DEV_DM=m -CONFIG_DM_CRYPT=m -CONFIG_DM_SNAPSHOT=m -CONFIG_DM_THIN_PROVISIONING=m -CONFIG_DM_MIRROR=m -CONFIG_DM_LOG_USERSPACE=m -CONFIG_DM_RAID=m -CONFIG_DM_ZERO=m -CONFIG_DM_MULTIPATH=m -CONFIG_DM_MULTIPATH_QL=m -CONFIG_DM_MULTIPATH_ST=m -CONFIG_DM_DELAY=m -CONFIG_DM_UEVENT=y -CONFIG_DM_FLAKEY=m -CONFIG_DM_VERITY=m -CONFIG_DM_SWITCH=m -CONFIG_NETDEVICES=y -CONFIG_BONDING=m -CONFIG_DUMMY=m -CONFIG_EQUALIZER=m -CONFIG_IFB=m -CONFIG_MACVLAN=m -CONFIG_MACVTAP=m -CONFIG_VXLAN=m -CONFIG_TUN=m -CONFIG_VETH=m -CONFIG_VIRTIO_NET=m -CONFIG_NLMON=m -# CONFIG_NET_VENDOR_ARC is not set -# CONFIG_NET_VENDOR_CHELSIO is not set -# CONFIG_NET_VENDOR_INTEL is not set -# CONFIG_NET_VENDOR_MARVELL is not set -CONFIG_MLX4_EN=m -CONFIG_MLX5_CORE=m -CONFIG_MLX5_CORE_EN=y -# CONFIG_NET_VENDOR_NATSEMI is not set -CONFIG_PPP=m -CONFIG_PPP_BSDCOMP=m -CONFIG_PPP_DEFLATE=m -CONFIG_PPP_MPPE=m -CONFIG_PPPOE=m -CONFIG_PPTP=m -CONFIG_PPPOL2TP=m -CONFIG_PPP_ASYNC=m -CONFIG_PPP_SYNC_TTY=m -CONFIG_ISM=m -CONFIG_INPUT_EVDEV=y -# CONFIG_INPUT_KEYBOARD is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_SERIO is not set -CONFIG_LEGACY_PTY_COUNT=0 -CONFIG_HW_RANDOM_VIRTIO=m -CONFIG_RAW_DRIVER=m -CONFIG_HANGCHECK_TIMER=m -CONFIG_TN3270_FS=y -# CONFIG_HWMON is 
not set -CONFIG_WATCHDOG=y -CONFIG_WATCHDOG_NOWAYOUT=y -CONFIG_SOFT_WATCHDOG=m -CONFIG_DIAG288_WATCHDOG=m -CONFIG_DRM=y -CONFIG_DRM_VIRTIO_GPU=y -CONFIG_FRAMEBUFFER_CONSOLE=y -# CONFIG_HID is not set -# CONFIG_USB_SUPPORT is not set -CONFIG_INFINIBAND=m -CONFIG_INFINIBAND_USER_ACCESS=m -CONFIG_MLX4_INFINIBAND=m -CONFIG_MLX5_INFINIBAND=m -CONFIG_VFIO=m -CONFIG_VFIO_PCI=m -CONFIG_VFIO_MDEV=m -CONFIG_VFIO_MDEV_DEVICE=m -CONFIG_VIRTIO_PCI=m -CONFIG_VIRTIO_BALLOON=m -CONFIG_VIRTIO_INPUT=y -CONFIG_S390_AP_IOMMU=y -CONFIG_S390_CCW_IOMMU=y -CONFIG_EXT4_FS=y -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_EXT4_FS_SECURITY=y -CONFIG_JBD2_DEBUG=y -CONFIG_JFS_FS=m -CONFIG_JFS_POSIX_ACL=y -CONFIG_JFS_SECURITY=y -CONFIG_JFS_STATISTICS=y -CONFIG_XFS_FS=y -CONFIG_XFS_QUOTA=y -CONFIG_XFS_POSIX_ACL=y -CONFIG_XFS_RT=y -CONFIG_GFS2_FS=m -CONFIG_GFS2_FS_LOCKING_DLM=y -CONFIG_OCFS2_FS=m -CONFIG_BTRFS_FS=y -CONFIG_BTRFS_FS_POSIX_ACL=y -CONFIG_NILFS2_FS=m -CONFIG_FS_DAX=y -CONFIG_EXPORTFS_BLOCK_OPS=y -CONFIG_FS_ENCRYPTION=y -CONFIG_FANOTIFY=y -CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -CONFIG_QFMT_V1=m -CONFIG_QFMT_V2=m -CONFIG_AUTOFS4_FS=m -CONFIG_FUSE_FS=y -CONFIG_CUSE=m -CONFIG_OVERLAY_FS=m -CONFIG_FSCACHE=m -CONFIG_CACHEFILES=m -CONFIG_ISO9660_FS=y -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_UDF_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_NTFS_FS=m -CONFIG_NTFS_RW=y -CONFIG_PROC_KCORE=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_HUGETLBFS=y -CONFIG_CONFIGFS_FS=m -CONFIG_ECRYPT_FS=m -CONFIG_CRAMFS=m -CONFIG_SQUASHFS=m -CONFIG_SQUASHFS_XATTR=y -CONFIG_SQUASHFS_LZO=y -CONFIG_SQUASHFS_XZ=y -CONFIG_ROMFS_FS=m -CONFIG_NFS_FS=m -CONFIG_NFS_V3_ACL=y -CONFIG_NFS_V4=m -CONFIG_NFS_SWAP=y -CONFIG_NFSD=m -CONFIG_NFSD_V3_ACL=y -CONFIG_NFSD_V4=y -CONFIG_NFSD_V4_SECURITY_LABEL=y -CONFIG_CIFS=m -CONFIG_CIFS_STATS=y -CONFIG_CIFS_STATS2=y -CONFIG_CIFS_WEAK_PW_HASH=y -CONFIG_CIFS_UPCALL=y -CONFIG_CIFS_XATTR=y -CONFIG_CIFS_POSIX=y -# CONFIG_CIFS_DEBUG is not set 
-CONFIG_CIFS_DFS_UPCALL=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_CODEPAGE_850=m -CONFIG_NLS_ASCII=m -CONFIG_NLS_ISO8859_1=m -CONFIG_NLS_ISO8859_15=m -CONFIG_NLS_UTF8=m -CONFIG_DLM=m -CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_INFO_DWARF4=y -CONFIG_GDB_SCRIPTS=y -# CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_FRAME_WARN=1024 -CONFIG_UNUSED_SYMBOLS=y -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_MEMORY_INIT=y -CONFIG_PANIC_ON_OOPS=y -CONFIG_RCU_TORTURE_TEST=m -CONFIG_RCU_CPU_STALL_TIMEOUT=60 -CONFIG_LATENCYTOP=y -CONFIG_SCHED_TRACER=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_STACK_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FUNCTION_PROFILER=y -CONFIG_HIST_TRIGGERS=y -CONFIG_LKDTM=m -CONFIG_PERCPU_TEST=m -CONFIG_ATOMIC64_SELFTEST=y -CONFIG_TEST_BPF=m -CONFIG_BUG_ON_DATA_CORRUPTION=y -CONFIG_S390_PTDUMP=y -CONFIG_PERSISTENT_KEYRINGS=y -CONFIG_BIG_KEYS=y -CONFIG_ENCRYPTED_KEYS=m -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_SELINUX=y -CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=0 -CONFIG_SECURITY_SELINUX_DISABLE=y -CONFIG_INTEGRITY_SIGNATURE=y -CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y -CONFIG_IMA=y -CONFIG_IMA_DEFAULT_HASH_SHA256=y -CONFIG_IMA_WRITE_POLICY=y -CONFIG_IMA_APPRAISE=y -CONFIG_CRYPTO_FIPS=y -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m -CONFIG_CRYPTO_USER=m -# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set -CONFIG_CRYPTO_PCRYPT=m -CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA512=m -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m -CONFIG_CRYPTO_AES_TI=m -CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_CAMELLIA=m -CONFIG_CRYPTO_CAST5=m 
-CONFIG_CRYPTO_CAST6=m -CONFIG_CRYPTO_FCRYPT=m -CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m -CONFIG_CRYPTO_SEED=m -CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_TEA=m -CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_842=m -CONFIG_CRYPTO_LZ4=m -CONFIG_CRYPTO_LZ4HC=m -CONFIG_CRYPTO_ANSI_CPRNG=m -CONFIG_CRYPTO_USER_API_HASH=m -CONFIG_CRYPTO_USER_API_SKCIPHER=m -CONFIG_CRYPTO_USER_API_RNG=m -CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_ZCRYPT=m -CONFIG_PKEY=m -CONFIG_CRYPTO_PAES_S390=m -CONFIG_CRYPTO_SHA1_S390=m -CONFIG_CRYPTO_SHA256_S390=m -CONFIG_CRYPTO_SHA512_S390=m -CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_AES_S390=m -CONFIG_CRYPTO_GHASH_S390=m -CONFIG_CRYPTO_CRC32_S390=y -CONFIG_CRC7=m -CONFIG_CRC8=m -CONFIG_CORDIC=m -CONFIG_CMM=m -CONFIG_APPLDATA_BASE=y -CONFIG_KVM=m -CONFIG_KVM_S390_UCONTROL=y -CONFIG_VHOST_NET=m -CONFIG_VHOST_VSOCK=m From 191fa92b344831e3d1b15e0bf698dfe9755a81d2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 17 Jun 2019 14:02:40 +0200 Subject: [PATCH 59/83] s390/sclp: remove call home support This feature has never been used, so remove it. Acked-by: Vasily Gorbik Acked-by: Hendrik Brueckner Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- Documentation/sysctl/kernel.txt | 16 --- drivers/s390/char/Kconfig | 21 ---- drivers/s390/char/Makefile | 1 - drivers/s390/char/sclp_async.c | 189 -------------------------------- 4 files changed, 227 deletions(-) delete mode 100644 drivers/s390/char/sclp_async.c diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index f0c86fbb3b48..5af8b131ccbc 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -23,7 +23,6 @@ show up in /proc/sys/kernel: - auto_msgmni - bootloader_type [ X86 only ] - bootloader_version [ X86 only ] -- callhome [ S390 only ] - cap_last_cap - core_pattern - core_pipe_limit @@ -171,21 +170,6 @@ Documentation/x86/boot.txt for additional information. 
============================================================== -callhome: - -Controls the kernel's callhome behavior in case of a kernel panic. - -The s390 hardware allows an operating system to send a notification -to a service organization (callhome) in case of an operating system panic. - -When the value in this file is 0 (which is the default behavior) -nothing happens in case of a kernel panic. If this value is set to "1" -the complete kernel oops message is send to the IBM customer service -organization in case the mainframe the Linux operating system is running -on has a service contract with IBM. - -============================================================== - cap_last_cap Highest valid capability of the running kernel. Exports diff --git a/drivers/s390/char/Kconfig b/drivers/s390/char/Kconfig index e2c0c60760b0..6cc4b19acf85 100644 --- a/drivers/s390/char/Kconfig +++ b/drivers/s390/char/Kconfig @@ -79,27 +79,6 @@ config SCLP_VT220_CONSOLE Include support for using an IBM SCLP VT220-compatible terminal as a Linux system console. -config SCLP_ASYNC - def_tristate m - prompt "Support for Call Home via Asynchronous SCLP Records" - depends on S390 - help - This option enables the call home function, which is able to inform - the service element and connected organisations about a kernel panic. - You should only select this option if you know what you are doing, - want for inform other people about your kernel panics, - need this feature and intend to run your kernel in LPAR. - -config SCLP_ASYNC_ID - string "Component ID for Call Home" - depends on SCLP_ASYNC - default "000000000" - help - The Component ID for Call Home is used to identify the correct - problem reporting queue the call home records should be sent to. - - If your are unsure, please use the default value "000000000". 
- config HMC_DRV def_tristate m prompt "Support for file transfers from HMC drive CD/DVD-ROM" diff --git a/drivers/s390/char/Makefile b/drivers/s390/char/Makefile index 3072b89785dd..b8a8816d94e7 100644 --- a/drivers/s390/char/Makefile +++ b/drivers/s390/char/Makefile @@ -31,7 +31,6 @@ obj-$(CONFIG_TN3215) += con3215.o obj-$(CONFIG_SCLP_TTY) += sclp_tty.o obj-$(CONFIG_SCLP_CONSOLE) += sclp_con.o obj-$(CONFIG_SCLP_VT220_TTY) += sclp_vt220.o -obj-$(CONFIG_SCLP_ASYNC) += sclp_async.o obj-$(CONFIG_PCI) += sclp_pci.o diff --git a/drivers/s390/char/sclp_async.c b/drivers/s390/char/sclp_async.c deleted file mode 100644 index e69b12a40636..000000000000 --- a/drivers/s390/char/sclp_async.c +++ /dev/null @@ -1,189 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Enable Asynchronous Notification via SCLP. - * - * Copyright IBM Corp. 2009 - * Author(s): Hans-Joachim Picht - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "sclp.h" - -static int callhome_enabled; -static struct sclp_req *request; -static struct sclp_async_sccb *sccb; -static int sclp_async_send_wait(char *message); -static struct ctl_table_header *callhome_sysctl_header; -static DEFINE_SPINLOCK(sclp_async_lock); -#define SCLP_NORMAL_WRITE 0x00 - -struct async_evbuf { - struct evbuf_header header; - u64 reserved; - u8 rflags; - u8 empty; - u8 rtype; - u8 otype; - char comp_id[12]; - char data[3000]; /* there is still some space left */ -} __attribute__((packed)); - -struct sclp_async_sccb { - struct sccb_header header; - struct async_evbuf evbuf; -} __attribute__((packed)); - -static struct sclp_register sclp_async_register = { - .send_mask = EVTYP_ASYNC_MASK, -}; - -static int call_home_on_panic(struct notifier_block *self, - unsigned long event, void *data) -{ - strncat(data, init_utsname()->nodename, - sizeof(init_utsname()->nodename)); - sclp_async_send_wait(data); - return NOTIFY_DONE; -} - -static struct 
notifier_block call_home_panic_nb = { - .notifier_call = call_home_on_panic, - .priority = INT_MAX, -}; - -static int zero; -static int one = 1; - -static struct ctl_table callhome_table[] = { - { - .procname = "callhome", - .data = &callhome_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - {} -}; - -static struct ctl_table kern_dir_table[] = { - { - .procname = "kernel", - .maxlen = 0, - .mode = 0555, - .child = callhome_table, - }, - {} -}; - -/* - * Function used to transfer asynchronous notification - * records which waits for send completion - */ -static int sclp_async_send_wait(char *message) -{ - struct async_evbuf *evb; - int rc; - unsigned long flags; - - if (!callhome_enabled) - return 0; - sccb->evbuf.header.type = EVTYP_ASYNC; - sccb->evbuf.rtype = 0xA5; - sccb->evbuf.otype = 0x00; - evb = &sccb->evbuf; - request->command = SCLP_CMDW_WRITE_EVENT_DATA; - request->sccb = sccb; - request->status = SCLP_REQ_FILLED; - strncpy(sccb->evbuf.data, message, sizeof(sccb->evbuf.data)); - /* - * Retain Queue - * e.g. 
5639CC140 500 Red Hat RHEL5 Linux for zSeries (RHEL AS) - */ - strncpy(sccb->evbuf.comp_id, CONFIG_SCLP_ASYNC_ID, - sizeof(sccb->evbuf.comp_id)); - sccb->evbuf.header.length = sizeof(sccb->evbuf); - sccb->header.length = sizeof(sccb->evbuf) + sizeof(sccb->header); - sccb->header.function_code = SCLP_NORMAL_WRITE; - rc = sclp_add_request(request); - if (rc) - return rc; - spin_lock_irqsave(&sclp_async_lock, flags); - while (request->status != SCLP_REQ_DONE && - request->status != SCLP_REQ_FAILED) { - sclp_sync_wait(); - } - spin_unlock_irqrestore(&sclp_async_lock, flags); - if (request->status != SCLP_REQ_DONE) - return -EIO; - rc = ((struct sclp_async_sccb *) - request->sccb)->header.response_code; - if (rc != 0x0020) - return -EIO; - if (evb->header.flags != 0x80) - return -EIO; - return rc; -} - -static int __init sclp_async_init(void) -{ - int rc; - - rc = sclp_register(&sclp_async_register); - if (rc) - return rc; - rc = -EOPNOTSUPP; - if (!(sclp_async_register.sclp_receive_mask & EVTYP_ASYNC_MASK)) - goto out_sclp; - rc = -ENOMEM; - callhome_sysctl_header = register_sysctl_table(kern_dir_table); - if (!callhome_sysctl_header) - goto out_sclp; - request = kzalloc(sizeof(struct sclp_req), GFP_KERNEL); - sccb = (struct sclp_async_sccb *) get_zeroed_page(GFP_KERNEL | GFP_DMA); - if (!request || !sccb) - goto out_mem; - rc = atomic_notifier_chain_register(&panic_notifier_list, - &call_home_panic_nb); - if (!rc) - goto out; -out_mem: - kfree(request); - free_page((unsigned long) sccb); - unregister_sysctl_table(callhome_sysctl_header); -out_sclp: - sclp_unregister(&sclp_async_register); -out: - return rc; -} -module_init(sclp_async_init); - -static void __exit sclp_async_exit(void) -{ - atomic_notifier_chain_unregister(&panic_notifier_list, - &call_home_panic_nb); - unregister_sysctl_table(callhome_sysctl_header); - sclp_unregister(&sclp_async_register); - free_page((unsigned long) sccb); - kfree(request); -} -module_exit(sclp_async_exit); - 
-MODULE_AUTHOR("Copyright IBM Corp. 2009"); -MODULE_AUTHOR("Hans-Joachim Picht "); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SCLP Asynchronous Notification Records"); From 4f18d869ffd056c7858f3d617c71345cf19be008 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 17 Jun 2019 14:02:41 +0200 Subject: [PATCH 60/83] s390: fix stfle zero padding The stfle inline assembly returns the number of double words written (condition code 0) or the double words it would have written (condition code 3), if the memory array it got as parameter would have been large enough. The current stfle implementation assumes that the array is always large enough and clears those parts of the array that have not been written to with a subsequent memset call. If, however, the array is not large enough, memset will get a negative length parameter, which means that memset clears memory until it gets an exception and the kernel crashes. To fix this simply limit the maximum length. Move also the inline assembly to an extra function to avoid clobbering of register 0, which might happen because of the added min_t invocation together with code instrumentation. The bug was introduced with commit 14375bc4eb8d ("[S390] cleanup facility list handling") but was rather harmless, since it would only write to a rather large array. It became a potential problem with commit 3ab121ab1866 ("[S390] kernel: Add z/VM LGR detection"). Since then it writes to an array with only four double words, while some machines already deliver three double words. As soon as machines have a facility bit within the fifth double word, a crash on IPL would happen. 
Fixes: 14375bc4eb8d ("[S390] cleanup facility list handling") Cc: # v2.6.37+ Reviewed-by: Vasily Gorbik Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/facility.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h index e78cda94456b..68c476b20b57 100644 --- a/arch/s390/include/asm/facility.h +++ b/arch/s390/include/asm/facility.h @@ -59,6 +59,18 @@ static inline int test_facility(unsigned long nr) return __test_facility(nr, &S390_lowcore.stfle_fac_list); } +static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size) +{ + register unsigned long reg0 asm("0") = size - 1; + + asm volatile( + ".insn s,0xb2b00000,0(%1)" /* stfle */ + : "+d" (reg0) + : "a" (stfle_fac_list) + : "memory", "cc"); + return reg0; +} + /** * stfle - Store facility list extended * @stfle_fac_list: array where facility list can be stored @@ -75,13 +87,8 @@ static inline void __stfle(u64 *stfle_fac_list, int size) memcpy(stfle_fac_list, &S390_lowcore.stfl_fac_list, 4); if (S390_lowcore.stfl_fac_list & 0x01000000) { /* More facility bits available with stfle */ - register unsigned long reg0 asm("0") = size - 1; - - asm volatile(".insn s,0xb2b00000,0(%1)" /* stfle */ - : "+d" (reg0) - : "a" (stfle_fac_list) - : "memory", "cc"); - nr = (reg0 + 1) * 8; /* # bytes stored by stfle */ + nr = __stfle_asm(stfle_fac_list, size); + nr = min_t(unsigned long, (nr + 1) * 8, size * 8); } memset((char *) stfle_fac_list + nr, 0, size * 8 - nr); } From 96e5aaf914060a02955b09487e176769a75ae225 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Fri, 14 Jun 2019 11:18:28 +0200 Subject: [PATCH 61/83] s390/cio: move struct node_descriptor to cio.h This allows device drivers (eg. qeth) to use the struct when processing information retrieved via RCD. 
Signed-off-by: Julian Wiedmann Acked-by: Sebastian Ott Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/cio.h | 30 ++++++++++++++++++++++++++++++ drivers/s390/cio/chsc.c | 30 ------------------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 58e7db912c30..b5bfb3123cb1 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -264,6 +264,36 @@ struct ciw { #define CIW_TYPE_SII 0x1 /* set interface identifier */ #define CIW_TYPE_RNI 0x2 /* read node identifier */ +/* + * Node Descriptor as defined in SA22-7204, "Common I/O-Device Commands" + */ + +#define ND_VALIDITY_VALID 0 +#define ND_VALIDITY_OUTDATED 1 +#define ND_VALIDITY_INVALID 2 + +struct node_descriptor { + /* Flags. */ + union { + struct { + u32 validity:3; + u32 reserved:5; + } __packed; + u8 byte0; + } __packed; + + /* Node parameters. */ + u32 params:24; + + /* Node ID. */ + char type[6]; + char model[3]; + char manufacturer[3]; + char plant[2]; + char seq[12]; + u16 tag; +} __packed; + /* * Flags used as input parameters for do_IO() */ diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index a835b31aad99..6392a1b95b02 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -322,36 +322,6 @@ struct chsc_sei { } u; } __packed __aligned(PAGE_SIZE); -/* - * Node Descriptor as defined in SA22-7204, "Common I/O-Device Commands" - */ - -#define ND_VALIDITY_VALID 0 -#define ND_VALIDITY_OUTDATED 1 -#define ND_VALIDITY_INVALID 2 - -struct node_descriptor { - /* Flags. */ - union { - struct { - u32 validity:3; - u32 reserved:5; - } __packed; - u8 byte0; - } __packed; - - /* Node parameters. */ - u32 params:24; - - /* Node ID. 
*/ - char type[6]; - char model[3]; - char manufacturer[3]; - char plant[2]; - char seq[12]; - u16 tag; -} __packed; - /* * Link Incident Record as defined in SA22-7202, "ESCON I/O Interface" */ From 1d897e478da3db91593d86227854802102e2fcd3 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Tue, 18 Jun 2019 22:23:48 +0200 Subject: [PATCH 62/83] vfio-ccw: Move guest_cp storage into common struct Rather than allocating/freeing a piece of memory every time we try to figure out how long a CCW chain is, let's use a piece of memory allocated for each device. The io_mutex added with commit 4f76617378ee9 ("vfio-ccw: protect the I/O region") is held for the duration of the VFIO_CCW_EVENT_IO_REQ event that accesses/uses this space, so there should be no race concerns with another CPU attempting an (unexpected) SSCH for the same device. Suggested-by: Cornelia Huck Signed-off-by: Eric Farman Message-Id: <20190618202352.39702-2-farman@linux.ibm.com> Reviewed-by: Cornelia Huck Reviewed-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 23 ++++------------------- drivers/s390/cio/vfio_ccw_cp.h | 7 +++++++ drivers/s390/cio/vfio_ccw_drv.c | 7 +++++++ 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 90d86e1354c1..f358502376be 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -16,12 +16,6 @@ #include "vfio_ccw_cp.h" -/* - * Max length for ccw chain. - * XXX: Limit to 256, need to check more? - */ -#define CCWCHAIN_LEN_MAX 256 - struct pfn_array { /* Starting guest physical I/O address. 
*/ unsigned long pa_iova; @@ -386,7 +380,7 @@ static void ccwchain_cda_free(struct ccwchain *chain, int idx) */ static int ccwchain_calc_length(u64 iova, struct channel_program *cp) { - struct ccw1 *ccw, *p; + struct ccw1 *ccw = cp->guest_cp; int cnt; /* @@ -394,15 +388,9 @@ static int ccwchain_calc_length(u64 iova, struct channel_program *cp) * Currently the chain length is limited to CCWCHAIN_LEN_MAX (256). * So copying 2K is enough (safe). */ - p = ccw = kcalloc(CCWCHAIN_LEN_MAX, sizeof(*ccw), GFP_KERNEL); - if (!ccw) - return -ENOMEM; - cnt = copy_ccw_from_iova(cp, ccw, iova, CCWCHAIN_LEN_MAX); - if (cnt) { - kfree(ccw); + if (cnt) return cnt; - } cnt = 0; do { @@ -413,10 +401,8 @@ static int ccwchain_calc_length(u64 iova, struct channel_program *cp) * orb specified one of the unsupported formats, we defer * checking for IDAWs in unsupported formats to here. */ - if ((!cp->orb.cmd.c64 || cp->orb.cmd.i2k) && ccw_is_idal(ccw)) { - kfree(p); + if ((!cp->orb.cmd.c64 || cp->orb.cmd.i2k) && ccw_is_idal(ccw)) return -EOPNOTSUPP; - } /* * We want to keep counting if the current CCW has the @@ -435,7 +421,6 @@ static int ccwchain_calc_length(u64 iova, struct channel_program *cp) if (cnt == CCWCHAIN_LEN_MAX + 1) cnt = -EINVAL; - kfree(p); return cnt; } @@ -461,7 +446,7 @@ static int ccwchain_handle_ccw(u32 cda, struct channel_program *cp) struct ccwchain *chain; int len, ret; - /* Get chain length. */ + /* Copy the chain from cda to cp, and count the CCWs in it */ len = ccwchain_calc_length(cda, cp); if (len < 0) return len; diff --git a/drivers/s390/cio/vfio_ccw_cp.h b/drivers/s390/cio/vfio_ccw_cp.h index 3c20cd208da5..7cdc38049033 100644 --- a/drivers/s390/cio/vfio_ccw_cp.h +++ b/drivers/s390/cio/vfio_ccw_cp.h @@ -16,6 +16,12 @@ #include "orb.h" +/* + * Max length for ccw chain. + * XXX: Limit to 256, need to check more? 
+ */ +#define CCWCHAIN_LEN_MAX 256 + /** * struct channel_program - manage information for channel program * @ccwchain_list: list head of ccwchains @@ -32,6 +38,7 @@ struct channel_program { union orb orb; struct device *mdev; bool initialized; + struct ccw1 *guest_cp; }; extern int cp_init(struct channel_program *cp, struct device *mdev, diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 66a66ac1f3d1..34a9a5e3fd36 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -129,6 +129,11 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) if (!private) return -ENOMEM; + private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), + GFP_KERNEL); + if (!private->cp.guest_cp) + goto out_free; + private->io_region = kmem_cache_zalloc(vfio_ccw_io_region, GFP_KERNEL | GFP_DMA); if (!private->io_region) @@ -169,6 +174,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); if (private->io_region) kmem_cache_free(vfio_ccw_io_region, private->io_region); + kfree(private->cp.guest_cp); kfree(private); return ret; } @@ -185,6 +191,7 @@ static int vfio_ccw_sch_remove(struct subchannel *sch) kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); kmem_cache_free(vfio_ccw_io_region, private->io_region); + kfree(private->cp.guest_cp); kfree(private); return 0; From 6246590230e9c1225d2ce8d0160376b4d5511281 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Tue, 18 Jun 2019 22:23:49 +0200 Subject: [PATCH 63/83] vfio-ccw: Skip second copy of guest cp to host We already pinned/copied/unpinned 2K (256 CCWs) of guest memory to the host space anchored off vfio_ccw_private. There's no need to do that again once we have the length calculated, when we could just copy the section we need to the "permanent" space for the I/O. 
Signed-off-by: Eric Farman Message-Id: <20190618202352.39702-3-farman@linux.ibm.com> Reviewed-by: Cornelia Huck Reviewed-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index f358502376be..37d513e86530 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -444,7 +444,7 @@ static int ccwchain_loop_tic(struct ccwchain *chain, static int ccwchain_handle_ccw(u32 cda, struct channel_program *cp) { struct ccwchain *chain; - int len, ret; + int len; /* Copy the chain from cda to cp, and count the CCWs in it */ len = ccwchain_calc_length(cda, cp); @@ -457,12 +457,8 @@ static int ccwchain_handle_ccw(u32 cda, struct channel_program *cp) return -ENOMEM; chain->ch_iova = cda; - /* Copy the new chain from user. */ - ret = copy_ccw_from_iova(cp, chain->ch_ccw, cda, len); - if (ret) { - ccwchain_free(chain); - return ret; - } + /* Copy the actual CCWs into the new chain */ + memcpy(chain->ch_ccw, cp->guest_cp, len * sizeof(struct ccw1)); /* Loop for tics on this new chain. */ return ccwchain_loop_tic(chain, cp); From ded563f31d0eb4de0bcb101b755cdf6f4ed805e4 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Tue, 18 Jun 2019 22:23:50 +0200 Subject: [PATCH 64/83] vfio-ccw: Copy CCW data outside length calculation It doesn't make much sense to "hide" the copy to the channel_program struct inside a routine that calculates the length of the chain. Let's move it to the calling routine, which will later copy from channel_program to the memory it allocated itself. 
Signed-off-by: Eric Farman Message-Id: <20190618202352.39702-4-farman@linux.ibm.com> Reviewed-by: Cornelia Huck Reviewed-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 37d513e86530..a55f8d110920 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -381,18 +381,8 @@ static void ccwchain_cda_free(struct ccwchain *chain, int idx) static int ccwchain_calc_length(u64 iova, struct channel_program *cp) { struct ccw1 *ccw = cp->guest_cp; - int cnt; + int cnt = 0; - /* - * Copy current chain from guest to host kernel. - * Currently the chain length is limited to CCWCHAIN_LEN_MAX (256). - * So copying 2K is enough (safe). - */ - cnt = copy_ccw_from_iova(cp, ccw, iova, CCWCHAIN_LEN_MAX); - if (cnt) - return cnt; - - cnt = 0; do { cnt++; @@ -446,7 +436,12 @@ static int ccwchain_handle_ccw(u32 cda, struct channel_program *cp) struct ccwchain *chain; int len; - /* Copy the chain from cda to cp, and count the CCWs in it */ + /* Copy 2K (the most we support today) of possible CCWs */ + len = copy_ccw_from_iova(cp, cp->guest_cp, cda, CCWCHAIN_LEN_MAX); + if (len) + return len; + + /* Count the CCWs in the current chain */ len = ccwchain_calc_length(cda, cp); if (len < 0) return len; From 7f8e89a8f2fdb58461b01bb33ed8a5aa8119bf17 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Tue, 18 Jun 2019 22:23:51 +0200 Subject: [PATCH 65/83] vfio-ccw: Factor out the ccw0-to-ccw1 transition This is a really useful function, but it's buried in the copy_ccw_from_iova() routine so that ccwchain_calc_length() can just work with Format-1 CCWs while doing its counting. But it means we're translating a full 2K of "CCWs" to Format-1, when in reality there's probably far fewer in that space. Let's factor it out, so maybe we can do something with it later. 
Signed-off-by: Eric Farman Message-Id: <20190618202352.39702-5-farman@linux.ibm.com> Reviewed-by: Cornelia Huck Reviewed-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 48 ++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index a55f8d110920..9a8bf06281e0 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -161,6 +161,27 @@ static inline void pfn_array_idal_create_words( idaws[0] += pa->pa_iova & (PAGE_SIZE - 1); } +void convert_ccw0_to_ccw1(struct ccw1 *source, unsigned long len) +{ + struct ccw0 ccw0; + struct ccw1 *pccw1 = source; + int i; + + for (i = 0; i < len; i++) { + ccw0 = *(struct ccw0 *)pccw1; + if ((pccw1->cmd_code & 0x0f) == CCW_CMD_TIC) { + pccw1->cmd_code = CCW_CMD_TIC; + pccw1->flags = 0; + pccw1->count = 0; + } else { + pccw1->cmd_code = ccw0.cmd_code; + pccw1->flags = ccw0.flags; + pccw1->count = ccw0.count; + } + pccw1->cda = ccw0.cda; + pccw1++; + } +} /* * Within the domain (@mdev), copy @n bytes from a guest physical @@ -211,32 +232,9 @@ static long copy_ccw_from_iova(struct channel_program *cp, struct ccw1 *to, u64 iova, unsigned long len) { - struct ccw0 ccw0; - struct ccw1 *pccw1; int ret; - int i; ret = copy_from_iova(cp->mdev, to, iova, len * sizeof(struct ccw1)); - if (ret) - return ret; - - if (!cp->orb.cmd.fmt) { - pccw1 = to; - for (i = 0; i < len; i++) { - ccw0 = *(struct ccw0 *)pccw1; - if ((pccw1->cmd_code & 0x0f) == CCW_CMD_TIC) { - pccw1->cmd_code = CCW_CMD_TIC; - pccw1->flags = 0; - pccw1->count = 0; - } else { - pccw1->cmd_code = ccw0.cmd_code; - pccw1->flags = ccw0.flags; - pccw1->count = ccw0.count; - } - pccw1->cda = ccw0.cda; - pccw1++; - } - } return ret; } @@ -441,6 +439,10 @@ static int ccwchain_handle_ccw(u32 cda, struct channel_program *cp) if (len) return len; + /* Convert any Format-0 CCWs to Format-1 */ + if (!cp->orb.cmd.fmt) + 
convert_ccw0_to_ccw1(cp->guest_cp, len); + /* Count the CCWs in the current chain */ len = ccwchain_calc_length(cda, cp); if (len < 0) From 5223bee837e8d90d752de744c5702706a7bb13d9 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Tue, 18 Jun 2019 22:23:52 +0200 Subject: [PATCH 66/83] vfio-ccw: Remove copy_ccw_from_iova() Just to keep things tidy. Signed-off-by: Eric Farman Message-Id: <20190618202352.39702-6-farman@linux.ibm.com> Reviewed-by: Cornelia Huck Reviewed-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 9a8bf06281e0..9cddc1288059 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -228,17 +228,6 @@ static long copy_from_iova(struct device *mdev, return l; } -static long copy_ccw_from_iova(struct channel_program *cp, - struct ccw1 *to, u64 iova, - unsigned long len) -{ - int ret; - - ret = copy_from_iova(cp->mdev, to, iova, len * sizeof(struct ccw1)); - - return ret; -} - /* * Helpers to operate ccwchain. */ @@ -435,7 +424,8 @@ static int ccwchain_handle_ccw(u32 cda, struct channel_program *cp) int len; /* Copy 2K (the most we support today) of possible CCWs */ - len = copy_ccw_from_iova(cp, cp->guest_cp, cda, CCWCHAIN_LEN_MAX); + len = copy_from_iova(cp->mdev, cp->guest_cp, cda, + CCWCHAIN_LEN_MAX * sizeof(struct ccw1)); if (len) return len; From dbd66558dd28e69471cac7c1431bb0d8df221498 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 24 Jun 2019 11:07:21 +0200 Subject: [PATCH 67/83] vfio-ccw: make convert_ccw0_to_ccw1 static Reported by sparse. 
Fixes: 7f8e89a8f2fd ("vfio-ccw: Factor out the ccw0-to-ccw1 transition") Signed-off-by: Cornelia Huck Message-Id: <20190624090721.16241-1-cohuck@redhat.com> Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- drivers/s390/cio/vfio_ccw_cp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 9cddc1288059..a7b9dfd5b464 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -161,7 +161,7 @@ static inline void pfn_array_idal_create_words( idaws[0] += pa->pa_iova & (PAGE_SIZE - 1); } -void convert_ccw0_to_ccw1(struct ccw1 *source, unsigned long len) +static void convert_ccw0_to_ccw1(struct ccw1 *source, unsigned long len) { struct ccw0 ccw0; struct ccw1 *pccw1 = source; From ebc3d179150347f3b6d97d8f249378bb2218f95e Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 13 Jun 2019 13:08:15 +0200 Subject: [PATCH 68/83] s390/cio: introduce driver_override on the css bus Sometimes, we want to control which of the matching drivers binds to a subchannel device (e.g. for subchannels we want to handle via vfio-ccw). For pci devices, a mechanism to do so has been introduced in 782a985d7af2 ("PCI: Introduce new device binding path using pci_dev.driver_override"). It makes sense to introduce the driver_override attribute for subchannel devices as well, so that we can easily extend the 'driverctl' tool (which makes use of the driver_override attribute for pci). Note that unlike pci we still require a driver override to match the subchannel type; matching more than one subchannel type is probably not useful anyway. 
Signed-off-by: Cornelia Huck Reviewed-by: Halil Pasic Reviewed-by: Sebastian Ott Signed-off-by: Sebastian Ott Signed-off-by: Vasily Gorbik --- Documentation/ABI/testing/sysfs-bus-css | 23 +++++++++++ drivers/s390/cio/cio.h | 1 + drivers/s390/cio/css.c | 53 +++++++++++++++++++++++++ 3 files changed, 77 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-css b/Documentation/ABI/testing/sysfs-bus-css index 2979c40c10e9..966f8504bd7b 100644 --- a/Documentation/ABI/testing/sysfs-bus-css +++ b/Documentation/ABI/testing/sysfs-bus-css @@ -33,3 +33,26 @@ Description: Contains the PIM/PAM/POM values, as reported by the in sync with the values current in the channel subsystem). Note: This is an I/O-subchannel specific attribute. Users: s390-tools, HAL + +What: /sys/bus/css/devices/.../driver_override +Date: June 2019 +Contact: Cornelia Huck + linux-s390@vger.kernel.org +Description: This file allows the driver for a device to be specified. When + specified, only a driver with a name matching the value written + to driver_override will have an opportunity to bind to the + device. The override is specified by writing a string to the + driver_override file (echo vfio-ccw > driver_override) and + may be cleared with an empty string (echo > driver_override). + This returns the device to standard matching rules binding. + Writing to driver_override does not automatically unbind the + device from its current driver or make any attempt to + automatically load the specified driver. If no driver with a + matching name is currently loaded in the kernel, the device + will not bind to any driver. This also allows devices to + opt-out of driver binding using a driver_override name such as + "none". Only a single driver may be specified in the override, + there is no support for parsing delimiters. + Note that unlike the mechanism of the same name for pci, this + file does not allow to override basic matching rules. 
I.e., + the driver must still match the subchannel type of the device. diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h index 4d6c7d16416e..ba7d2480613b 100644 --- a/drivers/s390/cio/cio.h +++ b/drivers/s390/cio/cio.h @@ -113,6 +113,7 @@ struct subchannel { enum sch_todo todo; struct work_struct todo_work; struct schib_config config; + char *driver_override; /* Driver name to force a match */ } __attribute__ ((aligned(8))); DECLARE_PER_CPU_ALIGNED(struct irb, cio_irb); diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 7159933d9d3e..e1f2d0eed544 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -167,6 +167,7 @@ static void css_subchannel_release(struct device *dev) sch->config.intparm = 0; cio_commit_config(sch); + kfree(sch->driver_override); kfree(sch->lock); kfree(sch); } @@ -323,9 +324,57 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_RO(modalias); +static ssize_t driver_override_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct subchannel *sch = to_subchannel(dev); + char *driver_override, *old, *cp; + + /* We need to keep extra room for a newline */ + if (count >= (PAGE_SIZE - 1)) + return -EINVAL; + + driver_override = kstrndup(buf, count, GFP_KERNEL); + if (!driver_override) + return -ENOMEM; + + cp = strchr(driver_override, '\n'); + if (cp) + *cp = '\0'; + + device_lock(dev); + old = sch->driver_override; + if (strlen(driver_override)) { + sch->driver_override = driver_override; + } else { + kfree(driver_override); + sch->driver_override = NULL; + } + device_unlock(dev); + + kfree(old); + + return count; +} + +static ssize_t driver_override_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct subchannel *sch = to_subchannel(dev); + ssize_t len; + + device_lock(dev); + len = snprintf(buf, PAGE_SIZE, "%s\n", sch->driver_override); + device_unlock(dev); + return len; +} +static 
DEVICE_ATTR_RW(driver_override); + static struct attribute *subch_attrs[] = { &dev_attr_type.attr, &dev_attr_modalias.attr, + &dev_attr_driver_override.attr, NULL, }; @@ -1348,6 +1397,10 @@ static int css_bus_match(struct device *dev, struct device_driver *drv) struct css_driver *driver = to_cssdriver(drv); struct css_device_id *id; + /* When driver_override is set, only bind to the matching driver */ + if (sch->driver_override && strcmp(sch->driver_override, drv->name)) + return 0; + for (id = driver->subchannel_type; id->match_flags; id++) { if (sch->st == id->type) return 1; From 83eb1a415023e6489bf5adb467f20156722172f0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 26 Jun 2019 13:06:58 +0300 Subject: [PATCH 69/83] s390/dasd: Fix a precision vs width bug in dasd_feature_list() The "len" variable is the length of the option up to the next option or to the end of the string, whichever comes first. We want to print the invalid option so we want precision "%.*s" but the format is width "%*s" so it prints up to the end of the string. 
Signed-off-by: Dan Carpenter Tested-by: Stefan Haberland Signed-off-by: Stefan Haberland Signed-off-by: Vasily Gorbik --- drivers/s390/block/dasd_devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index fab35c6170cc..245f33c2f71e 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -203,7 +203,7 @@ static int __init dasd_feature_list(char *str) else if (len == 8 && !strncmp(str, "failfast", 8)) features |= DASD_FEATURE_FAILFAST; else { - pr_warn("%*s is not a supported device option\n", + pr_warn("%.*s is not a supported device option\n", len, str); rc = -EINVAL; } From e54e4785cb5cb4896cf4285964aeef2125612fb2 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 18 Jun 2019 11:25:59 +0200 Subject: [PATCH 70/83] s390/qdio: (re-)initialize tiqdio list entries When tiqdio_remove_input_queues() removes a queue from the tiq_list as part of qdio_shutdown(), it doesn't re-initialize the queue's list entry and the prev/next pointers go stale. If a subsequent qdio_establish() fails while sending the ESTABLISH cmd, it calls qdio_shutdown() again in QDIO_IRQ_STATE_ERR state and tiqdio_remove_input_queues() will attempt to remove the queue entry a second time. This dereferences the stale pointers, and bad things ensue. Fix this by re-initializing the list entry after removing it from the list. For good practice also initialize the list entry when the queue is first allocated, and remove the quirky checks that papered over this omission. Note that prior to commit e521813468f7 ("s390/qdio: fix access to uninitialized qdio_q fields"), these checks were bogus anyway. setup_queues_misc() clears the whole queue struct, and thus needs to re-init the prev/next pointers as well. 
Fixes: 779e6e1c724d ("[S390] qdio: new qdio driver.") Cc: Signed-off-by: Julian Wiedmann Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio_setup.c | 2 ++ drivers/s390/cio/qdio_thinint.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c index 99d7d2566a3a..d4101cecdc8d 100644 --- a/drivers/s390/cio/qdio_setup.c +++ b/drivers/s390/cio/qdio_setup.c @@ -150,6 +150,7 @@ static int __qdio_allocate_qs(struct qdio_q **irq_ptr_qs, int nr_queues) return -ENOMEM; } irq_ptr_qs[i] = q; + INIT_LIST_HEAD(&q->entry); } return 0; } @@ -178,6 +179,7 @@ static void setup_queues_misc(struct qdio_q *q, struct qdio_irq *irq_ptr, q->mask = 1 << (31 - i); q->nr = i; q->handler = handler; + INIT_LIST_HEAD(&q->entry); } static void setup_storage_lists(struct qdio_q *q, struct qdio_irq *irq_ptr, diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c index b84ac7ae8a3e..75e4357c1f9d 100644 --- a/drivers/s390/cio/qdio_thinint.c +++ b/drivers/s390/cio/qdio_thinint.c @@ -87,14 +87,14 @@ void tiqdio_remove_input_queues(struct qdio_irq *irq_ptr) struct qdio_q *q; q = irq_ptr->input_qs[0]; - /* if establish triggered an error */ - if (!q || !q->entry.prev || !q->entry.next) + if (!q) return; mutex_lock(&tiq_list_lock); list_del_rcu(&q->entry); mutex_unlock(&tiq_list_lock); synchronize_rcu(); + INIT_LIST_HEAD(&q->entry); } static inline int has_multiple_inq_on_dsci(struct qdio_irq *irq_ptr) From ac6639cd3db607d386616487902b4cc1850a7be5 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 18 Jun 2019 13:12:20 +0200 Subject: [PATCH 71/83] s390/qdio: don't touch the dsci in tiqdio_add_input_queues() Current code sets the dsci to 0x00000080. Which doesn't make any sense, as the indicator area is located in the _left-most_ byte. Worse: if the dsci is the _shared_ indicator, this potentially clears the indication of activity for a _different_ device. 
tiqdio_thinint_handler() will then have no reason to call that device's IRQ handler, and the device ends up stalling. Fixes: d0c9d4a89fff ("[S390] qdio: set correct bit in dsci") Cc: Signed-off-by: Julian Wiedmann Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio_thinint.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c index 75e4357c1f9d..93ee067c10ca 100644 --- a/drivers/s390/cio/qdio_thinint.c +++ b/drivers/s390/cio/qdio_thinint.c @@ -79,7 +79,6 @@ void tiqdio_add_input_queues(struct qdio_irq *irq_ptr) mutex_lock(&tiq_list_lock); list_add_rcu(&irq_ptr->input_qs[0]->entry, &tiq_list); mutex_unlock(&tiq_list_lock); - xchg(irq_ptr->dsci, 1 << 7); } void tiqdio_remove_input_queues(struct qdio_irq *irq_ptr) From 20955746320e252b41c6b3505587766012e3e06d Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 20 Jun 2019 10:18:31 +0200 Subject: [PATCH 72/83] s390/kasan: avoid false positives during stack unwind Avoid kasan false positive when current task is interrupted in-between stack frame allocation and backchain write instructions leaving new stack frame backchain invalid. In particular if backchain is 0 the unwinder tries to read pt_regs from the stack and might hit kasan poisoned bytes, leading to kasan "stack-out-of-bounds" report. Disable kasan instrumentation of unwinder stack reads, since this limitation couldn't be handled otherwise with current backchain unwinder implementation. 
Fixes: 78c98f907413 ("s390/unwind: introduce stack unwind API") Reported-by: Julian Wiedmann Tested-by: Benjamin Block Signed-off-by: Vasily Gorbik --- arch/s390/kernel/unwind_bc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c index 57fd4e902f1f..3ce8a0808059 100644 --- a/arch/s390/kernel/unwind_bc.c +++ b/arch/s390/kernel/unwind_bc.c @@ -46,18 +46,18 @@ bool unwind_next_frame(struct unwind_state *state) regs = state->regs; if (unlikely(regs)) { - sp = READ_ONCE_TASK_STACK(state->task, regs->gprs[15]); + sp = READ_ONCE_NOCHECK(regs->gprs[15]); if (unlikely(outside_of_stack(state, sp))) { if (!update_stack_info(state, sp)) goto out_err; } sf = (struct stack_frame *) sp; - ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + ip = READ_ONCE_NOCHECK(sf->gprs[8]); reliable = false; regs = NULL; } else { sf = (struct stack_frame *) state->sp; - sp = READ_ONCE_TASK_STACK(state->task, sf->back_chain); + sp = READ_ONCE_NOCHECK(sf->back_chain); if (likely(sp)) { /* Non-zero back-chain points to the previous frame */ if (unlikely(outside_of_stack(state, sp))) { @@ -65,7 +65,7 @@ bool unwind_next_frame(struct unwind_state *state) goto out_err; } sf = (struct stack_frame *) sp; - ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + ip = READ_ONCE_NOCHECK(sf->gprs[8]); reliable = true; } else { /* No back-chain, look for a pt_regs structure */ @@ -73,9 +73,9 @@ bool unwind_next_frame(struct unwind_state *state) if (!on_stack(info, sp, sizeof(struct pt_regs))) goto out_stop; regs = (struct pt_regs *) sp; - if (user_mode(regs)) + if (READ_ONCE_NOCHECK(regs->psw.mask) & PSW_MASK_PSTATE) goto out_stop; - ip = READ_ONCE_TASK_STACK(state->task, regs->psw.addr); + ip = READ_ONCE_NOCHECK(regs->psw.addr); reliable = true; } } @@ -132,11 +132,11 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, /* Get the instruction pointer from pt_regs or the stack frame */ if 
(regs) { - ip = READ_ONCE_TASK_STACK(state->task, regs->psw.addr); + ip = READ_ONCE_NOCHECK(regs->psw.addr); reliable = true; } else { sf = (struct stack_frame *) sp; - ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + ip = READ_ONCE_NOCHECK(sf->gprs[8]); reliable = false; } From da1776733617c4a92319eddb4e765ce60426b20a Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 20 Jun 2019 10:18:35 +0200 Subject: [PATCH 73/83] s390/unwind: cleanup unused READ_ONCE_TASK_STACK Kasan instrumentation of backchain unwinder stack reads is disabled completely and simply uses READ_ONCE_NOCHECK now. READ_ONCE_TASK_STACK macro is unused and could be removed. Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/unwind.h | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h index 6eb2ef105d87..d827b5b9a32c 100644 --- a/arch/s390/include/asm/unwind.h +++ b/arch/s390/include/asm/unwind.h @@ -79,23 +79,4 @@ static inline void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} -#ifdef CONFIG_KASAN -/* - * This disables KASAN checking when reading a value from another task's stack, - * since the other task could be running on another CPU and could have poisoned - * the stack in the meantime. - */ -#define READ_ONCE_TASK_STACK(task, x) \ -({ \ - unsigned long val; \ - if (task == current) \ - val = READ_ONCE(x); \ - else \ - val = READ_ONCE_NOCHECK(x); \ - val; \ -}) -#else -#define READ_ONCE_TASK_STACK(task, x) READ_ONCE(x) -#endif - #endif /* _ASM_S390_UNWIND_H */ From e5282de931057e2baa4bd73235a0773fde6e9649 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Tue, 21 May 2019 17:34:34 +0200 Subject: [PATCH 74/83] s390: ap: kvm: add PQAP interception for AQIC We prepare the interception of the PQAP/AQIC instruction for the case the AQIC facility is enabled in the guest. 
First of all, we do not want to change existing behavior when intercepting AP instructions without the SIE allowing the guest to use AP instructions. In this patch we only handle the AQIC interception allowed by facility 65, which will be enabled when the complete interception infrastructure is present. We add a callback inside the KVM arch structure for s390 for a VFIO driver to handle a specific response to the PQAP instruction with the AQIC command and only this command. But we want to be able to return a correct answer to the guest even if there is no VFIO AP driver in the kernel. Therefore, we inject the correct exceptions from inside KVM for the case the callback is not initialized, which happens when the vfio_ap driver is not loaded. We consider it the responsibility of the driver to always initialize the PQAP callback if it defines queues by initializing the CRYCB for a guest. If the callback has been set up, we call it. If not, we set up an answer considering that no queue is available for the guest when no callback has been set up.
Signed-off-by: Pierre Morel Reviewed-by: Tony Krowiak Acked-by: Harald Freudenberger Acked-by: Christian Borntraeger Signed-off-by: Halil Pasic Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/kvm_host.h | 7 +++ arch/s390/kvm/priv.c | 86 +++++++++++++++++++++++++++ drivers/s390/crypto/vfio_ap_private.h | 2 + 3 files changed, 95 insertions(+) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 2b00a3ebee08..4a928e2c667b 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -720,8 +721,14 @@ struct kvm_s390_cpu_model { unsigned short ibc; }; +struct kvm_s390_module_hook { + int (*hook)(struct kvm_vcpu *vcpu); + struct module *owner; +}; + struct kvm_s390_crypto { struct kvm_s390_crypto_cb *crycb; + struct kvm_s390_module_hook *pqap_hook; __u32 crycbd; __u8 aes_kw; __u8 dea_kw; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 8679bd74d337..ed52ffa8d5d4 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "gaccess.h" #include "kvm-s390.h" #include "trace.h" @@ -592,6 +593,89 @@ static int handle_io_inst(struct kvm_vcpu *vcpu) } } +/* + * handle_pqap: Handling pqap interception + * @vcpu: the vcpu having issue the pqap instruction + * + * We now support PQAP/AQIC instructions and we need to correctly + * answer the guest even if no dedicated driver's hook is available. + * + * The intercepting code calls a dedicated callback for this instruction + * if a driver did register one in the CRYPTO satellite of the + * SIE block. + * + * If no callback is available, the queues are not available, return this + * response code to the caller and set CC to 3. + * Else return the response code returned by the callback. 
+ */ +static int handle_pqap(struct kvm_vcpu *vcpu) +{ + struct ap_queue_status status = {}; + unsigned long reg0; + int ret; + uint8_t fc; + + /* Verify that the AP instruction are available */ + if (!ap_instructions_available()) + return -EOPNOTSUPP; + /* Verify that the guest is allowed to use AP instructions */ + if (!(vcpu->arch.sie_block->eca & ECA_APIE)) + return -EOPNOTSUPP; + /* + * The only possibly intercepted functions when AP instructions are + * available for the guest are AQIC and TAPQ with the t bit set + * since we do not set IC.3 (FIII) we currently will only intercept + * the AQIC function code. + */ + reg0 = vcpu->run->s.regs.gprs[0]; + fc = (reg0 >> 24) & 0xff; + if (WARN_ON_ONCE(fc != 0x03)) + return -EOPNOTSUPP; + + /* PQAP instruction is allowed for guest kernel only */ + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + /* Common PQAP instruction specification exceptions */ + /* bits 41-47 must all be zeros */ + if (reg0 & 0x007f0000UL) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + /* APFT not install and T bit set */ + if (!test_kvm_facility(vcpu->kvm, 15) && (reg0 & 0x00800000UL)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + /* APXA not installed and APID greater 64 or APQI greater 16 */ + if (!(vcpu->kvm->arch.crypto.crycbd & 0x02) && (reg0 & 0x0000c0f0UL)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + /* AQIC function code specific exception */ + /* facility 65 not present for AQIC function code */ + if (!test_kvm_facility(vcpu->kvm, 65)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + /* + * Verify that the hook callback is registered, lock the owner + * and call the hook. 
+ */ + if (vcpu->kvm->arch.crypto.pqap_hook) { + if (!try_module_get(vcpu->kvm->arch.crypto.pqap_hook->owner)) + return -EOPNOTSUPP; + ret = vcpu->kvm->arch.crypto.pqap_hook->hook(vcpu); + module_put(vcpu->kvm->arch.crypto.pqap_hook->owner); + if (!ret && vcpu->run->s.regs.gprs[1] & 0x00ff0000) + kvm_s390_set_psw_cc(vcpu, 3); + return ret; + } + /* + * A vfio_driver must register a hook. + * No hook means no driver to enable the SIE CRYCB and no queues. + * We send this response to the guest. + */ + status.response_code = 0x01; + memcpy(&vcpu->run->s.regs.gprs[1], &status, sizeof(status)); + kvm_s390_set_psw_cc(vcpu, 3); + return 0; +} + static int handle_stfl(struct kvm_vcpu *vcpu) { int rc; @@ -878,6 +962,8 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu) return handle_sthyi(vcpu); case 0x7d: return handle_stsi(vcpu); + case 0xaf: + return handle_pqap(vcpu); case 0xb1: return handle_stfl(vcpu); case 0xb2: diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 76b7f98e47e9..a910be124595 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -16,6 +16,7 @@ #include #include #include +#include #include "ap_bus.h" @@ -81,6 +82,7 @@ struct ap_matrix_mdev { struct ap_matrix matrix; struct notifier_block group_notifier; struct kvm *kvm; + struct kvm_s390_module_hook pqap_hook; }; extern int vfio_ap_mdev_register(void); From 62e358ce586b53562a5efcfdbaddc5bd285e6484 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Tue, 21 May 2019 17:34:35 +0200 Subject: [PATCH 75/83] vfio: ap: register IOMMU VFIO notifier To be able to use the VFIO interface to facilitate the mediated device memory pinning/unpinning we need to register a notifier for IOMMU. While we will start to pin one guest page for the interrupt indicator byte, this is still ok with ballooning as this page will never be used by the guest virtio-balloon driver. So the pinned page will never be freed. 
And even if a broken guest does so, that would not impact the host, as the original page is still controlled by vfio. Signed-off-by: Pierre Morel Reviewed-by: Cornelia Huck Reviewed-by: Tony Krowiak Acked-by: Harald Freudenberger Signed-off-by: Halil Pasic Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/vfio_ap_ops.c | 43 ++++++++++++++++++++++++++- drivers/s390/crypto/vfio_ap_private.h | 2 ++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 900b9cf20ca5..e8e87bf4c744 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -759,6 +759,35 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev, return 0; } +/* + * vfio_ap_mdev_iommu_notifier: IOMMU notifier callback + * + * @nb: The notifier block + * @action: Action to be taken + * @data: data associated with the request + * + * For an UNMAP request, unpin the guest IOVA (the NIB guest address we + * pinned before). Other requests are ignored.
+ * + */ +static int vfio_ap_mdev_iommu_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct ap_matrix_mdev *matrix_mdev; + + matrix_mdev = container_of(nb, struct ap_matrix_mdev, iommu_notifier); + + if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) { + struct vfio_iommu_type1_dma_unmap *unmap = data; + unsigned long g_pfn = unmap->iova >> PAGE_SHIFT; + + vfio_unpin_pages(mdev_dev(matrix_mdev->mdev), &g_pfn, 1); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + static int vfio_ap_mdev_group_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -858,7 +887,17 @@ static int vfio_ap_mdev_open(struct mdev_device *mdev) return ret; } - return 0; + matrix_mdev->iommu_notifier.notifier_call = vfio_ap_mdev_iommu_notifier; + events = VFIO_IOMMU_NOTIFY_DMA_UNMAP; + ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + &events, &matrix_mdev->iommu_notifier); + if (!ret) + return ret; + + vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, + &matrix_mdev->group_notifier); + module_put(THIS_MODULE); + return ret; } static void vfio_ap_mdev_release(struct mdev_device *mdev) @@ -869,6 +908,8 @@ static void vfio_ap_mdev_release(struct mdev_device *mdev) kvm_arch_crypto_clear_masks(matrix_mdev->kvm); vfio_ap_mdev_reset_queues(mdev); + vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + &matrix_mdev->iommu_notifier); vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &matrix_mdev->group_notifier); matrix_mdev->kvm = NULL; diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index a910be124595..18dcc4d769cc 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -81,8 +81,10 @@ struct ap_matrix_mdev { struct list_head node; struct ap_matrix matrix; struct notifier_block group_notifier; + struct notifier_block iommu_notifier; struct kvm *kvm; struct kvm_s390_module_hook pqap_hook; + struct mdev_device *mdev; }; extern 
int vfio_ap_mdev_register(void); From ec89b55e3bce7c8a4bc6b1203280e81342d6745c Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Tue, 21 May 2019 17:34:36 +0200 Subject: [PATCH 76/83] s390: ap: implement PAPQ AQIC interception in kernel We register an AP PQAP instruction hook during the open of the mediated device, and unregister it on release. During the probe of the AP device, we allocate a vfio_ap_queue structure to keep track of the information we need for the PQAP/AQIC instruction interception. In the AP PQAP instruction hook, if we receive a demand to enable IRQs, - we retrieve the vfio_ap_queue based on the APQN we receive in REG0, - we retrieve the page of the guest address, (NIB), from register REG2 - we retrieve the mediated device to use the VFIO pinning infrastructure to pin the page of the guest address, - we retrieve the pointer to KVM to register the guest ISC and retrieve the host ISC - finally we activate GISA If we receive a demand to disable IRQs, - we deactivate GISA - unregister from the GIB - unpin the NIB When removing the AP device from the driver, the device is reset and this process unregisters the GISA from the GIB and unpins the NIB address; then we free the vfio_ap_queue structure. Signed-off-by: Pierre Morel Acked-by: Tony Krowiak Acked-by: Harald Freudenberger Signed-off-by: Halil Pasic Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/vfio_ap_drv.c | 34 ++- drivers/s390/crypto/vfio_ap_ops.c | 337 +++++++++++++++++++++++++- drivers/s390/crypto/vfio_ap_private.h | 11 + 3 files changed, 375 insertions(+), 7 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c index e9824c35c34f..003662aa8060 100644 --- a/drivers/s390/crypto/vfio_ap_drv.c +++ b/drivers/s390/crypto/vfio_ap_drv.c @@ -5,6 +5,7 @@ * Copyright IBM Corp.
2018 * * Author(s): Tony Krowiak + * Pierre Morel */ #include @@ -40,14 +41,45 @@ static struct ap_device_id ap_queue_ids[] = { MODULE_DEVICE_TABLE(vfio_ap, ap_queue_ids); +/** + * vfio_ap_queue_dev_probe: + * + * Allocate a vfio_ap_queue structure and associate it + * with the device as driver_data. + */ static int vfio_ap_queue_dev_probe(struct ap_device *apdev) { + struct vfio_ap_queue *q; + + q = kzalloc(sizeof(*q), GFP_KERNEL); + if (!q) + return -ENOMEM; + dev_set_drvdata(&apdev->device, q); + q->apqn = to_ap_queue(&apdev->device)->qid; + q->saved_isc = VFIO_AP_ISC_INVALID; return 0; } +/** + * vfio_ap_queue_dev_remove: + * + * Takes the matrix lock to avoid actions on this device while removing + * Free the associated vfio_ap_queue structure + */ static void vfio_ap_queue_dev_remove(struct ap_device *apdev) { - /* Nothing to do yet */ + struct vfio_ap_queue *q; + int apid, apqi; + + mutex_lock(&matrix_dev->lock); + q = dev_get_drvdata(&apdev->device); + dev_set_drvdata(&apdev->device, NULL); + apid = AP_QID_CARD(q->apqn); + apqi = AP_QID_QUEUE(q->apqn); + vfio_ap_mdev_reset_queue(apid, apqi, 1); + vfio_ap_irq_disable(q); + kfree(q); + mutex_unlock(&matrix_dev->lock); } static void vfio_ap_matrix_dev_release(struct device *dev) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index e8e87bf4c744..2c9fb1423a39 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -24,6 +24,296 @@ #define VFIO_AP_MDEV_TYPE_HWVIRT "passthrough" #define VFIO_AP_MDEV_NAME_HWVIRT "VFIO AP Passthrough Device" +static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev); + +static int match_apqn(struct device *dev, void *data) +{ + struct vfio_ap_queue *q = dev_get_drvdata(dev); + + return (q->apqn == *(int *)(data)) ? 
1 : 0; +} + +/** + * vfio_ap_get_queue: Retrieve a queue with a specific APQN from a list + * @matrix_mdev: the associated mediated matrix + * @apqn: The queue APQN + * + * Retrieve a queue with a specific APQN from the list of the + * devices of the vfio_ap_drv. + * Verify that the APID and the APQI are set in the matrix. + * + * Returns the pointer to the associated vfio_ap_queue + */ +static struct vfio_ap_queue *vfio_ap_get_queue( + struct ap_matrix_mdev *matrix_mdev, + int apqn) +{ + struct vfio_ap_queue *q; + struct device *dev; + + if (!test_bit_inv(AP_QID_CARD(apqn), matrix_mdev->matrix.apm)) + return NULL; + if (!test_bit_inv(AP_QID_QUEUE(apqn), matrix_mdev->matrix.aqm)) + return NULL; + + dev = driver_find_device(&matrix_dev->vfio_ap_drv->driver, NULL, + &apqn, match_apqn); + if (!dev) + return NULL; + q = dev_get_drvdata(dev); + q->matrix_mdev = matrix_mdev; + put_device(dev); + + return q; +} + +/** + * vfio_ap_wait_for_irqclear + * @apqn: The AP Queue number + * + * Checks the IRQ bit for the status of this APQN using ap_tapq. + * Returns if the ap_tapq function succeeded and the bit is clear. + * Returns if ap_tapq function failed with invalid, deconfigured or + * checkstopped AP. + * Otherwise retries up to 5 times after waiting 20ms. 
+ * + */ +static void vfio_ap_wait_for_irqclear(int apqn) +{ + struct ap_queue_status status; + int retry = 5; + + do { + status = ap_tapq(apqn, NULL); + switch (status.response_code) { + case AP_RESPONSE_NORMAL: + case AP_RESPONSE_RESET_IN_PROGRESS: + if (!status.irq_enabled) + return; + /* Fall through */ + case AP_RESPONSE_BUSY: + msleep(20); + break; + case AP_RESPONSE_Q_NOT_AVAIL: + case AP_RESPONSE_DECONFIGURED: + case AP_RESPONSE_CHECKSTOPPED: + default: + WARN_ONCE(1, "%s: tapq rc %02x: %04x\n", __func__, + status.response_code, apqn); + return; + } + } while (--retry); + + WARN_ONCE(1, "%s: tapq rc %02x: %04x could not clear IR bit\n", + __func__, status.response_code, apqn); +} + +/** + * vfio_ap_free_aqic_resources + * @q: The vfio_ap_queue + * + * Unregisters the ISC in the GIB when the saved ISC not invalid. + * Unpin the guest's page holding the NIB when it exist. + * Reset the saved_pfn and saved_isc to invalid values. + * Clear the pointer to the matrix mediated device. + * + */ +static void vfio_ap_free_aqic_resources(struct vfio_ap_queue *q) +{ + if (q->saved_isc != VFIO_AP_ISC_INVALID && q->matrix_mdev) + kvm_s390_gisc_unregister(q->matrix_mdev->kvm, q->saved_isc); + if (q->saved_pfn && q->matrix_mdev) + vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev), + &q->saved_pfn, 1); + q->saved_pfn = 0; + q->saved_isc = VFIO_AP_ISC_INVALID; + q->matrix_mdev = NULL; +} + +/** + * vfio_ap_irq_disable + * @q: The vfio_ap_queue + * + * Uses ap_aqic to disable the interruption and in case of success, reset + * in progress or IRQ disable command already proceeded: calls + * vfio_ap_wait_for_irqclear() to check for the IRQ bit to be clear + * and calls vfio_ap_free_aqic_resources() to free the resources associated + * with the AP interrupt handling. + * + * In the case the AP is busy, or a reset is in progress, + * retries after 20ms, up to 5 times. + * + * Returns if ap_aqic function failed with invalid, deconfigured or + * checkstopped AP. 
+ */ +struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q) +{ + struct ap_qirq_ctrl aqic_gisa = {}; + struct ap_queue_status status; + int retries = 5; + + do { + status = ap_aqic(q->apqn, aqic_gisa, NULL); + switch (status.response_code) { + case AP_RESPONSE_OTHERWISE_CHANGED: + case AP_RESPONSE_NORMAL: + vfio_ap_wait_for_irqclear(q->apqn); + goto end_free; + case AP_RESPONSE_RESET_IN_PROGRESS: + case AP_RESPONSE_BUSY: + msleep(20); + break; + case AP_RESPONSE_Q_NOT_AVAIL: + case AP_RESPONSE_DECONFIGURED: + case AP_RESPONSE_CHECKSTOPPED: + case AP_RESPONSE_INVALID_ADDRESS: + default: + /* All cases in default means AP not operational */ + WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__, + status.response_code); + goto end_free; + } + } while (retries--); + + WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__, + status.response_code); +end_free: + vfio_ap_free_aqic_resources(q); + return status; +} + +/** + * vfio_ap_setirq: Enable Interruption for a APQN + * + * @dev: the device associated with the ap_queue + * @q: the vfio_ap_queue holding AQIC parameters + * + * Pin the NIB saved in *q + * Register the guest ISC to GIB interface and retrieve the + * host ISC to issue the host side PQAP/AQIC + * + * Response.status may be set to AP_RESPONSE_INVALID_ADDRESS in case the + * vfio_pin_pages failed. + * + * Otherwise return the ap_queue_status returned by the ap_aqic(), + * all retry handling will be done by the guest. 
+ */ +static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q, + int isc, + unsigned long nib) +{ + struct ap_qirq_ctrl aqic_gisa = {}; + struct ap_queue_status status = {}; + struct kvm_s390_gisa *gisa; + struct kvm *kvm; + unsigned long h_nib, g_pfn, h_pfn; + int ret; + + g_pfn = nib >> PAGE_SHIFT; + ret = vfio_pin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1, + IOMMU_READ | IOMMU_WRITE, &h_pfn); + switch (ret) { + case 1: + break; + default: + status.response_code = AP_RESPONSE_INVALID_ADDRESS; + return status; + } + + kvm = q->matrix_mdev->kvm; + gisa = kvm->arch.gisa_int.origin; + + h_nib = (h_pfn << PAGE_SHIFT) | (nib & ~PAGE_MASK); + aqic_gisa.gisc = isc; + aqic_gisa.isc = kvm_s390_gisc_register(kvm, isc); + aqic_gisa.ir = 1; + aqic_gisa.gisa = (uint64_t)gisa >> 4; + + status = ap_aqic(q->apqn, aqic_gisa, (void *)h_nib); + switch (status.response_code) { + case AP_RESPONSE_NORMAL: + /* See if we did clear older IRQ configuration */ + vfio_ap_free_aqic_resources(q); + q->saved_pfn = g_pfn; + q->saved_isc = isc; + break; + case AP_RESPONSE_OTHERWISE_CHANGED: + /* We could not modify IRQ setings: clear new configuration */ + vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1); + kvm_s390_gisc_unregister(kvm, isc); + break; + default: + pr_warn("%s: apqn %04x: response: %02x\n", __func__, q->apqn, + status.response_code); + vfio_ap_irq_disable(q); + break; + } + + return status; +} + +/** + * handle_pqap: PQAP instruction callback + * + * @vcpu: The vcpu on which we received the PQAP instruction + * + * Get the general register contents to initialize internal variables. + * REG[0]: APQN + * REG[1]: IR and ISC + * REG[2]: NIB + * + * Response.status may be set to following Response Code: + * - AP_RESPONSE_Q_NOT_AVAIL: if the queue is not available + * - AP_RESPONSE_DECONFIGURED: if the queue is not configured + * - AP_RESPONSE_NORMAL (0) : in case of successs + * Check vfio_ap_setirq() and vfio_ap_clrirq() for other possible RC. 
+ * We take the matrix_dev lock to ensure serialization on queues and + * mediated device access. + * + * Return 0 if we could handle the request inside KVM. + * otherwise, returns -EOPNOTSUPP to let QEMU handle the fault. + */ +static int handle_pqap(struct kvm_vcpu *vcpu) +{ + uint64_t status; + uint16_t apqn; + struct vfio_ap_queue *q; + struct ap_queue_status qstatus = { + .response_code = AP_RESPONSE_Q_NOT_AVAIL, }; + struct ap_matrix_mdev *matrix_mdev; + + /* If we do not use the AIV facility just go to userland */ + if (!(vcpu->arch.sie_block->eca & ECA_AIV)) + return -EOPNOTSUPP; + + apqn = vcpu->run->s.regs.gprs[0] & 0xffff; + mutex_lock(&matrix_dev->lock); + + if (!vcpu->kvm->arch.crypto.pqap_hook) + goto out_unlock; + matrix_mdev = container_of(vcpu->kvm->arch.crypto.pqap_hook, + struct ap_matrix_mdev, pqap_hook); + + q = vfio_ap_get_queue(matrix_mdev, apqn); + if (!q) + goto out_unlock; + + status = vcpu->run->s.regs.gprs[1]; + + /* If IR bit(16) is set we enable the interrupt */ + if ((status >> (63 - 16)) & 0x01) + qstatus = vfio_ap_irq_enable(q, status & 0x07, + vcpu->run->s.regs.gprs[2]); + else + qstatus = vfio_ap_irq_disable(q); + +out_unlock: + memcpy(&vcpu->run->s.regs.gprs[1], &qstatus, sizeof(qstatus)); + vcpu->run->s.regs.gprs[1] >>= 32; + mutex_unlock(&matrix_dev->lock); + return 0; +} + static void vfio_ap_matrix_init(struct ap_config_info *info, struct ap_matrix *matrix) { @@ -45,8 +335,11 @@ static int vfio_ap_mdev_create(struct kobject *kobj, struct mdev_device *mdev) return -ENOMEM; } + matrix_mdev->mdev = mdev; vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix); mdev_set_drvdata(mdev, matrix_mdev); + matrix_mdev->pqap_hook.hook = handle_pqap; + matrix_mdev->pqap_hook.owner = THIS_MODULE; mutex_lock(&matrix_dev->lock); list_add(&matrix_mdev->node, &matrix_dev->mdev_list); mutex_unlock(&matrix_dev->lock); @@ -62,6 +355,7 @@ static int vfio_ap_mdev_remove(struct mdev_device *mdev) return -EBUSY; mutex_lock(&matrix_dev->lock); + 
vfio_ap_mdev_reset_queues(mdev); list_del(&matrix_mdev->node); mutex_unlock(&matrix_dev->lock); @@ -754,6 +1048,8 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev, } matrix_mdev->kvm = kvm; + kvm_get_kvm(kvm); + kvm->arch.crypto.pqap_hook = &matrix_mdev->pqap_hook; mutex_unlock(&matrix_dev->lock); return 0; @@ -819,15 +1115,36 @@ static int vfio_ap_mdev_group_notifier(struct notifier_block *nb, return NOTIFY_OK; } -static int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi, - unsigned int retry) +static void vfio_ap_irq_disable_apqn(int apqn) +{ + struct device *dev; + struct vfio_ap_queue *q; + + dev = driver_find_device(&matrix_dev->vfio_ap_drv->driver, NULL, + &apqn, match_apqn); + if (dev) { + q = dev_get_drvdata(dev); + vfio_ap_irq_disable(q); + put_device(dev); + } +} + +int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi, + unsigned int retry) { struct ap_queue_status status; + int retry2 = 2; + int apqn = AP_MKQID(apid, apqi); do { - status = ap_zapq(AP_MKQID(apid, apqi)); + status = ap_zapq(apqn); switch (status.response_code) { case AP_RESPONSE_NORMAL: + while (!status.queue_empty && retry2--) { + msleep(20); + status = ap_tapq(apqn, NULL); + } + WARN_ON_ONCE(retry <= 0); return 0; case AP_RESPONSE_RESET_IN_PROGRESS: case AP_RESPONSE_BUSY: @@ -861,6 +1178,7 @@ static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev) */ if (ret) rc = ret; + vfio_ap_irq_disable_apqn(AP_MKQID(apid, apqi)); } } @@ -904,15 +1222,20 @@ static void vfio_ap_mdev_release(struct mdev_device *mdev) { struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); - if (matrix_mdev->kvm) + mutex_lock(&matrix_dev->lock); + if (matrix_mdev->kvm) { kvm_arch_crypto_clear_masks(matrix_mdev->kvm); + matrix_mdev->kvm->arch.crypto.pqap_hook = NULL; + vfio_ap_mdev_reset_queues(mdev); + kvm_put_kvm(matrix_mdev->kvm); + matrix_mdev->kvm = NULL; + } + mutex_unlock(&matrix_dev->lock); - vfio_ap_mdev_reset_queues(mdev); 
vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &matrix_mdev->iommu_notifier); vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &matrix_mdev->group_notifier); - matrix_mdev->kvm = NULL; module_put(THIS_MODULE); } @@ -941,6 +1264,7 @@ static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev, { int ret; + mutex_lock(&matrix_dev->lock); switch (cmd) { case VFIO_DEVICE_GET_INFO: ret = vfio_ap_mdev_get_device_info(arg); @@ -952,6 +1276,7 @@ static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev, ret = -EOPNOTSUPP; break; } + mutex_unlock(&matrix_dev->lock); return ret; } diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 18dcc4d769cc..f46dde56b464 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -4,6 +4,7 @@ * * Author(s): Tony Krowiak * Halil Pasic + * Pierre Morel * * Copyright IBM Corp. 2018 */ @@ -89,5 +90,15 @@ struct ap_matrix_mdev { extern int vfio_ap_mdev_register(void); extern void vfio_ap_mdev_unregister(void); +int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi, + unsigned int retry); +struct vfio_ap_queue { + struct ap_matrix_mdev *matrix_mdev; + unsigned long saved_pfn; + int apqn; +#define VFIO_AP_ISC_INVALID 0xff + unsigned char saved_isc; +}; +struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q); #endif /* _VFIO_AP_PRIVATE_H_ */ From 05f31e3bf6b34fe6e4922868d132f6455f81d5bf Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Tue, 21 May 2019 17:34:37 +0200 Subject: [PATCH 77/83] s390: ap: kvm: Enable PQAP/AQIC facility for the guest AP Queue Interruption Control (AQIC) facility gives the guest the possibility to control interruption for the Cryptographic Adjunct Processor queues. 
Signed-off-by: Pierre Morel Reviewed-by: Tony Krowiak Acked-by: Harald Freudenberger Acked-by: Christian Borntraeger Signed-off-by: Halil Pasic [ Modified while picking: we may not expose STFLE facility 65 unconditionally because AIV is a pre-requirement.] Signed-off-by: Vasily Gorbik --- arch/s390/kvm/kvm-s390.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 28ebd647784c..1c4113f0f2a8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2461,6 +2461,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) set_kvm_facility(kvm->arch.model.fac_list, 147); } + if (css_general_characteristics.aiv && test_facility(65)) + set_kvm_facility(kvm->arch.model.fac_mask, 65); + kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); kvm->arch.model.ibc = sclp.ibc & 0x0fff; From c7ff0e918a7cb39f6bfb2a7bdc30199986ff1571 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 27 Jun 2019 15:13:05 +0200 Subject: [PATCH 78/83] s390/pci: deal with devices that have no support for MIO instructions Unfortunately we have to handle a class of devices that don't support the new MIO instructions. Adjust resource assignment and mapping accordingly. 
Signed-off-by: Sebastian Ott Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 5 +++++ arch/s390/pci/pci.c | 15 ++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 305befd55326..a2399eff84ca 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -194,6 +194,11 @@ int zpci_init_iommu(struct zpci_dev *zdev); void zpci_destroy_iommu(struct zpci_dev *zdev); #ifdef CONFIG_PCI +static inline bool zpci_use_mio(struct zpci_dev *zdev) +{ + return static_branch_likely(&have_mio) && zdev->mio_capable; +} + /* Error handling and recovery */ void zpci_event_error(void *); void zpci_event_availability(void *); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 86ca7f88fb22..b8a64cbb5dea 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -421,12 +421,12 @@ static void zpci_map_resources(struct pci_dev *pdev) if (!len) continue; - if (static_branch_likely(&have_mio)) + if (zpci_use_mio(zdev)) pdev->resource[i].start = (resource_size_t __force) zdev->bars[i].mio_wb; else - pdev->resource[i].start = - (resource_size_t __force) pci_iomap(pdev, i, 0); + pdev->resource[i].start = (resource_size_t __force) + pci_iomap_range_fh(pdev, i, 0, 0); pdev->resource[i].end = pdev->resource[i].start + len - 1; } @@ -444,18 +444,19 @@ static void zpci_map_resources(struct pci_dev *pdev) static void zpci_unmap_resources(struct pci_dev *pdev) { + struct zpci_dev *zdev = to_zpci(pdev); resource_size_t len; int i; - if (static_branch_likely(&have_mio)) + if (zpci_use_mio(zdev)) return; for (i = 0; i < PCI_BAR_COUNT; i++) { len = pci_resource_len(pdev, i); if (!len) continue; - pci_iounmap(pdev, (void __iomem __force *) - pdev->resource[i].start); + pci_iounmap_fh(pdev, (void __iomem __force *) + pdev->resource[i].start); } } @@ -528,7 +529,7 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev, if (zdev->bars[i].val & 4) flags |= IORESOURCE_MEM_64; 
- if (static_branch_likely(&have_mio)) + if (zpci_use_mio(zdev)) addr = (unsigned long) zdev->bars[i].mio_wb; else addr = ZPCI_ADDR(entry); From 6ae3483d411638e471ca0498629b17939f1c20f4 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 27 Jun 2019 15:47:13 +0200 Subject: [PATCH 79/83] s390/pci: correctly handle MIO opt-out Do not issue CLP_SET_ENABLE_MIO after opting out of MIO instruction usage. This should not fix a bug but reduce overhead within firmware. Signed-off-by: Sebastian Ott Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_clp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index d03631dba7c2..9bdff4defef1 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -291,7 +291,7 @@ int clp_enable_fh(struct zpci_dev *zdev, u8 nr_dma_as) goto out; zdev->fh = fh; - if (zdev->mio_capable) { + if (zpci_use_mio(zdev)) { rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_MIO); zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); if (rc) From c382cbc6dbf513d73cf896ad43a3789ad42c2e2f Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Tue, 2 Jul 2019 20:09:28 +0200 Subject: [PATCH 80/83] vfio-ccw: Fix the conversion of Format-0 CCWs to Format-1 When processing Format-0 CCWs, we use the "len" variable as the number of CCWs to convert to Format-1. But that variable contains zero here, and is not a meaningful CCW count until ccwchain_calc_length() returns. Since that routine requires and expects Format-1 CCWs to identify the chaining behavior, the format conversion must be done first. Convert the 2KB we copied even if it's more than we need. 
Fixes: 7f8e89a8f2fd ("vfio-ccw: Factor out the ccw0-to-ccw1 transition") Reported-by: Farhan Ali Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20190702180928.18113-1-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index a7b9dfd5b464..1d4c893ead23 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -431,7 +431,7 @@ static int ccwchain_handle_ccw(u32 cda, struct channel_program *cp) /* Convert any Format-0 CCWs to Format-1 */ if (!cp->orb.cmd.fmt) - convert_ccw0_to_ccw1(cp->guest_cp, len); + convert_ccw0_to_ccw1(cp->guest_cp, CCWCHAIN_LEN_MAX); /* Count the CCWs in the current chain */ len = ccwchain_calc_length(cda, cp); From f11977be1af94c6d42350fe85fa073d24ba75fac Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Wed, 3 Jul 2019 12:19:47 +0200 Subject: [PATCH 81/83] docs: s390: restore important non-kdoc parts of s390dbf.rst Complements the previous commit ("s390: include/asm/debug.h add kerneldoc markups"), which seems to have dropped important non-kdoc parts such as the user space interface (level, size, flush), the description of the views, and the caution regarding strings in the sprintf view. 
Signed-off-by: Steffen Maier Acked-by: Christian Borntraeger Message-Id: <1562149189-1417-2-git-send-email-maier@linux.ibm.com> Signed-off-by: Vasily Gorbik --- Documentation/s390/s390dbf.rst | 339 +++++++++++++++++++++++++++++++++ 1 file changed, 339 insertions(+) diff --git a/Documentation/s390/s390dbf.rst b/Documentation/s390/s390dbf.rst index d2595b548879..01d66251643d 100644 --- a/Documentation/s390/s390dbf.rst +++ b/Documentation/s390/s390dbf.rst @@ -112,6 +112,345 @@ Kernel Interfaces: Predefined views: ----------------- +extern struct debug_view debug_hex_ascii_view; + +extern struct debug_view debug_raw_view; + +extern struct debug_view debug_sprintf_view; + +Examples +-------- + +:: + + /* + * hex_ascii- + raw-view Example + */ + + #include + #include + + static debug_info_t* debug_info; + + static int init(void) + { + /* register 4 debug areas with one page each and 4 byte data field */ + + debug_info = debug_register ("test", 1, 4, 4 ); + debug_register_view(debug_info,&debug_hex_ascii_view); + debug_register_view(debug_info,&debug_raw_view); + + debug_text_event(debug_info, 4 , "one "); + debug_int_exception(debug_info, 4, 4711); + debug_event(debug_info, 3, &debug_info, 4); + + return 0; + } + + static void cleanup(void) + { + debug_unregister (debug_info); + } + + module_init(init); + module_exit(cleanup); + +:: + + /* + * sprintf-view Example + */ + + #include + #include + + static debug_info_t* debug_info; + + static int init(void) + { + /* register 4 debug areas with one page each and data field for */ + /* format string pointer + 2 varargs (= 3 * sizeof(long)) */ + + debug_info = debug_register ("test", 1, 4, sizeof(long) * 3); + debug_register_view(debug_info,&debug_sprintf_view); + + debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__); + debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info); + + return 0; + } + + static void cleanup(void) + { + debug_unregister (debug_info); + } + + 
module_init(init); + module_exit(cleanup); + +Debugfs Interface +----------------- +Views to the debug logs can be investigated through reading the corresponding +debugfs-files: + +Example:: + + > ls /sys/kernel/debug/s390dbf/dasd + flush hex_ascii level pages raw + > cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s + 00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | .... + 00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE + 00 00974733272:682213 2 - 02 0006adf6 07 ea 4a 90 | .... + 00 00974733272:682281 1 * 02 0006ab08 41 4c 4c 43 | EXCP + 01 00974733272:682284 2 - 02 0006ab16 45 43 4b 44 | ECKD + 01 00974733272:682287 2 - 02 0006ab28 00 00 00 04 | .... + 01 00974733272:682289 2 - 02 0006ab3e 00 00 00 20 | ... + 01 00974733272:682297 2 - 02 0006ad7e 07 ea 4a 90 | .... + 01 00974733272:684384 2 - 00 0006ade6 46 52 45 45 | FREE + 01 00974733272:684388 2 - 00 0006adf6 07 ea 4a 90 | .... + +See section about predefined views for explanation of the above output! + +Changing the debug level +------------------------ + +Example:: + + + > cat /sys/kernel/debug/s390dbf/dasd/level + 3 + > echo "5" > /sys/kernel/debug/s390dbf/dasd/level + > cat /sys/kernel/debug/s390dbf/dasd/level + 5 + +Flushing debug areas +-------------------- +Debug areas can be flushed with piping the number of the desired +area (0...n) to the debugfs file "flush". When using "-" all debug areas +are flushed. + +Examples: + +1. Flush debug area 0:: + + > echo "0" > /sys/kernel/debug/s390dbf/dasd/flush + +2. Flush all debug areas:: + + > echo "-" > /sys/kernel/debug/s390dbf/dasd/flush + +Changing the size of debug areas +------------------------------------ +It is possible the change the size of debug areas through piping +the number of pages to the debugfs file "pages". The resize request will +also flush the debug areas. 
+ +Example: + +Define 4 pages for the debug areas of debug feature "dasd":: + + > echo "4" > /sys/kernel/debug/s390dbf/dasd/pages + +Stooping the debug feature +-------------------------- +Example: + +1. Check if stopping is allowed:: + + > cat /proc/sys/s390dbf/debug_stoppable + +2. Stop debug feature:: + + > echo 0 > /proc/sys/s390dbf/debug_active + +lcrash Interface +---------------- +It is planned that the dump analysis tool lcrash gets an additional command +'s390dbf' to display all the debug logs. With this tool it will be possible +to investigate the debug logs on a live system and with a memory dump after +a system crash. + +Investigating raw memory +------------------------ +One last possibility to investigate the debug logs at a live +system and after a system crash is to look at the raw memory +under VM or at the Service Element. +It is possible to find the anker of the debug-logs through +the 'debug_area_first' symbol in the System map. Then one has +to follow the correct pointers of the data-structures defined +in debug.h and find the debug-areas in memory. +Normally modules which use the debug feature will also have +a global variable with the pointer to the debug-logs. Following +this pointer it will also be possible to find the debug logs in +memory. + +For this method it is recommended to use '16 * x + 4' byte (x = 0..n) +for the length of the data field in debug_register() in +order to see the debug entries well formatted. + + +Predefined Views +---------------- + +There are three predefined views: hex_ascii, raw and sprintf. +The hex_ascii view shows the data field in hex and ascii representation +(e.g. '45 43 4b 44 | ECKD'). +The raw view returns a bytestream as the debug areas are stored in memory. + +The sprintf view formats the debug entries in the same way as the sprintf +function would do. The sprintf event/exception functions write to the +debug entry a pointer to the format string (size = sizeof(long)) +and for each vararg a long value. 
So e.g. for a debug entry with a format +string plus two varargs one would need to allocate a (3 * sizeof(long)) +byte data area in the debug_register() function. + +IMPORTANT: + Using "%s" in sprintf event functions is dangerous. You can only + use "%s" in the sprintf event functions, if the memory for the passed string + is available as long as the debug feature exists. The reason behind this is + that due to performance considerations only a pointer to the string is stored + in the debug feature. If you log a string that is freed afterwards, you will + get an OOPS when inspecting the debug feature, because then the debug feature + will access the already freed memory. + +NOTE: + If using the sprintf view do NOT use other event/exception functions + than the sprintf-event and -exception functions. + +The format of the hex_ascii and sprintf view is as follows: + +- Number of area +- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated + Universal Time (UTC), January 1, 1970) +- level of debug entry +- Exception flag (* = Exception) +- Cpu-Number of calling task +- Return Address to caller +- data field + +The format of the raw view is: + +- Header as described in debug.h +- datafield + +A typical line of the hex_ascii view will look like the following (first line +is only for explanation and will not be displayed when 'cating' the view): + +area time level exception cpu caller data (hex + ascii) +-------------------------------------------------------------------------- +00 00964419409:440690 1 - 00 88023fe + + +Defining views +-------------- + +Views are specified with the 'debug_view' structure. 
There are defined +callback functions which are used for reading and writing the debugfs files:: + + struct debug_view { + char name[DEBUG_MAX_PROCF_LEN]; + debug_prolog_proc_t* prolog_proc; + debug_header_proc_t* header_proc; + debug_format_proc_t* format_proc; + debug_input_proc_t* input_proc; + void* private_data; + }; + +where:: + + typedef int (debug_header_proc_t) (debug_info_t* id, + struct debug_view* view, + int area, + debug_entry_t* entry, + char* out_buf); + + typedef int (debug_format_proc_t) (debug_info_t* id, + struct debug_view* view, char* out_buf, + const char* in_buf); + typedef int (debug_prolog_proc_t) (debug_info_t* id, + struct debug_view* view, + char* out_buf); + typedef int (debug_input_proc_t) (debug_info_t* id, + struct debug_view* view, + struct file* file, const char* user_buf, + size_t in_buf_size, loff_t* offset); + + +The "private_data" member can be used as pointer to view specific data. +It is not used by the debug feature itself. + +The output when reading a debugfs file is structured like this:: + + "prolog_proc output" + + "header_proc output 1" "format_proc output 1" + "header_proc output 2" "format_proc output 2" + "header_proc output 3" "format_proc output 3" + ... + +When a view is read from the debugfs, the Debug Feature calls the +'prolog_proc' once for writing the prolog. +Then 'header_proc' and 'format_proc' are called for each +existing debug entry. + +The input_proc can be used to implement functionality when it is written to +the view (e.g. like with 'echo "0" > /sys/kernel/debug/s390dbf/dasd/level). + +For header_proc there can be used the default function +debug_dflt_header_fn() which is defined in debug.h. +and which produces the same header output as the predefined views. +E.g:: + + 00 00964419409:440761 2 - 00 88023ec + +In order to see how to use the callback functions check the implementation +of the default views! 
+ +Example:: + + #include + + #define UNKNOWNSTR "data: %08x" + + const char* messages[] = + {"This error...........\n", + "That error...........\n", + "Problem..............\n", + "Something went wrong.\n", + "Everything ok........\n", + NULL + }; + + static int debug_test_format_fn( + debug_info_t * id, struct debug_view *view, + char *out_buf, const char *in_buf + ) + { + int i, rc = 0; + + if(id->buf_size >= 4) { + int msg_nr = *((int*)in_buf); + if(msg_nr < sizeof(messages)/sizeof(char*) - 1) + rc += sprintf(out_buf, "%s", messages[msg_nr]); + else + rc += sprintf(out_buf, UNKNOWNSTR, msg_nr); + } + out: + return rc; + } + + struct debug_view debug_test_view = { + "myview", /* name of view */ + NULL, /* no prolog */ + &debug_dflt_header_fn, /* default header for each entry */ + &debug_test_format_fn, /* our own format function */ + NULL, /* no input function */ + NULL /* no private data */ + }; + +test: +===== + :: debug_info_t *debug_info; From 0328e519a726ff6e4abacba838eb00415171c34b Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Wed, 3 Jul 2019 12:19:48 +0200 Subject: [PATCH 82/83] docs: s390: unify and update s390dbf kdocs at debug.c For non-static-inlines, debug.c already had non-compliant function header docs. So move the pure prototype kdocs of ("s390: include/asm/debug.h add kerneldoc markups") from debug.h to debug.c and merge them with the old function docs. Also, I had the impression that kdoc typically is at the implementation in the compile unit rather than at the prototype in the header file. While at it, update the short kdoc description to distinguish the different functions. And a few more consistency cleanups. Added a new kdoc for debug_set_critical() since debug.h comments it as part of the API. 
Signed-off-by: Steffen Maier Acked-by: Christian Borntraeger Message-Id: <1562149189-1417-3-git-send-email-maier@linux.ibm.com> Signed-off-by: Vasily Gorbik --- Documentation/s390/s390dbf.rst | 1 + arch/s390/include/asm/debug.h | 112 +++++---------------------------- arch/s390/kernel/debug.c | 105 +++++++++++++++++++++++++------ 3 files changed, 102 insertions(+), 116 deletions(-) diff --git a/Documentation/s390/s390dbf.rst b/Documentation/s390/s390dbf.rst index 01d66251643d..be42892b159e 100644 --- a/Documentation/s390/s390dbf.rst +++ b/Documentation/s390/s390dbf.rst @@ -107,6 +107,7 @@ will stay deactivated. Kernel Interfaces: ------------------ +.. kernel-doc:: arch/s390/kernel/debug.c .. kernel-doc:: arch/s390/include/asm/debug.h Predefined views: diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index 02c36eedd780..310134015541 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -95,77 +95,19 @@ debug_entry_t *debug_exception_common(debug_info_t *id, int level, /* Debug Feature API: */ -/** - * debug_register() - allocates memory for a debug log. - * - * @name: Name of debug log (e.g. used for debugfs entry) - * @pages: Number of pages, which will be allocated per area - * @nr_areas: Number of debug areas - * @buf_size: Size of data area in each debug entry - * - * Return: - * - Handler for generated debug area - * - %NULL if register failed - * - * Must not be called within an interrupt handler. - */ debug_info_t *debug_register(const char *name, int pages, int nr_areas, int buf_size); -/** - * debug_register_mode() - allocates memory for a debug log. - * - * @name: Name of debug log (e.g. used for debugfs entry) - * @pages: Number of pages, which will be allocated per area - * @nr_areas: Number of debug areas - * @buf_size: Size of data area in each debug entry - * @mode: File mode for debugfs files. E.g. S_IRWXUGO - * @uid: User ID for debugfs files. Currently only 0 is supported. 
- * @gid: Group ID for debugfs files. Currently only 0 is supported. - * - * Return: - * - Handler for generated debug area - * - %NULL if register failed - * - * Must not be called within an interrupt handler - */ debug_info_t *debug_register_mode(const char *name, int pages, int nr_areas, int buf_size, umode_t mode, uid_t uid, gid_t gid); -/** - * debug_unregister() - frees memory for a debug log and removes all - * registered debug - * views. - * - * @id: handle for debug log - * - * Return: - * none - * - * Must not be called within an interrupt handler - */ void debug_unregister(debug_info_t *id); -/** - * debug_set_level() - Sets new actual debug level if new_level is valid. - * - * @id: handle for debug log - * @new_level: new debug level - * - * Return: - * none - */ void debug_set_level(debug_info_t *id, int new_level); void debug_set_critical(void); -/** - * debug_stop_all() - stops the debug feature if stopping is allowed. - * - * Return: - * - none - */ void debug_stop_all(void); /** @@ -184,7 +126,7 @@ static inline bool debug_level_enabled(debug_info_t *id, int level) } /** - * debug_event() - writes debug entry to active debug area + * debug_event() - writes binary debug entry to active debug area * (if level <= actual debug level) * * @id: handle for debug log @@ -194,6 +136,7 @@ static inline bool debug_level_enabled(debug_info_t *id, int level) * * Return: * - Address of written debug entry + * - %NULL if error */ static inline debug_entry_t *debug_event(debug_info_t *id, int level, void *data, int length) @@ -204,7 +147,7 @@ static inline debug_entry_t *debug_event(debug_info_t *id, int level, } /** - * debug_int_event() - writes debug entry to active debug area + * debug_int_event() - writes unsigned integer debug entry to active debug area * (if level <= actual debug level) * * @id: handle for debug log @@ -226,12 +169,12 @@ static inline debug_entry_t *debug_int_event(debug_info_t *id, int level, } /** - * debug_long_event() - writes debug 
entry to active debug area + * debug_long_event() - writes unsigned long debug entry to active debug area * (if level <= actual debug level) * * @id: handle for debug log * @level: debug level - * @tag: integer value for debug entry + * @tag: long integer value for debug entry * * Return: * - Address of written debug entry @@ -248,7 +191,7 @@ static inline debug_entry_t *debug_long_event(debug_info_t *id, int level, } /** - * debug_text_event() - writes debug entry in ascii format to active + * debug_text_event() - writes string debug entry in ascii format to active * debug area (if level <= actual debug level) * * @id: handle for debug log @@ -306,9 +249,9 @@ __debug_sprintf_event(debug_info_t *id, int level, char *string, ...) }) /** - * debug_exception() - writes debug entry to active debug area - * (if level <= actual debug level) and switches - * to next debug area + * debug_exception() - writes binary debug entry to active debug area + * (if level <= actual debug level) + * and switches to next debug area * * @id: handle for debug log * @level: debug level @@ -328,7 +271,7 @@ static inline debug_entry_t *debug_exception(debug_info_t *id, int level, } /** - * debug_int_exception() - writes debug entry to active debug area + * debug_int_exception() - writes unsigned int debug entry to active debug area * (if level <= actual debug level) * and switches to next debug area * @@ -351,13 +294,13 @@ static inline debug_entry_t *debug_int_exception(debug_info_t *id, int level, } /** - * debug_long_exception() - writes debug entry to active debug area + * debug_long_exception() - writes long debug entry to active debug area * (if level <= actual debug level) * and switches to next debug area * * @id: handle for debug log * @level: debug level - * @tag: integer value for debug entry + * @tag: long integer value for debug entry * * Return: * - Address of written debug entry @@ -374,9 +317,9 @@ static inline debug_entry_t *debug_long_exception (debug_info_t *id, int 
level, } /** - * debug_text_exception() - writes debug entry in ascii format to active + * debug_text_exception() - writes string debug entry in ascii format to active * debug area (if level <= actual debug level) - * and switches to next debug + * and switches to next debug area * area * * @id: handle for debug log @@ -407,7 +350,7 @@ __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) /** * debug_sprintf_exception() - writes debug entry with format string and * varargs (longs) to active debug area - * (if level $<=$ actual debug level) + * (if level <= actual debug level) * and switches to next debug area. * * @_id: handle for debug log @@ -435,33 +378,8 @@ __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) __ret; \ }) -/** - * debug_register_view() - registers new debug view and creates debugfs - * dir entry - * - * @id: handle for debug log - * @view: pointer to debug view struct - * - * Return: - * - 0 : ok - * - < 0: Error - */ int debug_register_view(debug_info_t *id, struct debug_view *view); -/** - * debug_unregister_view() - * - * @id: handle for debug log - * @view: pointer to debug view struct - * - * Return: - * - 0 : ok - * - < 0: Error - * - * - * unregisters debug view and removes debugfs dir entry - */ - int debug_unregister_view(debug_info_t *id, struct debug_view *view); /* diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 0ebf08c3b35e..6d321f5f101d 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -647,11 +647,23 @@ static int debug_close(struct inode *inode, struct file *file) return 0; /* success */ } -/* - * debug_register_mode: - * - Creates and initializes debug area for the caller - * The mode parameter allows to specify access rights for the s390dbf files - * - Returns handle for debug area +/** + * debug_register_mode() - creates and initializes debug area. + * + * @name: Name of debug log (e.g. 
used for debugfs entry) + * @pages_per_area: Number of pages, which will be allocated per area + * @nr_areas: Number of debug areas + * @buf_size: Size of data area in each debug entry + * @mode: File mode for debugfs files. E.g. S_IRWXUGO + * @uid: User ID for debugfs files. Currently only 0 is supported. + * @gid: Group ID for debugfs files. Currently only 0 is supported. + * + * Return: + * - Handle for generated debug area + * - %NULL if register failed + * + * Allocates memory for a debug log. + * Must not be called within an interrupt handler. */ debug_info_t *debug_register_mode(const char *name, int pages_per_area, int nr_areas, int buf_size, umode_t mode, @@ -681,10 +693,21 @@ debug_info_t *debug_register_mode(const char *name, int pages_per_area, } EXPORT_SYMBOL(debug_register_mode); -/* - * debug_register: - * - creates and initializes debug area for the caller - * - returns handle for debug area +/** + * debug_register() - creates and initializes debug area with default file mode. + * + * @name: Name of debug log (e.g. used for debugfs entry) + * @pages_per_area: Number of pages, which will be allocated per area + * @nr_areas: Number of debug areas + * @buf_size: Size of data area in each debug entry + * + * Return: + * - Handle for generated debug area + * - %NULL if register failed + * + * Allocates memory for a debug log. + * The debugfs file mode access permissions are read and write for user. + * Must not be called within an interrupt handler. */ debug_info_t *debug_register(const char *name, int pages_per_area, int nr_areas, int buf_size) @@ -694,9 +717,13 @@ debug_info_t *debug_register(const char *name, int pages_per_area, } EXPORT_SYMBOL(debug_register); -/* - * debug_unregister: - * - give back debug area +/** + * debug_unregister() - give back debug area. 
+ * + * @id: handle for debug log + * + * Return: + * none */ void debug_unregister(debug_info_t *id) { @@ -745,9 +772,14 @@ static int debug_set_size(debug_info_t *id, int nr_areas, int pages_per_area) return rc; } -/* - * debug_set_level: - * - set actual debug level +/** + * debug_set_level() - Sets new actual debug level if new_level is valid. + * + * @id: handle for debug log + * @new_level: new debug level + * + * Return: + * none */ void debug_set_level(debug_info_t *id, int new_level) { @@ -873,6 +905,14 @@ static struct ctl_table s390dbf_dir_table[] = { static struct ctl_table_header *s390dbf_sysctl_header; +/** + * debug_stop_all() - stops the debug feature if stopping is allowed. + * + * Return: + * - none + * + * Currently used in case of a kernel oops. + */ void debug_stop_all(void) { if (debug_stoppable) @@ -880,6 +920,17 @@ void debug_stop_all(void) } EXPORT_SYMBOL(debug_stop_all); +/** + * debug_set_critical() - event/exception functions try lock instead of spin. + * + * Return: + * - none + * + * Currently used in case of stopping all CPUs but the current one. + * Once in this state, functions to write a debug entry for an + * event or exception no longer spin on the debug area lock, + * but only try to get it and fail if they do not get the lock. 
+ */ void debug_set_critical(void) { debug_critical = 1; @@ -1036,8 +1087,16 @@ debug_entry_t *__debug_sprintf_exception(debug_info_t *id, int level, char *stri } EXPORT_SYMBOL(__debug_sprintf_exception); -/* - * debug_register_view: +/** + * debug_register_view() - registers new debug view and creates debugfs + * dir entry + * + * @id: handle for debug log + * @view: pointer to debug view struct + * + * Return: + * - 0 : ok + * - < 0: Error */ int debug_register_view(debug_info_t *id, struct debug_view *view) { @@ -1077,8 +1136,16 @@ int debug_register_view(debug_info_t *id, struct debug_view *view) } EXPORT_SYMBOL(debug_register_view); -/* - * debug_unregister_view: +/** + * debug_unregister_view() - unregisters debug view and removes debugfs + * dir entry + * + * @id: handle for debug log + * @view: pointer to debug view struct + * + * Return: + * - 0 : ok + * - < 0: Error */ int debug_unregister_view(debug_info_t *id, struct debug_view *view) { From 499723d12063aab97dfe6b41c822e9c1c74eff3e Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Wed, 3 Jul 2019 12:19:49 +0200 Subject: [PATCH 83/83] docs: s390: s390dbf: typos and formatting, update crash command Signed-off-by: Steffen Maier Acked-by: Christian Borntraeger Message-Id: <1562149189-1417-4-git-send-email-maier@linux.ibm.com> Signed-off-by: Vasily Gorbik --- Documentation/s390/s390dbf.rst | 122 ++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 54 deletions(-) diff --git a/Documentation/s390/s390dbf.rst b/Documentation/s390/s390dbf.rst index be42892b159e..cdb36842b898 100644 --- a/Documentation/s390/s390dbf.rst +++ b/Documentation/s390/s390dbf.rst @@ -23,7 +23,8 @@ The debug feature may also very useful for kernel and driver development. Design: ------- Kernel components (e.g. device drivers) can register themselves at the debug -feature with the function call debug_register(). This function initializes a +feature with the function call :c:func:`debug_register()`. 
+This function initializes a debug log for the caller. For each debug log exists a number of debug areas where exactly one is active at one time. Each debug area consists of contiguous pages in memory. In the debug areas there are stored debug entries (log records) @@ -44,8 +45,9 @@ The debug areas themselves are also ordered in form of a ring buffer. When an exception is thrown in the last debug area, the following debug entries are then written again in the very first area. -There are three versions for the event- and exception-calls: One for -logging raw data, one for text and one for numbers. +There are four versions for the event- and exception-calls: One for +logging raw data, one for text, one for numbers (unsigned int and long), +and one for sprintf-like formatted strings. Each debug entry contains the following data: @@ -56,29 +58,29 @@ Each debug entry contains the following data: - Flag, if entry is an exception or not The debug logs can be inspected in a live system through entries in -the debugfs-filesystem. Under the toplevel directory "s390dbf" there is +the debugfs-filesystem. Under the toplevel directory "``s390dbf``" there is a directory for each registered component, which is named like the corresponding component. The debugfs normally should be mounted to -/sys/kernel/debug therefore the debug feature can be accessed under -/sys/kernel/debug/s390dbf. +``/sys/kernel/debug`` therefore the debug feature can be accessed under +``/sys/kernel/debug/s390dbf``. The content of the directories are files which represent different views to the debug log. Each component can decide which views should be -used through registering them with the function debug_register_view(). +used through registering them with the function :c:func:`debug_register_view()`. Predefined views for hex/ascii, sprintf and raw binary data are provided. It is also possible to define other views. The content of a view can be inspected simply by reading the corresponding debugfs file. 
All debug logs have an actual debug level (range from 0 to 6). -The default level is 3. Event and Exception functions have a 'level' +The default level is 3. Event and Exception functions have a :c:data:`level` parameter. Only debug entries with a level that is lower or equal than the actual level are written to the log. This means, when writing events, high priority log entries should have a low level value whereas low priority entries should have a high one. The actual debug level can be changed with the help of the debugfs-filesystem -through writing a number string "x" to the 'level' debugfs file which is +through writing a number string "x" to the ``level`` debugfs file which is provided for every debug log. Debugging can be switched off completely -by using "-" on the 'level' debugfs file. +by using "-" on the ``level`` debugfs file. Example:: @@ -86,21 +88,21 @@ Example:: It is also possible to deactivate the debug feature globally for every debug log. You can change the behavior using 2 sysctl parameters in -/proc/sys/s390dbf: +``/proc/sys/s390dbf``: There are currently 2 possible triggers, which stop the debug feature -globally. The first possibility is to use the "debug_active" sysctl. If -set to 1 the debug feature is running. If "debug_active" is set to 0 the +globally. The first possibility is to use the ``debug_active`` sysctl. If +set to 1 the debug feature is running. If ``debug_active`` is set to 0 the debug feature is turned off. The second trigger which stops the debug feature is a kernel oops. That prevents the debug feature from overwriting debug information that happened before the oops. After an oops you can reactivate the debug feature -by piping 1 to /proc/sys/s390dbf/debug_active. Nevertheless, its not +by piping 1 to ``/proc/sys/s390dbf/debug_active``. Nevertheless, it's not suggested to use an oopsed kernel in a production environment. 
If you want to disallow the deactivation of the debug feature, you can use -the "debug_stoppable" sysctl. If you set "debug_stoppable" to 0 the debug +the ``debug_stoppable`` sysctl. If you set ``debug_stoppable`` to 0 the debug feature cannot be stopped. If the debug feature is already stopped, it will stay deactivated. @@ -113,16 +115,18 @@ Kernel Interfaces: Predefined views: ----------------- -extern struct debug_view debug_hex_ascii_view; +.. code-block:: c -extern struct debug_view debug_raw_view; + extern struct debug_view debug_hex_ascii_view; -extern struct debug_view debug_sprintf_view; + extern struct debug_view debug_raw_view; + + extern struct debug_view debug_sprintf_view; Examples -------- -:: +.. code-block:: c /* * hex_ascii- + raw-view Example @@ -131,15 +135,15 @@ Examples #include #include - static debug_info_t* debug_info; + static debug_info_t *debug_info; static int init(void) { /* register 4 debug areas with one page each and 4 byte data field */ - debug_info = debug_register ("test", 1, 4, 4 ); - debug_register_view(debug_info,&debug_hex_ascii_view); - debug_register_view(debug_info,&debug_raw_view); + debug_info = debug_register("test", 1, 4, 4 ); + debug_register_view(debug_info, &debug_hex_ascii_view); + debug_register_view(debug_info, &debug_raw_view); debug_text_event(debug_info, 4 , "one "); debug_int_exception(debug_info, 4, 4711); @@ -150,13 +154,13 @@ Examples static void cleanup(void) { - debug_unregister (debug_info); + debug_unregister(debug_info); } module_init(init); module_exit(cleanup); -:: +.. 
code-block:: c /* * sprintf-view Example @@ -165,15 +169,15 @@ Examples #include #include - static debug_info_t* debug_info; + static debug_info_t *debug_info; static int init(void) { /* register 4 debug areas with one page each and data field for */ /* format string pointer + 2 varargs (= 3 * sizeof(long)) */ - debug_info = debug_register ("test", 1, 4, sizeof(long) * 3); - debug_register_view(debug_info,&debug_sprintf_view); + debug_info = debug_register("test", 1, 4, sizeof(long) * 3); + debug_register_view(debug_info, &debug_sprintf_view); debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__); debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info); @@ -183,7 +187,7 @@ Examples static void cleanup(void) { - debug_unregister (debug_info); + debug_unregister(debug_info); } module_init(init); @@ -252,7 +256,7 @@ Define 4 pages for the debug areas of debug feature "dasd":: > echo "4" > /sys/kernel/debug/s390dbf/dasd/pages -Stooping the debug feature +Stopping the debug feature -------------------------- Example: @@ -264,10 +268,11 @@ Example: > echo 0 > /proc/sys/s390dbf/debug_active -lcrash Interface +crash Interface ---------------- -It is planned that the dump analysis tool lcrash gets an additional command -'s390dbf' to display all the debug logs. With this tool it will be possible +The ``crash`` tool since v5.1.0 has a built-in command +``s390dbf`` to display all the debug logs or export them to the file system. +With this tool it is possible to investigate the debug logs on a live system and with a memory dump after a system crash. @@ -276,8 +281,8 @@ Investigating raw memory One last possibility to investigate the debug logs at a live system and after a system crash is to look at the raw memory under VM or at the Service Element. -It is possible to find the anker of the debug-logs through -the 'debug_area_first' symbol in the System map. 
Then one has +It is possible to find the anchor of the debug-logs through +the ``debug_area_first`` symbol in the System map. Then one has to follow the correct pointers of the data-structures defined in debug.h and find the debug-areas in memory. Normally modules which use the debug feature will also have @@ -286,7 +291,7 @@ this pointer it will also be possible to find the debug logs in memory. For this method it is recommended to use '16 * x + 4' byte (x = 0..n) -for the length of the data field in debug_register() in +for the length of the data field in :c:func:`debug_register()` in order to see the debug entries well formatted. @@ -295,7 +300,7 @@ Predefined Views There are three predefined views: hex_ascii, raw and sprintf. The hex_ascii view shows the data field in hex and ascii representation -(e.g. '45 43 4b 44 | ECKD'). +(e.g. ``45 43 4b 44 | ECKD``). The raw view returns a bytestream as the debug areas are stored in memory. The sprintf view formats the debug entries in the same way as the sprintf @@ -335,18 +340,20 @@ The format of the raw view is: - datafield A typical line of the hex_ascii view will look like the following (first line -is only for explanation and will not be displayed when 'cating' the view): +is only for explanation and will not be displayed when 'cating' the view):: -area time level exception cpu caller data (hex + ascii) --------------------------------------------------------------------------- -00 00964419409:440690 1 - 00 88023fe + area time level exception cpu caller data (hex + ascii) + -------------------------------------------------------------------------- + 00 00964419409:440690 1 - 00 88023fe Defining views -------------- Views are specified with the 'debug_view' structure. There are defined -callback functions which are used for reading and writing the debugfs files:: +callback functions which are used for reading and writing the debugfs files: + +.. 
code-block:: c struct debug_view { char name[DEBUG_MAX_PROCF_LEN]; @@ -357,7 +364,9 @@ callback functions which are used for reading and writing the debugfs files:: void* private_data; }; -where:: +where: + +.. code-block:: c typedef int (debug_header_proc_t) (debug_info_t* id, struct debug_view* view, @@ -395,10 +404,10 @@ Then 'header_proc' and 'format_proc' are called for each existing debug entry. The input_proc can be used to implement functionality when it is written to -the view (e.g. like with 'echo "0" > /sys/kernel/debug/s390dbf/dasd/level). +the view (e.g. like with ``echo "0" > /sys/kernel/debug/s390dbf/dasd/level``). For header_proc there can be used the default function -debug_dflt_header_fn() which is defined in debug.h. +:c:func:`debug_dflt_header_fn()` which is defined in debug.h. and which produces the same header output as the predefined views. E.g:: @@ -407,7 +416,9 @@ E.g:: In order to see how to use the callback functions check the implementation of the default views! -Example:: +Example: + +.. code-block:: c #include @@ -423,21 +434,20 @@ Example:: }; static int debug_test_format_fn( - debug_info_t * id, struct debug_view *view, + debug_info_t *id, struct debug_view *view, char *out_buf, const char *in_buf ) { int i, rc = 0; - if(id->buf_size >= 4) { + if (id->buf_size >= 4) { int msg_nr = *((int*)in_buf); - if(msg_nr < sizeof(messages)/sizeof(char*) - 1) + if (msg_nr < sizeof(messages) / sizeof(char*) - 1) rc += sprintf(out_buf, "%s", messages[msg_nr]); else rc += sprintf(out_buf, UNKNOWNSTR, msg_nr); } - out: - return rc; + return rc; } struct debug_view debug_test_view = { @@ -452,13 +462,17 @@ Example:: test: ===== -:: +.. code-block:: c debug_info_t *debug_info; + int i; ... 
- debug_info = debug_register ("test", 0, 4, 4 )); + debug_info = debug_register("test", 0, 4, 4); debug_register_view(debug_info, &debug_test_view); - for(i = 0; i < 10; i ++) debug_int_event(debug_info, 1, i); + for (i = 0; i < 10; i ++) + debug_int_event(debug_info, 1, i); + +:: > cat /sys/kernel/debug/s390dbf/test/myview 00 00964419734:611402 1 - 00 88042ca This error...........