Discussion:
[Xen-devel] [PATCH v6 06/11] pci: add support to size ROM BARs to pci_size_mem_bar
Roger Pau Monne
2017-09-19 15:29:31 UTC
Permalink
Signed-off-by: Roger Pau Monné <***@citrix.com>
---
Cc: Jan Beulich <***@suse.com>
---
Changes since v5:
- Use the flags field.
- Introduce a mask local variable.
- Simplify return.

Changes since v4:
- New in this version.
---
xen/drivers/passthrough/pci.c | 29 +++++++++++++++--------------
xen/include/xen/pci.h | 2 ++
2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index ba58b4d0cc..92c1f9354a 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -610,11 +610,17 @@ int pci_size_mem_bar(pci_sbdf_t sbdf, unsigned int pos, bool last,
sbdf.func, pos);
uint64_t addr, size;
bool vf = flags & PCI_BAR_VF;
-
- ASSERT((bar & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_MEMORY);
+ bool rom = flags & PCI_BAR_ROM;
+ bool is64bits = !rom && (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+ PCI_BASE_ADDRESS_MEM_TYPE_64;
+ uint32_t mask = rom ? (uint32_t)PCI_ROM_ADDRESS_MASK
+ : (uint32_t)PCI_BASE_ADDRESS_MEM_MASK;
+
+ ASSERT(!(rom && vf));
+ ASSERT(rom ||
+ (bar & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_MEMORY);
pci_conf_write32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos, ~0);
- if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
- PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ if ( is64bits )
{
if ( last )
{
@@ -627,10 +633,9 @@ int pci_size_mem_bar(pci_sbdf_t sbdf, unsigned int pos, bool last,
hi = pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos + 4);
pci_conf_write32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos + 4, ~0);
}
- size = pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos) &
- PCI_BASE_ADDRESS_MEM_MASK;
- if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
- PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ size = pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func,
+ pos) & mask;
+ if ( is64bits )
{
size |= (uint64_t)pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev,
sbdf.func, pos + 4) << 32;
@@ -640,17 +645,13 @@ int pci_size_mem_bar(pci_sbdf_t sbdf, unsigned int pos, bool last,
size |= (uint64_t)~0 << 32;
pci_conf_write32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos, bar);
size = -size;
- addr = (bar & PCI_BASE_ADDRESS_MEM_MASK) | ((uint64_t)hi << 32);
+ addr = (bar & mask) | ((uint64_t)hi << 32);

if ( paddr )
*paddr = addr;
*psize = size;

- if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
- PCI_BASE_ADDRESS_MEM_TYPE_64 )
- return 2;
-
- return 1;
+ return is64bits ? 2 : 1;
}

int pci_add_device(u16 seg, u8 bus, u8 devfn,
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 2bee6a3247..4489edf9b5 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -191,6 +191,8 @@ const char *parse_pci_seg(const char *, unsigned int *seg, unsigned int *bus,

#define _PCI_BAR_VF 0
#define PCI_BAR_VF (1u << _PCI_BAR_VF)
+#define _PCI_BAR_ROM 1
+#define PCI_BAR_ROM (1u << _PCI_BAR_ROM)
int pci_size_mem_bar(pci_sbdf_t sbdf, unsigned int pos, bool last,
uint64_t *addr, uint64_t *size, unsigned int flags);
--
2.11.0 (Apple Git-81)
Roger Pau Monne
2017-09-19 15:29:35 UTC
Permalink
This is needed for MSI-X, since MSI-X will need to be initialized
before parsing the BARs, so that the header BAR handlers are aware of
the MSI-X related holes and make sure they are not mapped in order for
the trap handlers to work properly.

Signed-off-by: Roger Pau Monné <***@citrix.com>
Reviewed-by: Jan Beulich <***@suse.com>
---
Cc: Jan Beulich <***@suse.com>
Cc: Andrew Cooper <***@citrix.com>
---
Changes since v4:
- Add a middle priority and add the PCI header to it.

Changes since v3:
- Add a numerical suffix to the section used to store the pointer to
each initializer function, and sort them at link time.
---
xen/arch/arm/xen.lds.S | 4 ++--
xen/arch/x86/xen.lds.S | 4 ++--
xen/drivers/vpci/header.c | 2 +-
xen/drivers/vpci/msi.c | 2 +-
xen/include/xen/vpci.h | 8 ++++++--
5 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/xen/arch/arm/xen.lds.S b/xen/arch/arm/xen.lds.S
index eb14909645..4a08435f7e 100644
--- a/xen/arch/arm/xen.lds.S
+++ b/xen/arch/arm/xen.lds.S
@@ -68,7 +68,7 @@ SECTIONS

#if defined(CONFIG_HAS_PCI) && defined(CONFIG_LATE_HWDOM)
__start_vpci_array = .;
- *(.data.vpci)
+ *(SORT(.data.vpci.*))
__end_vpci_array = .;
#endif
} :text
@@ -182,7 +182,7 @@ SECTIONS

#if defined(CONFIG_HAS_PCI) && !defined(CONFIG_LATE_HWDOM)
__start_vpci_array = .;
- *(.data.vpci)
+ *(SORT(.data.vpci.*))
__end_vpci_array = .;
#endif
} :text
diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
index 61775953d6..3c44fb410e 100644
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -127,7 +127,7 @@ SECTIONS

#if defined(CONFIG_HAS_PCI) && defined(CONFIG_LATE_HWDOM)
__start_vpci_array = .;
- *(.data.vpci)
+ *(SORT(.data.vpci.*))
__end_vpci_array = .;
#endif
} :text
@@ -222,7 +222,7 @@ SECTIONS

#if defined(CONFIG_HAS_PCI) && !defined(CONFIG_LATE_HWDOM)
__start_vpci_array = .;
- *(.data.vpci)
+ *(SORT(.data.vpci.*))
__end_vpci_array = .;
#endif
} :text
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index c0d38c8b91..07a6bbf0be 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -465,7 +465,7 @@ static int vpci_init_bars(struct pci_dev *pdev)

return 0;
}
-REGISTER_VPCI_INIT(vpci_init_bars);
+REGISTER_VPCI_INIT(vpci_init_bars, VPCI_PRIORITY_MIDDLE);

/*
* Local variables:
diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
index 933adba0ff..7a0b0521c5 100644
--- a/xen/drivers/vpci/msi.c
+++ b/xen/drivers/vpci/msi.c
@@ -307,7 +307,7 @@ static int vpci_init_msi(struct pci_dev *pdev)

return 0;
}
-REGISTER_VPCI_INIT(vpci_init_msi);
+REGISTER_VPCI_INIT(vpci_init_msi, VPCI_PRIORITY_LOW);

void vpci_dump_msi(void)
{
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 5b582b8012..c6913631c0 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -13,9 +13,13 @@ typedef void vpci_write_t(const struct pci_dev *pdev, unsigned int reg,

typedef int vpci_register_init_t(struct pci_dev *dev);

-#define REGISTER_VPCI_INIT(x) \
+#define VPCI_PRIORITY_HIGH "1"
+#define VPCI_PRIORITY_MIDDLE "5"
+#define VPCI_PRIORITY_LOW "9"
+
+#define REGISTER_VPCI_INIT(x, p) \
static vpci_register_init_t *const x##_entry \
- __used_section(".data.vpci") = x
+ __used_section(".data.vpci." p) = x

/* Add vPCI handlers to device. */
int __must_check vpci_add_handlers(struct pci_dev *dev);
--
2.11.0 (Apple Git-81)
Roger Pau Monne
2017-09-19 15:29:29 UTC
Permalink
So that MMCFG regions not present in the MCFG ACPI table can be added
at run time by the hardware domain.

Signed-off-by: Roger Pau Monné <***@citrix.com>
---
Cc: Jan Beulich <***@suse.com>
Cc: Andrew Cooper <***@citrix.com>
---
Changes since v5:
- Check for has_vpci before calling register_vpci_mmcfg_handler
instead of checking for is_hvm_domain.

Changes since v4:
- Change the hardware_domain check in hvm_physdev_op to a vpci check.
- Only register the MMCFG area, but don't scan it.

Changes since v3:
- New in this version.
---
xen/arch/x86/hvm/hypercall.c | 4 ++++
xen/arch/x86/hvm/io.c | 7 +++----
xen/arch/x86/physdev.c | 11 +++++++++++
3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/hvm/hypercall.c b/xen/arch/x86/hvm/hypercall.c
index 5742dd1797..d81160c1f7 100644
--- a/xen/arch/x86/hvm/hypercall.c
+++ b/xen/arch/x86/hvm/hypercall.c
@@ -89,6 +89,10 @@ static long hvm_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
if ( !has_pirq(curr->domain) )
return -ENOSYS;
break;
+ case PHYSDEVOP_pci_mmcfg_reserved:
+ if ( !has_vpci(curr->domain) )
+ return -ENOSYS;
+ break;
}

if ( !curr->hcall_compat )
diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
index 7ee20eb5d4..ff167bdfc7 100644
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -496,10 +496,9 @@ static const struct hvm_mmio_ops vpci_mmcfg_ops = {
.write = vpci_mmcfg_write,
};

-int __hwdom_init register_vpci_mmcfg_handler(struct domain *d, paddr_t addr,
- unsigned int start_bus,
- unsigned int end_bus,
- unsigned int seg)
+int register_vpci_mmcfg_handler(struct domain *d, paddr_t addr,
+ unsigned int start_bus, unsigned int end_bus,
+ unsigned int seg)
{
struct hvm_mmcfg *mmcfg;

diff --git a/xen/arch/x86/physdev.c b/xen/arch/x86/physdev.c
index 0eb409758f..b36add32f1 100644
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -559,6 +559,17 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)

ret = pci_mmcfg_reserved(info.address, info.segment,
info.start_bus, info.end_bus, info.flags);
+ if ( !ret && has_vpci(currd) )
+ {
+ /*
+ * For HVM (PVH) domains try to add the newly found MMCFG to the
+ * domain.
+ */
+ ret = register_vpci_mmcfg_handler(currd, info.address,
+ info.start_bus, info.end_bus,
+ info.segment);
+ }
+
break;
}
--
2.11.0 (Apple Git-81)
Jan Beulich
2017-10-04 08:31:37 UTC
Permalink
Post by Roger Pau Monne
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -559,6 +559,17 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
ret = pci_mmcfg_reserved(info.address, info.segment,
info.start_bus, info.end_bus, info.flags);
+ if ( !ret && has_vpci(currd) )
+ {
+ /*
+ * For HVM (PVH) domains try to add the newly found MMCFG to the
+ * domain.
+ */
+ ret = register_vpci_mmcfg_handler(currd, info.address,
+ info.start_bus, info.end_bus,
+ info.segment);
+ }
+
break;
}
I think it is wrong to report back -EEXIST here for an exact match
region which we have on record already.

Jan
Roger Pau Monne
2017-09-19 15:29:33 UTC
Permalink
Introduce a set of handlers that trap accesses to the PCI BARs and the command
register, in order to snoop BAR sizing and BAR relocation.

The command handler is used to detect changes to bit 2 (response to
memory space accesses), and maps/unmaps the BARs of the device into
the guest p2m. A rangeset is used in order to figure out which memory
to map/unmap. This makes it easier to keep track of the possible
overlaps with other BARs, and will also simplify MSI-X support, where
certain regions of a BAR might be used for the MSI-X table or PBA.

The BAR register handlers are used to detect attempts by the guest to size or
relocate the BARs.

Note that the long running BAR mapping and unmapping operations are
deferred to be performed by hvm_io_pending, so that they can be safely
preempted.

Signed-off-by: Roger Pau Monné <***@citrix.com>
---
Cc: Andrew Cooper <***@citrix.com>
Cc: George Dunlap <***@eu.citrix.com>
Cc: Ian Jackson <***@eu.citrix.com>
Cc: Jan Beulich <***@suse.com>
Cc: Konrad Rzeszutek Wilk <***@oracle.com>
Cc: Stefano Stabellini <***@kernel.org>
Cc: Tim Deegan <***@xen.org>
Cc: Wei Liu <***@citrix.com>
---
Changes since v5:
- Switch to the new handler type.
- Use pci_sbdf_t to size the BARs.
- Use a single return for vpci_modify_bar.
- Do not return an error code from vpci_modify_bars, just log the
failure.
- Remove the 'sizing' parameter. Instead just let the guest write
directly to the BAR, and read the value back. This simplifies the
BAR register handlers, especially the read one.
- Ignore ROM BAR writes with memory decoding enabled and ROM enabled.
- Do not propagate failures to setup the ROM BAR in vpci_init_bars.
- Add preemption support to the BAR mapping/unmapping operations.

Changes since v4:
- Expand commit message to mention the reason behind the usage of
rangesets.
- Fix comment related to the inclusiveness of rangesets.
- Fix off-by-one error in the calculation of the end of memory
regions.
- Store the state of the BAR (mapped/unmapped) in the vpci_bar
enabled field, which was previously only used by ROMs.
- Fix double negation of return code.
- Modify vpci_cmd_write so it has a single call to pci_conf_write16.
- Print a warning when trying to write to the BAR with memory
decoding enabled (and ignore the write).
- Remove header_type local variable, it's used only once.
- Move the read of the command register.
- Restore previous command register value in the exit paths.
- Only set address to INVALID_PADDR if the initial BAR value matches
~0 & PCI_BASE_ADDRESS_MEM_MASK.
- Don't disable the enabled bit in the expansion ROM register, memory
decoding is already disabled and takes precedence.
- Don't use INVALID_PADDR, just set the initial BAR address to the
value found in the hardware.
- Introduce rom_enabled to store the status of the
PCI_ROM_ADDRESS_ENABLE bit.
- Reorder fields of the structure to prevent holes.

Changes since v3:
- Propagate previous changes: drop xen_ prefix and use u8/u16/u32
instead of the previous half_word/word/double_word.
- Constify some of the parameters.
- s/VPCI_BAR_MEM/VPCI_BAR_MEM32/.
- Simplify the number of fields stored for each BAR, a single address
field is stored and contains the address of the BAR both on Xen and
in the guest.
- Allow the guest to move the BARs around in the physical memory map.
- Add support for expansion ROM BARs.
- Do not cache the value of the command register.
- Remove a label used in vpci_cmd_write.
- Fix the calculation of the sizing mask in vpci_bar_write.
- Check the memory decode bit in order to decide if a BAR is
positioned or not.
- Disable memory decoding before sizing the BARs in Xen.
- When mapping/unmapping BARs check if there's overlap between BARs,
in order to avoid unmapping memory required by another BAR.
- Introduce a macro to check whether a BAR is mappable or not.
- Add a comment regarding the lack of support for SR-IOV.
- Remove the usage of the GENMASK macro.

Changes since v2:
- Detect unset BARs and allow the hardware domain to position them.
---
xen/arch/x86/hvm/ioreq.c | 4 +
xen/drivers/vpci/Makefile | 2 +-
xen/drivers/vpci/header.c | 478 ++++++++++++++++++++++++++++++++++++++++++++++
xen/include/xen/sched.h | 8 +
xen/include/xen/vpci.h | 41 ++++
5 files changed, 532 insertions(+), 1 deletion(-)
create mode 100644 xen/drivers/vpci/header.c

diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index 3e7a88e053..f6588ceab4 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -26,6 +26,7 @@
#include <xen/domain.h>
#include <xen/event.h>
#include <xen/paging.h>
+#include <xen/vpci.h>

#include <asm/hvm/hvm.h>
#include <asm/hvm/ioreq.h>
@@ -48,6 +49,9 @@ bool_t hvm_io_pending(struct vcpu *v)
struct domain *d = v->domain;
struct hvm_ioreq_server *s;

+ if ( has_vpci(v->domain) && vpci_check_pending(v) )
+ return 1;
+
list_for_each_entry ( s,
&d->arch.hvm_domain.ioreq_server.list,
list_entry )
diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
index 840a906470..241467212f 100644
--- a/xen/drivers/vpci/Makefile
+++ b/xen/drivers/vpci/Makefile
@@ -1 +1 @@
-obj-y += vpci.o
+obj-y += vpci.o header.o
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
new file mode 100644
index 0000000000..c0d38c8b91
--- /dev/null
+++ b/xen/drivers/vpci/header.c
@@ -0,0 +1,478 @@
+/*
+ * Generic functionality for handling accesses to the PCI header from the
+ * configuration space.
+ *
+ * Copyright (C) 2017 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/sched.h>
+#include <xen/vpci.h>
+#include <xen/p2m-common.h>
+#include <xen/softirq.h>
+
+#include <asm/event.h>
+
+#define MAPPABLE_BAR(x) \
+ ((x)->type == VPCI_BAR_MEM32 || (x)->type == VPCI_BAR_MEM64_LO || \
+ (x)->type == VPCI_BAR_ROM)
+
+struct map_data {
+ struct domain *d;
+ bool map;
+};
+
+static int vpci_map_range(unsigned long s, unsigned long e, void *data,
+ unsigned long *c)
+{
+ const struct map_data *map = data;
+ int rc;
+
+ for ( ; ; )
+ {
+ unsigned long size = e - s + 1;
+
+ rc = (map->map ? map_mmio_regions : unmap_mmio_regions)
+ (map->d, _gfn(s), size, _mfn(s));
+ if ( rc == 0 )
+ {
+ *c += size;
+ break;
+ }
+ if ( rc < 0 )
+ {
+ printk(XENLOG_G_WARNING
+ "Failed to identity %smap [%" PRI_gfn ", %" PRI_gfn ") for d%d: %d\n",
+ map ? "" : "un", s, e, map->d->domain_id, rc);
+ break;
+ }
+ *c += rc;
+ s += rc;
+ if ( general_preempt_check() )
+ {
+ if ( !is_idle_vcpu(current) )
+ return -ERESTART;
+
+ process_pending_softirqs();
+ }
+ }
+
+ return rc;
+}
+
+static int vpci_map_memory(struct domain *d, struct rangeset *mem, bool map)
+{
+ struct map_data data = { .d = d, .map = map };
+
+ return rangeset_consume_ranges(mem, vpci_map_range, &data);
+}
+
+bool vpci_check_pending(struct vcpu *v)
+{
+ if ( v->vpci.mem )
+ {
+ int rc = vpci_map_memory(v->domain, v->vpci.mem, v->vpci.map);
+
+ if ( rc == -ERESTART )
+ return true;
+
+ rangeset_destroy(v->vpci.mem);
+ v->vpci.mem = NULL;
+ }
+
+ return false;
+}
+
+static int vpci_maybe_defer_map(struct domain *d, struct rangeset *mem,
+ bool map)
+{
+ struct vcpu *curr = current;
+ int rc = 0;
+
+ if ( is_idle_vcpu(curr) )
+ {
+ rc = vpci_map_memory(d, mem, map);
+ rangeset_destroy(mem);
+ }
+ else
+ {
+ ASSERT(curr->domain == d);
+ curr->vpci.mem = mem;
+ curr->vpci.map = map;
+ }
+
+ return rc;
+}
+
+static int vpci_check_bar_overlap(const struct pci_dev *pdev,
+ const struct vpci_bar *rom,
+ struct rangeset *mem)
+{
+ const struct pci_dev *cmp;
+
+ /* Check for overlaps with other device's BARs. */
+ list_for_each_entry(cmp, &pdev->domain->arch.pdev_list, domain_list)
+ {
+ unsigned int i;
+
+ if ( rom == NULL && pdev == cmp )
+ continue;
+
+ for ( i = 0; i < ARRAY_SIZE(cmp->vpci->header.bars); i++ )
+ {
+ const struct vpci_bar *bar = &cmp->vpci->header.bars[i];
+ unsigned long start = PFN_DOWN(bar->addr);
+ unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
+ int rc;
+
+ if ( rom == bar || !bar->enabled || !MAPPABLE_BAR(bar) ||
+ !rangeset_overlaps_range(mem, start, end) )
+ continue;
+
+ rc = rangeset_remove_range(mem, start, end);
+ if ( rc )
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static void vpci_modify_bars(const struct pci_dev *pdev, bool map)
+{
+ struct vpci_header *header = &pdev->vpci->header;
+ struct rangeset *mem = rangeset_new(NULL, NULL, 0);
+ unsigned int i;
+ int rc;
+
+ if ( !mem )
+ return;
+
+ /*
+ * Create a rangeset that represents the current device BARs memory region
+ * and compare it against all the currently active BAR memory regions. If
+ * an overlap is found, subtract it from the region to be
+ * mapped/unmapped.
+ *
+ * NB: the rangeset uses inclusive frame numbers.
+ */
+
+ /* First fill the rangeset with all the BARs of this device. */
+ for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ {
+ const struct vpci_bar *bar = &header->bars[i];
+
+ if ( !MAPPABLE_BAR(bar) ||
+ (bar->type == VPCI_BAR_ROM && !bar->rom_enabled) )
+ continue;
+
+ rc = rangeset_add_range(mem, PFN_DOWN(bar->addr),
+ PFN_DOWN(bar->addr + bar->size - 1));
+ if ( rc )
+ {
+ rangeset_destroy(mem);
+ return;
+ }
+ }
+
+ /* Check for overlaps with other device's BARs. */
+ rc = vpci_check_bar_overlap(pdev, NULL, mem);
+ if ( rc )
+ {
+ rangeset_destroy(mem);
+ return;
+ }
+
+ rc = vpci_maybe_defer_map(pdev->domain, mem, map);
+ if ( !rc )
+ for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ if ( header->bars[i].type != VPCI_BAR_ROM ||
+ header->bars[i].rom_enabled )
+ header->bars[i].enabled = map;
+}
+
+static void vpci_modify_rom(const struct pci_dev *pdev,
+ struct vpci_bar *rom, bool map)
+{
+ struct rangeset *mem = rangeset_new(NULL, NULL, 0);
+ int rc;
+
+ ASSERT(rom->type == VPCI_BAR_ROM);
+
+ if ( !mem )
+ return;
+
+ /* First fill the rangeset with the ROM BAR. */
+ rc = rangeset_add_range(mem, PFN_DOWN(rom->addr),
+ PFN_DOWN(rom->addr + rom->size - 1));
+ if ( rc )
+ {
+ rangeset_destroy(mem);
+ return;
+ }
+
+ /*
+ * Check for overlaps with other BARs (either on this device or other
+ * devices).
+ */
+ rc = vpci_check_bar_overlap(pdev, rom, mem);
+ if ( rc )
+ {
+ rangeset_destroy(mem);
+ return;
+ }
+
+ rc = vpci_maybe_defer_map(pdev->domain, mem, map);
+ if ( !rc )
+ rom->enabled = map;
+}
+
+static uint32_t vpci_cmd_read(const struct pci_dev *pdev, unsigned int reg,
+ void *data)
+{
+ return pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), reg);
+}
+
+static void vpci_cmd_write(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t cmd, void *data)
+{
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ uint16_t current_cmd = pci_conf_read16(seg, bus, slot, func, reg);
+
+ /*
+ * Let the guest play with all the bits directly except for the
+ * memory decoding one.
+ */
+ if ( (cmd ^ current_cmd) & PCI_COMMAND_MEMORY )
+ vpci_modify_bars(pdev, cmd & PCI_COMMAND_MEMORY);
+
+ pci_conf_write16(seg, bus, slot, func, reg, cmd);
+}
+
+static uint32_t vpci_bar_read(const struct pci_dev *pdev, unsigned int reg,
+ void *data)
+{
+ return pci_conf_read32(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), reg);
+}
+
+static void vpci_bar_write(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t val, void *data)
+{
+ struct vpci_bar *bar = data;
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ bool hi = false;
+
+ if ( pci_conf_read16(seg, bus, slot, func, PCI_COMMAND) &
+ PCI_COMMAND_MEMORY )
+ {
+ gprintk(XENLOG_WARNING,
+ "%04x:%02x:%02x.%u: ignored BAR write with memory decoding enabled\n",
+ seg, bus, slot, func);
+ return;
+ }
+
+ if ( bar->type == VPCI_BAR_MEM64_HI )
+ {
+ ASSERT(reg > PCI_BASE_ADDRESS_0);
+ bar--;
+ hi = true;
+ }
+ else
+ val &= PCI_BASE_ADDRESS_MEM_MASK;
+
+ /*
+ * Update the cached address, so that when memory decoding is enabled
+ * Xen can map the BAR into the guest p2m.
+ */
+ bar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
+ bar->addr |= (uint64_t)val << (hi ? 32 : 0);
+
+ /* Make sure Xen writes back the same value for the BAR RO bits. */
+ if ( !hi )
+ {
+ val |= bar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
+ : PCI_BASE_ADDRESS_MEM_TYPE_64;
+ val |= bar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
+ }
+
+ pci_conf_write32(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), reg, val);
+}
+
+static void vpci_rom_write(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t val, void *data)
+{
+ struct vpci_bar *rom = data;
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ uint16_t cmd = pci_conf_read16(seg, bus, slot, func, PCI_COMMAND);
+
+ if ( (pci_conf_read16(seg, bus, slot, func, PCI_COMMAND) &
+ PCI_COMMAND_MEMORY) && rom->rom_enabled )
+ {
+ gprintk(XENLOG_WARNING,
+ "%04x:%02x:%02x.%u: ignored ROM BAR write with memory decoding enabled\n",
+ seg, bus, slot, func);
+ return;
+ }
+
+ rom->addr = val & PCI_ROM_ADDRESS_MASK;
+
+ /* Check if ROM BAR should be mapped/unmapped. */
+ if ( (cmd & PCI_COMMAND_MEMORY) &&
+ rom->rom_enabled != (val & PCI_ROM_ADDRESS_ENABLE) )
+ vpci_modify_rom(pdev, rom, val & PCI_ROM_ADDRESS_ENABLE);
+
+ rom->rom_enabled = val & PCI_ROM_ADDRESS_ENABLE;
+ pci_conf_write32(pdev->seg, pdev->bus, slot, func, reg, val);
+}
+
+static int vpci_init_bars(struct pci_dev *pdev)
+{
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ uint16_t cmd;
+ uint64_t addr, size;
+ unsigned int i, num_bars, rom_reg;
+ struct vpci_header *header = &pdev->vpci->header;
+ struct vpci_bar *bars = header->bars;
+ pci_sbdf_t sbdf = {
+ .seg = seg,
+ .bus = bus,
+ .dev = slot,
+ .func = func,
+ };
+ int rc;
+
+ switch ( pci_conf_read8(seg, bus, slot, func, PCI_HEADER_TYPE) & 0x7f )
+ {
+ case PCI_HEADER_TYPE_NORMAL:
+ num_bars = 6;
+ rom_reg = PCI_ROM_ADDRESS;
+ break;
+ case PCI_HEADER_TYPE_BRIDGE:
+ num_bars = 2;
+ rom_reg = PCI_ROM_ADDRESS1;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ /* Setup a handler for the command register. */
+ rc = vpci_add_register(pdev, vpci_cmd_read, vpci_cmd_write, PCI_COMMAND,
+ 2, header);
+ if ( rc )
+ return rc;
+
+ /* Disable memory decoding before sizing. */
+ cmd = pci_conf_read16(seg, bus, slot, func, PCI_COMMAND);
+ if ( cmd & PCI_COMMAND_MEMORY )
+ pci_conf_write16(seg, bus, slot, func, PCI_COMMAND,
+ cmd & ~PCI_COMMAND_MEMORY);
+
+ for ( i = 0; i < num_bars; i++ )
+ {
+ uint8_t reg = PCI_BASE_ADDRESS_0 + i * 4;
+ uint32_t val = pci_conf_read32(seg, bus, slot, func, reg);
+
+ if ( i && bars[i - 1].type == VPCI_BAR_MEM64_LO )
+ {
+ bars[i].type = VPCI_BAR_MEM64_HI;
+ rc = vpci_add_register(pdev, vpci_bar_read, vpci_bar_write, reg, 4,
+ &bars[i]);
+ if ( rc )
+ {
+ pci_conf_write16(seg, bus, slot, func, PCI_COMMAND, cmd);
+ return rc;
+ }
+
+ continue;
+ }
+ if ( (val & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO )
+ {
+ bars[i].type = VPCI_BAR_IO;
+ continue;
+ }
+ if ( (val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+ PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ bars[i].type = VPCI_BAR_MEM64_LO;
+ else
+ bars[i].type = VPCI_BAR_MEM32;
+
+ /* Size the BAR and map it. */
+ rc = pci_size_mem_bar(sbdf, reg, i == num_bars - 1, &addr, &size, 0);
+ if ( rc < 0 )
+ {
+ pci_conf_write16(seg, bus, slot, func, PCI_COMMAND, cmd);
+ return rc;
+ }
+
+ if ( size == 0 )
+ {
+ bars[i].type = VPCI_BAR_EMPTY;
+ continue;
+ }
+
+ bars[i].addr = addr;
+ bars[i].size = size;
+ bars[i].prefetchable = val & PCI_BASE_ADDRESS_MEM_PREFETCH;
+
+ rc = vpci_add_register(pdev, vpci_bar_read, vpci_bar_write, reg, 4,
+ &bars[i]);
+ if ( rc )
+ {
+ pci_conf_write16(seg, bus, slot, func, PCI_COMMAND, cmd);
+ return rc;
+ }
+ }
+
+ /* Check expansion ROM. */
+ rc = pci_size_mem_bar(sbdf, rom_reg, true, &addr, &size, PCI_BAR_ROM);
+ if ( rc > 0 && size )
+ {
+ struct vpci_bar *rom = &header->bars[num_bars];
+
+ rom->type = VPCI_BAR_ROM;
+ rom->size = size;
+ rom->addr = addr;
+
+ rc = vpci_add_register(pdev, vpci_bar_read, vpci_rom_write, rom_reg, 4,
+ rom);
+ if ( rc )
+ rom->type = VPCI_BAR_EMPTY;
+ }
+
+ if ( cmd & PCI_COMMAND_MEMORY )
+ {
+ vpci_modify_bars(pdev, true);
+ pci_conf_write16(seg, bus, slot, func, PCI_COMMAND, cmd);
+ }
+
+ return 0;
+}
+REGISTER_VPCI_INIT(vpci_init_bars);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index b03afb450d..39a330ffca 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -20,6 +20,9 @@
#include <xen/smp.h>
#include <xen/perfc.h>
#include <asm/atomic.h>
+#ifdef CONFIG_HAS_PCI
+#include <xen/vpci.h>
+#endif
#include <xen/wait.h>
#include <public/xen.h>
#include <public/domctl.h>
@@ -264,6 +267,11 @@ struct vcpu

struct evtchn_fifo_vcpu *evtchn_fifo;

+#ifdef CONFIG_HAS_PCI
+ /* vPCI per-vCPU area, used to store data for long running operations. */
+ struct vpci_vcpu vpci;
+#endif
+
struct arch_vcpu arch;
};

diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index b42e38ed54..4e0b67c2f1 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -35,11 +35,52 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size);
void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
uint32_t data);

+/*
+ * Check for pending vPCI operations on this vcpu. Returns true if the vcpu
+ * should not run.
+ */
+bool vpci_check_pending(struct vcpu *v);
+
struct vpci {
/* List of vPCI handlers for a device. */
struct list_head handlers;
spinlock_t lock;
+
+#ifdef __XEN__
+ /* Hide the rest of the vpci struct from the user-space test harness. */
+ struct vpci_header {
+ /* Information about the PCI BARs of this device. */
+ struct vpci_bar {
+ paddr_t addr;
+ uint64_t size;
+ enum {
+ VPCI_BAR_EMPTY,
+ VPCI_BAR_IO,
+ VPCI_BAR_MEM32,
+ VPCI_BAR_MEM64_LO,
+ VPCI_BAR_MEM64_HI,
+ VPCI_BAR_ROM,
+ } type;
+ bool prefetchable;
+ /* Store whether the BAR is mapped into guest p2m. */
+ bool enabled;
+ /*
+ * Store whether the ROM enable bit is set (doesn't imply ROM BAR
+ * is mapped into guest p2m). Only used for type VPCI_BAR_ROM.
+ */
+ bool rom_enabled;
+ } bars[7]; /* At most 6 BARS + 1 expansion ROM BAR. */
+ /* FIXME: currently there's no support for SR-IOV. */
+ } header;
+#endif
+};
+
+#ifdef __XEN__
+struct vpci_vcpu {
+ struct rangeset *mem;
+ bool map;
};
+#endif

#endif
--
2.11.0 (Apple Git-81)
Jan Beulich
2017-10-04 08:33:33 UTC
Permalink
Post by Roger Pau Monne
@@ -48,6 +49,9 @@ bool_t hvm_io_pending(struct vcpu *v)
struct domain *d = v->domain;
struct hvm_ioreq_server *s;
+ if ( has_vpci(v->domain) && vpci_check_pending(v) )
has_vpci(d)
Post by Roger Pau Monne
+ return 1;
Indentation.
Post by Roger Pau Monne
+static int vpci_map_range(unsigned long s, unsigned long e, void *data,
+ unsigned long *c)
+{
+ const struct map_data *map = data;
+ int rc;
+
+ for ( ; ; )
+ {
+ unsigned long size = e - s + 1;
+
+ rc = (map->map ? map_mmio_regions : unmap_mmio_regions)
+ (map->d, _gfn(s), size, _mfn(s));
+ if ( rc == 0 )
+ {
+ *c += size;
+ break;
+ }
+ if ( rc < 0 )
+ {
+ printk(XENLOG_G_WARNING
+ "Failed to identity %smap [%" PRI_gfn ", %" PRI_gfn ") for d%d: %d\n",
+ map ? "" : "un", s, e, map->d->domain_id, rc);
+ break;
+ }
ASSERT(rc < size) ?
Post by Roger Pau Monne
+bool vpci_check_pending(struct vcpu *v)
"check" in the function name generally suggests (to me at least) that
the parameter ought to be const. Perhaps vpci_process_pending()?
Post by Roger Pau Monne
+{
+ if ( v->vpci.mem )
+ {
+ int rc = vpci_map_memory(v->domain, v->vpci.mem, v->vpci.map);
+
+ if ( rc == -ERESTART )
+ return true;
There's no real need for the local variable if all other return values
are simply discarded here. However, ...
Post by Roger Pau Monne
+ rangeset_destroy(v->vpci.mem);
+ v->vpci.mem = NULL;
+ }
+
+ return false;
+}
... I'm not convinced this is a good error handling model. I don't
recall how previous versions dealt with this, but iirc we agreed to
generally make all such Dom0 handling best effort (here: don't skip the
remaining ranges if mapping of one failed). An exception may want/need
to be -ENOMEM.
Post by Roger Pau Monne
+static int vpci_check_bar_overlap(const struct pci_dev *pdev,
+ const struct vpci_bar *rom,
+ struct rangeset *mem)
+{
+ const struct pci_dev *cmp;
+
+ /* Check for overlaps with other device's BARs. */
+ list_for_each_entry(cmp, &pdev->domain->arch.pdev_list, domain_list)
+ {
+ unsigned int i;
+
+ if ( rom == NULL && pdev == cmp )
+ continue;
This check looks rather unmotivated (or even bogus) without a comment.
The other special casing of ROM BARs further down also isn't all that
obvious (and right now I can't even convince myself it's correct).
Post by Roger Pau Monne
+static void vpci_modify_bars(const struct pci_dev *pdev, bool map)
+{
+ struct vpci_header *header = &pdev->vpci->header;
+ struct rangeset *mem = rangeset_new(NULL, NULL, 0);
+ unsigned int i;
+ int rc;
+
+ if ( !mem )
+ return;
+
+ /*
+ * Create a rangeset that represents the current device BARs memory region
+ * and compare it against all the currently active BAR memory regions. If
+ * an overlap is found, subtract it from the region to be
+ * mapped/unmapped.
+ *
+ * NB: the rangeset uses inclusive frame numbers.
+ */
+
+ /* First fill the rangeset with all the BARs of this device. */
+ for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ {
+ const struct vpci_bar *bar = &header->bars[i];
+
+ if ( !MAPPABLE_BAR(bar) ||
+ (bar->type == VPCI_BAR_ROM && !bar->rom_enabled) )
+ continue;
+
+ rc = rangeset_add_range(mem, PFN_DOWN(bar->addr),
+ PFN_DOWN(bar->addr + bar->size - 1));
+ if ( rc )
+ {
+ rangeset_destroy(mem);
+ return;
I'm afraid -ENOMEM here (which sadly is possible, as we don't maintain
any reserves) would produce a very hard to diagnose misbehavior. I think
you want to log a message here.
Post by Roger Pau Monne
+ }
+ }
+
+ /* Check for overlaps with other device's BARs. */
+ rc = vpci_check_bar_overlap(pdev, NULL, mem);
Why is this not symmetrical with vpci_modify_rom() (which also checks
overlaps inside the current device)?
Post by Roger Pau Monne
+ if ( rc )
+ {
+ rangeset_destroy(mem);
+ return;
Same error handling comment as above, despite failure here being less
likely (hopefully at least). Perhaps worth joining the two paths.
Post by Roger Pau Monne
+ }
+
+ rc = vpci_maybe_defer_map(pdev->domain, mem, map);
+ if ( !rc )
+ for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ if ( header->bars[i].type != VPCI_BAR_ROM ||
+ header->bars[i].rom_enabled )
+ header->bars[i].enabled = map;
Hmm, you're updating state here regardless of possible failure in the
deferred operation (see the discarded error code in
vpci_check_pending()).
Post by Roger Pau Monne
+static uint32_t vpci_cmd_read(const struct pci_dev *pdev, unsigned int reg,
+ void *data)
+{
+ return pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), reg);
+}
Wouldn't it be worthwhile having generic read functions dealing with
simple cases like this one (and the BAR one)?
Post by Roger Pau Monne
+static void vpci_cmd_write(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t cmd, void *data)
+{
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ uint16_t current_cmd = pci_conf_read16(seg, bus, slot, func, reg);
+
+ /*
+ * Let the guest play with all the bits directly except for the
+ * memory decoding one.
+ */
Please could you make clear it's only Dom0 we apply this lax model to?
Perhaps simply s/the guest/Dom0/.
Post by Roger Pau Monne
+static void vpci_bar_write(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t val, void *data)
+{
+ struct vpci_bar *bar = data;
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ bool hi = false;
+
+ if ( pci_conf_read16(seg, bus, slot, func, PCI_COMMAND) &
+ PCI_COMMAND_MEMORY )
+ {
+ gprintk(XENLOG_WARNING,
+ "%04x:%02x:%02x.%u: ignored BAR write with memory decoding enabled\n",
+ seg, bus, slot, func);
Indentation. Also any chance to log which BAR it was?
Post by Roger Pau Monne
+static void vpci_rom_write(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t val, void *data)
+{
+ struct vpci_bar *rom = data;
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ uint16_t cmd = pci_conf_read16(seg, bus, slot, func, PCI_COMMAND);
+
+ if ( (pci_conf_read16(seg, bus, slot, func, PCI_COMMAND) &
Please use cmd here.
Post by Roger Pau Monne
+ PCI_COMMAND_MEMORY) && rom->rom_enabled )
+ {
+ gprintk(XENLOG_WARNING,
+ "%04x:%02x:%02x.%u: ignored ROM BAR write with memory decoding enabled\n",
+ seg, bus, slot, func);
Indentation again.
Post by Roger Pau Monne
+static int vpci_init_bars(struct pci_dev *pdev)
+{
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ uint16_t cmd;
+ uint64_t addr, size;
+ unsigned int i, num_bars, rom_reg;
+ struct vpci_header *header = &pdev->vpci->header;
+ struct vpci_bar *bars = header->bars;
+ pci_sbdf_t sbdf = {
+ .seg = seg,
+ .bus = bus,
+ .dev = slot,
+ .func = func,
+ };
+ int rc;
+
+ switch ( pci_conf_read8(seg, bus, slot, func, PCI_HEADER_TYPE) & 0x7f )
+ {
+ num_bars = 6;
+ rom_reg = PCI_ROM_ADDRESS;
+ break;
+ num_bars = 2;
+ rom_reg = PCI_ROM_ADDRESS1;
+ break;
+ return -EOPNOTSUPP;
+ }
+
+ /* Setup a handler for the command register. */
+ rc = vpci_add_register(pdev, vpci_cmd_read, vpci_cmd_write, PCI_COMMAND,
+ 2, header);
+ if ( rc )
+ return rc;
+
+ /* Disable memory decoding before sizing. */
+ cmd = pci_conf_read16(seg, bus, slot, func, PCI_COMMAND);
+ if ( cmd & PCI_COMMAND_MEMORY )
+ pci_conf_write16(seg, bus, slot, func, PCI_COMMAND,
+ cmd & ~PCI_COMMAND_MEMORY);
+
+ for ( i = 0; i < num_bars; i++ )
+ {
+ uint8_t reg = PCI_BASE_ADDRESS_0 + i * 4;
+ uint32_t val = pci_conf_read32(seg, bus, slot, func, reg);
+
+ if ( i && bars[i - 1].type == VPCI_BAR_MEM64_LO )
+ {
+ bars[i].type = VPCI_BAR_MEM64_HI;
+ rc = vpci_add_register(pdev, vpci_bar_read, vpci_bar_write, reg, 4,
+ &bars[i]);
+ if ( rc )
+ {
+ pci_conf_write16(seg, bus, slot, func, PCI_COMMAND, cmd);
+ return rc;
+ }
+
+ continue;
+ }
You don't need val up to here - please defer the read.
Post by Roger Pau Monne
+ if ( (val & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO )
+ {
+ bars[i].type = VPCI_BAR_IO;
+ continue;
+ }
+ if ( (val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+ PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ bars[i].type = VPCI_BAR_MEM64_LO;
+ else
+ bars[i].type = VPCI_BAR_MEM32;
+
+ /* Size the BAR and map it. */
Isn't the map part of this comment stale now? And without it,
considering the function name called, it is perhaps no longer worth
having it.
Post by Roger Pau Monne
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -20,6 +20,9 @@
#include <xen/smp.h>
#include <xen/perfc.h>
#include <asm/atomic.h>
+#ifdef CONFIG_HAS_PCI
+#include <xen/vpci.h>
+#endif
Perhaps the conditional would better live in that header.
Post by Roger Pau Monne
@@ -264,6 +267,11 @@ struct vcpu
struct evtchn_fifo_vcpu *evtchn_fifo;
+#ifdef CONFIG_HAS_PCI
+ /* vPCI per-vCPU area, used to store data for long running operations. */
+ struct vpci_vcpu vpci;
+#endif
And perhaps the header would better provide an empty structure for the
"else" case. Another option would be to include the fields ...
Post by Roger Pau Monne
struct arch_vcpu arch;
... in this structure.
Post by Roger Pau Monne
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -35,11 +35,52 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size);
void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
uint32_t data);
+/*
+ * Check for pending vPCI operations on this vcpu. Returns true if the vcpu
+ * should not run.
+ */
+bool vpci_check_pending(struct vcpu *v);
+
struct vpci {
/* List of vPCI handlers for a device. */
struct list_head handlers;
spinlock_t lock;
+
+#ifdef __XEN__
+ /* Hide the rest of the vpci struct from the user-space test harness. */
+ struct vpci_header {
+ /* Information about the PCI BARs of this device. */
+ struct vpci_bar {
+ paddr_t addr;
+ uint64_t size;
+ enum {
+ VPCI_BAR_EMPTY,
+ VPCI_BAR_IO,
+ VPCI_BAR_MEM32,
+ VPCI_BAR_MEM64_LO,
+ VPCI_BAR_MEM64_HI,
+ VPCI_BAR_ROM,
+ } type;
+ bool prefetchable;
+ /* Store whether the BAR is mapped into guest p2m. */
+ bool enabled;
+ /*
+ * Store whether the ROM enable bit is set (doesn't imply ROM BAR
+ * is mapped into guest p2m). Only used for type VPCI_BAR_ROM.
+ */
+ bool rom_enabled;
Especially with the error handling issue in mind that I've mentioned
earlier, I wonder whether this field shouldn't be dropped, along the
lines of you also no longer caching the memory decode enable bit in the
command register.

Jan
Roger Pau Monné
2017-10-05 09:20:54 UTC
Permalink
Post by Jan Beulich
Post by Roger Pau Monne
+static int vpci_check_bar_overlap(const struct pci_dev *pdev,
+ const struct vpci_bar *rom,
+ struct rangeset *mem)
+{
+ const struct pci_dev *cmp;
+
+ /* Check for overlaps with other device's BARs. */
+ list_for_each_entry(cmp, &pdev->domain->arch.pdev_list, domain_list)
+ {
+ unsigned int i;
+
+ if ( rom == NULL && pdev == cmp )
+ continue;
This check looks rather unmotivated (or even bogus) without a comment.
The other special casing of ROM BARs further down also isn't all that
obvious (and right now I can't even convince myself it's correct).
I've added the following comment before this check, which I think
explains the logic for this check, and the one below:

Since ROM BARs can be enabled independently of the memory decoding
bit we need to check for overlapping in slightly different
ways depending on the case.

If !rom it means the memory decoding bit has been toggled, and all
BARs belonging to the device will be {un}mapped, hence the rangeset
will contain the mappings for the whole device. In this case there's
no need to check for overlaps with BARs that belong to the same
device because the rangeset is able to deal with overlapping areas.

OTOH, if rom is set it means a single ROM BAR is being {un}mapped,
and hence the check for overlaps should be performed against all
the possible BARs, even the ones that belong to the device being
modified.
Post by Jan Beulich
Post by Roger Pau Monne
+ }
+ }
+
+ /* Check for overlaps with other device's BARs. */
+ rc = vpci_check_bar_overlap(pdev, NULL, mem);
Why is this not symmetrical with vpci_modify_rom() (which also checks
overlaps inside the current device)?
I think the comment above should answer the question here, the
difference is because in this case Xen is mapping a whole device, so
vpci_check_bar_overlap should not check for overlap with BARs that
belong to the same device. OTOH, when mapping a ROM BAR Xen should
check for such overlap, because the regular BARs will already be
mapped.
Post by Jan Beulich
Post by Roger Pau Monne
+ }
+
+ rc = vpci_maybe_defer_map(pdev->domain, mem, map);
+ if ( !rc )
+ for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ if ( header->bars[i].type != VPCI_BAR_ROM ||
+ header->bars[i].rom_enabled )
+ header->bars[i].enabled = map;
Hmm, you're updating state here regardless of possible failure in the
deferred operation (see the discarded error code in
vpci_check_pending()).
Yes, I've fixed the code above to try to map/unmap as much as
possible, even when a failure happens.

I agree that enabling/disabling here with the operation being deferred
is not ideal, but I also think we would end up doing the same
regardless of the outcome of the deferred operation. If some
mapping/unmapping of BARs failed, the memory decoding should be
enabled anyway. I can add a comment along these lines if you think
that's OK.
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -20,6 +20,9 @@
#include <xen/smp.h>
#include <xen/perfc.h>
#include <asm/atomic.h>
+#ifdef CONFIG_HAS_PCI
+#include <xen/vpci.h>
+#endif
Perhaps the conditional would better live in that header.
Post by Roger Pau Monne
@@ -264,6 +267,11 @@ struct vcpu
struct evtchn_fifo_vcpu *evtchn_fifo;
+#ifdef CONFIG_HAS_PCI
+ /* vPCI per-vCPU area, used to store data for long running operations. */
+ struct vpci_vcpu vpci;
+#endif
And perhaps the header would better provide an empty structure for the
"else" case. Another option would be to include the fields ...
Post by Roger Pau Monne
struct arch_vcpu arch;
... in this structure.
I've thought about placing the vpci data inside of arch vcpu, but it
felt a little bit weird because the vpci code should be arch-agnostic,
so placing some of its data inside of an arch specific structure
seemed wrong. I will do as you suggest and provide an empty
vpci_vcpu structure in the !CONFIG_HAS_PCI case, together with hiding
the CONFIG_HAS_PCI macros inside of the header itself.
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -35,11 +35,52 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size);
void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
uint32_t data);
+/*
+ * Check for pending vPCI operations on this vcpu. Returns true if the vcpu
+ * should not run.
+ */
+bool vpci_check_pending(struct vcpu *v);
+
struct vpci {
/* List of vPCI handlers for a device. */
struct list_head handlers;
spinlock_t lock;
+
+#ifdef __XEN__
+ /* Hide the rest of the vpci struct from the user-space test harness. */
+ struct vpci_header {
+ /* Information about the PCI BARs of this device. */
+ struct vpci_bar {
+ paddr_t addr;
+ uint64_t size;
+ enum {
+ VPCI_BAR_EMPTY,
+ VPCI_BAR_IO,
+ VPCI_BAR_MEM32,
+ VPCI_BAR_MEM64_LO,
+ VPCI_BAR_MEM64_HI,
+ VPCI_BAR_ROM,
+ } type;
+ bool prefetchable;
+ /* Store whether the BAR is mapped into guest p2m. */
+ bool enabled;
+ /*
+ * Store whether the ROM enable bit is set (doesn't imply ROM BAR
+ * is mapped into guest p2m). Only used for type VPCI_BAR_ROM.
+ */
+ bool rom_enabled;
Especially with the error handling issue in mind that I've mentioned
earlier, I wonder whether this field shouldn't be dropped, along the
lines of you also no longer caching the memory decode enable bit in the
command register.
Removing rom_enabled would imply doing a register read in
vpci_modify_bars in order to know whether the ROM BAR is enabled or
not, which is not trivial because depending on the header type the
position of the ROM BAR is different.

Another option would be to store the prefetch/enable bits inside of
the addr field, but that would also require more masking/unmasking of
the fields when the values are used or updated.

Roger.
Jan Beulich
2017-10-05 10:01:46 UTC
Permalink
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
+static int vpci_check_bar_overlap(const struct pci_dev *pdev,
+ const struct vpci_bar *rom,
+ struct rangeset *mem)
+{
+ const struct pci_dev *cmp;
+
+ /* Check for overlaps with other device's BARs. */
+ list_for_each_entry(cmp, &pdev->domain->arch.pdev_list, domain_list)
+ {
+ unsigned int i;
+
+ if ( rom == NULL && pdev == cmp )
+ continue;
This check looks rather unmotivated (or even bogus) without a comment.
The other special casing of ROM BARs further down also isn't all that
obvious (and right now I can't even convince myself it's correct).
I've added the following comment before this check, which I think
Since ROM BARs can be enabled independently of the memory decoding
bit we need to check for overlapping in slightly different
ways depending on the case.
If !rom it means the memory decoding bit has been toggled, and all
BARs belonging to the device will be {un}mapped,
That's not precise: When mapping, you may still skip the ROM one
if its enable bit is clear. Whether the difference matters for
unmapping when the ROM is already unmapped I can't tell right
away. Nevertheless I think ...
Post by Roger Pau Monné
hence the rangeset
will contain the mappings for the whole device. In this case there's
no need to check for overlaps with BARs that belong to the same
device because the rangeset is able to deal with overlapping areas.
... the conclusion is correct, as I would expect the ROM range to
simply not be part of the rangeset then.
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
+ }
+ }
+
+ /* Check for overlaps with other device's BARs. */
+ rc = vpci_check_bar_overlap(pdev, NULL, mem);
Why is this not symmetrical with vpci_modify_rom() (which also checks
overlaps inside the current device)?
I think the comment above should answer the question here, the
difference is because in this case Xen is mapping a whole device, so
vpci_check_bar_overlap should not check for overlap with BARs that
belong to the same device. OTOH, when mapping a ROM BAR Xen should
check for such overlap, because the regular BARs will already be
mapped.
Right. Part of my confusion results from the naming of these
two functions (pretty similar despite their different call sites)
as well as their placement (modify_bars() sitting ahead of
the CMD write is fine, as it's a helper of that function, but
modify_rom() would better be moved down to make clear
whose helper it is; it's questionable whether this being a
separate helper function is actually useful).
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
+ }
+
+ rc = vpci_maybe_defer_map(pdev->domain, mem, map);
+ if ( !rc )
+ for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ if ( header->bars[i].type != VPCI_BAR_ROM ||
+ header->bars[i].rom_enabled )
+ header->bars[i].enabled = map;
Hmm, you're updating state here regardless of possible failure in the
deferred operation (see the discarded error code in
vpci_check_pending()).
Yes, I've fixed the code above to try to map/unmap as much as
possible, even when a failure happens.
I agree that enabling/disabling here with the operation being deferred
is not ideal, but I also think we would end up doing the same
regardless of the outcome of the deferred operation. If some
mapping/unmapping of BARs failed, the memory decoding should be
enabled anyway. I can add a comment along these lines if you think
that's OK.
Yes, at least explaining why things are the (not fully correct) way
they are would help (also to tell anyone wanting to improve this
what it actually is that would need changing). Of course even
better would be if maintained state would match the state
hardware is in.
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -35,11 +35,52 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size);
void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
uint32_t data);
+/*
+ * Check for pending vPCI operations on this vcpu. Returns true if the vcpu
+ * should not run.
+ */
+bool vpci_check_pending(struct vcpu *v);
+
struct vpci {
/* List of vPCI handlers for a device. */
struct list_head handlers;
spinlock_t lock;
+
+#ifdef __XEN__
+ /* Hide the rest of the vpci struct from the user-space test harness. */
+ struct vpci_header {
+ /* Information about the PCI BARs of this device. */
+ struct vpci_bar {
+ paddr_t addr;
+ uint64_t size;
+ enum {
+ VPCI_BAR_EMPTY,
+ VPCI_BAR_IO,
+ VPCI_BAR_MEM32,
+ VPCI_BAR_MEM64_LO,
+ VPCI_BAR_MEM64_HI,
+ VPCI_BAR_ROM,
+ } type;
+ bool prefetchable;
+ /* Store whether the BAR is mapped into guest p2m. */
+ bool enabled;
+ /*
+ * Store whether the ROM enable bit is set (doesn't imply ROM BAR
+ * is mapped into guest p2m). Only used for type VPCI_BAR_ROM.
+ */
+ bool rom_enabled;
Especially with the error handling issue in mind that I've mentioned
earlier, I wonder whether this field shouldn't be dropped, along the
lines of you also no longer caching the memory decode enable bit in the
command register.
Removing rom_enabled would imply doing a register read in
vpci_modify_bars in order to know whether the ROM BAR is enabled or
not, which is not trivial because depending on the header type the
position of the ROM BAR is different.
As said - I wouldn't mind the field if it was always in sync with the
hardware one. And it was for a reason that I mentioned the
memory decode bit, which you no longer cache. I think both
should be treated the same.
Post by Roger Pau Monné
Another option would be to store the prefetch/enable bits inside of
the addr field, but that would also require more masking/unmasking of
the fields when the values are used or updated.
I didn't ask for these two to be eliminated.

Jan
Roger Pau Monné
2017-10-05 11:09:28 UTC
Permalink
Post by Jan Beulich
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
+ }
+
+ rc = vpci_maybe_defer_map(pdev->domain, mem, map);
+ if ( !rc )
+ for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ if ( header->bars[i].type != VPCI_BAR_ROM ||
+ header->bars[i].rom_enabled )
+ header->bars[i].enabled = map;
Hmm, you're updating state here regardless of possible failure in the
deferred operation (see the discarded error code in
vpci_check_pending()).
Yes, I've fixed the code above to try to map/unmap as much as
possible, even when a failure happens.
I agree that enabling/disabling here with the operation being deferred
is not ideal, but I also think we would end up doing the same
regardless of the outcome of the deferred operation. If some
mapping/unmapping of BARs failed, the memory decoding should be
enabled anyway. I can add a comment along these lines if you think
that's OK.
Yes, at least explaining why things are the (not fully correct) way
they are would help (also to tell anyone wanting to improve this
what it actually is that would need changing). Of course even
better would be if maintained state would match the state
hardware is in.
I see the current code in this version is slightly confusing regarding
the usage of the 'enabled' field. I've fixed the code so that the
'enabled' field matches the following conditions:

- For non-ROM BARs: the 'enabled' field matches the value of the
memory decoding bit, but it doesn't guarantee that the range is
fully mapped/unmapped in the guest p2m.
- For ROM BARs: the 'enabled' bit matches the value of the memory
decoding bit & the ROM enable bit, but again it doesn't guarantee
that the memory is fully mapped/unmapped in the guest p2m.
Post by Jan Beulich
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -35,11 +35,52 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size);
void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
uint32_t data);
+/*
+ * Check for pending vPCI operations on this vcpu. Returns true if the vcpu
+ * should not run.
+ */
+bool vpci_check_pending(struct vcpu *v);
+
struct vpci {
/* List of vPCI handlers for a device. */
struct list_head handlers;
spinlock_t lock;
+
+#ifdef __XEN__
+ /* Hide the rest of the vpci struct from the user-space test harness. */
+ struct vpci_header {
+ /* Information about the PCI BARs of this device. */
+ struct vpci_bar {
+ paddr_t addr;
+ uint64_t size;
+ enum {
+ VPCI_BAR_EMPTY,
+ VPCI_BAR_IO,
+ VPCI_BAR_MEM32,
+ VPCI_BAR_MEM64_LO,
+ VPCI_BAR_MEM64_HI,
+ VPCI_BAR_ROM,
+ } type;
+ bool prefetchable;
+ /* Store whether the BAR is mapped into guest p2m. */
+ bool enabled;
+ /*
+ * Store whether the ROM enable bit is set (doesn't imply ROM BAR
+ * is mapped into guest p2m). Only used for type VPCI_BAR_ROM.
+ */
+ bool rom_enabled;
Especially with the error handling issue in mind that I've mentioned
earlier, I wonder whether this field shouldn't be dropped, along the
lines of you also no longer caching the memory decode enable bit in the
command register.
Removing rom_enabled would imply doing a register read in
vpci_modify_bars in order to know whether the ROM BAR is enabled or
not, which is not trivial because depending on the header type the
position of the ROM BAR is different.
As said - I wouldn't mind the field if it was always in sync with the
hardware one. And it was for a reason that I mentioned the
memory decode bit, which you no longer cache. I think both
should be treated the same.
I think I'm missing something, rom_enabled matches exactly the state
of the ROM enable bit. There's no way rom_enabled will get updated
without the BAR ROM also being updated in vpci_rom_write.

In line with my comments above regarding the 'enabled' field, what
about adding:

- rom_enabled matches the state of the ROM BAR enable bit. It doesn't
take into account the state of the memory decoding bit. As such, it
cannot be used to detect if the ROM BAR memory is active or not,
the 'enabled' bit should be used in that case.

Roger.
Jan Beulich
2017-10-05 11:55:39 UTC
Permalink
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -35,11 +35,52 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size);
void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
uint32_t data);
+/*
+ * Check for pending vPCI operations on this vcpu. Returns true if the vcpu
+ * should not run.
+ */
+bool vpci_check_pending(struct vcpu *v);
+
struct vpci {
/* List of vPCI handlers for a device. */
struct list_head handlers;
spinlock_t lock;
+
+#ifdef __XEN__
+ /* Hide the rest of the vpci struct from the user-space test harness. */
+ struct vpci_header {
+ /* Information about the PCI BARs of this device. */
+ struct vpci_bar {
+ paddr_t addr;
+ uint64_t size;
+ enum {
+ VPCI_BAR_EMPTY,
+ VPCI_BAR_IO,
+ VPCI_BAR_MEM32,
+ VPCI_BAR_MEM64_LO,
+ VPCI_BAR_MEM64_HI,
+ VPCI_BAR_ROM,
+ } type;
+ bool prefetchable;
+ /* Store whether the BAR is mapped into guest p2m. */
+ bool enabled;
+ /*
+ * Store whether the ROM enable bit is set (doesn't imply ROM BAR
+ * is mapped into guest p2m). Only used for type VPCI_BAR_ROM.
+ */
+ bool rom_enabled;
Especially with the error handling issue in mind that I've mentioned
earlier, I wonder whether this field shouldn't be dropped, along the
lines of you also no longer caching the memory decode enable bit in the
command register.
Removing rom_enabled would imply doing a register read in
vpci_modify_bars in order to know whether the ROM BAR is enabled or
not, which is not trivial because depending on the header type the
position of the ROM BAR is different.
As said - I wouldn't mind the field if it was always in sync with the
hardware one. And it was for a reason that I mentioned the
memory decode bit, which you no longer cache. I think both
should be treated the same.
I think I'm missing something, rom_enabled matches exactly the state
of the ROM enable bit. There's no way rom_enabled will get updated
without the BAR ROM also being updated in vpci_rom_write.
Oh, I'm sorry for not being precise here: I think the hardware
bit should only be set once the mapping is complete. That's
not how the code currently behaves, so yes, right now the
cached bit apparently properly reflects the actual one. With
the possibly deferred mapping, that wouldn't be the case.

Jan
Roger Pau Monné
2017-10-05 12:02:08 UTC
Permalink
Post by Jan Beulich
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -35,11 +35,52 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size);
void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
uint32_t data);
+/*
+ * Check for pending vPCI operations on this vcpu. Returns true if the vcpu
+ * should not run.
+ */
+bool vpci_check_pending(struct vcpu *v);
+
struct vpci {
/* List of vPCI handlers for a device. */
struct list_head handlers;
spinlock_t lock;
+
+#ifdef __XEN__
+ /* Hide the rest of the vpci struct from the user-space test harness. */
+ struct vpci_header {
+ /* Information about the PCI BARs of this device. */
+ struct vpci_bar {
+ paddr_t addr;
+ uint64_t size;
+ enum {
+ VPCI_BAR_EMPTY,
+ VPCI_BAR_IO,
+ VPCI_BAR_MEM32,
+ VPCI_BAR_MEM64_LO,
+ VPCI_BAR_MEM64_HI,
+ VPCI_BAR_ROM,
+ } type;
+ bool prefetchable;
+ /* Store whether the BAR is mapped into guest p2m. */
+ bool enabled;
+ /*
+ * Store whether the ROM enable bit is set (doesn't imply ROM BAR
+ * is mapped into guest p2m). Only used for type VPCI_BAR_ROM.
+ */
+ bool rom_enabled;
Especially with the error handling issue in mind that I've mentioned
earlier, I wonder whether this field shouldn't be dropped, along the
lines of you also no longer caching the memory decode enable bit in the
command register.
Removing rom_enabled would imply doing a register read in
vpci_modify_bars in order to know whether the ROM BAR is enabled or
not, which is not trivial because depending on the header type the
position of the ROM BAR is different.
As said - I wouldn't mind the field if it was always in sync with the
hardware one. And it was for a reason that I mentioned the
memory decode bit, which you no longer cache. I think both
should be treated the same.
I think I'm missing something, rom_enabled matches exactly the state
of the ROM enable bit. There's no way rom_enabled will get updated
without the BAR ROM also being updated in vpci_rom_write.
Oh, I'm sorry for not being precise here: I think the hardware
bit should only be set once the mapping is complete. That's
not how the code currently behaves, so yes, right now the
cached bit apparently properly reflects the actual one. With
the possibly deferred mapping, that wouldn't be the case.
I could add some tail code to vpci_process_pending that sets the
memory decoding or ROM BAR enable bit together with the rom_enable and
enabled fields in the header struct. Would you agree to this?

Thanks, Roger.
Jan Beulich
2017-10-05 13:09:18 UTC
Permalink
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
+ bool rom_enabled;
Especially with the error handling issue in mind that I've mentioned
earlier, I wonder whether this field shouldn't be dropped, along the
lines of you also no longer caching the memory decode enable bit in the
command register.
Removing rom_enabled would imply doing a register read in
vpci_modify_bars in order to know whether the ROM BAR is enabled or
not, which is not trivial because depending on the header type the
position of the ROM BAR is different.
As said - I wouldn't mind the field if it was always in sync with the
hardware one. And it was for a reason that I mentioned the
memory decode bit, which you no longer cache. I think both
should be treated the same.
I think I'm missing something, rom_enabled matches exactly the state
of the ROM enable bit. There's no way rom_enabled will get updated
without the BAR ROM also being updated in vpci_rom_write.
Oh, I'm sorry for not being precise here: I think the hardware
bit should only be set once the mapping is complete. That's
not how the code currently behaves, so yes, right now the
cached bit apparently properly reflects the actual one. With
the possibly deferred mapping, that wouldn't be the case.
I could add some tail code to vpci_process_pending that sets the
memory decoding or ROM BAR enable bit together with the rom_enable and
enabled fields in the header struct. Would you agree to this?
If that's cleanly doable, sure. I had assumed you didn't do it
because you couldn't reasonably update state at that later point.

Jan


_______________________________________________
Xen-devel mailing list
Julien Grall
2017-10-05 10:56:20 UTC
Permalink
Hi Roger,
Post by Roger Pau Monne
+static int vpci_map_range(unsigned long s, unsigned long e, void *data,
+ unsigned long *c)
+{
+ const struct map_data *map = data;
+ int rc;
+
+ for ( ; ; )
+ {
+ unsigned long size = e - s + 1;
+
+ rc = (map->map ? map_mmio_regions : unmap_mmio_regions)
+ (map->d, _gfn(s), size, _mfn(s));
Again, this is relying on *map_mmio_regions to support preemption. This
is not the case on ARM.

I am not asking to add preemption in the ARM code. But we should at
least add a check similar to XEN_DOMCTL_memory_mapping ( if (size > 64)
) to remind us that *map_mmio_regions has to be fixed.


Similarly, on IRC you said you would add a TODO regarding the lack of
passing the type of the BAR.

Cheers,
--
Julien Grall
Roger Pau Monné
2017-10-05 11:26:25 UTC
Permalink
Post by Julien Grall
Hi Roger,
Post by Roger Pau Monne
+static int vpci_map_range(unsigned long s, unsigned long e, void *data,
+ unsigned long *c)
+{
+ const struct map_data *map = data;
+ int rc;
+
+ for ( ; ; )
+ {
+ unsigned long size = e - s + 1;
+
+ rc = (map->map ? map_mmio_regions : unmap_mmio_regions)
+ (map->d, _gfn(s), size, _mfn(s));
Again, this is relying on *map_mmio_regions to support preemption. This is
not the case on ARM.
I am not asking to add preemption in the ARM code. But we should at least
add a check similar to XEN_DOMCTL_memory_mapping ( if (size > 64) ) to
remind us that *map_mmio_regions has to be fixed.
I've added a bodge for ARM in order to limit the mappings to 64 for
each call to {un}map_mmio_regions.
Post by Julien Grall
Similarly, on IRC you said you would add a TODO regarding the lack of
passing the type of the BAR.
Sorry, not sure if we spoke about this before or after sending this
series, but in any case I've added it now.

Thanks, Roger.
Roger Pau Monne
2017-09-19 15:29:27 UTC
Permalink
This functionality is going to reside in vpci.c (and the corresponding
vpci.h header), and should be arch-agnostic. The handlers introduced
in this patch set up the basic functionality required in order to trap
accesses to the PCI config space, and allow decoding the address and
finding the corresponding handler that should handle the access
(although no handlers are implemented).

Note that the traps to the PCI IO port registers (0xcf8/0xcfc) are
set up inside an x86 HVM file, since that's not shared with other
arches.

A new XEN_X86_EMU_VPCI x86 domain flag is added in order to signal Xen
whether a domain should use the newly introduced vPCI handlers; this
is only enabled for PVH Dom0 at the moment.

A very simple user-space test is also provided, so that the basic
functionality of the vPCI traps can be asserted. This has proven
quite helpful during development, since the logic to handle partial
accesses or accesses that span multiple registers is not
trivial.

The handlers for the registers are added to a linked list that's kept
sorted at all times. Both the read and write handlers support accesses
that span multiple emulated registers and contain gaps not
emulated.

Signed-off-by: Roger Pau Monné <***@citrix.com>
---
Cc: Ian Jackson <***@eu.citrix.com>
Cc: Wei Liu <***@citrix.com>
Cc: Jan Beulich <***@suse.com>
Cc: Andrew Cooper <***@citrix.com>
Cc: Paul Durrant <***@citrix.com>
---
Changes since v5:
- Use a spinlock per pci device.
- Use the recently introduced pci_sbdf_t type.
- Fix test harness to use the right handler type and the newly
introduced lock.
- Move the position of the vpci sections in the linker scripts.
- Constify domain and pci_dev in vpci_{read/write}.
- Fix typos in comments.
- Use _XEN_VPCI_H_ as header guard.

Changes since v4:
* User-space test harness:
- Do not redirect the output of the test.
- Add main.c and emul.h as dependencies of the Makefile target.
- Use the same rule to modify the vpci and list headers.
- Remove underscores from local macro variables.
- Add _check suffix to the test harness multiread function.
- Change the value written by every different size in the multiwrite
test.
- Use { } to initialize the r16 and r20 arrays (instead of { 0 }).
- Perform some of the read checks with the local variable directly.
- Expand some comments.
- Implement a dummy rwlock.
* Hypervisor code:
- Guard the linker script changes with CONFIG_HAS_PCI.
- Rename vpci_access_check to vpci_access_allowed and make it return
bool.
- Make hvm_pci_decode_addr return the register as return value.
- Use ~3 instead of 0xfffc to remove the register offset when
checking accesses to IO ports.
- s/head/prev in vpci_add_register.
- Add parentheses around & in vpci_add_register.
- Fix register removal.
- Change the BUGs in vpci_{read/write}_hw helpers to
ASSERT_UNREACHABLE.
- Make merge_result static and change the computation of the mask to
avoid using a uint64_t.
- Modify vpci_read to only read from hardware the not-emulated gaps.
- Remove the vpci_val union and use a uint32_t instead.
- Change handler read type to return a uint32_t instead of modifying
a variable passed by reference.
- Constify the data opaque parameter of read handlers.
- Change the size parameter of the vpci_{read/write} functions to
unsigned int.
- Place the array of initialization handlers in init.rodata or
.rodata depending on whether late-hwdom is enabled.
- Remove the pci_devs lock, assume the Dom0 is well behaved and won't
remove the device while trying to access it.
- Change the recursive spinlock into a rw lock for performance
reasons.

Changes since v3:
* User-space test harness:
- Fix spaces in container_of macro.
- Implement dummy locking functions.
- Remove the 'current' macro and make current a pointer to the
statically allocated vcpu.
- Remove unneeded parentheses in the pci_conf_readX macros.
- Fix the name of the write test macro.
- Remove the dummy EXPORT_SYMBOL macro (this was needed by the RB
code only).
- Import the max macro.
- Test all possible read/write size combinations with all possible
emulated register sizes.
- Introduce a test for register removal.
* Hypervisor code:
- Use a sorted list in order to store the config space handlers.
- Remove some unneeded 'else' branches.
- Make the IO port handlers always return X86EMUL_OKAY, and set the
data to all 1's in case of read failure (write are simply ignored).
- In hvm_select_ioreq_server reuse local variables when calling
XEN_DMOP_PCI_SBDF.
- Store the pointers to the initialization functions in the .rodata
section.
- Do not ignore the return value of xen_vpci_add_handlers in
setup_one_hwdom_device.
- Remove the vpci_init macro.
- Do not hide the pointers inside of the vpci_{read/write}_t
typedefs.
- Rename priv_data to private in vpci_register.
- Simplify checking for register overlap in vpci_register_cmp.
- Check that the offset and the length match before removing a
register in xen_vpci_remove_register.
- Make vpci_read_hw return a value rather than storing it in a
pointer passed by parameter.
- Handler dispatcher functions vpci_{read/write} no longer return an
error code, errors on reads/writes should be treated like hardware
(writes ignored, reads return all 1's or garbage).
- Make sure pcidevs is locked before calling pci_get_pdev_by_domain.
- Use a recursive spinlock for the vpci lock, so that spin_is_locked
checks that the current CPU is holding the lock.
- Make the code less error-chatty by removing some of the printk's.
- Pass the slot and the function as separate parameters to the
handler dispatchers (instead of passing devfn).
- Allow handlers to be registered with either a read or write
function only, the missing handler will be replaced by a dummy
handler (writes ignored, reads return 1's).
- Introduce PCI_CFG_SPACE_* defines from Linux.
- Simplify the handler dispatchers by removing the recursion, now the
dispatchers iterate over the list of sorted handlers and call them
in order.
- Remove the GENMASK_BYTES, SHIFT_RIGHT_BYTES and ADD_RESULT macros,
and instead provide a merge_result function in order to merge a
register output into a partial result.
- Rename the fields of the vpci_val union to u8/u16/u32.
- Remove the return values from the read/write handlers, errors
should be handled internally and signaled as would be done on
native hardware.
- Remove the usage of the GENMASK macro.

Changes since v2:
- Generalize the PCI address decoding and use it for IOREQ code also.

Changes since v1:
- Allow access to cross a word-boundary.
- Add locking.
- Add cleanup to xen_vpci_add_handlers in case of failure.
---
.gitignore | 3 +
tools/libxl/libxl_x86.c | 2 +-
tools/tests/Makefile | 1 +
tools/tests/vpci/Makefile | 37 ++++
tools/tests/vpci/emul.h | 133 +++++++++++
tools/tests/vpci/main.c | 308 ++++++++++++++++++++++++++
xen/arch/arm/xen.lds.S | 12 +
xen/arch/x86/domain.c | 18 +-
xen/arch/x86/hvm/hvm.c | 2 +
xen/arch/x86/hvm/io.c | 103 +++++++++
xen/arch/x86/setup.c | 3 +-
xen/arch/x86/xen.lds.S | 12 +
xen/drivers/Makefile | 2 +-
xen/drivers/passthrough/pci.c | 9 +-
xen/drivers/vpci/Makefile | 1 +
xen/drivers/vpci/vpci.c | 450 ++++++++++++++++++++++++++++++++++++++
xen/include/asm-x86/domain.h | 1 +
xen/include/asm-x86/hvm/io.h | 3 +
xen/include/public/arch-x86/xen.h | 5 +-
xen/include/xen/pci.h | 3 +
xen/include/xen/pci_regs.h | 8 +
xen/include/xen/vpci.h | 54 +++++
22 files changed, 1161 insertions(+), 9 deletions(-)
create mode 100644 tools/tests/vpci/Makefile
create mode 100644 tools/tests/vpci/emul.h
create mode 100644 tools/tests/vpci/main.c
create mode 100644 xen/drivers/vpci/Makefile
create mode 100644 xen/drivers/vpci/vpci.c
create mode 100644 xen/include/xen/vpci.h

diff --git a/.gitignore b/.gitignore
index cc16649457..1c670b27d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -240,6 +240,9 @@ tools/tests/regression/build/*
tools/tests/regression/downloads/*
tools/tests/mem-sharing/memshrtool
tools/tests/mce-test/tools/xen-mceinj
+tools/tests/vpci/list.h
+tools/tests/vpci/vpci.[hc]
+tools/tests/vpci/test_vpci
tools/xcutils/lsevtchn
tools/xcutils/readnotes
tools/xenbackendd/_paths.h
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index 455f6f0bed..dd7fc78a99 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -11,7 +11,7 @@ int libxl__arch_domain_prepare_config(libxl__gc *gc,
if (d_config->c_info.type == LIBXL_DOMAIN_TYPE_HVM) {
if (d_config->b_info.device_model_version !=
LIBXL_DEVICE_MODEL_VERSION_NONE) {
- xc_config->emulation_flags = XEN_X86_EMU_ALL;
+ xc_config->emulation_flags = (XEN_X86_EMU_ALL & ~XEN_X86_EMU_VPCI);
} else if (libxl_defbool_val(d_config->b_info.u.hvm.apic)) {
/*
* HVM guests without device model may want
diff --git a/tools/tests/Makefile b/tools/tests/Makefile
index 7162945121..f6942a93fb 100644
--- a/tools/tests/Makefile
+++ b/tools/tests/Makefile
@@ -13,6 +13,7 @@ endif
SUBDIRS-$(CONFIG_X86) += x86_emulator
SUBDIRS-y += xen-access
SUBDIRS-y += xenstore
+SUBDIRS-$(CONFIG_HAS_PCI) += vpci

.PHONY: all clean install distclean uninstall
all clean distclean: %: subdirs-%
diff --git a/tools/tests/vpci/Makefile b/tools/tests/vpci/Makefile
new file mode 100644
index 0000000000..e45fcb5cd9
--- /dev/null
+++ b/tools/tests/vpci/Makefile
@@ -0,0 +1,37 @@
+XEN_ROOT=$(CURDIR)/../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+TARGET := test_vpci
+
+.PHONY: all
+all: $(TARGET)
+
+.PHONY: run
+run: $(TARGET)
+ ./$(TARGET)
+
+$(TARGET): vpci.c vpci.h list.h main.c emul.h
+ $(HOSTCC) -g -o $@ vpci.c main.c
+
+.PHONY: clean
+clean:
+ rm -rf $(TARGET) *.o *~ vpci.h vpci.c list.h
+
+.PHONY: distclean
+distclean: clean
+
+.PHONY: install
+install:
+
+vpci.c: $(XEN_ROOT)/xen/drivers/vpci/vpci.c
+ # Trick the compiler so it doesn't complain about missing symbols
+ sed -e '/#include/d' \
+ -e '1s;^;#include "emul.h"\
+ vpci_register_init_t *const __start_vpci_array[1]\;\
+ vpci_register_init_t *const __end_vpci_array[1]\;\
+ ;' <$< >$@
+
+list.h: $(XEN_ROOT)/xen/include/xen/list.h
+vpci.h: $(XEN_ROOT)/xen/include/xen/vpci.h
+list.h vpci.h:
+ sed -e '/#include/d' <$< >$@
diff --git a/tools/tests/vpci/emul.h b/tools/tests/vpci/emul.h
new file mode 100644
index 0000000000..ebd676723d
--- /dev/null
+++ b/tools/tests/vpci/emul.h
@@ -0,0 +1,133 @@
+/*
+ * Unit tests for the generic vPCI handler code.
+ *
+ * Copyright (C) 2017 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _TEST_VPCI_
+#define _TEST_VPCI_
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <assert.h>
+
+#define container_of(ptr, type, member) ({ \
+ typeof(((type *)0)->member) *mptr = (ptr); \
+ \
+ (type *)((char *)mptr - offsetof(type, member)); \
+})
+
+#define smp_wmb()
+#define prefetch(x) __builtin_prefetch(x)
+#define ASSERT(x) assert(x)
+#define __must_check __attribute__((__warn_unused_result__))
+
+#include "list.h"
+
+struct domain {
+};
+
+struct pci_dev {
+ struct vpci *vpci;
+};
+
+struct vcpu
+{
+ const struct domain *domain;
+};
+
+extern const struct vcpu *current;
+extern const struct pci_dev test_pdev;
+
+typedef bool spinlock_t;
+#define spin_lock_init(l) (*(l) = false)
+#define spin_lock(l) (*(l) = true)
+#define spin_unlock(l) (*(l) = false)
+
+typedef union {
+ uint32_t sbdf;
+ struct {
+ union {
+ uint16_t bdf;
+ struct {
+ union {
+ struct {
+ uint8_t func : 3,
+ dev : 5;
+ };
+ uint8_t extfunc;
+ };
+ uint8_t bus;
+ };
+ };
+ uint16_t seg;
+ };
+} pci_sbdf_t;
+
+#include "vpci.h"
+
+#define __hwdom_init
+
+#define has_vpci(d) true
+
+#define xzalloc(type) ((type *)calloc(1, sizeof(type)))
+#define xmalloc(type) ((type *)malloc(sizeof(type)))
+#define xfree(p) free(p)
+
+#define pci_get_pdev_by_domain(...) &test_pdev
+
+/* Dummy native helpers. Writes are ignored, reads return 1's. */
+#define pci_conf_read8(...) 0xff
+#define pci_conf_read16(...) 0xffff
+#define pci_conf_read32(...) 0xffffffff
+#define pci_conf_write8(...)
+#define pci_conf_write16(...)
+#define pci_conf_write32(...)
+
+#define PCI_CFG_SPACE_EXP_SIZE 4096
+
+#define BUG() assert(0)
+#define ASSERT_UNREACHABLE() assert(0)
+
+#define min(x, y) ({ \
+ const typeof(x) tx = (x); \
+ const typeof(y) ty = (y); \
+ \
+ (void) (&tx == &ty); \
+ tx < ty ? tx : ty; \
+})
+
+#define max(x, y) ({ \
+ const typeof(x) tx = (x); \
+ const typeof(y) ty = (y); \
+ \
+ (void) (&tx == &ty); \
+ tx > ty ? tx : ty; \
+})
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/tests/vpci/main.c b/tools/tests/vpci/main.c
new file mode 100644
index 0000000000..206f4392ab
--- /dev/null
+++ b/tools/tests/vpci/main.c
@@ -0,0 +1,308 @@
+/*
+ * Unit tests for the generic vPCI handler code.
+ *
+ * Copyright (C) 2017 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "emul.h"
+
+/* Single vcpu (current), and single domain with a single PCI device. */
+static struct vpci vpci;
+
+const static struct domain d;
+
+const struct pci_dev test_pdev = {
+ .vpci = &vpci,
+};
+
+const static struct vcpu v = {
+ .domain = &d
+};
+
+const struct vcpu *current = &v;
+
+/* Dummy hooks, write stores data, read fetches it. */
+static uint32_t vpci_read8(const struct pci_dev *pdev, unsigned int reg,
+ void *data)
+{
+ return *(uint8_t *)data;
+}
+
+static void vpci_write8(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t val, void *data)
+{
+ *(uint8_t *)data = val;
+}
+
+static uint32_t vpci_read16(const struct pci_dev *pdev, unsigned int reg,
+ void *data)
+{
+ return *(uint16_t *)data;
+}
+
+static void vpci_write16(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t val, void *data)
+{
+ *(uint16_t *)data = val;
+}
+
+static uint32_t vpci_read32(const struct pci_dev *pdev, unsigned int reg,
+ void *data)
+{
+ return *(uint32_t *)data;
+}
+
+static void vpci_write32(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t val, void *data)
+{
+ *(uint32_t *)data = val;
+}
+
+#define VPCI_READ(reg, size, data) ({ \
+ data = vpci_read((pci_sbdf_t){ .sbdf = 0 }, reg, size); \
+})
+
+#define VPCI_READ_CHECK(reg, size, expected) ({ \
+ uint32_t rd; \
+ \
+ VPCI_READ(reg, size, rd); \
+ assert(rd == (expected)); \
+})
+
+#define VPCI_WRITE(reg, size, data) ({ \
+ vpci_write((pci_sbdf_t){ .sbdf = 0 }, reg, size, data); \
+})
+
+#define VPCI_WRITE_CHECK(reg, size, data) ({ \
+ VPCI_WRITE(reg, size, data); \
+ VPCI_READ_CHECK(reg, size, data); \
+})
+
+#define VPCI_ADD_REG(fread, fwrite, off, size, store) \
+ assert(!vpci_add_register(&test_pdev, fread, fwrite, off, size, &store))
+
+#define VPCI_ADD_INVALID_REG(fread, fwrite, off, size) \
+ assert(vpci_add_register(&test_pdev, fread, fwrite, off, size, NULL))
+
+#define VPCI_REMOVE_REG(off, size) \
+ assert(!vpci_remove_register(&test_pdev, off, size))
+
+#define VPCI_REMOVE_INVALID_REG(off, size) \
+ assert(vpci_remove_register(&test_pdev, off, size))
+
+/* Read a 32b register using all possible sizes. */
+void multiread4_check(unsigned int reg, uint32_t val)
+{
+ unsigned int i;
+
+ /* Read using bytes. */
+ for ( i = 0; i < 4; i++ )
+ VPCI_READ_CHECK(reg + i, 1, (val >> (i * 8)) & UINT8_MAX);
+
+ /* Read using 2bytes. */
+ for ( i = 0; i < 2; i++ )
+ VPCI_READ_CHECK(reg + i * 2, 2, (val >> (i * 2 * 8)) & UINT16_MAX);
+
+ VPCI_READ_CHECK(reg, 4, val);
+}
+
+void multiwrite4_check(unsigned int reg)
+{
+ unsigned int i;
+ uint32_t val = 0xa2f51732;
+
+ /* Write using bytes. */
+ for ( i = 0; i < 4; i++ )
+ VPCI_WRITE_CHECK(reg + i, 1, (val >> (i * 8)) & UINT8_MAX);
+ multiread4_check(reg, val);
+
+ /* Change the value each time to be sure writes work fine. */
+ val = 0x2b836fda;
+ /* Write using 2bytes. */
+ for ( i = 0; i < 2; i++ )
+ VPCI_WRITE_CHECK(reg + i * 2, 2, (val >> (i * 2 * 8)) & UINT16_MAX);
+ multiread4_check(reg, val);
+
+ val = 0xc4693beb;
+ VPCI_WRITE_CHECK(reg, 4, val);
+ multiread4_check(reg, val);
+}
+
+int
+main(int argc, char **argv)
+{
+ /* Index storage by offset. */
+ uint32_t r0 = 0xdeadbeef;
+ uint8_t r5 = 0xef;
+ uint8_t r6 = 0xbe;
+ uint8_t r7 = 0xef;
+ uint16_t r12 = 0x8696;
+ uint8_t r16[4] = { };
+ uint16_t r20[2] = { };
+ uint32_t r24 = 0;
+ uint8_t r28, r30;
+ unsigned int i;
+ int rc;
+
+ INIT_LIST_HEAD(&vpci.handlers);
+ spin_lock_init(&vpci.lock);
+
+ VPCI_ADD_REG(vpci_read32, vpci_write32, 0, 4, r0);
+ VPCI_READ_CHECK(0, 4, r0);
+ VPCI_WRITE_CHECK(0, 4, 0xbcbcbcbc);
+
+ VPCI_ADD_REG(vpci_read8, vpci_write8, 5, 1, r5);
+ VPCI_READ_CHECK(5, 1, r5);
+ VPCI_WRITE_CHECK(5, 1, 0xba);
+
+ VPCI_ADD_REG(vpci_read8, vpci_write8, 6, 1, r6);
+ VPCI_READ_CHECK(6, 1, r6);
+ VPCI_WRITE_CHECK(6, 1, 0xba);
+
+ VPCI_ADD_REG(vpci_read8, vpci_write8, 7, 1, r7);
+ VPCI_READ_CHECK(7, 1, r7);
+ VPCI_WRITE_CHECK(7, 1, 0xbd);
+
+ VPCI_ADD_REG(vpci_read16, vpci_write16, 12, 2, r12);
+ VPCI_READ_CHECK(12, 2, r12);
+ VPCI_READ_CHECK(12, 4, 0xffff8696);
+
+ /*
+ * At this point we have the following layout:
+ *
+ * Note that this refers to the position of the variables,
+ * but the value has already changed from the one given at
+ * initialization time because write tests have been performed.
+ *
+ * 32 24 16 8 0
+ * +-----+-----+-----+-----+
+ * | r0 | 0
+ * +-----+-----+-----+-----+
+ * | r7 | r6 | r5 |/////| 32
+ * +-----+-----+-----+-----|
+ * |///////////////////////| 64
+ * +-----------+-----------+
+ * |///////////| r12 | 96
+ * +-----------+-----------+
+ * ...
+ * / = unhandled.
+ */
+
+ /* Try to add an overlapping register handler. */
+ VPCI_ADD_INVALID_REG(vpci_read32, vpci_write32, 4, 4);
+
+ /* Try to add a non-aligned register. */
+ VPCI_ADD_INVALID_REG(vpci_read16, vpci_write16, 15, 2);
+
+ /* Try to add a register with wrong size. */
+ VPCI_ADD_INVALID_REG(vpci_read16, vpci_write16, 8, 3);
+
+ /* Try to add a register with missing handlers. */
+ VPCI_ADD_INVALID_REG(NULL, NULL, 8, 2);
+
+ /* Read/write of unset register. */
+ VPCI_READ_CHECK(8, 4, 0xffffffff);
+ VPCI_READ_CHECK(8, 2, 0xffff);
+ VPCI_READ_CHECK(8, 1, 0xff);
+ VPCI_WRITE(10, 2, 0xbeef);
+ VPCI_READ_CHECK(10, 2, 0xffff);
+
+ /* Read of multiple registers */
+ VPCI_WRITE_CHECK(7, 1, 0xbd);
+ VPCI_READ_CHECK(4, 4, 0xbdbabaff);
+
+ /* Partial read of a register. */
+ VPCI_WRITE_CHECK(0, 4, 0x1a1b1c1d);
+ VPCI_READ_CHECK(2, 1, 0x1b);
+ VPCI_READ_CHECK(6, 2, 0xbdba);
+
+ /* Write of multiple registers. */
+ VPCI_WRITE_CHECK(4, 4, 0xaabbccff);
+
+ /* Partial write of a register. */
+ VPCI_WRITE_CHECK(2, 1, 0xfe);
+ VPCI_WRITE_CHECK(6, 2, 0xfebc);
+
+ /*
+ * Test all possible read/write size combinations.
+ *
+ * Place 4 1B registers at 128bits (16B), 2 2B registers at 160bits
+ * (20B) and finally 1 4B register at 192bits (24B).
+ *
+ * Then perform all possible write and read sizes on each of them.
+ *
+ * ...
+ * 32 24 16 8 0
+ * +------+------+------+------+
+ * |r16[3]|r16[2]|r16[1]|r16[0]| 16
+ * +------+------+------+------+
+ * | r20[1] | r20[0] | 20
+ * +-------------+-------------|
+ * | r24 | 24
+ * +-------------+-------------+
+ *
+ */
+ VPCI_ADD_REG(vpci_read8, vpci_write8, 16, 1, r16[0]);
+ VPCI_ADD_REG(vpci_read8, vpci_write8, 17, 1, r16[1]);
+ VPCI_ADD_REG(vpci_read8, vpci_write8, 18, 1, r16[2]);
+ VPCI_ADD_REG(vpci_read8, vpci_write8, 19, 1, r16[3]);
+
+ VPCI_ADD_REG(vpci_read16, vpci_write16, 20, 2, r20[0]);
+ VPCI_ADD_REG(vpci_read16, vpci_write16, 22, 2, r20[1]);
+
+ VPCI_ADD_REG(vpci_read32, vpci_write32, 24, 4, r24);
+
+ /* Check the initial value is 0. */
+ multiread4_check(16, 0);
+ multiread4_check(20, 0);
+ multiread4_check(24, 0);
+
+ multiwrite4_check(16);
+ multiwrite4_check(20);
+ multiwrite4_check(24);
+
+ /*
+ * Check multiple non-consecutive gaps on the same read/write:
+ *
+ * 32 24 16 8 0
+ * +------+------+------+------+
+ * |//////| r30 |//////| r28 | 28
+ * +------+------+------+------+
+ *
+ */
+ VPCI_ADD_REG(vpci_read8, vpci_write8, 28, 1, r28);
+ VPCI_ADD_REG(vpci_read8, vpci_write8, 30, 1, r30);
+ VPCI_WRITE_CHECK(28, 4, 0xffacffdc);
+
+ /* Finally try to remove a couple of registers. */
+ VPCI_REMOVE_REG(28, 1);
+ VPCI_REMOVE_REG(24, 4);
+ VPCI_REMOVE_REG(12, 2);
+
+ VPCI_REMOVE_INVALID_REG(20, 1);
+ VPCI_REMOVE_INVALID_REG(16, 2);
+ VPCI_REMOVE_INVALID_REG(30, 2);
+
+ return 0;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/arm/xen.lds.S b/xen/arch/arm/xen.lds.S
index c9b9546435..eb14909645 100644
--- a/xen/arch/arm/xen.lds.S
+++ b/xen/arch/arm/xen.lds.S
@@ -65,6 +65,12 @@ SECTIONS
__param_start = .;
*(.data.param)
__param_end = .;
+
+#if defined(CONFIG_HAS_PCI) && defined(CONFIG_LATE_HWDOM)
+ __start_vpci_array = .;
+ *(.data.vpci)
+ __end_vpci_array = .;
+#endif
} :text

#if defined(BUILD_ID)
@@ -173,6 +179,12 @@ SECTIONS
*(.init_array)
*(SORT(.init_array.*))
__ctors_end = .;
+
+#if defined(CONFIG_HAS_PCI) && !defined(CONFIG_LATE_HWDOM)
+ __start_vpci_array = .;
+ *(.data.vpci)
+ __end_vpci_array = .;
+#endif
} :text
__init_end_efi = .;
. = ALIGN(STACK_SIZE);
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index dbddc536d3..898bb746ac 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -382,11 +382,21 @@ static bool emulation_flags_ok(const struct domain *d, uint32_t emflags)
if ( is_hvm_domain(d) )
{
if ( is_hardware_domain(d) &&
- emflags != (XEN_X86_EMU_LAPIC|XEN_X86_EMU_IOAPIC) )
- return false;
- if ( !is_hardware_domain(d) && emflags &&
- emflags != XEN_X86_EMU_ALL && emflags != XEN_X86_EMU_LAPIC )
+ emflags != (XEN_X86_EMU_LAPIC|XEN_X86_EMU_IOAPIC|
+ XEN_X86_EMU_VPCI) )
return false;
+ if ( !is_hardware_domain(d) )
+ {
+ switch ( emflags )
+ {
+ case XEN_X86_EMU_ALL & ~XEN_X86_EMU_VPCI:
+ case XEN_X86_EMU_LAPIC:
+ case 0:
+ break;
+ default:
+ return false;
+ }
+ }
}
else if ( emflags != 0 && emflags != XEN_X86_EMU_PIT )
{
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 6cb903def5..cc73df8dc7 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -36,6 +36,7 @@
#include <xen/rangeset.h>
#include <xen/monitor.h>
#include <xen/warning.h>
+#include <xen/vpci.h>
#include <asm/shadow.h>
#include <asm/hap.h>
#include <asm/current.h>
@@ -629,6 +630,7 @@ int hvm_domain_initialise(struct domain *d, unsigned long domcr_flags,
d->arch.hvm_domain.io_bitmap = hvm_io_bitmap;

register_g2m_portio_handler(d);
+ register_vpci_portio_handler(d);

hvm_ioreq_init(d);

diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
index 4e49e59012..6f9cd1f19e 100644
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -25,6 +25,7 @@
#include <xen/trace.h>
#include <xen/event.h>
#include <xen/hypercall.h>
+#include <xen/vpci.h>
#include <asm/current.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
@@ -269,6 +270,108 @@ unsigned int hvm_pci_decode_addr(unsigned int cf8, unsigned int addr,
return CF8_ADDR_LO(cf8) | (addr & 3);
}

+/* Do some sanity checks. */
+static bool vpci_access_allowed(unsigned int reg, unsigned int len)
+{
+ /* Check access size. */
+ if ( len != 1 && len != 2 && len != 4 )
+ return false;
+
+ /* Check that access is size aligned. */
+ if ( (reg & (len - 1)) )
+ return false;
+
+ return true;
+}
+
+/* vPCI config space IO ports handlers (0xcf8/0xcfc). */
+static bool vpci_portio_accept(const struct hvm_io_handler *handler,
+ const ioreq_t *p)
+{
+ return (p->addr == 0xcf8 && p->size == 4) || (p->addr & ~3) == 0xcfc;
+}
+
+static int vpci_portio_read(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size, uint64_t *data)
+{
+ struct domain *d = current->domain;
+ unsigned int reg;
+ pci_sbdf_t sbdf;
+ uint32_t cf8;
+
+ *data = ~(uint64_t)0;
+
+ if ( addr == 0xcf8 )
+ {
+ ASSERT(size == 4);
+ *data = d->arch.hvm_domain.pci_cf8;
+ return X86EMUL_OKAY;
+ }
+
+ cf8 = ACCESS_ONCE(d->arch.hvm_domain.pci_cf8);
+ if ( !CF8_ENABLED(cf8) )
+ return X86EMUL_OKAY;
+
+ reg = hvm_pci_decode_addr(cf8, addr, &sbdf);
+
+ if ( !vpci_access_allowed(reg, size) )
+ return X86EMUL_OKAY;
+
+ *data = vpci_read(sbdf, reg, size);
+
+ return X86EMUL_OKAY;
+}
+
+static int vpci_portio_write(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size, uint64_t data)
+{
+ struct domain *d = current->domain;
+ unsigned int reg;
+ pci_sbdf_t sbdf;
+ uint32_t cf8;
+
+ if ( addr == 0xcf8 )
+ {
+ ASSERT(size == 4);
+ d->arch.hvm_domain.pci_cf8 = data;
+ return X86EMUL_OKAY;
+ }
+
+ cf8 = ACCESS_ONCE(d->arch.hvm_domain.pci_cf8);
+ if ( !CF8_ENABLED(cf8) )
+ return X86EMUL_OKAY;
+
+ reg = hvm_pci_decode_addr(cf8, addr, &sbdf);
+
+ if ( !vpci_access_allowed(reg, size) )
+ return X86EMUL_OKAY;
+
+ vpci_write(sbdf, reg, size, data);
+
+ return X86EMUL_OKAY;
+}
+
+static const struct hvm_io_ops vpci_portio_ops = {
+ .accept = vpci_portio_accept,
+ .read = vpci_portio_read,
+ .write = vpci_portio_write,
+};
+
+void register_vpci_portio_handler(struct domain *d)
+{
+ struct hvm_io_handler *handler;
+
+ if ( !has_vpci(d) )
+ return;
+
+ handler = hvm_next_io_handler(d);
+ if ( !handler )
+ return;
+
+ handler->type = IOREQ_TYPE_PIO;
+ handler->ops = &vpci_portio_ops;
+}
+
/*
* Local variables:
* mode: C
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 3cbe305202..1bb2dbd31c 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1581,7 +1581,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
domcr_flags |= DOMCRF_hvm |
((hvm_funcs.hap_supported && !opt_dom0_shadow) ?
DOMCRF_hap : 0);
- config.emulation_flags = XEN_X86_EMU_LAPIC|XEN_X86_EMU_IOAPIC;
+ config.emulation_flags = XEN_X86_EMU_LAPIC|XEN_X86_EMU_IOAPIC|
+ XEN_X86_EMU_VPCI;
}

/* Create initial domain 0. */
diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
index d5e8821d41..61775953d6 100644
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -124,6 +124,12 @@ SECTIONS
__param_start = .;
*(.data.param)
__param_end = .;
+
+#if defined(CONFIG_HAS_PCI) && defined(CONFIG_LATE_HWDOM)
+ __start_vpci_array = .;
+ *(.data.vpci)
+ __end_vpci_array = .;
+#endif
} :text

#if defined(BUILD_ID)
@@ -213,6 +219,12 @@ SECTIONS
*(.init_array)
*(SORT(.init_array.*))
__ctors_end = .;
+
+#if defined(CONFIG_HAS_PCI) && !defined(CONFIG_LATE_HWDOM)
+ __start_vpci_array = .;
+ *(.data.vpci)
+ __end_vpci_array = .;
+#endif
} :text

#ifdef EFI
diff --git a/xen/drivers/Makefile b/xen/drivers/Makefile
index 19391802a8..d51c766453 100644
--- a/xen/drivers/Makefile
+++ b/xen/drivers/Makefile
@@ -1,6 +1,6 @@
subdir-y += char
subdir-$(CONFIG_HAS_CPUFREQ) += cpufreq
-subdir-$(CONFIG_HAS_PCI) += pci
+subdir-$(CONFIG_HAS_PCI) += pci vpci
subdir-$(CONFIG_HAS_PASSTHROUGH) += passthrough
subdir-$(CONFIG_ACPI) += acpi
subdir-$(CONFIG_VIDEO) += video
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 469dfc6c3d..975485fe05 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -31,6 +31,7 @@
#include <xen/radix-tree.h>
#include <xen/softirq.h>
#include <xen/tasklet.h>
+#include <xen/vpci.h>
#include <xsm/xsm.h>
#include <asm/msi.h>
#include "ats.h"
@@ -1052,9 +1053,10 @@ static void __hwdom_init setup_one_hwdom_device(const struct setup_hwdom *ctxt,
struct pci_dev *pdev)
{
u8 devfn = pdev->devfn;
+ int err;

do {
- int err = ctxt->handler(devfn, pdev);
+ err = ctxt->handler(devfn, pdev);

if ( err )
{
@@ -1067,6 +1069,11 @@ static void __hwdom_init setup_one_hwdom_device(const struct setup_hwdom *ctxt,
devfn += pdev->phantom_stride;
} while ( devfn != pdev->devfn &&
PCI_SLOT(devfn) == PCI_SLOT(pdev->devfn) );
+
+ err = vpci_add_handlers(pdev);
+ if ( err )
+ printk(XENLOG_ERR "setup of vPCI for d%d failed: %d\n",
+ ctxt->d->domain_id, err);
}

static int __hwdom_init _setup_hwdom_pci_devices(struct pci_seg *pseg, void *arg)
diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
new file mode 100644
index 0000000000..840a906470
--- /dev/null
+++ b/xen/drivers/vpci/Makefile
@@ -0,0 +1 @@
+obj-y += vpci.o
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
new file mode 100644
index 0000000000..3d578237bd
--- /dev/null
+++ b/xen/drivers/vpci/vpci.c
@@ -0,0 +1,450 @@
+/*
+ * Generic functionality for handling accesses to the PCI configuration space
+ * from guests.
+ *
+ * Copyright (C) 2017 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/sched.h>
+#include <xen/vpci.h>
+
+extern vpci_register_init_t *const __start_vpci_array[];
+extern vpci_register_init_t *const __end_vpci_array[];
+#define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array)
+
+/* Internal struct to store the emulated PCI registers. */
+struct vpci_register {
+ /* Handlers called on accesses that touch this register. */
+ vpci_read_t *read;
+ vpci_write_t *write;
+ /* Width of the register in bytes (vpci_add_register allows 1, 2 or 4). */
+ unsigned int size;
+ /* Offset of the register inside the PCI configuration space. */
+ unsigned int offset;
+ /* Opaque pointer handed back to the read/write handlers. */
+ void *private;
+ /* Link in the per-device list of handlers, kept sorted by offset. */
+ struct list_head node;
+};
+
+/*
+ * Allocate the vPCI state of 'pdev' and run every registered vPCI
+ * initialization function against it.
+ *
+ * Returns 0 on success, or when the owning domain doesn't use vPCI. On
+ * failure any register handlers added so far are freed, pdev->vpci is
+ * released and reset to NULL, and the error code is returned.
+ */
+int __hwdom_init vpci_add_handlers(struct pci_dev *pdev)
+{
+    unsigned int idx = 0;
+    int rc = 0;
+
+    if ( !has_vpci(pdev->domain) )
+        return 0;
+
+    pdev->vpci = xzalloc(struct vpci);
+    if ( !pdev->vpci )
+        return -ENOMEM;
+
+    INIT_LIST_HEAD(&pdev->vpci->handlers);
+    spin_lock_init(&pdev->vpci->lock);
+
+    /* Stop at the first initialization function reporting an error. */
+    while ( !rc && idx < NUM_VPCI_INIT )
+        rc = __start_vpci_array[idx++](pdev);
+
+    if ( !rc )
+        return 0;
+
+    /* Unwind: drop whatever handlers were registered before the failure. */
+    while ( !list_empty(&pdev->vpci->handlers) )
+    {
+        struct vpci_register *reg = list_first_entry(&pdev->vpci->handlers,
+                                                     struct vpci_register,
+                                                     node);
+
+        list_del(&reg->node);
+        xfree(reg);
+    }
+    xfree(pdev->vpci);
+    pdev->vpci = NULL;
+
+    return rc;
+}
+
+/*
+ * Compare the config space ranges covered by two registers.
+ *
+ * Returns 0 if the ranges overlap, a negative value if r1 lies below r2
+ * and a positive value if r1 lies above r2.
+ */
+static int vpci_register_cmp(const struct vpci_register *r1,
+                             const struct vpci_register *r2)
+{
+    bool overlap = r1->offset < r2->offset + r2->size &&
+                   r2->offset < r1->offset + r1->size;
+
+    if ( overlap )
+        return 0;
+
+    if ( r1->offset == r2->offset )
+    {
+        /* Equal offsets without overlap should never be seen. */
+        ASSERT_UNREACHABLE();
+        return 0;
+    }
+
+    return r1->offset < r2->offset ? -1 : 1;
+}
+
+/*
+ * Dummy read hook, installed by vpci_add_register when the caller supplies
+ * no read handler: reads return all 1's.
+ */
+static uint32_t vpci_ignored_read(const struct pci_dev *pdev, unsigned int reg,
+ void *data)
+{
+ return ~(uint32_t)0;
+}
+
+/*
+ * Dummy write hook, installed by vpci_add_register when the caller supplies
+ * no write handler: the written value is discarded.
+ */
+static void vpci_ignored_write(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t val, void *data)
+{
+}
+
+/*
+ * Register handlers for the emulated register of 'size' bytes located at
+ * 'offset' in the config space of 'pdev'.
+ *
+ * Either handler may be NULL (but not both); a missing handler is replaced
+ * by a dummy one. 'data' is passed back to the handlers untouched.
+ *
+ * Returns 0 on success, -EINVAL for invalid parameters (size not 1, 2 or
+ * 4, offset out of range or not aligned to size, no handler given),
+ * -ENOMEM on allocation failure and -EEXIST if the new register overlaps
+ * an already registered one.
+ */
+int vpci_add_register(const struct pci_dev *pdev, vpci_read_t *read_handler,
+                      vpci_write_t *write_handler, unsigned int offset,
+                      unsigned int size, void *data)
+{
+    struct list_head *pos;
+    struct vpci_register *reg;
+    int rc = 0;
+
+    /* Some sanity checks. */
+    if ( !read_handler && !write_handler )
+        return -EINVAL;
+    if ( size != 1 && size != 2 && size != 4 )
+        return -EINVAL;
+    if ( offset >= PCI_CFG_SPACE_EXP_SIZE || (offset & (size - 1)) )
+        return -EINVAL;
+
+    reg = xmalloc(struct vpci_register);
+    if ( !reg )
+        return -ENOMEM;
+
+    reg->read = read_handler ? read_handler : vpci_ignored_read;
+    reg->write = write_handler ? write_handler : vpci_ignored_write;
+    reg->size = size;
+    reg->offset = offset;
+    reg->private = data;
+
+    spin_lock(&pdev->vpci->lock);
+
+    /*
+     * The list of handlers must be kept sorted at all times: look for the
+     * first entry located above the new register, bailing out on overlap.
+     */
+    list_for_each ( pos, &pdev->vpci->handlers )
+    {
+        const struct vpci_register *cur =
+            list_entry(pos, const struct vpci_register, node);
+        int cmp = vpci_register_cmp(reg, cur);
+
+        if ( cmp == 0 )
+            rc = -EEXIST;
+        if ( cmp <= 0 )
+            break;
+    }
+
+    /*
+     * Insert right before 'pos', which is either the first higher entry or
+     * the list head (i.e. append) when the loop ran to completion.
+     */
+    if ( !rc )
+        list_add_tail(&reg->node, pos);
+
+    spin_unlock(&pdev->vpci->lock);
+
+    if ( rc )
+        xfree(reg);
+
+    return rc;
+}
+
+/*
+ * Remove the handlers of the register located at 'offset' with width
+ * 'size' from 'pdev'.
+ *
+ * Only an exact offset and size match is removed. Returns 0 on success or
+ * -ENOENT if no such register is found.
+ */
+int vpci_remove_register(const struct pci_dev *pdev, unsigned int offset,
+                         unsigned int size)
+{
+    const struct vpci_register probe = { .offset = offset, .size = size };
+    struct vpci_register *cur, *found = NULL;
+
+    spin_lock(&pdev->vpci->lock);
+    list_for_each_entry ( cur, &pdev->vpci->handlers, node )
+    {
+        int cmp = vpci_register_cmp(&probe, cur);
+
+        /* Entry lies below the requested range, keep looking. */
+        if ( cmp > 0 )
+            continue;
+
+        if ( !cmp && cur->offset == offset && cur->size == size )
+        {
+            list_del(&cur->node);
+            found = cur;
+        }
+
+        /*
+         * The list is sorted: past this point there's either a match, a
+         * partial overlap or only entries above the requested range.
+         */
+        break;
+    }
+    spin_unlock(&pdev->vpci->lock);
+
+    if ( !found )
+        return -ENOENT;
+
+    xfree(found);
+
+    return 0;
+}
+
+/*
+ * Wrappers for performing reads/writes to the underlying hardware.
+ *
+ * Read 'size' bytes (1 to 4) from the config space of the device at 'sbdf'
+ * starting at offset 'reg'. Unexpected sizes yield all 1's.
+ */
+static uint32_t vpci_read_hw(pci_sbdf_t sbdf, unsigned int reg,
+ unsigned int size)
+{
+ uint32_t data;
+
+ switch ( size )
+ {
+ case 4:
+ data = pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, reg);
+ break;
+ case 3:
+ /*
+ * This is possible because a 4byte read can have 1byte trapped and
+ * the rest passed-through.
+ */
+ if ( reg & 1 )
+ {
+ /* Odd offset: one byte first, then the following 16-bit word. */
+ data = pci_conf_read8(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func,
+ reg);
+ data |= pci_conf_read16(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func,
+ reg + 1) << 8;
+ }
+ else
+ {
+ /* Even offset: 16-bit word first, then the trailing byte. */
+ data = pci_conf_read16(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func,
+ reg);
+ data |= pci_conf_read8(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func,
+ reg + 2) << 16;
+ }
+ break;
+ case 2:
+ data = pci_conf_read16(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, reg);
+ break;
+ case 1:
+ data = pci_conf_read8(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, reg);
+ break;
+ default:
+ ASSERT_UNREACHABLE();
+ data = ~(uint32_t)0;
+ break;
+ }
+
+ return data;
+}
+
+/*
+ * Write the low 'size' bytes (1 to 4) of 'data' to the config space of the
+ * device at 'sbdf', starting at offset 'reg'. Unexpected sizes are ignored.
+ */
+static void vpci_write_hw(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
+ uint32_t data)
+{
+ switch ( size )
+ {
+ case 4:
+ pci_conf_write32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, reg, data);
+ break;
+ case 3:
+ /*
+ * This is possible because a 4byte write can have 1byte trapped and
+ * the rest passed-through.
+ */
+ if ( reg & 1 )
+ {
+ /* Odd offset: one byte first, then the following 16-bit word. */
+ pci_conf_write8(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, reg,
+ data);
+ pci_conf_write16(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, reg + 1,
+ data >> 8);
+ }
+ else
+ {
+ /* Even offset: 16-bit word first, then the trailing byte. */
+ pci_conf_write16(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, reg,
+ data);
+ pci_conf_write8(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, reg + 2,
+ data >> 16);
+ }
+ break;
+ case 2:
+ pci_conf_write16(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, reg, data);
+ break;
+ case 1:
+ pci_conf_write8(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, reg, data);
+ break;
+ default:
+ ASSERT_UNREACHABLE();
+ break;
+ }
+}
+
+/*
+ * Merge new data into a partial result.
+ *
+ * The low 'size' bytes of 'new' replace the 'size' bytes of 'data' that
+ * start at byte position 'offset'; all other bytes of 'data' are kept.
+ * 'size' is expected to be between 1 and 4.
+ */
+static uint32_t merge_result(uint32_t data, uint32_t new, unsigned int size,
+                             unsigned int offset)
+{
+    const uint32_t mask = 0xffffffff >> (32 - 8 * size);
+    const unsigned int shift = offset * 8;
+
+    /* Clear the destination bytes, then drop the new ones in place. */
+    data &= ~(mask << shift);
+    data |= (new & mask) << shift;
+
+    return data;
+}
+
+/*
+ * Handle a config space read of 'size' bytes at offset 'reg' for the device
+ * identified by 'sbdf', on behalf of the current domain.
+ *
+ * Bytes covered by an emulated register are obtained from its read handler,
+ * everything else is read from the hardware. The result is assembled in
+ * increasing offset order while walking the sorted list of handlers.
+ *
+ * Devices not found for the current domain, and devices without vPCI data
+ * (vpci_add_handlers() leaves pdev->vpci set to NULL on failure), are read
+ * straight from the hardware.
+ */
+uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
+{
+    const struct domain *d = current->domain;
+    const struct pci_dev *pdev;
+    const struct vpci_register *r;
+    unsigned int data_offset = 0;
+    uint32_t data = ~(uint32_t)0;
+
+    /* Find the PCI dev matching the address. */
+    pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.extfunc);
+    if ( !pdev || !pdev->vpci )
+        return vpci_read_hw(sbdf, reg, size);
+
+    spin_lock(&pdev->vpci->lock);
+
+    /* Read from the hardware or the emulated register handlers. */
+    list_for_each_entry ( r, &pdev->vpci->handlers, node )
+    {
+        /* Part of the access that hasn't been satisfied yet. */
+        const struct vpci_register emu = {
+            .offset = reg + data_offset,
+            .size = size - data_offset
+        };
+        int cmp = vpci_register_cmp(&emu, r);
+        uint32_t val;
+        unsigned int read_size;
+
+        /* The list is sorted: no further register can overlap. */
+        if ( cmp < 0 )
+            break;
+        if ( cmp > 0 )
+            continue;
+
+        if ( emu.offset < r->offset )
+        {
+            /* Leading gap, read partial content from hardware. */
+            read_size = r->offset - emu.offset;
+            val = vpci_read_hw(sbdf, emu.offset, read_size);
+            data = merge_result(data, val, read_size, data_offset);
+            data_offset += read_size;
+        }
+
+        val = r->read(pdev, r->offset, r->private);
+
+        /* Check if the read is in the middle of a register. */
+        if ( r->offset < emu.offset )
+            val >>= (emu.offset - r->offset) * 8;
+
+        /* Find the intersection size between the two sets. */
+        read_size = min(emu.offset + emu.size, r->offset + r->size) -
+                    max(emu.offset, r->offset);
+        /* Merge the emulated data into the native read value. */
+        data = merge_result(data, val, read_size, data_offset);
+        data_offset += read_size;
+        if ( data_offset == size )
+            break;
+        ASSERT(data_offset < size);
+    }
+
+    if ( data_offset < size )
+    {
+        /* Trailing gap, read the remaining. */
+        uint32_t tmp_data = vpci_read_hw(sbdf, reg + data_offset,
+                                         size - data_offset);
+
+        data = merge_result(data, tmp_data, size - data_offset, data_offset);
+    }
+    spin_unlock(&pdev->vpci->lock);
+
+    /* Only the low 'size' bytes are meaningful to the caller. */
+    return data & (0xffffffff >> (32 - 8 * size));
+}
+
+/*
+ * Perform a maybe partial write to a register.
+ *
+ * When fewer bytes than the register width are written, the current value
+ * is fetched through the read handler and the new bytes are merged into it
+ * at byte position 'offset' before calling the write handler.
+ *
+ * Note that this will only work for simple registers, if Xen needs to
+ * trap accesses to rw1c registers (like the status PCI header register)
+ * the logic in vpci_write will have to be expanded in order to correctly
+ * deal with them.
+ */
+static void vpci_write_helper(const struct pci_dev *pdev,
+                              const struct vpci_register *r, unsigned int size,
+                              unsigned int offset, uint32_t data)
+{
+    uint32_t val = data;
+
+    ASSERT(size <= r->size);
+
+    if ( size != r->size )
+        val = merge_result(r->read(pdev, r->offset, r->private), data, size,
+                           offset);
+
+    /* Never hand the handler more bits than the register holds. */
+    val &= 0xffffffff >> (32 - 8 * r->size);
+    r->write(pdev, r->offset, val, r->private);
+}
+
+/*
+ * Handle a config space write of the low 'size' bytes of 'data' at offset
+ * 'reg' for the device identified by 'sbdf', on behalf of the current
+ * domain.
+ *
+ * Bytes covered by an emulated register are passed to its write handler,
+ * everything else is written to the hardware. The access is processed in
+ * increasing offset order while walking the sorted list of handlers.
+ *
+ * Devices not found for the current domain, and devices without vPCI data
+ * (vpci_add_handlers() leaves pdev->vpci set to NULL on failure), are
+ * written straight to the hardware.
+ */
+void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
+                uint32_t data)
+{
+    const struct domain *d = current->domain;
+    const struct pci_dev *pdev;
+    const struct vpci_register *r;
+    unsigned int data_offset = 0;
+
+    /*
+     * Find the PCI dev matching the address.
+     * Passthrough everything that's not trapped.
+     */
+    pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.extfunc);
+    if ( !pdev || !pdev->vpci )
+    {
+        vpci_write_hw(sbdf, reg, size, data);
+        return;
+    }
+
+    spin_lock(&pdev->vpci->lock);
+
+    /* Write the value to the hardware or emulated registers. */
+    list_for_each_entry ( r, &pdev->vpci->handlers, node )
+    {
+        /* Part of the access that hasn't been processed yet. */
+        const struct vpci_register emu = {
+            .offset = reg + data_offset,
+            .size = size - data_offset
+        };
+        int cmp = vpci_register_cmp(&emu, r);
+        unsigned int write_size;
+
+        /* The list is sorted: no further register can overlap. */
+        if ( cmp < 0 )
+            break;
+        if ( cmp > 0 )
+            continue;
+
+        if ( emu.offset < r->offset )
+        {
+            /* Leading gap, write partial content to hardware. */
+            vpci_write_hw(sbdf, emu.offset, r->offset - emu.offset,
+                          data >> (data_offset * 8));
+            data_offset += r->offset - emu.offset;
+        }
+
+        /* Find the intersection size between the two sets. */
+        write_size = min(emu.offset + emu.size, r->offset + r->size) -
+                     max(emu.offset, r->offset);
+        vpci_write_helper(pdev, r, write_size, reg + data_offset - r->offset,
+                          data >> (data_offset * 8));
+        data_offset += write_size;
+        if ( data_offset == size )
+            break;
+        ASSERT(data_offset < size);
+    }
+
+    if ( data_offset < size )
+        /* Trailing gap, write the remaining. */
+        vpci_write_hw(sbdf, reg + data_offset, size - data_offset,
+                      data >> (data_offset * 8));
+
+    spin_unlock(&pdev->vpci->lock);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index fb8bf17458..695418bd74 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -428,6 +428,7 @@ struct arch_domain
#define has_vpit(d) (!!((d)->arch.emulation_flags & XEN_X86_EMU_PIT))
#define has_pirq(d) (!!((d)->arch.emulation_flags & \
XEN_X86_EMU_USE_PIRQ))
+#define has_vpci(d) (!!((d)->arch.emulation_flags & XEN_X86_EMU_VPCI))

#define has_arch_pdevs(d) (!list_empty(&(d)->arch.pdev_list))

diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h
index 2ff1c96883..f68aed9323 100644
--- a/xen/include/asm-x86/hvm/io.h
+++ b/xen/include/asm-x86/hvm/io.h
@@ -160,6 +160,9 @@ unsigned int hvm_pci_decode_addr(unsigned int cf8, unsigned int addr,
*/
void register_g2m_portio_handler(struct domain *d);

+/* HVM port IO handler for vPCI accesses. */
+void register_vpci_portio_handler(struct domain *d);
+
#endif /* __ASM_X86_HVM_IO_H__ */


diff --git a/xen/include/public/arch-x86/xen.h b/xen/include/public/arch-x86/xen.h
index ff918310f6..06ef4772cd 100644
--- a/xen/include/public/arch-x86/xen.h
+++ b/xen/include/public/arch-x86/xen.h
@@ -293,12 +293,15 @@ struct xen_arch_domainconfig {
#define XEN_X86_EMU_PIT (1U<<_XEN_X86_EMU_PIT)
#define _XEN_X86_EMU_USE_PIRQ 9
#define XEN_X86_EMU_USE_PIRQ (1U<<_XEN_X86_EMU_USE_PIRQ)
+#define _XEN_X86_EMU_VPCI 10
+#define XEN_X86_EMU_VPCI (1U<<_XEN_X86_EMU_VPCI)

#define XEN_X86_EMU_ALL (XEN_X86_EMU_LAPIC | XEN_X86_EMU_HPET | \
XEN_X86_EMU_PM | XEN_X86_EMU_RTC | \
XEN_X86_EMU_IOAPIC | XEN_X86_EMU_PIC | \
XEN_X86_EMU_VGA | XEN_X86_EMU_IOMMU | \
- XEN_X86_EMU_PIT | XEN_X86_EMU_USE_PIRQ)
+ XEN_X86_EMU_PIT | XEN_X86_EMU_USE_PIRQ |\
+ XEN_X86_EMU_VPCI)
uint32_t emulation_flags;
};

diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index dd5ec43a70..b7a6abfc53 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -112,6 +112,9 @@ struct pci_dev {
#define PT_FAULT_THRESHOLD 10
} fault;
u64 vf_rlen[6];
+
+ /* Data for vPCI. */
+ struct vpci *vpci;
};

#define for_each_pdev(domain, pdev) \
diff --git a/xen/include/xen/pci_regs.h b/xen/include/xen/pci_regs.h
index ecd6124d91..cc4ee3b83e 100644
--- a/xen/include/xen/pci_regs.h
+++ b/xen/include/xen/pci_regs.h
@@ -23,6 +23,14 @@
#define LINUX_PCI_REGS_H

/*
+ * Conventional PCI and PCI-X Mode 1 devices have 256 bytes of
+ * configuration space. PCI-X Mode 2 and PCIe devices have 4096 bytes of
+ * configuration space.
+ */
+#define PCI_CFG_SPACE_SIZE 256
+#define PCI_CFG_SPACE_EXP_SIZE 4096
+
+/*
* Under PCI, each device has 256 bytes of configuration address space,
* of which the first 64 bytes are standardized as follows:
*/
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
new file mode 100644
index 0000000000..b42e38ed54
--- /dev/null
+++ b/xen/include/xen/vpci.h
@@ -0,0 +1,54 @@
+#ifndef _XEN_VPCI_H_
+#define _XEN_VPCI_H_
+
+#include <xen/pci.h>
+#include <xen/types.h>
+#include <xen/list.h>
+
+/*
+ * Read handler of an emulated register: returns the register value. 'reg'
+ * is the register offset and 'data' the opaque pointer given at
+ * registration time.
+ */
+typedef uint32_t vpci_read_t(const struct pci_dev *pdev, unsigned int reg,
+ void *data);
+
+/*
+ * Write handler of an emulated register: 'val' is the value to store, 'reg'
+ * the register offset and 'data' the opaque pointer given at registration
+ * time.
+ */
+typedef void vpci_write_t(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t val, void *data);
+
+/* Per-device initialization function, returns 0 on success. */
+typedef int vpci_register_init_t(struct pci_dev *dev);
+
+/*
+ * Register 'x' as a vPCI initialization function by placing a pointer to
+ * it in the ".data.vpci" section.
+ */
+#define REGISTER_VPCI_INIT(x) \
+ static vpci_register_init_t *const x##_entry \
+ __used_section(".data.vpci") = x
+
+/* Add vPCI handlers to device. */
+int __must_check vpci_add_handlers(struct pci_dev *dev);
+
+/* Add/remove a register handler. */
+int __must_check vpci_add_register(const struct pci_dev *pdev,
+ vpci_read_t *read_handler,
+ vpci_write_t *write_handler,
+ unsigned int offset, unsigned int size,
+ void *data);
+int __must_check vpci_remove_register(const struct pci_dev *pdev,
+ unsigned int offset,
+ unsigned int size);
+
+/* Generic read/write handlers for the PCI config space. */
+uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size);
+void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
+ uint32_t data);
+
+/* Per-device vPCI state, referenced from struct pci_dev. */
+struct vpci {
+ /* List of vPCI handlers for a device. */
+ struct list_head handlers;
+ /* Lock held while the list of handlers is walked or modified. */
+ spinlock_t lock;
+};
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--
2.11.0 (Apple Git-81)
Wei Liu
2017-09-21 13:36:25 UTC
Permalink
Post by Roger Pau Monne
This functionality is going to reside in vpci.c (and the corresponding
vpci.h header), and should be arch-agnostic. The handlers introduced
in this patch setup the basic functionality required in order to trap
accesses to the PCI config space, and allow decoding the address and
finding the corresponding handler that should handle the access
(although no handlers are implemented).
Note that the traps to the PCI IO ports registers (0xcf8/0xcfc) are
setup inside of a x86 HVM file, since that's not shared with other
arches.
A new XEN_X86_EMU_VPCI x86 domain flag is added in order to signal Xen
whether a domain should use the newly introduced vPCI handlers, this
is only enabled for PVH Dom0 at the moment.
A very simple user-space test is also provided, so that the basic
functionality of the vPCI traps can be asserted. This has been proven
quite helpful during development, since the logic to handle partial
accesses or accesses that expand across multiple registers is not
trivial.
The handlers for the registers are added to a linked list that's kept
sorted at all times. Both the read and write handlers support accesses
that expand across multiple emulated registers and contain gaps not
emulated.
I am afraid I don't know much about PCI so I can't do meaningful review
of this patch.
Post by Roger Pau Monne
---
---
- Use a spinlock per pci device.
- Use the recently introduced pci_sbdf_t type.
- Fix test harness to use the right handler type and the newly
introduced lock.
- Move the position of the vpci sections in the linker scripts.
- Constify domain and pci_dev in vpci_{read/write}.
- Fix typos in comments.
- Use _XEN_VPCI_H_ as header guard.
- Do not redirect the output of the test.
- Add main.c and emul.h as dependencies of the Makefile target.
- Use the same rule to modify the vpci and list headers.
- Remove underscores from local macro variables.
- Add _check suffix to the test harness multiread function.
- Change the value written by every different size in the multiwrite
test.
- Use { } to initialize the r16 and r20 arrays (instead of { 0 }).
- Perform some of the read checks with the local variable directly.
- Expand some comments.
- Implement a dummy rwlock.
- Guard the linker script changes with CONFIG_HAS_PCI.
- Rename vpci_access_check to vpci_access_allowed and make it return
bool.
- Make hvm_pci_decode_addr return the register as return value.
- Use ~3 instead of 0xfffc to remove the register offset when
checking accesses to IO ports.
- s/head/prev in vpci_add_register.
- Add parentheses around & in vpci_add_register.
- Fix register removal.
- Change the BUGs in vpci_{read/write}_hw helpers to
ASSERT_UNREACHABLE.
- Make merge_result static and change the computation of the mask to
avoid using a uint64_t.
- Modify vpci_read to only read from hardware the not-emulated gaps.
- Remove the vpci_val union and use a uint32_t instead.
- Change handler read type to return a uint32_t instead of modifying
a variable passed by reference.
- Constify the data opaque parameter of read handlers.
- Change the size parameter of the vpci_{read/write} functions to
unsigned int.
- Place the array of initialization handlers in init.rodata or
.rodata depending on whether late-hwdom is enabled.
- Remove the pci_devs lock, assume the Dom0 is well behaved and won't
remove the device while trying to access it.
- Change the recursive spinlock into a rw lock for performance
reasons.
- Fix spaces in container_of macro.
- Implement dummy locking functions.
- Remove the 'current' macro and make current a pointer to the statically
allocated vcpu.
- Remove unneeded parentheses in the pci_conf_readX macros.
- Fix the name of the write test macro.
- Remove the dummy EXPORT_SYMBOL macro (this was needed by the RB
code only).
- Import the max macro.
- Test all possible read/write size combinations with all possible
emulated register sizes.
- Introduce a test for register removal.
- Use a sorted list in order to store the config space handlers.
- Remove some unneeded 'else' branches.
- Make the IO port handlers always return X86EMUL_OKAY, and set the
data to all 1's in case of read failure (writes are simply ignored).
- In hvm_select_ioreq_server reuse local variables when calling
XEN_DMOP_PCI_SBDF.
- Store the pointers to the initialization functions in the .rodata
section.
- Do not ignore the return value of xen_vpci_add_handlers in
setup_one_hwdom_device.
- Remove the vpci_init macro.
- Do not hide the pointers inside of the vpci_{read/write}_t
typedefs.
- Rename priv_data to private in vpci_register.
- Simplify checking for register overlap in vpci_register_cmp.
- Check that the offset and the length match before removing a
register in xen_vpci_remove_register.
- Make vpci_read_hw return a value rather than storing it in a
pointer passed by parameter.
- Handler dispatcher functions vpci_{read/write} no longer return an
error code, errors on reads/writes should be treated like hardware
(writes ignored, reads return all 1's or garbage).
- Make sure pcidevs is locked before calling pci_get_pdev_by_domain.
- Use a recursive spinlock for the vpci lock, so that spin_is_locked
checks that the current CPU is holding the lock.
- Make the code less error-chatty by removing some of the printk's.
- Pass the slot and the function as separate parameters to the
handler dispatchers (instead of passing devfn).
- Allow handlers to be registered with either a read or write
function only, the missing handler will be replaced by a dummy
handler (writes ignored, reads return 1's).
- Introduce PCI_CFG_SPACE_* defines from Linux.
- Simplify the handler dispatchers by removing the recursion, now the
dispatchers iterate over the list of sorted handlers and call them
in order.
- Remove the GENMASK_BYTES, SHIFT_RIGHT_BYTES and ADD_RESULT macros,
and instead provide a merge_result function in order to merge a
register output into a partial result.
- Rename the fields of the vpci_val union to u8/u16/u32.
- Remove the return values from the read/write handlers, errors
should be handled internally and signaled as would be done on
native hardware.
- Remove the usage of the GENMASK macro.
- Generalize the PCI address decoding and use it for IOREQ code also.
- Allow access to cross a word-boundary.
- Add locking.
- Add cleanup to xen_vpci_add_handlers in case of failure.
---
.gitignore | 3 +
tools/libxl/libxl_x86.c | 2 +-
tools/tests/Makefile | 1 +
tools/tests/vpci/Makefile | 37 ++++
tools/tests/vpci/emul.h | 133 +++++++++++
tools/tests/vpci/main.c | 308 ++++++++++++++++++++++++++
xen/arch/arm/xen.lds.S | 12 +
xen/arch/x86/domain.c | 18 +-
xen/arch/x86/hvm/hvm.c | 2 +
xen/arch/x86/hvm/io.c | 103 +++++++++
xen/arch/x86/setup.c | 3 +-
xen/arch/x86/xen.lds.S | 12 +
xen/drivers/Makefile | 2 +-
xen/drivers/passthrough/pci.c | 9 +-
xen/drivers/vpci/Makefile | 1 +
xen/drivers/vpci/vpci.c | 450 ++++++++++++++++++++++++++++++++++++++
xen/include/asm-x86/domain.h | 1 +
xen/include/asm-x86/hvm/io.h | 3 +
xen/include/public/arch-x86/xen.h | 5 +-
xen/include/xen/pci.h | 3 +
xen/include/xen/pci_regs.h | 8 +
xen/include/xen/vpci.h | 54 +++++
22 files changed, 1161 insertions(+), 9 deletions(-)
create mode 100644 tools/tests/vpci/Makefile
create mode 100644 tools/tests/vpci/emul.h
create mode 100644 tools/tests/vpci/main.c
create mode 100644 xen/drivers/vpci/Makefile
create mode 100644 xen/drivers/vpci/vpci.c
create mode 100644 xen/include/xen/vpci.h
diff --git a/.gitignore b/.gitignore
index cc16649457..1c670b27d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -240,6 +240,9 @@ tools/tests/regression/build/*
tools/tests/regression/downloads/*
tools/tests/mem-sharing/memshrtool
tools/tests/mce-test/tools/xen-mceinj
+tools/tests/vpci/list.h
+tools/tests/vpci/vpci.[hc]
+tools/tests/vpci/test_vpci
tools/xcutils/lsevtchn
tools/xcutils/readnotes
tools/xenbackendd/_paths.h
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index 455f6f0bed..dd7fc78a99 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -11,7 +11,7 @@ int libxl__arch_domain_prepare_config(libxl__gc *gc,
if (d_config->c_info.type == LIBXL_DOMAIN_TYPE_HVM) {
if (d_config->b_info.device_model_version !=
LIBXL_DEVICE_MODEL_VERSION_NONE) {
- xc_config->emulation_flags = XEN_X86_EMU_ALL;
+ xc_config->emulation_flags = (XEN_X86_EMU_ALL & ~XEN_X86_EMU_VPCI);
} else if (libxl_defbool_val(d_config->b_info.u.hvm.apic)) {
/*
* HVM guests without device model may want
diff --git a/tools/tests/Makefile b/tools/tests/Makefile
index 7162945121..f6942a93fb 100644
--- a/tools/tests/Makefile
+++ b/tools/tests/Makefile
@@ -13,6 +13,7 @@ endif
SUBDIRS-$(CONFIG_X86) += x86_emulator
SUBDIRS-y += xen-access
SUBDIRS-y += xenstore
+SUBDIRS-$(CONFIG_HAS_PCI) += vpci
.PHONY: all clean install distclean uninstall
all clean distclean: %: subdirs-%
diff --git a/tools/tests/vpci/Makefile b/tools/tests/vpci/Makefile
new file mode 100644
index 0000000000..e45fcb5cd9
--- /dev/null
+++ b/tools/tests/vpci/Makefile
@@ -0,0 +1,37 @@
+XEN_ROOT=$(CURDIR)/../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+TARGET := test_vpci
+
+.PHONY: all
+all: $(TARGET)
+
+.PHONY: run
+run: $(TARGET)
+ ./$(TARGET)
Is such a target useful? It will possibly break if you invoke it outside
of this directory.
Post by Roger Pau Monne
+
+$(TARGET): vpci.c vpci.h list.h main.c emul.h
+
+.PHONY: clean
+ rm -rf $(TARGET) *.o *~ vpci.h vpci.c list.h
+
+.PHONY: distclean
+distclean: clean
+
+.PHONY: install
+
+vpci.c: $(XEN_ROOT)/xen/drivers/vpci/vpci.c
+ # Trick the compiler so it doesn't complain about missing symbols
+ sed -e '/#include/d' \
+ -e '1s;^;#include "emul.h"\
+ vpci_register_init_t *const __start_vpci_array[1]\;\
+ vpci_register_init_t *const __end_vpci_array[1]\;\
This is prone to breakage. I think it would be better to just have a
local stub file. Or maybe define it in your main.c?
Post by Roger Pau Monne
+
+int
+main(int argc, char **argv)
+{
+ /* Index storage by offset. */
+ uint32_t r0 = 0xdeadbeef;
+ uint8_t r5 = 0xef;
+ uint8_t r6 = 0xbe;
+ uint8_t r7 = 0xef;
+ uint16_t r12 = 0x8696;
+ uint8_t r16[4] = { };
+ uint16_t r20[2] = { };
Need to put 0 in the brackets.

The code seems rather self-contained. Maybe we can write a fuzzer for
it?
Roger Pau Monné
2017-09-22 16:50:39 UTC
Permalink
Post by Wei Liu
Post by Roger Pau Monne
This functionality is going to reside in vpci.c (and the corresponding
vpci.h header), and should be arch-agnostic. The handlers introduced
in this patch setup the basic functionality required in order to trap
accesses to the PCI config space, and allow decoding the address and
finding the corresponding handler that should handle the access
(although no handlers are implemented).
Note that the traps to the PCI IO ports registers (0xcf8/0xcfc) are
setup inside of a x86 HVM file, since that's not shared with other
arches.
A new XEN_X86_EMU_VPCI x86 domain flag is added in order to signal Xen
whether a domain should use the newly introduced vPCI handlers, this
is only enabled for PVH Dom0 at the moment.
A very simple user-space test is also provided, so that the basic
functionality of the vPCI traps can be asserted. This has been proven
quite helpful during development, since the logic to handle partial
accesses or accesses that expand across multiple registers is not
trivial.
The handlers for the registers are added to a linked list that's kept
sorted at all times. Both the read and write handlers support accesses
that expand across multiple emulated registers and contain gaps not
emulated.
I am afraid I don't know much about PCI so I can't do meaningful review
of this patch.
Thanks.
Post by Wei Liu
Post by Roger Pau Monne
---
---
- Use a spinlock per pci device.
- Use the recently introduced pci_sbdf_t type.
- Fix test harness to use the right handler type and the newly
introduced lock.
- Move the position of the vpci sections in the linker scripts.
- Constify domain and pci_dev in vpci_{read/write}.
- Fix typos in comments.
- Use _XEN_VPCI_H_ as header guard.
- Do not redirect the output of the test.
- Add main.c and emul.h as dependencies of the Makefile target.
- Use the same rule to modify the vpci and list headers.
- Remove underscores from local macro variables.
- Add _check suffix to the test harness multiread function.
- Change the value written by every different size in the multiwrite
test.
- Use { } to initialize the r16 and r20 arrays (instead of { 0 }).
- Perform some of the read checks with the local variable directly.
- Expand some comments.
- Implement a dummy rwlock.
- Guard the linker script changes with CONFIG_HAS_PCI.
- Rename vpci_access_check to vpci_access_allowed and make it return
bool.
- Make hvm_pci_decode_addr return the register as return value.
- Use ~3 instead of 0xfffc to remove the register offset when
checking accesses to IO ports.
- s/head/prev in vpci_add_register.
- Add parentheses around & in vpci_add_register.
- Fix register removal.
- Change the BUGs in vpci_{read/write}_hw helpers to
ASSERT_UNREACHABLE.
- Make merge_result static and change the computation of the mask to
avoid using a uint64_t.
- Modify vpci_read to only read from hardware the not-emulated gaps.
- Remove the vpci_val union and use a uint32_t instead.
- Change handler read type to return a uint32_t instead of modifying
a variable passed by reference.
- Constify the data opaque parameter of read handlers.
- Change the size parameter of the vpci_{read/write} functions to
unsigned int.
- Place the array of initialization handlers in init.rodata or
.rodata depending on whether late-hwdom is enabled.
- Remove the pci_devs lock, assume the Dom0 is well behaved and won't
remove the device while trying to access it.
- Change the recursive spinlock into a rw lock for performance
reasons.
- Fix spaces in container_of macro.
- Implement dummy locking functions.
- Remove the 'current' macro and make current a pointer to the statically
allocated vcpu.
- Remove unneeded parentheses in the pci_conf_readX macros.
- Fix the name of the write test macro.
- Remove the dummy EXPORT_SYMBOL macro (this was needed by the RB
code only).
- Import the max macro.
- Test all possible read/write size combinations with all possible
emulated register sizes.
- Introduce a test for register removal.
- Use a sorted list in order to store the config space handlers.
- Remove some unneeded 'else' branches.
- Make the IO port handlers always return X86EMUL_OKAY, and set the
data to all 1's in case of read failure (write are simply ignored).
- In hvm_select_ioreq_server reuse local variables when calling
XEN_DMOP_PCI_SBDF.
- Store the pointers to the initialization functions in the .rodata
section.
- Do not ignore the return value of xen_vpci_add_handlers in
setup_one_hwdom_device.
- Remove the vpci_init macro.
- Do not hide the pointers inside of the vpci_{read/write}_t
typedefs.
- Rename priv_data to private in vpci_register.
- Simplify checking for register overlap in vpci_register_cmp.
- Check that the offset and the length match before removing a
register in xen_vpci_remove_register.
- Make vpci_read_hw return a value rather than storing it in a
pointer passed by parameter.
- Handler dispatcher functions vpci_{read/write} no longer return an
error code, errors on reads/writes should be treated like hardware
(writes ignored, reads return all 1's or garbage).
- Make sure pcidevs is locked before calling pci_get_pdev_by_domain.
- Use a recursive spinlock for the vpci lock, so that spin_is_locked
checks that the current CPU is holding the lock.
- Make the code less error-chatty by removing some of the printk's.
- Pass the slot and the function as separate parameters to the
handler dispatchers (instead of passing devfn).
- Allow handlers to be registered with either a read or write
function only, the missing handler will be replaced by a dummy
handler (writes ignored, reads return 1's).
- Introduce PCI_CFG_SPACE_* defines from Linux.
- Simplify the handler dispatchers by removing the recursion, now the
dispatchers iterate over the list of sorted handlers and call them
in order.
- Remove the GENMASK_BYTES, SHIFT_RIGHT_BYTES and ADD_RESULT macros,
and instead provide a merge_result function in order to merge a
register output into a partial result.
- Rename the fields of the vpci_val union to u8/u16/u32.
- Remove the return values from the read/write handlers, errors
should be handled internally and signaled as would be done on
native hardware.
- Remove the usage of the GENMASK macro.
- Generalize the PCI address decoding and use it for IOREQ code also.
- Allow access to cross a word-boundary.
- Add locking.
- Add cleanup to xen_vpci_add_handlers in case of failure.
---
.gitignore | 3 +
tools/libxl/libxl_x86.c | 2 +-
tools/tests/Makefile | 1 +
tools/tests/vpci/Makefile | 37 ++++
tools/tests/vpci/emul.h | 133 +++++++++++
tools/tests/vpci/main.c | 308 ++++++++++++++++++++++++++
xen/arch/arm/xen.lds.S | 12 +
xen/arch/x86/domain.c | 18 +-
xen/arch/x86/hvm/hvm.c | 2 +
xen/arch/x86/hvm/io.c | 103 +++++++++
xen/arch/x86/setup.c | 3 +-
xen/arch/x86/xen.lds.S | 12 +
xen/drivers/Makefile | 2 +-
xen/drivers/passthrough/pci.c | 9 +-
xen/drivers/vpci/Makefile | 1 +
xen/drivers/vpci/vpci.c | 450 ++++++++++++++++++++++++++++++++++++++
xen/include/asm-x86/domain.h | 1 +
xen/include/asm-x86/hvm/io.h | 3 +
xen/include/public/arch-x86/xen.h | 5 +-
xen/include/xen/pci.h | 3 +
xen/include/xen/pci_regs.h | 8 +
xen/include/xen/vpci.h | 54 +++++
22 files changed, 1161 insertions(+), 9 deletions(-)
create mode 100644 tools/tests/vpci/Makefile
create mode 100644 tools/tests/vpci/emul.h
create mode 100644 tools/tests/vpci/main.c
create mode 100644 xen/drivers/vpci/Makefile
create mode 100644 xen/drivers/vpci/vpci.c
create mode 100644 xen/include/xen/vpci.h
diff --git a/.gitignore b/.gitignore
index cc16649457..1c670b27d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -240,6 +240,9 @@ tools/tests/regression/build/*
tools/tests/regression/downloads/*
tools/tests/mem-sharing/memshrtool
tools/tests/mce-test/tools/xen-mceinj
+tools/tests/vpci/list.h
+tools/tests/vpci/vpci.[hc]
+tools/tests/vpci/test_vpci
tools/xcutils/lsevtchn
tools/xcutils/readnotes
tools/xenbackendd/_paths.h
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index 455f6f0bed..dd7fc78a99 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -11,7 +11,7 @@ int libxl__arch_domain_prepare_config(libxl__gc *gc,
if (d_config->c_info.type == LIBXL_DOMAIN_TYPE_HVM) {
if (d_config->b_info.device_model_version !=
LIBXL_DEVICE_MODEL_VERSION_NONE) {
- xc_config->emulation_flags = XEN_X86_EMU_ALL;
+ xc_config->emulation_flags = (XEN_X86_EMU_ALL & ~XEN_X86_EMU_VPCI);
} else if (libxl_defbool_val(d_config->b_info.u.hvm.apic)) {
/*
* HVM guests without device model may want
diff --git a/tools/tests/Makefile b/tools/tests/Makefile
index 7162945121..f6942a93fb 100644
--- a/tools/tests/Makefile
+++ b/tools/tests/Makefile
@@ -13,6 +13,7 @@ endif
SUBDIRS-$(CONFIG_X86) += x86_emulator
SUBDIRS-y += xen-access
SUBDIRS-y += xenstore
+SUBDIRS-$(CONFIG_HAS_PCI) += vpci
.PHONY: all clean install distclean uninstall
all clean distclean: %: subdirs-%
diff --git a/tools/tests/vpci/Makefile b/tools/tests/vpci/Makefile
new file mode 100644
index 0000000000..e45fcb5cd9
--- /dev/null
+++ b/tools/tests/vpci/Makefile
@@ -0,0 +1,37 @@
+XEN_ROOT=$(CURDIR)/../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+TARGET := test_vpci
+
+.PHONY: all
+all: $(TARGET)
+
+.PHONY: run
+run: $(TARGET)
+ ./$(TARGET)
Is such target useful? It will possibly break if you invoke it outside
of this directory.
Seems to work fine if I do:

gmake -C tools/tests/vpci run
Post by Wei Liu
Post by Roger Pau Monne
+
+$(TARGET): vpci.c vpci.h list.h main.c emul.h
+
+.PHONY: clean
+ rm -rf $(TARGET) *.o *~ vpci.h vpci.c list.h
+
+.PHONY: distclean
+distclean: clean
+
+.PHONY: install
+
+vpci.c: $(XEN_ROOT)/xen/drivers/vpci/vpci.c
+ # Trick the compiler so it doesn't complain about missing symbols
+ sed -e '/#include/d' \
+ -e '1s;^;#include "emul.h"\
+ vpci_register_init_t *const __start_vpci_array[1]\;\
+ vpci_register_init_t *const __end_vpci_array[1]\;\
This is prone to breakage. I think it would be better to just have a
local stub file. Or maybe define it in your main.c?
I could add this to main.c. I haven't done so because the above is
done for the linking to succeed, but it's not used in any way by the
test harness itself, or by any of the functions called by it.
Post by Wei Liu
Post by Roger Pau Monne
+
+int
+main(int argc, char **argv)
+{
+ /* Index storage by offset. */
+ uint32_t r0 = 0xdeadbeef;
+ uint8_t r5 = 0xef;
+ uint8_t r6 = 0xbe;
+ uint8_t r7 = 0xef;
+ uint16_t r12 = 0x8696;
+ uint8_t r16[4] = { };
+ uint16_t r20[2] = { };
Need to put 0 in the brackets.
Jan requested to remove explicitly setting it to 0:

https://marc.info/?l=xen-devel&m=149995672426469

The end result is the same, when doing initialization non-set fields
get set to 0.
Post by Wei Liu
The code seems rather self-contained. Maybe we can write a fuzzer for
it?
Probably, but I don't think we can expect the fuzzer to set the
handlers, in which case we would have to pre-populate the PCI space
with some handlers in order to perform the fuzzing, which kind of
limits its usefulness.

Roger.
Jan Beulich
2017-10-04 08:30:38 UTC
Permalink
Post by Roger Pau Monne
+static int vpci_portio_read(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size, uint64_t *data)
+{
+ struct domain *d = current->domain;
+ unsigned int reg;
+ pci_sbdf_t sbdf;
+ uint32_t cf8;
+
+ *data = ~(uint64_t)0;
+
+ if ( addr == 0xcf8 )
+ {
+ ASSERT(size == 4);
+ *data = d->arch.hvm_domain.pci_cf8;
+ return X86EMUL_OKAY;
+ }
+
+ cf8 = ACCESS_ONCE(d->arch.hvm_domain.pci_cf8);
+ if ( !CF8_ENABLED(cf8) )
+ return X86EMUL_OKAY;
Why is this OKAY instead of UNHANDLEABLE? The access is supposed to be
forwarded to qemu if it's not a config space one. Same in the write path
then.
Post by Roger Pau Monne
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -124,6 +124,12 @@ SECTIONS
__param_start = .;
*(.data.param)
__param_end = .;
+
+#if defined(CONFIG_HAS_PCI) && defined(CONFIG_LATE_HWDOM)
+ __start_vpci_array = .;
+ *(.data.vpci)
+ __end_vpci_array = .;
+#endif
} :text
#if defined(BUILD_ID)
@@ -213,6 +219,12 @@ SECTIONS
*(.init_array)
*(SORT(.init_array.*))
__ctors_end = .;
+
+#if defined(CONFIG_HAS_PCI) && !defined(CONFIG_LATE_HWDOM)
+ __start_vpci_array = .;
+ *(.data.vpci)
+ __end_vpci_array = .;
+#endif
} :text
Suitable alignment needs to be enforced in both cases, or else we risk
someone adding something immediately ahead of one of your insertions,
making __start_vpci_array no longer point to the first entry.
Post by Roger Pau Monne
@@ -1052,9 +1053,10 @@ static void __hwdom_init setup_one_hwdom_device(const struct setup_hwdom *ctxt,
struct pci_dev *pdev)
{
u8 devfn = pdev->devfn;
+ int err;
do {
- int err = ctxt->handler(devfn, pdev);
+ err = ctxt->handler(devfn, pdev);
if ( err )
Please also remove the now stray blank line.
Post by Roger Pau Monne
+int vpci_add_register(const struct pci_dev *pdev, vpci_read_t *read_handler,
+ vpci_write_t *write_handler, unsigned int offset,
+ unsigned int size, void *data)
+{
+ struct list_head *prev;
+ struct vpci_register *r;
+
+ /* Some sanity checks. */
+ if ( (size != 1 && size != 2 && size != 4) ||
+ offset >= PCI_CFG_SPACE_EXP_SIZE || (offset & (size - 1)) ||
+ (!read_handler && !write_handler) )
+ return -EINVAL;
+
+ r = xmalloc(struct vpci_register);
+ if ( !r )
+ return -ENOMEM;
+
+ r->read = read_handler ?: vpci_ignored_read;
+ r->write = write_handler ?: vpci_ignored_write;
+ r->size = size;
+ r->offset = offset;
+ r->private = data;
+
+ spin_lock(&pdev->vpci->lock);
+
+ /* The list of handlers must be kept sorted at all times. */
+ list_for_each ( prev, &pdev->vpci->handlers )
+ {
+ const struct vpci_register *this =
+ list_entry(prev, const struct vpci_register, node);
+ int cmp = vpci_register_cmp(r, this);
+
+ if ( cmp < 0 )
+ break;
+ if ( cmp == 0 )
+ {
+ spin_unlock(&pdev->vpci->lock);
+ xfree(r);
+ return -EEXIST;
+ }
+ }
+
+ list_add_tail(&r->node, prev);
+ spin_unlock(&pdev->vpci->lock);
+
+ return 0;
+}
Looking at this and its remove counterpart it is not (no longer?) clear
why they both take a struct pci_dev * as parameter - struct vpci * would
fully suffice, and would eliminate the question on whether functions
like these should have the respective parameters const-qualified.
Post by Roger Pau Monne
+/*
+ * Merge new data into a partial result.
+ *
+ * Copy the value found in 'new' from [0, size) left shifted by
+ * 'offset' into 'data'.
+ */
+static uint32_t merge_result(uint32_t data, uint32_t new, unsigned int size,
+ unsigned int offset)
+{
+ uint32_t mask = 0xffffffff >> (32 - 8 * size);
+
+ return (data & ~(mask << (offset * 8))) | ((new & mask) << (offset * 8));
+}
If a function like this one has a relatively long comment, I think that
comment should clarify that both size and offset are byte-granular.
Especially for offset (used for shifting), bit granularity would
otherwise seem more natural to me.

Jan
Roger Pau Monné
2017-10-04 09:24:46 UTC
Permalink
Post by Jan Beulich
Post by Roger Pau Monne
+static int vpci_portio_read(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size, uint64_t *data)
+{
+ struct domain *d = current->domain;
+ unsigned int reg;
+ pci_sbdf_t sbdf;
+ uint32_t cf8;
+
+ *data = ~(uint64_t)0;
+
+ if ( addr == 0xcf8 )
+ {
+ ASSERT(size == 4);
+ *data = d->arch.hvm_domain.pci_cf8;
+ return X86EMUL_OKAY;
+ }
+
+ cf8 = ACCESS_ONCE(d->arch.hvm_domain.pci_cf8);
+ if ( !CF8_ENABLED(cf8) )
+ return X86EMUL_OKAY;
Why is this OKAY instead of UNHANDLEABLE? The access is supposed to be
forwarded to qemu if it's not a config space one. Same in the write path
then.
No, I don't think this should be forwarded to QEMU. It is a config
space access (because vpci_portio_accept returned true). But the value
in CF8 doesn't have the enabled bit set, hence the access is
discarded.
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -124,6 +124,12 @@ SECTIONS
__param_start = .;
*(.data.param)
__param_end = .;
+
+#if defined(CONFIG_HAS_PCI) && defined(CONFIG_LATE_HWDOM)
+ __start_vpci_array = .;
+ *(.data.vpci)
+ __end_vpci_array = .;
+#endif
} :text
#if defined(BUILD_ID)
@@ -213,6 +219,12 @@ SECTIONS
*(.init_array)
*(SORT(.init_array.*))
__ctors_end = .;
+
+#if defined(CONFIG_HAS_PCI) && !defined(CONFIG_LATE_HWDOM)
+ __start_vpci_array = .;
+ *(.data.vpci)
+ __end_vpci_array = .;
+#endif
} :text
Suitable alignment needs to be enforced in both cases, or else we risk
someone adding something immediately ahead of one of your insertions,
making __start_vpci_array no longer point to the first entry.
OK, I've used . = ALIGN(POINTER_ALIGN); for both x86 and ARM.

Thanks, Roger.
Jan Beulich
2017-10-04 09:54:18 UTC
Permalink
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
+static int vpci_portio_read(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size, uint64_t *data)
+{
+ struct domain *d = current->domain;
+ unsigned int reg;
+ pci_sbdf_t sbdf;
+ uint32_t cf8;
+
+ *data = ~(uint64_t)0;
+
+ if ( addr == 0xcf8 )
+ {
+ ASSERT(size == 4);
+ *data = d->arch.hvm_domain.pci_cf8;
+ return X86EMUL_OKAY;
+ }
+
+ cf8 = ACCESS_ONCE(d->arch.hvm_domain.pci_cf8);
+ if ( !CF8_ENABLED(cf8) )
+ return X86EMUL_OKAY;
Why is this OKAY instead of UNHANDLEABLE? The access is supposed to be
forwarded to qemu if it's not a config space one. Same in the write path
then.
No, I don't think this should be forwarded to QEMU. It is a config
space access (because vpci_portio_accept returned true). But the value
in CF8 doesn't have the enabled bit set, hence the access is
discarded.
With the enable bit clear it is my understanding that this is then
_not_ a config space access. vpci_portio_accept() simply doesn't
have enough information to tell.

Jan
Roger Pau Monné
2017-10-04 10:32:31 UTC
Permalink
Post by Jan Beulich
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
+static int vpci_portio_read(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size, uint64_t *data)
+{
+ struct domain *d = current->domain;
+ unsigned int reg;
+ pci_sbdf_t sbdf;
+ uint32_t cf8;
+
+ *data = ~(uint64_t)0;
+
+ if ( addr == 0xcf8 )
+ {
+ ASSERT(size == 4);
+ *data = d->arch.hvm_domain.pci_cf8;
+ return X86EMUL_OKAY;
+ }
+
+ cf8 = ACCESS_ONCE(d->arch.hvm_domain.pci_cf8);
+ if ( !CF8_ENABLED(cf8) )
+ return X86EMUL_OKAY;
Why is this OKAY instead of UNHANDLEABLE? The access is supposed to be
forwarded to qemu if it's not a config space one. Same in the write path
then.
No, I don't think this should be forwarded to QEMU. It is a config
space access (because vpci_portio_accept returned true). But the value
in CF8 doesn't have the enabled bit set, hence the access is
discarded.
With the enable bit clear it is my understanding that this is then
_not_ a config space access. vpci_portio_accept() simply doesn't
have enough information to tell.
OK, it was my understanding that accesses to cf8/cfc were only used
by the PCI config space.

Roger.
Jan Beulich
2017-10-04 11:33:55 UTC
Permalink
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
+static int vpci_portio_read(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size, uint64_t *data)
+{
+ struct domain *d = current->domain;
+ unsigned int reg;
+ pci_sbdf_t sbdf;
+ uint32_t cf8;
+
+ *data = ~(uint64_t)0;
+
+ if ( addr == 0xcf8 )
+ {
+ ASSERT(size == 4);
+ *data = d->arch.hvm_domain.pci_cf8;
+ return X86EMUL_OKAY;
+ }
+
+ cf8 = ACCESS_ONCE(d->arch.hvm_domain.pci_cf8);
+ if ( !CF8_ENABLED(cf8) )
+ return X86EMUL_OKAY;
Why is this OKAY instead of UNHANDLEABLE? The access is supposed to be
forwarded to qemu if it's not a config space one. Same in the write path
then.
No, I don't think this should be forwarded to QEMU. It is a config
space access (because vpci_portio_accept returned true). But the value
in CF8 doesn't have the enabled bit set, hence the access is
discarded.
With the enable bit clear it is my understanding that this is then
_not_ a config space access. vpci_portio_accept() simply doesn't
have enough information to tell.
OK, it was my understanding that accesses to cf8/cfc were only used
by the PCI config space.
Just like with the overlaid byte accesses to port cf9, other such
overlays could exist too; iirc back when PCI was introduced the
enable bit was used to make sure other uses of this port range
remained reasonably unaffected.

Jan
Roger Pau Monne
2017-09-19 15:29:32 UTC
Permalink
This function allows iterating over a rangeset while removing the
processed regions.

It will be used by the following patches in order to store memory
regions in rangesets, and remove them while iterating.

Signed-off-by: Roger Pau Monné <***@citrix.com>
---
Cc: George Dunlap <***@eu.citrix.com>
Cc: Ian Jackson <***@eu.citrix.com>
Cc: Jan Beulich <***@suse.com>
Cc: Konrad Rzeszutek Wilk <***@oracle.com>
Cc: Stefano Stabellini <***@kernel.org>
Cc: Tim Deegan <***@xen.org>
Cc: Wei Liu <***@citrix.com>
---
Changes since v5:
- New in this version.
---
xen/common/rangeset.c | 28 ++++++++++++++++++++++++++++
xen/include/xen/rangeset.h | 4 ++++
2 files changed, 32 insertions(+)

diff --git a/xen/common/rangeset.c b/xen/common/rangeset.c
index 6c6293c15c..fd4a6b3384 100644
--- a/xen/common/rangeset.c
+++ b/xen/common/rangeset.c
@@ -298,6 +298,34 @@ int rangeset_report_ranges(
return rc;
}

+int rangeset_consume_ranges(
+ struct rangeset *r,
+ int (*cb)(unsigned long s, unsigned long e, void *, unsigned long *c),
+ void *ctxt)
+{
+ int rc = 0;
+
+ write_lock(&r->lock);
+ while ( !rangeset_is_empty(r) )
+ {
+ unsigned long consumed = 0;
+ struct range *x = first_range(r);
+
+ rc = cb(x->s, x->e, ctxt, &consumed);
+
+ ASSERT(consumed <= x->e - x->s + 1);
+ x->s += consumed;
+ if ( x->s > x->e )
+ destroy_range(r, x);
+
+ if ( rc )
+ break;
+ }
+ write_unlock(&r->lock);
+
+ return rc;
+}
+
int rangeset_add_singleton(
struct rangeset *r, unsigned long s)
{
diff --git a/xen/include/xen/rangeset.h b/xen/include/xen/rangeset.h
index aa6408248b..dfdb193800 100644
--- a/xen/include/xen/rangeset.h
+++ b/xen/include/xen/rangeset.h
@@ -67,6 +67,10 @@ bool_t __must_check rangeset_overlaps_range(
int rangeset_report_ranges(
struct rangeset *r, unsigned long s, unsigned long e,
int (*cb)(unsigned long s, unsigned long e, void *), void *ctxt);
+int rangeset_consume_ranges(
+ struct rangeset *r,
+ int (*cb)(unsigned long s, unsigned long e, void *, unsigned long *c),
+ void *ctxt);

/* Add/remove/query a single number. */
int __must_check rangeset_add_singleton(
--
2.11.0 (Apple Git-81)
Wei Liu
2017-09-21 13:53:54 UTC
Permalink
Post by Roger Pau Monne
This function allows to iterate over a rangeset while removing the
processed regions.
It will be used by the following patches in order to store memory
regions in rangesets, and remove them while iterating.
---
---
- New in this version.
---
xen/common/rangeset.c | 28 ++++++++++++++++++++++++++++
xen/include/xen/rangeset.h | 4 ++++
2 files changed, 32 insertions(+)
diff --git a/xen/common/rangeset.c b/xen/common/rangeset.c
index 6c6293c15c..fd4a6b3384 100644
--- a/xen/common/rangeset.c
+++ b/xen/common/rangeset.c
@@ -298,6 +298,34 @@ int rangeset_report_ranges(
return rc;
}
I think you need to document the behaviour of this new function due to
its destructive nature.

Something like:

Iterate through the range within a range set. Call cb on each range
provided. Bail on first error. Destroy the range processed when cb
has consumed the whole range.

Though without reading further I don't know why cb will only consume
part of the range but not all of it all the time.
Post by Roger Pau Monne
+int rangeset_consume_ranges(
+ struct rangeset *r,
+ int (*cb)(unsigned long s, unsigned long e, void *, unsigned long *c),
+ void *ctxt)
+{
+ int rc = 0;
+
+ write_lock(&r->lock);
+ while ( !rangeset_is_empty(r) )
+ {
+ unsigned long consumed = 0;
+ struct range *x = first_range(r);
+
+ rc = cb(x->s, x->e, ctxt, &consumed);
+
+ ASSERT(consumed <= x->e - x->s + 1);
+ x->s += consumed;
+ if ( x->s > x->e )
+ destroy_range(r, x);
+
+ if ( rc )
+ break;
+ }
+ write_unlock(&r->lock);
+
+ return rc;
+}
+
int rangeset_add_singleton(
struct rangeset *r, unsigned long s)
{
diff --git a/xen/include/xen/rangeset.h b/xen/include/xen/rangeset.h
index aa6408248b..dfdb193800 100644
--- a/xen/include/xen/rangeset.h
+++ b/xen/include/xen/rangeset.h
@@ -67,6 +67,10 @@ bool_t __must_check rangeset_overlaps_range(
int rangeset_report_ranges(
struct rangeset *r, unsigned long s, unsigned long e,
int (*cb)(unsigned long s, unsigned long e, void *), void *ctxt);
+int rangeset_consume_ranges(
+ struct rangeset *r,
+ int (*cb)(unsigned long s, unsigned long e, void *, unsigned long *c),
+ void *ctxt);
/* Add/remove/query a single number. */
int __must_check rangeset_add_singleton(
--
2.11.0 (Apple Git-81)
Roger Pau Monné
2017-09-21 14:42:49 UTC
Permalink
Post by Wei Liu
Post by Roger Pau Monne
This function allows to iterate over a rangeset while removing the
processed regions.
It will be used by the following patches in order to store memory
regions in rangesets, and remove them while iterating.
---
---
- New in this version.
---
xen/common/rangeset.c | 28 ++++++++++++++++++++++++++++
xen/include/xen/rangeset.h | 4 ++++
2 files changed, 32 insertions(+)
diff --git a/xen/common/rangeset.c b/xen/common/rangeset.c
index 6c6293c15c..fd4a6b3384 100644
--- a/xen/common/rangeset.c
+++ b/xen/common/rangeset.c
@@ -298,6 +298,34 @@ int rangeset_report_ranges(
return rc;
}
I think you need to document the behaviour of this new function due to
its destructive nature.
Iterate through the range within a range set. Call cb on each range
provided. Bail on first error. Destroy the range processed when cb
has consumed the whole range.
OK, I thought that the 'consume' in the name was enough, but now that
you have written the comment I certainly don't mind adding it ;).
Post by Wei Liu
Though without reading further I don't know why cb will only consume
part of the range but not all of it all the time.
I guess you have to look at the next patch and its usage. This will
be used to store all the MMIO areas that need to be mapped into a
domain p2m.

Some of the ranges might be very big (BARs from gfx cards for
example), and might require preemption in order to map them, hence the
emulated PCI code needs a way to store its progress, and that's done
by partially consuming a range.

Thanks, Roger.
Wei Liu
2017-09-22 09:51:05 UTC
Permalink
Post by Roger Pau Monné
Post by Wei Liu
Post by Roger Pau Monne
This function allows to iterate over a rangeset while removing the
processed regions.
It will be used by the following patches in order to store memory
regions in rangesets, and remove them while iterating.
---
---
- New in this version.
---
xen/common/rangeset.c | 28 ++++++++++++++++++++++++++++
xen/include/xen/rangeset.h | 4 ++++
2 files changed, 32 insertions(+)
diff --git a/xen/common/rangeset.c b/xen/common/rangeset.c
index 6c6293c15c..fd4a6b3384 100644
--- a/xen/common/rangeset.c
+++ b/xen/common/rangeset.c
@@ -298,6 +298,34 @@ int rangeset_report_ranges(
return rc;
}
I think you need to document the behaviour of this new function due to
its destructive nature.
Iterate through the range within a range set. Call cb on each range
provided. Bail on first error. Destroy the range processed when cb
has consumed the whole range.
OK, I thought that the 'consume' in the name was enough, but now that
you have written the comment I certainly don't mind adding it ;).
Post by Wei Liu
Though without reading further I don't know why cb will only consume
part of the range but not all of it all the time.
I guess you have to look at the next patch and its usage. This will
be used to store all the MMIO areas that need to be mapped into a
domain p2m.
Some of the ranges might be very big (BARs from gfx cards for
example), and might require preemption in order to map them, hence the
emulated PCI code needs a way to store its progress, and that's done
by partially consuming a range.
Ah, so the goal is indeed to consume all the ranges within the rangeset.
This makes more sense now.
Jan Beulich
2017-10-04 08:32:58 UTC
Permalink
Post by Roger Pau Monne
This function allows to iterate over a rangeset while removing the
processed regions.
It will be used by the following patches in order to store memory
regions in rangesets, and remove them while iterating.
This really only repeats what the first paragraph already says. Instead
you want to state why this is actually needed (to be able to split
processing aiui).
Post by Roger Pau Monne
--- a/xen/common/rangeset.c
+++ b/xen/common/rangeset.c
@@ -298,6 +298,34 @@ int rangeset_report_ranges(
return rc;
}
+int rangeset_consume_ranges(
+ struct rangeset *r,
+ int (*cb)(unsigned long s, unsigned long e, void *, unsigned long *c),
+ void *ctxt)
+{
+ int rc = 0;
+
+ write_lock(&r->lock);
+ while ( !rangeset_is_empty(r) )
+ {
+ unsigned long consumed = 0;
+ struct range *x = first_range(r);
+
+ rc = cb(x->s, x->e, ctxt, &consumed);
+
+ ASSERT(consumed <= x->e - x->s + 1);
+ x->s += consumed;
+ if ( x->s > x->e )
+ destroy_range(r, x);
+
+ if ( rc )
+ break;
+ }
+ write_unlock(&r->lock);
+
+ return rc;
+}
Leaving the rangeset populated in case of error (other than -ERESTART)
looks to be potentially problematic/unexpected. Please at least add a
comment in the header stating this. Perhaps negative vs positive rc
from the callback could be used to direct intended behavior.
Post by Roger Pau Monne
--- a/xen/include/xen/rangeset.h
+++ b/xen/include/xen/rangeset.h
@@ -67,6 +67,10 @@ bool_t __must_check rangeset_overlaps_range(
int rangeset_report_ranges(
struct rangeset *r, unsigned long s, unsigned long e,
int (*cb)(unsigned long s, unsigned long e, void *), void *ctxt);
+int rangeset_consume_ranges(
+ struct rangeset *r,
+ int (*cb)(unsigned long s, unsigned long e, void *, unsigned long *c),
+ void *ctxt);
Indentation.

Jan


_______________________________________________
Xen-devel mailing list
Roger Pau Monne
2017-09-19 15:29:30 UTC
Permalink
So that it can be called from outside in order to get the size of regular PCI
BARs. This will be required in order to map the BARs from PCI devices into PVH
Dom0 p2m.

Signed-off-by: Roger Pau Monné <***@citrix.com>
Reviewed-by: Jan Beulich <***@suse.com>
---
Cc: Jan Beulich <***@suse.com>
---
Changes since v5:
- Introduce a flags field for pci_size_mem_bar.
- Use pci_sbdf_t.

Changes since v4:
- Restore printing whether the BAR is from a vf.
- Make the psize pointer parameter not optional.
- s/u64/uint64_t.
- Remove some unneeded parentheses.
- Assert the return value is never 0.
- Use the newly introduced pci_sbdf_t type.

Changes since v3:
- Rename function to size BARs to pci_size_mem_bar.
- Change the parameters passed to the function. Pass the position and
whether the BAR is the last one, instead of the (base, max_bars,
*index) tuple.
- Make the function return the number of BARs consumed (1 for 32b, 2
for 64b BARs).
- Change the dprintk back to printk.
- Do not log another error message in pci_add_device in case
pci_size_mem_bar fails.
---
xen/drivers/passthrough/pci.c | 98 ++++++++++++++++++++++++++++---------------
xen/include/xen/pci.h | 4 ++
2 files changed, 68 insertions(+), 34 deletions(-)

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 975485fe05..ba58b4d0cc 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -603,6 +603,56 @@ static int iommu_add_device(struct pci_dev *pdev);
static int iommu_enable_device(struct pci_dev *pdev);
static int iommu_remove_device(struct pci_dev *pdev);

+int pci_size_mem_bar(pci_sbdf_t sbdf, unsigned int pos, bool last,
+ uint64_t *paddr, uint64_t *psize, unsigned int flags)
+{
+ uint32_t hi = 0, bar = pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev,
+ sbdf.func, pos);
+ uint64_t addr, size;
+ bool vf = flags & PCI_BAR_VF;
+
+ ASSERT((bar & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_MEMORY);
+ pci_conf_write32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos, ~0);
+ if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+ PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ {
+ if ( last )
+ {
+ printk(XENLOG_WARNING
+ "%sdevice %04x:%02x:%02x.%u with 64-bit %sBAR in last slot\n",
+ vf ? "SR-IOV " : "", sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func,
+ vf ? "vf " : "");
+ return -EINVAL;
+ }
+ hi = pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos + 4);
+ pci_conf_write32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos + 4, ~0);
+ }
+ size = pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos) &
+ PCI_BASE_ADDRESS_MEM_MASK;
+ if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+ PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ {
+ size |= (uint64_t)pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev,
+ sbdf.func, pos + 4) << 32;
+ pci_conf_write32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos + 4, hi);
+ }
+ else if ( size )
+ size |= (uint64_t)~0 << 32;
+ pci_conf_write32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos, bar);
+ size = -size;
+ addr = (bar & PCI_BASE_ADDRESS_MEM_MASK) | ((uint64_t)hi << 32);
+
+ if ( paddr )
+ *paddr = addr;
+ *psize = size;
+
+ if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+ PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ return 2;
+
+ return 1;
+}
+
int pci_add_device(u16 seg, u8 bus, u8 devfn,
const struct pci_dev_info *info, nodeid_t node)
{
@@ -674,11 +724,16 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
unsigned int i;

BUILD_BUG_ON(ARRAY_SIZE(pdev->vf_rlen) != PCI_SRIOV_NUM_BARS);
- for ( i = 0; i < PCI_SRIOV_NUM_BARS; ++i )
+ for ( i = 0; i < PCI_SRIOV_NUM_BARS; )
{
unsigned int idx = pos + PCI_SRIOV_BAR + i * 4;
u32 bar = pci_conf_read32(seg, bus, slot, func, idx);
- u32 hi = 0;
+ pci_sbdf_t sbdf = {
+ .seg = seg,
+ .bus = bus,
+ .dev = slot,
+ .func = func,
+ };

if ( (bar & PCI_BASE_ADDRESS_SPACE) ==
PCI_BASE_ADDRESS_SPACE_IO )
@@ -689,38 +744,13 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
seg, bus, slot, func, i);
continue;
}
- pci_conf_write32(seg, bus, slot, func, idx, ~0);
- if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
- PCI_BASE_ADDRESS_MEM_TYPE_64 )
- {
- if ( i >= PCI_SRIOV_NUM_BARS )
- {
- printk(XENLOG_WARNING
- "SR-IOV device %04x:%02x:%02x.%u with 64-bit"
- " vf BAR in last slot\n",
- seg, bus, slot, func);
- break;
- }
- hi = pci_conf_read32(seg, bus, slot, func, idx + 4);
- pci_conf_write32(seg, bus, slot, func, idx + 4, ~0);
- }
- pdev->vf_rlen[i] = pci_conf_read32(seg, bus, slot, func, idx) &
- PCI_BASE_ADDRESS_MEM_MASK;
- if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
- PCI_BASE_ADDRESS_MEM_TYPE_64 )
- {
- pdev->vf_rlen[i] |= (u64)pci_conf_read32(seg, bus,
- slot, func,
- idx + 4) << 32;
- pci_conf_write32(seg, bus, slot, func, idx + 4, hi);
- }
- else if ( pdev->vf_rlen[i] )
- pdev->vf_rlen[i] |= (u64)~0 << 32;
- pci_conf_write32(seg, bus, slot, func, idx, bar);
- pdev->vf_rlen[i] = -pdev->vf_rlen[i];
- if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
- PCI_BASE_ADDRESS_MEM_TYPE_64 )
- ++i;
+ ret = pci_size_mem_bar(sbdf, idx, i == PCI_SRIOV_NUM_BARS - 1,
+ NULL, &pdev->vf_rlen[i], PCI_BAR_VF);
+ if ( ret < 0 )
+ break;
+
+ ASSERT(ret);
+ i += ret;
}
}
else
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index b7a6abfc53..2bee6a3247 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -189,6 +189,10 @@ const char *parse_pci(const char *, unsigned int *seg, unsigned int *bus,
const char *parse_pci_seg(const char *, unsigned int *seg, unsigned int *bus,
unsigned int *dev, unsigned int *func, bool *def_seg);

+#define _PCI_BAR_VF 0
+#define PCI_BAR_VF (1u << _PCI_BAR_VF)
+int pci_size_mem_bar(pci_sbdf_t sbdf, unsigned int pos, bool last,
+ uint64_t *addr, uint64_t *size, unsigned int flags);

bool_t pcie_aer_get_firmware_first(const struct pci_dev *);
--
2.11.0 (Apple Git-81)
Jan Beulich
2017-10-04 08:32:06 UTC
Permalink
Post by Roger Pau Monne
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -603,6 +603,56 @@ static int iommu_add_device(struct pci_dev *pdev);
static int iommu_enable_device(struct pci_dev *pdev);
static int iommu_remove_device(struct pci_dev *pdev);
+int pci_size_mem_bar(pci_sbdf_t sbdf, unsigned int pos, bool last,
+ uint64_t *paddr, uint64_t *psize, unsigned int flags)
+{
+ uint32_t hi = 0, bar = pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev,
+ sbdf.func, pos);
+ uint64_t addr, size;
+ bool vf = flags & PCI_BAR_VF;
Honestly I'm not convinced of the utility of this variable; same for the
"rom" one in the next patch.
Post by Roger Pau Monne
+ ASSERT((bar & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_MEMORY);
+ pci_conf_write32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos, ~0);
+ if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+ PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ {
+ if ( last )
+ {
+ printk(XENLOG_WARNING
+ "%sdevice %04x:%02x:%02x.%u with 64-bit %sBAR in last slot\n",
+ vf ? "SR-IOV " : "", sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func,
+ vf ? "vf " : "");
+ return -EINVAL;
+ }
+ hi = pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos + 4);
+ pci_conf_write32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos + 4, ~0);
+ }
+ size = pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos) &
+ PCI_BASE_ADDRESS_MEM_MASK;
+ if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+ PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ {
+ size |= (uint64_t)pci_conf_read32(sbdf.seg, sbdf.bus, sbdf.dev,
+ sbdf.func, pos + 4) << 32;
+ pci_conf_write32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos + 4, hi);
+ }
+ else if ( size )
+ size |= (uint64_t)~0 << 32;
+ pci_conf_write32(sbdf.seg, sbdf.bus, sbdf.dev, sbdf.func, pos, bar);
+ size = -size;
+ addr = (bar & PCI_BASE_ADDRESS_MEM_MASK) | ((uint64_t)hi << 32);
+
+ if ( paddr )
+ *paddr = addr;
You need addr only inside the if() - no need for the local variable,
and no need to calculate it unconditionally.
Post by Roger Pau Monne
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -189,6 +189,10 @@ const char *parse_pci(const char *, unsigned int *seg, unsigned int *bus,
const char *parse_pci_seg(const char *, unsigned int *seg, unsigned int *bus,
unsigned int *dev, unsigned int *func, bool *def_seg);
+#define _PCI_BAR_VF 0
+#define PCI_BAR_VF (1u << _PCI_BAR_VF)
Do you really need both? I know we have quite a few cases where flags
are being defined this way, but that's usually when bit operations
(test_bit() and alike) are intended on the flags fields.

Jan
Roger Pau Monné
2017-10-04 13:31:56 UTC
Permalink
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -189,6 +189,10 @@ const char *parse_pci(const char *, unsigned int *seg, unsigned int *bus,
const char *parse_pci_seg(const char *, unsigned int *seg, unsigned int *bus,
unsigned int *dev, unsigned int *func, bool *def_seg);
+#define _PCI_BAR_VF 0
+#define PCI_BAR_VF (1u << _PCI_BAR_VF)
Do you really need both? I know we have quite a few cases where flags
are being defined this way, but that's usually when bit operations
(test_bit() and alike) are intended on the flags fields.
Ack, would you then rather prefer to have 1, or (1u << 0)? (to keep it
in line with the other flag that will be added later).

Thanks, Roger.
Jan Beulich
2017-10-04 16:00:24 UTC
Permalink
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -189,6 +189,10 @@ const char *parse_pci(const char *, unsigned int *seg, unsigned int *bus,
const char *parse_pci_seg(const char *, unsigned int *seg, unsigned int *bus,
unsigned int *dev, unsigned int *func, bool *def_seg);
+#define _PCI_BAR_VF 0
+#define PCI_BAR_VF (1u << _PCI_BAR_VF)
Do you really need both? I know we have quite a few cases where flags
are being defined this way, but that's usually when bit operations
(test_bit() and alike) are intended on the flags fields.
Ack, would you then rather prefer to have 1, or (1u << 0)? (to keep it
in line with the other flag that will be added later).
1u please, as that's going to be mandatory once someone adds
a definition for the 32nd flag bit. The other option would be to
use a plain hex constant without involving the shift operator.

Jan
Roger Pau Monne
2017-09-19 15:29:36 UTC
Permalink
Add handlers for accesses to the MSI-X message control field on the
PCI configuration space, and traps for accesses to the memory region
that contains the MSI-X table and PBA. These traps detect attempts from
the guest to configure MSI-X interrupts and properly set them up.

Note that accesses to the Table Offset, Table BIR, PBA Offset and PBA
BIR are not trapped by Xen at the moment.

Finally, turn the panic in the Dom0 PVH builder into a warning.

Signed-off-by: Roger Pau Monné <***@citrix.com>
---
Cc: Jan Beulich <***@suse.com>
Cc: Andrew Cooper <***@citrix.com>
---
Changes since v5:
- Update lock usage.
- Unbind/unmap PIRQs when MSIX is disabled.
- Share the arch-specific MSIX code with the MSI functions.
- Do not reference the MSIX memory areas from the PCI BARs fields,
instead fetch the BIR and offset each time needed.
- Add the '_entry' suffix to the MSIX arch functions.
- Prefix the vMSIX macros with 'V'.
- s/gdprintk/gprintk/ in msix.c
- Make vpci_msix_access_check return bool, and change its name to
vpci_msix_access_allowed.
- Join the first two ifs in vpci_msix_{read/write} into a single one.
- Allow Dom0 to write to the PBA area.
- Add a note that reads from the PBA area will need to be translated
if the PBA is not identity mapped.

Changes since v4:
- Remove parentheses around offsetof.
- Add "being" to MSI-X enabling comment.
- Use INVALID_PIRQ.
- Add a simple sanity check to vpci_msix_arch_enable in order to
detect wrong MSI-X entries more quickly.
- Constify vpci_msix_arch_print entry argument.
- s/cpu/fixed/ in vpci_msix_arch_print.
- Dump the MSI-X info together with the MSI info.
- Fix vpci_msix_control_write to take into account changes to the
address and data fields when switching the function mask bit.
- Only disable/enable the entries if the address or data fields have
been updated.
- Use the BAR enable field to check if a BAR is mapped or not
(instead of reading the command register for each device).
- Fix error path in vpci_msix_read to set the return data to ~0.
- Simplify mask usage in vpci_msix_write.
- Cast data to uint64_t when shifting it 32 bits.
- Fix writes to the table entry control register to take into account
if the mask-all bit is set.
- Add some comments to clarify the intended behavior of the code.
- Align the PBA size to 64-bits.
- Remove the error label in vpci_init_msix.
- Try to compact the layout of the vpci_msix structure.
- Remove the local table_bar and pba_bar variables from
vpci_init_msix, they are used only once.

Changes since v3:
- Propagate changes from previous versions: remove xen_ prefix, use
the new fields in vpci_val and remove the return value from
handlers.
- Remove the usage of GENMASK.
- Move the arch-specific parts of the dump routine to the
x86/hvm/vmsi.c dump handler.
- Chain the MSI-X dump handler to the 'M' debug key.
- Fix the header BAR mappings so that the MSI-X regions inside of
BARs are unmapped from the domain p2m in order for the handlers to
work properly.
- Unconditionally trap and forward accesses to the PBA MSI-X area.
- Simplify the conditionals in vpci_msix_control_write.
- Fix vpci_msix_accept to use a bool type.
- Allow all supported accesses as described in the spec to the MSI-X
table.
- Truncate the returned address when the access is a 32b read.
- Always return X86EMUL_OKAY from the handlers, returning ~0 in the
read case if the access is not supported, or ignoring writes.
- Do not check that max_entries is != 0 in the init handler.
- Use trylock in the dump handler.

Changes since v2:
- Split out arch-specific code.

This patch has been tested with devices using both a single MSI-X
entry and multiple ones.
---
xen/arch/x86/hvm/dom0_build.c | 2 +-
xen/arch/x86/hvm/hvm.c | 1 +
xen/arch/x86/hvm/vmsi.c | 133 ++++++++--
xen/drivers/vpci/Makefile | 2 +-
xen/drivers/vpci/header.c | 16 ++
xen/drivers/vpci/msi.c | 22 +-
xen/drivers/vpci/msix.c | 506 +++++++++++++++++++++++++++++++++++++++
xen/include/asm-x86/hvm/domain.h | 3 +
xen/include/asm-x86/hvm/io.h | 5 +
xen/include/xen/vpci.h | 45 ++++
10 files changed, 705 insertions(+), 30 deletions(-)
create mode 100644 xen/drivers/vpci/msix.c

diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
index 17d77137d6..8fa92bc5b6 100644
--- a/xen/arch/x86/hvm/dom0_build.c
+++ b/xen/arch/x86/hvm/dom0_build.c
@@ -1111,7 +1111,7 @@ int __init dom0_construct_pvh(struct domain *d, const module_t *image,

pvh_setup_mmcfg(d);

- panic("Building a PVHv2 Dom0 is not yet supported.");
+ printk("WARNING: PVH is an experimental mode with limited functionality\n");
return 0;
}

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index b1064413fc..042b7c6a31 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -585,6 +585,7 @@ int hvm_domain_initialise(struct domain *d, unsigned long domcr_flags,
INIT_LIST_HEAD(&d->arch.hvm_domain.write_map.list);
INIT_LIST_HEAD(&d->arch.hvm_domain.g2m_ioport_list);
INIT_LIST_HEAD(&d->arch.hvm_domain.mmcfg_regions);
+ INIT_LIST_HEAD(&d->arch.hvm_domain.msix_tables);

rc = create_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0, NULL, NULL);
if ( rc )
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 3dcde3d882..a335e75f8b 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -642,16 +642,15 @@ static unsigned int msi_gflags(uint16_t data, uint64_t addr)
XEN_DOMCTL_VMSI_X86_TRIG_MASK);
}

-void vpci_msi_arch_mask(struct vpci_msi *msi, const struct pci_dev *pdev,
- unsigned int entry, bool mask)
+static void vpci_mask_pirq(struct domain *d, int pirq, bool mask)
{
const struct pirq *pinfo;
struct irq_desc *desc;
unsigned long flags;
int irq;

- ASSERT(msi->arch.pirq >= 0 && entry < msi->vectors);
- pinfo = pirq_info(pdev->domain, msi->arch.pirq + entry);
+ ASSERT(pirq >= 0);
+ pinfo = pirq_info(d, pirq);
if ( !pinfo )
return;

@@ -668,23 +667,31 @@ void vpci_msi_arch_mask(struct vpci_msi *msi, const struct pci_dev *pdev,
spin_unlock_irqrestore(&desc->lock, flags);
}

-int vpci_msi_arch_enable(struct vpci_msi *msi, const struct pci_dev *pdev,
- unsigned int vectors)
+void vpci_msi_arch_mask(struct vpci_msi *msi, const struct pci_dev *pdev,
+ unsigned int entry, bool mask)
+{
+ vpci_mask_pirq(pdev->domain, msi->arch.pirq + entry, mask);
+}
+
+static int vpci_msi_enable(const struct pci_dev *pdev, uint32_t data,
+ uint64_t address, unsigned int nr,
+ paddr_t table_base)
{
struct msi_info msi_info = {
.seg = pdev->seg,
.bus = pdev->bus,
.devfn = pdev->devfn,
- .entry_nr = vectors,
+ .table_base = table_base,
+ .entry_nr = nr,
};
- unsigned int i;
- int rc;
-
- ASSERT(msi->arch.pirq == INVALID_PIRQ);
+ unsigned int i, vectors = table_base ? 1 : nr;
+ int rc, pirq = INVALID_PIRQ;

/* Get a PIRQ. */
- rc = allocate_and_map_msi_pirq(pdev->domain, -1, &msi->arch.pirq,
- MAP_PIRQ_TYPE_MULTI_MSI, &msi_info);
+ rc = allocate_and_map_msi_pirq(pdev->domain, -1, &pirq,
+ table_base ? MAP_PIRQ_TYPE_MSI
+ : MAP_PIRQ_TYPE_MULTI_MSI,
+ &msi_info);
if ( rc )
{
gdprintk(XENLOG_ERR, "%04x:%02x:%02x.%u: failed to map PIRQ: %d\n",
@@ -695,14 +702,14 @@ int vpci_msi_arch_enable(struct vpci_msi *msi, const struct pci_dev *pdev,

for ( i = 0; i < vectors; i++ )
{
- uint8_t vector = MASK_EXTR(msi->data, MSI_DATA_VECTOR_MASK);
- uint8_t vector_mask = 0xff >> (8 - fls(msi->vectors) + 1);
+ uint8_t vector = MASK_EXTR(data, MSI_DATA_VECTOR_MASK);
+ uint8_t vector_mask = 0xff >> (8 - fls(vectors) + 1);
xen_domctl_bind_pt_irq_t bind = {
- .machine_irq = msi->arch.pirq + i,
+ .machine_irq = pirq + i,
.irq_type = PT_IRQ_TYPE_MSI,
.u.msi.gvec = (vector & ~vector_mask) |
((vector + i) & vector_mask),
- .u.msi.gflags = msi_gflags(msi->data, msi->address),
+ .u.msi.gflags = msi_gflags(data, address),
};

pcidevs_lock();
@@ -712,33 +719,48 @@ int vpci_msi_arch_enable(struct vpci_msi *msi, const struct pci_dev *pdev,
gdprintk(XENLOG_ERR,
"%04x:%02x:%02x.%u: failed to bind PIRQ %u: %d\n",
pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
- PCI_FUNC(pdev->devfn), msi->arch.pirq + i, rc);
+ PCI_FUNC(pdev->devfn), pirq + i, rc);
while ( bind.machine_irq-- )
pt_irq_destroy_bind(pdev->domain, &bind);
spin_lock(&pdev->domain->event_lock);
- unmap_domain_pirq(pdev->domain, msi->arch.pirq);
+ unmap_domain_pirq(pdev->domain, pirq);
spin_unlock(&pdev->domain->event_lock);
pcidevs_unlock();
- msi->arch.pirq = INVALID_PIRQ;
return rc;
}
pcidevs_unlock();
}

- return 0;
+ return pirq;
}

-int vpci_msi_arch_disable(struct vpci_msi *msi, const struct pci_dev *pdev)
+int vpci_msi_arch_enable(struct vpci_msi *msi, const struct pci_dev *pdev,
+ unsigned int vectors)
+{
+ int rc;
+
+ ASSERT(msi->arch.pirq == INVALID_PIRQ);
+ rc = vpci_msi_enable(pdev, msi->data, msi->address, vectors, 0);
+ if ( rc >= 0 )
+ {
+ msi->arch.pirq = rc;
+ rc = 0;
+ }
+
+ return rc;
+}
+
+void vpci_msi_disable(const struct pci_dev *pdev, int pirq, unsigned int nr)
{
unsigned int i;

- ASSERT(msi->arch.pirq != INVALID_PIRQ);
+ ASSERT(pirq != INVALID_PIRQ);

pcidevs_lock();
- for ( i = 0; i < msi->vectors; i++ )
+ for ( i = 0; i < nr; i++ )
{
xen_domctl_bind_pt_irq_t bind = {
- .machine_irq = msi->arch.pirq + i,
+ .machine_irq = pirq + i,
.irq_type = PT_IRQ_TYPE_MSI,
};
int rc;
@@ -748,10 +770,14 @@ int vpci_msi_arch_disable(struct vpci_msi *msi, const struct pci_dev *pdev)
}

spin_lock(&pdev->domain->event_lock);
- unmap_domain_pirq(pdev->domain, msi->arch.pirq);
+ unmap_domain_pirq(pdev->domain, pirq);
spin_unlock(&pdev->domain->event_lock);
pcidevs_unlock();
+}

+int vpci_msi_arch_disable(struct vpci_msi *msi, const struct pci_dev *pdev)
+{
+ vpci_msi_disable(pdev, msi->arch.pirq, msi->vectors);
msi->arch.pirq = INVALID_PIRQ;

return 0;
@@ -774,3 +800,58 @@ void vpci_msi_arch_print(const struct vpci_msi *msi)
MASK_EXTR(msi->address, MSI_ADDR_DEST_ID_MASK),
msi->arch.pirq);
}
+
+void vpci_msix_arch_mask_entry(struct vpci_msix_entry *entry,
+ const struct pci_dev *pdev, bool mask)
+{
+ ASSERT(entry->arch.pirq != INVALID_PIRQ);
+ vpci_mask_pirq(pdev->domain, entry->arch.pirq, mask);
+}
+
+int vpci_msix_arch_enable_entry(struct vpci_msix_entry *entry,
+ const struct pci_dev *pdev, paddr_t table_base)
+{
+ int rc;
+
+ ASSERT(entry->arch.pirq == INVALID_PIRQ);
+ rc = vpci_msi_enable(pdev, entry->data, entry->addr, entry->nr,
+ table_base);
+ if ( rc >= 0 )
+ {
+ entry->arch.pirq = rc;
+ rc = 0;
+ }
+
+ return rc;
+}
+
+int vpci_msix_arch_disable_entry(struct vpci_msix_entry *entry,
+ const struct pci_dev *pdev)
+{
+ if ( entry->arch.pirq == INVALID_PIRQ )
+ return -ENOENT;
+
+ vpci_msi_disable(pdev, entry->arch.pirq, 1);
+ entry->arch.pirq = INVALID_PIRQ;
+
+ return 0;
+}
+
+int vpci_msix_arch_init_entry(struct vpci_msix_entry *entry)
+{
+ entry->arch.pirq = INVALID_PIRQ;
+ return 0;
+}
+
+void vpci_msix_arch_print_entry(const struct vpci_msix_entry *entry)
+{
+ printk("%4u vec=%#02x%7s%6s%3sassert%5s%7s dest_id=%lu mask=%u pirq: %d\n",
+ entry->nr, MASK_EXTR(entry->data, MSI_DATA_VECTOR_MASK),
+ entry->data & MSI_DATA_DELIVERY_LOWPRI ? "lowest" : "fixed",
+ entry->data & MSI_DATA_TRIGGER_LEVEL ? "level" : "edge",
+ entry->data & MSI_DATA_LEVEL_ASSERT ? "" : "de",
+ entry->addr & MSI_ADDR_DESTMODE_LOGIC ? "log" : "phys",
+ entry->addr & MSI_ADDR_REDIRECTION_LOWPRI ? "lowest" : "fixed",
+ MASK_EXTR(entry->addr, MSI_ADDR_DEST_ID_MASK),
+ entry->masked, entry->arch.pirq);
+}
diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
index 62cec9e82b..55d1bdfda0 100644
--- a/xen/drivers/vpci/Makefile
+++ b/xen/drivers/vpci/Makefile
@@ -1 +1 @@
-obj-y += vpci.o header.o msi.o
+obj-y += vpci.o header.o msi.o msix.o
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 07a6bbf0be..02b9776ea9 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -152,6 +152,7 @@ static int vpci_check_bar_overlap(const struct pci_dev *pdev,
static void vpci_modify_bars(const struct pci_dev *pdev, bool map)
{
struct vpci_header *header = &pdev->vpci->header;
+ struct vpci_msix *msix = pdev->vpci->msix;
struct rangeset *mem = rangeset_new(NULL, NULL, 0);
unsigned int i;
int rc;
@@ -186,6 +187,21 @@ static void vpci_modify_bars(const struct pci_dev *pdev, bool map)
}
}

+ /* Remove any MSIX regions if present. */
+ for ( i = 0; msix && i < ARRAY_SIZE(msix->mem); i++ )
+ {
+ paddr_t start =
+ header->bars[msix->mem[i].bir].addr + msix->mem[i].offset;
+
+ rc = rangeset_remove_range(mem, PFN_DOWN(start),
+ PFN_DOWN(start + msix->mem[i].size - 1));
+ if ( rc )
+ {
+ rangeset_destroy(mem);
+ return;
+ }
+ }
+
/* Check for overlaps with other device's BARs. */
rc = vpci_check_bar_overlap(pdev, NULL, mem);
if ( rc )
diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
index 7a0b0521c5..5c10a0d9c9 100644
--- a/xen/drivers/vpci/msi.c
+++ b/xen/drivers/vpci/msi.c
@@ -320,13 +320,17 @@ void vpci_dump_msi(void)
if ( !has_vpci(d) )
continue;

- printk("vPCI MSI information for d%d\n", d->domain_id);
+ printk("vPCI MSI/MSI-X information for d%d\n", d->domain_id);

list_for_each_entry ( pdev, &d->arch.pdev_list, domain_list )
{
uint8_t seg = pdev->seg, bus = pdev->bus;
uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
const struct vpci_msi *msi = pdev->vpci->msi;
+ const struct vpci_msix *msix = pdev->vpci->msix;
+
+ if ( msi || msix )
+ printk("Device %04x:%02x:%02x.%u\n", seg, bus, slot, func);

if ( !spin_trylock(&pdev->vpci->lock) )
{
@@ -336,7 +340,7 @@ void vpci_dump_msi(void)

if ( msi )
{
- printk("Device %04x:%02x:%02x.%u\n", seg, bus, slot, func);
+ printk(" MSI\n");

printk(" Enabled: %u Supports masking: %u 64-bit addresses: %u\n",
msi->enabled, msi->masking, msi->address64);
@@ -349,6 +353,20 @@ void vpci_dump_msi(void)
printk(" mask=%08x\n", msi->mask);
}

+ if ( msix )
+ {
+ unsigned int i;
+
+ printk(" MSI-X\n");
+
+ printk(" Max entries: %u maskall: %u enabled: %u\n",
+ msix->max_entries, msix->masked, msix->enabled);
+
+ printk(" Table entries:\n");
+ for ( i = 0; i < msix->max_entries; i++ )
+ vpci_msix_arch_print_entry(&msix->entries[i]);
+ }
+
spin_unlock(&pdev->vpci->lock);
process_pending_softirqs();
}
diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
new file mode 100644
index 0000000000..ad4684c357
--- /dev/null
+++ b/xen/drivers/vpci/msix.c
@@ -0,0 +1,506 @@
+/*
+ * Handlers for accesses to the MSI-X capability structure and the memory
+ * region.
+ *
+ * Copyright (C) 2017 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/sched.h>
+#include <xen/vpci.h>
+#include <asm/msi.h>
+#include <xen/p2m-common.h>
+#include <xen/keyhandler.h>
+
+#define VMSIX_SIZE(num) offsetof(struct vpci_msix, entries[num])
+#define VMSIX_ADDR_IN_RANGE(addr, table, bar) \
+ ((addr) >= (bar)->addr + (table)->offset && \
+ (addr) < (bar)->addr + (table)->offset + (table)->size)
+
+static uint32_t vpci_msix_control_read(const struct pci_dev *pdev,
+ unsigned int reg, void *data)
+{
+ const struct vpci_msix *msix = data;
+ uint16_t val;
+
+ val = msix->max_entries - 1;
+ val |= msix->enabled ? PCI_MSIX_FLAGS_ENABLE : 0;
+ val |= msix->masked ? PCI_MSIX_FLAGS_MASKALL : 0;
+
+ return val;
+}
+
+static void vpci_msix_control_write(const struct pci_dev *pdev,
+ unsigned int reg, uint32_t val, void *data)
+{
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ struct vpci_msix *msix = data;
+ bool new_masked, new_enabled;
+ unsigned int i;
+ int rc;
+
+ new_masked = val & PCI_MSIX_FLAGS_MASKALL;
+ new_enabled = val & PCI_MSIX_FLAGS_ENABLE;
+
+ /*
+ * According to the PCI 3.0 specification, switching the enable bit
+ * to 1 or the function mask bit to 0 should cause all the cached
+ * addresses and data fields to be recalculated. Xen implements this
+ * as disabling and enabling the entries.
+ *
+ * Note that the disable/enable sequence is only performed when the
+ * guest has written to the entry (ie: updated field set) or MSIX is
+ * enabled.
+ */
+ if ( new_enabled && !new_masked && (!msix->enabled || msix->masked) )
+ {
+ paddr_t table_base =
+ pdev->vpci->header.bars[msix->mem[VPCI_MSIX_TABLE].bir].addr;
+
+ for ( i = 0; i < msix->max_entries; i++ )
+ {
+ if ( msix->entries[i].masked ||
+ (new_enabled && msix->enabled && !msix->entries[i].updated) )
+ continue;
+
+ rc = vpci_msix_arch_disable_entry(&msix->entries[i], pdev);
+ if ( rc )
+ {
+ gprintk(XENLOG_WARNING,
+ "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
+ seg, bus, slot, func, msix->entries[i].nr, rc);
+ return;
+ }
+
+ rc = vpci_msix_arch_enable_entry(&msix->entries[i], pdev,
+ table_base);
+ if ( rc )
+ {
+ gprintk(XENLOG_WARNING,
+ "%04x:%02x:%02x.%u: unable to enable entry %u: %d\n",
+ seg, bus, slot, func, msix->entries[i].nr, rc);
+ /* Entry is likely not properly configured, skip it. */
+ continue;
+ }
+
+ /*
+ * At this point the PIRQ is still masked. Unmask it, or else the
+ * guest won't receive interrupts. This is due to the
+ * disable/enable sequence performed above.
+ */
+ vpci_msix_arch_mask_entry(&msix->entries[i], pdev, false);
+
+ msix->entries[i].updated = false;
+ }
+ }
+ else if ( !new_enabled && msix->enabled )
+ {
+ /* Guest has disabled MSIX, disable all entries. */
+ for ( i = 0; i < msix->max_entries; i++ )
+ {
+ /*
+ * NB: vpci_msix_arch_disable can be called for entries that are
+ * not setup, it will return -ENOENT in that case.
+ */
+ rc = vpci_msix_arch_disable_entry(&msix->entries[i], pdev);
+ switch ( rc )
+ {
+ case 0:
+ /*
+ * Mark the entry successfully disabled as updated, so that on
+ * the next enable the entry is properly setup. This is done
+ * so that the following flow works correctly:
+ *
+ * mask entry -> disable MSIX -> enable MSIX -> unmask entry
+ *
+ * Without setting 'updated', the 'unmask entry' step will fail
+ * because the entry has not been updated, so it would not be
+ * mapped/bound at all.
+ */
+ msix->entries[i].updated = true;
+ break;
+ case -ENOENT:
+ /* Ignore non-present entry. */
+ break;
+ default:
+ gprintk(XENLOG_WARNING,
+ "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
+ seg, bus, slot, func, msix->entries[i].nr, rc);
+ return;
+ }
+ }
+ }
+
+ if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
+ pci_msi_conf_write_intercept(msix->pdev, reg, 2, &val) >= 0 )
+ pci_conf_write16(seg, bus, slot, func, reg, val);
+
+ msix->masked = new_masked;
+ msix->enabled = new_enabled;
+}
+
+static struct vpci_msix *vpci_msix_find(const struct domain *d,
+ unsigned long addr)
+{
+ struct vpci_msix *msix;
+
+ list_for_each_entry ( msix, &d->arch.hvm_domain.msix_tables, next )
+ {
+ const struct vpci_bar *bars = msix->pdev->vpci->header.bars;
+ unsigned int i;
+
+ for ( i = 0; i < ARRAY_SIZE(msix->mem); i++ )
+ if ( bars[msix->mem[i].bir].enabled &&
+ VMSIX_ADDR_IN_RANGE(addr, &msix->mem[i],
+ &bars[msix->mem[i].bir]) )
+ return msix;
+ }
+
+ return NULL;
+}
+
+static int vpci_msix_accept(struct vcpu *v, unsigned long addr)
+{
+ return !!vpci_msix_find(v->domain, addr);
+}
+
+static bool vpci_msix_access_allowed(const struct pci_dev *pdev,
+ unsigned long addr, unsigned int len)
+{
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+
+ /* Only allow 32/64b accesses. */
+ if ( len != 4 && len != 8 )
+ {
+ gprintk(XENLOG_WARNING,
+ "%04x:%02x:%02x.%u: invalid MSI-X table access size: %u\n",
+ seg, bus, slot, func, len);
+ return false;
+ }
+
+ /* Only allow aligned accesses. */
+ if ( (addr & (len - 1)) != 0 )
+ {
+ gprintk(XENLOG_WARNING,
+ "%04x:%02x:%02x.%u: MSI-X only allows aligned accesses\n",
+ seg, bus, slot, func);
+ return false;
+ }
+
+ return true;
+}
+
+static struct vpci_msix_entry *vpci_msix_get_entry(struct vpci_msix *msix,
+ const struct vpci_bar *bars,
+ unsigned long addr)
+{
+ paddr_t start = bars[msix->mem[VPCI_MSIX_TABLE].bir].addr +
+ msix->mem[VPCI_MSIX_TABLE].offset;
+
+ return &msix->entries[(addr - start) / PCI_MSIX_ENTRY_SIZE];
+}
+
+static int vpci_msix_read(struct vcpu *v, unsigned long addr,
+ unsigned int len, unsigned long *data)
+{
+ struct domain *d = v->domain;
+ const struct vpci_bar *bars;
+ struct vpci_msix *msix;
+ const struct vpci_msix_entry *entry;
+ unsigned int offset;
+
+ *data = ~0ul;
+
+ msix = vpci_msix_find(d, addr);
+ if ( !msix || !vpci_msix_access_allowed(msix->pdev, addr, len) )
+ return X86EMUL_OKAY;
+
+ bars = msix->pdev->vpci->header.bars;
+ if ( VMSIX_ADDR_IN_RANGE(addr, &msix->mem[VPCI_MSIX_PBA],
+ &bars[msix->mem[VPCI_MSIX_PBA].bir]) )
+ {
+ /*
+ * Access to PBA.
+ *
+ * TODO: note that this relies on having the PBA identity mapped to the
+ * guest address space. If this changes the address will need to be
+ * translated.
+ */
+ switch ( len )
+ {
+ case 4:
+ *data = readl(addr);
+ break;
+ case 8:
+ *data = readq(addr);
+ break;
+ default:
+ ASSERT_UNREACHABLE();
+ break;
+ }
+
+ return X86EMUL_OKAY;
+ }
+
+ spin_lock(&msix->pdev->vpci->lock);
+ entry = vpci_msix_get_entry(msix, bars, addr);
+ offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
+
+ switch ( offset )
+ {
+ case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
+ *data = entry->addr;
+ break;
+ case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
+ *data = entry->addr >> 32;
+ break;
+ case PCI_MSIX_ENTRY_DATA_OFFSET:
+ *data = entry->data;
+ if ( len == 8 )
+ *data |=
+ (uint64_t)(entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0) << 32;
+ break;
+ case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
+ *data = entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0;
+ break;
+ default:
+ ASSERT_UNREACHABLE();
+ break;
+ }
+ spin_unlock(&msix->pdev->vpci->lock);
+
+ return X86EMUL_OKAY;
+}
+
+static int vpci_msix_write(struct vcpu *v, unsigned long addr,
+ unsigned int len, unsigned long data)
+{
+ struct domain *d = v->domain;
+ const struct vpci_bar *bars;
+ struct vpci_msix *msix;
+ struct vpci_msix_entry *entry;
+ unsigned int offset;
+
+ msix = vpci_msix_find(d, addr);
+ if ( !msix || !vpci_msix_access_allowed(msix->pdev, addr, len) )
+ return X86EMUL_OKAY;
+
+ bars = msix->pdev->vpci->header.bars;
+ if ( VMSIX_ADDR_IN_RANGE(addr, &msix->mem[VPCI_MSIX_PBA],
+ &bars[msix->mem[VPCI_MSIX_PBA].bir]) )
+ {
+ /* Ignore writes to PBA for DomUs, it's behavior is undefined. */
+ if ( is_hardware_domain(d) )
+ {
+ switch ( len )
+ {
+ case 4:
+ writel(data, addr);
+ break;
+ case 8:
+ writeq(data, addr);
+ break;
+ default:
+ ASSERT_UNREACHABLE();
+ break;
+ }
+ }
+
+ return X86EMUL_OKAY;
+ }
+
+ spin_lock(&msix->pdev->vpci->lock);
+ entry = vpci_msix_get_entry(msix, bars, addr);
+ offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
+
+ /*
+ * NB: Xen allows writes to the data/address registers with the entry
+ * unmasked. The specification says this is undefined behavior, and Xen
+ * implements it as storing the written value, which will be made effective
+ * in the next mask/unmask cycle. This also mimics the implementation in
+ * QEMU.
+ */
+ switch ( offset )
+ {
+ case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
+ entry->updated = true;
+ if ( len == 8 )
+ {
+ entry->addr = data;
+ break;
+ }
+ entry->addr &= ~0xffffffff;
+ entry->addr |= data;
+ break;
+ case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
+ entry->updated = true;
+ entry->addr &= 0xffffffff;
+ entry->addr |= (uint64_t)data << 32;
+ break;
+ case PCI_MSIX_ENTRY_DATA_OFFSET:
+ entry->updated = true;
+ entry->data = data;
+
+ if ( len == 4 )
+ break;
+
+ data >>= 32;
+ /* fallthrough */
+ case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
+ {
+ bool new_masked = data & PCI_MSIX_VECTOR_BITMASK;
+ const struct pci_dev *pdev = msix->pdev;
+ paddr_t table_base = bars[msix->mem[VPCI_MSIX_TABLE].bir].addr;
+ int rc;
+
+ if ( entry->masked == new_masked )
+ /* No change in the mask bit, nothing to do. */
+ break;
+
+ if ( !new_masked && msix->enabled && !msix->masked && entry->updated )
+ {
+ /*
+ * If MSI-X is enabled, the function mask is not active, the entry
+ * is being unmasked and there have been changes to the address or
+ * data fields Xen needs to disable and enable the entry in order
+ * to pick up the changes.
+ */
+ rc = vpci_msix_arch_disable_entry(entry, pdev);
+ if ( rc && rc != -ENOENT )
+ {
+ gprintk(XENLOG_WARNING,
+ "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
+ pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), entry->nr, rc);
+ break;
+ }
+
+ rc = vpci_msix_arch_enable_entry(entry, pdev, table_base);
+ if ( rc )
+ {
+ gprintk(XENLOG_WARNING,
+ "%04x:%02x:%02x.%u: unable to enable entry %u: %d\n",
+ pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), entry->nr, rc);
+ break;
+ }
+ entry->updated = false;
+ }
+
+ vpci_msix_arch_mask_entry(entry, pdev, new_masked);
+ entry->masked = new_masked;
+
+ break;
+ }
+ default:
+ ASSERT_UNREACHABLE();
+ break;
+ }
+ spin_unlock(&msix->pdev->vpci->lock);
+
+ return X86EMUL_OKAY;
+}
+
+static const struct hvm_mmio_ops vpci_msix_table_ops = {
+ .check = vpci_msix_accept,
+ .read = vpci_msix_read,
+ .write = vpci_msix_write,
+};
+
+static int vpci_init_msix(struct pci_dev *pdev)
+{
+ struct domain *d = pdev->domain;
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ struct vpci_msix *msix;
+ struct vpci_msix_mem *table, *pba;
+ unsigned int msix_offset, i, max_entries;
+ uint16_t control;
+ int rc;
+
+ msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
+ if ( !msix_offset )
+ return 0;
+
+ control = pci_conf_read16(seg, bus, slot, func,
+ msix_control_reg(msix_offset));
+
+ max_entries = msix_table_size(control);
+
+ msix = xzalloc_bytes(VMSIX_SIZE(max_entries));
+ if ( !msix )
+ return -ENOMEM;
+
+ msix->max_entries = max_entries;
+ msix->pdev = pdev;
+
+ /* Find the MSI-X table address. */
+ table = &msix->mem[VPCI_MSIX_TABLE];
+ table->offset = pci_conf_read32(seg, bus, slot, func,
+ msix_table_offset_reg(msix_offset));
+ table->bir = table->offset & PCI_MSIX_BIRMASK;
+ table->offset &= ~PCI_MSIX_BIRMASK;
+ table->size = msix->max_entries * PCI_MSIX_ENTRY_SIZE;
+
+ /* Find the MSI-X pba address. */
+ pba = &msix->mem[VPCI_MSIX_PBA];
+ pba->offset = pci_conf_read32(seg, bus, slot, func,
+ msix_pba_offset_reg(msix_offset));
+ pba->bir = pba->offset & PCI_MSIX_BIRMASK;
+ pba->offset &= ~PCI_MSIX_BIRMASK;
+ /*
+ * The spec mentions regarding to the PBA that "The last QWORD will not
+ * necessarily be fully populated", so it implies that the PBA size is
+ * 64-bit aligned.
+ */
+ pba->size = ROUNDUP(DIV_ROUND_UP(msix->max_entries, 8), 8);
+
+ for ( i = 0; i < msix->max_entries; i++)
+ {
+ msix->entries[i].masked = true;
+ msix->entries[i].nr = i;
+ vpci_msix_arch_init_entry(&msix->entries[i]);
+ }
+
+ if ( list_empty(&d->arch.hvm_domain.msix_tables) )
+ register_mmio_handler(d, &vpci_msix_table_ops);
+
+ list_add(&msix->next, &d->arch.hvm_domain.msix_tables);
+
+ rc = vpci_add_register(pdev, vpci_msix_control_read,
+ vpci_msix_control_write,
+ msix_control_reg(msix_offset), 2, msix);
+ if ( rc )
+ {
+ xfree(msix);
+ return rc;
+ }
+
+ pdev->vpci->msix = msix;
+
+ return 0;
+}
+REGISTER_VPCI_INIT(vpci_init_msix, VPCI_PRIORITY_HIGH);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index cd19ee11e9..5e3139e61c 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -188,6 +188,9 @@ struct hvm_domain {
struct list_head mmcfg_regions;
rwlock_t mmcfg_lock;

+ /* List of MSI-X tables. */
+ struct list_head msix_tables;
+
/* List of permanently write-mapped pages. */
struct {
spinlock_t lock;
diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h
index c47cc971d3..e7367071ce 100644
--- a/xen/include/asm-x86/hvm/io.h
+++ b/xen/include/asm-x86/hvm/io.h
@@ -132,6 +132,11 @@ struct vpci_arch_msi {
int pirq;
};

+/* Arch-specific MSI-X entry data for vPCI. */
+struct vpci_arch_msix_entry {
+ int pirq;
+};
+
enum stdvga_cache_state {
STDVGA_CACHE_UNINITIALIZED,
STDVGA_CACHE_ENABLED,
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index c6913631c0..9656b1855b 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -100,6 +100,40 @@ struct vpci {
/* 64-bit address capable? */
bool address64;
} *msi;
+
+ /* MSI-X data. */
+ struct vpci_msix {
+ struct pci_dev *pdev;
+ /* List link. */
+ struct list_head next;
+ /* Table information. */
+ struct vpci_msix_mem {
+ /* MSI-X table offset. */
+ unsigned int offset;
+ /* MSI-X table BIR. */
+ unsigned int bir;
+ /* Table size. */
+ unsigned int size;
+#define VPCI_MSIX_TABLE 0
+#define VPCI_MSIX_PBA 1
+#define VPCI_MSIX_MEM_NUM 2
+ } mem[VPCI_MSIX_MEM_NUM];
+ /* Maximum number of vectors supported by the device. */
+ unsigned int max_entries;
+ /* MSI-X enabled? */
+ bool enabled;
+ /* Masked? */
+ bool masked;
+ /* Entries. */
+ struct vpci_msix_entry {
+ uint64_t addr;
+ uint32_t data;
+ unsigned int nr;
+ struct vpci_arch_msix_entry arch;
+ bool masked;
+ bool updated;
+ } entries[];
+ } *msix;
#endif
};

@@ -119,6 +153,17 @@ int vpci_msi_arch_enable(struct vpci_msi *msi, const struct pci_dev *pdev,
int vpci_msi_arch_disable(struct vpci_msi *msi, const struct pci_dev *pdev);
void vpci_msi_arch_init(struct vpci_msi *msi);
void vpci_msi_arch_print(const struct vpci_msi *msi);
+
+/* Arch-specific vPCI MSI-X helpers. */
+void vpci_msix_arch_mask_entry(struct vpci_msix_entry *entry,
+ const struct pci_dev *pdev, bool mask);
+int vpci_msix_arch_enable_entry(struct vpci_msix_entry *entry,
+ const struct pci_dev *pdev,
+ paddr_t table_base);
+int vpci_msix_arch_disable_entry(struct vpci_msix_entry *entry,
+ const struct pci_dev *pdev);
+int vpci_msix_arch_init_entry(struct vpci_msix_entry *entry);
+void vpci_msix_arch_print_entry(const struct vpci_msix_entry *entry);
#endif

#endif
--
2.11.0 (Apple Git-81)
Jan Beulich
2017-10-04 08:34:43 UTC
Permalink
Post by Roger Pau Monne
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -152,6 +152,7 @@ static int vpci_check_bar_overlap(const struct pci_dev *pdev,
static void vpci_modify_bars(const struct pci_dev *pdev, bool map)
{
struct vpci_header *header = &pdev->vpci->header;
+ struct vpci_msix *msix = pdev->vpci->msix;
const and please fetch the value only right before you first need it.
Post by Roger Pau Monne
--- a/xen/drivers/vpci/msi.c
+++ b/xen/drivers/vpci/msi.c
@@ -320,13 +320,17 @@ void vpci_dump_msi(void)
if ( !has_vpci(d) )
continue;
- printk("vPCI MSI information for d%d\n", d->domain_id);
+ printk("vPCI MSI/MSI-X information for d%d\n", d->domain_id);
list_for_each_entry ( pdev, &d->arch.pdev_list, domain_list )
{
uint8_t seg = pdev->seg, bus = pdev->bus;
uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
const struct vpci_msi *msi = pdev->vpci->msi;
+ const struct vpci_msix *msix = pdev->vpci->msix;
+
+ if ( msi || msix )
+ printk("Device %04x:%02x:%02x.%u\n", seg, bus, slot, func);
if ( !spin_trylock(&pdev->vpci->lock) )
{
@@ -336,7 +340,7 @@ void vpci_dump_msi(void)
if ( msi )
{
- printk("Device %04x:%02x:%02x.%u\n", seg, bus, slot, func);
+ printk(" MSI\n");
printk(" Enabled: %u Supports masking: %u 64-bit addresses: %u\n",
msi->enabled, msi->masking, msi->address64);
@@ -349,6 +353,20 @@ void vpci_dump_msi(void)
printk(" mask=%08x\n", msi->mask);
}
+ if ( msix )
+ {
+ unsigned int i;
+
+ printk(" MSI-X\n");
+
+ printk(" Max entries: %u maskall: %u enabled: %u\n",
+ msix->max_entries, msix->masked, msix->enabled);
+
+ printk(" Table entries:\n");
+ for ( i = 0; i < msix->max_entries; i++ )
+ vpci_msix_arch_print_entry(&msix->entries[i]);
+ }
+
Again, please try to reduce the amount of overall output.
Post by Roger Pau Monne
+static void vpci_msix_control_write(const struct pci_dev *pdev,
+ unsigned int reg, uint32_t val, void *data)
+{
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ struct vpci_msix *msix = data;
+ bool new_masked, new_enabled;
+ unsigned int i;
+ int rc;
+
+ new_masked = val & PCI_MSIX_FLAGS_MASKALL;
+ new_enabled = val & PCI_MSIX_FLAGS_ENABLE;
+
+ /*
+ * According to the PCI 3.0 specification, switching the enable bit
+ * to 1 or the function mask bit to 0 should cause all the cached
+ * addresses and data fields to be recalculated. Xen implements this
+ * as disabling and enabling the entries.
+ *
+ * Note that the disable/enable sequence is only performed when the
+ * guest has written to the entry (ie: updated field set) or MSIX is
+ * enabled.
+ */
+ if ( new_enabled && !new_masked && (!msix->enabled || msix->masked) )
+ {
+ paddr_t table_base =
+ pdev->vpci->header.bars[msix->mem[VPCI_MSIX_TABLE].bir].addr;
+
+ for ( i = 0; i < msix->max_entries; i++ )
+ {
+ if ( msix->entries[i].masked ||
+ (new_enabled && msix->enabled && !msix->entries[i].updated) )
+ continue;
This doesn't look to match up with the earlier comment.
Post by Roger Pau Monne
+static int vpci_msix_read(struct vcpu *v, unsigned long addr,
+ unsigned int len, unsigned long *data)
+{
+ struct domain *d = v->domain;
const?
Post by Roger Pau Monne
+ const struct vpci_bar *bars;
+ struct vpci_msix *msix;
+ const struct vpci_msix_entry *entry;
+ unsigned int offset;
+
+ *data = ~0ul;
+
+ msix = vpci_msix_find(d, addr);
+ if ( !msix || !vpci_msix_access_allowed(msix->pdev, addr, len) )
+ return X86EMUL_OKAY;
In the !msix case I'm once again not convinced returning OKAY is correct
here.
Post by Roger Pau Monne
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -100,6 +100,40 @@ struct vpci {
/* 64-bit address capable? */
bool address64;
} *msi;
+
+ /* MSI-X data. */
+ struct vpci_msix {
+ struct pci_dev *pdev;
+ /* List link. */
+ struct list_head next;
+ /* Table information. */
+ struct vpci_msix_mem {
+ /* MSI-X table offset. */
+ unsigned int offset;
+ /* MSI-X table BIR. */
+ unsigned int bir;
+ /* Table size. */
+ unsigned int size;
+#define VPCI_MSIX_TABLE 0
+#define VPCI_MSIX_PBA 1
+#define VPCI_MSIX_MEM_NUM 2
+ } mem[VPCI_MSIX_MEM_NUM];
+ /* Maximum number of vectors supported by the device. */
+ unsigned int max_entries;
+ /* MSI-X enabled? */
+ bool enabled;
+ /* Masked? */
+ bool masked;
+ /* Entries. */
+ struct vpci_msix_entry {
+ uint64_t addr;
+ uint32_t data;
+ unsigned int nr;
+ struct vpci_arch_msix_entry arch;
+ bool masked;
+ bool updated;
+ } entries[];
+ } *msix;
Same remark as for MSI regarding optimizing structure size.

Jan
Roger Pau Monné
2017-10-10 15:04:47 UTC
Permalink
Post by Jan Beulich
Post by Roger Pau Monne
+ const struct vpci_bar *bars;
+ struct vpci_msix *msix;
+ const struct vpci_msix_entry *entry;
+ unsigned int offset;
+
+ *data = ~0ul;
+
+ msix = vpci_msix_find(d, addr);
+ if ( !msix || !vpci_msix_access_allowed(msix->pdev, addr, len) )
+ return X86EMUL_OKAY;
In the !msix case I'm once again not convinced returning OKAY is correct
here.
From what we discussed in the mmcfg case, for the !msix case Xen
should return X86EMUL_RETRY. This error can only happen when the MSI-X
table is unmapped between an accept and a read/write call, so calling
the accept handler again will return the correct value.
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -100,6 +100,40 @@ struct vpci {
/* 64-bit address capable? */
bool address64;
} *msi;
+
+ /* MSI-X data. */
+ struct vpci_msix {
+ struct pci_dev *pdev;
+ /* List link. */
+ struct list_head next;
+ /* Table information. */
+ struct vpci_msix_mem {
+ /* MSI-X table offset. */
+ unsigned int offset;
+ /* MSI-X table BIR. */
+ unsigned int bir;
+ /* Table size. */
+ unsigned int size;
+#define VPCI_MSIX_TABLE 0
+#define VPCI_MSIX_PBA 1
+#define VPCI_MSIX_MEM_NUM 2
+ } mem[VPCI_MSIX_MEM_NUM];
+ /* Maximum number of vectors supported by the device. */
+ unsigned int max_entries;
+ /* MSI-X enabled? */
+ bool enabled;
+ /* Masked? */
+ bool masked;
+ /* Entries. */
+ struct vpci_msix_entry {
+ uint64_t addr;
+ uint32_t data;
+ unsigned int nr;
+ struct vpci_arch_msix_entry arch;
+ bool masked;
+ bool updated;
+ } entries[];
+ } *msix;
Same remark as for MSI regarding optimizing structure size.
Going over the fields, bir can be turned into a uint8_t, and size into
a uint16_t. max_entries can also be converted to a uint16_t together
with nr.

Apart from that I don't see much more room for optimization, unless we
start packing fields (i.e. offset and bir could reside in a uint32_t),
but IMHO that's going to make the code harder to parse for little gain,
and will involve more calculations in the handlers.

Thanks, Roger.
Jan Beulich
2017-10-11 10:30:52 UTC
Permalink
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -100,6 +100,40 @@ struct vpci {
/* 64-bit address capable? */
bool address64;
} *msi;
+
+ /* MSI-X data. */
+ struct vpci_msix {
+ struct pci_dev *pdev;
+ /* List link. */
+ struct list_head next;
+ /* Table information. */
+ struct vpci_msix_mem {
+ /* MSI-X table offset. */
+ unsigned int offset;
+ /* MSI-X table BIR. */
+ unsigned int bir;
+ /* Table size. */
+ unsigned int size;
+#define VPCI_MSIX_TABLE 0
+#define VPCI_MSIX_PBA 1
+#define VPCI_MSIX_MEM_NUM 2
+ } mem[VPCI_MSIX_MEM_NUM];
+ /* Maximum number of vectors supported by the device. */
+ unsigned int max_entries;
+ /* MSI-X enabled? */
+ bool enabled;
+ /* Masked? */
+ bool masked;
+ /* Entries. */
+ struct vpci_msix_entry {
+ uint64_t addr;
+ uint32_t data;
+ unsigned int nr;
+ struct vpci_arch_msix_entry arch;
+ bool masked;
+ bool updated;
+ } entries[];
+ } *msix;
Same remark as for MSI regarding optimizing structure size.
Going over the fields, bir can be turned into a uint8_t, and size into
a uint16_t. max_entries can also be converted to a uint16_t together
with nr.
Apart from that I don't see much more optimization, unless we start
packaging fields (ie: offset and bir could reside in a uint32_t), but
IMHO that's going to make the code harder to parse for little gain,
and will involve more calculations in the handlers.
The more instances of a structure there may be, the more
relevant it is to pack them tightly. I.e. primary focus needs
to be on struct vpci_msix_entry, but since - as indicated -
there may be many devices supporting MSI-X, struct vpci_msix
as a whole should be reasonably well packed as well. I don't
think more calculation in the handlers is an argument - the
compiler will do it for you, and the affected code shouldn't
really be performance critical (it's involved in setting up
interrupts, not delivering them).

Jan
Roger Pau Monne
2017-09-19 15:29:28 UTC
Permalink
Introduce a set of handlers for accesses to the MMCFG areas. Those
areas are set up based on the contents of the hardware MMCFG tables,
and the list of handled MMCFG areas is stored inside the hvm_domain
struct.

The read/writes are forwarded to the generic vpci handlers once the
address is decoded in order to obtain the device and register the
guest is trying to access.

Signed-off-by: Roger Pau Monné <***@citrix.com>
Reviewed-by: Paul Durrant <***@citrix.com>
---
Cc: Jan Beulich <***@suse.com>
Cc: Andrew Cooper <***@citrix.com>
Cc: Paul Durrant <***@citrix.com>
---
Changes since v5:
- Switch to use pci_sbdf_t.
- Switch to the new per vpci locks.
- Move the mmcfg related external definitions to asm-x86/pci.h

Changes since v4:
- Change the attribute of pvh_setup_mmcfg to __hwdom_init.
- Try to add as many MMCFG regions as possible, even if adding one of
them fails.
- Change some fields of the hvm_mmcfg struct: turn size into an
unsigned int, segment into uint16_t and bus into uint8_t.
- Convert some address parameters from unsigned long to paddr_t for
consistency.
- Make vpci_mmcfg_decode_addr return the decoded register in the
return of the function.
- Introduce a new macro to convert a MMCFG address into a BDF, and
use it in vpci_mmcfg_decode_addr to clarify the logic.
- In vpci_mmcfg_{read/write} unify the logic for 8B accesses and
smaller ones.
- Add the __hwdom_init attribute to register_vpci_mmcfg_handler.
- Test that reg + size doesn't cross a device boundary.

Changes since v3:
- Propagate changes from previous patches: drop xen_ prefix for vpci
functions, pass slot and func instead of devfn and fix the error
paths of the MMCFG handlers.
- s/ecam/mmcfg/.
- Move the destroy code to a separate function, so the hvm_mmcfg
struct can be private to hvm/io.c.
- Constify the return of vpci_mmcfg_find.
- Use d instead of v->domain in vpci_mmcfg_accept.
- Allow 8-byte accesses to the MMCFG area.

Changes since v1:
- Added locking.
---
xen/arch/x86/hvm/dom0_build.c | 21 +++++
xen/arch/x86/hvm/hvm.c | 4 +
xen/arch/x86/hvm/io.c | 174 ++++++++++++++++++++++++++++++++++++++-
xen/arch/x86/x86_64/mmconfig.h | 4 -
xen/include/asm-x86/hvm/domain.h | 4 +
xen/include/asm-x86/hvm/io.h | 7 ++
xen/include/asm-x86/pci.h | 6 ++
7 files changed, 215 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
index 020c355faf..17d77137d6 100644
--- a/xen/arch/x86/hvm/dom0_build.c
+++ b/xen/arch/x86/hvm/dom0_build.c
@@ -22,6 +22,7 @@
#include <xen/init.h>
#include <xen/libelf.h>
#include <xen/multiboot.h>
+#include <xen/pci.h>
#include <xen/softirq.h>

#include <acpi/actables.h>
@@ -1048,6 +1049,24 @@ static int __init pvh_setup_acpi(struct domain *d, paddr_t start_info)
return 0;
}

+static void __hwdom_init pvh_setup_mmcfg(struct domain *d)
+{
+ unsigned int i;
+ int rc;
+
+ for ( i = 0; i < pci_mmcfg_config_num; i++ )
+ {
+ rc = register_vpci_mmcfg_handler(d, pci_mmcfg_config[i].address,
+ pci_mmcfg_config[i].start_bus_number,
+ pci_mmcfg_config[i].end_bus_number,
+ pci_mmcfg_config[i].pci_segment);
+ if ( rc )
+ printk("Unable to setup MMCFG handler at %#lx for segment %u\n",
+ pci_mmcfg_config[i].address,
+ pci_mmcfg_config[i].pci_segment);
+ }
+}
+
int __init dom0_construct_pvh(struct domain *d, const module_t *image,
unsigned long image_headroom,
module_t *initrd,
@@ -1090,6 +1109,8 @@ int __init dom0_construct_pvh(struct domain *d, const module_t *image,
return rc;
}

+ pvh_setup_mmcfg(d);
+
panic("Building a PVHv2 Dom0 is not yet supported.");
return 0;
}
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index cc73df8dc7..b1064413fc 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -581,8 +581,10 @@ int hvm_domain_initialise(struct domain *d, unsigned long domcr_flags,
spin_lock_init(&d->arch.hvm_domain.irq_lock);
spin_lock_init(&d->arch.hvm_domain.uc_lock);
spin_lock_init(&d->arch.hvm_domain.write_map.lock);
+ rwlock_init(&d->arch.hvm_domain.mmcfg_lock);
INIT_LIST_HEAD(&d->arch.hvm_domain.write_map.list);
INIT_LIST_HEAD(&d->arch.hvm_domain.g2m_ioport_list);
+ INIT_LIST_HEAD(&d->arch.hvm_domain.mmcfg_regions);

rc = create_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0, NULL, NULL);
if ( rc )
@@ -728,6 +730,8 @@ void hvm_domain_destroy(struct domain *d)
list_del(&ioport->list);
xfree(ioport);
}
+
+ destroy_vpci_mmcfg(&d->arch.hvm_domain.mmcfg_regions);
}

static int hvm_save_tsc_adjust(struct domain *d, hvm_domain_context_t *h)
diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
index 6f9cd1f19e..7ee20eb5d4 100644
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -274,7 +274,7 @@ unsigned int hvm_pci_decode_addr(unsigned int cf8, unsigned int addr,
static bool vpci_access_allowed(unsigned int reg, unsigned int len)
{
/* Check access size. */
- if ( len != 1 && len != 2 && len != 4 )
+ if ( len != 1 && len != 2 && len != 4 && len != 8 )
return false;

/* Check that access is size aligned. */
@@ -372,6 +372,178 @@ void register_vpci_portio_handler(struct domain *d)
handler->ops = &vpci_portio_ops;
}

+struct hvm_mmcfg {
+ struct list_head next;
+ paddr_t addr;
+ unsigned int size;
+ uint16_t segment;
+ uint8_t start_bus;
+};
+
+/* Handlers to trap PCI MMCFG config accesses. */
+static const struct hvm_mmcfg *vpci_mmcfg_find(const struct domain *d,
+ paddr_t addr)
+{
+ const struct hvm_mmcfg *mmcfg;
+
+ list_for_each_entry ( mmcfg, &d->arch.hvm_domain.mmcfg_regions, next )
+ if ( addr >= mmcfg->addr && addr < mmcfg->addr + mmcfg->size )
+ return mmcfg;
+
+ return NULL;
+}
+
+static unsigned int vpci_mmcfg_decode_addr(const struct hvm_mmcfg *mmcfg,
+ paddr_t addr, pci_sbdf_t *sbdf)
+{
+ addr -= mmcfg->addr;
+ sbdf->bdf = MMCFG_BDF(addr);
+ sbdf->bus += mmcfg->start_bus;
+ sbdf->seg = mmcfg->segment;
+
+ return addr & (PCI_CFG_SPACE_EXP_SIZE - 1);
+}
+
+static int vpci_mmcfg_accept(struct vcpu *v, unsigned long addr)
+{
+ struct domain *d = v->domain;
+ bool found;
+
+ read_lock(&d->arch.hvm_domain.mmcfg_lock);
+ found = vpci_mmcfg_find(d, addr);
+ read_unlock(&d->arch.hvm_domain.mmcfg_lock);
+
+ return found;
+}
+
+static int vpci_mmcfg_read(struct vcpu *v, unsigned long addr,
+ unsigned int len, unsigned long *data)
+{
+ struct domain *d = v->domain;
+ const struct hvm_mmcfg *mmcfg;
+ unsigned int reg;
+ pci_sbdf_t sbdf;
+
+ *data = ~0ul;
+
+ read_lock(&d->arch.hvm_domain.mmcfg_lock);
+ mmcfg = vpci_mmcfg_find(d, addr);
+ if ( !mmcfg )
+ {
+ read_unlock(&d->arch.hvm_domain.mmcfg_lock);
+ return X86EMUL_OKAY;
+ }
+
+ reg = vpci_mmcfg_decode_addr(mmcfg, addr, &sbdf);
+ read_unlock(&d->arch.hvm_domain.mmcfg_lock);
+
+ if ( !vpci_access_allowed(reg, len) ||
+ (reg + len) > PCI_CFG_SPACE_EXP_SIZE )
+ return X86EMUL_OKAY;
+
+ /*
+ * According to the PCIe 3.1A specification:
+ * - Configuration Reads and Writes must usually be DWORD or smaller
+ * in size.
+ * - Because Root Complex implementations are not required to support
+ * accesses to a RCRB that cross DW boundaries [...] software
+ * should take care not to cause the generation of such accesses
+ * when accessing a RCRB unless the Root Complex will support the
+ * access.
+ * Xen however supports 8byte accesses by splitting them into two
+ * 4byte accesses.
+ */
+ *data = vpci_read(sbdf, reg, min(4u, len));
+ if ( len == 8 )
+ *data |= (uint64_t)vpci_read(sbdf, reg + 4, 4) << 32;
+
+ return X86EMUL_OKAY;
+}
+
+static int vpci_mmcfg_write(struct vcpu *v, unsigned long addr,
+ unsigned int len, unsigned long data)
+{
+ struct domain *d = v->domain;
+ const struct hvm_mmcfg *mmcfg;
+ unsigned int reg;
+ pci_sbdf_t sbdf;
+
+ read_lock(&d->arch.hvm_domain.mmcfg_lock);
+ mmcfg = vpci_mmcfg_find(d, addr);
+ if ( !mmcfg )
+ {
+ read_unlock(&d->arch.hvm_domain.mmcfg_lock);
+ return X86EMUL_OKAY;
+ }
+
+ reg = vpci_mmcfg_decode_addr(mmcfg, addr, &sbdf);
+ read_unlock(&d->arch.hvm_domain.mmcfg_lock);
+
+ if ( !vpci_access_allowed(reg, len) ||
+ (reg + len) > PCI_CFG_SPACE_EXP_SIZE )
+ return X86EMUL_OKAY;
+
+ vpci_write(sbdf, reg, min(4u, len), data);
+ if ( len == 8 )
+ vpci_write(sbdf, reg + 4, 4, data >> 32);
+
+ return X86EMUL_OKAY;
+}
+
+static const struct hvm_mmio_ops vpci_mmcfg_ops = {
+ .check = vpci_mmcfg_accept,
+ .read = vpci_mmcfg_read,
+ .write = vpci_mmcfg_write,
+};
+
+int __hwdom_init register_vpci_mmcfg_handler(struct domain *d, paddr_t addr,
+ unsigned int start_bus,
+ unsigned int end_bus,
+ unsigned int seg)
+{
+ struct hvm_mmcfg *mmcfg;
+
+ ASSERT(is_hardware_domain(d));
+
+ write_lock(&d->arch.hvm_domain.mmcfg_lock);
+ if ( vpci_mmcfg_find(d, addr) )
+ {
+ write_unlock(&d->arch.hvm_domain.mmcfg_lock);
+ return -EEXIST;
+ }
+
+ mmcfg = xmalloc(struct hvm_mmcfg);
+ if ( !mmcfg )
+ {
+ write_unlock(&d->arch.hvm_domain.mmcfg_lock);
+ return -ENOMEM;
+ }
+
+ if ( list_empty(&d->arch.hvm_domain.mmcfg_regions) )
+ register_mmio_handler(d, &vpci_mmcfg_ops);
+
+ mmcfg->addr = addr + (start_bus << 20);
+ mmcfg->start_bus = start_bus;
+ mmcfg->segment = seg;
+ mmcfg->size = (end_bus - start_bus + 1) << 20;
+ list_add(&mmcfg->next, &d->arch.hvm_domain.mmcfg_regions);
+ write_unlock(&d->arch.hvm_domain.mmcfg_lock);
+
+ return 0;
+}
+
+void destroy_vpci_mmcfg(struct list_head *domain_mmcfg)
+{
+ while ( !list_empty(domain_mmcfg) )
+ {
+ struct hvm_mmcfg *mmcfg = list_first_entry(domain_mmcfg,
+ struct hvm_mmcfg, next);
+
+ list_del(&mmcfg->next);
+ xfree(mmcfg);
+ }
+}
+
/*
* Local variables:
* mode: C
diff --git a/xen/arch/x86/x86_64/mmconfig.h b/xen/arch/x86/x86_64/mmconfig.h
index 7537519414..2e836848ad 100644
--- a/xen/arch/x86/x86_64/mmconfig.h
+++ b/xen/arch/x86/x86_64/mmconfig.h
@@ -74,10 +74,6 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val)
asm volatile("movl %%eax,(%1)" :: "a" (val), "r" (pos) : "memory");
}

-/* external variable defines */
-extern int pci_mmcfg_config_num;
-extern struct acpi_mcfg_allocation *pci_mmcfg_config;
-
/* function prototypes */
int acpi_parse_mcfg(struct acpi_table_header *header);
int pci_mmcfg_reserved(uint64_t address, unsigned int segment,
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index d2899c9bb2..cd19ee11e9 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -184,6 +184,10 @@ struct hvm_domain {
/* List of guest to machine IO ports mapping. */
struct list_head g2m_ioport_list;

+ /* List of MMCFG regions trapped by Xen. */
+ struct list_head mmcfg_regions;
+ rwlock_t mmcfg_lock;
+
/* List of permanently write-mapped pages. */
struct {
spinlock_t lock;
diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h
index f68aed9323..ba66e22e04 100644
--- a/xen/include/asm-x86/hvm/io.h
+++ b/xen/include/asm-x86/hvm/io.h
@@ -163,6 +163,13 @@ void register_g2m_portio_handler(struct domain *d);
/* HVM port IO handler for vPCI accesses. */
void register_vpci_portio_handler(struct domain *d);

+/* HVM MMIO handler for PCI MMCFG accesses. */
+int register_vpci_mmcfg_handler(struct domain *d, paddr_t addr,
+ unsigned int start_bus, unsigned int end_bus,
+ unsigned int seg);
+/* Destroy tracked MMCFG areas. */
+void destroy_vpci_mmcfg(struct list_head *domain_mmcfg);
+
#endif /* __ASM_X86_HVM_IO_H__ */


diff --git a/xen/include/asm-x86/pci.h b/xen/include/asm-x86/pci.h
index 36801d317b..cc05045e9c 100644
--- a/xen/include/asm-x86/pci.h
+++ b/xen/include/asm-x86/pci.h
@@ -6,6 +6,8 @@
#define CF8_ADDR_HI(cf8) ( ((cf8) & 0x0f000000) >> 16)
#define CF8_ENABLED(cf8) (!!((cf8) & 0x80000000))

+#define MMCFG_BDF(addr) ( ((addr) & 0x0ffff000) >> 12)
+
#define IS_SNB_GFX(id) (id == 0x01068086 || id == 0x01168086 \
|| id == 0x01268086 || id == 0x01028086 \
|| id == 0x01128086 || id == 0x01228086 \
@@ -26,4 +28,8 @@ bool_t pci_mmcfg_decode(unsigned long mfn, unsigned int *seg,
bool_t pci_ro_mmcfg_decode(unsigned long mfn, unsigned int *seg,
unsigned int *bdf);

+/* MMCFG external variable defines */
+extern int pci_mmcfg_config_num;
+extern struct acpi_mcfg_allocation *pci_mmcfg_config;
+
#endif /* __X86_PCI_H__ */
--
2.11.0 (Apple Git-81)
Jan Beulich
2017-10-04 08:31:18 UTC
Permalink
Post by Roger Pau Monne
+static int vpci_mmcfg_read(struct vcpu *v, unsigned long addr,
+ unsigned int len, unsigned long *data)
+{
+ struct domain *d = v->domain;
+ const struct hvm_mmcfg *mmcfg;
+ unsigned int reg;
+ pci_sbdf_t sbdf;
+
+ *data = ~0ul;
+
+ read_lock(&d->arch.hvm_domain.mmcfg_lock);
+ mmcfg = vpci_mmcfg_find(d, addr);
+ if ( !mmcfg )
+ {
+ read_unlock(&d->arch.hvm_domain.mmcfg_lock);
+ return X86EMUL_OKAY;
+ }
With the lock dropped between accept() and read() (or write() below),
is it really appropriate to return OKAY here? The access again should
be forwarded to qemu, I would think.
Post by Roger Pau Monne
+int __hwdom_init register_vpci_mmcfg_handler(struct domain *d, paddr_t addr,
+ unsigned int start_bus,
+ unsigned int end_bus,
+ unsigned int seg)
+{
+ struct hvm_mmcfg *mmcfg;
+
+ ASSERT(is_hardware_domain(d));
+
+ write_lock(&d->arch.hvm_domain.mmcfg_lock);
+ if ( vpci_mmcfg_find(d, addr) )
+ {
+ write_unlock(&d->arch.hvm_domain.mmcfg_lock);
+ return -EEXIST;
+ }
You check here for an exact match in starting address. Is the intention
really to reject just this special case, rather than doing a proper
overlap check?
Post by Roger Pau Monne
+ mmcfg = xmalloc(struct hvm_mmcfg);
Whenever possible without too much trouble allocations should be done
with no lock held.

Jan
Roger Pau Monné
2017-10-04 10:21:38 UTC
Permalink
Post by Jan Beulich
Post by Roger Pau Monne
+static int vpci_mmcfg_read(struct vcpu *v, unsigned long addr,
+ unsigned int len, unsigned long *data)
+{
+ struct domain *d = v->domain;
+ const struct hvm_mmcfg *mmcfg;
+ unsigned int reg;
+ pci_sbdf_t sbdf;
+
+ *data = ~0ul;
+
+ read_lock(&d->arch.hvm_domain.mmcfg_lock);
+ mmcfg = vpci_mmcfg_find(d, addr);
+ if ( !mmcfg )
+ {
+ read_unlock(&d->arch.hvm_domain.mmcfg_lock);
+ return X86EMUL_OKAY;
+ }
With the lock dropped between accept() and read() (or write() below),
is it really appropriate to return OKAY here? The access again should
be forwarded to qemu, I would think.
That's right, the MMCFG area could have been removed in the meantime.
I guess it is indeed more appropriate to return X86EMUL_UNHANDLEABLE
or X86EMUL_RETRY.

It would seem like X86EMUL_RETRY is better, since a new call to the
accept handler (vpci_mmcfg_accept) should return false now.

Roger.
Jan Beulich
2017-10-04 11:34:37 UTC
Permalink
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
+static int vpci_mmcfg_read(struct vcpu *v, unsigned long addr,
+ unsigned int len, unsigned long *data)
+{
+ struct domain *d = v->domain;
+ const struct hvm_mmcfg *mmcfg;
+ unsigned int reg;
+ pci_sbdf_t sbdf;
+
+ *data = ~0ul;
+
+ read_lock(&d->arch.hvm_domain.mmcfg_lock);
+ mmcfg = vpci_mmcfg_find(d, addr);
+ if ( !mmcfg )
+ {
+ read_unlock(&d->arch.hvm_domain.mmcfg_lock);
+ return X86EMUL_OKAY;
+ }
With the lock dropped between accept() and read() (or write() below),
is it really appropriate to return OKAY here? The access again should
be forwarded to qemu, I would think.
That's right, the MCFG area could have been removed in the meantime.
I guess it is indeed more appropriate to return X86EMUL_UNHANDLEABLE
or X86EMUL_RETRY.
It would seem like RETRY is better, since a new call to _accept should
return false now.
Ah, yes, I'm fine with RETRY.

Jan
Roger Pau Monne
2017-09-19 15:29:26 UTC
Permalink
The new type provides direct access to all the members that constitute
an SBDF. The only function switched to use it is hvm_pci_decode_addr,
because doing so makes the following patches simpler.

Suggested-by: Andrew Cooper <***@citrix.com>
Signed-off-by: Roger Pau Monné <***@citrix.com>
---
Cc: Paul Durrant <***@citrix.com>
Cc: Jan Beulich <***@suse.com>
Cc: Andrew Cooper <***@citrix.com>
Cc: George Dunlap <***@eu.citrix.com>
Cc: Ian Jackson <***@eu.citrix.com>
Cc: Konrad Rzeszutek Wilk <***@oracle.com>
Cc: Stefano Stabellini <***@kernel.org>
Cc: Tim Deegan <***@xen.org>
Cc: Wei Liu <***@citrix.com>
---
Changes since v5:
- New in this version.
---
xen/arch/x86/hvm/io.c | 10 ++--------
xen/arch/x86/hvm/ioreq.c | 12 +++++-------
xen/include/asm-x86/hvm/io.h | 4 ++--
xen/include/xen/pci.h | 20 ++++++++++++++++++++
4 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
index bf41954f59..4e49e59012 100644
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -257,17 +257,11 @@ void register_g2m_portio_handler(struct domain *d)
}

unsigned int hvm_pci_decode_addr(unsigned int cf8, unsigned int addr,
- unsigned int *bus, unsigned int *slot,
- unsigned int *func)
+ pci_sbdf_t *bdf)
{
- unsigned int bdf;
-
ASSERT(CF8_ENABLED(cf8));

- bdf = CF8_BDF(cf8);
- *bus = PCI_BUS(bdf);
- *slot = PCI_SLOT(bdf);
- *func = PCI_FUNC(bdf);
+ bdf->sbdf = CF8_BDF(cf8);
/*
* NB: the lower 2 bits of the register address are fetched from the
* offset into the 0xcfc register when reading/writing to it.
diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index 752976d16d..3e7a88e053 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -1177,17 +1177,15 @@ struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
(p->addr & ~3) == 0xcfc &&
CF8_ENABLED(cf8) )
{
- uint32_t sbdf, x86_fam;
- unsigned int bus, slot, func, reg;
+ uint32_t x86_fam;
+ pci_sbdf_t bdf;
+ unsigned int reg;

- reg = hvm_pci_decode_addr(cf8, p->addr, &bus, &slot, &func);
+ reg = hvm_pci_decode_addr(cf8, p->addr, &bdf);

/* PCI config data cycle */
-
- sbdf = XEN_DMOP_PCI_SBDF(0, bus, slot, func);
-
type = XEN_DMOP_IO_RANGE_PCI;
- addr = ((uint64_t)sbdf << 32) | reg;
+ addr = ((uint64_t)bdf.bdf << 32) | reg;
/* AMD extended configuration space access? */
if ( CF8_ADDR_HI(cf8) &&
d->arch.cpuid->x86_vendor == X86_VENDOR_AMD &&
diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h
index 51659b6c7f..2ff1c96883 100644
--- a/xen/include/asm-x86/hvm/io.h
+++ b/xen/include/asm-x86/hvm/io.h
@@ -20,6 +20,7 @@
#define __ASM_X86_HVM_IO_H__

#include <xen/mm.h>
+#include <xen/pci.h>
#include <asm/hvm/vpic.h>
#include <asm/hvm/vioapic.h>
#include <public/hvm/ioreq.h>
@@ -151,8 +152,7 @@ extern void hvm_dpci_msi_eoi(struct domain *d, int vector);

/* Decode a PCI port IO access into a bus/slot/func/reg. */
unsigned int hvm_pci_decode_addr(unsigned int cf8, unsigned int addr,
- unsigned int *bus, unsigned int *slot,
- unsigned int *func);
+ pci_sbdf_t *bdf);

/*
* HVM port IO handler that performs forwarding of guest IO ports into machine
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 43f21251a5..dd5ec43a70 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -38,6 +38,26 @@
#define PCI_SBDF2(s,bdf) ((((s) & 0xffff) << 16) | ((bdf) & 0xffff))
#define PCI_SBDF3(s,b,df) ((((s) & 0xffff) << 16) | PCI_BDF2(b, df))

+typedef union {
+ uint32_t sbdf;
+ struct {
+ union {
+ uint16_t bdf;
+ struct {
+ union {
+ struct {
+ uint8_t func : 3,
+ dev : 5;
+ };
+ uint8_t extfunc;
+ };
+ uint8_t bus;
+ };
+ };
+ uint16_t seg;
+ };
+} pci_sbdf_t;
+
struct pci_dev_info {
/*
* VF's 'is_extfn' field is used to indicate whether its PF is an extended
--
2.11.0 (Apple Git-81)
Paul Durrant
2017-09-19 15:40:51 UTC
Permalink
-----Original Message-----
Sent: 19 September 2017 16:29
Subject: [PATCH v6 01/11] pci: introduce a type to store a SBDF
That provides direct access to all the members that constitute a SBDF.
The only function switched to use it is hvm_pci_decode_addr, because
it makes following patches simpler.
---
---
- New in this version.
---
xen/arch/x86/hvm/io.c | 10 ++--------
xen/arch/x86/hvm/ioreq.c | 12 +++++-------
xen/include/asm-x86/hvm/io.h | 4 ++--
xen/include/xen/pci.h | 20 ++++++++++++++++++++
4 files changed, 29 insertions(+), 17 deletions(-)
diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
index bf41954f59..4e49e59012 100644
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -257,17 +257,11 @@ void register_g2m_portio_handler(struct domain *d)
}
unsigned int hvm_pci_decode_addr(unsigned int cf8, unsigned int addr,
- unsigned int *bus, unsigned int *slot,
- unsigned int *func)
+ pci_sbdf_t *bdf)
I'd prefer the pointer name to be 'sbdf' rather than 'bdf', but otherwise...
{
- unsigned int bdf;
-
ASSERT(CF8_ENABLED(cf8));
- bdf = CF8_BDF(cf8);
- *bus = PCI_BUS(bdf);
- *slot = PCI_SLOT(bdf);
- *func = PCI_FUNC(bdf);
+ bdf->sbdf = CF8_BDF(cf8);
/*
* NB: the lower 2 bits of the register address are fetched from the
* offset into the 0xcfc register when reading/writing to it.
diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index 752976d16d..3e7a88e053 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -1177,17 +1177,15 @@ struct hvm_ioreq_server
*hvm_select_ioreq_server(struct domain *d,
(p->addr & ~3) == 0xcfc &&
CF8_ENABLED(cf8) )
{
- uint32_t sbdf, x86_fam;
- unsigned int bus, slot, func, reg;
+ uint32_t x86_fam;
+ pci_sbdf_t bdf;
+ unsigned int reg;
- reg = hvm_pci_decode_addr(cf8, p->addr, &bus, &slot, &func);
+ reg = hvm_pci_decode_addr(cf8, p->addr, &bdf);
/* PCI config data cycle */
-
- sbdf = XEN_DMOP_PCI_SBDF(0, bus, slot, func);
-
type = XEN_DMOP_IO_RANGE_PCI;
- addr = ((uint64_t)sbdf << 32) | reg;
+ addr = ((uint64_t)bdf.bdf << 32) | reg;
/* AMD extended configuration space access? */
if ( CF8_ADDR_HI(cf8) &&
d->arch.cpuid->x86_vendor == X86_VENDOR_AMD &&
diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h
index 51659b6c7f..2ff1c96883 100644
--- a/xen/include/asm-x86/hvm/io.h
+++ b/xen/include/asm-x86/hvm/io.h
@@ -20,6 +20,7 @@
#define __ASM_X86_HVM_IO_H__
#include <xen/mm.h>
+#include <xen/pci.h>
#include <asm/hvm/vpic.h>
#include <asm/hvm/vioapic.h>
#include <public/hvm/ioreq.h>
@@ -151,8 +152,7 @@ extern void hvm_dpci_msi_eoi(struct domain *d, int vector);
/* Decode a PCI port IO access into a bus/slot/func/reg. */
unsigned int hvm_pci_decode_addr(unsigned int cf8, unsigned int addr,
- unsigned int *bus, unsigned int *slot,
- unsigned int *func);
+ pci_sbdf_t *bdf);
/*
* HVM port IO handler that performs forwarding of guest IO ports into machine
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 43f21251a5..dd5ec43a70 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -38,6 +38,26 @@
#define PCI_SBDF2(s,bdf) ((((s) & 0xffff) << 16) | ((bdf) & 0xffff))
#define PCI_SBDF3(s,b,df) ((((s) & 0xffff) << 16) | PCI_BDF2(b, df))
+typedef union {
+ uint32_t sbdf;
+ struct {
+ union {
+ uint16_t bdf;
+ struct {
+ union {
+ struct {
+ uint8_t func : 3,
+ dev : 5;
+ };
+ uint8_t extfunc;
+ };
+ uint8_t bus;
+ };
+ };
+ uint16_t seg;
+ };
+} pci_sbdf_t;
+
struct pci_dev_info {
/*
* VF's 'is_extfn' field is used to indicate whether its PF is an extended
--
2.11.0 (Apple Git-81)
Jan Beulich
2017-09-19 16:04:23 UTC
Permalink
Post by Paul Durrant
Post by Roger Pau Monne
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -257,17 +257,11 @@ void register_g2m_portio_handler(struct domain *d)
}
unsigned int hvm_pci_decode_addr(unsigned int cf8, unsigned int addr,
- unsigned int *bus, unsigned int *slot,
- unsigned int *func)
+ pci_sbdf_t *bdf)
I'd prefer the pointer name to be 'sbdf' rather than 'bdf', but otherwise...
Indeed. Or have a sub-type "struct pci_bdf_t", as the segment
(sadly) isn't relevant yet.
Post by Paul Durrant
Post by Roger Pau Monne
{
- unsigned int bdf;
-
ASSERT(CF8_ENABLED(cf8));
- bdf = CF8_BDF(cf8);
- *bus = PCI_BUS(bdf);
- *slot = PCI_SLOT(bdf);
- *func = PCI_FUNC(bdf);
+ bdf->sbdf = CF8_BDF(cf8);
Filling ->bdf here and setting ->seg explicitly with zero may also
make the current limitation more obvious.
Post by Paul Durrant
Post by Roger Pau Monne
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -1177,17 +1177,15 @@ struct hvm_ioreq_server
*hvm_select_ioreq_server(struct domain *d,
(p->addr & ~3) == 0xcfc &&
CF8_ENABLED(cf8) )
{
- uint32_t sbdf, x86_fam;
- unsigned int bus, slot, func, reg;
+ uint32_t x86_fam;
+ pci_sbdf_t bdf;
+ unsigned int reg;
- reg = hvm_pci_decode_addr(cf8, p->addr, &bus, &slot, &func);
+ reg = hvm_pci_decode_addr(cf8, p->addr, &bdf);
/* PCI config data cycle */
-
- sbdf = XEN_DMOP_PCI_SBDF(0, bus, slot, func);
-
type = XEN_DMOP_IO_RANGE_PCI;
- addr = ((uint64_t)sbdf << 32) | reg;
+ addr = ((uint64_t)bdf.bdf << 32) | reg;
I also wonder why the field used here is bdf instead of sbdf.
It would make for fewer future changes if you used .sbdf here
right away.

Jan
Roger Pau Monne
2017-09-19 15:29:34 UTC
Permalink
Add handlers for the MSI control, address, data and mask fields in
order to detect accesses to them and setup the interrupts as requested
by the guest.

Note that the pending register is not trapped, and the guest can
freely read/write to it.

Signed-off-by: Roger Pau Monné <***@citrix.com>
Reviewed-by: Paul Durrant <***@citrix.com>
---
Cc: Jan Beulich <***@suse.com>
Cc: Andrew Cooper <***@citrix.com>
Cc: Paul Durrant <***@citrix.com>
---
Changes since v5:
- Update to new lock usage.
- Change handlers to match the new type.
- s/msi_flags/msi_gflags/, remove the local variables and use the new
DOMCTL_VMSI_* defines.
- Change the MSI arch function to take a vpci_msi instead of a
vpci_arch_msi as parameter.
- Fix the calculation of the guest vector for MSI injection to take
into account the number of bits that can be modified.
- Use INVALID_PIRQ everywhere.
- Simplify exit path of vpci_msi_disable.
- Remove the conditional when setting address64 and masking fields.
- Add a process_pending_softirqs to the MSI dump loop.
- Place the prototypes for the MSI arch-specific functions in
xen/vpci.h.
- Add parentheses around the INVALID_PIRQ definition.

Changes since v4:
- Fix commit message.
- Change the ASSERTs in vpci_msi_arch_mask into ifs.
- Introduce INVALID_PIRQ.
- Destroy the partially created bindings in case of failure in
vpci_msi_arch_enable.
- Just take the pcidevs lock once in vpci_msi_arch_disable.
- Print an error message in case of failure of pt_irq_destroy_bind.
- Make vpci_msi_arch_init return void.
- Constify the arch parameter of vpci_msi_arch_print.
- Use fixed instead of cpu for msi redirection.
- Separate the header includes in vpci/msi.c between xen and asm.
- Store the number of configured vectors even if MSI is not enabled
and always return it in vpci_msi_control_read.
- Fix/add comments in vpci_msi_control_write to clarify intended
behavior.
- Simplify usage of masks in vpci_msi_address_{upper_}write.
- Add comment to vpci_msi_mask_{read/write}.
- Don't use MASK_EXTR in vpci_msi_mask_write.
- s/msi_offset/pos/ in vpci_init_msi.
- Move control variable setup closer to its usage.
- Use d%d in vpci_dump_msi.
- Fix printing of bitfield mask in vpci_dump_msi.
- Fix definition of MSI_ADDR_REDIRECTION_MASK.
- Shuffle the layout of vpci_msi to minimize gaps.
- Remove the error label in vpci_init_msi.

Changes since v3:
- Propagate changes from previous versions: drop xen_ prefix, drop
return value from handlers, use the new vpci_val fields.
- Use MASK_EXTR.
- Remove the usage of GENMASK.
- Add GFLAGS_SHIFT_DEST_ID and use it in msi_flags.
- Add "arch" to the MSI arch specific functions.
- Move the dumping of vPCI MSI information to dump_msi (key 'M').
- Remove the guest_vectors field.
- Allow the guest to change the number of active vectors without
having to disable and enable MSI.
- Check the number of active vectors when parsing the disable
mask.
- Remove the debug messages from vpci_init_msi.
- Move the arch-specific part of the dump handler to x86/hvm/vmsi.c.
- Use trylock in the dump handler to get the vpci lock.

Changes since v2:
- Add an arch-specific abstraction layer. Note that this is only implemented
for x86 currently.
- Add a wrapper to detect MSI enabling for vPCI.

NB: I've only been able to test this with devices using a single MSI interrupt
and no mask register. I will try to find hardware that supports the mask
register and more than one vector, but I cannot make any promises.

If there are doubts about the untested parts we could always force Xen to
report no per-vector masking support and only 1 available vector, but I would
rather avoid doing it.
---
xen/arch/x86/hvm/vmsi.c | 153 ++++++++++++++++++
xen/arch/x86/msi.c | 3 +
xen/drivers/vpci/Makefile | 2 +-
xen/drivers/vpci/msi.c | 366 +++++++++++++++++++++++++++++++++++++++++++
xen/include/asm-x86/hvm/io.h | 5 +
xen/include/asm-x86/msi.h | 1 +
xen/include/xen/irq.h | 1 +
xen/include/xen/vpci.h | 35 +++++
8 files changed, 565 insertions(+), 1 deletion(-)
create mode 100644 xen/drivers/vpci/msi.c

diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 9b35e9b696..3dcde3d882 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -31,6 +31,7 @@
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/irq.h>
+#include <xen/vpci.h>
#include <public/hvm/ioreq.h>
#include <asm/hvm/io.h>
#include <asm/hvm/vpic.h>
@@ -621,3 +622,155 @@ void msix_write_completion(struct vcpu *v)
if ( msixtbl_write(v, ctrl_address, 4, 0) != X86EMUL_OKAY )
gdprintk(XENLOG_WARNING, "MSI-X write completion failure\n");
}
+
+static unsigned int msi_gflags(uint16_t data, uint64_t addr)
+{
+ /*
+ * We need to use the DOMCTL constants here because the output of this
+ * function is used as input to pt_irq_create_bind, which also takes the
+ * input from the DOMCTL itself.
+ */
+ return MASK_INSR(MASK_EXTR(addr, MSI_ADDR_DEST_ID_MASK),
+ XEN_DOMCTL_VMSI_X86_DEST_ID_MASK) |
+ MASK_INSR(MASK_EXTR(addr, MSI_ADDR_REDIRECTION_MASK),
+ XEN_DOMCTL_VMSI_X86_RH_MASK) |
+ MASK_INSR(MASK_EXTR(addr, MSI_ADDR_DESTMODE_MASK),
+ XEN_DOMCTL_VMSI_X86_DM_MASK) |
+ MASK_INSR(MASK_EXTR(data, MSI_DATA_DELIVERY_MODE_MASK),
+ XEN_DOMCTL_VMSI_X86_DELIV_MASK) |
+ MASK_INSR(MASK_EXTR(data, MSI_DATA_TRIGGER_MASK),
+ XEN_DOMCTL_VMSI_X86_TRIG_MASK);
+}
+
+void vpci_msi_arch_mask(struct vpci_msi *msi, const struct pci_dev *pdev,
+ unsigned int entry, bool mask)
+{
+ const struct pirq *pinfo;
+ struct irq_desc *desc;
+ unsigned long flags;
+ int irq;
+
+ ASSERT(msi->arch.pirq >= 0 && entry < msi->vectors);
+ pinfo = pirq_info(pdev->domain, msi->arch.pirq + entry);
+ if ( !pinfo )
+ return;
+
+ irq = pinfo->arch.irq;
+ if ( irq >= nr_irqs || irq < 0)
+ return;
+
+ desc = irq_to_desc(irq);
+ if ( !desc )
+ return;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ guest_mask_msi_irq(desc, mask);
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+int vpci_msi_arch_enable(struct vpci_msi *msi, const struct pci_dev *pdev,
+ unsigned int vectors)
+{
+ struct msi_info msi_info = {
+ .seg = pdev->seg,
+ .bus = pdev->bus,
+ .devfn = pdev->devfn,
+ .entry_nr = vectors,
+ };
+ unsigned int i;
+ int rc;
+
+ ASSERT(msi->arch.pirq == INVALID_PIRQ);
+
+ /* Get a PIRQ. */
+ rc = allocate_and_map_msi_pirq(pdev->domain, -1, &msi->arch.pirq,
+ MAP_PIRQ_TYPE_MULTI_MSI, &msi_info);
+ if ( rc )
+ {
+ gdprintk(XENLOG_ERR, "%04x:%02x:%02x.%u: failed to map PIRQ: %d\n",
+ pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), rc);
+ return rc;
+ }
+
+ for ( i = 0; i < vectors; i++ )
+ {
+ uint8_t vector = MASK_EXTR(msi->data, MSI_DATA_VECTOR_MASK);
+ uint8_t vector_mask = 0xff >> (8 - fls(msi->vectors) + 1);
+ xen_domctl_bind_pt_irq_t bind = {
+ .machine_irq = msi->arch.pirq + i,
+ .irq_type = PT_IRQ_TYPE_MSI,
+ .u.msi.gvec = (vector & ~vector_mask) |
+ ((vector + i) & vector_mask),
+ .u.msi.gflags = msi_gflags(msi->data, msi->address),
+ };
+
+ pcidevs_lock();
+ rc = pt_irq_create_bind(pdev->domain, &bind);
+ if ( rc )
+ {
+ gdprintk(XENLOG_ERR,
+ "%04x:%02x:%02x.%u: failed to bind PIRQ %u: %d\n",
+ pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), msi->arch.pirq + i, rc);
+ while ( bind.machine_irq-- )
+ pt_irq_destroy_bind(pdev->domain, &bind);
+ spin_lock(&pdev->domain->event_lock);
+ unmap_domain_pirq(pdev->domain, msi->arch.pirq);
+ spin_unlock(&pdev->domain->event_lock);
+ pcidevs_unlock();
+ msi->arch.pirq = INVALID_PIRQ;
+ return rc;
+ }
+ pcidevs_unlock();
+ }
+
+ return 0;
+}
+
+int vpci_msi_arch_disable(struct vpci_msi *msi, const struct pci_dev *pdev)
+{
+ unsigned int i;
+
+ ASSERT(msi->arch.pirq != INVALID_PIRQ);
+
+ pcidevs_lock();
+ for ( i = 0; i < msi->vectors; i++ )
+ {
+ xen_domctl_bind_pt_irq_t bind = {
+ .machine_irq = msi->arch.pirq + i,
+ .irq_type = PT_IRQ_TYPE_MSI,
+ };
+ int rc;
+
+ rc = pt_irq_destroy_bind(pdev->domain, &bind);
+ ASSERT(!rc);
+ }
+
+ spin_lock(&pdev->domain->event_lock);
+ unmap_domain_pirq(pdev->domain, msi->arch.pirq);
+ spin_unlock(&pdev->domain->event_lock);
+ pcidevs_unlock();
+
+ msi->arch.pirq = INVALID_PIRQ;
+
+ return 0;
+}
+
+void vpci_msi_arch_init(struct vpci_msi *msi)
+{
+ msi->arch.pirq = INVALID_PIRQ;
+}
+
+void vpci_msi_arch_print(const struct vpci_msi *msi)
+{
+ printk("vec=%#02x%7s%6s%3sassert%5s%7s dest_id=%lu pirq: %d\n",
+ MASK_EXTR(msi->data, MSI_DATA_VECTOR_MASK),
+ msi->data & MSI_DATA_DELIVERY_LOWPRI ? "lowest" : "fixed",
+ msi->data & MSI_DATA_TRIGGER_LEVEL ? "level" : "edge",
+ msi->data & MSI_DATA_LEVEL_ASSERT ? "" : "de",
+ msi->address & MSI_ADDR_DESTMODE_LOGIC ? "log" : "phys",
+ msi->address & MSI_ADDR_REDIRECTION_LOWPRI ? "lowest" : "fixed",
+ MASK_EXTR(msi->address, MSI_ADDR_DEST_ID_MASK),
+ msi->arch.pirq);
+}
diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 77998f4fb3..63769153f1 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -30,6 +30,7 @@
#include <public/physdev.h>
#include <xen/iommu.h>
#include <xsm/xsm.h>
+#include <xen/vpci.h>

static s8 __read_mostly use_msi = -1;
boolean_param("msi", use_msi);
@@ -1536,6 +1537,8 @@ static void dump_msi(unsigned char key)
attr.guest_masked ? 'G' : ' ',
mask);
}
+
+ vpci_dump_msi();
}

static int __init msi_setup_keyhandler(void)
diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
index 241467212f..62cec9e82b 100644
--- a/xen/drivers/vpci/Makefile
+++ b/xen/drivers/vpci/Makefile
@@ -1 +1 @@
-obj-y += vpci.o header.o
+obj-y += vpci.o header.o msi.o
diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
new file mode 100644
index 0000000000..933adba0ff
--- /dev/null
+++ b/xen/drivers/vpci/msi.c
@@ -0,0 +1,366 @@
+/*
+ * Handlers for accesses to the MSI capability structure.
+ *
+ * Copyright (C) 2017 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/sched.h>
+#include <xen/softirq.h>
+#include <xen/vpci.h>
+
+#include <asm/msi.h>
+
+static uint32_t vpci_msi_control_read(const struct pci_dev *pdev,
+ unsigned int reg, void *data)
+{
+ const struct vpci_msi *msi = data;
+ uint16_t val;
+
+ /* Set the number of supported/configured messages. */
+ val = MASK_INSR(fls(msi->max_vectors) - 1, PCI_MSI_FLAGS_QMASK);
+ val |= MASK_INSR(fls(msi->vectors) - 1, PCI_MSI_FLAGS_QSIZE);
+
+ val |= msi->enabled ? PCI_MSI_FLAGS_ENABLE : 0;
+ val |= msi->masking ? PCI_MSI_FLAGS_MASKBIT : 0;
+ val |= msi->address64 ? PCI_MSI_FLAGS_64BIT : 0;
+
+ return val;
+}
+
+static void vpci_msi_enable(const struct pci_dev *pdev, struct vpci_msi *msi,
+ unsigned int vectors)
+{
+ int ret;
+
+ ASSERT(!msi->enabled);
+ ret = vpci_msi_arch_enable(msi, pdev, vectors);
+ if ( ret )
+ return;
+
+ /* Apply the mask bits. */
+ if ( msi->masking )
+ {
+ unsigned int i;
+ uint32_t mask = msi->mask;
+
+ for ( i = ffs(mask) - 1; mask && i < vectors; i = ffs(mask) - 1 )
+ {
+ vpci_msi_arch_mask(msi, pdev, i, true);
+ __clear_bit(i, &mask);
+ }
+ }
+
+ __msi_set_enable(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), msi->pos, 1);
+
+ msi->enabled = true;
+}
+
+static int vpci_msi_disable(const struct pci_dev *pdev, struct vpci_msi *msi)
+{
+ int ret;
+
+ ASSERT(msi->enabled);
+ __msi_set_enable(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), msi->pos, 0);
+
+ ret = vpci_msi_arch_disable(msi, pdev);
+ if ( !ret )
+ msi->enabled = false;
+
+ return ret;
+}
+
+static void vpci_msi_control_write(const struct pci_dev *pdev,
+ unsigned int reg, uint32_t val, void *data)
+{
+ struct vpci_msi *msi = data;
+ unsigned int vectors = 1 << MASK_EXTR(val, PCI_MSI_FLAGS_QSIZE);
+ bool new_enabled = val & PCI_MSI_FLAGS_ENABLE;
+
+ if ( vectors > msi->max_vectors )
+ vectors = msi->max_vectors;
+
+ /*
+ * No change if the enable field and the number of vectors is
+ * the same or the device is not enabled, in which case the
+ * vectors field can be updated directly.
+ */
+ if ( new_enabled == msi->enabled &&
+ (vectors == msi->vectors || !msi->enabled) )
+ {
+ msi->vectors = vectors;
+ return;
+ }
+
+ if ( new_enabled )
+ {
+ /*
+ * If the device is already enabled it means the number of
+ * enabled messages has changed. Disable and re-enable the
+ * device in order to apply the change.
+ */
+ if ( msi->enabled && vpci_msi_disable(pdev, msi) )
+ /*
+ * Somehow Xen has not been able to disable the
+ * configured MSI messages, leave the device state as-is,
+ * so that the guest can try to disable MSI again.
+ */
+ return;
+
+ vpci_msi_enable(pdev, msi, vectors);
+ }
+ else
+ vpci_msi_disable(pdev, msi);
+
+ msi->vectors = vectors;
+}
+
+/* Handlers for the address field (32bit or low part of a 64bit address). */
+static uint32_t vpci_msi_address_read(const struct pci_dev *pdev,
+ unsigned int reg, void *data)
+{
+ const struct vpci_msi *msi = data;
+
+ return msi->address;
+}
+
+static void vpci_msi_address_write(const struct pci_dev *pdev,
+ unsigned int reg, uint32_t val, void *data)
+{
+ struct vpci_msi *msi = data;
+
+ /* Clear low part. */
+ msi->address &= ~0xffffffffull;
+ msi->address |= val;
+}
+
+/* Handlers for the high part of a 64bit address field. */
+static uint32_t vpci_msi_address_upper_read(const struct pci_dev *pdev,
+ unsigned int reg, void *data)
+{
+ const struct vpci_msi *msi = data;
+
+ return msi->address >> 32;
+}
+
+static void vpci_msi_address_upper_write(const struct pci_dev *pdev,
+ unsigned int reg, uint32_t val,
+ void *data)
+{
+ struct vpci_msi *msi = data;
+
+ /* Clear high part. */
+ msi->address &= 0xffffffff;
+ msi->address |= (uint64_t)val << 32;
+}
+
+/* Handlers for the data field. */
+static uint32_t vpci_msi_data_read(const struct pci_dev *pdev,
+ unsigned int reg, void *data)
+{
+ const struct vpci_msi *msi = data;
+
+ return msi->data;
+}
+
+static void vpci_msi_data_write(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t val, void *data)
+{
+ struct vpci_msi *msi = data;
+
+ msi->data = val;
+}
+
+/* Handlers for the MSI mask bits. */
+static uint32_t vpci_msi_mask_read(const struct pci_dev *pdev,
+ unsigned int reg, void *data)
+{
+ const struct vpci_msi *msi = data;
+
+ return msi->mask;
+}
+
+static void vpci_msi_mask_write(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t val, void *data)
+{
+ struct vpci_msi *msi = data;
+ uint32_t dmask;
+
+ dmask = msi->mask ^ val;
+
+ if ( !dmask )
+ return;
+
+ if ( msi->enabled )
+ {
+ unsigned int i;
+
+ for ( i = ffs(dmask) - 1; dmask && i < msi->vectors;
+ i = ffs(dmask) - 1 )
+ {
+ vpci_msi_arch_mask(msi, pdev, i, (val >> i) & 1);
+ __clear_bit(i, &dmask);
+ }
+ }
+
+ msi->mask = val;
+}
+
+static int vpci_init_msi(struct pci_dev *pdev)
+{
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ struct vpci_msi *msi;
+ unsigned int pos;
+ uint16_t control;
+ int ret;
+
+ pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSI);
+ if ( !pos )
+ return 0;
+
+ msi = xzalloc(struct vpci_msi);
+ if ( !msi )
+ return -ENOMEM;
+
+ msi->pos = pos;
+
+ ret = vpci_add_register(pdev, vpci_msi_control_read,
+ vpci_msi_control_write,
+ msi_control_reg(pos), 2, msi);
+ if ( ret )
+ {
+ xfree(msi);
+ return ret;
+ }
+
+ /* Get the maximum number of vectors the device supports. */
+ control = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos));
+ msi->max_vectors = multi_msi_capable(control);
+ ASSERT(msi->max_vectors <= 32);
+
+ /* The multiple message enable is 0 after reset (1 message enabled). */
+ msi->vectors = 1;
+
+ /* No PIRQ bound yet. */
+ vpci_msi_arch_init(msi);
+
+ msi->address64 = is_64bit_address(control);
+ msi->masking = is_mask_bit_support(control);
+
+ ret = vpci_add_register(pdev, vpci_msi_address_read,
+ vpci_msi_address_write,
+ msi_lower_address_reg(pos), 4, msi);
+ if ( ret )
+ {
+ xfree(msi);
+ return ret;
+ }
+
+ ret = vpci_add_register(pdev, vpci_msi_data_read, vpci_msi_data_write,
+ msi_data_reg(pos, msi->address64), 2,
+ msi);
+ if ( ret )
+ {
+ xfree(msi);
+ return ret;
+ }
+
+ if ( msi->address64 )
+ {
+ ret = vpci_add_register(pdev, vpci_msi_address_upper_read,
+ vpci_msi_address_upper_write,
+ msi_upper_address_reg(pos), 4, msi);
+ if ( ret )
+ {
+ xfree(msi);
+ return ret;
+ }
+ }
+
+ if ( msi->masking )
+ {
+ ret = vpci_add_register(pdev, vpci_msi_mask_read, vpci_msi_mask_write,
+ msi_mask_bits_reg(pos, msi->address64), 4,
+ msi);
+ if ( ret )
+ {
+ xfree(msi);
+ return ret;
+ }
+ }
+
+ pdev->vpci->msi = msi;
+
+ return 0;
+}
+REGISTER_VPCI_INIT(vpci_init_msi);
+
+void vpci_dump_msi(void)
+{
+ struct domain *d;
+
+ for_each_domain ( d )
+ {
+ const struct pci_dev *pdev;
+
+ if ( !has_vpci(d) )
+ continue;
+
+ printk("vPCI MSI information for d%d\n", d->domain_id);
+
+ list_for_each_entry ( pdev, &d->arch.pdev_list, domain_list )
+ {
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ const struct vpci_msi *msi = pdev->vpci->msi;
+
+ if ( !spin_trylock(&pdev->vpci->lock) )
+ {
+ printk("Unable to get vPCI lock, skipping\n");
+ continue;
+ }
+
+ if ( msi )
+ {
+ printk("Device %04x:%02x:%02x.%u\n", seg, bus, slot, func);
+
+ printk(" Enabled: %u Supports masking: %u 64-bit addresses: %u\n",
+ msi->enabled, msi->masking, msi->address64);
+ printk(" Max vectors: %u enabled vectors: %u\n",
+ msi->max_vectors, msi->vectors);
+
+ vpci_msi_arch_print(msi);
+
+ if ( msi->masking )
+ printk(" mask=%08x\n", msi->mask);
+ }
+
+ spin_unlock(&pdev->vpci->lock);
+ process_pending_softirqs();
+ }
+ }
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h
index ba66e22e04..c47cc971d3 100644
--- a/xen/include/asm-x86/hvm/io.h
+++ b/xen/include/asm-x86/hvm/io.h
@@ -127,6 +127,11 @@ void hvm_dpci_eoi(struct domain *d, unsigned int guest_irq,
void msix_write_completion(struct vcpu *);
void msixtbl_init(struct domain *d);

+/* Arch-specific MSI data for vPCI. */
+struct vpci_arch_msi {
+ int pirq;
+};
+
enum stdvga_cache_state {
STDVGA_CACHE_UNINITIALIZED,
STDVGA_CACHE_ENABLED,
diff --git a/xen/include/asm-x86/msi.h b/xen/include/asm-x86/msi.h
index 37d37b820e..43ab5c6bc6 100644
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -48,6 +48,7 @@
#define MSI_ADDR_REDIRECTION_SHIFT 3
#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT)
#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
+#define MSI_ADDR_REDIRECTION_MASK (1 << MSI_ADDR_REDIRECTION_SHIFT)

#define MSI_ADDR_DEST_ID_SHIFT 12
#define MSI_ADDR_DEST_ID_MASK 0x00ff000
diff --git a/xen/include/xen/irq.h b/xen/include/xen/irq.h
index 0aa817e266..586b78393a 100644
--- a/xen/include/xen/irq.h
+++ b/xen/include/xen/irq.h
@@ -133,6 +133,7 @@ struct pirq {
struct arch_pirq arch;
};

+#define INVALID_PIRQ (-1)
#define pirq_info(d, p) ((struct pirq *)radix_tree_lookup(&(d)->pirq_tree, p))

/* Use this instead of pirq_info() if the structure may need allocating. */
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 4e0b67c2f1..5b582b8012 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -72,6 +72,30 @@ struct vpci {
} bars[7]; /* At most 6 BARS + 1 expansion ROM BAR. */
/* FIXME: currently there's no support for SR-IOV. */
} header;
+
+ /* MSI data. */
+ struct vpci_msi {
+ /* Arch-specific data. */
+ struct vpci_arch_msi arch;
+ /* Address. */
+ uint64_t address;
+ /* Offset of the capability in the config space. */
+ unsigned int pos;
+ /* Maximum number of vectors supported by the device. */
+ unsigned int max_vectors;
+ /* Number of vectors configured. */
+ unsigned int vectors;
+ /* Mask bitfield. */
+ uint32_t mask;
+ /* Data. */
+ uint16_t data;
+ /* Enabled? */
+ bool enabled;
+ /* Supports per-vector masking? */
+ bool masking;
+ /* 64-bit address capable? */
+ bool address64;
+ } *msi;
#endif
};

@@ -80,6 +104,17 @@ struct vpci_vcpu {
struct rangeset *mem;
bool map;
};
+
+void vpci_dump_msi(void);
+
+/* Arch-specific vPCI MSI helpers. */
+void vpci_msi_arch_mask(struct vpci_msi *msi, const struct pci_dev *pdev,
+ unsigned int entry, bool mask);
+int vpci_msi_arch_enable(struct vpci_msi *msi, const struct pci_dev *pdev,
+ unsigned int vectors);
+int vpci_msi_arch_disable(struct vpci_msi *msi, const struct pci_dev *pdev);
+void vpci_msi_arch_init(struct vpci_msi *msi);
+void vpci_msi_arch_print(const struct vpci_msi *msi);
#endif

#endif
--
2.11.0 (Apple Git-81)
Jan Beulich
2017-10-04 08:34:13 UTC
Permalink
Post by Roger Pau Monne
Add handlers for the MSI control, address, data and mask fields in
order to detect accesses to them and setup the interrupts as requested
by the guest.
Note that the pending register is not trapped, and the guest can
freely read/write to it.
I wonder how valid this can be with ...
Post by Roger Pau Monne
- Update to new lock usage.
- Change handlers to match the new type.
- s/msi_flags/msi_gflags/, remove the local variables and use the new
DOMCTL_VMSI_* defines.
- Change the MSI arch function to take a vpci_msi instead of a
vpci_arch_msi as parameter.
- Fix the calculation of the guest vector for MSI injection to take
into account the number of bits that can be modified.
- Use INVALID_PIRQ everywhere.
- Simplify exit path of vpci_msi_disable.
- Remove the conditional when setting address64 and masking fields.
- Add a process_pending_softirqs to the MSI dump loop.
- Place the prototypes for the MSI arch-specific functions in
xen/vpci.h.
- Add parentheses around the INVALID_PIRQ definition.
... this set of changes.
Post by Roger Pau Monne
+void vpci_msi_arch_mask(struct vpci_msi *msi, const struct pci_dev *pdev,
+ unsigned int entry, bool mask)
+{
+ const struct pirq *pinfo;
+ struct irq_desc *desc;
+ unsigned long flags;
+ int irq;
+
+ ASSERT(msi->arch.pirq >= 0 && entry < msi->vectors);
+ pinfo = pirq_info(pdev->domain, msi->arch.pirq + entry);
+ if ( !pinfo )
+ return;
+
+ irq = pinfo->arch.irq;
+ if ( irq >= nr_irqs || irq < 0)
Style. However, ...
Post by Roger Pau Monne
+ return;
+
+ desc = irq_to_desc(irq);
+ if ( !desc )
+ return;
+
+ spin_lock_irqsave(&desc->lock, flags);
... didn't I comment on this already suggesting to use
domain_spin_lock_irq_desc() instead of open coding it?
Post by Roger Pau Monne
+static void vpci_msi_enable(const struct pci_dev *pdev, struct vpci_msi *msi,
+ unsigned int vectors)
+{
+ int ret;
+
+ ASSERT(!msi->enabled);
+ ret = vpci_msi_arch_enable(msi, pdev, vectors);
+ if ( ret )
+ return;
+
+ /* Apply the mask bits. */
+ if ( msi->masking )
+ {
+ unsigned int i;
+ uint32_t mask = msi->mask;
+
+ for ( i = ffs(mask) - 1; mask && i < vectors; i = ffs(mask) - 1 )
+ {
+ vpci_msi_arch_mask(msi, pdev, i, true);
+ __clear_bit(i, &mask);
+ }
+ }
+
+ __msi_set_enable(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), msi->pos, 1);
This is very unlikely to be a function that arch-independent code is
permitted to call.
Post by Roger Pau Monne
+void vpci_dump_msi(void)
+{
+ struct domain *d;
const?
Post by Roger Pau Monne
+ for_each_domain ( d )
You need to rcu_read_lock(&domlist_read_lock) in order to validly use
this construct.
Post by Roger Pau Monne
+ {
+ const struct pci_dev *pdev;
+
+ if ( !has_vpci(d) )
+ continue;
+
+ printk("vPCI MSI information for d%d\n", d->domain_id);
+
+ list_for_each_entry ( pdev, &d->arch.pdev_list, domain_list )
+ {
+ uint8_t seg = pdev->seg, bus = pdev->bus;
+ uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ const struct vpci_msi *msi = pdev->vpci->msi;
+
+ if ( !spin_trylock(&pdev->vpci->lock) )
+ {
+ printk("Unable to get vPCI lock, skipping\n");
+ continue;
+ }
+
+ if ( msi )
+ {
+ printk("Device %04x:%02x:%02x.%u\n", seg, bus, slot, func);
+
+ printk(" Enabled: %u Supports masking: %u 64-bit addresses: %u\n",
+ msi->enabled, msi->masking, msi->address64);
bool wants to be printed with %d, I think.
Post by Roger Pau Monne
+ printk(" Max vectors: %u enabled vectors: %u\n",
+ msi->max_vectors, msi->vectors);
+
+ vpci_msi_arch_print(msi);
+
+ if ( msi->masking )
+ printk(" mask=%08x\n", msi->mask);
Is this really worth a separate line? Also please don't separately print
->masking as its value will be known from the presence of the field here.

Overall please try to shorten messages such that they're still
meaningful but don't cause unnecessary load on the serial line or extra
wasted space in the ring buffer.
Post by Roger Pau Monne
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -72,6 +72,30 @@ struct vpci {
} bars[7]; /* At most 6 BARS + 1 expansion ROM BAR. */
/* FIXME: currently there's no support for SR-IOV. */
} header;
+
+ /* MSI data. */
+ struct vpci_msi {
+ /* Arch-specific data. */
+ struct vpci_arch_msi arch;
+ /* Address. */
+ uint64_t address;
+ /* Offset of the capability in the config space. */
+ unsigned int pos;
+ /* Maximum number of vectors supported by the device. */
+ unsigned int max_vectors;
+ /* Number of vectors configured. */
+ unsigned int vectors;
+ /* Mask bitfield. */
+ uint32_t mask;
+ /* Data. */
+ uint16_t data;
+ /* Enabled? */
+ bool enabled;
+ /* Supports per-vector masking? */
+ bool masking;
+ /* 64-bit address capable? */
+ bool address64;
+ } *msi;
#endif
};
As there may be quite a few instances of this structure, please strive to
keep its size down. Many of the fields above have a pretty limited valid
value range and hence would benefit from using more narrow types and/or
bitfields.

Jan
Roger Pau Monné
2017-10-10 11:35:26 UTC
Permalink
Post by Jan Beulich
Post by Roger Pau Monne
+static void vpci_msi_enable(const struct pci_dev *pdev, struct vpci_msi *msi,
+ unsigned int vectors)
+{
+ int ret;
+
+ ASSERT(!msi->enabled);
+ ret = vpci_msi_arch_enable(msi, pdev, vectors);
+ if ( ret )
+ return;
+
+ /* Apply the mask bits. */
+ if ( msi->masking )
+ {
+ unsigned int i;
+ uint32_t mask = msi->mask;
+
+ for ( i = ffs(mask) - 1; mask && i < vectors; i = ffs(mask) - 1 )
+ {
+ vpci_msi_arch_mask(msi, pdev, i, true);
+ __clear_bit(i, &mask);
+ }
+ }
+
+ __msi_set_enable(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), msi->pos, 1);
This is very unlikely to be a function that arch-independent code is
permitted to call.
Right, I could remove the '__' prefix, or introduce a
vpci_msi_arch_dev_enable helper that calls this function.
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -72,6 +72,30 @@ struct vpci {
} bars[7]; /* At most 6 BARS + 1 expansion ROM BAR. */
/* FIXME: currently there's no support for SR-IOV. */
} header;
+
+ /* MSI data. */
+ struct vpci_msi {
+ /* Arch-specific data. */
+ struct vpci_arch_msi arch;
+ /* Address. */
+ uint64_t address;
+ /* Offset of the capability in the config space. */
+ unsigned int pos;
+ /* Maximum number of vectors supported by the device. */
+ unsigned int max_vectors;
+ /* Number of vectors configured. */
+ unsigned int vectors;
+ /* Mask bitfield. */
+ uint32_t mask;
+ /* Data. */
+ uint16_t data;
+ /* Enabled? */
+ bool enabled;
+ /* Supports per-vector masking? */
+ bool masking;
+ /* 64-bit address capable? */
+ bool address64;
+ } *msi;
#endif
};
As there may be quite a few instances of this structure, please strive to
keep its size down. Many of the fields above have a pretty limited valid
value range and hence would benefit from using more narrow types and/or
bitfields.
max_vectors/vectors can be uint8_t, the rest I'm not sure how to
reduce. I could turn the bools into a bitfield, but isn't a bool
already limited to 1 bit?

Thanks, Roger.
Jan Beulich
2017-10-11 10:25:38 UTC
Permalink
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -72,6 +72,30 @@ struct vpci {
} bars[7]; /* At most 6 BARS + 1 expansion ROM BAR. */
/* FIXME: currently there's no support for SR-IOV. */
} header;
+
+ /* MSI data. */
+ struct vpci_msi {
+ /* Arch-specific data. */
+ struct vpci_arch_msi arch;
+ /* Address. */
+ uint64_t address;
+ /* Offset of the capability in the config space. */
+ unsigned int pos;
+ /* Maximum number of vectors supported by the device. */
+ unsigned int max_vectors;
+ /* Number of vectors configured. */
+ unsigned int vectors;
+ /* Mask bitfield. */
+ uint32_t mask;
+ /* Data. */
+ uint16_t data;
+ /* Enabled? */
+ bool enabled;
+ /* Supports per-vector masking? */
+ bool masking;
+ /* 64-bit address capable? */
+ bool address64;
+ } *msi;
#endif
};
As there may be quite a few instances of this structure, please strive to
keep its size down. Many of the fields above have a pretty limited valid
value range and hence would benefit from using narrower types and/or
bitfields.
max_vectors/vectors can be uint8_t, the rest I'm not sure how to
reduce.
"pos" can be uint8_t too afaict.
Post by Roger Pau Monné
I could turn the bools into a bitfield, but isn't a bool
already limited to 1 bit?
Plus 7 bits of unused data. The minimum addressable unit is a
byte after all, and non-bitfield members need to allow their
address being taken.

Jan
Roger Pau Monné
2017-10-13 11:17:04 UTC
Permalink
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
+static void vpci_msi_enable(const struct pci_dev *pdev, struct vpci_msi *msi,
+ unsigned int vectors)
+{
+ int ret;
+
+ ASSERT(!msi->enabled);
+ ret = vpci_msi_arch_enable(msi, pdev, vectors);
+ if ( ret )
+ return;
+
+ /* Apply the mask bits. */
+ if ( msi->masking )
+ {
+ unsigned int i;
+ uint32_t mask = msi->mask;
+
+ for ( i = ffs(mask) - 1; mask && i < vectors; i = ffs(mask) - 1 )
+ {
+ vpci_msi_arch_mask(msi, pdev, i, true);
+ __clear_bit(i, &mask);
+ }
+ }
+
+ __msi_set_enable(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), msi->pos, 1);
This is very unlikely to be a function that arch-independent code is
permitted to call.
Right, I could remove the '__' prefix, or introduce a
vpci_msi_arch_dev_enable helper that calls this function.
So would using msi_set_enable instead be acceptable?

Thanks, Roger.
Jan Beulich
2017-10-13 12:20:59 UTC
Permalink
Post by Roger Pau Monné
Post by Roger Pau Monné
Post by Jan Beulich
Post by Roger Pau Monne
+static void vpci_msi_enable(const struct pci_dev *pdev, struct vpci_msi *msi,
+ unsigned int vectors)
+{
+ int ret;
+
+ ASSERT(!msi->enabled);
+ ret = vpci_msi_arch_enable(msi, pdev, vectors);
+ if ( ret )
+ return;
+
+ /* Apply the mask bits. */
+ if ( msi->masking )
+ {
+ unsigned int i;
+ uint32_t mask = msi->mask;
+
+ for ( i = ffs(mask) - 1; mask && i < vectors; i = ffs(mask) - 1 )
+ {
+ vpci_msi_arch_mask(msi, pdev, i, true);
+ __clear_bit(i, &mask);
+ }
+ }
+
+ __msi_set_enable(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), msi->pos, 1);
This is very unlikely to be a function that arch-independent code is
permitted to call.
Right, I could remove the '__' prefix, or introduce a
vpci_msi_arch_dev_enable helper that calls this function.
So would using msi_set_enable instead be acceptable?
Not really, no, all the more so because it's static (and should remain
so); __msi_set_enable() not being static is also just because of an
AMD IOMMU oddity. These are low level functions that higher
layers aren't supposed to call directly.

Jan
Jan Beulich
2017-10-04 08:32:38 UTC
Permalink
Post by Roger Pau Monne
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -610,11 +610,17 @@ int pci_size_mem_bar(pci_sbdf_t sbdf, unsigned int pos, bool last,
sbdf.func, pos);
uint64_t addr, size;
bool vf = flags & PCI_BAR_VF;
-
- ASSERT((bar & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_MEMORY);
+ bool rom = flags & PCI_BAR_ROM;
Ideally with this local variable and ...
Post by Roger Pau Monne
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -191,6 +191,8 @@ const char *parse_pci_seg(const char *, unsigned int *seg, unsigned int *bus,
#define _PCI_BAR_VF 0
#define PCI_BAR_VF (1u << _PCI_BAR_VF)
+#define _PCI_BAR_ROM 1
+#define PCI_BAR_ROM (1u << _PCI_BAR_ROM)
... the first of these two dropped
Reviewed-by: Jan Beulich <***@suse.com>

Jan
Jaggi, Manish
2018-01-06 20:17:26 UTC
Permalink
Hello,
________________________________________
Sent: Tuesday, September 19, 2017 8:59 PM
Subject: [Xen-devel] [PATCH v6 00/11] vpci: PCI config space emulation
Hello,
The following series contains an implementation of handlers for the PCI
configuration space inside of Xen. This allows Xen to detect accesses
to the PCI configuration space and react accordingly.
Is someone working on an Arm port of this series?
I would be happy to test it and to help develop any remaining parts.

-manish

Loading...