summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/powerpc/pci_iov_resource_on_powernv.txt301
-rw-r--r--arch/powerpc/include/asm/iommu.h3
-rw-r--r--arch/powerpc/include/asm/machdep.h5
-rw-r--r--arch/powerpc/include/asm/pci-bridge.h13
-rw-r--r--arch/powerpc/kernel/pci-common.c20
-rw-r--r--arch/powerpc/kernel/pci_dn.c129
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c772
-rw-r--r--arch/powerpc/platforms/powernv/pci.c18
-rw-r--r--arch/powerpc/platforms/powernv/pci.h9
-rw-r--r--drivers/pci/iov.c155
-rw-r--r--drivers/pci/pci.h2
-rw-r--r--drivers/pci/setup-bus.c95
-rw-r--r--include/linux/pci.h15
13 files changed, 1448 insertions, 89 deletions
diff --git a/Documentation/powerpc/pci_iov_resource_on_powernv.txt b/Documentation/powerpc/pci_iov_resource_on_powernv.txt
new file mode 100644
index 000000000000..b55c5cd83f8d
--- /dev/null
+++ b/Documentation/powerpc/pci_iov_resource_on_powernv.txt
@@ -0,0 +1,301 @@
+Wei Yang <weiyang@linux.vnet.ibm.com>
+Benjamin Herrenschmidt <benh@au1.ibm.com>
+Bjorn Helgaas <bhelgaas@google.com>
+26 Aug 2014
+
+This document describes the requirement from hardware for PCI MMIO resource
+sizing and assignment on PowerKVM and how generic PCI code handles this
+requirement. The first two sections describe the concepts of Partitionable
+Endpoints and the implementation on P8 (IODA2). The next two sections talks
+about considerations on enabling SRIOV on IODA2.
+
+1. Introduction to Partitionable Endpoints
+
+A Partitionable Endpoint (PE) is a way to group the various resources
+associated with a device or a set of devices to provide isolation between
+partitions (i.e., filtering of DMA, MSIs etc.) and to provide a mechanism
+to freeze a device that is causing errors in order to limit the possibility
+of propagation of bad data.
+
+There is thus, in HW, a table of PE states that contains a pair of "frozen"
+state bits (one for MMIO and one for DMA, they get set together but can be
+cleared independently) for each PE.
+
+When a PE is frozen, all stores in any direction are dropped and all loads
+return all 1's value. MSIs are also blocked. There's a bit more state that
+captures things like the details of the error that caused the freeze etc., but
+that's not critical.
+
+The interesting part is how the various PCIe transactions (MMIO, DMA, ...)
+are matched to their corresponding PEs.
+
+The following section provides a rough description of what we have on P8
+(IODA2). Keep in mind that this is all per PHB (PCI host bridge). Each PHB
+is a completely separate HW entity that replicates the entire logic, so has
+its own set of PEs, etc.
+
+2. Implementation of Partitionable Endpoints on P8 (IODA2)
+
+P8 supports up to 256 Partitionable Endpoints per PHB.
+
+ * Inbound
+
+ For DMA, MSIs and inbound PCIe error messages, we have a table (in
+ memory but accessed in HW by the chip) that provides a direct
+ correspondence between a PCIe RID (bus/dev/fn) with a PE number.
+ We call this the RTT.
+
+ - For DMA we then provide an entire address space for each PE that can
+ contain two "windows", depending on the value of PCI address bit 59.
+ Each window can be configured to be remapped via a "TCE table" (IOMMU
+ translation table), which has various configurable characteristics
+ not described here.
+
+ - For MSIs, we have two windows in the address space (one at the top of
+ the 32-bit space and one much higher) which, via a combination of the
+ address and MSI value, will result in one of the 2048 interrupts per
+ bridge being triggered. There's a PE# in the interrupt controller
+ descriptor table as well which is compared with the PE# obtained from
+ the RTT to "authorize" the device to emit that specific interrupt.
+
+ - Error messages just use the RTT.
+
+ * Outbound. That's where the tricky part is.
+
+ Like other PCI host bridges, the Power8 IODA2 PHB supports "windows"
+ from the CPU address space to the PCI address space. There is one M32
+ window and sixteen M64 windows. They have different characteristics.
+ First what they have in common: they forward a configurable portion of
+ the CPU address space to the PCIe bus and must be naturally aligned
+ power of two in size. The rest is different:
+
+ - The M32 window:
+
+ * Is limited to 4GB in size.
+
+ * Drops the top bits of the address (above the size) and replaces
+ them with a configurable value. This is typically used to generate
+ 32-bit PCIe accesses. We configure that window at boot from FW and
+ don't touch it from Linux; it's usually set to forward a 2GB
+ portion of address space from the CPU to PCIe
+ 0x8000_0000..0xffff_ffff. (Note: The top 64KB are actually
+ reserved for MSIs but this is not a problem at this point; we just
+ need to ensure Linux doesn't assign anything there, the M32 logic
+ ignores that however and will forward in that space if we try).
+
+ * It is divided into 256 segments of equal size. A table in the chip
+ maps each segment to a PE#. That allows portions of the MMIO space
+ to be assigned to PEs on a segment granularity. For a 2GB window,
+ the segment granularity is 2GB/256 = 8MB.
+
+ Now, this is the "main" window we use in Linux today (excluding
+ SR-IOV). We basically use the trick of forcing the bridge MMIO windows
+ onto a segment alignment/granularity so that the space behind a bridge
+ can be assigned to a PE.
+
+ Ideally we would like to be able to have individual functions in PEs
+ but that would mean using a completely different address allocation
+ scheme where individual function BARs can be "grouped" to fit in one or
+ more segments.
+
+ - The M64 windows:
+
+ * Must be at least 256MB in size.
+
+ * Do not translate addresses (the address on PCIe is the same as the
+ address on the PowerBus). There is a way to also set the top 14
+ bits which are not conveyed by PowerBus but we don't use this.
+
+ * Can be configured to be segmented. When not segmented, we can
+ specify the PE# for the entire window. When segmented, a window
+ has 256 segments; however, there is no table for mapping a segment
+ to a PE#. The segment number *is* the PE#.
+
+ * Support overlaps. If an address is covered by multiple windows,
+ there's a defined ordering for which window applies.
+
+ We have code (fairly new compared to the M32 stuff) that exploits that
+ for large BARs in 64-bit space:
+
+ We configure an M64 window to cover the entire region of address space
+ that has been assigned by FW for the PHB (about 64GB, ignore the space
+ for the M32, it comes out of a different "reserve"). We configure it
+ as segmented.
+
+ Then we do the same thing as with M32, using the bridge alignment
+ trick, to match to those giant segments.
+
+ Since we cannot remap, we have two additional constraints:
+
+ - We do the PE# allocation *after* the 64-bit space has been assigned
+ because the addresses we use directly determine the PE#. We then
+ update the M32 PE# for the devices that use both 32-bit and 64-bit
+ spaces or assign the remaining PE# to 32-bit only devices.
+
+ - We cannot "group" segments in HW, so if a device ends up using more
+ than one segment, we end up with more than one PE#. There is a HW
+ mechanism to make the freeze state cascade to "companion" PEs but
+ that only works for PCIe error messages (typically used so that if
+ you freeze a switch, it freezes all its children). So we do it in
+ SW. We lose a bit of effectiveness of EEH in that case, but that's
+ the best we found. So when any of the PEs freezes, we freeze the
+ other ones for that "domain". We thus introduce the concept of
+ "master PE" which is the one used for DMA, MSIs, etc., and "secondary
+ PEs" that are used for the remaining M64 segments.
+
+ We would like to investigate using additional M64 windows in "single
+ PE" mode to overlay over specific BARs to work around some of that, for
+ example for devices with very large BARs, e.g., GPUs. It would make
+ sense, but we haven't done it yet.
+
+3. Considerations for SR-IOV on PowerKVM
+
+ * SR-IOV Background
+
+ The PCIe SR-IOV feature allows a single Physical Function (PF) to
+ support several Virtual Functions (VFs). Registers in the PF's SR-IOV
+ Capability control the number of VFs and whether they are enabled.
+
+ When VFs are enabled, they appear in Configuration Space like normal
+ PCI devices, but the BARs in VF config space headers are unusual. For
+ a non-VF device, software uses BARs in the config space header to
+ discover the BAR sizes and assign addresses for them. For VF devices,
+ software uses VF BAR registers in the *PF* SR-IOV Capability to
+ discover sizes and assign addresses. The BARs in the VF's config space
+ header are read-only zeros.
+
+ When a VF BAR in the PF SR-IOV Capability is programmed, it sets the
+ base address for all the corresponding VF(n) BARs. For example, if the
+ PF SR-IOV Capability is programmed to enable eight VFs, and it has a
+ 1MB VF BAR0, the address in that VF BAR sets the base of an 8MB region.
+ This region is divided into eight contiguous 1MB regions, each of which
+ is a BAR0 for one of the VFs. Note that even though the VF BAR
+ describes an 8MB region, the alignment requirement is for a single VF,
+ i.e., 1MB in this example.
+
+ There are several strategies for isolating VFs in PEs:
+
+ - M32 window: There's one M32 window, and it is split into 256
+ equally-sized segments. The finest granularity possible is a 256MB
+ window with 1MB segments. VF BARs that are 1MB or larger could be
+ mapped to separate PEs in this window. Each segment can be
+ individually mapped to a PE via the lookup table, so this is quite
+ flexible, but it works best when all the VF BARs are the same size. If
+ they are different sizes, the entire window has to be small enough that
+ the segment size matches the smallest VF BAR, which means larger VF
+ BARs span several segments.
+
+ - Non-segmented M64 window: A non-segmented M64 window is mapped entirely
+ to a single PE, so it could only isolate one VF.
+
+ - Single segmented M64 windows: A segmented M64 window could be used just
+ like the M32 window, but the segments can't be individually mapped to
+ PEs (the segment number is the PE#), so there isn't as much
+ flexibility. A VF with multiple BARs would have to be in a "domain" of
+ multiple PEs, which is not as well isolated as a single PE.
+
+ - Multiple segmented M64 windows: As usual, each window is split into 256
+ equally-sized segments, and the segment number is the PE#. But if we
+ use several M64 windows, they can be set to different base addresses
+ and different segment sizes. If we have VFs that each have a 1MB BAR
+ and a 32MB BAR, we could use one M64 window to assign 1MB segments and
+ another M64 window to assign 32MB segments.
+
+ Finally, the plan to use M64 windows for SR-IOV, which will be described
+ more in the next two sections. For a given VF BAR, we need to
+ effectively reserve the entire 256 segments (256 * VF BAR size) and
+ position the VF BAR to start at the beginning of a free range of
+ segments/PEs inside that M64 window.
+
+ The goal is of course to be able to give a separate PE for each VF.
+
+ The IODA2 platform has 16 M64 windows, which are used to map MMIO
+ range to PE#. Each M64 window defines one MMIO range and this range is
+ divided into 256 segments, with each segment corresponding to one PE.
+
+ We decide to leverage this M64 window to map VFs to individual PEs, since
+ SR-IOV VF BARs are all the same size.
+
+ But doing so introduces another problem: total_VFs is usually smaller
+ than the number of M64 window segments, so if we map one VF BAR directly
+ to one M64 window, some part of the M64 window will map to another
+ device's MMIO range.
+
+ IODA supports 256 PEs, so segmented windows contain 256 segments, so if
+ total_VFs is less than 256, we have the situation in Figure 1.0, where
+ segments [total_VFs, 255] of the M64 window may map to some MMIO range on
+ other devices:
+
+ 0 1 total_VFs - 1
+ +------+------+- -+------+------+
+ | | | ... | | |
+ +------+------+- -+------+------+
+
+ VF(n) BAR space
+
+ 0 1 total_VFs - 1 255
+ +------+------+- -+------+------+- -+------+------+
+ | | | ... | | | ... | | |
+ +------+------+- -+------+------+- -+------+------+
+
+ M64 window
+
+ Figure 1.0 Direct map VF(n) BAR space
+
+ Our current solution is to allocate 256 segments even if the VF(n) BAR
+ space doesn't need that much, as shown in Figure 1.1:
+
+ 0 1 total_VFs - 1 255
+ +------+------+- -+------+------+- -+------+------+
+ | | | ... | | | ... | | |
+ +------+------+- -+------+------+- -+------+------+
+
+ VF(n) BAR space + extra
+
+ 0 1 total_VFs - 1 255
+ +------+------+- -+------+------+- -+------+------+
+ | | | ... | | | ... | | |
+ +------+------+- -+------+------+- -+------+------+
+
+ M64 window
+
+ Figure 1.1 Map VF(n) BAR space + extra
+
+ Allocating the extra space ensures that the entire M64 window will be
+ assigned to this one SR-IOV device and none of the space will be
+ available for other devices. Note that this only expands the space
+ reserved in software; there are still only total_VFs VFs, and they only
+ respond to segments [0, total_VFs - 1]. There's nothing in hardware that
+ responds to segments [total_VFs, 255].
+
+4. Implications for the Generic PCI Code
+
+The PCIe SR-IOV spec requires that the base of the VF(n) BAR space be
+aligned to the size of an individual VF BAR.
+
+In IODA2, the MMIO address determines the PE#. If the address is in an M32
+window, we can set the PE# by updating the table that translates segments
+to PE#s. Similarly, if the address is in an unsegmented M64 window, we can
+set the PE# for the window. But if it's in a segmented M64 window, the
+segment number is the PE#.
+
+Therefore, the only way to control the PE# for a VF is to change the base
+of the VF(n) BAR space in the VF BAR. If the PCI core allocates the exact
+amount of space required for the VF(n) BAR space, the VF BAR value is fixed
+and cannot be changed.
+
+On the other hand, if the PCI core allocates additional space, the VF BAR
+value can be changed as long as the entire VF(n) BAR space remains inside
+the space allocated by the core.
+
+Ideally the segment size will be the same as an individual VF BAR size.
+Then each VF will be in its own PE. The VF BARs (and therefore the PE#s)
+are contiguous. If VF0 is in PE(x), then VF(n) is in PE(x+n). If we
+allocate 256 segments, there are (256 - numVFs) choices for the PE# of VF0.
+
+If the segment size is smaller than the VF BAR size, it will take several
+segments to cover a VF BAR, and a VF will be in several PEs. This is
+possible, but the isolation isn't as good, and it reduces the number of PE#
+choices because instead of consuming only numVFs segments, the VF(n) BAR
+space will consume (numVFs * n) segments. That means there aren't as many
+available segments for adjusting base of the VF(n) BAR space.
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 0be7d9e13189..1e27d6338565 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -79,6 +79,9 @@ struct iommu_table {
struct iommu_group *it_group;
#endif
void (*set_bypass)(struct iommu_table *tbl, bool enable);
+#ifdef CONFIG_PPC_POWERNV
+ void *data;
+#endif
};
/* Pure 2^n version of get_order */
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 5c19ac527a8e..ef8899432ae7 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -236,6 +236,11 @@ struct machdep_calls {
/* Called after scan and before resource survey */
void (*pcibios_fixup_phb)(struct pci_controller *hose);
+#ifdef CONFIG_PCI_IOV
+ void (*pcibios_fixup_sriov)(struct pci_dev *pdev);
+ resource_size_t (*pcibios_iov_resource_alignment)(struct pci_dev *, int resno);
+#endif /* CONFIG_PCI_IOV */
+
/* Called to shutdown machine specific hardware not already controlled
* by other drivers.
*/
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 7d972bc85638..1811c44bf34b 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -175,6 +175,7 @@ struct iommu_table;
struct pci_dn {
int flags;
+#define PCI_DN_FLAG_IOV_VF 0x01
int busno; /* pci bus number */
int devfn; /* pci device and function number */
@@ -189,13 +190,21 @@ struct pci_dn {
int pci_ext_config_space; /* for pci devices */
- struct pci_dev *pcidev; /* back-pointer to the pci device */
#ifdef CONFIG_EEH
struct eeh_dev *edev; /* eeh device */
#endif
#define IODA_INVALID_PE (-1)
#ifdef CONFIG_PPC_POWERNV
int pe_number;
+#ifdef CONFIG_PCI_IOV
+ u16 vfs_expanded; /* number of VFs IOV BAR expanded */
+ u16 num_vfs; /* number of VFs enabled*/
+ int offset; /* PE# for the first VF PE */
+#define M64_PER_IOV 4
+ int m64_per_iov;
+#define IODA_INVALID_M64 (-1)
+ int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
+#endif /* CONFIG_PCI_IOV */
#endif
struct list_head child_list;
struct list_head list;
@@ -207,6 +216,8 @@ struct pci_dn {
extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
int devfn);
extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev);
+extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev);
+extern void remove_dev_pci_data(struct pci_dev *pdev);
extern void *update_dn_pci_info(struct device_node *dn, void *data);
static inline int pci_device_from_OF_node(struct device_node *np,
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 9052b4fbc41f..0d054068a21d 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -134,6 +134,16 @@ void pcibios_reset_secondary_bus(struct pci_dev *dev)
pci_reset_secondary_bus(dev);
}
+#ifdef CONFIG_PCI_IOV
+resource_size_t pcibios_iov_resource_alignment(struct pci_dev *pdev, int resno)
+{
+ if (ppc_md.pcibios_iov_resource_alignment)
+ return ppc_md.pcibios_iov_resource_alignment(pdev, resno);
+
+ return pci_iov_resource_size(pdev, resno);
+}
+#endif /* CONFIG_PCI_IOV */
+
static resource_size_t pcibios_io_size(const struct pci_controller *hose)
{
#ifdef CONFIG_PPC64
@@ -792,6 +802,10 @@ static void pcibios_fixup_resources(struct pci_dev *dev)
pci_name(dev));
return;
}
+
+ if (dev->is_virtfn)
+ return;
+
for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
struct resource *res = dev->resource + i;
struct pci_bus_region reg;
@@ -995,6 +1009,12 @@ int pcibios_add_device(struct pci_dev *dev)
*/
if (dev->bus->is_added)
pcibios_setup_device(dev);
+
+#ifdef CONFIG_PCI_IOV
+ if (ppc_md.pcibios_fixup_sriov)
+ ppc_md.pcibios_fixup_sriov(dev);
+#endif /* CONFIG_PCI_IOV */
+
return 0;
}
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 65b98367005c..b3b4df91b792 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -136,6 +136,135 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
return NULL;
}
+#ifdef CONFIG_PCI_IOV
+static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
+ struct pci_dev *pdev,
+ int busno, int devfn)
+{
+ struct pci_dn *pdn;
+
+ /* Except PHB, we always have the parent */
+ if (!parent)
+ return NULL;
+
+ pdn = kzalloc(sizeof(*pdn), GFP_KERNEL);
+ if (!pdn) {
+ dev_warn(&pdev->dev, "%s: Out of memory!\n", __func__);
+ return NULL;
+ }
+
+ pdn->phb = parent->phb;
+ pdn->parent = parent;
+ pdn->busno = busno;
+ pdn->devfn = devfn;
+#ifdef CONFIG_PPC_POWERNV
+ pdn->pe_number = IODA_INVALID_PE;
+#endif
+ INIT_LIST_HEAD(&pdn->child_list);
+ INIT_LIST_HEAD(&pdn->list);
+ list_add_tail(&pdn->list, &parent->child_list);
+
+ /*
+ * If we already have PCI device instance, lets
+ * bind them.
+ */
+ if (pdev)
+ pdev->dev.archdata.pci_data = pdn;
+
+ return pdn;
+}
+#endif
+
+struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
+{
+#ifdef CONFIG_PCI_IOV
+ struct pci_dn *parent, *pdn;
+ int i;
+
+ /* Only support IOV for now */
+ if (!pdev->is_physfn)
+ return pci_get_pdn(pdev);
+
+ /* Check if VFs have been populated */
+ pdn = pci_get_pdn(pdev);
+ if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
+ return NULL;
+
+ pdn->flags |= PCI_DN_FLAG_IOV_VF;
+ parent = pci_bus_to_pdn(pdev->bus);
+ if (!parent)
+ return NULL;
+
+ for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
+ pdn = add_one_dev_pci_data(parent, NULL,
+ pci_iov_virtfn_bus(pdev, i),
+ pci_iov_virtfn_devfn(pdev, i));
+ if (!pdn) {
+ dev_warn(&pdev->dev, "%s: Cannot create firmware data for VF#%d\n",
+ __func__, i);
+ return NULL;
+ }
+ }
+#endif /* CONFIG_PCI_IOV */
+
+ return pci_get_pdn(pdev);
+}
+
+void remove_dev_pci_data(struct pci_dev *pdev)
+{
+#ifdef CONFIG_PCI_IOV
+ struct pci_dn *parent;
+ struct pci_dn *pdn, *tmp;
+ int i;
+
+ /*
+ * VF and VF PE are created/released dynamically, so we need to
+ * bind/unbind them. Otherwise the VF and VF PE would be mismatched
+ * when re-enabling SR-IOV.
+ */
+ if (pdev->is_virtfn) {
+ pdn = pci_get_pdn(pdev);
+#ifdef CONFIG_PPC_POWERNV
+ pdn->pe_number = IODA_INVALID_PE;
+#endif
+ return;
+ }
+
+ /* Only support IOV PF for now */
+ if (!pdev->is_physfn)
+ return;
+
+ /* Check if VFs have been populated */
+ pdn = pci_get_pdn(pdev);
+ if (!pdn || !(pdn->flags & PCI_DN_FLAG_IOV_VF))
+ return;
+
+ pdn->flags &= ~PCI_DN_FLAG_IOV_VF;
+ parent = pci_bus_to_pdn(pdev->bus);
+ if (!parent)
+ return;
+
+ /*
+ * We might introduce flag to pci_dn in future
+ * so that we can release VF's firmware data in
+ * a batch mode.
+ */
+ for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
+ list_for_each_entry_safe(pdn, tmp,
+ &parent->child_list, list) {
+ if (pdn->busno != pci_iov_virtfn_bus(pdev, i) ||
+ pdn->devfn != pci_iov_virtfn_devfn(pdev, i))
+ continue;
+
+ if (!list_empty(&pdn->list))
+ list_del(&pdn->list);
+
+ kfree(pdn);
+ }
+ }
+#endif /* CONFIG_PCI_IOV */
+}
+
/*
* Traverse_func that inits the PCI fields of the device node.
* NOTE: this *must* be done before read/write config to the device.
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 5e917753c672..920c252d1f49 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -44,6 +44,9 @@
#include "powernv.h"
#include "pci.h"
+/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
+#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8)
+
static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...)
{
@@ -56,11 +59,18 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
vaf.fmt = fmt;
vaf.va = &args;
- if (pe->pdev)
+ if (pe->flags & PNV_IODA_PE_DEV)
strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
- else
+ else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
sprintf(pfix, "%04x:%02x ",
pci_domain_nr(pe->pbus), pe->pbus->number);
+#ifdef CONFIG_PCI_IOV
+ else if (pe->flags & PNV_IODA_PE_VF)
+ sprintf(pfix, "%04x:%02x:%2x.%d",
+ pci_domain_nr(pe->parent_dev->bus),
+ (pe->rid & 0xff00) >> 8,
+ PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
+#endif /* CONFIG_PCI_IOV*/
printk("%spci %s: [PE# %.3d] %pV",
level, pfix, pe->pe_number, &vaf);
@@ -591,7 +601,7 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
bool is_add)
{
struct pnv_ioda_pe *slave;
- struct pci_dev *pdev;
+ struct pci_dev *pdev = NULL;
int ret;
/*
@@ -630,8 +640,12 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
pdev = pe->pbus->self;
- else
+ else if (pe->flags & PNV_IODA_PE_DEV)
pdev = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+ else if (pe->flags & PNV_IODA_PE_VF)
+ pdev = pe->parent_dev->bus->self;
+#endif /* CONFIG_PCI_IOV */
while (pdev) {
struct pci_dn *pdn = pci_get_pdn(pdev);
struct pnv_ioda_pe *parent;
@@ -649,6 +663,87 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
return 0;
}
+#ifdef CONFIG_PCI_IOV
+static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+{
+ struct pci_dev *parent;
+ uint8_t bcomp, dcomp, fcomp;
+ int64_t rc;
+ long rid_end, rid;
+
+ /* Currently, we just deconfigure VF PE. Bus PE will always there.*/
+ if (pe->pbus) {
+ int count;
+
+ dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+ fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+ parent = pe->pbus->self;
+ if (pe->flags & PNV_IODA_PE_BUS_ALL)
+ count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+ else
+ count = 1;
+
+ switch(count) {
+ case 1: bcomp = OpalPciBusAll; break;
+ case 2: bcomp = OpalPciBus7Bits; break;
+ case 4: bcomp = OpalPciBus6Bits; break;
+ case 8: bcomp = OpalPciBus5Bits; break;
+ case 16: bcomp = OpalPciBus4Bits; break;
+ case 32: bcomp = OpalPciBus3Bits; break;
+ default:
+ dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
+ count);
+ /* Do an exact match only */
+ bcomp = OpalPciBusAll;
+ }
+ rid_end = pe->rid + (count << 8);
+ } else {
+ if (pe->flags & PNV_IODA_PE_VF)
+ parent = pe->parent_dev;
+ else
+ parent = pe->pdev->bus->self;
+ bcomp = OpalPciBusAll;
+ dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+ fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+ rid_end = pe->rid + 1;
+ }
+
+ /* Clear the reverse map */
+ for (rid = pe->rid; rid < rid_end; rid++)
+ phb->ioda.pe_rmap[rid] = 0;
+
+ /* Release from all parents PELT-V */
+ while (parent) {
+ struct pci_dn *pdn = pci_get_pdn(parent);
+ if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+ rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+ pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+ /* XXX What to do in case of error ? */
+ }
+ parent = parent->bus->self;
+ }
+
+ opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+ /* Disassociate PE in PELT */
+ rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
+ pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+ if (rc)
+ pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
+ rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+ bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
+ if (rc)
+ pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+
+ pe->pbus = NULL;
+ pe->pdev = NULL;
+ pe->parent_dev = NULL;
+
+ return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
struct pci_dev *parent;
@@ -675,15 +770,19 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
case 16: bcomp = OpalPciBus4Bits; break;
case 32: bcomp = OpalPciBus3Bits; break;
default:
- pr_err("%s: Number of subordinate busses %d"
- " unsupported\n",
- pci_name(pe->pbus->self), count);
+ dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
+ count);
/* Do an exact match only */
bcomp = OpalPciBusAll;
}
rid_end = pe->rid + (count << 8);
} else {
- parent = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+ if (pe->flags & PNV_IODA_PE_VF)
+ parent = pe->parent_dev;
+ else
+#endif /* CONFIG_PCI_IOV */
+ parent = pe->pdev->bus->self;
bcomp = OpalPciBusAll;
dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
@@ -774,6 +873,78 @@ static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
return 10;
}
+#ifdef CONFIG_PCI_IOV
+static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
+{
+ struct pci_dn *pdn = pci_get_pdn(dev);
+ int i;
+ struct resource *res, res2;
+ resource_size_t size;
+ u16 num_vfs;
+
+ if (!dev->is_physfn)
+ return -EINVAL;
+
+ /*
+ * "offset" is in VFs. The M64 windows are sized so that when they
+ * are segmented, each segment is the same size as the IOV BAR.
+ * Each segment is in a separate PE, and the high order bits of the
+ * address are the PE number. Therefore, each VF's BAR is in a
+ * separate PE, and changing the IOV BAR start address changes the
+ * range of PEs the VFs are in.
+ */
+ num_vfs = pdn->num_vfs;
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &dev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || !res->parent)
+ continue;
+
+ if (!pnv_pci_is_mem_pref_64(res->flags))
+ continue;
+
+ /*
+ * The actual IOV BAR range is determined by the start address
+ * and the actual size for num_vfs VFs BAR. This check is to
+ * make sure that after shifting, the range will not overlap
+ * with another device.
+ */
+ size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+ res2.flags = res->flags;
+ res2.start = res->start + (size * offset);
+ res2.end = res2.start + (size * num_vfs) - 1;
+
+ if (res2.end > res->end) {
+ dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
+ i, &res2, res, num_vfs, offset);
+ return -EBUSY;
+ }
+ }
+
+ /*
+ * After doing so, there would be a "hole" in the /proc/iomem when
+ * offset is a positive value. It looks like the device return some
+ * mmio back to the system, which actually no one could use it.
+ */
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &dev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || !res->parent)
+ continue;
+
+ if (!pnv_pci_is_mem_pref_64(res->flags))
+ continue;
+
+ size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+ res2 = *res;
+ res->start += size * offset;
+
+ dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
+ i, &res2, res, num_vfs, offset);
+ pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+ }
+ return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
#if 0
static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
{
@@ -857,7 +1028,6 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
pci_name(dev));
continue;
}
- pdn->pcidev = dev;
pdn->pe_number = pe->pe_number;
pe->dma_weight += pnv_ioda_dma_weight(dev);
if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
@@ -916,6 +1086,10 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
return;
}
+ pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
+ GFP_KERNEL, hose->node);
+ pe->tce32_table->data = pe;
+
/* Associate it with all child devices */
pnv_ioda_setup_same_PE(bus, pe);
@@ -974,6 +1148,441 @@ static void pnv_pci_ioda_setup_PEs(void)
}
}
+#ifdef CONFIG_PCI_IOV
+static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pci_dn *pdn;
+ int i, j;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+ for (j = 0; j < M64_PER_IOV; j++) {
+ if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
+ continue;
+ opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
+ clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
+ pdn->m64_wins[i][j] = IODA_INVALID_M64;
+ }
+
+ return 0;
+}
+
+static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pci_dn *pdn;
+ unsigned int win;
+ struct resource *res;
+ int i, j;
+ int64_t rc;
+ int total_vfs;
+ resource_size_t size, start;
+ int pe_num;
+ int vf_groups;
+ int vf_per_group;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+ total_vfs = pci_sriov_get_totalvfs(pdev);
+
+ /* Initialize the m64_wins to IODA_INVALID_M64 */
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+ for (j = 0; j < M64_PER_IOV; j++)
+ pdn->m64_wins[i][j] = IODA_INVALID_M64;
+
+ if (pdn->m64_per_iov == M64_PER_IOV) {
+ vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
+ vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
+ roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
+ } else {
+ vf_groups = 1;
+ vf_per_group = 1;
+ }
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || !res->parent)
+ continue;
+
+ if (!pnv_pci_is_mem_pref_64(res->flags))
+ continue;
+
+ for (j = 0; j < vf_groups; j++) {
+ do {
+ win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
+ phb->ioda.m64_bar_idx + 1, 0);
+
+ if (win >= phb->ioda.m64_bar_idx + 1)
+ goto m64_failed;
+ } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+
+ pdn->m64_wins[i][j] = win;
+
+ if (pdn->m64_per_iov == M64_PER_IOV) {
+ size = pci_iov_resource_size(pdev,
+ PCI_IOV_RESOURCES + i);
+ size = size * vf_per_group;
+ start = res->start + size * j;
+ } else {
+ size = resource_size(res);
+ start = res->start;
+ }
+
+ /* Map the M64 here */
+ if (pdn->m64_per_iov == M64_PER_IOV) {
+ pe_num = pdn->offset + j;
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe_num, OPAL_M64_WINDOW_TYPE,
+ pdn->m64_wins[i][j], 0);
+ }
+
+ rc = opal_pci_set_phb_mem_window(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ pdn->m64_wins[i][j],
+ start,
+ 0, /* unused */
+ size);
+
+
+ if (rc != OPAL_SUCCESS) {
+ dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
+ win, rc);
+ goto m64_failed;
+ }
+
+ if (pdn->m64_per_iov == M64_PER_IOV)
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
+ else
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
+
+ if (rc != OPAL_SUCCESS) {
+ dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
+ win, rc);
+ goto m64_failed;
+ }
+ }
+ }
+ return 0;
+
+m64_failed:
+ pnv_pci_vf_release_m64(pdev);
+ return -EBUSY;
+}
+
+static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct iommu_table *tbl;
+ unsigned long addr;
+ int64_t rc;
+
+ bus = dev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ tbl = pe->tce32_table;
+ addr = tbl->it_base;
+
+ opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+ pe->pe_number << 1, 1, __pa(addr),
+ 0, 0x1000);
+
+ rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+ pe->pe_number,
+ (pe->pe_number << 1) + 1,
+ pe->tce_bypass_base,
+ 0);
+ if (rc)
+ pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
+
+ iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
+ free_pages(addr, get_order(TCE32_TABLE_SIZE));
+ pe->tce32_table = NULL;
+}
+
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe, *pe_n;
+ struct pci_dn *pdn;
+ u16 vf_index;
+ int64_t rc;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+
+ if (!pdev->is_physfn)
+ return;
+
+ if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
+ int vf_group;
+ int vf_per_group;
+ int vf_index1;
+
+ vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
+
+ for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
+ for (vf_index = vf_group * vf_per_group;
+ vf_index < (vf_group + 1) * vf_per_group &&
+ vf_index < num_vfs;
+ vf_index++)
+ for (vf_index1 = vf_group * vf_per_group;
+ vf_index1 < (vf_group + 1) * vf_per_group &&
+ vf_index1 < num_vfs;
+ vf_index1++){
+
+ rc = opal_pci_set_peltv(phb->opal_id,
+ pdn->offset + vf_index,
+ pdn->offset + vf_index1,
+ OPAL_REMOVE_PE_FROM_DOMAIN);
+
+ if (rc)
+ dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
+ __func__,
+ pdn->offset + vf_index1, rc);
+ }
+ }
+
+ list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
+ if (pe->parent_dev != pdev)
+ continue;
+
+ pnv_pci_ioda2_release_dma_pe(pdev, pe);
+
+ /* Remove from list */
+ mutex_lock(&phb->ioda.pe_list_mutex);
+ list_del(&pe->list);
+ mutex_unlock(&phb->ioda.pe_list_mutex);
+
+ pnv_ioda_deconfigure_pe(phb, pe);
+
+ pnv_ioda_free_pe(phb, pe->pe_number);
+ }
+}
+
+void pnv_pci_sriov_disable(struct pci_dev *pdev)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pci_dn *pdn;
+ struct pci_sriov *iov;
+ u16 num_vfs;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+ iov = pdev->sriov;
+ num_vfs = pdn->num_vfs;
+
+ /* Release VF PEs */
+ pnv_ioda_release_vf_PE(pdev, num_vfs);
+
+ if (phb->type == PNV_PHB_IODA2) {
+ if (pdn->m64_per_iov == 1)
+ pnv_pci_vf_resource_shift(pdev, -pdn->offset);
+
+ /* Release M64 windows */
+ pnv_pci_vf_release_m64(pdev);
+
+ /* Release PE numbers */
+ bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
+ pdn->offset = 0;
+ }
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe);
+static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe;
+ int pe_num;
+ u16 vf_index;
+ struct pci_dn *pdn;
+ int64_t rc;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+
+ if (!pdev->is_physfn)
+ return;
+
+ /* Reserve PE for each VF */
+ for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+ pe_num = pdn->offset + vf_index;
+
+ pe = &phb->ioda.pe_array[pe_num];
+ pe->pe_number = pe_num;
+ pe->phb = phb;
+ pe->flags = PNV_IODA_PE_VF;
+ pe->pbus = NULL;
+ pe->parent_dev = pdev;
+ pe->tce32_seg = -1;
+ pe->mve_number = -1;
+ pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
+ pci_iov_virtfn_devfn(pdev, vf_index);
+
+ pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n",
+ hose->global_number, pdev->bus->number,
+ PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
+ PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
+
+ if (pnv_ioda_configure_pe(phb, pe)) {
+ /* XXX What do we do here ? */
+ if (pe_num)
+ pnv_ioda_free_pe(phb, pe_num);
+ pe->pdev = NULL;
+ continue;
+ }
+
+ pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
+ GFP_KERNEL, hose->node);
+ pe->tce32_table->data = pe;
+
+ /* Put PE to the list */
+ mutex_lock(&phb->ioda.pe_list_mutex);
+ list_add_tail(&pe->list, &phb->ioda.pe_list);
+ mutex_unlock(&phb->ioda.pe_list_mutex);
+
+ pnv_pci_ioda2_setup_dma_pe(phb, pe);
+ }
+
+ if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
+ int vf_group;
+ int vf_per_group;
+ int vf_index1;
+
+ vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
+
+ for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
+ for (vf_index = vf_group * vf_per_group;
+ vf_index < (vf_group + 1) * vf_per_group &&
+ vf_index < num_vfs;
+ vf_index++) {
+ for (vf_index1 = vf_group * vf_per_group;
+ vf_index1 < (vf_group + 1) * vf_per_group &&
+ vf_index1 < num_vfs;
+ vf_index1++) {
+
+ rc = opal_pci_set_peltv(phb->opal_id,
+ pdn->offset + vf_index,
+ pdn->offset + vf_index1,
+ OPAL_ADD_PE_TO_DOMAIN);
+
+ if (rc)
+ dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
+ __func__,
+ pdn->offset + vf_index1, rc);
+ }
+ }
+ }
+ }
+}
+
+int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pci_dn *pdn;
+ int ret;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+
+ if (phb->type == PNV_PHB_IODA2) {
+ /* Calculate available PE for required VFs */
+ mutex_lock(&phb->ioda.pe_alloc_mutex);
+ pdn->offset = bitmap_find_next_zero_area(
+ phb->ioda.pe_alloc, phb->ioda.total_pe,
+ 0, num_vfs, 0);
+ if (pdn->offset >= phb->ioda.total_pe) {
+ mutex_unlock(&phb->ioda.pe_alloc_mutex);
+ dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
+ pdn->offset = 0;
+ return -EBUSY;
+ }
+ bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
+ pdn->num_vfs = num_vfs;
+ mutex_unlock(&phb->ioda.pe_alloc_mutex);
+
+ /* Assign M64 window accordingly */
+ ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
+ if (ret) {
+ dev_info(&pdev->dev, "Not enough M64 window resources\n");
+ goto m64_failed;
+ }
+
+ /*
+ * When using one M64 BAR to map one IOV BAR, we need to shift
+ * the IOV BAR according to the PE# allocated to the VFs.
+ * Otherwise, the PE# for the VF will conflict with others.
+ */
+ if (pdn->m64_per_iov == 1) {
+ ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
+ if (ret)
+ goto m64_failed;
+ }
+ }
+
+ /* Setup VF PEs */
+ pnv_ioda_setup_vf_PE(pdev, num_vfs);
+
+ return 0;
+
+m64_failed:
+ bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
+ pdn->offset = 0;
+
+ return ret;
+}
+
+int pcibios_sriov_disable(struct pci_dev *pdev)
+{
+ pnv_pci_sriov_disable(pdev);
+
+ /* Release PCI data */
+ remove_dev_pci_data(pdev);
+ return 0;
+}
+
+int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+ /* Allocate PCI data */
+ add_dev_pci_data(pdev);
+
+ pnv_pci_sriov_enable(pdev, num_vfs);
+ return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
{
struct pci_dn *pdn = pci_get_pdn(pdev);
@@ -989,7 +1598,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
pe = &phb->ioda.pe_array[pdn->pe_number];
WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
- set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table);
+ set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
}
static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
@@ -1016,7 +1625,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
} else {
dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
set_dma_ops(&pdev->dev, &dma_iommu_ops);
- set_iommu_table_base(&pdev->dev, &pe->tce32_table);
+ set_iommu_table_base(&pdev->dev, pe->tce32_table);
}
*pdev->dev.dma_mask = dma_mask;
return 0;
@@ -1053,9 +1662,9 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
list_for_each_entry(dev, &bus->devices, bus_list) {
if (add_to_iommu_group)
set_iommu_table_base_and_group(&dev->dev,
- &pe->tce32_table);
+ pe->tce32_table);
else
- set_iommu_table_base(&dev->dev, &pe->tce32_table);
+ set_iommu_table_base(&dev->dev, pe->tce32_table);
if (dev->subordinate)
pnv_ioda_setup_bus_dma(pe, dev->subordinate,
@@ -1145,8 +1754,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
__be64 *startp, __be64 *endp, bool rm)
{
- struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
- tce32_table);
+ struct pnv_ioda_pe *pe = tbl->data;
struct pnv_phb *phb = pe->phb;
if (phb->type == PNV_PHB_IODA1)
@@ -1167,9 +1775,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
int64_t rc;
void *addr;
- /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
-#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8)
-
/* XXX FIXME: Handle 64-bit only DMA devices */
/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
/* XXX FIXME: Allocate multi-level tables on PHB3 */
@@ -1212,7 +1817,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
}
/* Setup linux iommu table */
- tbl = &pe->tce32_table;
+ tbl = pe->tce32_table;
pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
base << 28, IOMMU_PAGE_SHIFT_4K);
@@ -1232,12 +1837,19 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
TCE_PCI_SWINV_PAIR);
}
iommu_init_table(tbl, phb->hose->node);
- iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
- if (pe->pdev)
+ if (pe->flags & PNV_IODA_PE_DEV) {
+ iommu_register_group(tbl, phb->hose->global_number,
+ pe->pe_number);
set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
- else
+ } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
+ iommu_register_group(tbl, phb->hose->global_number,
+ pe->pe_number);
pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+ } else if (pe->flags & PNV_IODA_PE_VF) {
+ iommu_register_group(tbl, phb->hose->global_number,
+ pe->pe_number);
+ }
return;
fail:
@@ -1250,8 +1862,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
{
- struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
- tce32_table);
+ struct pnv_ioda_pe *pe = tbl->data;
uint16_t window_id = (pe->pe_number << 1 ) + 1;
int64_t rc;
@@ -1296,10 +1907,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
pe->tce_bypass_base = 1ull << 59;
/* Install set_bypass callback for VFIO */
- pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass;
+ pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass;
/* Enable bypass by default */
- pnv_pci_ioda2_set_bypass(&pe->tce32_table, true);
+ pnv_pci_ioda2_set_bypass(pe->tce32_table, true);
}
static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
@@ -1347,7 +1958,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
}
/* Setup linux iommu table */
- tbl = &pe->tce32_table;
+ tbl = pe->tce32_table;
pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
IOMMU_PAGE_SHIFT_4K);
@@ -1365,12 +1976,19 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
}
iommu_init_table(tbl, phb->hose->node);
- iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
- if (pe->pdev)
+ if (pe->flags & PNV_IODA_PE_DEV) {
+ iommu_register_group(tbl, phb->hose->global_number,
+ pe->pe_number);
set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
- else
+ } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
+ iommu_register_group(tbl, phb->hose->global_number,
+ pe->pe_number);
pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+ } else if (pe->flags & PNV_IODA_PE_VF) {
+ iommu_register_group(tbl, phb->hose->global_number,
+ pe->pe_number);
+ }
/* Also create a bypass window */
if (!pnv_iommu_bypass_disabled)
@@ -1731,6 +2349,73 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
#endif /* CONFIG_PCI_MSI */
+#ifdef CONFIG_PCI_IOV
+static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct resource *res;
+ int i;
+ resource_size_t size;
+ struct pci_dn *pdn;
+ int mul, total_vfs;
+
+ if (!pdev->is_physfn || pdev->is_added)
+ return;
+
+ hose = pci_bus_to_host(pdev->bus);
+ phb = hose->private_data;
+
+ pdn = pci_get_pdn(pdev);
+ pdn->vfs_expanded = 0;
+
+ total_vfs = pci_sriov_get_totalvfs(pdev);
+ pdn->m64_per_iov = 1;
+ mul = phb->ioda.total_pe;
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || res->parent)
+ continue;
+ if (!pnv_pci_is_mem_pref_64(res->flags)) {
+ dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
+ i, res);
+ continue;
+ }
+
+ size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+
+ /* bigger than 64M */
+ if (size > (1 << 26)) {
+ dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
+ i, res);
+ pdn->m64_per_iov = M64_PER_IOV;
+ mul = roundup_pow_of_two(total_vfs);
+ break;
+ }
+ }
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || res->parent)
+ continue;
+ if (!pnv_pci_is_mem_pref_64(res->flags)) {
+ dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
+ i, res);
+ continue;
+ }
+
+ dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
+ size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+ res->end = res->start + size * mul - 1;
+ dev_dbg(&pdev->dev, " %pR\n", res);
+ dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
+ i, res, mul);
+ }
+ pdn->vfs_expanded = mul;
+}
+#endif /* CONFIG_PCI_IOV */
+
/*
* This function is supposed to be called on basis of PE from top
* to bottom style. So the the I/O or MMIO segment assigned to
@@ -1908,6 +2593,25 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
return phb->ioda.io_segsize;
}
+#ifdef CONFIG_PCI_IOV
+static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
+ int resno)
+{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ resource_size_t align, iov_align;
+
+ iov_align = resource_size(&pdev->resource[resno]);
+ if (iov_align)
+ return iov_align;
+
+ align = pci_iov_resource_size(pdev, resno);
+ if (pdn->vfs_expanded)
+ return pdn->vfs_expanded * align;
+
+ return align;
+}
+#endif /* CONFIG_PCI_IOV */
+
/* Prevent enabling devices for which we couldn't properly
* assign a PE
*/
@@ -1993,6 +2697,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb->hub_id = hub_id;
phb->opal_id = phb_id;
phb->type = ioda_type;
+ mutex_init(&phb->ioda.pe_alloc_mutex);
/* Detect specific models for error handling */
if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
@@ -2052,6 +2757,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
INIT_LIST_HEAD(&phb->ioda.pe_list);
+ mutex_init(&phb->ioda.pe_list_mutex);
/* Calculate how many 32-bit TCE segments we have */
phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
@@ -2106,6 +2812,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
pnv_pci_controller_ops.enable_device_hook = pnv_pci_enable_device_hook;
pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
+
+#ifdef CONFIG_PCI_IOV
+ ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
+ ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
+#endif
+
pci_add_flags(PCI_REASSIGN_ALL_RSRC);
/* Reset IODA tables to a clean state */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index fa96aa8aa1e2..bca2aeb6e4b6 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -666,6 +666,24 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
{
struct pci_controller *hose = pci_bus_to_host(pdev->bus);
struct pnv_phb *phb = hose->private_data;
+#ifdef CONFIG_PCI_IOV
+ struct pnv_ioda_pe *pe;
+ struct pci_dn *pdn;
+
+ /* Fix the VF pdn PE number */
+ if (pdev->is_virtfn) {
+ pdn = pci_get_pdn(pdev);
+ WARN_ON(pdn->pe_number != IODA_INVALID_PE);
+ list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+ if (pe->rid == ((pdev->bus->number << 8) |
+ (pdev->devfn & 0xff))) {
+ pdn->pe_number = pe->pe_number;
+ pe->pdev = pdev;
+ break;
+ }
+ }
+ }
+#endif /* CONFIG_PCI_IOV */
if (phb && phb->dma_dev_setup)
phb->dma_dev_setup(phb, pdev);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 1f0cb66133a1..070ee888fc95 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -23,6 +23,7 @@ enum pnv_phb_model {
#define PNV_IODA_PE_BUS_ALL (1 << 2) /* PE has subordinate buses */
#define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */
#define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */
+#define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */
/* Data associated with a PE, including IOMMU tracking etc.. */
struct pnv_phb;
@@ -34,6 +35,9 @@ struct pnv_ioda_pe {
* entire bus (& children). In the former case, pdev
* is populated, in the later case, pbus is.
*/
+#ifdef CONFIG_PCI_IOV
+ struct pci_dev *parent_dev;
+#endif
struct pci_dev *pdev;
struct pci_bus *pbus;
@@ -53,7 +57,7 @@ struct pnv_ioda_pe {
/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
int tce32_seg;
int tce32_segcount;
- struct iommu_table tce32_table;
+ struct iommu_table *tce32_table;
phys_addr_t tce_inval_reg_phys;
/* 64-bit TCE bypass region */
@@ -145,6 +149,8 @@ struct pnv_phb {
/* PE allocation bitmap */
unsigned long *pe_alloc;
+ /* PE allocation mutex */
+ struct mutex pe_alloc_mutex;
/* M32 & IO segment maps */
unsigned int *m32_segmap;
@@ -159,6 +165,7 @@ struct pnv_phb {
* on the sequence of creation
*/
struct list_head pe_list;
+ struct mutex pe_list_mutex;
/* Reverse map of PEs, will have to extend if
* we are to support more than 256 PEs, indexed
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 4b3a4eaad996..ee0ebff103a4 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -19,16 +19,59 @@
#define VIRTFN_ID_LEN 16
-static inline u8 virtfn_bus(struct pci_dev *dev, int id)
+int pci_iov_virtfn_bus(struct pci_dev *dev, int vf_id)
{
+ if (!dev->is_physfn)
+ return -EINVAL;
return dev->bus->number + ((dev->devfn + dev->sriov->offset +
- dev->sriov->stride * id) >> 8);
+ dev->sriov->stride * vf_id) >> 8);
}
-static inline u8 virtfn_devfn(struct pci_dev *dev, int id)
+int pci_iov_virtfn_devfn(struct pci_dev *dev, int vf_id)
{
+ if (!dev->is_physfn)
+ return -EINVAL;
return (dev->devfn + dev->sriov->offset +
- dev->sriov->stride * id) & 0xff;
+ dev->sriov->stride * vf_id) & 0xff;
+}
+
+/*
+ * Per SR-IOV spec sec 3.3.10 and 3.3.11, First VF Offset and VF Stride may
+ * change when NumVFs changes.
+ *
+ * Update iov->offset and iov->stride when NumVFs is written.
+ */
+static inline void pci_iov_set_numvfs(struct pci_dev *dev, int nr_virtfn)
+{
+ struct pci_sriov *iov = dev->sriov;
+
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
+ pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &iov->offset);
+ pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &iov->stride);
+}
+
+/*
+ * The PF consumes one bus number. NumVFs, First VF Offset, and VF Stride
+ * determine how many additional bus numbers will be consumed by VFs.
+ *
+ * Iterate over all valid NumVFs and calculate the maximum number of bus
+ * numbers that could ever be required.
+ */
+static inline u8 virtfn_max_buses(struct pci_dev *dev)
+{
+ struct pci_sriov *iov = dev->sriov;
+ int nr_virtfn;
+ u8 max = 0;
+ int busnr;
+
+ for (nr_virtfn = 1; nr_virtfn <= iov->total_VFs; nr_virtfn++) {
+ pci_iov_set_numvfs(dev, nr_virtfn);
+ busnr = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
+ if (busnr > max)
+ max = busnr;
+ }
+
+ return max;
}
static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
@@ -57,6 +100,14 @@ static void virtfn_remove_bus(struct pci_bus *physbus, struct pci_bus *virtbus)
pci_remove_bus(virtbus);
}
+resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
+{
+ if (!dev->is_physfn)
+ return 0;
+
+ return dev->sriov->barsz[resno - PCI_IOV_RESOURCES];
+}
+
static int virtfn_add(struct pci_dev *dev, int id, int reset)
{
int i;
@@ -69,7 +120,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
struct pci_bus *bus;
mutex_lock(&iov->dev->sriov->lock);
- bus = virtfn_add_bus(dev->bus, virtfn_bus(dev, id));
+ bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
if (!bus)
goto failed;
@@ -77,7 +128,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
if (!virtfn)
goto failed0;
- virtfn->devfn = virtfn_devfn(dev, id);
+ virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
virtfn->vendor = dev->vendor;
pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device);
pci_setup_device(virtfn);
@@ -87,13 +138,12 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
virtfn->multifunction = 0;
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = dev->resource + PCI_IOV_RESOURCES + i;
+ res = &dev->resource[i + PCI_IOV_RESOURCES];
if (!res->parent)
continue;
virtfn->resource[i].name = pci_name(virtfn);
virtfn->resource[i].flags = res->flags;
- size = resource_size(res);
- do_div(size, iov->total_VFs);
+ size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
virtfn->resource[i].start = res->start + size * id;
virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
rc = request_resource(res, &virtfn->resource[i]);
@@ -140,8 +190,8 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset)
struct pci_sriov *iov = dev->sriov;
virtfn = pci_get_domain_bus_and_slot(pci_domain_nr(dev->bus),
- virtfn_bus(dev, id),
- virtfn_devfn(dev, id));
+ pci_iov_virtfn_bus(dev, id),
+ pci_iov_virtfn_devfn(dev, id));
if (!virtfn)
return;
@@ -170,6 +220,11 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset)
pci_dev_put(dev);
}
+int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+ return 0;
+}
+
static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
{
int rc;
@@ -180,6 +235,8 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
struct pci_dev *pdev;
struct pci_sriov *iov = dev->sriov;
int bars = 0;
+ int bus;
+ int retval;
if (!nr_virtfn)
return 0;
@@ -204,7 +261,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
nres = 0;
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
bars |= (1 << (i + PCI_IOV_RESOURCES));
- res = dev->resource + PCI_IOV_RESOURCES + i;
+ res = &dev->resource[i + PCI_IOV_RESOURCES];
if (res->parent)
nres++;
}
@@ -216,8 +273,10 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
iov->offset = offset;
iov->stride = stride;
- if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->busn_res.end) {
- dev_err(&dev->dev, "SR-IOV: bus number out of range\n");
+ bus = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
+ if (bus > dev->bus->busn_res.end) {
+ dev_err(&dev->dev, "can't enable %d VFs (bus %02x out of range of %pR)\n",
+ nr_virtfn, bus, &dev->bus->busn_res);
return -ENOMEM;
}
@@ -243,7 +302,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
return rc;
}
- pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
+ pci_iov_set_numvfs(dev, nr_virtfn);
iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
pci_cfg_access_lock(dev);
pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
@@ -254,6 +313,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
if (nr_virtfn < initial)
initial = nr_virtfn;
+ if ((retval = pcibios_sriov_enable(dev, initial))) {
+ dev_err(&dev->dev, "failure %d from pcibios_sriov_enable()\n",
+ retval);
+ return retval;
+ }
+
for (i = 0; i < initial; i++) {
rc = virtfn_add(dev, i, 0);
if (rc)
@@ -272,7 +337,7 @@ failed:
iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
pci_cfg_access_lock(dev);
pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
- pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, 0);
+ pci_iov_set_numvfs(dev, 0);
ssleep(1);
pci_cfg_access_unlock(dev);
@@ -282,6 +347,11 @@ failed:
return rc;
}
+int __weak pcibios_sriov_disable(struct pci_dev *pdev)
+{
+ return 0;
+}
+
static void sriov_disable(struct pci_dev *dev)
{
int i;
@@ -293,6 +363,8 @@ static void sriov_disable(struct pci_dev *dev)
for (i = 0; i < iov->num_VFs; i++)
virtfn_remove(dev, i, 0);
+ pcibios_sriov_disable(dev);
+
iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
pci_cfg_access_lock(dev);
pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
@@ -303,12 +375,12 @@ static void sriov_disable(struct pci_dev *dev)
sysfs_remove_link(&dev->dev.kobj, "dep_link");
iov->num_VFs = 0;
- pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, 0);
+ pci_iov_set_numvfs(dev, 0);
}
static int sriov_init(struct pci_dev *dev, int pos)
{
- int i;
+ int i, bar64;
int rc;
int nres;
u32 pgsz;
@@ -357,27 +429,29 @@ found:
pgsz &= ~(pgsz - 1);
pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz);
+ iov = kzalloc(sizeof(*iov), GFP_KERNEL);
+ if (!iov)
+ return -ENOMEM;
+
nres = 0;
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = dev->resource + PCI_IOV_RESOURCES + i;
- i += __pci_read_base(dev, pci_bar_unknown, res,
- pos + PCI_SRIOV_BAR + i * 4);
+ res = &dev->resource[i + PCI_IOV_RESOURCES];
+ bar64 = __pci_read_base(dev, pci_bar_unknown, res,
+ pos + PCI_SRIOV_BAR + i * 4);
if (!res->flags)
continue;
if (resource_size(res) & (PAGE_SIZE - 1)) {
rc = -EIO;
goto failed;
}
+ iov->barsz[i] = resource_size(res);
res->end = res->start + resource_size(res) * total - 1;
+ dev_info(&dev->dev, "VF(n) BAR%d space: %pR (contains BAR%d for %d VFs)\n",
+ i, res, i, total);
+ i += bar64;
nres++;
}
- iov = kzalloc(sizeof(*iov), GFP_KERNEL);
- if (!iov) {
- rc = -ENOMEM;
- goto failed;
- }
-
iov->pos = pos;
iov->nres = nres;
iov->ctrl = ctrl;
@@ -400,15 +474,17 @@ found:
dev->sriov = iov;
dev->is_physfn = 1;
+ iov->max_VF_buses = virtfn_max_buses(dev);
return 0;
failed:
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = dev->resource + PCI_IOV_RESOURCES + i;
+ res = &dev->resource[i + PCI_IOV_RESOURCES];
res->flags = 0;
}
+ kfree(iov);
return rc;
}
@@ -439,7 +515,7 @@ static void sriov_restore_state(struct pci_dev *dev)
pci_update_resource(dev, i);
pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
- pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, iov->num_VFs);
+ pci_iov_set_numvfs(dev, iov->num_VFs);
pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
msleep(100);
@@ -493,6 +569,12 @@ int pci_iov_resource_bar(struct pci_dev *dev, int resno)
4 * (resno - PCI_IOV_RESOURCES);
}
+resource_size_t __weak pcibios_iov_resource_alignment(struct pci_dev *dev,
+ int resno)
+{
+ return pci_iov_resource_size(dev, resno);
+}
+
/**
* pci_sriov_resource_alignment - get resource alignment for VF BAR
* @dev: the PCI device
@@ -505,14 +587,7 @@ int pci_iov_resource_bar(struct pci_dev *dev, int resno)
*/
resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno)
{
- struct resource tmp;
- int reg = pci_iov_resource_bar(dev, resno);
-
- if (!reg)
- return 0;
-
- __pci_read_base(dev, pci_bar_unknown, &tmp, reg);
- return resource_alignment(&tmp);
+ return pcibios_iov_resource_alignment(dev, resno);
}
/**
@@ -535,15 +610,13 @@ void pci_restore_iov_state(struct pci_dev *dev)
int pci_iov_bus_range(struct pci_bus *bus)
{
int max = 0;
- u8 busnr;
struct pci_dev *dev;
list_for_each_entry(dev, &bus->devices, bus_list) {
if (!dev->is_physfn)
continue;
- busnr = virtfn_bus(dev, dev->sriov->total_VFs - 1);
- if (busnr > max)
- max = busnr;
+ if (dev->sriov->max_VF_buses > max)
+ max = dev->sriov->max_VF_buses;
}
return max ? max - bus->number : 0;
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 4091f82239cd..bae593c04541 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -243,10 +243,12 @@ struct pci_sriov {
u16 stride; /* following VF stride */
u32 pgsz; /* page size for BAR alignment */
u8 link; /* Function Dependency Link */
+ u8 max_VF_buses; /* max buses consumed by VFs */
u16 driver_max_VFs; /* max num VFs driver supports */
struct pci_dev *dev; /* lowest numbered PF */
struct pci_dev *self; /* this PF */
struct mutex lock; /* lock for VF bus */
+ resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */
};
#ifdef CONFIG_PCI_ATS
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index e3e17f3c0f0f..6603d401bb7c 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -99,8 +99,8 @@ static void remove_from_list(struct list_head *head,
}
}
-static resource_size_t get_res_add_size(struct list_head *head,
- struct resource *res)
+static struct pci_dev_resource *res_to_dev_res(struct list_head *head,
+ struct resource *res)
{
struct pci_dev_resource *dev_res;
@@ -109,17 +109,37 @@ static resource_size_t get_res_add_size(struct list_head *head,
int idx = res - &dev_res->dev->resource[0];
dev_printk(KERN_DEBUG, &dev_res->dev->dev,
- "res[%d]=%pR get_res_add_size add_size %llx\n",
+ "res[%d]=%pR res_to_dev_res add_size %llx min_align %llx\n",
idx, dev_res->res,
- (unsigned long long)dev_res->add_size);
+ (unsigned long long)dev_res->add_size,
+ (unsigned long long)dev_res->min_align);
- return dev_res->add_size;
+ return dev_res;
}
}
- return 0;
+ return NULL;
}
+static resource_size_t get_res_add_size(struct list_head *head,
+ struct resource *res)
+{
+ struct pci_dev_resource *dev_res;
+
+ dev_res = res_to_dev_res(head, res);
+ return dev_res ? dev_res->add_size : 0;
+}
+
+static resource_size_t get_res_add_align(struct list_head *head,
+ struct resource *res)
+{
+ struct pci_dev_resource *dev_res;
+
+ dev_res = res_to_dev_res(head, res);
+ return dev_res ? dev_res->min_align : 0;
+}
+
+
/* Sort resources by alignment */
static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head)
{
@@ -215,7 +235,7 @@ static void reassign_resources_sorted(struct list_head *realloc_head,
struct resource *res;
struct pci_dev_resource *add_res, *tmp;
struct pci_dev_resource *dev_res;
- resource_size_t add_size;
+ resource_size_t add_size, align;
int idx;
list_for_each_entry_safe(add_res, tmp, realloc_head, list) {
@@ -238,13 +258,13 @@ static void reassign_resources_sorted(struct list_head *realloc_head,
idx = res - &add_res->dev->resource[0];
add_size = add_res->add_size;
+ align = add_res->min_align;
if (!resource_size(res)) {
- res->start = add_res->start;
+ res->start = align;
res->end = res->start + add_size - 1;
if (pci_assign_resource(add_res->dev, idx))
reset_resource(res);
} else {
- resource_size_t align = add_res->min_align;
res->flags |= add_res->flags &
(IORESOURCE_STARTALIGN|IORESOURCE_SIZEALIGN);
if (pci_reassign_resource(add_res->dev, idx,
@@ -368,8 +388,9 @@ static void __assign_resources_sorted(struct list_head *head,
LIST_HEAD(save_head);
LIST_HEAD(local_fail_head);
struct pci_dev_resource *save_res;
- struct pci_dev_resource *dev_res, *tmp_res;
+ struct pci_dev_resource *dev_res, *tmp_res, *dev_res2;
unsigned long fail_type;
+ resource_size_t add_align, align;
/* Check if optional add_size is there */
if (!realloc_head || list_empty(realloc_head))
@@ -384,10 +405,44 @@ static void __assign_resources_sorted(struct list_head *head,
}
/* Update res in head list with add_size in realloc_head list */
- list_for_each_entry(dev_res, head, list)
+ list_for_each_entry_safe(dev_res, tmp_res, head, list) {
dev_res->res->end += get_res_add_size(realloc_head,
dev_res->res);
+ /*
+ * There are two kinds of additional resources in the list:
+ * 1. bridge resource -- IORESOURCE_STARTALIGN
+ * 2. SR-IOV resource -- IORESOURCE_SIZEALIGN
+ * Here just fix the additional alignment for bridge
+ */
+ if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
+ continue;
+
+ add_align = get_res_add_align(realloc_head, dev_res->res);
+
+ /*
+ * The "head" list is sorted by the alignment to make sure
+ * resources with bigger alignment will be assigned first.
+ * After we change the alignment of a dev_res in "head" list,
+ * we need to reorder the list by alignment to make it
+ * consistent.
+ */
+ if (add_align > dev_res->res->start) {
+ dev_res->res->start = add_align;
+ dev_res->res->end = add_align +
+ resource_size(dev_res->res);
+
+ list_for_each_entry(dev_res2, head, list) {
+ align = pci_resource_alignment(dev_res2->dev,
+ dev_res2->res);
+ if (add_align > align)
+ list_move_tail(&dev_res->list,
+ &dev_res2->list);
+ }
+ }
+
+ }
+
/* Try updated head list with add_size added */
assign_requested_resources_sorted(head, &local_fail_head);
@@ -962,6 +1017,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
struct resource *b_res = find_free_bus_resource(bus,
mask | IORESOURCE_PREFETCH, type);
resource_size_t children_add_size = 0;
+ resource_size_t children_add_align = 0;
+ resource_size_t add_align = 0;
if (!b_res)
return -ENOSPC;
@@ -986,6 +1043,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
/* put SRIOV requested res to the optional list */
if (realloc_head && i >= PCI_IOV_RESOURCES &&
i <= PCI_IOV_RESOURCE_END) {
+ add_align = max(pci_resource_alignment(dev, r), add_align);
r->end = r->start - 1;
add_to_list(realloc_head, dev, r, r_size, 0/* don't care */);
children_add_size += r_size;
@@ -1016,19 +1074,23 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
if (order > max_order)
max_order = order;
- if (realloc_head)
+ if (realloc_head) {
children_add_size += get_res_add_size(realloc_head, r);
+ children_add_align = get_res_add_align(realloc_head, r);
+ add_align = max(add_align, children_add_align);
+ }
}
}
min_align = calculate_mem_align(aligns, max_order);
min_align = max(min_align, window_alignment(bus, b_res->flags));
size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align);
+ add_align = max(min_align, add_align);
if (children_add_size > add_size)
add_size = children_add_size;
size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
calculate_memsize(size, min_size, add_size,
- resource_size(b_res), min_align);
+ resource_size(b_res), add_align);
if (!size0 && !size1) {
if (b_res->start || b_res->end)
dev_info(&bus->self->dev, "disabling bridge window %pR to %pR (unused)\n",
@@ -1040,10 +1102,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
b_res->end = size0 + min_align - 1;
b_res->flags |= IORESOURCE_STARTALIGN;
if (size1 > size0 && realloc_head) {
- add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align);
- dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n",
+ add_to_list(realloc_head, bus->self, b_res, size1-size0, add_align);
+ dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx add_align %llx\n",
b_res, &bus->busn_res,
- (unsigned long long)size1-size0);
+ (unsigned long long) (size1 - size0),
+ (unsigned long long) add_align);
}
return 0;
}
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 211e9da8a7d7..4e1f17db1a81 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1174,6 +1174,7 @@ unsigned char pci_bus_max_busnr(struct pci_bus *bus);
void pci_setup_bridge(struct pci_bus *bus);
resource_size_t pcibios_window_alignment(struct pci_bus *bus,
unsigned long type);
+resource_size_t pcibios_iov_resource_alignment(struct pci_dev *dev, int resno);
#define PCI_VGA_STATE_CHANGE_BRIDGE (1 << 0)
#define PCI_VGA_STATE_CHANGE_DECODES (1 << 1)
@@ -1669,13 +1670,25 @@ int pci_ext_cfg_avail(void);
void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar);
#ifdef CONFIG_PCI_IOV
+int pci_iov_virtfn_bus(struct pci_dev *dev, int id);
+int pci_iov_virtfn_devfn(struct pci_dev *dev, int id);
+
int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
void pci_disable_sriov(struct pci_dev *dev);
int pci_num_vf(struct pci_dev *dev);
int pci_vfs_assigned(struct pci_dev *dev);
int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
int pci_sriov_get_totalvfs(struct pci_dev *dev);
+resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
#else
+static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id)
+{
+ return -ENOSYS;
+}
+static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id)
+{
+ return -ENOSYS;
+}
static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
{ return -ENODEV; }
static inline void pci_disable_sriov(struct pci_dev *dev) { }
@@ -1686,6 +1699,8 @@ static inline int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs)
{ return 0; }
static inline int pci_sriov_get_totalvfs(struct pci_dev *dev)
{ return 0; }
+static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
+{ return 0; }
#endif
#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)