summaryrefslogtreecommitdiffstats
path: root/drivers/xen
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/xen')
-rw-r--r--drivers/xen/Kconfig45
-rw-r--r--drivers/xen/Makefile17
-rw-r--r--drivers/xen/balloon.c96
-rw-r--r--drivers/xen/biomerge.c13
-rw-r--r--drivers/xen/cpu_hotplug.c3
-rw-r--r--drivers/xen/events.c847
-rw-r--r--drivers/xen/evtchn.c103
-rw-r--r--drivers/xen/gntdev.c665
-rw-r--r--drivers/xen/grant-table.c125
-rw-r--r--drivers/xen/manage.c85
-rw-r--r--drivers/xen/pci.c117
-rw-r--r--drivers/xen/platform-pci.c200
-rw-r--r--drivers/xen/swiotlb-xen.c515
-rw-r--r--drivers/xen/sys-hypervisor.c4
-rw-r--r--drivers/xen/xenbus/Makefile5
-rw-r--r--drivers/xen/xenbus/xenbus_client.c93
-rw-r--r--drivers/xen/xenbus/xenbus_probe.c420
-rw-r--r--drivers/xen/xenbus/xenbus_probe.h31
-rw-r--r--drivers/xen/xenbus/xenbus_probe_backend.c276
-rw-r--r--drivers/xen/xenbus/xenbus_probe_frontend.c294
-rw-r--r--drivers/xen/xenbus/xenbus_xs.c2
-rw-r--r--drivers/xen/xencomm.c2
-rw-r--r--drivers/xen/xenfs/Makefile3
-rw-r--r--drivers/xen/xenfs/privcmd.c400
-rw-r--r--drivers/xen/xenfs/super.c72
-rw-r--r--drivers/xen/xenfs/xenbus.c3
-rw-r--r--drivers/xen/xenfs/xenfs.h3
-rw-r--r--drivers/xen/xenfs/xenstored.c68
28 files changed, 3953 insertions, 554 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index cab100acf983..07bec09d1dad 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -1,6 +1,8 @@
+menu "Xen driver support"
+ depends on XEN
+
config XEN_BALLOON
bool "Xen memory balloon driver"
- depends on XEN
default y
help
The balloon driver allows the Xen domain to request more memory from
@@ -20,7 +22,6 @@ config XEN_SCRUB_PAGES
config XEN_DEV_EVTCHN
tristate "Xen /dev/xen/evtchn device"
- depends on XEN
default y
help
The evtchn driver allows a userspace process to triger event
@@ -28,9 +29,16 @@ config XEN_DEV_EVTCHN
firing.
If in doubt, say yes.
+config XEN_BACKEND
+ bool "Backend driver support"
+ depends on XEN_DOM0
+ default y
+ help
+ Support for backend device drivers that provide I/O services
+ to other virtual machines.
+
config XENFS
tristate "Xen filesystem"
- depends on XEN
default y
help
The xen filesystem provides a way for domains to share
@@ -53,11 +61,38 @@ config XEN_COMPAT_XENFS
config XEN_SYS_HYPERVISOR
bool "Create xen entries under /sys/hypervisor"
- depends on XEN && SYSFS
+ depends on SYSFS
select SYS_HYPERVISOR
default y
help
Create entries under /sys/hypervisor describing the Xen
hypervisor environment. When running native or in another
virtual environment, /sys/hypervisor will still be present,
- but will have no xen contents. \ No newline at end of file
+ but will have no xen contents.
+
+config XEN_XENBUS_FRONTEND
+ tristate
+
+config XEN_GNTDEV
+ tristate "userspace grant access device driver"
+ depends on XEN
+ select MMU_NOTIFIER
+ help
+ Allows userspace processes to use grants.
+
+config XEN_PLATFORM_PCI
+ tristate "xen platform pci device driver"
+ depends on XEN_PVHVM && PCI
+ default m
+ help
+ Driver for the Xen PCI Platform device: it is responsible for
+ initializing xenbus and grant_table when running in a Xen HVM
+ domain. As a consequence this driver is required to run any Xen PV
+ frontend on Xen HVM.
+
+config SWIOTLB_XEN
+ def_bool y
+ depends on PCI
+ select SWIOTLB
+
+endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index ec2a39b1e26f..5088cc2e6fe2 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,9 +1,22 @@
obj-y += grant-table.o features.o events.o manage.o
obj-y += xenbus/
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_features.o := $(nostackp)
+
+obj-$(CONFIG_BLOCK) += biomerge.o
obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
obj-$(CONFIG_XEN_BALLOON) += balloon.o
-obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
+obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
+obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
obj-$(CONFIG_XENFS) += xenfs/
-obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o \ No newline at end of file
+obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
+obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o
+obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
+obj-$(CONFIG_XEN_DOM0) += pci.o
+
+xen-evtchn-y := evtchn.o
+xen-gntdev-y := gntdev.o
+
+xen-platform-pci-y := platform-pci.o
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index f5bbd9e83416..43f9f02c7db0 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -43,15 +43,19 @@
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/sysdev.h>
+#include <linux/gfp.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
+#include <asm/e820.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
#include <xen/xenbus.h>
@@ -66,8 +70,6 @@ struct balloon_stats {
/* We aim for 'current allocation' == 'target allocation'. */
unsigned long current_pages;
unsigned long target_pages;
- /* We may hit the hard limit in Xen. If we do then we remember it. */
- unsigned long hard_limit;
/*
* Drivers may alter the memory reservation independently, but they
* must inform the balloon driver so we avoid hitting the hard limit.
@@ -84,23 +86,12 @@ static struct sys_device balloon_sysdev;
static int register_balloon(struct sys_device *sysdev);
-/*
- * Protects atomic reservation decrease/increase against concurrent increases.
- * Also protects non-atomic updates of current_pages and driver_pages, and
- * balloon lists.
- */
-static DEFINE_SPINLOCK(balloon_lock);
-
static struct balloon_stats balloon_stats;
/* We increase/decrease in batches which fit in a page */
static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
-/* VM /proc information for memory */
-extern unsigned long totalram_pages;
-
#ifdef CONFIG_HIGHMEM
-extern unsigned long totalhigh_pages;
#define inc_totalhigh_pages() (totalhigh_pages++)
#define dec_totalhigh_pages() (totalhigh_pages--)
#else
@@ -129,7 +120,7 @@ static void scrub_page(struct page *page)
}
/* balloon_append: add the given page to the balloon. */
-static void balloon_append(struct page *page)
+static void __balloon_append(struct page *page)
{
/* Lowmem is re-populated first, so highmem pages go at list tail. */
if (PageHighMem(page)) {
@@ -142,6 +133,12 @@ static void balloon_append(struct page *page)
}
}
+static void balloon_append(struct page *page)
+{
+ __balloon_append(page);
+ totalram_pages--;
+}
+
/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
static struct page *balloon_retrieve(void)
{
@@ -160,6 +157,8 @@ static struct page *balloon_retrieve(void)
else
balloon_stats.balloon_low--;
+ totalram_pages++;
+
return page;
}
@@ -185,7 +184,7 @@ static void balloon_alarm(unsigned long unused)
static unsigned long current_target(void)
{
- unsigned long target = min(balloon_stats.target_pages, balloon_stats.hard_limit);
+ unsigned long target = balloon_stats.target_pages;
target = min(target,
balloon_stats.current_pages +
@@ -197,7 +196,7 @@ static unsigned long current_target(void)
static int increase_reservation(unsigned long nr_pages)
{
- unsigned long pfn, i, flags;
+ unsigned long pfn, i;
struct page *page;
long rc;
struct xen_memory_reservation reservation = {
@@ -209,35 +208,20 @@ static int increase_reservation(unsigned long nr_pages)
if (nr_pages > ARRAY_SIZE(frame_list))
nr_pages = ARRAY_SIZE(frame_list);
- spin_lock_irqsave(&balloon_lock, flags);
-
page = balloon_first_page();
for (i = 0; i < nr_pages; i++) {
BUG_ON(page == NULL);
- frame_list[i] = page_to_pfn(page);;
+ frame_list[i] = page_to_pfn(page);
page = balloon_next_page(page);
}
set_xen_guest_handle(reservation.extent_start, frame_list);
reservation.nr_extents = nr_pages;
rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
- if (rc < nr_pages) {
- if (rc > 0) {
- int ret;
-
- /* We hit the Xen hard limit: reprobe. */
- reservation.nr_extents = rc;
- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
- &reservation);
- BUG_ON(ret != rc);
- }
- if (rc >= 0)
- balloon_stats.hard_limit = (balloon_stats.current_pages + rc -
- balloon_stats.driver_pages);
+ if (rc < 0)
goto out;
- }
- for (i = 0; i < nr_pages; i++) {
+ for (i = 0; i < rc; i++) {
page = balloon_retrieve();
BUG_ON(page == NULL);
@@ -263,18 +247,15 @@ static int increase_reservation(unsigned long nr_pages)
__free_page(page);
}
- balloon_stats.current_pages += nr_pages;
- totalram_pages = balloon_stats.current_pages;
+ balloon_stats.current_pages += rc;
out:
- spin_unlock_irqrestore(&balloon_lock, flags);
-
- return 0;
+ return rc < 0 ? rc : rc != nr_pages;
}
static int decrease_reservation(unsigned long nr_pages)
{
- unsigned long pfn, i, flags;
+ unsigned long pfn, i;
struct page *page;
int need_sleep = 0;
int ret;
@@ -312,8 +293,6 @@ static int decrease_reservation(unsigned long nr_pages)
kmap_flush_unused();
flush_tlb_all();
- spin_lock_irqsave(&balloon_lock, flags);
-
/* No more mappings: invalidate P2M and add to balloon. */
for (i = 0; i < nr_pages; i++) {
pfn = mfn_to_pfn(frame_list[i]);
@@ -327,9 +306,6 @@ static int decrease_reservation(unsigned long nr_pages)
BUG_ON(ret != nr_pages);
balloon_stats.current_pages -= nr_pages;
- totalram_pages = balloon_stats.current_pages;
-
- spin_unlock_irqrestore(&balloon_lock, flags);
return need_sleep;
}
@@ -371,7 +347,6 @@ static void balloon_process(struct work_struct *work)
static void balloon_set_new_target(unsigned long target)
{
/* No need for lock. Not read-modify-write updates. */
- balloon_stats.hard_limit = ~0UL;
balloon_stats.target_pages = target;
schedule_work(&balloon_worker);
}
@@ -417,7 +392,7 @@ static struct notifier_block xenstore_notifier;
static int __init balloon_init(void)
{
- unsigned long pfn;
+ unsigned long pfn, extra_pfn_end;
struct page *page;
if (!xen_pv_domain())
@@ -426,12 +401,10 @@ static int __init balloon_init(void)
pr_info("xen_balloon: Initialising balloon driver.\n");
balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
- totalram_pages = balloon_stats.current_pages;
balloon_stats.target_pages = balloon_stats.current_pages;
balloon_stats.balloon_low = 0;
balloon_stats.balloon_high = 0;
balloon_stats.driver_pages = 0UL;
- balloon_stats.hard_limit = ~0UL;
init_timer(&balloon_timer);
balloon_timer.data = 0;
@@ -439,11 +412,24 @@ static int __init balloon_init(void)
register_balloon(&balloon_sysdev);
- /* Initialise the balloon with excess memory space. */
- for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
+ /*
+ * Initialise the balloon with excess memory space. We need
+ * to make sure we don't add memory which doesn't exist or
+ * logically exist. The E820 map can be trimmed to be smaller
+ * than the amount of physical memory due to the mem= command
+ * line parameter. And if this is a 32-bit non-HIGHMEM kernel
+ * on a system with memory which requires highmem to access,
+ * don't try to use it.
+ */
+ extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()),
+ (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size));
+ for (pfn = PFN_UP(xen_extra_mem_start);
+ pfn < extra_pfn_end;
+ pfn++) {
page = pfn_to_page(pfn);
- if (!PageReserved(page))
- balloon_append(page);
+ /* totalram_pages doesn't include the boot-time
+ balloon extension, so don't subtract from it. */
+ __balloon_append(page);
}
target_watch.callback = watch_target;
@@ -476,9 +462,6 @@ module_exit(balloon_exit);
BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages));
BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low));
BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));
-BALLOON_SHOW(hard_limit_kb,
- (balloon_stats.hard_limit!=~0UL) ? "%lu\n" : "???\n",
- (balloon_stats.hard_limit!=~0UL) ? PAGES2KB(balloon_stats.hard_limit) : 0);
BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages));
static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr,
@@ -548,7 +531,6 @@ static struct attribute *balloon_info_attrs[] = {
&attr_current_kb.attr,
&attr_low_kb.attr,
&attr_high_kb.attr,
- &attr_hard_limit_kb.attr,
&attr_driver_kb.attr,
NULL
};
diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c
new file mode 100644
index 000000000000..ba6eda4b5143
--- /dev/null
+++ b/drivers/xen/biomerge.c
@@ -0,0 +1,13 @@
+#include <linux/bio.h>
+#include <linux/io.h>
+#include <xen/page.h>
+
+bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+ const struct bio_vec *vec2)
+{
+ unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page));
+ unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page));
+
+ return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
+ ((mfn1 == mfn2) || ((mfn1+1) == mfn2));
+}
diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
index bdfd584ad853..14e2d995e958 100644
--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -1,5 +1,6 @@
#include <linux/notifier.h>
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <asm/xen/hypervisor.h>
@@ -86,7 +87,7 @@ static int setup_cpu_watcher(struct notifier_block *notifier,
for_each_possible_cpu(cpu) {
if (vcpu_online(cpu) == 0) {
(void)cpu_down(cpu);
- cpu_clear(cpu, cpu_present_map);
+ set_cpu_present(cpu, false);
}
}
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index abad71b1632b..74681478100a 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -16,7 +16,7 @@
* (typically dom0).
* 2. VIRQs, typically used for timers. These are per-cpu events.
* 3. IPIs.
- * 4. Hardware interrupts. Not supported at present.
+ * 4. PIRQs - Hardware interrupts.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
@@ -27,18 +27,28 @@
#include <linux/module.h>
#include <linux/string.h>
#include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/irqnr.h>
+#include <linux/pci.h>
+#include <asm/desc.h>
#include <asm/ptrace.h>
#include <asm/irq.h>
#include <asm/idle.h>
+#include <asm/io_apic.h>
#include <asm/sync_bitops.h>
+#include <asm/xen/pci.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
+#include <xen/hvm.h>
#include <xen/xen-ops.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>
+#include <xen/interface/hvm/hvm_op.h>
+#include <xen/interface/hvm/params.h>
/*
* This lock protects updates to the following mapping and reference-count
@@ -47,10 +57,10 @@
static DEFINE_SPINLOCK(irq_mapping_update_lock);
/* IRQ <-> VIRQ mapping. */
-static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
/* IRQ <-> IPI mapping */
-static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
+static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
/* Interrupt types. */
enum xen_irq_type {
@@ -67,7 +77,8 @@ enum xen_irq_type {
* event channel - irq->event channel mapping
* cpu - cpu this event channel is bound to
* index - type-specific information:
- * PIRQ - vector, with MSB being "needs EIO"
+ * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM
+ * guest, or GSI (real passthrough IRQ) of the device.
* VIRQ - virq number
* IPI - IPI vector
* EVTCHN -
@@ -82,21 +93,29 @@ struct irq_info
unsigned short virq;
enum ipi_vector ipi;
struct {
+ unsigned short pirq;
unsigned short gsi;
- unsigned short vector;
+ unsigned char vector;
+ unsigned char flags;
} pirq;
} u;
};
+#define PIRQ_NEEDS_EOI (1 << 0)
+#define PIRQ_SHAREABLE (1 << 1)
-static struct irq_info irq_info[NR_IRQS];
+static struct irq_info *irq_info;
+static int *pirq_to_irq;
-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
- [0 ... NR_EVENT_CHANNELS-1] = -1
-};
+static int *evtchn_to_irq;
struct cpu_evtchn_s {
unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
};
-static struct cpu_evtchn_s *cpu_evtchn_mask_p;
+
+static __initdata struct cpu_evtchn_s init_evtchn_mask = {
+ .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
+};
+static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
+
static inline unsigned long *cpu_evtchn_mask(int cpu)
{
return cpu_evtchn_mask_p[cpu].bits;
@@ -106,6 +125,8 @@ static inline unsigned long *cpu_evtchn_mask(int cpu)
#define VALID_EVTCHN(chn) ((chn) != 0)
static struct irq_chip xen_dynamic_chip;
+static struct irq_chip xen_percpu_chip;
+static struct irq_chip xen_pirq_chip;
/* Constructor for packed IRQ information. */
static struct irq_info mk_unbound_info(void)
@@ -131,11 +152,12 @@ static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq)
.cpu = 0, .u.virq = virq };
}
-static struct irq_info mk_pirq_info(unsigned short evtchn,
+static struct irq_info mk_pirq_info(unsigned short evtchn, unsigned short pirq,
unsigned short gsi, unsigned short vector)
{
return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
- .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } };
+ .cpu = 0,
+ .u.pirq = { .pirq = pirq, .gsi = gsi, .vector = vector } };
}
/*
@@ -148,6 +170,9 @@ static struct irq_info *info_for_irq(unsigned irq)
static unsigned int evtchn_from_irq(unsigned irq)
{
+ if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq)))
+ return 0;
+
return info_for_irq(irq)->evtchn;
}
@@ -177,6 +202,16 @@ static unsigned virq_from_irq(unsigned irq)
return info->u.virq;
}
+static unsigned pirq_from_irq(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info == NULL);
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ return info->u.pirq.pirq;
+}
+
static unsigned gsi_from_irq(unsigned irq)
{
struct irq_info *info = info_for_irq(irq);
@@ -218,6 +253,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn)
return ret;
}
+static bool pirq_needs_eoi(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ return info->u.pirq.flags & PIRQ_NEEDS_EOI;
+}
+
static inline unsigned long active_evtchns(unsigned int cpu,
struct shared_info *sh,
unsigned int idx)
@@ -236,17 +280,17 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
#endif
- __clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
- __set_bit(chn, cpu_evtchn_mask(cpu));
+ clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
+ set_bit(chn, cpu_evtchn_mask(cpu));
irq_info[irq].cpu = cpu;
}
static void init_evtchn_cpu_bindings(void)
{
+ int i;
#ifdef CONFIG_SMP
struct irq_desc *desc;
- int i;
/* By default all event channels notify CPU#0. */
for_each_irq_desc(i, desc) {
@@ -254,7 +298,10 @@ static void init_evtchn_cpu_bindings(void)
}
#endif
- memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0)));
+ for_each_possible_cpu(i)
+ memset(cpu_evtchn_mask(i),
+ (i == 0) ? ~0 : 0, sizeof(struct cpu_evtchn_s));
+
}
static inline void clear_evtchn(int port)
@@ -311,7 +358,7 @@ static void unmask_evtchn(int port)
struct evtchn_unmask unmask = { .port = port };
(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
} else {
- struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+ struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
sync_clear_bit(port, &s->evtchn_mask[0]);
@@ -329,27 +376,435 @@ static void unmask_evtchn(int port)
put_cpu();
}
-static int find_unbound_irq(void)
+static int get_nr_hw_irqs(void)
{
- int irq;
- struct irq_desc *desc;
+ int ret = 1;
- for (irq = 0; irq < nr_irqs; irq++)
- if (irq_info[irq].type == IRQT_UNBOUND)
+#ifdef CONFIG_X86_IO_APIC
+ ret = get_nr_irqs_gsi();
+#endif
+
+ return ret;
+}
+
+static int find_unbound_pirq(int type)
+{
+ int rc, i;
+ struct physdev_get_free_pirq op_get_free_pirq;
+ op_get_free_pirq.type = type;
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
+ if (!rc)
+ return op_get_free_pirq.pirq;
+
+ for (i = 0; i < nr_irqs; i++) {
+ if (pirq_to_irq[i] < 0)
+ return i;
+ }
+ return -1;
+}
+
+static int find_unbound_irq(void)
+{
+ struct irq_data *data;
+ int irq, res;
+ int bottom = get_nr_hw_irqs();
+ int top = nr_irqs-1;
+
+ if (bottom == nr_irqs)
+ goto no_irqs;
+
+ /* This loop starts from the top of IRQ space and goes down.
+ * We need this b/c if we have a PCI device in a Xen PV guest
+ * we do not have an IO-APIC (though the backend might have them)
+ * mapped in. To not have a collision of physical IRQs with the Xen
+ * event channels start at the top of the IRQ space for virtual IRQs.
+ */
+ for (irq = top; irq > bottom; irq--) {
+ data = irq_get_irq_data(irq);
+ /* only 15->0 have init'd desc; handle irq > 16 */
+ if (!data)
+ break;
+ if (data->chip == &no_irq_chip)
break;
+ if (data->chip != &xen_dynamic_chip)
+ continue;
+ if (irq_info[irq].type == IRQT_UNBOUND)
+ return irq;
+ }
+
+ if (irq == bottom)
+ goto no_irqs;
- if (irq == nr_irqs)
- panic("No available IRQ to bind to: increase nr_irqs!\n");
+ res = irq_alloc_desc_at(irq, -1);
- desc = irq_to_desc_alloc_node(irq, 0);
- if (WARN_ON(desc == NULL))
+ if (WARN_ON(res != irq))
return -1;
- dynamic_irq_init(irq);
+ return irq;
+
+no_irqs:
+ panic("No available IRQ to bind to: increase nr_irqs!\n");
+}
+
+static bool identity_mapped_irq(unsigned irq)
+{
+ /* identity map all the hardware irqs */
+ return irq < get_nr_hw_irqs();
+}
+
+static void pirq_unmask_notify(int irq)
+{
+ struct physdev_eoi eoi = { .irq = pirq_from_irq(irq) };
+
+ if (unlikely(pirq_needs_eoi(irq))) {
+ int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+ WARN_ON(rc);
+ }
+}
+
+static void pirq_query_unmask(int irq)
+{
+ struct physdev_irq_status_query irq_status;
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ irq_status.irq = pirq_from_irq(irq);
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+ irq_status.flags = 0;
+
+ info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
+ if (irq_status.flags & XENIRQSTAT_needs_eoi)
+ info->u.pirq.flags |= PIRQ_NEEDS_EOI;
+}
+
+static bool probing_irq(int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return desc && desc->action == NULL;
+}
+
+static unsigned int startup_pirq(unsigned int irq)
+{
+ struct evtchn_bind_pirq bind_pirq;
+ struct irq_info *info = info_for_irq(irq);
+ int evtchn = evtchn_from_irq(irq);
+ int rc;
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ if (VALID_EVTCHN(evtchn))
+ goto out;
+
+ bind_pirq.pirq = pirq_from_irq(irq);
+ /* NB. We are happy to share unless we are probing. */
+ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
+ BIND_PIRQ__WILL_SHARE : 0;
+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
+ if (rc != 0) {
+ if (!probing_irq(irq))
+ printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
+ irq);
+ return 0;
+ }
+ evtchn = bind_pirq.port;
+
+ pirq_query_unmask(irq);
+
+ evtchn_to_irq[evtchn] = irq;
+ bind_evtchn_to_cpu(evtchn, 0);
+ info->evtchn = evtchn;
+
+out:
+ unmask_evtchn(evtchn);
+ pirq_unmask_notify(irq);
+
+ return 0;
+}
+
+static void shutdown_pirq(unsigned int irq)
+{
+ struct evtchn_close close;
+ struct irq_info *info = info_for_irq(irq);
+ int evtchn = evtchn_from_irq(irq);
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ if (!VALID_EVTCHN(evtchn))
+ return;
+
+ mask_evtchn(evtchn);
+
+ close.port = evtchn;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+ BUG();
+
+ bind_evtchn_to_cpu(evtchn, 0);
+ evtchn_to_irq[evtchn] = -1;
+ info->evtchn = 0;
+}
+
+static void enable_pirq(unsigned int irq)
+{
+ startup_pirq(irq);
+}
+
+static void disable_pirq(unsigned int irq)
+{
+}
+
+static void ack_pirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ move_native_irq(irq);
+
+ if (VALID_EVTCHN(evtchn)) {
+ mask_evtchn(evtchn);
+ clear_evtchn(evtchn);
+ }
+}
+
+static void end_pirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (WARN_ON(!desc))
+ return;
+
+ if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
+ (IRQ_DISABLED|IRQ_PENDING)) {
+ shutdown_pirq(irq);
+ } else if (VALID_EVTCHN(evtchn)) {
+ unmask_evtchn(evtchn);
+ pirq_unmask_notify(irq);
+ }
+}
+
+static int find_irq_by_gsi(unsigned gsi)
+{
+ int irq;
+
+ for (irq = 0; irq < nr_irqs; irq++) {
+ struct irq_info *info = info_for_irq(irq);
+
+ if (info == NULL || info->type != IRQT_PIRQ)
+ continue;
+
+ if (gsi_from_irq(irq) == gsi)
+ return irq;
+ }
+
+ return -1;
+}
+
+int xen_allocate_pirq(unsigned gsi, int shareable, char *name)
+{
+ return xen_map_pirq_gsi(gsi, gsi, shareable, name);
+}
+
+/* xen_map_pirq_gsi might allocate irqs from the top down, as a
+ * consequence don't assume that the irq number returned has a low value
+ * or can be used as a pirq number unless you know otherwise.
+ *
+ * One notable exception is when xen_map_pirq_gsi is called passing an
+ * hardware gsi as argument, in that case the irq number returned
+ * matches the gsi number passed as second argument.
+ *
+ * Note: We don't assign an event channel until the irq actually started
+ * up. Return an existing irq if we've already got one for the gsi.
+ */
+int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
+{
+ int irq = 0;
+ struct physdev_irq irq_op;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ if ((pirq > nr_irqs) || (gsi > nr_irqs)) {
+ printk(KERN_WARNING "xen_map_pirq_gsi: %s %s is incorrect!\n",
+ pirq > nr_irqs ? "pirq" :"",
+ gsi > nr_irqs ? "gsi" : "");
+ goto out;
+ }
+
+ irq = find_irq_by_gsi(gsi);
+ if (irq != -1) {
+ printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n",
+ irq, gsi);
+ goto out; /* XXX need refcount? */
+ }
+
+ /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore
+ * we are using the !xen_initial_domain() to drop in the function.*/
+ if (identity_mapped_irq(gsi) || (!xen_initial_domain() &&
+ xen_pv_domain())) {
+ irq = gsi;
+ irq_alloc_desc_at(irq, -1);
+ } else
+ irq = find_unbound_irq();
+
+ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
+ handle_level_irq, name);
+
+ irq_op.irq = irq;
+ irq_op.vector = 0;
+
+ /* Only the privileged domain can do this. For non-priv, the pcifront
+ * driver provides a PCI bus that does the call to do exactly
+ * this in the priv domain. */
+ if (xen_initial_domain() &&
+ HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
+ irq_free_desc(irq);
+ irq = -ENOSPC;
+ goto out;
+ }
+
+ irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector);
+ irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
+ pirq_to_irq[pirq] = irq;
+
+out:
+ spin_unlock(&irq_mapping_update_lock);
return irq;
}
+#ifdef CONFIG_PCI_MSI
+#include <linux/msi.h>
+#include "../pci/msi.h"
+
+void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc)
+{
+ spin_lock(&irq_mapping_update_lock);
+
+ if (alloc & XEN_ALLOC_IRQ) {
+ *irq = find_unbound_irq();
+ if (*irq == -1)
+ goto out;
+ }
+
+ if (alloc & XEN_ALLOC_PIRQ) {
+ *pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI);
+ if (*pirq == -1)
+ goto out;
+ }
+
+ set_irq_chip_and_handler_name(*irq, &xen_pirq_chip,
+ handle_level_irq, name);
+
+ irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0);
+ pirq_to_irq[*pirq] = *irq;
+
+out:
+ spin_unlock(&irq_mapping_update_lock);
+}
+
+int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
+{
+ int irq = -1;
+ struct physdev_map_pirq map_irq;
+ int rc;
+ int pos;
+ u32 table_offset, bir;
+
+ memset(&map_irq, 0, sizeof(map_irq));
+ map_irq.domid = DOMID_SELF;
+ map_irq.type = MAP_PIRQ_TYPE_MSI;
+ map_irq.index = -1;
+ map_irq.pirq = -1;
+ map_irq.bus = dev->bus->number;
+ map_irq.devfn = dev->devfn;
+
+ if (type == PCI_CAP_ID_MSIX) {
+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+
+ pci_read_config_dword(dev, msix_table_offset_reg(pos),
+ &table_offset);
+ bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+
+ map_irq.table_base = pci_resource_start(dev, bir);
+ map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+ }
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = find_unbound_irq();
+
+ if (irq == -1)
+ goto out;
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+ if (rc) {
+ printk(KERN_WARNING "xen map irq failed %d\n", rc);
+
+ irq_free_desc(irq);
+
+ irq = -1;
+ goto out;
+ }
+ irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index);
+
+ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
+ handle_level_irq,
+ (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
+
+out:
+ spin_unlock(&irq_mapping_update_lock);
+ return irq;
+}
+#endif
+
+int xen_destroy_irq(int irq)
+{
+ struct irq_desc *desc;
+ struct physdev_unmap_pirq unmap_irq;
+ struct irq_info *info = info_for_irq(irq);
+ int rc = -ENOENT;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ desc = irq_to_desc(irq);
+ if (!desc)
+ goto out;
+
+ if (xen_initial_domain()) {
+ unmap_irq.pirq = info->u.pirq.pirq;
+ unmap_irq.domid = DOMID_SELF;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
+ if (rc) {
+ printk(KERN_WARNING "unmap irq failed %d\n", rc);
+ goto out;
+ }
+ pirq_to_irq[info->u.pirq.pirq] = -1;
+ }
+ irq_info[irq] = mk_unbound_info();
+
+ irq_free_desc(irq);
+
+out:
+ spin_unlock(&irq_mapping_update_lock);
+ return rc;
+}
+
+int xen_vector_from_irq(unsigned irq)
+{
+ return vector_from_irq(irq);
+}
+
+int xen_gsi_from_irq(unsigned irq)
+{
+ return gsi_from_irq(irq);
+}
+
+int xen_irq_from_pirq(unsigned pirq)
+{
+ return pirq_to_irq[pirq];
+}
+
int bind_evtchn_to_irq(unsigned int evtchn)
{
int irq;
@@ -362,7 +817,7 @@ int bind_evtchn_to_irq(unsigned int evtchn)
irq = find_unbound_irq();
set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "event");
+ handle_fasteoi_irq, "event");
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_evtchn_info(evtchn);
@@ -388,8 +843,8 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
if (irq < 0)
goto out;
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "ipi");
+ set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
+ handle_percpu_irq, "ipi");
bind_ipi.vcpu = cpu;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
@@ -410,7 +865,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
}
-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
{
struct evtchn_bind_virq bind_virq;
int evtchn, irq;
@@ -420,6 +875,11 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
irq = per_cpu(virq_to_irq, cpu)[virq];
if (irq == -1) {
+ irq = find_unbound_irq();
+
+ set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
+ handle_percpu_irq, "virq");
+
bind_virq.virq = virq;
bind_virq.vcpu = cpu;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
@@ -427,11 +887,6 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
BUG();
evtchn = bind_virq.port;
- irq = find_unbound_irq();
-
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "virq");
-
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_virq_info(evtchn, virq);
@@ -474,9 +929,12 @@ static void unbind_from_irq(unsigned int irq)
bind_evtchn_to_cpu(evtchn, 0);
evtchn_to_irq[evtchn] = -1;
+ }
+
+ if (irq_info[irq].type != IRQT_UNBOUND) {
irq_info[irq] = mk_unbound_info();
- dynamic_irq_cleanup(irq);
+ irq_free_desc(irq);
}
spin_unlock(&irq_mapping_update_lock);
@@ -532,6 +990,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
if (irq < 0)
return irq;
+ irqflags |= IRQF_NO_SUSPEND;
retval = request_irq(irq, handler, irqflags, devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
@@ -559,41 +1018,75 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
{
struct shared_info *sh = HYPERVISOR_shared_info;
int cpu = smp_processor_id();
+ unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu);
int i;
unsigned long flags;
static DEFINE_SPINLOCK(debug_lock);
+ struct vcpu_info *v;
spin_lock_irqsave(&debug_lock, flags);
- printk("vcpu %d\n ", cpu);
+ printk("\nvcpu %d\n ", cpu);
for_each_online_cpu(i) {
- struct vcpu_info *v = per_cpu(xen_vcpu, i);
- printk("%d: masked=%d pending=%d event_sel %08lx\n ", i,
- (get_irq_regs() && i == cpu) ? xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask,
- v->evtchn_upcall_pending,
- v->evtchn_pending_sel);
- }
- printk("pending:\n ");
- for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
- printk("%08lx%s", sh->evtchn_pending[i],
- i % 8 == 0 ? "\n " : " ");
- printk("\nmasks:\n ");
- for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
- printk("%08lx%s", sh->evtchn_mask[i],
- i % 8 == 0 ? "\n " : " ");
-
- printk("\nunmasked:\n ");
- for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
- printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
- i % 8 == 0 ? "\n " : " ");
+ int pending;
+ v = per_cpu(xen_vcpu, i);
+ pending = (get_irq_regs() && i == cpu)
+ ? xen_irqs_disabled(get_irq_regs())
+ : v->evtchn_upcall_mask;
+ printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i,
+ pending, v->evtchn_upcall_pending,
+ (int)(sizeof(v->evtchn_pending_sel)*2),
+ v->evtchn_pending_sel);
+ }
+ v = per_cpu(xen_vcpu, cpu);
+
+ printk("\npending:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
+ printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2,
+ sh->evtchn_pending[i],
+ i % 8 == 0 ? "\n " : " ");
+ printk("\nglobal mask:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%0*lx%s",
+ (int)(sizeof(sh->evtchn_mask[0])*2),
+ sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nglobally unmasked:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
+ sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nlocal cpu%d mask:\n ", cpu);
+ for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--)
+ printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
+ cpu_evtchn[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nlocally unmasked:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) {
+ unsigned long pending = sh->evtchn_pending[i]
+ & ~sh->evtchn_mask[i]
+ & cpu_evtchn[i];
+ printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
+ pending, i % 8 == 0 ? "\n " : " ");
+ }
printk("\npending list:\n");
- for(i = 0; i < NR_EVENT_CHANNELS; i++) {
+ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
if (sync_test_bit(i, sh->evtchn_pending)) {
- printk(" %d: event %d -> irq %d\n",
+ int word_idx = i / BITS_PER_LONG;
+ printk(" %d: event %d -> irq %d%s%s%s\n",
cpu_from_evtchn(i), i,
- evtchn_to_irq[i]);
+ evtchn_to_irq[i],
+ sync_test_bit(word_idx, &v->evtchn_pending_sel)
+ ? "" : " l2-clear",
+ !sync_test_bit(i, sh->evtchn_mask)
+ ? "" : " globally-masked",
+ sync_test_bit(i, cpu_evtchn)
+ ? "" : " locally-masked");
}
}
@@ -602,6 +1095,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
+static DEFINE_PER_CPU(unsigned, xed_nesting_count);
+
/*
* Search the CPUs pending events bitmasks. For each one found, map
* the event number to an irq, and feed it into do_IRQ() for
@@ -611,24 +1106,19 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
* a bitset of words which contain pending event bits. The second
* level is a bitset of pending events themselves.
*/
-void xen_evtchn_do_upcall(struct pt_regs *regs)
+static void __xen_evtchn_do_upcall(void)
{
int cpu = get_cpu();
- struct pt_regs *old_regs = set_irq_regs(regs);
struct shared_info *s = HYPERVISOR_shared_info;
- struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
- static DEFINE_PER_CPU(unsigned, nesting_count);
+ struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
unsigned count;
- exit_idle();
- irq_enter();
-
do {
unsigned long pending_words;
vcpu_info->evtchn_upcall_pending = 0;
- if (__get_cpu_var(nesting_count)++)
+ if (__this_cpu_inc_return(xed_nesting_count) - 1)
goto out;
#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
@@ -645,24 +1135,48 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
int bit_idx = __ffs(pending_bits);
int port = (word_idx * BITS_PER_LONG) + bit_idx;
int irq = evtchn_to_irq[port];
+ struct irq_desc *desc;
- if (irq != -1)
- handle_irq(irq, regs);
+ mask_evtchn(port);
+ clear_evtchn(port);
+
+ if (irq != -1) {
+ desc = irq_to_desc(irq);
+ if (desc)
+ generic_handle_irq_desc(irq, desc);
+ }
}
}
BUG_ON(!irqs_disabled());
- count = __get_cpu_var(nesting_count);
- __get_cpu_var(nesting_count) = 0;
- } while(count != 1);
+ count = __this_cpu_read(xed_nesting_count);
+ __this_cpu_write(xed_nesting_count, 0);
+ } while (count != 1 || vcpu_info->evtchn_upcall_pending);
out:
+
+ put_cpu();
+}
+
+void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ exit_idle();
+ irq_enter();
+
+ __xen_evtchn_do_upcall();
+
irq_exit();
set_irq_regs(old_regs);
+}
- put_cpu();
+void xen_hvm_evtchn_do_upcall(void)
+{
+ __xen_evtchn_do_upcall();
}
+EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
/* Rebind a new event channel to an existing irq. */
void rebind_evtchn_irq(int evtchn, int irq)
@@ -699,7 +1213,10 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
struct evtchn_bind_vcpu bind_vcpu;
int evtchn = evtchn_from_irq(irq);
- if (!VALID_EVTCHN(evtchn))
+ /* events delivered via platform PCI interrupts are always
+ * routed to vcpu 0 */
+ if (!VALID_EVTCHN(evtchn) ||
+ (xen_hvm_domain() && !xen_have_vector_callback))
return -1;
/* Send future instances of this interrupt to other vcpu. */
@@ -760,10 +1277,10 @@ static void ack_dynirq(unsigned int irq)
{
int evtchn = evtchn_from_irq(irq);
- move_native_irq(irq);
+ move_masked_irq(irq);
if (VALID_EVTCHN(evtchn))
- clear_evtchn(evtchn);
+ unmask_evtchn(evtchn);
}
static int retrigger_dynirq(unsigned int irq)
@@ -785,6 +1302,42 @@ static int retrigger_dynirq(unsigned int irq)
return ret;
}
+static void restore_cpu_pirqs(void)
+{
+ int pirq, rc, irq, gsi;
+ struct physdev_map_pirq map_irq;
+
+ for (pirq = 0; pirq < nr_irqs; pirq++) {
+ irq = pirq_to_irq[pirq];
+ if (irq == -1)
+ continue;
+
+ /* save/restore of PT devices doesn't work, so at this point the
+ * only devices present are GSI based emulated devices */
+ gsi = gsi_from_irq(irq);
+ if (!gsi)
+ continue;
+
+ map_irq.domid = DOMID_SELF;
+ map_irq.type = MAP_PIRQ_TYPE_GSI;
+ map_irq.index = gsi;
+ map_irq.pirq = pirq;
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+ if (rc) {
+ printk(KERN_WARNING "xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n",
+ gsi, irq, pirq, rc);
+ irq_info[irq] = mk_unbound_info();
+ pirq_to_irq[pirq] = -1;
+ continue;
+ }
+
+ printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
+
+ startup_pirq(irq);
+ }
+}
+
static void restore_cpu_virqs(unsigned int cpu)
{
struct evtchn_bind_virq bind_virq;
@@ -808,9 +1361,6 @@ static void restore_cpu_virqs(unsigned int cpu)
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_virq_info(evtchn, virq);
bind_evtchn_to_cpu(evtchn, cpu);
-
- /* Ready for use. */
- unmask_evtchn(evtchn);
}
}
@@ -836,10 +1386,6 @@ static void restore_cpu_ipis(unsigned int cpu)
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_ipi_info(evtchn, ipi);
bind_evtchn_to_cpu(evtchn, cpu);
-
- /* Ready for use. */
- unmask_evtchn(evtchn);
-
}
}
@@ -851,7 +1397,7 @@ void xen_clear_irq_pending(int irq)
if (VALID_EVTCHN(evtchn))
clear_evtchn(evtchn);
}
-
+EXPORT_SYMBOL(xen_clear_irq_pending);
void xen_set_irq_pending(int irq)
{
int evtchn = evtchn_from_irq(irq);
@@ -871,9 +1417,9 @@ bool xen_test_irq_pending(int irq)
return ret;
}
-/* Poll waiting for an irq to become pending. In the usual case, the
- irq will be disabled so it won't deliver an interrupt. */
-void xen_poll_irq(int irq)
+/* Poll waiting for an irq to become pending with timeout. In the usual case,
+ * the irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq_timeout(int irq, u64 timeout)
{
evtchn_port_t evtchn = evtchn_from_irq(irq);
@@ -881,17 +1427,25 @@ void xen_poll_irq(int irq)
struct sched_poll poll;
poll.nr_ports = 1;
- poll.timeout = 0;
+ poll.timeout = timeout;
set_xen_guest_handle(poll.ports, &evtchn);
if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
BUG();
}
}
+EXPORT_SYMBOL(xen_poll_irq_timeout);
+/* Poll waiting for an irq to become pending. In the usual case, the
+ * irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq)
+{
+ xen_poll_irq_timeout(irq, 0 /* no timeout */);
+}
void xen_irq_resume(void)
{
unsigned int cpu, irq, evtchn;
+ struct irq_desc *desc;
init_evtchn_cpu_bindings();
@@ -910,6 +1464,25 @@ void xen_irq_resume(void)
restore_cpu_virqs(cpu);
restore_cpu_ipis(cpu);
}
+
+ /*
+ * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These
+ * are not handled by the IRQ core.
+ */
+ for_each_irq_desc(irq, desc) {
+ if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND))
+ continue;
+ if (desc->status & IRQ_DISABLED)
+ continue;
+
+ evtchn = evtchn_from_irq(irq);
+ if (evtchn == -1)
+ continue;
+
+ unmask_evtchn(evtchn);
+ }
+
+ restore_cpu_pirqs();
}
static struct irq_chip xen_dynamic_chip __read_mostly = {
@@ -919,18 +1492,98 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
.mask = disable_dynirq,
.unmask = enable_dynirq,
- .ack = ack_dynirq,
+ .eoi = ack_dynirq,
+ .set_affinity = set_affinity_irq,
+ .retrigger = retrigger_dynirq,
+};
+
+static struct irq_chip xen_pirq_chip __read_mostly = {
+ .name = "xen-pirq",
+
+ .startup = startup_pirq,
+ .shutdown = shutdown_pirq,
+
+ .enable = enable_pirq,
+ .unmask = enable_pirq,
+
+ .disable = disable_pirq,
+ .mask = disable_pirq,
+
+ .ack = ack_pirq,
+ .end = end_pirq,
+
.set_affinity = set_affinity_irq,
+
.retrigger = retrigger_dynirq,
};
+static struct irq_chip xen_percpu_chip __read_mostly = {
+ .name = "xen-percpu",
+
+ .disable = disable_dynirq,
+ .mask = disable_dynirq,
+ .unmask = enable_dynirq,
+
+ .ack = ack_dynirq,
+};
+
+int xen_set_callback_via(uint64_t via)
+{
+ struct xen_hvm_param a;
+ a.domid = DOMID_SELF;
+ a.index = HVM_PARAM_CALLBACK_IRQ;
+ a.value = via;
+ return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
+}
+EXPORT_SYMBOL_GPL(xen_set_callback_via);
+
+#ifdef CONFIG_XEN_PVHVM
+/* Vector callbacks are better than PCI interrupts to receive event
+ * channel notifications because we can receive vector callbacks on any
+ * vcpu and we don't need PCI support or APIC interactions. */
+void xen_callback_vector(void)
+{
+ int rc;
+ uint64_t callback_via;
+ if (xen_have_vector_callback) {
+ callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK);
+ rc = xen_set_callback_via(callback_via);
+ if (rc) {
+ printk(KERN_ERR "Request for Xen HVM callback vector"
+ " failed.\n");
+ xen_have_vector_callback = 0;
+ return;
+ }
+ printk(KERN_INFO "Xen HVM callback vector for event delivery is "
+ "enabled\n");
+ /* in the restore case the vector has already been allocated */
+ if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors))
+ alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector);
+ }
+}
+#else
+void xen_callback_vector(void) {}
+#endif
+
void __init xen_init_IRQ(void)
{
int i;
cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
GFP_KERNEL);
- BUG_ON(cpu_evtchn_mask_p == NULL);
+ irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL);
+
+ /* We are using nr_irqs as the maximum number of pirq available but
+ * that number is actually chosen by Xen and we don't know exactly
+ * what it is. Be careful choosing high pirq numbers. */
+ pirq_to_irq = kcalloc(nr_irqs, sizeof(*pirq_to_irq), GFP_KERNEL);
+ for (i = 0; i < nr_irqs; i++)
+ pirq_to_irq[i] = -1;
+
+ evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
+ GFP_KERNEL);
+ for (i = 0; i < NR_EVENT_CHANNELS; i++)
+ evtchn_to_irq[i] = -1;
init_evtchn_cpu_bindings();
@@ -938,5 +1591,15 @@ void __init xen_init_IRQ(void)
for (i = 0; i < NR_EVENT_CHANNELS; i++)
mask_evtchn(i);
- irq_ctx_init(smp_processor_id());
+ if (xen_hvm_domain()) {
+ xen_callback_vector();
+ native_init_IRQ();
+ /* pci_xen_hvm_init must be called after native_init_IRQ so that
+ * __acpi_register_gsi can point at the right function */
+ pci_xen_hvm_init();
+ } else {
+ irq_ctx_init(smp_processor_id());
+ if (xen_initial_domain())
+ xen_setup_pirqs();
+ }
}
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index af031950f9b1..ef11daf0cafe 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -38,7 +38,6 @@
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/fs.h>
-#include <linux/errno.h>
#include <linux/miscdevice.h>
#include <linux/major.h>
#include <linux/proc_fs.h>
@@ -46,9 +45,10 @@
#include <linux/poll.h>
#include <linux/irq.h>
#include <linux/init.h>
-#include <linux/gfp.h>
#include <linux/mutex.h>
#include <linux/cpu.h>
+
+#include <xen/xen.h>
#include <xen/events.h>
#include <xen/evtchn.h>
#include <asm/xen/hypervisor.h>
@@ -69,20 +69,51 @@ struct per_user_data {
const char *name;
};
-/* Who's bound to each port? */
-static struct per_user_data *port_user[NR_EVENT_CHANNELS];
+/*
+ * Who's bound to each port? This is logically an array of struct
+ * per_user_data *, but we encode the current enabled-state in bit 0.
+ */
+static unsigned long *port_user;
static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
-irqreturn_t evtchn_interrupt(int irq, void *data)
+static inline struct per_user_data *get_port_user(unsigned port)
+{
+ return (struct per_user_data *)(port_user[port] & ~1);
+}
+
+static inline void set_port_user(unsigned port, struct per_user_data *u)
+{
+ port_user[port] = (unsigned long)u;
+}
+
+static inline bool get_port_enabled(unsigned port)
+{
+ return port_user[port] & 1;
+}
+
+static inline void set_port_enabled(unsigned port, bool enabled)
+{
+ if (enabled)
+ port_user[port] |= 1;
+ else
+ port_user[port] &= ~1;
+}
+
+static irqreturn_t evtchn_interrupt(int irq, void *data)
{
unsigned int port = (unsigned long)data;
struct per_user_data *u;
spin_lock(&port_user_lock);
- u = port_user[port];
+ u = get_port_user(port);
+
+ WARN(!get_port_enabled(port),
+ "Interrupt for port %d, but apparently not enabled; per-user %p\n",
+ port, u);
disable_irq_nosync(irq);
+ set_port_enabled(port, false);
if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
@@ -92,9 +123,8 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
kill_fasync(&u->evtchn_async_queue,
SIGIO, POLL_IN);
}
- } else {
+ } else
u->ring_overflow = 1;
- }
spin_unlock(&port_user_lock);
@@ -198,9 +228,18 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
goto out;
spin_lock_irq(&port_user_lock);
- for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
- if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
- enable_irq(irq_from_evtchn(kbuf[i]));
+
+ for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
+ unsigned port = kbuf[i];
+
+ if (port < NR_EVENT_CHANNELS &&
+ get_port_user(port) == u &&
+ !get_port_enabled(port)) {
+ set_port_enabled(port, true);
+ enable_irq(irq_from_evtchn(port));
+ }
+ }
+
spin_unlock_irq(&port_user_lock);
rc = count;
@@ -222,8 +261,9 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)
* interrupt handler yet, and our caller has already
* serialized bind operations.)
*/
- BUG_ON(port_user[port] != NULL);
- port_user[port] = u;
+ BUG_ON(get_port_user(port) != NULL);
+ set_port_user(port, u);
+ set_port_enabled(port, true); /* start enabled */
rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
u->name, (void *)(unsigned long)port);
@@ -239,10 +279,7 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port)
unbind_from_irqhandler(irq, (void *)(unsigned long)port);
- /* make sure we unbind the irq handler before clearing the port */
- barrier();
-
- port_user[port] = NULL;
+ set_port_user(port, NULL);
}
static long evtchn_ioctl(struct file *file,
@@ -333,15 +370,17 @@ static long evtchn_ioctl(struct file *file,
spin_lock_irq(&port_user_lock);
rc = -ENOTCONN;
- if (port_user[unbind.port] != u) {
+ if (get_port_user(unbind.port) != u) {
spin_unlock_irq(&port_user_lock);
break;
}
- evtchn_unbind_from_user(u, unbind.port);
+ disable_irq(irq_from_evtchn(unbind.port));
spin_unlock_irq(&port_user_lock);
+ evtchn_unbind_from_user(u, unbind.port);
+
rc = 0;
break;
}
@@ -355,7 +394,7 @@ static long evtchn_ioctl(struct file *file,
if (notify.port >= NR_EVENT_CHANNELS) {
rc = -EINVAL;
- } else if (port_user[notify.port] != u) {
+ } else if (get_port_user(notify.port) != u) {
rc = -ENOTCONN;
} else {
notify_remote_via_evtchn(notify.port);
@@ -431,7 +470,7 @@ static int evtchn_open(struct inode *inode, struct file *filp)
filp->private_data = u;
- return 0;
+ return nonseekable_open(inode, filp);;
}
static int evtchn_release(struct inode *inode, struct file *filp)
@@ -444,14 +483,21 @@ static int evtchn_release(struct inode *inode, struct file *filp)
free_page((unsigned long)u->ring);
for (i = 0; i < NR_EVENT_CHANNELS; i++) {
- if (port_user[i] != u)
+ if (get_port_user(i) != u)
continue;
- evtchn_unbind_from_user(port_user[i], i);
+ disable_irq(irq_from_evtchn(i));
}
spin_unlock_irq(&port_user_lock);
+ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+ if (get_port_user(i) != u)
+ continue;
+
+ evtchn_unbind_from_user(get_port_user(i), i);
+ }
+
kfree(u->name);
kfree(u);
@@ -467,11 +513,12 @@ static const struct file_operations evtchn_fops = {
.fasync = evtchn_fasync,
.open = evtchn_open,
.release = evtchn_release,
+ .llseek = no_llseek,
};
static struct miscdevice evtchn_miscdev = {
.minor = MISC_DYNAMIC_MINOR,
- .name = "evtchn",
+ .name = "xen/evtchn",
.fops = &evtchn_fops,
};
static int __init evtchn_init(void)
@@ -481,8 +528,11 @@ static int __init evtchn_init(void)
if (!xen_domain())
return -ENODEV;
+ port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL);
+ if (port_user == NULL)
+ return -ENOMEM;
+
spin_lock_init(&port_user_lock);
- memset(port_user, 0, sizeof(port_user));
/* Create '/dev/misc/evtchn'. */
err = misc_register(&evtchn_miscdev);
@@ -498,6 +548,9 @@ static int __init evtchn_init(void)
static void __exit evtchn_cleanup(void)
{
+ kfree(port_user);
+ port_user = NULL;
+
misc_deregister(&evtchn_miscdev);
}
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
new file mode 100644
index 000000000000..1e31cdcdae1e
--- /dev/null
+++ b/drivers/xen/gntdev.c
@@ -0,0 +1,665 @@
+/******************************************************************************
+ * gntdev.c
+ *
+ * Device for accessing (in user-space) pages that have been granted by other
+ * domains.
+ *
+ * Copyright (c) 2006-2007, D G Murray.
+ * (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#undef DEBUG
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_notifier.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+
+#include <xen/xen.h>
+#include <xen/grant_table.h>
+#include <xen/gntdev.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
+ "Gerd Hoffmann <kraxel@redhat.com>");
+MODULE_DESCRIPTION("User-space granted page access driver");
+
+static int limit = 1024;
+module_param(limit, int, 0644);
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at "
+ "once by a gntdev instance");
+
+struct gntdev_priv {
+ struct list_head maps;
+ uint32_t used;
+ uint32_t limit;
+ /* lock protects maps from concurrent changes */
+ spinlock_t lock;
+ struct mm_struct *mm;
+ struct mmu_notifier mn;
+};
+
+struct grant_map {
+ struct list_head next;
+ struct gntdev_priv *priv;
+ struct vm_area_struct *vma;
+ int index;
+ int count;
+ int flags;
+ int is_mapped;
+ struct ioctl_gntdev_grant_ref *grants;
+ struct gnttab_map_grant_ref *map_ops;
+ struct gnttab_unmap_grant_ref *unmap_ops;
+ struct page **pages;
+};
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_print_maps(struct gntdev_priv *priv,
+ char *text, int text_index)
+{
+#ifdef DEBUG
+ struct grant_map *map;
+
+ pr_debug("maps list (priv %p, usage %d/%d)\n",
+ priv, priv->used, priv->limit);
+
+ list_for_each_entry(map, &priv->maps, next)
+ pr_debug(" index %2d, count %2d %s\n",
+ map->index, map->count,
+ map->index == text_index && text ? text : "");
+#endif
+}
+
+static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
+{
+ struct grant_map *add;
+ int i;
+
+ add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
+ if (NULL == add)
+ return NULL;
+
+ add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL);
+ add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL);
+ add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
+ add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL);
+ if (NULL == add->grants ||
+ NULL == add->map_ops ||
+ NULL == add->unmap_ops ||
+ NULL == add->pages)
+ goto err;
+
+ for (i = 0; i < count; i++) {
+ add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (add->pages[i] == NULL)
+ goto err;
+ }
+
+ add->index = 0;
+ add->count = count;
+ add->priv = priv;
+
+ if (add->count + priv->used > priv->limit)
+ goto err;
+
+ return add;
+
+err:
+ if (add->pages)
+ for (i = 0; i < count; i++) {
+ if (add->pages[i])
+ __free_page(add->pages[i]);
+ }
+ kfree(add->pages);
+ kfree(add->grants);
+ kfree(add->map_ops);
+ kfree(add->unmap_ops);
+ kfree(add);
+ return NULL;
+}
+
+static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (add->index + add->count < map->index) {
+ list_add_tail(&add->next, &map->next);
+ goto done;
+ }
+ add->index = map->index + map->count;
+ }
+ list_add_tail(&add->next, &priv->maps);
+
+done:
+ priv->used += add->count;
+ gntdev_print_maps(priv, "[new]", add->index);
+}
+
+static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
+ int index, int count)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (map->index != index)
+ continue;
+ if (map->count != count)
+ continue;
+ return map;
+ }
+ return NULL;
+}
+
+static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
+ unsigned long vaddr)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ if (vaddr < map->vma->vm_start)
+ continue;
+ if (vaddr >= map->vma->vm_end)
+ continue;
+ return map;
+ }
+ return NULL;
+}
+
+static int gntdev_del_map(struct grant_map *map)
+{
+ int i;
+
+ if (map->vma)
+ return -EBUSY;
+ for (i = 0; i < map->count; i++)
+ if (map->unmap_ops[i].handle)
+ return -EBUSY;
+
+ map->priv->used -= map->count;
+ list_del(&map->next);
+ return 0;
+}
+
+static void gntdev_free_map(struct grant_map *map)
+{
+ int i;
+
+ if (!map)
+ return;
+
+ if (map->pages)
+ for (i = 0; i < map->count; i++) {
+ if (map->pages[i])
+ __free_page(map->pages[i]);
+ }
+ kfree(map->pages);
+ kfree(map->grants);
+ kfree(map->map_ops);
+ kfree(map->unmap_ops);
+ kfree(map);
+}
+
+/* ------------------------------------------------------------------ */
+
+static int find_grant_ptes(pte_t *pte, pgtable_t token,
+ unsigned long addr, void *data)
+{
+ struct grant_map *map = data;
+ unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+ u64 pte_maddr;
+
+ BUG_ON(pgnr >= map->count);
+ pte_maddr = arbitrary_virt_to_machine(pte).maddr;
+
+ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr,
+ GNTMAP_contains_pte | map->flags,
+ map->grants[pgnr].ref,
+ map->grants[pgnr].domid);
+ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr,
+ GNTMAP_contains_pte | map->flags,
+ 0 /* handle */);
+ return 0;
+}
+
+static int map_grant_pages(struct grant_map *map)
+{
+ int i, err = 0;
+
+ pr_debug("map %d+%d\n", map->index, map->count);
+ err = gnttab_map_refs(map->map_ops, map->pages, map->count);
+ if (err)
+ return err;
+
+ for (i = 0; i < map->count; i++) {
+ if (map->map_ops[i].status)
+ err = -EINVAL;
+ map->unmap_ops[i].handle = map->map_ops[i].handle;
+ }
+ return err;
+}
+
+static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+{
+ int i, err = 0;
+
+ pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
+ err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages);
+ if (err)
+ return err;
+
+ for (i = 0; i < pages; i++) {
+ if (map->unmap_ops[offset+i].status)
+ err = -EINVAL;
+ map->unmap_ops[offset+i].handle = 0;
+ }
+ return err;
+}
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_vma_close(struct vm_area_struct *vma)
+{
+ struct grant_map *map = vma->vm_private_data;
+
+ pr_debug("close %p\n", vma);
+ map->is_mapped = 0;
+ map->vma = NULL;
+ vma->vm_private_data = NULL;
+}
+
+static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n",
+ vmf->virtual_address, vmf->pgoff);
+ vmf->flags = VM_FAULT_ERROR;
+ return 0;
+}
+
+static struct vm_operations_struct gntdev_vmops = {
+ .close = gntdev_vma_close,
+ .fault = gntdev_vma_fault,
+};
+
+/* ------------------------------------------------------------------ */
+
+static void mn_invl_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+ struct grant_map *map;
+ unsigned long mstart, mend;
+ int err;
+
+ spin_lock(&priv->lock);
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ if (!map->is_mapped)
+ continue;
+ if (map->vma->vm_start >= end)
+ continue;
+ if (map->vma->vm_end <= start)
+ continue;
+ mstart = max(start, map->vma->vm_start);
+ mend = min(end, map->vma->vm_end);
+ pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
+ map->index, map->count,
+ map->vma->vm_start, map->vma->vm_end,
+ start, end, mstart, mend);
+ err = unmap_grant_pages(map,
+ (mstart - map->vma->vm_start) >> PAGE_SHIFT,
+ (mend - mstart) >> PAGE_SHIFT);
+ WARN_ON(err);
+ }
+ spin_unlock(&priv->lock);
+}
+
+static void mn_invl_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
+}
+
+static void mn_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+ struct grant_map *map;
+ int err;
+
+ spin_lock(&priv->lock);
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ pr_debug("map %d+%d (%lx %lx)\n",
+ map->index, map->count,
+ map->vma->vm_start, map->vma->vm_end);
+ err = unmap_grant_pages(map, /* offset */ 0, map->count);
+ WARN_ON(err);
+ }
+ spin_unlock(&priv->lock);
+}
+
+struct mmu_notifier_ops gntdev_mmu_ops = {
+ .release = mn_release,
+ .invalidate_page = mn_invl_page,
+ .invalidate_range_start = mn_invl_range_start,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int gntdev_open(struct inode *inode, struct file *flip)
+{
+ struct gntdev_priv *priv;
+ int ret = 0;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&priv->maps);
+ spin_lock_init(&priv->lock);
+ priv->limit = limit;
+
+ priv->mm = get_task_mm(current);
+ if (!priv->mm) {
+ kfree(priv);
+ return -ENOMEM;
+ }
+ priv->mn.ops = &gntdev_mmu_ops;
+ ret = mmu_notifier_register(&priv->mn, priv->mm);
+ mmput(priv->mm);
+
+ if (ret) {
+ kfree(priv);
+ return ret;
+ }
+
+ flip->private_data = priv;
+ pr_debug("priv %p\n", priv);
+
+ return 0;
+}
+
+static int gntdev_release(struct inode *inode, struct file *flip)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ struct grant_map *map;
+ int err;
+
+ pr_debug("priv %p\n", priv);
+
+ spin_lock(&priv->lock);
+ while (!list_empty(&priv->maps)) {
+ map = list_entry(priv->maps.next, struct grant_map, next);
+ err = gntdev_del_map(map);
+ if (WARN_ON(err))
+ gntdev_free_map(map);
+
+ }
+ spin_unlock(&priv->lock);
+
+ mmu_notifier_unregister(&priv->mn, priv->mm);
+ kfree(priv);
+ return 0;
+}
+
+static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
+ struct ioctl_gntdev_map_grant_ref __user *u)
+{
+ struct ioctl_gntdev_map_grant_ref op;
+ struct grant_map *map;
+ int err;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ pr_debug("priv %p, add %d\n", priv, op.count);
+ if (unlikely(op.count <= 0))
+ return -EINVAL;
+ if (unlikely(op.count > priv->limit))
+ return -EINVAL;
+
+ err = -ENOMEM;
+ map = gntdev_alloc_map(priv, op.count);
+ if (!map)
+ return err;
+ if (copy_from_user(map->grants, &u->refs,
+ sizeof(map->grants[0]) * op.count) != 0) {
+ gntdev_free_map(map);
+ return err;
+ }
+
+ spin_lock(&priv->lock);
+ gntdev_add_map(priv, map);
+ op.index = map->index << PAGE_SHIFT;
+ spin_unlock(&priv->lock);
+
+ if (copy_to_user(u, &op, sizeof(op)) != 0) {
+ spin_lock(&priv->lock);
+ gntdev_del_map(map);
+ spin_unlock(&priv->lock);
+ gntdev_free_map(map);
+ return err;
+ }
+ return 0;
+}
+
+static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
+ struct ioctl_gntdev_unmap_grant_ref __user *u)
+{
+ struct ioctl_gntdev_unmap_grant_ref op;
+ struct grant_map *map;
+ int err = -ENOENT;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
+ if (map)
+ err = gntdev_del_map(map);
+ spin_unlock(&priv->lock);
+ if (!err)
+ gntdev_free_map(map);
+ return err;
+}
+
+static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
+ struct ioctl_gntdev_get_offset_for_vaddr __user *u)
+{
+ struct ioctl_gntdev_get_offset_for_vaddr op;
+ struct grant_map *map;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_vaddr(priv, op.vaddr);
+ if (map == NULL ||
+ map->vma->vm_start != op.vaddr) {
+ spin_unlock(&priv->lock);
+ return -EINVAL;
+ }
+ op.offset = map->index << PAGE_SHIFT;
+ op.count = map->count;
+ spin_unlock(&priv->lock);
+
+ if (copy_to_user(u, &op, sizeof(op)) != 0)
+ return -EFAULT;
+ return 0;
+}
+
+static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
+ struct ioctl_gntdev_set_max_grants __user *u)
+{
+ struct ioctl_gntdev_set_max_grants op;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ pr_debug("priv %p, limit %d\n", priv, op.count);
+ if (op.count > limit)
+ return -E2BIG;
+
+ spin_lock(&priv->lock);
+ priv->limit = op.count;
+ spin_unlock(&priv->lock);
+ return 0;
+}
+
+static long gntdev_ioctl(struct file *flip,
+ unsigned int cmd, unsigned long arg)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ void __user *ptr = (void __user *)arg;
+
+ switch (cmd) {
+ case IOCTL_GNTDEV_MAP_GRANT_REF:
+ return gntdev_ioctl_map_grant_ref(priv, ptr);
+
+ case IOCTL_GNTDEV_UNMAP_GRANT_REF:
+ return gntdev_ioctl_unmap_grant_ref(priv, ptr);
+
+ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
+ return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
+
+ case IOCTL_GNTDEV_SET_MAX_GRANTS:
+ return gntdev_ioctl_set_max_grants(priv, ptr);
+
+ default:
+ pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+
+static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ int index = vma->vm_pgoff;
+ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ struct grant_map *map;
+ int err = -EINVAL;
+
+ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ pr_debug("map %d+%d at %lx (pgoff %lx)\n",
+ index, count, vma->vm_start, vma->vm_pgoff);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_index(priv, index, count);
+ if (!map)
+ goto unlock_out;
+ if (map->vma)
+ goto unlock_out;
+ if (priv->mm != vma->vm_mm) {
+ printk(KERN_WARNING "Huh? Other mm?\n");
+ goto unlock_out;
+ }
+
+ vma->vm_ops = &gntdev_vmops;
+
+ vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP;
+
+ vma->vm_private_data = map;
+ map->vma = vma;
+
+ map->flags = GNTMAP_host_map | GNTMAP_application_map;
+ if (!(vma->vm_flags & VM_WRITE))
+ map->flags |= GNTMAP_readonly;
+
+ spin_unlock(&priv->lock);
+
+ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
+ vma->vm_end - vma->vm_start,
+ find_grant_ptes, map);
+ if (err) {
+ printk(KERN_WARNING "find_grant_ptes() failure.\n");
+ return err;
+ }
+
+ err = map_grant_pages(map);
+ if (err) {
+ printk(KERN_WARNING "map_grant_pages() failure.\n");
+ return err;
+ }
+
+ map->is_mapped = 1;
+
+ return 0;
+
+unlock_out:
+ spin_unlock(&priv->lock);
+ return err;
+}
+
+static const struct file_operations gntdev_fops = {
+ .owner = THIS_MODULE,
+ .open = gntdev_open,
+ .release = gntdev_release,
+ .mmap = gntdev_mmap,
+ .unlocked_ioctl = gntdev_ioctl
+};
+
+static struct miscdevice gntdev_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/gntdev",
+ .fops = &gntdev_fops,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int __init gntdev_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ err = misc_register(&gntdev_miscdev);
+ if (err != 0) {
+ printk(KERN_ERR "Could not register gntdev device\n");
+ return err;
+ }
+ return 0;
+}
+
+static void __exit gntdev_exit(void)
+{
+ misc_deregister(&gntdev_miscdev);
+}
+
+module_init(gntdev_init);
+module_exit(gntdev_exit);
+
+/* ------------------------------------------------------------------ */
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 7d8f531fb8e8..9ef54ebc1194 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -34,12 +34,16 @@
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
+#include <xen/interface/memory.h>
#include <asm/xen/hypercall.h>
#include <asm/pgtable.h>
@@ -57,6 +61,8 @@ static unsigned int boot_max_nr_grant_frames;
static int gnttab_free_count;
static grant_ref_t gnttab_free_head;
static DEFINE_SPINLOCK(gnttab_list_lock);
+unsigned long xen_hvm_resume_frames;
+EXPORT_SYMBOL_GPL(xen_hvm_resume_frames);
static struct grant_entry *shared;
@@ -431,7 +437,7 @@ static unsigned int __max_nr_grant_frames(void)
return query.max_nr_frames;
}
-static inline unsigned int max_nr_grant_frames(void)
+unsigned int gnttab_max_grant_frames(void)
{
unsigned int xen_max = __max_nr_grant_frames();
@@ -439,6 +445,53 @@ static inline unsigned int max_nr_grant_frames(void)
return boot_max_nr_grant_frames;
return xen_max;
}
+EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
+
+int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret;
+ pte_t *pte;
+ unsigned long mfn;
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < count; i++) {
+ /* m2p override only supported for GNTMAP_contains_pte mappings */
+ if (!(map_ops[i].flags & GNTMAP_contains_pte))
+ continue;
+ pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+ (map_ops[i].host_addr & ~PAGE_MASK));
+ mfn = pte_mfn(*pte);
+ ret = m2p_add_override(mfn, pages[i]);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_map_refs);
+
+int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret;
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < count; i++) {
+ ret = m2p_remove_override(pages[i]);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
{
@@ -447,6 +500,30 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
unsigned int nr_gframes = end_idx + 1;
int rc;
+ if (xen_hvm_domain()) {
+ struct xen_add_to_physmap xatp;
+ unsigned int i = end_idx;
+ rc = 0;
+ /*
+ * Loop backwards, so that the first hypercall has the largest
+ * index, ensuring that the table will grow only once.
+ */
+ do {
+ xatp.domid = DOMID_SELF;
+ xatp.idx = i;
+ xatp.space = XENMAPSPACE_grant_table;
+ xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i;
+ rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
+ if (rc != 0) {
+ printk(KERN_WARNING
+ "grant table add_to_physmap failed, err=%d\n", rc);
+ break;
+ }
+ } while (i-- > start_idx);
+
+ return rc;
+ }
+
frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
if (!frames)
return -ENOMEM;
@@ -463,7 +540,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
BUG_ON(rc || setup.status);
- rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(),
+ rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(),
&shared);
BUG_ON(rc);
@@ -474,9 +551,27 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
int gnttab_resume(void)
{
- if (max_nr_grant_frames() < nr_grant_frames)
+ unsigned int max_nr_gframes;
+
+ max_nr_gframes = gnttab_max_grant_frames();
+ if (max_nr_gframes < nr_grant_frames)
return -ENOSYS;
- return gnttab_map(0, nr_grant_frames - 1);
+
+ if (xen_pv_domain())
+ return gnttab_map(0, nr_grant_frames - 1);
+
+ if (!shared) {
+ shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes);
+ if (shared == NULL) {
+ printk(KERN_WARNING
+ "Failed to ioremap gnttab share frames!");
+ return -ENOMEM;
+ }
+ }
+
+ gnttab_map(0, nr_grant_frames - 1);
+
+ return 0;
}
int gnttab_suspend(void)
@@ -493,7 +588,7 @@ static int gnttab_expand(unsigned int req_entries)
cur = nr_grant_frames;
extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) /
GREFS_PER_GRANT_FRAME);
- if (cur + extra > max_nr_grant_frames())
+ if (cur + extra > gnttab_max_grant_frames())
return -ENOSPC;
rc = gnttab_map(cur, cur + extra - 1);
@@ -503,15 +598,12 @@ static int gnttab_expand(unsigned int req_entries)
return rc;
}
-static int __devinit gnttab_init(void)
+int gnttab_init(void)
{
int i;
unsigned int max_nr_glist_frames, nr_glist_frames;
unsigned int nr_init_grefs;
- if (!xen_domain())
- return -ENODEV;
-
nr_grant_frames = 1;
boot_max_nr_grant_frames = __max_nr_grant_frames();
@@ -554,5 +646,18 @@ static int __devinit gnttab_init(void)
kfree(gnttab_list);
return -ENOMEM;
}
+EXPORT_SYMBOL_GPL(gnttab_init);
+
+static int __devinit __gnttab_init(void)
+{
+ /* Delay grant-table initialization in the PV on HVM case */
+ if (xen_hvm_domain())
+ return 0;
+
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+ return gnttab_init();
+}
-core_initcall(gnttab_init);
+core_initcall(__gnttab_init);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 10d03d7931c4..db8c4c4ac880 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -3,11 +3,13 @@
*/
#include <linux/kernel.h>
#include <linux/err.h>
+#include <linux/slab.h>
#include <linux/reboot.h>
#include <linux/sysrq.h>
#include <linux/stop_machine.h>
#include <linux/freezer.h>
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/grant_table.h>
#include <xen/events.h>
@@ -16,6 +18,7 @@
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
+#include <asm/xen/hypervisor.h>
enum shutdown_state {
SHUTDOWN_INVALID = -1,
@@ -32,10 +35,31 @@ enum shutdown_state {
static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
#ifdef CONFIG_PM_SLEEP
-static int xen_suspend(void *data)
+static int xen_hvm_suspend(void *data)
{
+ struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
int *cancelled = data;
+
+ BUG_ON(!irqs_disabled());
+
+ *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
+
+ xen_hvm_post_suspend(*cancelled);
+ gnttab_resume();
+
+ if (!*cancelled) {
+ xen_irq_resume();
+ xen_console_resume();
+ xen_timer_resume();
+ }
+
+ return 0;
+}
+
+static int xen_suspend(void *data)
+{
int err;
+ int *cancelled = data;
BUG_ON(!irqs_disabled());
@@ -43,7 +67,6 @@ static int xen_suspend(void *data)
if (err) {
printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
err);
- dpm_resume_noirq(PMSG_RESUME);
return err;
}
@@ -69,7 +92,6 @@ static int xen_suspend(void *data)
}
sysdev_resume();
- dpm_resume_noirq(PMSG_RESUME);
return 0;
}
@@ -88,14 +110,14 @@ static void do_suspend(void)
err = freeze_processes();
if (err) {
printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
- return;
+ goto out;
}
#endif
err = dpm_suspend_start(PMSG_SUSPEND);
if (err) {
printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err);
- goto out;
+ goto out_thaw;
}
printk(KERN_DEBUG "suspending xenstore...\n");
@@ -104,31 +126,37 @@ static void do_suspend(void)
err = dpm_suspend_noirq(PMSG_SUSPEND);
if (err) {
printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err);
- goto resume_devices;
+ goto out_resume;
}
- err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+ if (xen_hvm_domain())
+ err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0));
+ else
+ err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+
+ dpm_resume_noirq(PMSG_RESUME);
+
if (err) {
printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
- goto out;
+ cancelled = 1;
}
+out_resume:
if (!cancelled) {
xen_arch_resume();
xs_resume();
} else
xs_suspend_cancel();
- dpm_resume_noirq(PMSG_RESUME);
-
-resume_devices:
dpm_resume_end(PMSG_RESUME);
/* Make sure timer events get retriggered on all CPUs */
clock_was_set();
-out:
+
+out_thaw:
#ifdef CONFIG_PREEMPT
thaw_processes();
+out:
#endif
shutting_down = SHUTDOWN_INVALID;
}
@@ -183,6 +211,7 @@ static void shutdown_handler(struct xenbus_watch *watch,
kfree(str);
}
+#ifdef CONFIG_MAGIC_SYSRQ
static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
unsigned int len)
{
@@ -209,18 +238,19 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
goto again;
if (sysrq_key != '\0')
- handle_sysrq(sysrq_key, NULL);
+ handle_sysrq(sysrq_key);
}
-static struct xenbus_watch shutdown_watch = {
- .node = "control/shutdown",
- .callback = shutdown_handler
-};
-
static struct xenbus_watch sysrq_watch = {
.node = "control/sysrq",
.callback = sysrq_handler
};
+#endif
+
+static struct xenbus_watch shutdown_watch = {
+ .node = "control/shutdown",
+ .callback = shutdown_handler
+};
static int setup_shutdown_watcher(void)
{
@@ -232,11 +262,13 @@ static int setup_shutdown_watcher(void)
return err;
}
+#ifdef CONFIG_MAGIC_SYSRQ
err = register_xenbus_watch(&sysrq_watch);
if (err) {
printk(KERN_ERR "Failed to set sysrq watcher\n");
return err;
}
+#endif
return 0;
}
@@ -249,7 +281,19 @@ static int shutdown_event(struct notifier_block *notifier,
return NOTIFY_DONE;
}
-static int __init setup_shutdown_event(void)
+static int __init __setup_shutdown_event(void)
+{
+ /* Delay initialization in the PV on HVM case */
+ if (xen_hvm_domain())
+ return 0;
+
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+ return xen_setup_shutdown_event();
+}
+
+int xen_setup_shutdown_event(void)
{
static struct notifier_block xenstore_notifier = {
.notifier_call = shutdown_event
@@ -258,5 +302,6 @@ static int __init setup_shutdown_event(void)
return 0;
}
+EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);
-subsys_initcall(setup_shutdown_event);
+subsys_initcall(__setup_shutdown_event);
diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
new file mode 100644
index 000000000000..cef4bafc07dc
--- /dev/null
+++ b/drivers/xen/pci.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2009, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Weidong Han <weidong.han@intel.com>
+ */
+
+#include <linux/pci.h>
+#include <xen/xen.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/xen.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include "../pci/pci.h"
+
+static int xen_add_device(struct device *dev)
+{
+ int r;
+ struct pci_dev *pci_dev = to_pci_dev(dev);
+
+#ifdef CONFIG_PCI_IOV
+ if (pci_dev->is_virtfn) {
+ struct physdev_manage_pci_ext manage_pci_ext = {
+ .bus = pci_dev->bus->number,
+ .devfn = pci_dev->devfn,
+ .is_virtfn = 1,
+ .physfn.bus = pci_dev->physfn->bus->number,
+ .physfn.devfn = pci_dev->physfn->devfn,
+ };
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
+ &manage_pci_ext);
+ } else
+#endif
+ if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
+ struct physdev_manage_pci_ext manage_pci_ext = {
+ .bus = pci_dev->bus->number,
+ .devfn = pci_dev->devfn,
+ .is_extfn = 1,
+ };
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
+ &manage_pci_ext);
+ } else {
+ struct physdev_manage_pci manage_pci = {
+ .bus = pci_dev->bus->number,
+ .devfn = pci_dev->devfn,
+ };
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
+ &manage_pci);
+ }
+
+ return r;
+}
+
+static int xen_remove_device(struct device *dev)
+{
+ int r;
+ struct pci_dev *pci_dev = to_pci_dev(dev);
+ struct physdev_manage_pci manage_pci;
+
+ manage_pci.bus = pci_dev->bus->number;
+ manage_pci.devfn = pci_dev->devfn;
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
+ &manage_pci);
+
+ return r;
+}
+
+static int xen_pci_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+ int r = 0;
+
+ switch (action) {
+ case BUS_NOTIFY_ADD_DEVICE:
+ r = xen_add_device(dev);
+ break;
+ case BUS_NOTIFY_DEL_DEVICE:
+ r = xen_remove_device(dev);
+ break;
+ default:
+ break;
+ }
+
+ return r;
+}
+
+struct notifier_block device_nb = {
+ .notifier_call = xen_pci_notifier,
+};
+
+static int __init register_xen_pci_notifier(void)
+{
+ if (!xen_initial_domain())
+ return 0;
+
+ return bus_register_notifier(&pci_bus_type, &device_nb);
+}
+
+arch_initcall(register_xen_pci_notifier);
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
new file mode 100644
index 000000000000..afbe041f42c5
--- /dev/null
+++ b/drivers/xen/platform-pci.c
@@ -0,0 +1,200 @@
+/******************************************************************************
+ * platform-pci.c
+ *
+ * Xen platform PCI device driver
+ * Copyright (c) 2005, Intel Corporation.
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include <xen/platform_pci.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/hvm.h>
+#include <xen/xen-ops.h>
+
+#define DRV_NAME "xen-platform-pci"
+
+MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com");
+MODULE_DESCRIPTION("Xen platform PCI device");
+MODULE_LICENSE("GPL");
+
+static unsigned long platform_mmio;
+static unsigned long platform_mmio_alloc;
+static unsigned long platform_mmiolen;
+static uint64_t callback_via;
+
+unsigned long alloc_xen_mmio(unsigned long len)
+{
+ unsigned long addr;
+
+ addr = platform_mmio + platform_mmio_alloc;
+ platform_mmio_alloc += len;
+ BUG_ON(platform_mmio_alloc > platform_mmiolen);
+
+ return addr;
+}
+
+static uint64_t get_callback_via(struct pci_dev *pdev)
+{
+ u8 pin;
+ int irq;
+
+ irq = pdev->irq;
+ if (irq < 16)
+ return irq; /* ISA IRQ */
+
+ pin = pdev->pin;
+
+ /* We don't know the GSI. Specify the PCI INTx line instead. */
+ return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */
+ ((uint64_t)pci_domain_nr(pdev->bus) << 32) |
+ ((uint64_t)pdev->bus->number << 16) |
+ ((uint64_t)(pdev->devfn & 0xff) << 8) |
+ ((uint64_t)(pin - 1) & 3);
+}
+
+static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)
+{
+ xen_hvm_evtchn_do_upcall();
+ return IRQ_HANDLED;
+}
+
+static int xen_allocate_irq(struct pci_dev *pdev)
+{
+ return request_irq(pdev->irq, do_hvm_evtchn_intr,
+ IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING,
+ "xen-platform-pci", pdev);
+}
+
+static int platform_pci_resume(struct pci_dev *pdev)
+{
+ int err;
+ if (xen_have_vector_callback)
+ return 0;
+ err = xen_set_callback_via(callback_via);
+ if (err) {
+ dev_err(&pdev->dev, "platform_pci_resume failure!\n");
+ return err;
+ }
+ return 0;
+}
+
+static int __devinit platform_pci_init(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ int i, ret;
+ long ioaddr;
+ long mmio_addr, mmio_len;
+ unsigned int max_nr_gframes;
+
+ i = pci_enable_device(pdev);
+ if (i)
+ return i;
+
+ ioaddr = pci_resource_start(pdev, 0);
+
+ mmio_addr = pci_resource_start(pdev, 1);
+ mmio_len = pci_resource_len(pdev, 1);
+
+ if (mmio_addr == 0 || ioaddr == 0) {
+ dev_err(&pdev->dev, "no resources found\n");
+ ret = -ENOENT;
+ goto pci_out;
+ }
+
+ ret = pci_request_region(pdev, 1, DRV_NAME);
+ if (ret < 0)
+ goto pci_out;
+
+ ret = pci_request_region(pdev, 0, DRV_NAME);
+ if (ret < 0)
+ goto mem_out;
+
+ platform_mmio = mmio_addr;
+ platform_mmiolen = mmio_len;
+
+ if (!xen_have_vector_callback) {
+ ret = xen_allocate_irq(pdev);
+ if (ret) {
+ dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret);
+ goto out;
+ }
+ callback_via = get_callback_via(pdev);
+ ret = xen_set_callback_via(callback_via);
+ if (ret) {
+ dev_warn(&pdev->dev, "Unable to set the evtchn callback "
+ "err=%d\n", ret);
+ goto out;
+ }
+ }
+
+ max_nr_gframes = gnttab_max_grant_frames();
+ xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+ ret = gnttab_init();
+ if (ret)
+ goto out;
+ xenbus_probe(NULL);
+ ret = xen_setup_shutdown_event();
+ if (ret)
+ goto out;
+ return 0;
+
+out:
+ pci_release_region(pdev, 0);
+mem_out:
+ pci_release_region(pdev, 1);
+pci_out:
+ pci_disable_device(pdev);
+ return ret;
+}
+
+static struct pci_device_id platform_pci_tbl[] __devinitdata = {
+ {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM,
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+ {0,}
+};
+
+MODULE_DEVICE_TABLE(pci, platform_pci_tbl);
+
+static struct pci_driver platform_driver = {
+ .name = DRV_NAME,
+ .probe = platform_pci_init,
+ .id_table = platform_pci_tbl,
+#ifdef CONFIG_PM
+ .resume_early = platform_pci_resume,
+#endif
+};
+
+static int __init platform_pci_module_init(void)
+{
+ /* no unplug has been done, IGNORE hasn't been specified: just
+ * return now */
+ if (!xen_platform_pci_unplug)
+ return -ENODEV;
+
+ return pci_register_driver(&platform_driver);
+}
+
+module_init(platform_pci_module_init);
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
new file mode 100644
index 000000000000..54469c3eeacd
--- /dev/null
+++ b/drivers/xen/swiotlb-xen.c
@@ -0,0 +1,515 @@
+/*
+ * Copyright 2010
+ * by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ *
+ * This code provides a IOMMU for Xen PV guests with PCI passthrough.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License v2.0 as published by
+ * the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * PV guests under Xen are running in an non-contiguous memory architecture.
+ *
+ * When PCI pass-through is utilized, this necessitates an IOMMU for
+ * translating bus (DMA) to virtual and vice-versa and also providing a
+ * mechanism to have contiguous pages for device drivers operations (say DMA
+ * operations).
+ *
+ * Specifically, under Xen the Linux idea of pages is an illusion. It
+ * assumes that pages start at zero and go up to the available memory. To
+ * help with that, the Linux Xen MMU provides a lookup mechanism to
+ * translate the page frame numbers (PFN) to machine frame numbers (MFN)
+ * and vice-versa. The MFN are the "real" frame numbers. Furthermore
+ * memory is not contiguous. Xen hypervisor stitches memory for guests
+ * from different pools, which means there is no guarantee that PFN==MFN
+ * and PFN+1==MFN+1. Lastly with Xen 4.0, pages (in debug mode) are
+ * allocated in descending order (high to low), meaning the guest might
+ * never get any MFN's under the 4GB mark.
+ *
+ */
+
+#include <linux/bootmem.h>
+#include <linux/dma-mapping.h>
+#include <xen/swiotlb-xen.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+/*
+ * Used to do a quick range check in swiotlb_tbl_unmap_single and
+ * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
+ * API.
+ */
+
+static char *xen_io_tlb_start, *xen_io_tlb_end;
+static unsigned long xen_io_tlb_nslabs;
+/*
+ * Quick lookup value of the bus address of the IOTLB.
+ */
+
+u64 start_dma_addr;
+
+static dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
+{
+ return phys_to_machine(XPADDR(paddr)).maddr;;
+}
+
+static phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
+{
+ return machine_to_phys(XMADDR(baddr)).paddr;
+}
+
+static dma_addr_t xen_virt_to_bus(void *address)
+{
+ return xen_phys_to_bus(virt_to_phys(address));
+}
+
+static int check_pages_physically_contiguous(unsigned long pfn,
+ unsigned int offset,
+ size_t length)
+{
+ unsigned long next_mfn;
+ int i;
+ int nr_pages;
+
+ next_mfn = pfn_to_mfn(pfn);
+ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+ for (i = 1; i < nr_pages; i++) {
+ if (pfn_to_mfn(++pfn) != ++next_mfn)
+ return 0;
+ }
+ return 1;
+}
+
+static int range_straddles_page_boundary(phys_addr_t p, size_t size)
+{
+ unsigned long pfn = PFN_DOWN(p);
+ unsigned int offset = p & ~PAGE_MASK;
+
+ if (offset + size <= PAGE_SIZE)
+ return 0;
+ if (check_pages_physically_contiguous(pfn, offset, size))
+ return 0;
+ return 1;
+}
+
+static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
+{
+ unsigned long mfn = PFN_DOWN(dma_addr);
+ unsigned long pfn = mfn_to_local_pfn(mfn);
+ phys_addr_t paddr;
+
+ /* If the address is outside our domain, it CAN
+ * have the same virtual address as another address
+ * in our domain. Therefore _only_ check address within our domain.
+ */
+ if (pfn_valid(pfn)) {
+ paddr = PFN_PHYS(pfn);
+ return paddr >= virt_to_phys(xen_io_tlb_start) &&
+ paddr < virt_to_phys(xen_io_tlb_end);
+ }
+ return 0;
+}
+
+static int max_dma_bits = 32;
+
+static int
+xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
+{
+ int i, rc;
+ int dma_bits;
+
+ dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
+
+ i = 0;
+ do {
+ int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE);
+
+ do {
+ rc = xen_create_contiguous_region(
+ (unsigned long)buf + (i << IO_TLB_SHIFT),
+ get_order(slabs << IO_TLB_SHIFT),
+ dma_bits);
+ } while (rc && dma_bits++ < max_dma_bits);
+ if (rc)
+ return rc;
+
+ i += slabs;
+ } while (i < nslabs);
+ return 0;
+}
+
+void __init xen_swiotlb_init(int verbose)
+{
+ unsigned long bytes;
+ int rc;
+
+ xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
+ xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
+
+ bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
+
+ /*
+ * Get IO TLB memory from any location.
+ */
+ xen_io_tlb_start = alloc_bootmem(bytes);
+ if (!xen_io_tlb_start)
+ panic("Cannot allocate SWIOTLB buffer");
+
+ xen_io_tlb_end = xen_io_tlb_start + bytes;
+ /*
+ * And replace that memory with pages under 4GB.
+ */
+ rc = xen_swiotlb_fixup(xen_io_tlb_start,
+ bytes,
+ xen_io_tlb_nslabs);
+ if (rc)
+ goto error;
+
+ start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
+ swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose);
+
+ return;
+error:
+ panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\
+ "We either don't have the permission or you do not have enough"\
+ "free memory under 4GB!\n", rc);
+}
+
+void *
+xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags)
+{
+ void *ret;
+ int order = get_order(size);
+ u64 dma_mask = DMA_BIT_MASK(32);
+ unsigned long vstart;
+
+ /*
+ * Ignore region specifiers - the kernel's ideas of
+ * pseudo-phys memory layout has nothing to do with the
+ * machine physical layout. We can't allocate highmem
+ * because we can't return a pointer to it.
+ */
+ flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
+
+ if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
+ return ret;
+
+ vstart = __get_free_pages(flags, order);
+ ret = (void *)vstart;
+
+ if (hwdev && hwdev->coherent_dma_mask)
+ dma_mask = dma_alloc_coherent_mask(hwdev, flags);
+
+ if (ret) {
+ if (xen_create_contiguous_region(vstart, order,
+ fls64(dma_mask)) != 0) {
+ free_pages(vstart, order);
+ return NULL;
+ }
+ memset(ret, 0, size);
+ *dma_handle = virt_to_machine(ret).maddr;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent);
+
+void
+xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+ dma_addr_t dev_addr)
+{
+ int order = get_order(size);
+
+ if (dma_release_from_coherent(hwdev, order, vaddr))
+ return;
+
+ xen_destroy_contiguous_region((unsigned long)vaddr, order);
+ free_pages((unsigned long)vaddr, order);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent);
+
+
+/*
+ * Map a single buffer of the indicated size for DMA in streaming mode. The
+ * physical address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory until
+ * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed.
+ */
+dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ phys_addr_t phys = page_to_phys(page) + offset;
+ dma_addr_t dev_addr = xen_phys_to_bus(phys);
+ void *map;
+
+ BUG_ON(dir == DMA_NONE);
+ /*
+ * If the address happens to be in the device's DMA window,
+ * we can safely return the device addr and not worry about bounce
+ * buffering it.
+ */
+ if (dma_capable(dev, dev_addr, size) &&
+ !range_straddles_page_boundary(phys, size) && !swiotlb_force)
+ return dev_addr;
+
+ /*
+ * Oh well, have to allocate and map a bounce buffer.
+ */
+ map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir);
+ if (!map)
+ return DMA_ERROR_CODE;
+
+ dev_addr = xen_virt_to_bus(map);
+
+ /*
+ * Ensure that the address returned is DMA'ble
+ */
+ if (!dma_capable(dev, dev_addr, size))
+ panic("map_single: bounce buffer is not DMA'ble");
+
+ return dev_addr;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);
+
+/*
+ * Unmap a single streaming mode DMA translation. The dma_addr and size must
+ * match what was provided for in a previous xen_swiotlb_map_page call. All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+ BUG_ON(dir == DMA_NONE);
+
+ /* NOTE: We use dev_addr here, not paddr! */
+ if (is_xen_swiotlb_buffer(dev_addr)) {
+ swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
+ return;
+ }
+
+ if (dir != DMA_FROM_DEVICE)
+ return;
+
+ /*
+ * phys_to_virt doesn't work with hihgmem page but we could
+ * call dma_mark_clean() with hihgmem page here. However, we
+ * are fine since dma_mark_clean() is null on POWERPC. We can
+ * make dma_mark_clean() take a physical address if necessary.
+ */
+ dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ xen_unmap_single(hwdev, dev_addr, size, dir);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);
+
+/*
+ * Make physical memory consistent for a single streaming mode DMA translation
+ * after a transfer.
+ *
+ * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer
+ * using the cpu, yet do not wish to teardown the dma mapping, you must
+ * call this function before doing so. At the next point you give the dma
+ * address back to the card, you must first perform a
+ * xen_swiotlb_dma_sync_for_device, and then the device again owns the buffer
+ */
+static void
+xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ enum dma_sync_target target)
+{
+ phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+ BUG_ON(dir == DMA_NONE);
+
+ /* NOTE: We use dev_addr here, not paddr! */
+ if (is_xen_swiotlb_buffer(dev_addr)) {
+ swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir,
+ target);
+ return;
+ }
+
+ if (dir != DMA_FROM_DEVICE)
+ return;
+
+ dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void
+xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_cpu);
+
+void
+xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_device);
+
+/*
+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
+ * This is the scatter-gather version of the above xen_swiotlb_map_page
+ * interface. Here the scatter gather list elements are each tagged with the
+ * appropriate dma address and length. They are obtained via
+ * sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ * DMA address/length pairs than there are SG table elements.
+ * (for example via virtual mapping capabilities)
+ * The routine returns the number of addr/length pairs actually
+ * used, at most nents.
+ *
+ * Device ownership issues as mentioned above for xen_swiotlb_map_page are the
+ * same here.
+ */
+int
+xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
+ for_each_sg(sgl, sg, nelems, i) {
+ phys_addr_t paddr = sg_phys(sg);
+ dma_addr_t dev_addr = xen_phys_to_bus(paddr);
+
+ if (swiotlb_force ||
+ !dma_capable(hwdev, dev_addr, sg->length) ||
+ range_straddles_page_boundary(paddr, sg->length)) {
+ void *map = swiotlb_tbl_map_single(hwdev,
+ start_dma_addr,
+ sg_phys(sg),
+ sg->length, dir);
+ if (!map) {
+ /* Don't panic here, we expect map_sg users
+ to do proper error handling. */
+ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
+ attrs);
+ sgl[0].dma_length = 0;
+ return DMA_ERROR_CODE;
+ }
+ sg->dma_address = xen_virt_to_bus(map);
+ } else
+ sg->dma_address = dev_addr;
+ sg->dma_length = sg->length;
+ }
+ return nelems;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs);
+
+int
+xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ enum dma_data_direction dir)
+{
+ return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg);
+
+/*
+ * Unmap a set of streaming mode DMA translations. Again, cpu read rules
+ * concerning calls here are the same as for swiotlb_unmap_page() above.
+ */
+void
+xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
+ for_each_sg(sgl, sg, nelems, i)
+ xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);
+
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs);
+
+void
+xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ enum dma_data_direction dir)
+{
+ return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg);
+
+/*
+ * Make physical memory consistent for a set of streaming mode DMA translations
+ * after a transfer.
+ *
+ * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
+ * and usage.
+ */
+static void
+xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ enum dma_sync_target target)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nelems, i)
+ xen_swiotlb_sync_single(hwdev, sg->dma_address,
+ sg->dma_length, dir, target);
+}
+
+void
+xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_cpu);
+
+void
+xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_device);
+
+int
+xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
+{
+ return !dma_addr;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mapping_error);
+
+/*
+ * Return whether the given device DMA address mask can be supported
+ * properly. For example, if your device can only drive the low 24-bits
+ * during bus mastering, then you would pass 0x00ffffff as the mask to
+ * this function.
+ */
+int
+xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
+{
+ return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported);
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index 88a60e03ccf0..60f1827a32cb 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -7,6 +7,7 @@
* published by the Free Software Foundation.
*/
+#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kobject.h>
@@ -14,6 +15,7 @@
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
@@ -425,7 +427,7 @@ static ssize_t hyp_sysfs_store(struct kobject *kobj,
return 0;
}
-static struct sysfs_ops hyp_sysfs_ops = {
+static const struct sysfs_ops hyp_sysfs_ops = {
.show = hyp_sysfs_show,
.store = hyp_sysfs_store,
};
diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
index 5571f5b84223..8dca685358b4 100644
--- a/drivers/xen/xenbus/Makefile
+++ b/drivers/xen/xenbus/Makefile
@@ -5,3 +5,8 @@ xenbus-objs += xenbus_client.o
xenbus-objs += xenbus_comms.o
xenbus-objs += xenbus_xs.o
xenbus-objs += xenbus_probe.o
+
+xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
+xenbus-objs += $(xenbus-be-objs-y)
+
+obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 92a1ef80a288..cdacf923e073 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -30,6 +30,7 @@
* IN THE SOFTWARE.
*/
+#include <linux/slab.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
#include <asm/xen/hypervisor.h>
@@ -49,6 +50,8 @@ const char *xenbus_strstate(enum xenbus_state state)
[ XenbusStateConnected ] = "Connected",
[ XenbusStateClosing ] = "Closing",
[ XenbusStateClosed ] = "Closed",
+ [XenbusStateReconfiguring] = "Reconfiguring",
+ [XenbusStateReconfigured] = "Reconfigured",
};
return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
}
@@ -132,17 +135,12 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev,
}
EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
+static void xenbus_switch_fatal(struct xenbus_device *, int, int,
+ const char *, ...);
-/**
- * xenbus_switch_state
- * @dev: xenbus device
- * @state: new state
- *
- * Advertise in the store a change of the given driver to the given new_state.
- * Return 0 on success, or -errno on error. On error, the device will switch
- * to XenbusStateClosing, and the error will be saved in the store.
- */
-int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
+static int
+__xenbus_switch_state(struct xenbus_device *dev,
+ enum xenbus_state state, int depth)
{
/* We check whether the state is currently set to the given value, and
if not, then the state is set. We don't want to unconditionally
@@ -151,35 +149,65 @@ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
to it, as the device will be tearing down, and we don't want to
resurrect that directory.
- Note that, because of this cached value of our state, this function
- will not work inside a Xenstore transaction (something it was
- trying to in the past) because dev->state would not get reset if
- the transaction was aborted.
-
+ Note that, because of this cached value of our state, this
+ function will not take a caller's Xenstore transaction
+ (something it was trying to in the past) because dev->state
+ would not get reset if the transaction was aborted.
*/
+ struct xenbus_transaction xbt;
int current_state;
- int err;
+ int err, abort;
if (state == dev->state)
return 0;
- err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
- &current_state);
- if (err != 1)
+again:
+ abort = 1;
+
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_switch_fatal(dev, depth, err, "starting transaction");
return 0;
+ }
+
+ err = xenbus_scanf(xbt, dev->nodename, "state", "%d", &current_state);
+ if (err != 1)
+ goto abort;
- err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
+ err = xenbus_printf(xbt, dev->nodename, "state", "%d", state);
if (err) {
- if (state != XenbusStateClosing) /* Avoid looping */
- xenbus_dev_fatal(dev, err, "writing new state");
- return err;
+ xenbus_switch_fatal(dev, depth, err, "writing new state");
+ goto abort;
}
- dev->state = state;
+ abort = 0;
+abort:
+ err = xenbus_transaction_end(xbt, abort);
+ if (err) {
+ if (err == -EAGAIN && !abort)
+ goto again;
+ xenbus_switch_fatal(dev, depth, err, "ending transaction");
+ } else
+ dev->state = state;
return 0;
}
+
+/**
+ * xenbus_switch_state
+ * @dev: xenbus device
+ * @state: new state
+ *
+ * Advertise in the store a change of the given driver to the given new_state.
+ * Return 0 on success, or -errno on error. On error, the device will switch
+ * to XenbusStateClosing, and the error will be saved in the store.
+ */
+int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
+{
+ return __xenbus_switch_state(dev, state, 0);
+}
+
EXPORT_SYMBOL_GPL(xenbus_switch_state);
int xenbus_frontend_closed(struct xenbus_device *dev)
@@ -283,6 +311,23 @@ void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
/**
+ * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps
+ * avoiding recursion within xenbus_switch_state.
+ */
+static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
+ const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ xenbus_va_dev_error(dev, err, fmt, ap);
+ va_end(ap);
+
+ if (!depth)
+ __xenbus_switch_state(dev, XenbusStateClosing, 1);
+}
+
+/**
* xenbus_grant_ring
* @dev: xenbus device
* @ring_mfn: mfn of ring to grant
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index d42e25d5968d..baa65e7fbbc7 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -45,35 +45,33 @@
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/io.h>
+#include <linux/slab.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/events.h>
#include <xen/page.h>
+#include <xen/hvm.h>
+
#include "xenbus_comms.h"
#include "xenbus_probe.h"
int xen_store_evtchn;
-EXPORT_SYMBOL(xen_store_evtchn);
+EXPORT_SYMBOL_GPL(xen_store_evtchn);
struct xenstore_domain_interface *xen_store_interface;
+EXPORT_SYMBOL_GPL(xen_store_interface);
+
static unsigned long xen_store_mfn;
static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
-static void wait_for_devices(struct xenbus_driver *xendrv);
-
-static int xenbus_probe_frontend(const char *type, const char *name);
-
-static void xenbus_dev_shutdown(struct device *_dev);
-
-static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
-static int xenbus_dev_resume(struct device *dev);
-
/* If something in array of ids matches this device, return it. */
static const struct xenbus_device_id *
match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
@@ -94,34 +92,7 @@ int xenbus_match(struct device *_dev, struct device_driver *_drv)
return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
}
-
-static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env)
-{
- struct xenbus_device *dev = to_xenbus_device(_dev);
-
- if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
- return -ENOMEM;
-
- return 0;
-}
-
-/* device/<type>/<id> => <type>-<id> */
-static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
-{
- nodename = strchr(nodename, '/');
- if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
- printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
- return -EINVAL;
- }
-
- strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
- if (!strchr(bus_id, '/')) {
- printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
- return -EINVAL;
- }
- *strchr(bus_id, '/') = '-';
- return 0;
-}
+EXPORT_SYMBOL_GPL(xenbus_match);
static void free_otherend_details(struct xenbus_device *dev)
@@ -141,7 +112,30 @@ static void free_otherend_watch(struct xenbus_device *dev)
}
-int read_otherend_details(struct xenbus_device *xendev,
+static int talk_to_otherend(struct xenbus_device *dev)
+{
+ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
+
+ free_otherend_watch(dev);
+ free_otherend_details(dev);
+
+ return drv->read_otherend_details(dev);
+}
+
+
+
+static int watch_otherend(struct xenbus_device *dev)
+{
+ struct xen_bus_type *bus =
+ container_of(dev->dev.bus, struct xen_bus_type, bus);
+
+ return xenbus_watch_pathfmt(dev, &dev->otherend_watch,
+ bus->otherend_changed,
+ "%s/%s", dev->otherend, "state");
+}
+
+
+int xenbus_read_otherend_details(struct xenbus_device *xendev,
char *id_node, char *path_node)
{
int err = xenbus_gather(XBT_NIL, xendev->nodename,
@@ -166,39 +160,11 @@ int read_otherend_details(struct xenbus_device *xendev,
return 0;
}
+EXPORT_SYMBOL_GPL(xenbus_read_otherend_details);
-
-static int read_backend_details(struct xenbus_device *xendev)
-{
- return read_otherend_details(xendev, "backend-id", "backend");
-}
-
-static struct device_attribute xenbus_dev_attrs[] = {
- __ATTR_NULL
-};
-
-/* Bus type for frontend drivers. */
-static struct xen_bus_type xenbus_frontend = {
- .root = "device",
- .levels = 2, /* device/type/<id> */
- .get_bus_id = frontend_bus_id,
- .probe = xenbus_probe_frontend,
- .bus = {
- .name = "xen",
- .match = xenbus_match,
- .uevent = xenbus_uevent,
- .probe = xenbus_dev_probe,
- .remove = xenbus_dev_remove,
- .shutdown = xenbus_dev_shutdown,
- .dev_attrs = xenbus_dev_attrs,
-
- .suspend = xenbus_dev_suspend,
- .resume = xenbus_dev_resume,
- },
-};
-
-static void otherend_changed(struct xenbus_watch *watch,
- const char **vec, unsigned int len)
+void xenbus_otherend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len,
+ int ignore_on_shutdown)
{
struct xenbus_device *dev =
container_of(watch, struct xenbus_device, otherend_watch);
@@ -226,11 +192,7 @@ static void otherend_changed(struct xenbus_watch *watch,
* work that can fail e.g., when the rootfs is gone.
*/
if (system_state > SYSTEM_RUNNING) {
- struct xen_bus_type *bus = bus;
- bus = container_of(dev->dev.bus, struct xen_bus_type, bus);
- /* If we're frontend, drive the state machine to Closed. */
- /* This should cause the backend to release our resources. */
- if ((bus == &xenbus_frontend) && (state == XenbusStateClosing))
+ if (ignore_on_shutdown && (state == XenbusStateClosing))
xenbus_frontend_closed(dev);
return;
}
@@ -238,25 +200,7 @@ static void otherend_changed(struct xenbus_watch *watch,
if (drv->otherend_changed)
drv->otherend_changed(dev, state);
}
-
-
-static int talk_to_otherend(struct xenbus_device *dev)
-{
- struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
-
- free_otherend_watch(dev);
- free_otherend_details(dev);
-
- return drv->read_otherend_details(dev);
-}
-
-
-static int watch_otherend(struct xenbus_device *dev)
-{
- return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed,
- "%s/%s", dev->otherend, "state");
-}
-
+EXPORT_SYMBOL_GPL(xenbus_otherend_changed);
int xenbus_dev_probe(struct device *_dev)
{
@@ -300,8 +244,9 @@ int xenbus_dev_probe(struct device *_dev)
fail:
xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
xenbus_switch_state(dev, XenbusStateClosed);
- return -ENODEV;
+ return err;
}
+EXPORT_SYMBOL_GPL(xenbus_dev_probe);
int xenbus_dev_remove(struct device *_dev)
{
@@ -319,8 +264,9 @@ int xenbus_dev_remove(struct device *_dev)
xenbus_switch_state(dev, XenbusStateClosed);
return 0;
}
+EXPORT_SYMBOL_GPL(xenbus_dev_remove);
-static void xenbus_dev_shutdown(struct device *_dev)
+void xenbus_dev_shutdown(struct device *_dev)
{
struct xenbus_device *dev = to_xenbus_device(_dev);
unsigned long timeout = 5*HZ;
@@ -341,6 +287,7 @@ static void xenbus_dev_shutdown(struct device *_dev)
out:
put_device(&dev->dev);
}
+EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);
int xenbus_register_driver_common(struct xenbus_driver *drv,
struct xen_bus_type *bus,
@@ -354,25 +301,7 @@ int xenbus_register_driver_common(struct xenbus_driver *drv,
return driver_register(&drv->driver);
}
-
-int __xenbus_register_frontend(struct xenbus_driver *drv,
- struct module *owner, const char *mod_name)
-{
- int ret;
-
- drv->read_otherend_details = read_backend_details;
-
- ret = xenbus_register_driver_common(drv, &xenbus_frontend,
- owner, mod_name);
- if (ret)
- return ret;
-
- /* If this driver is loaded as a module wait for devices to attach. */
- wait_for_devices(drv);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
+EXPORT_SYMBOL_GPL(xenbus_register_driver_common);
void xenbus_unregister_driver(struct xenbus_driver *drv)
{
@@ -454,21 +383,21 @@ static ssize_t xendev_show_nodename(struct device *dev,
{
return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
}
-DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
+static DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
static ssize_t xendev_show_devtype(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
}
-DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
+static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
static ssize_t xendev_show_modalias(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
}
-DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
+static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
int xenbus_probe_node(struct xen_bus_type *bus,
const char *type,
@@ -543,24 +472,7 @@ fail:
kfree(xendev);
return err;
}
-
-/* device/<typename>/<name> */
-static int xenbus_probe_frontend(const char *type, const char *name)
-{
- char *nodename;
- int err;
-
- nodename = kasprintf(GFP_KERNEL, "%s/%s/%s",
- xenbus_frontend.root, type, name);
- if (!nodename)
- return -ENOMEM;
-
- DPRINTK("%s", nodename);
-
- err = xenbus_probe_node(&xenbus_frontend, type, nodename);
- kfree(nodename);
- return err;
-}
+EXPORT_SYMBOL_GPL(xenbus_probe_node);
static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
{
@@ -574,10 +486,11 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
return PTR_ERR(dir);
for (i = 0; i < dir_n; i++) {
- err = bus->probe(type, dir[i]);
+ err = bus->probe(bus, type, dir[i]);
if (err)
break;
}
+
kfree(dir);
return err;
}
@@ -597,9 +510,11 @@ int xenbus_probe_devices(struct xen_bus_type *bus)
if (err)
break;
}
+
kfree(dir);
return err;
}
+EXPORT_SYMBOL_GPL(xenbus_probe_devices);
static unsigned int char_count(const char *str, char c)
{
@@ -662,32 +577,18 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
}
EXPORT_SYMBOL_GPL(xenbus_dev_changed);
-static void frontend_changed(struct xenbus_watch *watch,
- const char **vec, unsigned int len)
-{
- DPRINTK("");
-
- xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
-}
-
-/* We watch for devices appearing and vanishing. */
-static struct xenbus_watch fe_watch = {
- .node = "device",
- .callback = frontend_changed,
-};
-
-static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
+int xenbus_dev_suspend(struct device *dev, pm_message_t state)
{
int err = 0;
struct xenbus_driver *drv;
- struct xenbus_device *xdev;
+ struct xenbus_device *xdev
+ = container_of(dev, struct xenbus_device, dev);
- DPRINTK("");
+ DPRINTK("%s", xdev->nodename);
if (dev->driver == NULL)
return 0;
drv = to_xenbus_driver(dev->driver);
- xdev = container_of(dev, struct xenbus_device, dev);
if (drv->suspend)
err = drv->suspend(xdev, state);
if (err)
@@ -695,21 +596,20 @@ static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
"xenbus: suspend %s failed: %i\n", dev_name(dev), err);
return 0;
}
+EXPORT_SYMBOL_GPL(xenbus_dev_suspend);
-static int xenbus_dev_resume(struct device *dev)
+int xenbus_dev_resume(struct device *dev)
{
int err;
struct xenbus_driver *drv;
- struct xenbus_device *xdev;
+ struct xenbus_device *xdev
+ = container_of(dev, struct xenbus_device, dev);
- DPRINTK("");
+ DPRINTK("%s", xdev->nodename);
if (dev->driver == NULL)
return 0;
-
drv = to_xenbus_driver(dev->driver);
- xdev = container_of(dev, struct xenbus_device, dev);
-
err = talk_to_otherend(xdev);
if (err) {
printk(KERN_WARNING
@@ -740,6 +640,7 @@ static int xenbus_dev_resume(struct device *dev)
return 0;
}
+EXPORT_SYMBOL_GPL(xenbus_dev_resume);
/* A flag to determine if xenstored is 'ready' (i.e. has started) */
int xenstored_ready = 0;
@@ -766,59 +667,95 @@ EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
void xenbus_probe(struct work_struct *unused)
{
- BUG_ON((xenstored_ready <= 0));
-
- /* Enumerate devices in xenstore and watch for changes. */
- xenbus_probe_devices(&xenbus_frontend);
- register_xenbus_watch(&fe_watch);
- xenbus_backend_probe_and_watch();
+ xenstored_ready = 1;
/* Notify others that xenstore is up */
blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
}
+EXPORT_SYMBOL_GPL(xenbus_probe);
-static int __init xenbus_probe_init(void)
+static int __init xenbus_probe_initcall(void)
+{
+ if (!xen_domain())
+ return -ENODEV;
+
+ if (xen_initial_domain() || xen_hvm_domain())
+ return 0;
+
+ xenbus_probe(NULL);
+ return 0;
+}
+
+device_initcall(xenbus_probe_initcall);
+
+static int __init xenbus_init(void)
{
int err = 0;
+ unsigned long page = 0;
DPRINTK("");
err = -ENODEV;
if (!xen_domain())
- goto out_error;
-
- /* Register ourselves with the kernel bus subsystem */
- err = bus_register(&xenbus_frontend.bus);
- if (err)
- goto out_error;
-
- err = xenbus_backend_bus_register();
- if (err)
- goto out_unreg_front;
+ return err;
/*
* Domain0 doesn't have a store_evtchn or store_mfn yet.
*/
if (xen_initial_domain()) {
- /* dom0 not yet supported */
+ struct evtchn_alloc_unbound alloc_unbound;
+
+ /* Allocate Xenstore page */
+ page = get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ goto out_error;
+
+ xen_store_mfn = xen_start_info->store_mfn =
+ pfn_to_mfn(virt_to_phys((void *)page) >>
+ PAGE_SHIFT);
+
+ /* Next allocate a local port which xenstored can bind to */
+ alloc_unbound.dom = DOMID_SELF;
+ alloc_unbound.remote_dom = 0;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+ &alloc_unbound);
+ if (err == -ENOSYS)
+ goto out_error;
+
+ BUG_ON(err);
+ xen_store_evtchn = xen_start_info->store_evtchn =
+ alloc_unbound.port;
+
+ xen_store_interface = mfn_to_virt(xen_store_mfn);
} else {
- xenstored_ready = 1;
- xen_store_evtchn = xen_start_info->store_evtchn;
- xen_store_mfn = xen_start_info->store_mfn;
+ if (xen_hvm_domain()) {
+ uint64_t v = 0;
+ err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
+ if (err)
+ goto out_error;
+ xen_store_evtchn = (int)v;
+ err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
+ if (err)
+ goto out_error;
+ xen_store_mfn = (unsigned long)v;
+ xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
+ } else {
+ xen_store_evtchn = xen_start_info->store_evtchn;
+ xen_store_mfn = xen_start_info->store_mfn;
+ xen_store_interface = mfn_to_virt(xen_store_mfn);
+ xenstored_ready = 1;
+ }
}
- xen_store_interface = mfn_to_virt(xen_store_mfn);
/* Initialize the interface to xenstore. */
err = xs_init();
if (err) {
printk(KERN_WARNING
"XENBUS: Error initializing xenstore comms: %i\n", err);
- goto out_unreg_back;
+ goto out_error;
}
- if (!xen_initial_domain())
- xenbus_probe(NULL);
-
#ifdef CONFIG_XEN_COMPAT_XENFS
/*
* Create xenfs mountpoint in /proc for compatibility with
@@ -829,112 +766,13 @@ static int __init xenbus_probe_init(void)
return 0;
- out_unreg_back:
- xenbus_backend_bus_unregister();
-
- out_unreg_front:
- bus_unregister(&xenbus_frontend.bus);
-
out_error:
+ if (page != 0)
+ free_page(page);
+
return err;
}
-postcore_initcall(xenbus_probe_init);
+postcore_initcall(xenbus_init);
MODULE_LICENSE("GPL");
-
-static int is_disconnected_device(struct device *dev, void *data)
-{
- struct xenbus_device *xendev = to_xenbus_device(dev);
- struct device_driver *drv = data;
- struct xenbus_driver *xendrv;
-
- /*
- * A device with no driver will never connect. We care only about
- * devices which should currently be in the process of connecting.
- */
- if (!dev->driver)
- return 0;
-
- /* Is this search limited to a particular driver? */
- if (drv && (dev->driver != drv))
- return 0;
-
- xendrv = to_xenbus_driver(dev->driver);
- return (xendev->state != XenbusStateConnected ||
- (xendrv->is_ready && !xendrv->is_ready(xendev)));
-}
-
-static int exists_disconnected_device(struct device_driver *drv)
-{
- return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
- is_disconnected_device);
-}
-
-static int print_device_status(struct device *dev, void *data)
-{
- struct xenbus_device *xendev = to_xenbus_device(dev);
- struct device_driver *drv = data;
-
- /* Is this operation limited to a particular driver? */
- if (drv && (dev->driver != drv))
- return 0;
-
- if (!dev->driver) {
- /* Information only: is this too noisy? */
- printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
- xendev->nodename);
- } else if (xendev->state != XenbusStateConnected) {
- printk(KERN_WARNING "XENBUS: Timeout connecting "
- "to device: %s (state %d)\n",
- xendev->nodename, xendev->state);
- }
-
- return 0;
-}
-
-/* We only wait for device setup after most initcalls have run. */
-static int ready_to_wait_for_devices;
-
-/*
- * On a 10 second timeout, wait for all devices currently configured. We need
- * to do this to guarantee that the filesystems and / or network devices
- * needed for boot are available, before we can allow the boot to proceed.
- *
- * This needs to be on a late_initcall, to happen after the frontend device
- * drivers have been initialised, but before the root fs is mounted.
- *
- * A possible improvement here would be to have the tools add a per-device
- * flag to the store entry, indicating whether it is needed at boot time.
- * This would allow people who knew what they were doing to accelerate their
- * boot slightly, but of course needs tools or manual intervention to set up
- * those flags correctly.
- */
-static void wait_for_devices(struct xenbus_driver *xendrv)
-{
- unsigned long timeout = jiffies + 10*HZ;
- struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
-
- if (!ready_to_wait_for_devices || !xen_domain())
- return;
-
- while (exists_disconnected_device(drv)) {
- if (time_after(jiffies, timeout))
- break;
- schedule_timeout_interruptible(HZ/10);
- }
-
- bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
- print_device_status);
-}
-
-#ifndef MODULE
-static int __init boot_wait_for_devices(void)
-{
- ready_to_wait_for_devices = 1;
- wait_for_devices(NULL);
- return 0;
-}
-
-late_initcall(boot_wait_for_devices);
-#endif
diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h
index 6c5e3185a6a2..24665812316a 100644
--- a/drivers/xen/xenbus/xenbus_probe.h
+++ b/drivers/xen/xenbus/xenbus_probe.h
@@ -36,26 +36,15 @@
#define XEN_BUS_ID_SIZE 20
-#ifdef CONFIG_XEN_BACKEND
-extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
-extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
-extern void xenbus_backend_probe_and_watch(void);
-extern int xenbus_backend_bus_register(void);
-extern void xenbus_backend_bus_unregister(void);
-#else
-static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
-static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
-static inline void xenbus_backend_probe_and_watch(void) {}
-static inline int xenbus_backend_bus_register(void) { return 0; }
-static inline void xenbus_backend_bus_unregister(void) {}
-#endif
-
struct xen_bus_type
{
char *root;
unsigned int levels;
int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
- int (*probe)(const char *type, const char *dir);
+ int (*probe)(struct xen_bus_type *bus, const char *type,
+ const char *dir);
+ void (*otherend_changed)(struct xenbus_watch *watch, const char **vec,
+ unsigned int len);
struct bus_type bus;
};
@@ -73,4 +62,16 @@ extern int xenbus_probe_devices(struct xen_bus_type *bus);
extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
+extern void xenbus_dev_shutdown(struct device *_dev);
+
+extern int xenbus_dev_suspend(struct device *dev, pm_message_t state);
+extern int xenbus_dev_resume(struct device *dev);
+
+extern void xenbus_otherend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len,
+ int ignore_on_shutdown);
+
+extern int xenbus_read_otherend_details(struct xenbus_device *xendev,
+ char *id_node, char *path_node);
+
#endif
diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c
new file mode 100644
index 000000000000..6cf467bf63ec
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
@@ -0,0 +1,276 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have (backend half).
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005, 2006 XenSource Ltd
+ * Copyright (C) 2007 Solarflare Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define DPRINTK(fmt, args...) \
+ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
+ __func__, __LINE__, ##args)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/features.h>
+
+#include "xenbus_comms.h"
+#include "xenbus_probe.h"
+
+/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
+static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+{
+ int domid, err;
+ const char *devid, *type, *frontend;
+ unsigned int typelen;
+
+ type = strchr(nodename, '/');
+ if (!type)
+ return -EINVAL;
+ type++;
+ typelen = strcspn(type, "/");
+ if (!typelen || type[typelen] != '/')
+ return -EINVAL;
+
+ devid = strrchr(nodename, '/') + 1;
+
+ err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
+ "frontend", NULL, &frontend,
+ NULL);
+ if (err)
+ return err;
+ if (strlen(frontend) == 0)
+ err = -ERANGE;
+ if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
+ err = -ENOENT;
+ kfree(frontend);
+
+ if (err)
+ return err;
+
+ if (snprintf(bus_id, XEN_BUS_ID_SIZE, "%.*s-%i-%s",
+ typelen, type, domid, devid) >= XEN_BUS_ID_SIZE)
+ return -ENOSPC;
+ return 0;
+}
+
+static int xenbus_uevent_backend(struct device *dev,
+ struct kobj_uevent_env *env)
+{
+ struct xenbus_device *xdev;
+ struct xenbus_driver *drv;
+ struct xen_bus_type *bus;
+
+ DPRINTK("");
+
+ if (dev == NULL)
+ return -ENODEV;
+
+ xdev = to_xenbus_device(dev);
+ bus = container_of(xdev->dev.bus, struct xen_bus_type, bus);
+ if (xdev == NULL)
+ return -ENODEV;
+
+ /* stuff we want to pass to /sbin/hotplug */
+ if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype))
+ return -ENOMEM;
+
+ if (add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename))
+ return -ENOMEM;
+
+ if (add_uevent_var(env, "XENBUS_BASE_PATH=%s", bus->root))
+ return -ENOMEM;
+
+ if (dev->driver) {
+ drv = to_xenbus_driver(dev->driver);
+ if (drv && drv->uevent)
+ return drv->uevent(xdev, env);
+ }
+
+ return 0;
+}
+
+/* backend/<typename>/<frontend-uuid>/<name> */
+static int xenbus_probe_backend_unit(struct xen_bus_type *bus,
+ const char *dir,
+ const char *type,
+ const char *name)
+{
+ char *nodename;
+ int err;
+
+ nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
+ if (!nodename)
+ return -ENOMEM;
+
+ DPRINTK("%s\n", nodename);
+
+ err = xenbus_probe_node(bus, type, nodename);
+ kfree(nodename);
+ return err;
+}
+
+/* backend/<typename>/<frontend-domid> */
+static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type,
+ const char *domid)
+{
+ char *nodename;
+ int err = 0;
+ char **dir;
+ unsigned int i, dir_n = 0;
+
+ DPRINTK("");
+
+ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, domid);
+ if (!nodename)
+ return -ENOMEM;
+
+ dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n);
+ if (IS_ERR(dir)) {
+ kfree(nodename);
+ return PTR_ERR(dir);
+ }
+
+ for (i = 0; i < dir_n; i++) {
+ err = xenbus_probe_backend_unit(bus, nodename, type, dir[i]);
+ if (err)
+ break;
+ }
+ kfree(dir);
+ kfree(nodename);
+ return err;
+}
+
+static void frontend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ xenbus_otherend_changed(watch, vec, len, 0);
+}
+
+static struct device_attribute xenbus_backend_dev_attrs[] = {
+ __ATTR_NULL
+};
+
+static struct xen_bus_type xenbus_backend = {
+ .root = "backend",
+ .levels = 3, /* backend/type/<frontend>/<id> */
+ .get_bus_id = backend_bus_id,
+ .probe = xenbus_probe_backend,
+ .otherend_changed = frontend_changed,
+ .bus = {
+ .name = "xen-backend",
+ .match = xenbus_match,
+ .uevent = xenbus_uevent_backend,
+ .probe = xenbus_dev_probe,
+ .remove = xenbus_dev_remove,
+ .shutdown = xenbus_dev_shutdown,
+ .dev_attrs = xenbus_backend_dev_attrs,
+ },
+};
+
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ DPRINTK("");
+
+ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
+}
+
+static struct xenbus_watch be_watch = {
+ .node = "backend",
+ .callback = backend_changed,
+};
+
+static int read_frontend_details(struct xenbus_device *xendev)
+{
+ return xenbus_read_otherend_details(xendev, "frontend-id", "frontend");
+}
+
+int xenbus_dev_is_online(struct xenbus_device *dev)
+{
+ int rc, val;
+
+ rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val);
+ if (rc != 1)
+ val = 0; /* no online node present */
+
+ return val;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
+
+int __xenbus_register_backend(struct xenbus_driver *drv,
+ struct module *owner, const char *mod_name)
+{
+ drv->read_otherend_details = read_frontend_details;
+
+ return xenbus_register_driver_common(drv, &xenbus_backend,
+ owner, mod_name);
+}
+EXPORT_SYMBOL_GPL(__xenbus_register_backend);
+
+static int backend_probe_and_watch(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ /* Enumerate devices in xenstore and watch for changes. */
+ xenbus_probe_devices(&xenbus_backend);
+ register_xenbus_watch(&be_watch);
+
+ return NOTIFY_DONE;
+}
+
+static int __init xenbus_probe_backend_init(void)
+{
+ static struct notifier_block xenstore_notifier = {
+ .notifier_call = backend_probe_and_watch
+ };
+ int err;
+
+ DPRINTK("");
+
+ /* Register ourselves with the kernel bus subsystem */
+ err = bus_register(&xenbus_backend.bus);
+ if (err)
+ return err;
+
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+subsys_initcall(xenbus_probe_backend_init);
diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c
new file mode 100644
index 000000000000..5bcc2d6cf129
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
@@ -0,0 +1,294 @@
+#define DPRINTK(fmt, args...) \
+ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
+ __func__, __LINE__, ##args)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/page.h>
+
+#include <xen/platform_pci.h>
+
+#include "xenbus_comms.h"
+#include "xenbus_probe.h"
+
+
+/* device/<type>/<id> => <type>-<id> */
+static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+{
+ nodename = strchr(nodename, '/');
+ if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
+ printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
+ return -EINVAL;
+ }
+
+ strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
+ if (!strchr(bus_id, '/')) {
+ printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
+ return -EINVAL;
+ }
+ *strchr(bus_id, '/') = '-';
+ return 0;
+}
+
+/* device/<typename>/<name> */
+static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type,
+ const char *name)
+{
+ char *nodename;
+ int err;
+
+ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name);
+ if (!nodename)
+ return -ENOMEM;
+
+ DPRINTK("%s", nodename);
+
+ err = xenbus_probe_node(bus, type, nodename);
+ kfree(nodename);
+ return err;
+}
+
+static int xenbus_uevent_frontend(struct device *_dev,
+ struct kobj_uevent_env *env)
+{
+ struct xenbus_device *dev = to_xenbus_device(_dev);
+
+ if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
+ return -ENOMEM;
+
+ return 0;
+}
+
+
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ xenbus_otherend_changed(watch, vec, len, 1);
+}
+
+static struct device_attribute xenbus_frontend_dev_attrs[] = {
+ __ATTR_NULL
+};
+
+static struct xen_bus_type xenbus_frontend = {
+ .root = "device",
+ .levels = 2, /* device/type/<id> */
+ .get_bus_id = frontend_bus_id,
+ .probe = xenbus_probe_frontend,
+ .otherend_changed = backend_changed,
+ .bus = {
+ .name = "xen",
+ .match = xenbus_match,
+ .uevent = xenbus_uevent_frontend,
+ .probe = xenbus_dev_probe,
+ .remove = xenbus_dev_remove,
+ .shutdown = xenbus_dev_shutdown,
+ .dev_attrs = xenbus_frontend_dev_attrs,
+
+ .suspend = xenbus_dev_suspend,
+ .resume = xenbus_dev_resume,
+ },
+};
+
+static void frontend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ DPRINTK("");
+
+ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
+}
+
+
+/* We watch for devices appearing and vanishing. */
+static struct xenbus_watch fe_watch = {
+ .node = "device",
+ .callback = frontend_changed,
+};
+
+static int read_backend_details(struct xenbus_device *xendev)
+{
+ return xenbus_read_otherend_details(xendev, "backend-id", "backend");
+}
+
+static int is_device_connecting(struct device *dev, void *data)
+{
+ struct xenbus_device *xendev = to_xenbus_device(dev);
+ struct device_driver *drv = data;
+ struct xenbus_driver *xendrv;
+
+ /*
+ * A device with no driver will never connect. We care only about
+ * devices which should currently be in the process of connecting.
+ */
+ if (!dev->driver)
+ return 0;
+
+ /* Is this search limited to a particular driver? */
+ if (drv && (dev->driver != drv))
+ return 0;
+
+ xendrv = to_xenbus_driver(dev->driver);
+ return (xendev->state < XenbusStateConnected ||
+ (xendev->state == XenbusStateConnected &&
+ xendrv->is_ready && !xendrv->is_ready(xendev)));
+}
+
+static int exists_connecting_device(struct device_driver *drv)
+{
+ return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+ is_device_connecting);
+}
+
+static int print_device_status(struct device *dev, void *data)
+{
+ struct xenbus_device *xendev = to_xenbus_device(dev);
+ struct device_driver *drv = data;
+
+ /* Is this operation limited to a particular driver? */
+ if (drv && (dev->driver != drv))
+ return 0;
+
+ if (!dev->driver) {
+ /* Information only: is this too noisy? */
+ printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
+ xendev->nodename);
+ } else if (xendev->state < XenbusStateConnected) {
+ enum xenbus_state rstate = XenbusStateUnknown;
+ if (xendev->otherend)
+ rstate = xenbus_read_driver_state(xendev->otherend);
+ printk(KERN_WARNING "XENBUS: Timeout connecting "
+ "to device: %s (local state %d, remote state %d)\n",
+ xendev->nodename, xendev->state, rstate);
+ }
+
+ return 0;
+}
+
+/* We only wait for device setup after most initcalls have run. */
+static int ready_to_wait_for_devices;
+
+/*
+ * On a 5-minute timeout, wait for all devices currently configured. We need
+ * to do this to guarantee that the filesystems and / or network devices
+ * needed for boot are available, before we can allow the boot to proceed.
+ *
+ * This needs to be on a late_initcall, to happen after the frontend device
+ * drivers have been initialised, but before the root fs is mounted.
+ *
+ * A possible improvement here would be to have the tools add a per-device
+ * flag to the store entry, indicating whether it is needed at boot time.
+ * This would allow people who knew what they were doing to accelerate their
+ * boot slightly, but of course needs tools or manual intervention to set up
+ * those flags correctly.
+ */
+static void wait_for_devices(struct xenbus_driver *xendrv)
+{
+ unsigned long start = jiffies;
+ struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
+ unsigned int seconds_waited = 0;
+
+ if (!ready_to_wait_for_devices || !xen_domain())
+ return;
+
+ while (exists_connecting_device(drv)) {
+ if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
+ if (!seconds_waited)
+ printk(KERN_WARNING "XENBUS: Waiting for "
+ "devices to initialise: ");
+ seconds_waited += 5;
+ printk("%us...", 300 - seconds_waited);
+ if (seconds_waited == 300)
+ break;
+ }
+
+ schedule_timeout_interruptible(HZ/10);
+ }
+
+ if (seconds_waited)
+ printk("\n");
+
+ bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+ print_device_status);
+}
+
+int __xenbus_register_frontend(struct xenbus_driver *drv,
+ struct module *owner, const char *mod_name)
+{
+ int ret;
+
+ drv->read_otherend_details = read_backend_details;
+
+ ret = xenbus_register_driver_common(drv, &xenbus_frontend,
+ owner, mod_name);
+ if (ret)
+ return ret;
+
+ /* If this driver is loaded as a module wait for devices to attach. */
+ wait_for_devices(drv);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
+
+static int frontend_probe_and_watch(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ /* Enumerate devices in xenstore and watch for changes. */
+ xenbus_probe_devices(&xenbus_frontend);
+ register_xenbus_watch(&fe_watch);
+
+ return NOTIFY_DONE;
+}
+
+
+static int __init xenbus_probe_frontend_init(void)
+{
+ static struct notifier_block xenstore_notifier = {
+ .notifier_call = frontend_probe_and_watch
+ };
+ int err;
+
+ DPRINTK("");
+
+ /* Register ourselves with the kernel bus subsystem */
+ err = bus_register(&xenbus_frontend.bus);
+ if (err)
+ return err;
+
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+subsys_initcall(xenbus_probe_frontend_init);
+
+#ifndef MODULE
+static int __init boot_wait_for_devices(void)
+{
+ if (xen_hvm_domain() && !xen_platform_pci_unplug)
+ return -ENODEV;
+
+ ready_to_wait_for_devices = 1;
+ wait_for_devices(NULL);
+ return 0;
+}
+
+late_initcall(boot_wait_for_devices);
+#endif
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index 6f91e8c8b932..5534690075af 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -534,7 +534,7 @@ int xenbus_printf(struct xenbus_transaction t,
#define PRINTF_BUFFER_SIZE 4096
char *printf_buffer;
- printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
+ printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_NOIO | __GFP_HIGH);
if (printf_buffer == NULL)
return -ENOMEM;
diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c
index a240b2c20b99..b91f8ff50d05 100644
--- a/drivers/xen/xencomm.c
+++ b/drivers/xen/xencomm.c
@@ -18,8 +18,8 @@
* Authors: Hollis Blanchard <hollisb@us.ibm.com>
*/
-#include <linux/gfp.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <asm/page.h>
#include <xen/xencomm.h>
#include <xen/interface/xen.h>
diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
index 25275c3bbdff..4fde9440fe1f 100644
--- a/drivers/xen/xenfs/Makefile
+++ b/drivers/xen/xenfs/Makefile
@@ -1,3 +1,4 @@
obj-$(CONFIG_XENFS) += xenfs.o
-xenfs-objs = super.o xenbus.o \ No newline at end of file
+xenfs-y = super.o xenbus.o privcmd.o
+xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c
new file mode 100644
index 000000000000..dbd3b16fd131
--- /dev/null
+++ b/drivers/xen/xenfs/privcmd.c
@@ -0,0 +1,400 @@
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
+#endif
+
+static long privcmd_ioctl_hypercall(void __user *udata)
+{
+ struct privcmd_hypercall hypercall;
+ long ret;
+
+ if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
+ return -EFAULT;
+
+ ret = privcmd_call(hypercall.op,
+ hypercall.arg[0], hypercall.arg[1],
+ hypercall.arg[2], hypercall.arg[3],
+ hypercall.arg[4]);
+
+ return ret;
+}
+
+static void free_page_list(struct list_head *pages)
+{
+ struct page *p, *n;
+
+ list_for_each_entry_safe(p, n, pages, lru)
+ __free_page(p);
+
+ INIT_LIST_HEAD(pages);
+}
+
+/*
+ * Given an array of items in userspace, return a list of pages
+ * containing the data. If copying fails, either because of memory
+ * allocation failure or a problem reading user memory, return an
+ * error code; its up to the caller to dispose of any partial list.
+ */
+static int gather_array(struct list_head *pagelist,
+ unsigned nelem, size_t size,
+ void __user *data)
+{
+ unsigned pageidx;
+ void *pagedata;
+ int ret;
+
+ if (size > PAGE_SIZE)
+ return 0;
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* quiet, gcc */
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page = alloc_page(GFP_KERNEL);
+
+ ret = -ENOMEM;
+ if (page == NULL)
+ goto fail;
+
+ pagedata = page_address(page);
+
+ list_add_tail(&page->lru, pagelist);
+ pageidx = 0;
+ }
+
+ ret = -EFAULT;
+ if (copy_from_user(pagedata + pageidx, data, size))
+ goto fail;
+
+ data += size;
+ pageidx += size;
+ }
+
+ ret = 0;
+
+fail:
+ return ret;
+}
+
+/*
+ * Call function "fn" on each element of the array fragmented
+ * over a list of pages.
+ */
+static int traverse_pages(unsigned nelem, size_t size,
+ struct list_head *pos,
+ int (*fn)(void *data, void *state),
+ void *state)
+{
+ void *pagedata;
+ unsigned pageidx;
+ int ret = 0;
+
+ BUG_ON(size > PAGE_SIZE);
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* hush, gcc */
+
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page;
+ pos = pos->next;
+ page = list_entry(pos, struct page, lru);
+ pagedata = page_address(page);
+ pageidx = 0;
+ }
+
+ ret = (*fn)(pagedata + pageidx, state);
+ if (ret)
+ break;
+ pageidx += size;
+ }
+
+ return ret;
+}
+
+struct mmap_mfn_state {
+ unsigned long va;
+ struct vm_area_struct *vma;
+ domid_t domain;
+};
+
+static int mmap_mfn_range(void *data, void *state)
+{
+ struct privcmd_mmap_entry *msg = data;
+ struct mmap_mfn_state *st = state;
+ struct vm_area_struct *vma = st->vma;
+ int rc;
+
+ /* Do not allow range to wrap the address space. */
+ if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
+ ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
+ return -EINVAL;
+
+ /* Range chunks must be contiguous in va space. */
+ if ((msg->va != st->va) ||
+ ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
+ return -EINVAL;
+
+ rc = xen_remap_domain_mfn_range(vma,
+ msg->va & PAGE_MASK,
+ msg->mfn, msg->npages,
+ vma->vm_page_prot,
+ st->domain);
+ if (rc < 0)
+ return rc;
+
+ st->va += msg->npages << PAGE_SHIFT;
+
+ return 0;
+}
+
+static long privcmd_ioctl_mmap(void __user *udata)
+{
+ struct privcmd_mmap mmapcmd;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int rc;
+ LIST_HEAD(pagelist);
+ struct mmap_mfn_state state;
+
+ if (!xen_initial_domain())
+ return -EPERM;
+
+ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+ return -EFAULT;
+
+ rc = gather_array(&pagelist,
+ mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+ mmapcmd.entry);
+
+ if (rc || list_empty(&pagelist))
+ goto out;
+
+ down_write(&mm->mmap_sem);
+
+ {
+ struct page *page = list_first_entry(&pagelist,
+ struct page, lru);
+ struct privcmd_mmap_entry *msg = page_address(page);
+
+ vma = find_vma(mm, msg->va);
+ rc = -EINVAL;
+
+ if (!vma || (msg->va != vma->vm_start) ||
+ !privcmd_enforce_singleshot_mapping(vma))
+ goto out_up;
+ }
+
+ state.va = vma->vm_start;
+ state.vma = vma;
+ state.domain = mmapcmd.dom;
+
+ rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+ &pagelist,
+ mmap_mfn_range, &state);
+
+
+out_up:
+ up_write(&mm->mmap_sem);
+
+out:
+ free_page_list(&pagelist);
+
+ return rc;
+}
+
+struct mmap_batch_state {
+ domid_t domain;
+ unsigned long va;
+ struct vm_area_struct *vma;
+ int err;
+
+ xen_pfn_t __user *user;
+};
+
+static int mmap_batch_fn(void *data, void *state)
+{
+ xen_pfn_t *mfnp = data;
+ struct mmap_batch_state *st = state;
+
+ if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
+ st->vma->vm_page_prot, st->domain) < 0) {
+ *mfnp |= 0xf0000000U;
+ st->err++;
+ }
+ st->va += PAGE_SIZE;
+
+ return 0;
+}
+
+static int mmap_return_errors(void *data, void *state)
+{
+ xen_pfn_t *mfnp = data;
+ struct mmap_batch_state *st = state;
+
+ return put_user(*mfnp, st->user++);
+}
+
+static struct vm_operations_struct privcmd_vm_ops;
+
+static long privcmd_ioctl_mmap_batch(void __user *udata)
+{
+ int ret;
+ struct privcmd_mmapbatch m;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long nr_pages;
+ LIST_HEAD(pagelist);
+ struct mmap_batch_state state;
+
+ if (!xen_initial_domain())
+ return -EPERM;
+
+ if (copy_from_user(&m, udata, sizeof(m)))
+ return -EFAULT;
+
+ nr_pages = m.num;
+ if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
+ return -EINVAL;
+
+ ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
+ m.arr);
+
+ if (ret || list_empty(&pagelist))
+ goto out;
+
+ down_write(&mm->mmap_sem);
+
+ vma = find_vma(mm, m.addr);
+ ret = -EINVAL;
+ if (!vma ||
+ vma->vm_ops != &privcmd_vm_ops ||
+ (m.addr != vma->vm_start) ||
+ ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
+ !privcmd_enforce_singleshot_mapping(vma)) {
+ up_write(&mm->mmap_sem);
+ goto out;
+ }
+
+ state.domain = m.dom;
+ state.vma = vma;
+ state.va = m.addr;
+ state.err = 0;
+
+ ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+ &pagelist, mmap_batch_fn, &state);
+
+ up_write(&mm->mmap_sem);
+
+ if (state.err > 0) {
+ state.user = m.arr;
+ ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+ &pagelist,
+ mmap_return_errors, &state);
+ }
+
+out:
+ free_page_list(&pagelist);
+
+ return ret;
+}
+
+static long privcmd_ioctl(struct file *file,
+ unsigned int cmd, unsigned long data)
+{
+ int ret = -ENOSYS;
+ void __user *udata = (void __user *) data;
+
+ switch (cmd) {
+ case IOCTL_PRIVCMD_HYPERCALL:
+ ret = privcmd_ioctl_hypercall(udata);
+ break;
+
+ case IOCTL_PRIVCMD_MMAP:
+ ret = privcmd_ioctl_mmap(udata);
+ break;
+
+ case IOCTL_PRIVCMD_MMAPBATCH:
+ ret = privcmd_ioctl_mmap_batch(udata);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+ vma, vma->vm_start, vma->vm_end,
+ vmf->pgoff, vmf->virtual_address);
+
+ return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct privcmd_vm_ops = {
+ .fault = privcmd_fault
+};
+
+static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ /* Unsupported for auto-translate guests. */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return -ENOSYS;
+
+ /* DONTCOPY is essential for Xen because copy_page_range doesn't know
+ * how to recreate these mappings */
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
+ vma->vm_ops = &privcmd_vm_ops;
+ vma->vm_private_data = NULL;
+
+ return 0;
+}
+
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
+{
+ return (xchg(&vma->vm_private_data, (void *)1) == NULL);
+}
+#endif
+
+const struct file_operations privcmd_file_ops = {
+ .unlocked_ioctl = privcmd_ioctl,
+ .mmap = privcmd_mmap,
+};
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index 6559e0c752ce..1aa389719846 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -13,6 +13,8 @@
#include <linux/fs.h>
#include <linux/magic.h>
+#include <xen/xen.h>
+
#include "xenfs.h"
#include <asm/xen/hypervisor.h>
@@ -20,6 +22,46 @@
MODULE_DESCRIPTION("Xen filesystem");
MODULE_LICENSE("GPL");
+static struct inode *xenfs_make_inode(struct super_block *sb, int mode)
+{
+ struct inode *ret = new_inode(sb);
+
+ if (ret) {
+ ret->i_mode = mode;
+ ret->i_uid = ret->i_gid = 0;
+ ret->i_blocks = 0;
+ ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
+ }
+ return ret;
+}
+
+static struct dentry *xenfs_create_file(struct super_block *sb,
+ struct dentry *parent,
+ const char *name,
+ const struct file_operations *fops,
+ void *data,
+ int mode)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+
+ dentry = d_alloc_name(parent, name);
+ if (!dentry)
+ return NULL;
+
+ inode = xenfs_make_inode(sb, S_IFREG | mode);
+ if (!inode) {
+ dput(dentry);
+ return NULL;
+ }
+
+ inode->i_fop = fops;
+ inode->i_private = data;
+
+ d_add(dentry, inode);
+ return dentry;
+}
+
static ssize_t capabilities_read(struct file *file, char __user *buf,
size_t size, loff_t *off)
{
@@ -33,6 +75,7 @@ static ssize_t capabilities_read(struct file *file, char __user *buf,
static const struct file_operations capabilities_file_ops = {
.read = capabilities_read,
+ .llseek = default_llseek,
};
static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
@@ -41,29 +84,42 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
[1] = {},
{ "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
{ "capabilities", &capabilities_file_ops, S_IRUGO },
+ { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR },
{""},
};
+ int rc;
+
+ rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
+ if (rc < 0)
+ return rc;
+
+ if (xen_initial_domain()) {
+ xenfs_create_file(sb, sb->s_root, "xsd_kva",
+ &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR);
+ xenfs_create_file(sb, sb->s_root, "xsd_port",
+ &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR);
+ }
- return simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
+ return rc;
}
-static int xenfs_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *data, struct vfsmount *mnt)
+static struct dentry *xenfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data)
{
- return get_sb_single(fs_type, flags, data, xenfs_fill_super, mnt);
+ return mount_single(fs_type, flags, data, xenfs_fill_super);
}
static struct file_system_type xenfs_type = {
.owner = THIS_MODULE,
.name = "xenfs",
- .get_sb = xenfs_get_sb,
+ .mount = xenfs_mount,
.kill_sb = kill_litter_super,
};
static int __init xenfs_init(void)
{
- if (xen_pv_domain())
+ if (xen_domain())
return register_filesystem(&xenfs_type);
printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n");
@@ -72,7 +128,7 @@ static int __init xenfs_init(void)
static void __exit xenfs_exit(void)
{
- if (xen_pv_domain())
+ if (xen_domain())
unregister_filesystem(&xenfs_type);
}
diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c
index 8f6c7d4b3e4d..bbd000f88af7 100644
--- a/drivers/xen/xenfs/xenbus.c
+++ b/drivers/xen/xenfs/xenbus.c
@@ -43,6 +43,7 @@
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/mutex.h>
+#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
@@ -50,6 +51,7 @@
#include <linux/init.h>
#include <linux/namei.h>
#include <linux/string.h>
+#include <linux/slab.h>
#include "xenfs.h"
#include "../xenbus/xenbus_comms.h"
@@ -587,4 +589,5 @@ const struct file_operations xenbus_file_ops = {
.open = xenbus_file_open,
.release = xenbus_file_release,
.poll = xenbus_file_poll,
+ .llseek = no_llseek,
};
diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
index 51f08b2d0bf1..b68aa6200003 100644
--- a/drivers/xen/xenfs/xenfs.h
+++ b/drivers/xen/xenfs/xenfs.h
@@ -2,5 +2,8 @@
#define _XENFS_XENBUS_H
extern const struct file_operations xenbus_file_ops;
+extern const struct file_operations privcmd_file_ops;
+extern const struct file_operations xsd_kva_file_ops;
+extern const struct file_operations xsd_port_file_ops;
#endif /* _XENFS_XENBUS_H */
diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c
new file mode 100644
index 000000000000..fef20dbc6a5c
--- /dev/null
+++ b/drivers/xen/xenfs/xenstored.c
@@ -0,0 +1,68 @@
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+
+#include <xen/page.h>
+
+#include "xenfs.h"
+#include "../xenbus/xenbus_comms.h"
+
+static ssize_t xsd_read(struct file *file, char __user *buf,
+ size_t size, loff_t *off)
+{
+ const char *str = (const char *)file->private_data;
+ return simple_read_from_buffer(buf, size, off, str, strlen(str));
+}
+
+static int xsd_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static int xsd_kva_open(struct inode *inode, struct file *file)
+{
+ file->private_data = (void *)kasprintf(GFP_KERNEL, "0x%p",
+ xen_store_interface);
+ if (!file->private_data)
+ return -ENOMEM;
+ return 0;
+}
+
+static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ size_t size = vma->vm_end - vma->vm_start;
+
+ if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
+ return -EINVAL;
+
+ if (remap_pfn_range(vma, vma->vm_start,
+ virt_to_pfn(xen_store_interface),
+ size, vma->vm_page_prot))
+ return -EAGAIN;
+
+ return 0;
+}
+
+const struct file_operations xsd_kva_file_ops = {
+ .open = xsd_kva_open,
+ .mmap = xsd_kva_mmap,
+ .read = xsd_read,
+ .release = xsd_release,
+};
+
+static int xsd_port_open(struct inode *inode, struct file *file)
+{
+ file->private_data = (void *)kasprintf(GFP_KERNEL, "%d",
+ xen_store_evtchn);
+ if (!file->private_data)
+ return -ENOMEM;
+ return 0;
+}
+
+const struct file_operations xsd_port_file_ops = {
+ .open = xsd_port_open,
+ .read = xsd_read,
+ .release = xsd_release,
+};