28 files changed, 3953 insertions, 554 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index cab100acf983..07bec09d1dad 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -1,6 +1,8 @@
+menu "Xen driver support"
+	depends on XEN
+
 config XEN_BALLOON
 	bool "Xen memory balloon driver"
-	depends on XEN
 	default y
 	help
 	  The balloon driver allows the Xen domain to request more memory from
@@ -20,7 +22,6 @@ config XEN_SCRUB_PAGES
 
 config XEN_DEV_EVTCHN
 	tristate "Xen /dev/xen/evtchn device"
-	depends on XEN
 	default y
 	help
 	  The evtchn driver allows a userspace process to triger event
@@ -28,9 +29,16 @@ config XEN_DEV_EVTCHN
 	  firing.
 	  If in doubt, say yes.
 
+config XEN_BACKEND
+	bool "Backend driver support"
+	depends on XEN_DOM0
+	default y
+	help
+	  Support for backend device drivers that provide I/O services
+	  to other virtual machines.
+
 config XENFS
 	tristate "Xen filesystem"
-	depends on XEN
 	default y
 	help
 	  The xen filesystem provides a way for domains to share
@@ -53,11 +61,38 @@ config XEN_COMPAT_XENFS
 
 config XEN_SYS_HYPERVISOR
        bool "Create xen entries under /sys/hypervisor"
-       depends on XEN && SYSFS
+       depends on SYSFS
        select SYS_HYPERVISOR
        default y
        help
          Create entries under /sys/hypervisor describing the Xen
 	 hypervisor environment.  When running native or in another
 	 virtual environment, /sys/hypervisor will still be present,
-	 but will have no xen contents.
-\ No newline at end of file
+	 but will have no xen contents.
+
+config XEN_XENBUS_FRONTEND
+	tristate
+
+config XEN_GNTDEV
+	tristate "userspace grant access device driver"
+	depends on XEN
+	select MMU_NOTIFIER
+	help
+	  Allows userspace processes to use grants.
+
+config XEN_PLATFORM_PCI
+	tristate "xen platform pci device driver"
+	depends on XEN_PVHVM && PCI
+	default m
+	help
+	  Driver for the Xen PCI Platform device: it is responsible for
+	  initializing xenbus and grant_table when running in a Xen HVM
+	  domain. As a consequence this driver is required to run any Xen PV
+	  frontend on Xen HVM.
+
+config SWIOTLB_XEN
+	def_bool y
+	depends on PCI
+	select SWIOTLB
+
+endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index ec2a39b1e26f..5088cc2e6fe2 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,9 +1,22 @@
 obj-y	+= grant-table.o features.o events.o manage.o
 obj-y	+= xenbus/
 
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_features.o			:= $(nostackp)
+
+obj-$(CONFIG_BLOCK)		+= biomerge.o
 obj-$(CONFIG_HOTPLUG_CPU)	+= cpu_hotplug.o
 obj-$(CONFIG_XEN_XENCOMM)	+= xencomm.o
 obj-$(CONFIG_XEN_BALLOON)	+= balloon.o
-obj-$(CONFIG_XEN_DEV_EVTCHN)	+= evtchn.o
+obj-$(CONFIG_XEN_DEV_EVTCHN)	+= xen-evtchn.o
+obj-$(CONFIG_XEN_GNTDEV)	+= xen-gntdev.o
 obj-$(CONFIG_XENFS)		+= xenfs/
-obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
-\ No newline at end of file
+obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
+obj-$(CONFIG_XEN_PLATFORM_PCI)	+= xen-platform-pci.o
+obj-$(CONFIG_SWIOTLB_XEN)	+= swiotlb-xen.o
+obj-$(CONFIG_XEN_DOM0)		+= pci.o
+
+xen-evtchn-y			:= evtchn.o
+xen-gntdev-y				:= gntdev.o
+
+xen-platform-pci-y		:= platform-pci.o
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index f5bbd9e83416..43f9f02c7db0 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -43,15 +43,19 @@
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/sysdev.h>
+#include <linux/gfp.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
+#include <asm/e820.h>
 
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/memory.h>
 #include <xen/xenbus.h>
@@ -66,8 +70,6 @@ struct balloon_stats {
 	/* We aim for 'current allocation' == 'target allocation'. */
 	unsigned long current_pages;
 	unsigned long target_pages;
-	/* We may hit the hard limit in Xen. If we do then we remember it. */
-	unsigned long hard_limit;
 	/*
 	 * Drivers may alter the memory reservation independently, but they
 	 * must inform the balloon driver so we avoid hitting the hard limit.
@@ -84,23 +86,12 @@ static struct sys_device balloon_sysdev;
 
 static int register_balloon(struct sys_device *sysdev);
 
-/*
- * Protects atomic reservation decrease/increase against concurrent increases.
- * Also protects non-atomic updates of current_pages and driver_pages, and
- * balloon lists.
- */
-static DEFINE_SPINLOCK(balloon_lock);
-
 static struct balloon_stats balloon_stats;
 
 /* We increase/decrease in batches which fit in a page */
 static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
 
-/* VM /proc information for memory */
-extern unsigned long totalram_pages;
-
 #ifdef CONFIG_HIGHMEM
-extern unsigned long totalhigh_pages;
 #define inc_totalhigh_pages() (totalhigh_pages++)
 #define dec_totalhigh_pages() (totalhigh_pages--)
 #else
@@ -129,7 +120,7 @@ static void scrub_page(struct page *page)
 }
 
 /* balloon_append: add the given page to the balloon. */
-static void balloon_append(struct page *page)
+static void __balloon_append(struct page *page)
 {
 	/* Lowmem is re-populated first, so highmem pages go at list tail. */
 	if (PageHighMem(page)) {
@@ -142,6 +133,12 @@ static void balloon_append(struct page *page)
 	}
 }
 
+static void balloon_append(struct page *page)
+{
+	__balloon_append(page);
+	totalram_pages--;
+}
+
 /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
 static struct page *balloon_retrieve(void)
 {
@@ -160,6 +157,8 @@ static struct page *balloon_retrieve(void)
 	else
 		balloon_stats.balloon_low--;
 
+	totalram_pages++;
+
 	return page;
 }
 
@@ -185,7 +184,7 @@ static void balloon_alarm(unsigned long unused)
 
 static unsigned long current_target(void)
 {
-	unsigned long target = min(balloon_stats.target_pages, balloon_stats.hard_limit);
+	unsigned long target = balloon_stats.target_pages;
 
 	target = min(target,
 		     balloon_stats.current_pages +
@@ -197,7 +196,7 @@ static unsigned long current_target(void)
 
 static int increase_reservation(unsigned long nr_pages)
 {
-	unsigned long  pfn, i, flags;
+	unsigned long  pfn, i;
 	struct page   *page;
 	long           rc;
 	struct xen_memory_reservation reservation = {
@@ -209,35 +208,20 @@ static int increase_reservation(unsigned long nr_pages)
 	if (nr_pages > ARRAY_SIZE(frame_list))
 		nr_pages = ARRAY_SIZE(frame_list);
 
-	spin_lock_irqsave(&balloon_lock, flags);
-
 	page = balloon_first_page();
 	for (i = 0; i < nr_pages; i++) {
 		BUG_ON(page == NULL);
-		frame_list[i] = page_to_pfn(page);;
+		frame_list[i] = page_to_pfn(page);
 		page = balloon_next_page(page);
 	}
 
 	set_xen_guest_handle(reservation.extent_start, frame_list);
 	reservation.nr_extents = nr_pages;
 	rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
-	if (rc < nr_pages) {
-		if (rc > 0) {
-			int ret;
-
-			/* We hit the Xen hard limit: reprobe. */
-			reservation.nr_extents = rc;
-			ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
-						   &reservation);
-			BUG_ON(ret != rc);
-		}
-		if (rc >= 0)
-			balloon_stats.hard_limit = (balloon_stats.current_pages + rc -
-						    balloon_stats.driver_pages);
+	if (rc < 0)
 		goto out;
-	}
 
-	for (i = 0; i < nr_pages; i++) {
+	for (i = 0; i < rc; i++) {
 		page = balloon_retrieve();
 		BUG_ON(page == NULL);
 
@@ -263,18 +247,15 @@ static int increase_reservation(unsigned long nr_pages)
 		__free_page(page);
 	}
 
-	balloon_stats.current_pages += nr_pages;
-	totalram_pages = balloon_stats.current_pages;
+	balloon_stats.current_pages += rc;
 
  out:
-	spin_unlock_irqrestore(&balloon_lock, flags);
-
-	return 0;
+	return rc < 0 ? rc : rc != nr_pages;
 }
 
 static int decrease_reservation(unsigned long nr_pages)
 {
-	unsigned long  pfn, i, flags;
+	unsigned long  pfn, i;
 	struct page   *page;
 	int            need_sleep = 0;
 	int ret;
@@ -312,8 +293,6 @@ static int decrease_reservation(unsigned long nr_pages)
 	kmap_flush_unused();
 	flush_tlb_all();
 
-	spin_lock_irqsave(&balloon_lock, flags);
-
 	/* No more mappings: invalidate P2M and add to balloon. */
 	for (i = 0; i < nr_pages; i++) {
 		pfn = mfn_to_pfn(frame_list[i]);
@@ -327,9 +306,6 @@ static int decrease_reservation(unsigned long nr_pages)
 	BUG_ON(ret != nr_pages);
 
 	balloon_stats.current_pages -= nr_pages;
-	totalram_pages = balloon_stats.current_pages;
-
-	spin_unlock_irqrestore(&balloon_lock, flags);
 
 	return need_sleep;
 }
@@ -371,7 +347,6 @@ static void balloon_process(struct work_struct *work)
 static void balloon_set_new_target(unsigned long target)
 {
 	/* No need for lock. Not read-modify-write updates. */
-	balloon_stats.hard_limit   = ~0UL;
 	balloon_stats.target_pages = target;
 	schedule_work(&balloon_worker);
 }
@@ -417,7 +392,7 @@ static struct notifier_block xenstore_notifier;
 
 static int __init balloon_init(void)
 {
-	unsigned long pfn;
+	unsigned long pfn, extra_pfn_end;
 	struct page *page;
 
 	if (!xen_pv_domain())
@@ -426,12 +401,10 @@ static int __init balloon_init(void)
 	pr_info("xen_balloon: Initialising balloon driver.\n");
 
 	balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
-	totalram_pages   = balloon_stats.current_pages;
 	balloon_stats.target_pages  = balloon_stats.current_pages;
 	balloon_stats.balloon_low   = 0;
 	balloon_stats.balloon_high  = 0;
 	balloon_stats.driver_pages  = 0UL;
-	balloon_stats.hard_limit    = ~0UL;
 
 	init_timer(&balloon_timer);
 	balloon_timer.data = 0;
@@ -439,11 +412,24 @@ static int __init balloon_init(void)
 
 	register_balloon(&balloon_sysdev);
 
-	/* Initialise the balloon with excess memory space. */
-	for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
+	/*
+	 * Initialise the balloon with excess memory space.  We need
+	 * to make sure we don't add memory which doesn't exist or
+	 * logically exist.  The E820 map can be trimmed to be smaller
+	 * than the amount of physical memory due to the mem= command
+	 * line parameter.  And if this is a 32-bit non-HIGHMEM kernel
+	 * on a system with memory which requires highmem to access,
+	 * don't try to use it.
+	 */
+	extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()),
+			    (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size));
+	for (pfn = PFN_UP(xen_extra_mem_start);
+	     pfn < extra_pfn_end;
+	     pfn++) {
 		page = pfn_to_page(pfn);
-		if (!PageReserved(page))
-			balloon_append(page);
+		/* totalram_pages doesn't include the boot-time
+		   balloon extension, so don't subtract from it. */
+		__balloon_append(page);
 	}
 
 	target_watch.callback = watch_target;
@@ -476,9 +462,6 @@ module_exit(balloon_exit);
 BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages));
 BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low));
 BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));
-BALLOON_SHOW(hard_limit_kb,
-	     (balloon_stats.hard_limit!=~0UL) ? "%lu\n" : "???\n",
-	     (balloon_stats.hard_limit!=~0UL) ? PAGES2KB(balloon_stats.hard_limit) : 0);
 BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages));
 
 static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr,
@@ -548,7 +531,6 @@ static struct attribute *balloon_info_attrs[] = {
 	&attr_current_kb.attr,
 	&attr_low_kb.attr,
 	&attr_high_kb.attr,
-	&attr_hard_limit_kb.attr,
 	&attr_driver_kb.attr,
 	NULL
 };
diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c
new file mode 100644
index 000000000000..ba6eda4b5143
--- /dev/null
+++ b/drivers/xen/biomerge.c
@@ -0,0 +1,13 @@
+#include <linux/bio.h>
+#include <linux/io.h>
+#include <xen/page.h>
+
+bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+			       const struct bio_vec *vec2)
+{
+	unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page));
+	unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page));
+
+	return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
+		((mfn1 == mfn2) || ((mfn1+1) == mfn2));
+}
diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
index bdfd584ad853..14e2d995e958 100644
--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -1,5 +1,6 @@
 #include <linux/notifier.h>
 
+#include <xen/xen.h>
 #include <xen/xenbus.h>
 
 #include <asm/xen/hypervisor.h>
@@ -86,7 +87,7 @@ static int setup_cpu_watcher(struct notifier_block *notifier,
 	for_each_possible_cpu(cpu) {
 		if (vcpu_online(cpu) == 0) {
 			(void)cpu_down(cpu);
-			cpu_clear(cpu, cpu_present_map);
+			set_cpu_present(cpu, false);
 		}
 	}
 
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index abad71b1632b..74681478100a 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -16,7 +16,7 @@
  *    (typically dom0).
  * 2. VIRQs, typically used for timers.  These are per-cpu events.
  * 3. IPIs.
- * 4. Hardware interrupts. Not supported at present.
+ * 4. PIRQs - Hardware interrupts.
  *
  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  */
@@ -27,18 +27,28 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/irqnr.h>
+#include <linux/pci.h>
 
+#include <asm/desc.h>
 #include <asm/ptrace.h>
 #include <asm/irq.h>
 #include <asm/idle.h>
+#include <asm/io_apic.h>
 #include <asm/sync_bitops.h>
+#include <asm/xen/pci.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
 
+#include <xen/xen.h>
+#include <xen/hvm.h>
 #include <xen/xen-ops.h>
 #include <xen/events.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/event_channel.h>
+#include <xen/interface/hvm/hvm_op.h>
+#include <xen/interface/hvm/params.h>
 
 /*
  * This lock protects updates to the following mapping and reference-count
@@ -47,10 +57,10 @@
 static DEFINE_SPINLOCK(irq_mapping_update_lock);
 
 /* IRQ <-> VIRQ mapping. */
-static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
 
 /* IRQ <-> IPI mapping */
-static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
+static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
 
 /* Interrupt types. */
 enum xen_irq_type {
@@ -67,7 +77,8 @@ enum xen_irq_type {
  * event channel - irq->event channel mapping
  * cpu - cpu this event channel is bound to
  * index - type-specific information:
- *    PIRQ - vector, with MSB being "needs EIO"
+ *    PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM
+ *           guest, or GSI (real passthrough IRQ) of the device.
  *    VIRQ - virq number
  *    IPI - IPI vector
  *    EVTCHN -
@@ -82,21 +93,29 @@ struct irq_info
 		unsigned short virq;
 		enum ipi_vector ipi;
 		struct {
+			unsigned short pirq;
 			unsigned short gsi;
-			unsigned short vector;
+			unsigned char vector;
+			unsigned char flags;
 		} pirq;
 	} u;
 };
+#define PIRQ_NEEDS_EOI	(1 << 0)
+#define PIRQ_SHAREABLE	(1 << 1)
 
-static struct irq_info irq_info[NR_IRQS];
+static struct irq_info *irq_info;
+static int *pirq_to_irq;
 
-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
-	[0 ... NR_EVENT_CHANNELS-1] = -1
-};
+static int *evtchn_to_irq;
 struct cpu_evtchn_s {
 	unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
 };
-static struct cpu_evtchn_s *cpu_evtchn_mask_p;
+
+static __initdata struct cpu_evtchn_s init_evtchn_mask = {
+	.bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
+};
+static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
+
 static inline unsigned long *cpu_evtchn_mask(int cpu)
 {
 	return cpu_evtchn_mask_p[cpu].bits;
@@ -106,6 +125,8 @@ static inline unsigned long *cpu_evtchn_mask(int cpu)
 #define VALID_EVTCHN(chn)	((chn) != 0)
 
 static struct irq_chip xen_dynamic_chip;
+static struct irq_chip xen_percpu_chip;
+static struct irq_chip xen_pirq_chip;
 
 /* Constructor for packed IRQ information. */
 static struct irq_info mk_unbound_info(void)
@@ -131,11 +152,12 @@ static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq)
 			.cpu = 0, .u.virq = virq };
 }
 
-static struct irq_info mk_pirq_info(unsigned short evtchn,
+static struct irq_info mk_pirq_info(unsigned short evtchn, unsigned short pirq,
 				    unsigned short gsi, unsigned short vector)
 {
 	return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
-			.cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } };
+			.cpu = 0,
+			.u.pirq = { .pirq = pirq, .gsi = gsi, .vector = vector } };
 }
 
 /*
@@ -148,6 +170,9 @@ static struct irq_info *info_for_irq(unsigned irq)
 
 static unsigned int evtchn_from_irq(unsigned irq)
 {
+	if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq)))
+		return 0;
+
 	return info_for_irq(irq)->evtchn;
 }
 
@@ -177,6 +202,16 @@ static unsigned virq_from_irq(unsigned irq)
 	return info->u.virq;
 }
 
+static unsigned pirq_from_irq(unsigned irq)
+{
+	struct irq_info *info = info_for_irq(irq);
+
+	BUG_ON(info == NULL);
+	BUG_ON(info->type != IRQT_PIRQ);
+
+	return info->u.pirq.pirq;
+}
+
 static unsigned gsi_from_irq(unsigned irq)
 {
 	struct irq_info *info = info_for_irq(irq);
@@ -218,6 +253,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn)
 	return ret;
 }
 
+static bool pirq_needs_eoi(unsigned irq)
+{
+	struct irq_info *info = info_for_irq(irq);
+
+	BUG_ON(info->type != IRQT_PIRQ);
+
+	return info->u.pirq.flags & PIRQ_NEEDS_EOI;
+}
+
 static inline unsigned long active_evtchns(unsigned int cpu,
 					   struct shared_info *sh,
 					   unsigned int idx)
@@ -236,17 +280,17 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
 	cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
 #endif
 
-	__clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
-	__set_bit(chn, cpu_evtchn_mask(cpu));
+	clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
+	set_bit(chn, cpu_evtchn_mask(cpu));
 
 	irq_info[irq].cpu = cpu;
 }
 
 static void init_evtchn_cpu_bindings(void)
 {
+	int i;
 #ifdef CONFIG_SMP
 	struct irq_desc *desc;
-	int i;
 
 	/* By default all event channels notify CPU#0. */
 	for_each_irq_desc(i, desc) {
@@ -254,7 +298,10 @@ static void init_evtchn_cpu_bindings(void)
 	}
 #endif
 
-	memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0)));
+	for_each_possible_cpu(i)
+		memset(cpu_evtchn_mask(i),
+		       (i == 0) ? ~0 : 0, sizeof(struct cpu_evtchn_s));
+
 }
 
 static inline void clear_evtchn(int port)
@@ -311,7 +358,7 @@ static void unmask_evtchn(int port)
 		struct evtchn_unmask unmask = { .port = port };
 		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
 	} else {
-		struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+		struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
 
 		sync_clear_bit(port, &s->evtchn_mask[0]);
 
@@ -329,27 +376,435 @@ static void unmask_evtchn(int port)
 	put_cpu();
 }
 
-static int find_unbound_irq(void)
+static int get_nr_hw_irqs(void)
 {
-	int irq;
-	struct irq_desc *desc;
+	int ret = 1;
 
-	for (irq = 0; irq < nr_irqs; irq++)
-		if (irq_info[irq].type == IRQT_UNBOUND)
+#ifdef CONFIG_X86_IO_APIC
+	ret = get_nr_irqs_gsi();
+#endif
+
+	return ret;
+}
+
+static int find_unbound_pirq(int type)
+{
+	int rc, i;
+	struct physdev_get_free_pirq op_get_free_pirq;
+	op_get_free_pirq.type = type;
+
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
+	if (!rc)
+		return op_get_free_pirq.pirq;
+
+	for (i = 0; i < nr_irqs; i++) {
+		if (pirq_to_irq[i] < 0)
+			return i;
+	}
+	return -1;
+}
+
+static int find_unbound_irq(void)
+{
+	struct irq_data *data;
+	int irq, res;
+	int bottom = get_nr_hw_irqs();
+	int top = nr_irqs-1;
+
+	if (bottom == nr_irqs)
+		goto no_irqs;
+
+	/* This loop starts from the top of IRQ space and goes down.
+	 * We need this b/c if we have a PCI device in a Xen PV guest
+	 * we do not have an IO-APIC (though the backend might have them)
+	 * mapped in. To not have a collision of physical IRQs with the Xen
+	 * event channels start at the top of the IRQ space for virtual IRQs.
+	 */
+	for (irq = top; irq > bottom; irq--) {
+		data = irq_get_irq_data(irq);
+		/* only 15->0 have init'd desc; handle irq > 16 */
+		if (!data)
+			break;
+		if (data->chip == &no_irq_chip)
 			break;
+		if (data->chip != &xen_dynamic_chip)
+			continue;
+		if (irq_info[irq].type == IRQT_UNBOUND)
+			return irq;
+	}
+
+	if (irq == bottom)
+		goto no_irqs;
 
-	if (irq == nr_irqs)
-		panic("No available IRQ to bind to: increase nr_irqs!\n");
+	res = irq_alloc_desc_at(irq, -1);
 
-	desc = irq_to_desc_alloc_node(irq, 0);
-	if (WARN_ON(desc == NULL))
+	if (WARN_ON(res != irq))
 		return -1;
 
-	dynamic_irq_init(irq);
+	return irq;
+
+no_irqs:
+	panic("No available IRQ to bind to: increase nr_irqs!\n");
+}
+
+static bool identity_mapped_irq(unsigned irq)
+{
+	/* identity map all the hardware irqs */
+	return irq < get_nr_hw_irqs();
+}
+
+static void pirq_unmask_notify(int irq)
+{
+	struct physdev_eoi eoi = { .irq = pirq_from_irq(irq) };
+
+	if (unlikely(pirq_needs_eoi(irq))) {
+		int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+		WARN_ON(rc);
+	}
+}
+
+static void pirq_query_unmask(int irq)
+{
+	struct physdev_irq_status_query irq_status;
+	struct irq_info *info = info_for_irq(irq);
+
+	BUG_ON(info->type != IRQT_PIRQ);
+
+	irq_status.irq = pirq_from_irq(irq);
+	if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+		irq_status.flags = 0;
+
+	info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
+	if (irq_status.flags & XENIRQSTAT_needs_eoi)
+		info->u.pirq.flags |= PIRQ_NEEDS_EOI;
+}
+
+static bool probing_irq(int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	return desc && desc->action == NULL;
+}
+
+static unsigned int startup_pirq(unsigned int irq)
+{
+	struct evtchn_bind_pirq bind_pirq;
+	struct irq_info *info = info_for_irq(irq);
+	int evtchn = evtchn_from_irq(irq);
+	int rc;
+
+	BUG_ON(info->type != IRQT_PIRQ);
+
+	if (VALID_EVTCHN(evtchn))
+		goto out;
+
+	bind_pirq.pirq = pirq_from_irq(irq);
+	/* NB. We are happy to share unless we are probing. */
+	bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
+					BIND_PIRQ__WILL_SHARE : 0;
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
+	if (rc != 0) {
+		if (!probing_irq(irq))
+			printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
+			       irq);
+		return 0;
+	}
+	evtchn = bind_pirq.port;
+
+	pirq_query_unmask(irq);
+
+	evtchn_to_irq[evtchn] = irq;
+	bind_evtchn_to_cpu(evtchn, 0);
+	info->evtchn = evtchn;
+
+out:
+	unmask_evtchn(evtchn);
+	pirq_unmask_notify(irq);
+
+	return 0;
+}
+
+static void shutdown_pirq(unsigned int irq)
+{
+	struct evtchn_close close;
+	struct irq_info *info = info_for_irq(irq);
+	int evtchn = evtchn_from_irq(irq);
+
+	BUG_ON(info->type != IRQT_PIRQ);
+
+	if (!VALID_EVTCHN(evtchn))
+		return;
+
+	mask_evtchn(evtchn);
+
+	close.port = evtchn;
+	if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+		BUG();
+
+	bind_evtchn_to_cpu(evtchn, 0);
+	evtchn_to_irq[evtchn] = -1;
+	info->evtchn = 0;
+}
+
+static void enable_pirq(unsigned int irq)
+{
+	startup_pirq(irq);
+}
+
+static void disable_pirq(unsigned int irq)
+{
+}
+
+static void ack_pirq(unsigned int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	move_native_irq(irq);
+
+	if (VALID_EVTCHN(evtchn)) {
+		mask_evtchn(evtchn);
+		clear_evtchn(evtchn);
+	}
+}
+
+static void end_pirq(unsigned int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (WARN_ON(!desc))
+		return;
+
+	if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
+	    (IRQ_DISABLED|IRQ_PENDING)) {
+		shutdown_pirq(irq);
+	} else if (VALID_EVTCHN(evtchn)) {
+		unmask_evtchn(evtchn);
+		pirq_unmask_notify(irq);
+	}
+}
+
+static int find_irq_by_gsi(unsigned gsi)
+{
+	int irq;
+
+	for (irq = 0; irq < nr_irqs; irq++) {
+		struct irq_info *info = info_for_irq(irq);
+
+		if (info == NULL || info->type != IRQT_PIRQ)
+			continue;
+
+		if (gsi_from_irq(irq) == gsi)
+			return irq;
+	}
+
+	return -1;
+}
+
+int xen_allocate_pirq(unsigned gsi, int shareable, char *name)
+{
+	return xen_map_pirq_gsi(gsi, gsi, shareable, name);
+}
+
+/* xen_map_pirq_gsi might allocate irqs from the top down, as a
+ * consequence don't assume that the irq number returned has a low value
+ * or can be used as a pirq number unless you know otherwise.
+ *
+ * One notable exception is when xen_map_pirq_gsi is called passing an
+ * hardware gsi as argument, in that case the irq number returned
+ * matches the gsi number passed as second argument.
+ *
+ * Note: We don't assign an event channel until the irq actually started
+ * up.  Return an existing irq if we've already got one for the gsi.
+ */
+int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
+{
+	int irq = 0;
+	struct physdev_irq irq_op;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	if ((pirq > nr_irqs) || (gsi > nr_irqs)) {
+		printk(KERN_WARNING "xen_map_pirq_gsi: %s %s is incorrect!\n",
+			pirq > nr_irqs ? "pirq" :"",
+			gsi > nr_irqs ? "gsi" : "");
+		goto out;
+	}
+
+	irq = find_irq_by_gsi(gsi);
+	if (irq != -1) {
+		printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n",
+		       irq, gsi);
+		goto out;	/* XXX need refcount? */
+	}
+
+	/* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore
+	 * we are using the !xen_initial_domain() to drop in the function.*/
+	if (identity_mapped_irq(gsi) || (!xen_initial_domain() &&
+				xen_pv_domain())) {
+		irq = gsi;
+		irq_alloc_desc_at(irq, -1);
+	} else
+		irq = find_unbound_irq();
+
+	set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
+				      handle_level_irq, name);
+
+	irq_op.irq = irq;
+	irq_op.vector = 0;
+
+	/* Only the privileged domain can do this. For non-priv, the pcifront
+	 * driver provides a PCI bus that does the call to do exactly
+	 * this in the priv domain. */
+	if (xen_initial_domain() &&
+	    HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
+		irq_free_desc(irq);
+		irq = -ENOSPC;
+		goto out;
+	}
+
+	irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector);
+	irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
+	pirq_to_irq[pirq] = irq;
+
+out:
+	spin_unlock(&irq_mapping_update_lock);
 
 	return irq;
 }
 
+#ifdef CONFIG_PCI_MSI
+#include <linux/msi.h>
+#include "../pci/msi.h"
+
+void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc)
+{
+	spin_lock(&irq_mapping_update_lock);
+
+	if (alloc & XEN_ALLOC_IRQ) {
+		*irq = find_unbound_irq();
+		if (*irq == -1)
+			goto out;
+	}
+
+	if (alloc & XEN_ALLOC_PIRQ) {
+		*pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI);
+		if (*pirq == -1)
+			goto out;
+	}
+
+	set_irq_chip_and_handler_name(*irq, &xen_pirq_chip,
+				      handle_level_irq, name);
+
+	irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0);
+	pirq_to_irq[*pirq] = *irq;
+
+out:
+	spin_unlock(&irq_mapping_update_lock);
+}
+
+int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
+{
+	int irq = -1;
+	struct physdev_map_pirq map_irq;
+	int rc;
+	int pos;
+	u32 table_offset, bir;
+
+	memset(&map_irq, 0, sizeof(map_irq));
+	map_irq.domid = DOMID_SELF;
+	map_irq.type = MAP_PIRQ_TYPE_MSI;
+	map_irq.index = -1;
+	map_irq.pirq = -1;
+	map_irq.bus = dev->bus->number;
+	map_irq.devfn = dev->devfn;
+
+	if (type == PCI_CAP_ID_MSIX) {
+		pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+
+		pci_read_config_dword(dev, msix_table_offset_reg(pos),
+					&table_offset);
+		bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+
+		map_irq.table_base = pci_resource_start(dev, bir);
+		map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+	}
+
+	spin_lock(&irq_mapping_update_lock);
+
+	irq = find_unbound_irq();
+
+	if (irq == -1)
+		goto out;
+
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+	if (rc) {
+		printk(KERN_WARNING "xen map irq failed %d\n", rc);
+
+		irq_free_desc(irq);
+
+		irq = -1;
+		goto out;
+	}
+	irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index);
+
+	set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
+			handle_level_irq,
+			(type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
+
+out:
+	spin_unlock(&irq_mapping_update_lock);
+	return irq;
+}
+#endif
+
+int xen_destroy_irq(int irq)
+{
+	struct irq_desc *desc;
+	struct physdev_unmap_pirq unmap_irq;
+	struct irq_info *info = info_for_irq(irq);
+	int rc = -ENOENT;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	desc = irq_to_desc(irq);
+	if (!desc)
+		goto out;
+
+	if (xen_initial_domain()) {
+		unmap_irq.pirq = info->u.pirq.pirq;
+		unmap_irq.domid = DOMID_SELF;
+		rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
+		if (rc) {
+			printk(KERN_WARNING "unmap irq failed %d\n", rc);
+			goto out;
+		}
+		pirq_to_irq[info->u.pirq.pirq] = -1;
+	}
+	irq_info[irq] = mk_unbound_info();
+
+	irq_free_desc(irq);
+
+out:
+	spin_unlock(&irq_mapping_update_lock);
+	return rc;
+}
+
+int xen_vector_from_irq(unsigned irq)
+{
+	return vector_from_irq(irq);
+}
+
+int xen_gsi_from_irq(unsigned irq)
+{
+	return gsi_from_irq(irq);
+}
+
+int xen_irq_from_pirq(unsigned pirq)
+{
+	return pirq_to_irq[pirq];
+}
+
 int bind_evtchn_to_irq(unsigned int evtchn)
 {
 	int irq;
@@ -362,7 +817,7 @@ int bind_evtchn_to_irq(unsigned int evtchn)
 		irq = find_unbound_irq();
 
 		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
-					      handle_level_irq, "event");
+					      handle_fasteoi_irq, "event");
 
 		evtchn_to_irq[evtchn] = irq;
 		irq_info[irq] = mk_evtchn_info(evtchn);
@@ -388,8 +843,8 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 		if (irq < 0)
 			goto out;
 
-		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
-					      handle_level_irq, "ipi");
+		set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
+					      handle_percpu_irq, "ipi");
 
 		bind_ipi.vcpu = cpu;
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
@@ -410,7 +865,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 }
 
 
-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 {
 	struct evtchn_bind_virq bind_virq;
 	int evtchn, irq;
@@ -420,6 +875,11 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 	irq = per_cpu(virq_to_irq, cpu)[virq];
 
 	if (irq == -1) {
+		irq = find_unbound_irq();
+
+		set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
+					      handle_percpu_irq, "virq");
+
 		bind_virq.virq = virq;
 		bind_virq.vcpu = cpu;
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
@@ -427,11 +887,6 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 			BUG();
 		evtchn = bind_virq.port;
 
-		irq = find_unbound_irq();
-
-		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
-					      handle_level_irq, "virq");
-
 		evtchn_to_irq[evtchn] = irq;
 		irq_info[irq] = mk_virq_info(evtchn, virq);
 
@@ -474,9 +929,12 @@ static void unbind_from_irq(unsigned int irq)
 		bind_evtchn_to_cpu(evtchn, 0);
 
 		evtchn_to_irq[evtchn] = -1;
+	}
+
+	if (irq_info[irq].type != IRQT_UNBOUND) {
 		irq_info[irq] = mk_unbound_info();
 
-		dynamic_irq_cleanup(irq);
+		irq_free_desc(irq);
 	}
 
 	spin_unlock(&irq_mapping_update_lock);
@@ -532,6 +990,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
 	if (irq < 0)
 		return irq;
 
+	irqflags |= IRQF_NO_SUSPEND;
 	retval = request_irq(irq, handler, irqflags, devname, dev_id);
 	if (retval != 0) {
 		unbind_from_irq(irq);
@@ -559,41 +1018,75 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
 {
 	struct shared_info *sh = HYPERVISOR_shared_info;
 	int cpu = smp_processor_id();
+	unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu);
 	int i;
 	unsigned long flags;
 	static DEFINE_SPINLOCK(debug_lock);
+	struct vcpu_info *v;
 
 	spin_lock_irqsave(&debug_lock, flags);
 
-	printk("vcpu %d\n  ", cpu);
+	printk("\nvcpu %d\n  ", cpu);
 
 	for_each_online_cpu(i) {
-		struct vcpu_info *v = per_cpu(xen_vcpu, i);
-		printk("%d: masked=%d pending=%d event_sel %08lx\n  ", i,
-			(get_irq_regs() && i == cpu) ? xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask,
-			v->evtchn_upcall_pending,
-			v->evtchn_pending_sel);
-	}
-	printk("pending:\n   ");
-	for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
-		printk("%08lx%s", sh->evtchn_pending[i],
-			i % 8 == 0 ? "\n   " : " ");
-	printk("\nmasks:\n   ");
-	for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
-		printk("%08lx%s", sh->evtchn_mask[i],
-			i % 8 == 0 ? "\n   " : " ");
-
-	printk("\nunmasked:\n   ");
-	for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
-		printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
-			i % 8 == 0 ? "\n   " : " ");
+		int pending;
+		v = per_cpu(xen_vcpu, i);
+		pending = (get_irq_regs() && i == cpu)
+			? xen_irqs_disabled(get_irq_regs())
+			: v->evtchn_upcall_mask;
+		printk("%d: masked=%d pending=%d event_sel %0*lx\n  ", i,
+		       pending, v->evtchn_upcall_pending,
+		       (int)(sizeof(v->evtchn_pending_sel)*2),
+		       v->evtchn_pending_sel);
+	}
+	v = per_cpu(xen_vcpu, cpu);
+
+	printk("\npending:\n   ");
+	for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2,
+		       sh->evtchn_pending[i],
+		       i % 8 == 0 ? "\n   " : " ");
+	printk("\nglobal mask:\n   ");
+	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+		printk("%0*lx%s",
+		       (int)(sizeof(sh->evtchn_mask[0])*2),
+		       sh->evtchn_mask[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nglobally unmasked:\n   ");
+	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
+		       sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nlocal cpu%d mask:\n   ", cpu);
+	for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
+		       cpu_evtchn[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nlocally unmasked:\n   ");
+	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) {
+		unsigned long pending = sh->evtchn_pending[i]
+			& ~sh->evtchn_mask[i]
+			& cpu_evtchn[i];
+		printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
+		       pending, i % 8 == 0 ? "\n   " : " ");
+	}
 
 	printk("\npending list:\n");
-	for(i = 0; i < NR_EVENT_CHANNELS; i++) {
+	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
 		if (sync_test_bit(i, sh->evtchn_pending)) {
-			printk("  %d: event %d -> irq %d\n",
+			int word_idx = i / BITS_PER_LONG;
+			printk("  %d: event %d -> irq %d%s%s%s\n",
 			       cpu_from_evtchn(i), i,
-			       evtchn_to_irq[i]);
+			       evtchn_to_irq[i],
+			       sync_test_bit(word_idx, &v->evtchn_pending_sel)
+					     ? "" : " l2-clear",
+			       !sync_test_bit(i, sh->evtchn_mask)
+					     ? "" : " globally-masked",
+			       sync_test_bit(i, cpu_evtchn)
+					     ? "" : " locally-masked");
 		}
 	}
 
@@ -602,6 +1095,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static DEFINE_PER_CPU(unsigned, xed_nesting_count);
+
 /*
  * Search the CPUs pending events bitmasks.  For each one found, map
  * the event number to an irq, and feed it into do_IRQ() for
@@ -611,24 +1106,19 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
  * a bitset of words which contain pending event bits.  The second
  * level is a bitset of pending events themselves.
  */
-void xen_evtchn_do_upcall(struct pt_regs *regs)
+static void __xen_evtchn_do_upcall(void)
 {
 	int cpu = get_cpu();
-	struct pt_regs *old_regs = set_irq_regs(regs);
 	struct shared_info *s = HYPERVISOR_shared_info;
-	struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
-	static DEFINE_PER_CPU(unsigned, nesting_count);
+	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
  	unsigned count;
 
-	exit_idle();
-	irq_enter();
-
 	do {
 		unsigned long pending_words;
 
 		vcpu_info->evtchn_upcall_pending = 0;
 
-		if (__get_cpu_var(nesting_count)++)
+		if (__this_cpu_inc_return(xed_nesting_count) - 1)
 			goto out;
 
 #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
@@ -645,24 +1135,48 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
 				int bit_idx = __ffs(pending_bits);
 				int port = (word_idx * BITS_PER_LONG) + bit_idx;
 				int irq = evtchn_to_irq[port];
+				struct irq_desc *desc;
 
-				if (irq != -1)
-					handle_irq(irq, regs);
+				mask_evtchn(port);
+				clear_evtchn(port);
+
+				if (irq != -1) {
+					desc = irq_to_desc(irq);
+					if (desc)
+						generic_handle_irq_desc(irq, desc);
+				}
 			}
 		}
 
 		BUG_ON(!irqs_disabled());
 
-		count = __get_cpu_var(nesting_count);
-		__get_cpu_var(nesting_count) = 0;
-	} while(count != 1);
+		count = __this_cpu_read(xed_nesting_count);
+		__this_cpu_write(xed_nesting_count, 0);
+	} while (count != 1 || vcpu_info->evtchn_upcall_pending);
 
 out:
+
+	put_cpu();
+}
+
+void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	exit_idle();
+	irq_enter();
+
+	__xen_evtchn_do_upcall();
+
 	irq_exit();
 	set_irq_regs(old_regs);
+}
 
-	put_cpu();
+void xen_hvm_evtchn_do_upcall(void)
+{
+	__xen_evtchn_do_upcall();
 }
+EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
 
 /* Rebind a new event channel to an existing irq. */
 void rebind_evtchn_irq(int evtchn, int irq)
@@ -699,7 +1213,10 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
 	struct evtchn_bind_vcpu bind_vcpu;
 	int evtchn = evtchn_from_irq(irq);
 
-	if (!VALID_EVTCHN(evtchn))
+	/* events delivered via platform PCI interrupts are always
+	 * routed to vcpu 0 */
+	if (!VALID_EVTCHN(evtchn) ||
+		(xen_hvm_domain() && !xen_have_vector_callback))
 		return -1;
 
 	/* Send future instances of this interrupt to other vcpu. */
@@ -760,10 +1277,10 @@ static void ack_dynirq(unsigned int irq)
 {
 	int evtchn = evtchn_from_irq(irq);
 
-	move_native_irq(irq);
+	move_masked_irq(irq);
 
 	if (VALID_EVTCHN(evtchn))
-		clear_evtchn(evtchn);
+		unmask_evtchn(evtchn);
 }
 
 static int retrigger_dynirq(unsigned int irq)
@@ -785,6 +1302,42 @@ static int retrigger_dynirq(unsigned int irq)
 	return ret;
 }
 
+static void restore_cpu_pirqs(void)
+{
+	int pirq, rc, irq, gsi;
+	struct physdev_map_pirq map_irq;
+
+	for (pirq = 0; pirq < nr_irqs; pirq++) {
+		irq = pirq_to_irq[pirq];
+		if (irq == -1)
+			continue;
+
+		/* save/restore of PT devices doesn't work, so at this point the
+		 * only devices present are GSI based emulated devices */
+		gsi = gsi_from_irq(irq);
+		if (!gsi)
+			continue;
+
+		map_irq.domid = DOMID_SELF;
+		map_irq.type = MAP_PIRQ_TYPE_GSI;
+		map_irq.index = gsi;
+		map_irq.pirq = pirq;
+
+		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+		if (rc) {
+			printk(KERN_WARNING "xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n",
+					gsi, irq, pirq, rc);
+			irq_info[irq] = mk_unbound_info();
+			pirq_to_irq[pirq] = -1;
+			continue;
+		}
+
+		printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
+
+		startup_pirq(irq);
+	}
+}
+
 static void restore_cpu_virqs(unsigned int cpu)
 {
 	struct evtchn_bind_virq bind_virq;
@@ -808,9 +1361,6 @@ static void restore_cpu_virqs(unsigned int cpu)
 		evtchn_to_irq[evtchn] = irq;
 		irq_info[irq] = mk_virq_info(evtchn, virq);
 		bind_evtchn_to_cpu(evtchn, cpu);
-
-		/* Ready for use. */
-		unmask_evtchn(evtchn);
 	}
 }
 
@@ -836,10 +1386,6 @@ static void restore_cpu_ipis(unsigned int cpu)
 		evtchn_to_irq[evtchn] = irq;
 		irq_info[irq] = mk_ipi_info(evtchn, ipi);
 		bind_evtchn_to_cpu(evtchn, cpu);
-
-		/* Ready for use. */
-		unmask_evtchn(evtchn);
-
 	}
 }
 
@@ -851,7 +1397,7 @@ void xen_clear_irq_pending(int irq)
 	if (VALID_EVTCHN(evtchn))
 		clear_evtchn(evtchn);
 }
-
+EXPORT_SYMBOL(xen_clear_irq_pending);
 void xen_set_irq_pending(int irq)
 {
 	int evtchn = evtchn_from_irq(irq);
@@ -871,9 +1417,9 @@ bool xen_test_irq_pending(int irq)
 	return ret;
 }
 
-/* Poll waiting for an irq to become pending.  In the usual case, the
-   irq will be disabled so it won't deliver an interrupt. */
-void xen_poll_irq(int irq)
+/* Poll waiting for an irq to become pending with timeout.  In the usual case,
+ * the irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq_timeout(int irq, u64 timeout)
 {
 	evtchn_port_t evtchn = evtchn_from_irq(irq);
 
@@ -881,17 +1427,25 @@ void xen_poll_irq(int irq)
 		struct sched_poll poll;
 
 		poll.nr_ports = 1;
-		poll.timeout = 0;
+		poll.timeout = timeout;
 		set_xen_guest_handle(poll.ports, &evtchn);
 
 		if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
 			BUG();
 	}
 }
+EXPORT_SYMBOL(xen_poll_irq_timeout);
+/* Poll waiting for an irq to become pending.  In the usual case, the
+ * irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq)
+{
+	xen_poll_irq_timeout(irq, 0 /* no timeout */);
+}
 
 void xen_irq_resume(void)
 {
 	unsigned int cpu, irq, evtchn;
+	struct irq_desc *desc;
 
 	init_evtchn_cpu_bindings();
 
@@ -910,6 +1464,25 @@ void xen_irq_resume(void)
 		restore_cpu_virqs(cpu);
 		restore_cpu_ipis(cpu);
 	}
+
+	/*
+	 * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These
+	 * are not handled by the IRQ core.
+	 */
+	for_each_irq_desc(irq, desc) {
+		if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND))
+			continue;
+		if (desc->status & IRQ_DISABLED)
+			continue;
+
+		evtchn = evtchn_from_irq(irq);
+		if (evtchn == -1)
+			continue;
+
+		unmask_evtchn(evtchn);
+	}
+
+	restore_cpu_pirqs();
 }
 
 static struct irq_chip xen_dynamic_chip __read_mostly = {
@@ -919,18 +1492,98 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
 	.mask		= disable_dynirq,
 	.unmask		= enable_dynirq,
 
-	.ack		= ack_dynirq,
+	.eoi		= ack_dynirq,
+	.set_affinity	= set_affinity_irq,
+	.retrigger	= retrigger_dynirq,
+};
+
+static struct irq_chip xen_pirq_chip __read_mostly = {
+	.name		= "xen-pirq",
+
+	.startup	= startup_pirq,
+	.shutdown	= shutdown_pirq,
+
+	.enable		= enable_pirq,
+	.unmask		= enable_pirq,
+
+	.disable	= disable_pirq,
+	.mask		= disable_pirq,
+
+	.ack		= ack_pirq,
+	.end		= end_pirq,
+
 	.set_affinity	= set_affinity_irq,
+
 	.retrigger	= retrigger_dynirq,
 };
 
+static struct irq_chip xen_percpu_chip __read_mostly = {
+	.name		= "xen-percpu",
+
+	.disable	= disable_dynirq,
+	.mask		= disable_dynirq,
+	.unmask		= enable_dynirq,
+
+	.ack		= ack_dynirq,
+};
+
+int xen_set_callback_via(uint64_t via)
+{
+	struct xen_hvm_param a;
+	a.domid = DOMID_SELF;
+	a.index = HVM_PARAM_CALLBACK_IRQ;
+	a.value = via;
+	return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
+}
+EXPORT_SYMBOL_GPL(xen_set_callback_via);
+
+#ifdef CONFIG_XEN_PVHVM
+/* Vector callbacks are better than PCI interrupts to receive event
+ * channel notifications because we can receive vector callbacks on any
+ * vcpu and we don't need PCI support or APIC interactions. */
+void xen_callback_vector(void)
+{
+	int rc;
+	uint64_t callback_via;
+	if (xen_have_vector_callback) {
+		callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK);
+		rc = xen_set_callback_via(callback_via);
+		if (rc) {
+			printk(KERN_ERR "Request for Xen HVM callback vector"
+					" failed.\n");
+			xen_have_vector_callback = 0;
+			return;
+		}
+		printk(KERN_INFO "Xen HVM callback vector for event delivery is "
+				"enabled\n");
+		/* in the restore case the vector has already been allocated */
+		if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors))
+			alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector);
+	}
+}
+#else
+void xen_callback_vector(void) {}
+#endif
+
 void __init xen_init_IRQ(void)
 {
 	int i;
 
 	cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
 				    GFP_KERNEL);
-	BUG_ON(cpu_evtchn_mask_p == NULL);
+	irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL);
+
+	/* We are using nr_irqs as the maximum number of pirq available but
+	 * that number is actually chosen by Xen and we don't know exactly
+	 * what it is. Be careful choosing high pirq numbers. */
+	pirq_to_irq = kcalloc(nr_irqs, sizeof(*pirq_to_irq), GFP_KERNEL);
+	for (i = 0; i < nr_irqs; i++)
+		pirq_to_irq[i] = -1;
+
+	evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
+				    GFP_KERNEL);
+	for (i = 0; i < NR_EVENT_CHANNELS; i++)
+		evtchn_to_irq[i] = -1;
 
 	init_evtchn_cpu_bindings();
 
@@ -938,5 +1591,15 @@ void __init xen_init_IRQ(void)
 	for (i = 0; i < NR_EVENT_CHANNELS; i++)
 		mask_evtchn(i);
 
-	irq_ctx_init(smp_processor_id());
+	if (xen_hvm_domain()) {
+		xen_callback_vector();
+		native_init_IRQ();
+		/* pci_xen_hvm_init must be called after native_init_IRQ so that
+		 * __acpi_register_gsi can point at the right function */
+		pci_xen_hvm_init();
+	} else {
+		irq_ctx_init(smp_processor_id());
+		if (xen_initial_domain())
+			xen_setup_pirqs();
+	}
 }
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index af031950f9b1..ef11daf0cafe 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -38,7 +38,6 @@
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
-#include <linux/errno.h>
 #include <linux/miscdevice.h>
 #include <linux/major.h>
 #include <linux/proc_fs.h>
@@ -46,9 +45,10 @@
 #include <linux/poll.h>
 #include <linux/irq.h>
 #include <linux/init.h>
-#include <linux/gfp.h>
 #include <linux/mutex.h>
 #include <linux/cpu.h>
+
+#include <xen/xen.h>
 #include <xen/events.h>
 #include <xen/evtchn.h>
 #include <asm/xen/hypervisor.h>
@@ -69,20 +69,51 @@ struct per_user_data {
 	const char *name;
 };
 
-/* Who's bound to each port? */
-static struct per_user_data *port_user[NR_EVENT_CHANNELS];
+/*
+ * Who's bound to each port?  This is logically an array of struct
+ * per_user_data *, but we encode the current enabled-state in bit 0.
+ */
+static unsigned long *port_user;
 static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
 
-irqreturn_t evtchn_interrupt(int irq, void *data)
+static inline struct per_user_data *get_port_user(unsigned port)
+{
+	return (struct per_user_data *)(port_user[port] & ~1);
+}
+
+static inline void set_port_user(unsigned port, struct per_user_data *u)
+{
+	port_user[port] = (unsigned long)u;
+}
+
+static inline bool get_port_enabled(unsigned port)
+{
+	return port_user[port] & 1;
+}
+
+static inline void set_port_enabled(unsigned port, bool enabled)
+{
+	if (enabled)
+		port_user[port] |= 1;
+	else
+		port_user[port] &= ~1;
+}
+
+static irqreturn_t evtchn_interrupt(int irq, void *data)
 {
 	unsigned int port = (unsigned long)data;
 	struct per_user_data *u;
 
 	spin_lock(&port_user_lock);
 
-	u = port_user[port];
+	u = get_port_user(port);
+
+	WARN(!get_port_enabled(port),
+	     "Interrupt for port %d, but apparently not enabled; per-user %p\n",
+	     port, u);
 
 	disable_irq_nosync(irq);
+	set_port_enabled(port, false);
 
 	if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
 		u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
@@ -92,9 +123,8 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
 			kill_fasync(&u->evtchn_async_queue,
 				    SIGIO, POLL_IN);
 		}
-	} else {
+	} else
 		u->ring_overflow = 1;
-	}
 
 	spin_unlock(&port_user_lock);
 
@@ -198,9 +228,18 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
 		goto out;
 
 	spin_lock_irq(&port_user_lock);
-	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
-		if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
-			enable_irq(irq_from_evtchn(kbuf[i]));
+
+	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
+		unsigned port = kbuf[i];
+
+		if (port < NR_EVENT_CHANNELS &&
+		    get_port_user(port) == u &&
+		    !get_port_enabled(port)) {
+			set_port_enabled(port, true);
+			enable_irq(irq_from_evtchn(port));
+		}
+	}
+
 	spin_unlock_irq(&port_user_lock);
 
 	rc = count;
@@ -222,8 +261,9 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)
 	 * interrupt handler yet, and our caller has already
 	 * serialized bind operations.)
 	 */
-	BUG_ON(port_user[port] != NULL);
-	port_user[port] = u;
+	BUG_ON(get_port_user(port) != NULL);
+	set_port_user(port, u);
+	set_port_enabled(port, true); /* start enabled */
 
 	rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
 				       u->name, (void *)(unsigned long)port);
@@ -239,10 +279,7 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port)
 
 	unbind_from_irqhandler(irq, (void *)(unsigned long)port);
 
-	/* make sure we unbind the irq handler before clearing the port */
-	barrier();
-
-	port_user[port] = NULL;
+	set_port_user(port, NULL);
 }
 
 static long evtchn_ioctl(struct file *file,
@@ -333,15 +370,17 @@ static long evtchn_ioctl(struct file *file,
 		spin_lock_irq(&port_user_lock);
 
 		rc = -ENOTCONN;
-		if (port_user[unbind.port] != u) {
+		if (get_port_user(unbind.port) != u) {
 			spin_unlock_irq(&port_user_lock);
 			break;
 		}
 
-		evtchn_unbind_from_user(u, unbind.port);
+		disable_irq(irq_from_evtchn(unbind.port));
 
 		spin_unlock_irq(&port_user_lock);
 
+		evtchn_unbind_from_user(u, unbind.port);
+
 		rc = 0;
 		break;
 	}
@@ -355,7 +394,7 @@ static long evtchn_ioctl(struct file *file,
 
 		if (notify.port >= NR_EVENT_CHANNELS) {
 			rc = -EINVAL;
-		} else if (port_user[notify.port] != u) {
+		} else if (get_port_user(notify.port) != u) {
 			rc = -ENOTCONN;
 		} else {
 			notify_remote_via_evtchn(notify.port);
@@ -431,7 +470,7 @@ static int evtchn_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = u;
 
-	return 0;
+	return nonseekable_open(inode, filp);;
 }
 
 static int evtchn_release(struct inode *inode, struct file *filp)
@@ -444,14 +483,21 @@ static int evtchn_release(struct inode *inode, struct file *filp)
 	free_page((unsigned long)u->ring);
 
 	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
-		if (port_user[i] != u)
+		if (get_port_user(i) != u)
 			continue;
 
-		evtchn_unbind_from_user(port_user[i], i);
+		disable_irq(irq_from_evtchn(i));
 	}
 
 	spin_unlock_irq(&port_user_lock);
 
+	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+		if (get_port_user(i) != u)
+			continue;
+
+		evtchn_unbind_from_user(get_port_user(i), i);
+	}
+
 	kfree(u->name);
 	kfree(u);
 
@@ -467,11 +513,12 @@ static const struct file_operations evtchn_fops = {
 	.fasync  = evtchn_fasync,
 	.open    = evtchn_open,
 	.release = evtchn_release,
+	.llseek	 = no_llseek,
 };
 
 static struct miscdevice evtchn_miscdev = {
 	.minor        = MISC_DYNAMIC_MINOR,
-	.name         = "evtchn",
+	.name         = "xen/evtchn",
 	.fops         = &evtchn_fops,
 };
 static int __init evtchn_init(void)
@@ -481,8 +528,11 @@ static int __init evtchn_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
+	port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL);
+	if (port_user == NULL)
+		return -ENOMEM;
+
 	spin_lock_init(&port_user_lock);
-	memset(port_user, 0, sizeof(port_user));
 
 	/* Create '/dev/misc/evtchn'. */
 	err = misc_register(&evtchn_miscdev);
@@ -498,6 +548,9 @@ static int __init evtchn_init(void)
 
 static void __exit evtchn_cleanup(void)
 {
+	kfree(port_user);
+	port_user = NULL;
+
 	misc_deregister(&evtchn_miscdev);
 }
 
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
new file mode 100644
index 000000000000..1e31cdcdae1e
--- /dev/null
+++ b/drivers/xen/gntdev.c
@@ -0,0 +1,665 @@
+/******************************************************************************
+ * gntdev.c
+ *
+ * Device for accessing (in user-space) pages that have been granted by other
+ * domains.
+ *
+ * Copyright (c) 2006-2007, D G Murray.
+ *           (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#undef DEBUG
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_notifier.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+
+#include <xen/xen.h>
+#include <xen/grant_table.h>
+#include <xen/gntdev.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
+	      "Gerd Hoffmann <kraxel@redhat.com>");
+MODULE_DESCRIPTION("User-space granted page access driver");
+
+static int limit = 1024;
+module_param(limit, int, 0644);
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at "
+		"once by a gntdev instance");
+
+struct gntdev_priv {
+	struct list_head maps;
+	uint32_t used;
+	uint32_t limit;
+	/* lock protects maps from concurrent changes */
+	spinlock_t lock;
+	struct mm_struct *mm;
+	struct mmu_notifier mn;
+};
+
+struct grant_map {
+	struct list_head next;
+	struct gntdev_priv *priv;
+	struct vm_area_struct *vma;
+	int index;
+	int count;
+	int flags;
+	int is_mapped;
+	struct ioctl_gntdev_grant_ref *grants;
+	struct gnttab_map_grant_ref   *map_ops;
+	struct gnttab_unmap_grant_ref *unmap_ops;
+	struct page **pages;
+};
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_print_maps(struct gntdev_priv *priv,
+			      char *text, int text_index)
+{
+#ifdef DEBUG
+	struct grant_map *map;
+
+	pr_debug("maps list (priv %p, usage %d/%d)\n",
+	       priv, priv->used, priv->limit);
+
+	list_for_each_entry(map, &priv->maps, next)
+		pr_debug("  index %2d, count %2d %s\n",
+		       map->index, map->count,
+		       map->index == text_index && text ? text : "");
+#endif
+}
+
+static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
+{
+	struct grant_map *add;
+	int i;
+
+	add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
+	if (NULL == add)
+		return NULL;
+
+	add->grants    = kzalloc(sizeof(add->grants[0])    * count, GFP_KERNEL);
+	add->map_ops   = kzalloc(sizeof(add->map_ops[0])   * count, GFP_KERNEL);
+	add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
+	add->pages     = kzalloc(sizeof(add->pages[0])     * count, GFP_KERNEL);
+	if (NULL == add->grants    ||
+	    NULL == add->map_ops   ||
+	    NULL == add->unmap_ops ||
+	    NULL == add->pages)
+		goto err;
+
+	for (i = 0; i < count; i++) {
+		add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+		if (add->pages[i] == NULL)
+			goto err;
+	}
+
+	add->index = 0;
+	add->count = count;
+	add->priv  = priv;
+
+	if (add->count + priv->used > priv->limit)
+		goto err;
+
+	return add;
+
+err:
+	if (add->pages)
+		for (i = 0; i < count; i++) {
+			if (add->pages[i])
+				__free_page(add->pages[i]);
+		}
+	kfree(add->pages);
+	kfree(add->grants);
+	kfree(add->map_ops);
+	kfree(add->unmap_ops);
+	kfree(add);
+	return NULL;
+}
+
+static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
+{
+	struct grant_map *map;
+
+	list_for_each_entry(map, &priv->maps, next) {
+		if (add->index + add->count < map->index) {
+			list_add_tail(&add->next, &map->next);
+			goto done;
+		}
+		add->index = map->index + map->count;
+	}
+	list_add_tail(&add->next, &priv->maps);
+
+done:
+	priv->used += add->count;
+	gntdev_print_maps(priv, "[new]", add->index);
+}
+
+static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
+		int index, int count)
+{
+	struct grant_map *map;
+
+	list_for_each_entry(map, &priv->maps, next) {
+		if (map->index != index)
+			continue;
+		if (map->count != count)
+			continue;
+		return map;
+	}
+	return NULL;
+}
+
+static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
+					       unsigned long vaddr)
+{
+	struct grant_map *map;
+
+	list_for_each_entry(map, &priv->maps, next) {
+		if (!map->vma)
+			continue;
+		if (vaddr < map->vma->vm_start)
+			continue;
+		if (vaddr >= map->vma->vm_end)
+			continue;
+		return map;
+	}
+	return NULL;
+}
+
+static int gntdev_del_map(struct grant_map *map)
+{
+	int i;
+
+	if (map->vma)
+		return -EBUSY;
+	for (i = 0; i < map->count; i++)
+		if (map->unmap_ops[i].handle)
+			return -EBUSY;
+
+	map->priv->used -= map->count;
+	list_del(&map->next);
+	return 0;
+}
+
+static void gntdev_free_map(struct grant_map *map)
+{
+	int i;
+
+	if (!map)
+		return;
+
+	if (map->pages)
+		for (i = 0; i < map->count; i++) {
+			if (map->pages[i])
+				__free_page(map->pages[i]);
+		}
+	kfree(map->pages);
+	kfree(map->grants);
+	kfree(map->map_ops);
+	kfree(map->unmap_ops);
+	kfree(map);
+}
+
+/* ------------------------------------------------------------------ */
+
+static int find_grant_ptes(pte_t *pte, pgtable_t token,
+		unsigned long addr, void *data)
+{
+	struct grant_map *map = data;
+	unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+	u64 pte_maddr;
+
+	BUG_ON(pgnr >= map->count);
+	pte_maddr = arbitrary_virt_to_machine(pte).maddr;
+
+	gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr,
+			  GNTMAP_contains_pte | map->flags,
+			  map->grants[pgnr].ref,
+			  map->grants[pgnr].domid);
+	gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr,
+			    GNTMAP_contains_pte | map->flags,
+			    0 /* handle */);
+	return 0;
+}
+
+static int map_grant_pages(struct grant_map *map)
+{
+	int i, err = 0;
+
+	pr_debug("map %d+%d\n", map->index, map->count);
+	err = gnttab_map_refs(map->map_ops, map->pages, map->count);
+	if (err)
+		return err;
+
+	for (i = 0; i < map->count; i++) {
+		if (map->map_ops[i].status)
+			err = -EINVAL;
+		map->unmap_ops[i].handle = map->map_ops[i].handle;
+	}
+	return err;
+}
+
+static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+{
+	int i, err = 0;
+
+	pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
+	err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages);
+	if (err)
+		return err;
+
+	for (i = 0; i < pages; i++) {
+		if (map->unmap_ops[offset+i].status)
+			err = -EINVAL;
+		map->unmap_ops[offset+i].handle = 0;
+	}
+	return err;
+}
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_vma_close(struct vm_area_struct *vma)
+{
+	struct grant_map *map = vma->vm_private_data;
+
+	pr_debug("close %p\n", vma);
+	map->is_mapped = 0;
+	map->vma = NULL;
+	vma->vm_private_data = NULL;
+}
+
+static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n",
+			vmf->virtual_address, vmf->pgoff);
+	vmf->flags = VM_FAULT_ERROR;
+	return 0;
+}
+
+static struct vm_operations_struct gntdev_vmops = {
+	.close = gntdev_vma_close,
+	.fault = gntdev_vma_fault,
+};
+
+/* ------------------------------------------------------------------ */
+
+static void mn_invl_range_start(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long start, unsigned long end)
+{
+	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+	struct grant_map *map;
+	unsigned long mstart, mend;
+	int err;
+
+	spin_lock(&priv->lock);
+	list_for_each_entry(map, &priv->maps, next) {
+		if (!map->vma)
+			continue;
+		if (!map->is_mapped)
+			continue;
+		if (map->vma->vm_start >= end)
+			continue;
+		if (map->vma->vm_end <= start)
+			continue;
+		mstart = max(start, map->vma->vm_start);
+		mend   = min(end,   map->vma->vm_end);
+		pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
+				map->index, map->count,
+				map->vma->vm_start, map->vma->vm_end,
+				start, end, mstart, mend);
+		err = unmap_grant_pages(map,
+					(mstart - map->vma->vm_start) >> PAGE_SHIFT,
+					(mend - mstart) >> PAGE_SHIFT);
+		WARN_ON(err);
+	}
+	spin_unlock(&priv->lock);
+}
+
+static void mn_invl_page(struct mmu_notifier *mn,
+			 struct mm_struct *mm,
+			 unsigned long address)
+{
+	mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
+}
+
+static void mn_release(struct mmu_notifier *mn,
+		       struct mm_struct *mm)
+{
+	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+	struct grant_map *map;
+	int err;
+
+	spin_lock(&priv->lock);
+	list_for_each_entry(map, &priv->maps, next) {
+		if (!map->vma)
+			continue;
+		pr_debug("map %d+%d (%lx %lx)\n",
+				map->index, map->count,
+				map->vma->vm_start, map->vma->vm_end);
+		err = unmap_grant_pages(map, /* offset */ 0, map->count);
+		WARN_ON(err);
+	}
+	spin_unlock(&priv->lock);
+}
+
+struct mmu_notifier_ops gntdev_mmu_ops = {
+	.release                = mn_release,
+	.invalidate_page        = mn_invl_page,
+	.invalidate_range_start = mn_invl_range_start,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int gntdev_open(struct inode *inode, struct file *flip)
+{
+	struct gntdev_priv *priv;
+	int ret = 0;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&priv->maps);
+	spin_lock_init(&priv->lock);
+	priv->limit = limit;
+
+	priv->mm = get_task_mm(current);
+	if (!priv->mm) {
+		kfree(priv);
+		return -ENOMEM;
+	}
+	priv->mn.ops = &gntdev_mmu_ops;
+	ret = mmu_notifier_register(&priv->mn, priv->mm);
+	mmput(priv->mm);
+
+	if (ret) {
+		kfree(priv);
+		return ret;
+	}
+
+	flip->private_data = priv;
+	pr_debug("priv %p\n", priv);
+
+	return 0;
+}
+
+static int gntdev_release(struct inode *inode, struct file *flip)
+{
+	struct gntdev_priv *priv = flip->private_data;
+	struct grant_map *map;
+	int err;
+
+	pr_debug("priv %p\n", priv);
+
+	spin_lock(&priv->lock);
+	while (!list_empty(&priv->maps)) {
+		map = list_entry(priv->maps.next, struct grant_map, next);
+		err = gntdev_del_map(map);
+		if (WARN_ON(err))
+			gntdev_free_map(map);
+
+	}
+	spin_unlock(&priv->lock);
+
+	mmu_notifier_unregister(&priv->mn, priv->mm);
+	kfree(priv);
+	return 0;
+}
+
+static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
+				       struct ioctl_gntdev_map_grant_ref __user *u)
+{
+	struct ioctl_gntdev_map_grant_ref op;
+	struct grant_map *map;
+	int err;
+
+	if (copy_from_user(&op, u, sizeof(op)) != 0)
+		return -EFAULT;
+	pr_debug("priv %p, add %d\n", priv, op.count);
+	if (unlikely(op.count <= 0))
+		return -EINVAL;
+	if (unlikely(op.count > priv->limit))
+		return -EINVAL;
+
+	err = -ENOMEM;
+	map = gntdev_alloc_map(priv, op.count);
+	if (!map)
+		return err;
+	if (copy_from_user(map->grants, &u->refs,
+			   sizeof(map->grants[0]) * op.count) != 0) {
+		gntdev_free_map(map);
+		return err;
+	}
+
+	spin_lock(&priv->lock);
+	gntdev_add_map(priv, map);
+	op.index = map->index << PAGE_SHIFT;
+	spin_unlock(&priv->lock);
+
+	if (copy_to_user(u, &op, sizeof(op)) != 0) {
+		spin_lock(&priv->lock);
+		gntdev_del_map(map);
+		spin_unlock(&priv->lock);
+		gntdev_free_map(map);
+		return err;
+	}
+	return 0;
+}
+
+static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
+					 struct ioctl_gntdev_unmap_grant_ref __user *u)
+{
+	struct ioctl_gntdev_unmap_grant_ref op;
+	struct grant_map *map;
+	int err = -ENOENT;
+
+	if (copy_from_user(&op, u, sizeof(op)) != 0)
+		return -EFAULT;
+	pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count);
+
+	spin_lock(&priv->lock);
+	map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
+	if (map)
+		err = gntdev_del_map(map);
+	spin_unlock(&priv->lock);
+	if (!err)
+		gntdev_free_map(map);
+	return err;
+}
+
+static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
+					      struct ioctl_gntdev_get_offset_for_vaddr __user *u)
+{
+	struct ioctl_gntdev_get_offset_for_vaddr op;
+	struct grant_map *map;
+
+	if (copy_from_user(&op, u, sizeof(op)) != 0)
+		return -EFAULT;
+	pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);
+
+	spin_lock(&priv->lock);
+	map = gntdev_find_map_vaddr(priv, op.vaddr);
+	if (map == NULL ||
+	    map->vma->vm_start != op.vaddr) {
+		spin_unlock(&priv->lock);
+		return -EINVAL;
+	}
+	op.offset = map->index << PAGE_SHIFT;
+	op.count = map->count;
+	spin_unlock(&priv->lock);
+
+	if (copy_to_user(u, &op, sizeof(op)) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
+					struct ioctl_gntdev_set_max_grants __user *u)
+{
+	struct ioctl_gntdev_set_max_grants op;
+
+	if (copy_from_user(&op, u, sizeof(op)) != 0)
+		return -EFAULT;
+	pr_debug("priv %p, limit %d\n", priv, op.count);
+	if (op.count > limit)
+		return -E2BIG;
+
+	spin_lock(&priv->lock);
+	priv->limit = op.count;
+	spin_unlock(&priv->lock);
+	return 0;
+}
+
+static long gntdev_ioctl(struct file *flip,
+			 unsigned int cmd, unsigned long arg)
+{
+	struct gntdev_priv *priv = flip->private_data;
+	void __user *ptr = (void __user *)arg;
+
+	switch (cmd) {
+	case IOCTL_GNTDEV_MAP_GRANT_REF:
+		return gntdev_ioctl_map_grant_ref(priv, ptr);
+
+	case IOCTL_GNTDEV_UNMAP_GRANT_REF:
+		return gntdev_ioctl_unmap_grant_ref(priv, ptr);
+
+	case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
+		return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
+
+	case IOCTL_GNTDEV_SET_MAX_GRANTS:
+		return gntdev_ioctl_set_max_grants(priv, ptr);
+
+	default:
+		pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
+		return -ENOIOCTLCMD;
+	}
+
+	return 0;
+}
+
+static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
+{
+	struct gntdev_priv *priv = flip->private_data;
+	int index = vma->vm_pgoff;
+	int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	struct grant_map *map;
+	int err = -EINVAL;
+
+	if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	pr_debug("map %d+%d at %lx (pgoff %lx)\n",
+			index, count, vma->vm_start, vma->vm_pgoff);
+
+	spin_lock(&priv->lock);
+	map = gntdev_find_map_index(priv, index, count);
+	if (!map)
+		goto unlock_out;
+	if (map->vma)
+		goto unlock_out;
+	if (priv->mm != vma->vm_mm) {
+		printk(KERN_WARNING "Huh? Other mm?\n");
+		goto unlock_out;
+	}
+
+	vma->vm_ops = &gntdev_vmops;
+
+	vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP;
+
+	vma->vm_private_data = map;
+	map->vma = vma;
+
+	map->flags = GNTMAP_host_map | GNTMAP_application_map;
+	if (!(vma->vm_flags & VM_WRITE))
+		map->flags |= GNTMAP_readonly;
+
+	spin_unlock(&priv->lock);
+
+	err = apply_to_page_range(vma->vm_mm, vma->vm_start,
+				  vma->vm_end - vma->vm_start,
+				  find_grant_ptes, map);
+	if (err) {
+		printk(KERN_WARNING "find_grant_ptes() failure.\n");
+		return err;
+	}
+
+	err = map_grant_pages(map);
+	if (err) {
+		printk(KERN_WARNING "map_grant_pages() failure.\n");
+		return err;
+	}
+
+	map->is_mapped = 1;
+
+	return 0;
+
+unlock_out:
+	spin_unlock(&priv->lock);
+	return err;
+}
+
+static const struct file_operations gntdev_fops = {
+	.owner = THIS_MODULE,
+	.open = gntdev_open,
+	.release = gntdev_release,
+	.mmap = gntdev_mmap,
+	.unlocked_ioctl = gntdev_ioctl
+};
+
+static struct miscdevice gntdev_miscdev = {
+	.minor        = MISC_DYNAMIC_MINOR,
+	.name         = "xen/gntdev",
+	.fops         = &gntdev_fops,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int __init gntdev_init(void)
+{
+	int err;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	err = misc_register(&gntdev_miscdev);
+	if (err != 0) {
+		printk(KERN_ERR "Could not register gntdev device\n");
+		return err;
+	}
+	return 0;
+}
+
+static void __exit gntdev_exit(void)
+{
+	misc_deregister(&gntdev_miscdev);
+}
+
+module_init(gntdev_init);
+module_exit(gntdev_exit);
+
+/* ------------------------------------------------------------------ */
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 7d8f531fb8e8..9ef54ebc1194 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -34,12 +34,16 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <linux/io.h>
 
+#include <xen/xen.h>
 #include <xen/interface/xen.h>
 #include <xen/page.h>
 #include <xen/grant_table.h>
+#include <xen/interface/memory.h>
 #include <asm/xen/hypercall.h>
 
 #include <asm/pgtable.h>
@@ -57,6 +61,8 @@ static unsigned int boot_max_nr_grant_frames;
 static int gnttab_free_count;
 static grant_ref_t gnttab_free_head;
 static DEFINE_SPINLOCK(gnttab_list_lock);
+unsigned long xen_hvm_resume_frames;
+EXPORT_SYMBOL_GPL(xen_hvm_resume_frames);
 
 static struct grant_entry *shared;
 
@@ -431,7 +437,7 @@ static unsigned int __max_nr_grant_frames(void)
 	return query.max_nr_frames;
 }
 
-static inline unsigned int max_nr_grant_frames(void)
+unsigned int gnttab_max_grant_frames(void)
 {
 	unsigned int xen_max = __max_nr_grant_frames();
 
@@ -439,6 +445,53 @@ static inline unsigned int max_nr_grant_frames(void)
 		return boot_max_nr_grant_frames;
 	return xen_max;
 }
+EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
+
+int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
+		    struct page **pages, unsigned int count)
+{
+	int i, ret;
+	pte_t *pte;
+	unsigned long mfn;
+
+	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < count; i++) {
+		/* m2p override only supported for GNTMAP_contains_pte mappings */
+		if (!(map_ops[i].flags & GNTMAP_contains_pte))
+			continue;
+		pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+				(map_ops[i].host_addr & ~PAGE_MASK));
+		mfn = pte_mfn(*pte);
+		ret = m2p_add_override(mfn, pages[i]);
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_map_refs);
+
+int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
+		struct page **pages, unsigned int count)
+{
+	int i, ret;
+
+	ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < count; i++) {
+		ret = m2p_remove_override(pages[i]);
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
 
 static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 {
@@ -447,6 +500,30 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 	unsigned int nr_gframes = end_idx + 1;
 	int rc;
 
+	if (xen_hvm_domain()) {
+		struct xen_add_to_physmap xatp;
+		unsigned int i = end_idx;
+		rc = 0;
+		/*
+		 * Loop backwards, so that the first hypercall has the largest
+		 * index, ensuring that the table will grow only once.
+		 */
+		do {
+			xatp.domid = DOMID_SELF;
+			xatp.idx = i;
+			xatp.space = XENMAPSPACE_grant_table;
+			xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i;
+			rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
+			if (rc != 0) {
+				printk(KERN_WARNING
+						"grant table add_to_physmap failed, err=%d\n", rc);
+				break;
+			}
+		} while (i-- > start_idx);
+
+		return rc;
+	}
+
 	frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
 	if (!frames)
 		return -ENOMEM;
@@ -463,7 +540,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 
 	BUG_ON(rc || setup.status);
 
-	rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(),
+	rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(),
 				    &shared);
 	BUG_ON(rc);
 
@@ -474,9 +551,27 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 
 int gnttab_resume(void)
 {
-	if (max_nr_grant_frames() < nr_grant_frames)
+	unsigned int max_nr_gframes;
+
+	max_nr_gframes = gnttab_max_grant_frames();
+	if (max_nr_gframes < nr_grant_frames)
 		return -ENOSYS;
-	return gnttab_map(0, nr_grant_frames - 1);
+
+	if (xen_pv_domain())
+		return gnttab_map(0, nr_grant_frames - 1);
+
+	if (!shared) {
+		shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes);
+		if (shared == NULL) {
+			printk(KERN_WARNING
+					"Failed to ioremap gnttab share frames!");
+			return -ENOMEM;
+		}
+	}
+
+	gnttab_map(0, nr_grant_frames - 1);
+
+	return 0;
 }
 
 int gnttab_suspend(void)
@@ -493,7 +588,7 @@ static int gnttab_expand(unsigned int req_entries)
 	cur = nr_grant_frames;
 	extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) /
 		 GREFS_PER_GRANT_FRAME);
-	if (cur + extra > max_nr_grant_frames())
+	if (cur + extra > gnttab_max_grant_frames())
 		return -ENOSPC;
 
 	rc = gnttab_map(cur, cur + extra - 1);
@@ -503,15 +598,12 @@ static int gnttab_expand(unsigned int req_entries)
 	return rc;
 }
 
-static int __devinit gnttab_init(void)
+int gnttab_init(void)
 {
 	int i;
 	unsigned int max_nr_glist_frames, nr_glist_frames;
 	unsigned int nr_init_grefs;
 
-	if (!xen_domain())
-		return -ENODEV;
-
 	nr_grant_frames = 1;
 	boot_max_nr_grant_frames = __max_nr_grant_frames();
 
@@ -554,5 +646,18 @@ static int __devinit gnttab_init(void)
 	kfree(gnttab_list);
 	return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(gnttab_init);
+
+static int __devinit __gnttab_init(void)
+{
+	/* Delay grant-table initialization in the PV on HVM case */
+	if (xen_hvm_domain())
+		return 0;
+
+	if (!xen_pv_domain())
+		return -ENODEV;
+
+	return gnttab_init();
+}
 
-core_initcall(gnttab_init);
+core_initcall(__gnttab_init);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 10d03d7931c4..db8c4c4ac880 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -3,11 +3,13 @@
  */
 #include <linux/kernel.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 #include <linux/reboot.h>
 #include <linux/sysrq.h>
 #include <linux/stop_machine.h>
 #include <linux/freezer.h>
 
+#include <xen/xen.h>
 #include <xen/xenbus.h>
 #include <xen/grant_table.h>
 #include <xen/events.h>
@@ -16,6 +18,7 @@
 
 #include <asm/xen/hypercall.h>
 #include <asm/xen/page.h>
+#include <asm/xen/hypervisor.h>
 
 enum shutdown_state {
 	SHUTDOWN_INVALID = -1,
@@ -32,10 +35,31 @@ enum shutdown_state {
 static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
 
 #ifdef CONFIG_PM_SLEEP
-static int xen_suspend(void *data)
+static int xen_hvm_suspend(void *data)
 {
+	struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
 	int *cancelled = data;
+
+	BUG_ON(!irqs_disabled());
+
+	*cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
+
+	xen_hvm_post_suspend(*cancelled);
+	gnttab_resume();
+
+	if (!*cancelled) {
+		xen_irq_resume();
+		xen_console_resume();
+		xen_timer_resume();
+	}
+
+	return 0;
+}
+
+static int xen_suspend(void *data)
+{
 	int err;
+	int *cancelled = data;
 
 	BUG_ON(!irqs_disabled());
 
@@ -43,7 +67,6 @@ static int xen_suspend(void *data)
 	if (err) {
 		printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
 			err);
-		dpm_resume_noirq(PMSG_RESUME);
 		return err;
 	}
 
@@ -69,7 +92,6 @@ static int xen_suspend(void *data)
 	}
 
 	sysdev_resume();
-	dpm_resume_noirq(PMSG_RESUME);
 
 	return 0;
 }
@@ -88,14 +110,14 @@ static void do_suspend(void)
 	err = freeze_processes();
 	if (err) {
 		printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
-		return;
+		goto out;
 	}
 #endif
 
 	err = dpm_suspend_start(PMSG_SUSPEND);
 	if (err) {
 		printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err);
-		goto out;
+		goto out_thaw;
 	}
 
 	printk(KERN_DEBUG "suspending xenstore...\n");
@@ -104,31 +126,37 @@ static void do_suspend(void)
 	err = dpm_suspend_noirq(PMSG_SUSPEND);
 	if (err) {
 		printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err);
-		goto resume_devices;
+		goto out_resume;
 	}
 
-	err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+	if (xen_hvm_domain())
+		err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0));
+	else
+		err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+
+	dpm_resume_noirq(PMSG_RESUME);
+
 	if (err) {
 		printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
-		goto out;
+		cancelled = 1;
 	}
 
+out_resume:
 	if (!cancelled) {
 		xen_arch_resume();
 		xs_resume();
 	} else
 		xs_suspend_cancel();
 
-	dpm_resume_noirq(PMSG_RESUME);
-
-resume_devices:
 	dpm_resume_end(PMSG_RESUME);
 
 	/* Make sure timer events get retriggered on all CPUs */
 	clock_was_set();
-out:
+
+out_thaw:
 #ifdef CONFIG_PREEMPT
 	thaw_processes();
+out:
 #endif
 	shutting_down = SHUTDOWN_INVALID;
 }
@@ -183,6 +211,7 @@ static void shutdown_handler(struct xenbus_watch *watch,
 	kfree(str);
 }
 
+#ifdef CONFIG_MAGIC_SYSRQ
 static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
 			  unsigned int len)
 {
@@ -209,18 +238,19 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
 		goto again;
 
 	if (sysrq_key != '\0')
-		handle_sysrq(sysrq_key, NULL);
+		handle_sysrq(sysrq_key);
 }
 
-static struct xenbus_watch shutdown_watch = {
-	.node = "control/shutdown",
-	.callback = shutdown_handler
-};
-
 static struct xenbus_watch sysrq_watch = {
 	.node = "control/sysrq",
 	.callback = sysrq_handler
 };
+#endif
+
+static struct xenbus_watch shutdown_watch = {
+	.node = "control/shutdown",
+	.callback = shutdown_handler
+};
 
 static int setup_shutdown_watcher(void)
 {
@@ -232,11 +262,13 @@ static int setup_shutdown_watcher(void)
 		return err;
 	}
 
+#ifdef CONFIG_MAGIC_SYSRQ
 	err = register_xenbus_watch(&sysrq_watch);
 	if (err) {
 		printk(KERN_ERR "Failed to set sysrq watcher\n");
 		return err;
 	}
+#endif
 
 	return 0;
 }
@@ -249,7 +281,19 @@ static int shutdown_event(struct notifier_block *notifier,
 	return NOTIFY_DONE;
 }
 
-static int __init setup_shutdown_event(void)
+static int __init __setup_shutdown_event(void)
+{
+	/* Delay initialization in the PV on HVM case */
+	if (xen_hvm_domain())
+		return 0;
+
+	if (!xen_pv_domain())
+		return -ENODEV;
+
+	return xen_setup_shutdown_event();
+}
+
+int xen_setup_shutdown_event(void)
 {
 	static struct notifier_block xenstore_notifier = {
 		.notifier_call = shutdown_event
@@ -258,5 +302,6 @@ static int __init setup_shutdown_event(void)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);
 
-subsys_initcall(setup_shutdown_event);
+subsys_initcall(__setup_shutdown_event);
diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
new file mode 100644
index 000000000000..cef4bafc07dc
--- /dev/null
+++ b/drivers/xen/pci.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2009, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Weidong Han <weidong.han@intel.com>
+ */
+
+#include <linux/pci.h>
+#include <xen/xen.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/xen.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include "../pci/pci.h"
+
+static int xen_add_device(struct device *dev)
+{
+	int r;
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+
+#ifdef CONFIG_PCI_IOV
+	if (pci_dev->is_virtfn) {
+		struct physdev_manage_pci_ext manage_pci_ext = {
+			.bus		= pci_dev->bus->number,
+			.devfn		= pci_dev->devfn,
+			.is_virtfn 	= 1,
+			.physfn.bus	= pci_dev->physfn->bus->number,
+			.physfn.devfn	= pci_dev->physfn->devfn,
+		};
+
+		r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
+			&manage_pci_ext);
+	} else
+#endif
+	if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
+		struct physdev_manage_pci_ext manage_pci_ext = {
+			.bus		= pci_dev->bus->number,
+			.devfn		= pci_dev->devfn,
+			.is_extfn	= 1,
+		};
+
+		r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
+			&manage_pci_ext);
+	} else {
+		struct physdev_manage_pci manage_pci = {
+			.bus 	= pci_dev->bus->number,
+			.devfn	= pci_dev->devfn,
+		};
+
+		r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
+			&manage_pci);
+	}
+
+	return r;
+}
+
+static int xen_remove_device(struct device *dev)
+{
+	int r;
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	struct physdev_manage_pci manage_pci;
+
+	manage_pci.bus = pci_dev->bus->number;
+	manage_pci.devfn = pci_dev->devfn;
+
+	r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
+		&manage_pci);
+
+	return r;
+}
+
+static int xen_pci_notifier(struct notifier_block *nb,
+			    unsigned long action, void *data)
+{
+	struct device *dev = data;
+	int r = 0;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		r = xen_add_device(dev);
+		break;
+	case BUS_NOTIFY_DEL_DEVICE:
+		r = xen_remove_device(dev);
+		break;
+	default:
+		break;
+	}
+
+	return r;
+}
+
+struct notifier_block device_nb = {
+	.notifier_call = xen_pci_notifier,
+};
+
+static int __init register_xen_pci_notifier(void)
+{
+	if (!xen_initial_domain())
+		return 0;
+
+	return bus_register_notifier(&pci_bus_type, &device_nb);
+}
+
+arch_initcall(register_xen_pci_notifier);
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
new file mode 100644
index 000000000000..afbe041f42c5
--- /dev/null
+++ b/drivers/xen/platform-pci.c
@@ -0,0 +1,200 @@
+/******************************************************************************
+ * platform-pci.c
+ *
+ * Xen platform PCI device driver
+ * Copyright (c) 2005, Intel Corporation.
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include <xen/platform_pci.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/hvm.h>
+#include <xen/xen-ops.h>
+
+#define DRV_NAME    "xen-platform-pci"
+
+MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com");
+MODULE_DESCRIPTION("Xen platform PCI device");
+MODULE_LICENSE("GPL");
+
+static unsigned long platform_mmio;
+static unsigned long platform_mmio_alloc;
+static unsigned long platform_mmiolen;
+static uint64_t callback_via;
+
+unsigned long alloc_xen_mmio(unsigned long len)
+{
+	unsigned long addr;
+
+	addr = platform_mmio + platform_mmio_alloc;
+	platform_mmio_alloc += len;
+	BUG_ON(platform_mmio_alloc > platform_mmiolen);
+
+	return addr;
+}
+
+static uint64_t get_callback_via(struct pci_dev *pdev)
+{
+	u8 pin;
+	int irq;
+
+	irq = pdev->irq;
+	if (irq < 16)
+		return irq; /* ISA IRQ */
+
+	pin = pdev->pin;
+
+	/* We don't know the GSI. Specify the PCI INTx line instead. */
+	return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */
+		((uint64_t)pci_domain_nr(pdev->bus) << 32) |
+		((uint64_t)pdev->bus->number << 16) |
+		((uint64_t)(pdev->devfn & 0xff) << 8) |
+		((uint64_t)(pin - 1) & 3);
+}
+
+static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)
+{
+	xen_hvm_evtchn_do_upcall();
+	return IRQ_HANDLED;
+}
+
+static int xen_allocate_irq(struct pci_dev *pdev)
+{
+	return request_irq(pdev->irq, do_hvm_evtchn_intr,
+			IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING,
+			"xen-platform-pci", pdev);
+}
+
+static int platform_pci_resume(struct pci_dev *pdev)
+{
+	int err;
+	if (xen_have_vector_callback)
+		return 0;
+	err = xen_set_callback_via(callback_via);
+	if (err) {
+		dev_err(&pdev->dev, "platform_pci_resume failure!\n");
+		return err;
+	}
+	return 0;
+}
+
+static int __devinit platform_pci_init(struct pci_dev *pdev,
+				       const struct pci_device_id *ent)
+{
+	int i, ret;
+	long ioaddr;
+	long mmio_addr, mmio_len;
+	unsigned int max_nr_gframes;
+
+	i = pci_enable_device(pdev);
+	if (i)
+		return i;
+
+	ioaddr = pci_resource_start(pdev, 0);
+
+	mmio_addr = pci_resource_start(pdev, 1);
+	mmio_len = pci_resource_len(pdev, 1);
+
+	if (mmio_addr == 0 || ioaddr == 0) {
+		dev_err(&pdev->dev, "no resources found\n");
+		ret = -ENOENT;
+		goto pci_out;
+	}
+
+	ret = pci_request_region(pdev, 1, DRV_NAME);
+	if (ret < 0)
+		goto pci_out;
+
+	ret = pci_request_region(pdev, 0, DRV_NAME);
+	if (ret < 0)
+		goto mem_out;
+
+	platform_mmio = mmio_addr;
+	platform_mmiolen = mmio_len;
+
+	if (!xen_have_vector_callback) {
+		ret = xen_allocate_irq(pdev);
+		if (ret) {
+			dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret);
+			goto out;
+		}
+		callback_via = get_callback_via(pdev);
+		ret = xen_set_callback_via(callback_via);
+		if (ret) {
+			dev_warn(&pdev->dev, "Unable to set the evtchn callback "
+					 "err=%d\n", ret);
+			goto out;
+		}
+	}
+
+	max_nr_gframes = gnttab_max_grant_frames();
+	xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+	ret = gnttab_init();
+	if (ret)
+		goto out;
+	xenbus_probe(NULL);
+	ret = xen_setup_shutdown_event();
+	if (ret)
+		goto out;
+	return 0;
+
+out:
+	pci_release_region(pdev, 0);
+mem_out:
+	pci_release_region(pdev, 1);
+pci_out:
+	pci_disable_device(pdev);
+	return ret;
+}
+
+static struct pci_device_id platform_pci_tbl[] __devinitdata = {
+	{PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM,
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{0,}
+};
+
+MODULE_DEVICE_TABLE(pci, platform_pci_tbl);
+
+static struct pci_driver platform_driver = {
+	.name =           DRV_NAME,
+	.probe =          platform_pci_init,
+	.id_table =       platform_pci_tbl,
+#ifdef CONFIG_PM
+	.resume_early =   platform_pci_resume,
+#endif
+};
+
+static int __init platform_pci_module_init(void)
+{
+	/* no unplug has been done, IGNORE hasn't been specified: just
+	 * return now */
+	if (!xen_platform_pci_unplug)
+		return -ENODEV;
+
+	return pci_register_driver(&platform_driver);
+}
+
+module_init(platform_pci_module_init);
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
new file mode 100644
index 000000000000..54469c3eeacd
--- /dev/null
+++ b/drivers/xen/swiotlb-xen.c
@@ -0,0 +1,515 @@
+/*
+ *  Copyright 2010
+ *  by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ *
+ * This code provides a IOMMU for Xen PV guests with PCI passthrough.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License v2.0 as published by
+ * the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * PV guests under Xen are running in an non-contiguous memory architecture.
+ *
+ * When PCI pass-through is utilized, this necessitates an IOMMU for
+ * translating bus (DMA) to virtual and vice-versa and also providing a
+ * mechanism to have contiguous pages for device drivers operations (say DMA
+ * operations).
+ *
+ * Specifically, under Xen the Linux idea of pages is an illusion. It
+ * assumes that pages start at zero and go up to the available memory. To
+ * help with that, the Linux Xen MMU provides a lookup mechanism to
+ * translate the page frame numbers (PFN) to machine frame numbers (MFN)
+ * and vice-versa. The MFN are the "real" frame numbers. Furthermore
+ * memory is not contiguous. Xen hypervisor stitches memory for guests
+ * from different pools, which means there is no guarantee that PFN==MFN
+ * and PFN+1==MFN+1. Lastly with Xen 4.0, pages (in debug mode) are
+ * allocated in descending order (high to low), meaning the guest might
+ * never get any MFN's under the 4GB mark.
+ *
+ */
+
+#include <linux/bootmem.h>
+#include <linux/dma-mapping.h>
+#include <xen/swiotlb-xen.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+/*
+ * Used to do a quick range check in swiotlb_tbl_unmap_single and
+ * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
+ * API.
+ */
+
+static char *xen_io_tlb_start, *xen_io_tlb_end;
+static unsigned long xen_io_tlb_nslabs;
+/*
+ * Quick lookup value of the bus address of the IOTLB.
+ */
+
+u64 start_dma_addr;
+
+static dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
+{
+	return phys_to_machine(XPADDR(paddr)).maddr;;
+}
+
+static phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
+{
+	return machine_to_phys(XMADDR(baddr)).paddr;
+}
+
+static dma_addr_t xen_virt_to_bus(void *address)
+{
+	return xen_phys_to_bus(virt_to_phys(address));
+}
+
+static int check_pages_physically_contiguous(unsigned long pfn,
+					     unsigned int offset,
+					     size_t length)
+{
+	unsigned long next_mfn;
+	int i;
+	int nr_pages;
+
+	next_mfn = pfn_to_mfn(pfn);
+	nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+	for (i = 1; i < nr_pages; i++) {
+		if (pfn_to_mfn(++pfn) != ++next_mfn)
+			return 0;
+	}
+	return 1;
+}
+
+static int range_straddles_page_boundary(phys_addr_t p, size_t size)
+{
+	unsigned long pfn = PFN_DOWN(p);
+	unsigned int offset = p & ~PAGE_MASK;
+
+	if (offset + size <= PAGE_SIZE)
+		return 0;
+	if (check_pages_physically_contiguous(pfn, offset, size))
+		return 0;
+	return 1;
+}
+
+static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
+{
+	unsigned long mfn = PFN_DOWN(dma_addr);
+	unsigned long pfn = mfn_to_local_pfn(mfn);
+	phys_addr_t paddr;
+
+	/* If the address is outside our domain, it CAN
+	 * have the same virtual address as another address
+	 * in our domain. Therefore _only_ check address within our domain.
+	 */
+	if (pfn_valid(pfn)) {
+		paddr = PFN_PHYS(pfn);
+		return paddr >= virt_to_phys(xen_io_tlb_start) &&
+		       paddr < virt_to_phys(xen_io_tlb_end);
+	}
+	return 0;
+}
+
+static int max_dma_bits = 32;
+
+static int
+xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
+{
+	int i, rc;
+	int dma_bits;
+
+	dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
+
+	i = 0;
+	do {
+		int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE);
+
+		do {
+			rc = xen_create_contiguous_region(
+				(unsigned long)buf + (i << IO_TLB_SHIFT),
+				get_order(slabs << IO_TLB_SHIFT),
+				dma_bits);
+		} while (rc && dma_bits++ < max_dma_bits);
+		if (rc)
+			return rc;
+
+		i += slabs;
+	} while (i < nslabs);
+	return 0;
+}
+
+void __init xen_swiotlb_init(int verbose)
+{
+	unsigned long bytes;
+	int rc;
+
+	xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
+	xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
+
+	bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
+
+	/*
+	 * Get IO TLB memory from any location.
+	 */
+	xen_io_tlb_start = alloc_bootmem(bytes);
+	if (!xen_io_tlb_start)
+		panic("Cannot allocate SWIOTLB buffer");
+
+	xen_io_tlb_end = xen_io_tlb_start + bytes;
+	/*
+	 * And replace that memory with pages under 4GB.
+	 */
+	rc = xen_swiotlb_fixup(xen_io_tlb_start,
+			       bytes,
+			       xen_io_tlb_nslabs);
+	if (rc)
+		goto error;
+
+	start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
+	swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose);
+
+	return;
+error:
+	panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\
+	      "We either don't have the permission or you do not have enough"\
+	      "free memory under 4GB!\n", rc);
+}
+
+void *
+xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+			   dma_addr_t *dma_handle, gfp_t flags)
+{
+	void *ret;
+	int order = get_order(size);
+	u64 dma_mask = DMA_BIT_MASK(32);
+	unsigned long vstart;
+
+	/*
+	* Ignore region specifiers - the kernel's ideas of
+	* pseudo-phys memory layout has nothing to do with the
+	* machine physical layout.  We can't allocate highmem
+	* because we can't return a pointer to it.
+	*/
+	flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
+
+	if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
+		return ret;
+
+	vstart = __get_free_pages(flags, order);
+	ret = (void *)vstart;
+
+	if (hwdev && hwdev->coherent_dma_mask)
+		dma_mask = dma_alloc_coherent_mask(hwdev, flags);
+
+	if (ret) {
+		if (xen_create_contiguous_region(vstart, order,
+						 fls64(dma_mask)) != 0) {
+			free_pages(vstart, order);
+			return NULL;
+		}
+		memset(ret, 0, size);
+		*dma_handle = virt_to_machine(ret).maddr;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent);
+
+void
+xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+			  dma_addr_t dev_addr)
+{
+	int order = get_order(size);
+
+	if (dma_release_from_coherent(hwdev, order, vaddr))
+		return;
+
+	xen_destroy_contiguous_region((unsigned long)vaddr, order);
+	free_pages((unsigned long)vaddr, order);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent);
+
+
+/*
+ * Map a single buffer of the indicated size for DMA in streaming mode.  The
+ * physical address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory until
+ * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed.
+ */
+dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
+				unsigned long offset, size_t size,
+				enum dma_data_direction dir,
+				struct dma_attrs *attrs)
+{
+	phys_addr_t phys = page_to_phys(page) + offset;
+	dma_addr_t dev_addr = xen_phys_to_bus(phys);
+	void *map;
+
+	BUG_ON(dir == DMA_NONE);
+	/*
+	 * If the address happens to be in the device's DMA window,
+	 * we can safely return the device addr and not worry about bounce
+	 * buffering it.
+	 */
+	if (dma_capable(dev, dev_addr, size) &&
+	    !range_straddles_page_boundary(phys, size) && !swiotlb_force)
+		return dev_addr;
+
+	/*
+	 * Oh well, have to allocate and map a bounce buffer.
+	 */
+	map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir);
+	if (!map)
+		return DMA_ERROR_CODE;
+
+	dev_addr = xen_virt_to_bus(map);
+
+	/*
+	 * Ensure that the address returned is DMA'ble
+	 */
+	if (!dma_capable(dev, dev_addr, size))
+		panic("map_single: bounce buffer is not DMA'ble");
+
+	return dev_addr;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);
+
+/*
+ * Unmap a single streaming mode DMA translation.  The dma_addr and size must
+ * match what was provided for in a previous xen_swiotlb_map_page call.  All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+			     size_t size, enum dma_data_direction dir)
+{
+	phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+	BUG_ON(dir == DMA_NONE);
+
+	/* NOTE: We use dev_addr here, not paddr! */
+	if (is_xen_swiotlb_buffer(dev_addr)) {
+		swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
+		return;
+	}
+
+	if (dir != DMA_FROM_DEVICE)
+		return;
+
+	/*
+	 * phys_to_virt doesn't work with hihgmem page but we could
+	 * call dma_mark_clean() with hihgmem page here. However, we
+	 * are fine since dma_mark_clean() is null on POWERPC. We can
+	 * make dma_mark_clean() take a physical address if necessary.
+	 */
+	dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+			    size_t size, enum dma_data_direction dir,
+			    struct dma_attrs *attrs)
+{
+	xen_unmap_single(hwdev, dev_addr, size, dir);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);
+
+/*
+ * Make physical memory consistent for a single streaming mode DMA translation
+ * after a transfer.
+ *
+ * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer
+ * using the cpu, yet do not wish to teardown the dma mapping, you must
+ * call this function before doing so.  At the next point you give the dma
+ * address back to the card, you must first perform a
+ * xen_swiotlb_dma_sync_for_device, and then the device again owns the buffer
+ */
+static void
+xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+			size_t size, enum dma_data_direction dir,
+			enum dma_sync_target target)
+{
+	phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+	BUG_ON(dir == DMA_NONE);
+
+	/* NOTE: We use dev_addr here, not paddr! */
+	if (is_xen_swiotlb_buffer(dev_addr)) {
+		swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir,
+				       target);
+		return;
+	}
+
+	if (dir != DMA_FROM_DEVICE)
+		return;
+
+	dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void
+xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+				size_t size, enum dma_data_direction dir)
+{
+	xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_cpu);
+
+void
+xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+				   size_t size, enum dma_data_direction dir)
+{
+	xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_device);
+
+/*
+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
+ * This is the scatter-gather version of the above xen_swiotlb_map_page
+ * interface.  Here the scatter gather list elements are each tagged with the
+ * appropriate dma address and length.  They are obtained via
+ * sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ *       DMA address/length pairs than there are SG table elements.
+ *       (for example via virtual mapping capabilities)
+ *       The routine returns the number of addr/length pairs actually
+ *       used, at most nents.
+ *
+ * Device ownership issues as mentioned above for xen_swiotlb_map_page are the
+ * same here.
+ */
+int
+xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+			 int nelems, enum dma_data_direction dir,
+			 struct dma_attrs *attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	BUG_ON(dir == DMA_NONE);
+
+	for_each_sg(sgl, sg, nelems, i) {
+		phys_addr_t paddr = sg_phys(sg);
+		dma_addr_t dev_addr = xen_phys_to_bus(paddr);
+
+		if (swiotlb_force ||
+		    !dma_capable(hwdev, dev_addr, sg->length) ||
+		    range_straddles_page_boundary(paddr, sg->length)) {
+			void *map = swiotlb_tbl_map_single(hwdev,
+							   start_dma_addr,
+							   sg_phys(sg),
+							   sg->length, dir);
+			if (!map) {
+				/* Don't panic here, we expect map_sg users
+				   to do proper error handling. */
+				xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
+							   attrs);
+				sgl[0].dma_length = 0;
+				return DMA_ERROR_CODE;
+			}
+			sg->dma_address = xen_virt_to_bus(map);
+		} else
+			sg->dma_address = dev_addr;
+		sg->dma_length = sg->length;
+	}
+	return nelems;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs);
+
+int
+xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+		   enum dma_data_direction dir)
+{
+	return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg);
+
+/*
+ * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
+ * concerning calls here are the same as for swiotlb_unmap_page() above.
+ */
+void
+xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+			   int nelems, enum dma_data_direction dir,
+			   struct dma_attrs *attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	BUG_ON(dir == DMA_NONE);
+
+	for_each_sg(sgl, sg, nelems, i)
+		xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);
+
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs);
+
+void
+xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+		     enum dma_data_direction dir)
+{
+	return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg);
+
+/*
+ * Make physical memory consistent for a set of streaming mode DMA translations
+ * after a transfer.
+ *
+ * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
+ * and usage.
+ */
+static void
+xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
+		    int nelems, enum dma_data_direction dir,
+		    enum dma_sync_target target)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nelems, i)
+		xen_swiotlb_sync_single(hwdev, sg->dma_address,
+					sg->dma_length, dir, target);
+}
+
+void
+xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+			    int nelems, enum dma_data_direction dir)
+{
+	xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_cpu);
+
+void
+xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+			       int nelems, enum dma_data_direction dir)
+{
+	xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_device);
+
+int
+xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
+{
+	return !dma_addr;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mapping_error);
+
+/*
+ * Return whether the given device DMA address mask can be supported
+ * properly.  For example, if your device can only drive the low 24-bits
+ * during bus mastering, then you would pass 0x00ffffff as the mask to
+ * this function.
+ */
+int
+xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
+{
+	return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported);
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index 88a60e03ccf0..60f1827a32cb 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -7,6 +7,7 @@
  *  published by the Free Software Foundation.
  */
 
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/kobject.h>
@@ -14,6 +15,7 @@
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 
+#include <xen/xen.h>
 #include <xen/xenbus.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/version.h>
@@ -425,7 +427,7 @@ static ssize_t hyp_sysfs_store(struct kobject *kobj,
 	return 0;
 }
 
-static struct sysfs_ops hyp_sysfs_ops = {
+static const struct sysfs_ops hyp_sysfs_ops = {
 	.show = hyp_sysfs_show,
 	.store = hyp_sysfs_store,
 };
diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
index 5571f5b84223..8dca685358b4 100644
--- a/drivers/xen/xenbus/Makefile
+++ b/drivers/xen/xenbus/Makefile
@@ -5,3 +5,8 @@ xenbus-objs += xenbus_client.o
 xenbus-objs += xenbus_comms.o
 xenbus-objs += xenbus_xs.o
 xenbus-objs += xenbus_probe.o
+
+xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
+xenbus-objs += $(xenbus-be-objs-y)
+
+obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 92a1ef80a288..cdacf923e073 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -30,6 +30,7 @@
  * IN THE SOFTWARE.
  */
 
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/vmalloc.h>
 #include <asm/xen/hypervisor.h>
@@ -49,6 +50,8 @@ const char *xenbus_strstate(enum xenbus_state state)
 		[ XenbusStateConnected    ] = "Connected",
 		[ XenbusStateClosing      ] = "Closing",
 		[ XenbusStateClosed	  ] = "Closed",
+		[XenbusStateReconfiguring] = "Reconfiguring",
+		[XenbusStateReconfigured] = "Reconfigured",
 	};
 	return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
 }
@@ -132,17 +135,12 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev,
 }
 EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
 
+static void xenbus_switch_fatal(struct xenbus_device *, int, int,
+				const char *, ...);
 
-/**
- * xenbus_switch_state
- * @dev: xenbus device
- * @state: new state
- *
- * Advertise in the store a change of the given driver to the given new_state.
- * Return 0 on success, or -errno on error.  On error, the device will switch
- * to XenbusStateClosing, and the error will be saved in the store.
- */
-int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
+static int
+__xenbus_switch_state(struct xenbus_device *dev,
+		      enum xenbus_state state, int depth)
 {
 	/* We check whether the state is currently set to the given value, and
 	   if not, then the state is set.  We don't want to unconditionally
@@ -151,35 +149,65 @@ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
 	   to it, as the device will be tearing down, and we don't want to
 	   resurrect that directory.
 
-	   Note that, because of this cached value of our state, this function
-	   will not work inside a Xenstore transaction (something it was
-	   trying to in the past) because dev->state would not get reset if
-	   the transaction was aborted.
-
+	   Note that, because of this cached value of our state, this
+	   function will not take a caller's Xenstore transaction
+	   (something it was trying to in the past) because dev->state
+	   would not get reset if the transaction was aborted.
 	 */
 
+	struct xenbus_transaction xbt;
 	int current_state;
-	int err;
+	int err, abort;
 
 	if (state == dev->state)
 		return 0;
 
-	err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
-			   &current_state);
-	if (err != 1)
+again:
+	abort = 1;
+
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_switch_fatal(dev, depth, err, "starting transaction");
 		return 0;
+	}
+
+	err = xenbus_scanf(xbt, dev->nodename, "state", "%d", &current_state);
+	if (err != 1)
+		goto abort;
 
-	err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
+	err = xenbus_printf(xbt, dev->nodename, "state", "%d", state);
 	if (err) {
-		if (state != XenbusStateClosing) /* Avoid looping */
-			xenbus_dev_fatal(dev, err, "writing new state");
-		return err;
+		xenbus_switch_fatal(dev, depth, err, "writing new state");
+		goto abort;
 	}
 
-	dev->state = state;
+	abort = 0;
+abort:
+	err = xenbus_transaction_end(xbt, abort);
+	if (err) {
+		if (err == -EAGAIN && !abort)
+			goto again;
+		xenbus_switch_fatal(dev, depth, err, "ending transaction");
+	} else
+		dev->state = state;
 
 	return 0;
 }
+
+/**
+ * xenbus_switch_state
+ * @dev: xenbus device
+ * @state: new state
+ *
+ * Advertise in the store a change of the given driver to the given new_state.
+ * Return 0 on success, or -errno on error.  On error, the device will switch
+ * to XenbusStateClosing, and the error will be saved in the store.
+ */
+int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
+{
+	return __xenbus_switch_state(dev, state, 0);
+}
+
 EXPORT_SYMBOL_GPL(xenbus_switch_state);
 
 int xenbus_frontend_closed(struct xenbus_device *dev)
@@ -283,6 +311,23 @@ void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
 EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
 
 /**
+ * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps
+ * avoiding recursion within xenbus_switch_state.
+ */
+static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
+				const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	xenbus_va_dev_error(dev, err, fmt, ap);
+	va_end(ap);
+
+	if (!depth)
+		__xenbus_switch_state(dev, XenbusStateClosing, 1);
+}
+
+/**
  * xenbus_grant_ring
  * @dev: xenbus device
  * @ring_mfn: mfn of ring to grant
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index d42e25d5968d..baa65e7fbbc7 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -45,35 +45,33 @@
 #include <linux/kthread.h>
 #include <linux/mutex.h>
 #include <linux/io.h>
+#include <linux/slab.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
 #include <xen/xenbus.h>
 #include <xen/events.h>
 #include <xen/page.h>
 
+#include <xen/hvm.h>
+
 #include "xenbus_comms.h"
 #include "xenbus_probe.h"
 
 
 int xen_store_evtchn;
-EXPORT_SYMBOL(xen_store_evtchn);
+EXPORT_SYMBOL_GPL(xen_store_evtchn);
 
 struct xenstore_domain_interface *xen_store_interface;
+EXPORT_SYMBOL_GPL(xen_store_interface);
+
 static unsigned long xen_store_mfn;
 
 static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
 
-static void wait_for_devices(struct xenbus_driver *xendrv);
-
-static int xenbus_probe_frontend(const char *type, const char *name);
-
-static void xenbus_dev_shutdown(struct device *_dev);
-
-static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
-static int xenbus_dev_resume(struct device *dev);
-
 /* If something in array of ids matches this device, return it. */
 static const struct xenbus_device_id *
 match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
@@ -94,34 +92,7 @@ int xenbus_match(struct device *_dev, struct device_driver *_drv)
 
 	return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
 }
-
-static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env)
-{
-	struct xenbus_device *dev = to_xenbus_device(_dev);
-
-	if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
-		return -ENOMEM;
-
-	return 0;
-}
-
-/* device/<type>/<id> => <type>-<id> */
-static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
-{
-	nodename = strchr(nodename, '/');
-	if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
-		printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
-		return -EINVAL;
-	}
-
-	strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
-	if (!strchr(bus_id, '/')) {
-		printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
-		return -EINVAL;
-	}
-	*strchr(bus_id, '/') = '-';
-	return 0;
-}
+EXPORT_SYMBOL_GPL(xenbus_match);
 
 
 static void free_otherend_details(struct xenbus_device *dev)
@@ -141,7 +112,30 @@ static void free_otherend_watch(struct xenbus_device *dev)
 }
 
 
-int read_otherend_details(struct xenbus_device *xendev,
+static int talk_to_otherend(struct xenbus_device *dev)
+{
+	struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
+
+	free_otherend_watch(dev);
+	free_otherend_details(dev);
+
+	return drv->read_otherend_details(dev);
+}
+
+
+
+static int watch_otherend(struct xenbus_device *dev)
+{
+	struct xen_bus_type *bus =
+		container_of(dev->dev.bus, struct xen_bus_type, bus);
+
+	return xenbus_watch_pathfmt(dev, &dev->otherend_watch,
+				    bus->otherend_changed,
+				    "%s/%s", dev->otherend, "state");
+}
+
+
+int xenbus_read_otherend_details(struct xenbus_device *xendev,
 				 char *id_node, char *path_node)
 {
 	int err = xenbus_gather(XBT_NIL, xendev->nodename,
@@ -166,39 +160,11 @@ int read_otherend_details(struct xenbus_device *xendev,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(xenbus_read_otherend_details);
 
-
-static int read_backend_details(struct xenbus_device *xendev)
-{
-	return read_otherend_details(xendev, "backend-id", "backend");
-}
-
-static struct device_attribute xenbus_dev_attrs[] = {
-	__ATTR_NULL
-};
-
-/* Bus type for frontend drivers. */
-static struct xen_bus_type xenbus_frontend = {
-	.root = "device",
-	.levels = 2, 		/* device/type/<id> */
-	.get_bus_id = frontend_bus_id,
-	.probe = xenbus_probe_frontend,
-	.bus = {
-		.name      = "xen",
-		.match     = xenbus_match,
-		.uevent    = xenbus_uevent,
-		.probe     = xenbus_dev_probe,
-		.remove    = xenbus_dev_remove,
-		.shutdown  = xenbus_dev_shutdown,
-		.dev_attrs = xenbus_dev_attrs,
-
-		.suspend   = xenbus_dev_suspend,
-		.resume    = xenbus_dev_resume,
-	},
-};
-
-static void otherend_changed(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len)
+void xenbus_otherend_changed(struct xenbus_watch *watch,
+			     const char **vec, unsigned int len,
+			     int ignore_on_shutdown)
 {
 	struct xenbus_device *dev =
 		container_of(watch, struct xenbus_device, otherend_watch);
@@ -226,11 +192,7 @@ static void otherend_changed(struct xenbus_watch *watch,
 	 * work that can fail e.g., when the rootfs is gone.
 	 */
 	if (system_state > SYSTEM_RUNNING) {
-		struct xen_bus_type *bus = bus;
-		bus = container_of(dev->dev.bus, struct xen_bus_type, bus);
-		/* If we're frontend, drive the state machine to Closed. */
-		/* This should cause the backend to release our resources. */
-		if ((bus == &xenbus_frontend) && (state == XenbusStateClosing))
+		if (ignore_on_shutdown && (state == XenbusStateClosing))
 			xenbus_frontend_closed(dev);
 		return;
 	}
@@ -238,25 +200,7 @@ static void otherend_changed(struct xenbus_watch *watch,
 	if (drv->otherend_changed)
 		drv->otherend_changed(dev, state);
 }
-
-
-static int talk_to_otherend(struct xenbus_device *dev)
-{
-	struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
-
-	free_otherend_watch(dev);
-	free_otherend_details(dev);
-
-	return drv->read_otherend_details(dev);
-}
-
-
-static int watch_otherend(struct xenbus_device *dev)
-{
-	return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed,
-				    "%s/%s", dev->otherend, "state");
-}
-
+EXPORT_SYMBOL_GPL(xenbus_otherend_changed);
 
 int xenbus_dev_probe(struct device *_dev)
 {
@@ -300,8 +244,9 @@ int xenbus_dev_probe(struct device *_dev)
 fail:
 	xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
 	xenbus_switch_state(dev, XenbusStateClosed);
-	return -ENODEV;
+	return err;
 }
+EXPORT_SYMBOL_GPL(xenbus_dev_probe);
 
 int xenbus_dev_remove(struct device *_dev)
 {
@@ -319,8 +264,9 @@ int xenbus_dev_remove(struct device *_dev)
 	xenbus_switch_state(dev, XenbusStateClosed);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(xenbus_dev_remove);
 
-static void xenbus_dev_shutdown(struct device *_dev)
+void xenbus_dev_shutdown(struct device *_dev)
 {
 	struct xenbus_device *dev = to_xenbus_device(_dev);
 	unsigned long timeout = 5*HZ;
@@ -341,6 +287,7 @@ static void xenbus_dev_shutdown(struct device *_dev)
  out:
 	put_device(&dev->dev);
 }
+EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);
 
 int xenbus_register_driver_common(struct xenbus_driver *drv,
 				  struct xen_bus_type *bus,
@@ -354,25 +301,7 @@ int xenbus_register_driver_common(struct xenbus_driver *drv,
 
 	return driver_register(&drv->driver);
 }
-
-int __xenbus_register_frontend(struct xenbus_driver *drv,
-			       struct module *owner, const char *mod_name)
-{
-	int ret;
-
-	drv->read_otherend_details = read_backend_details;
-
-	ret = xenbus_register_driver_common(drv, &xenbus_frontend,
-					    owner, mod_name);
-	if (ret)
-		return ret;
-
-	/* If this driver is loaded as a module wait for devices to attach. */
-	wait_for_devices(drv);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
+EXPORT_SYMBOL_GPL(xenbus_register_driver_common);
 
 void xenbus_unregister_driver(struct xenbus_driver *drv)
 {
@@ -454,21 +383,21 @@ static ssize_t xendev_show_nodename(struct device *dev,
 {
 	return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
 }
-DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
+static DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
 
 static ssize_t xendev_show_devtype(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
 }
-DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
+static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
 
 static ssize_t xendev_show_modalias(struct device *dev,
 				    struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
 }
-DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
+static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
 
 int xenbus_probe_node(struct xen_bus_type *bus,
 		      const char *type,
@@ -543,24 +472,7 @@ fail:
 	kfree(xendev);
 	return err;
 }
-
-/* device/<typename>/<name> */
-static int xenbus_probe_frontend(const char *type, const char *name)
-{
-	char *nodename;
-	int err;
-
-	nodename = kasprintf(GFP_KERNEL, "%s/%s/%s",
-			     xenbus_frontend.root, type, name);
-	if (!nodename)
-		return -ENOMEM;
-
-	DPRINTK("%s", nodename);
-
-	err = xenbus_probe_node(&xenbus_frontend, type, nodename);
-	kfree(nodename);
-	return err;
-}
+EXPORT_SYMBOL_GPL(xenbus_probe_node);
 
 static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
 {
@@ -574,10 +486,11 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
 		return PTR_ERR(dir);
 
 	for (i = 0; i < dir_n; i++) {
-		err = bus->probe(type, dir[i]);
+		err = bus->probe(bus, type, dir[i]);
 		if (err)
 			break;
 	}
+
 	kfree(dir);
 	return err;
 }
@@ -597,9 +510,11 @@ int xenbus_probe_devices(struct xen_bus_type *bus)
 		if (err)
 			break;
 	}
+
 	kfree(dir);
 	return err;
 }
+EXPORT_SYMBOL_GPL(xenbus_probe_devices);
 
 static unsigned int char_count(const char *str, char c)
 {
@@ -662,32 +577,18 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
 }
 EXPORT_SYMBOL_GPL(xenbus_dev_changed);
 
-static void frontend_changed(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len)
-{
-	DPRINTK("");
-
-	xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
-}
-
-/* We watch for devices appearing and vanishing. */
-static struct xenbus_watch fe_watch = {
-	.node = "device",
-	.callback = frontend_changed,
-};
-
-static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
+int xenbus_dev_suspend(struct device *dev, pm_message_t state)
 {
 	int err = 0;
 	struct xenbus_driver *drv;
-	struct xenbus_device *xdev;
+	struct xenbus_device *xdev
+		= container_of(dev, struct xenbus_device, dev);
 
-	DPRINTK("");
+	DPRINTK("%s", xdev->nodename);
 
 	if (dev->driver == NULL)
 		return 0;
 	drv = to_xenbus_driver(dev->driver);
-	xdev = container_of(dev, struct xenbus_device, dev);
 	if (drv->suspend)
 		err = drv->suspend(xdev, state);
 	if (err)
@@ -695,21 +596,20 @@ static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
 		       "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(xenbus_dev_suspend);
 
-static int xenbus_dev_resume(struct device *dev)
+int xenbus_dev_resume(struct device *dev)
 {
 	int err;
 	struct xenbus_driver *drv;
-	struct xenbus_device *xdev;
+	struct xenbus_device *xdev
+		= container_of(dev, struct xenbus_device, dev);
 
-	DPRINTK("");
+	DPRINTK("%s", xdev->nodename);
 
 	if (dev->driver == NULL)
 		return 0;
-
 	drv = to_xenbus_driver(dev->driver);
-	xdev = container_of(dev, struct xenbus_device, dev);
-
 	err = talk_to_otherend(xdev);
 	if (err) {
 		printk(KERN_WARNING
@@ -740,6 +640,7 @@ static int xenbus_dev_resume(struct device *dev)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(xenbus_dev_resume);
 
 /* A flag to determine if xenstored is 'ready' (i.e. has started) */
 int xenstored_ready = 0;
@@ -766,59 +667,95 @@ EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
 
 void xenbus_probe(struct work_struct *unused)
 {
-	BUG_ON((xenstored_ready <= 0));
-
-	/* Enumerate devices in xenstore and watch for changes. */
-	xenbus_probe_devices(&xenbus_frontend);
-	register_xenbus_watch(&fe_watch);
-	xenbus_backend_probe_and_watch();
+	xenstored_ready = 1;
 
 	/* Notify others that xenstore is up */
 	blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
 }
+EXPORT_SYMBOL_GPL(xenbus_probe);
 
-static int __init xenbus_probe_init(void)
+static int __init xenbus_probe_initcall(void)
+{
+	if (!xen_domain())
+		return -ENODEV;
+
+	if (xen_initial_domain() || xen_hvm_domain())
+		return 0;
+
+	xenbus_probe(NULL);
+	return 0;
+}
+
+device_initcall(xenbus_probe_initcall);
+
+static int __init xenbus_init(void)
 {
 	int err = 0;
+	unsigned long page = 0;
 
 	DPRINTK("");
 
 	err = -ENODEV;
 	if (!xen_domain())
-		goto out_error;
-
-	/* Register ourselves with the kernel bus subsystem */
-	err = bus_register(&xenbus_frontend.bus);
-	if (err)
-		goto out_error;
-
-	err = xenbus_backend_bus_register();
-	if (err)
-		goto out_unreg_front;
+		return err;
 
 	/*
 	 * Domain0 doesn't have a store_evtchn or store_mfn yet.
 	 */
 	if (xen_initial_domain()) {
-		/* dom0 not yet supported */
+		struct evtchn_alloc_unbound alloc_unbound;
+
+		/* Allocate Xenstore page */
+		page = get_zeroed_page(GFP_KERNEL);
+		if (!page)
+			goto out_error;
+
+		xen_store_mfn = xen_start_info->store_mfn =
+			pfn_to_mfn(virt_to_phys((void *)page) >>
+				   PAGE_SHIFT);
+
+		/* Next allocate a local port which xenstored can bind to */
+		alloc_unbound.dom        = DOMID_SELF;
+		alloc_unbound.remote_dom = 0;
+
+		err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+						  &alloc_unbound);
+		if (err == -ENOSYS)
+			goto out_error;
+
+		BUG_ON(err);
+		xen_store_evtchn = xen_start_info->store_evtchn =
+			alloc_unbound.port;
+
+		xen_store_interface = mfn_to_virt(xen_store_mfn);
 	} else {
-		xenstored_ready = 1;
-		xen_store_evtchn = xen_start_info->store_evtchn;
-		xen_store_mfn = xen_start_info->store_mfn;
+		if (xen_hvm_domain()) {
+			uint64_t v = 0;
+			err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
+			if (err)
+				goto out_error;
+			xen_store_evtchn = (int)v;
+			err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
+			if (err)
+				goto out_error;
+			xen_store_mfn = (unsigned long)v;
+			xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
+		} else {
+			xen_store_evtchn = xen_start_info->store_evtchn;
+			xen_store_mfn = xen_start_info->store_mfn;
+			xen_store_interface = mfn_to_virt(xen_store_mfn);
+			xenstored_ready = 1;
+		}
 	}
-	xen_store_interface = mfn_to_virt(xen_store_mfn);
 
 	/* Initialize the interface to xenstore. */
 	err = xs_init();
 	if (err) {
 		printk(KERN_WARNING
 		       "XENBUS: Error initializing xenstore comms: %i\n", err);
-		goto out_unreg_back;
+		goto out_error;
 	}
 
-	if (!xen_initial_domain())
-		xenbus_probe(NULL);
-
 #ifdef CONFIG_XEN_COMPAT_XENFS
 	/*
 	 * Create xenfs mountpoint in /proc for compatibility with
@@ -829,112 +766,13 @@ static int __init xenbus_probe_init(void)
 
 	return 0;
 
-  out_unreg_back:
-	xenbus_backend_bus_unregister();
-
-  out_unreg_front:
-	bus_unregister(&xenbus_frontend.bus);
-
   out_error:
+	if (page != 0)
+		free_page(page);
+
 	return err;
 }
 
-postcore_initcall(xenbus_probe_init);
+postcore_initcall(xenbus_init);
 
 MODULE_LICENSE("GPL");
-
-static int is_disconnected_device(struct device *dev, void *data)
-{
-	struct xenbus_device *xendev = to_xenbus_device(dev);
-	struct device_driver *drv = data;
-	struct xenbus_driver *xendrv;
-
-	/*
-	 * A device with no driver will never connect. We care only about
-	 * devices which should currently be in the process of connecting.
-	 */
-	if (!dev->driver)
-		return 0;
-
-	/* Is this search limited to a particular driver? */
-	if (drv && (dev->driver != drv))
-		return 0;
-
-	xendrv = to_xenbus_driver(dev->driver);
-	return (xendev->state != XenbusStateConnected ||
-		(xendrv->is_ready && !xendrv->is_ready(xendev)));
-}
-
-static int exists_disconnected_device(struct device_driver *drv)
-{
-	return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
-				is_disconnected_device);
-}
-
-static int print_device_status(struct device *dev, void *data)
-{
-	struct xenbus_device *xendev = to_xenbus_device(dev);
-	struct device_driver *drv = data;
-
-	/* Is this operation limited to a particular driver? */
-	if (drv && (dev->driver != drv))
-		return 0;
-
-	if (!dev->driver) {
-		/* Information only: is this too noisy? */
-		printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
-		       xendev->nodename);
-	} else if (xendev->state != XenbusStateConnected) {
-		printk(KERN_WARNING "XENBUS: Timeout connecting "
-		       "to device: %s (state %d)\n",
-		       xendev->nodename, xendev->state);
-	}
-
-	return 0;
-}
-
-/* We only wait for device setup after most initcalls have run. */
-static int ready_to_wait_for_devices;
-
-/*
- * On a 10 second timeout, wait for all devices currently configured.  We need
- * to do this to guarantee that the filesystems and / or network devices
- * needed for boot are available, before we can allow the boot to proceed.
- *
- * This needs to be on a late_initcall, to happen after the frontend device
- * drivers have been initialised, but before the root fs is mounted.
- *
- * A possible improvement here would be to have the tools add a per-device
- * flag to the store entry, indicating whether it is needed at boot time.
- * This would allow people who knew what they were doing to accelerate their
- * boot slightly, but of course needs tools or manual intervention to set up
- * those flags correctly.
- */
-static void wait_for_devices(struct xenbus_driver *xendrv)
-{
-	unsigned long timeout = jiffies + 10*HZ;
-	struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
-
-	if (!ready_to_wait_for_devices || !xen_domain())
-		return;
-
-	while (exists_disconnected_device(drv)) {
-		if (time_after(jiffies, timeout))
-			break;
-		schedule_timeout_interruptible(HZ/10);
-	}
-
-	bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
-			 print_device_status);
-}
-
-#ifndef MODULE
-static int __init boot_wait_for_devices(void)
-{
-	ready_to_wait_for_devices = 1;
-	wait_for_devices(NULL);
-	return 0;
-}
-
-late_initcall(boot_wait_for_devices);
-#endif
diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h
index 6c5e3185a6a2..24665812316a 100644
--- a/drivers/xen/xenbus/xenbus_probe.h
+++ b/drivers/xen/xenbus/xenbus_probe.h
@@ -36,26 +36,15 @@
 
 #define XEN_BUS_ID_SIZE			20
 
-#ifdef CONFIG_XEN_BACKEND
-extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
-extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
-extern void xenbus_backend_probe_and_watch(void);
-extern int xenbus_backend_bus_register(void);
-extern void xenbus_backend_bus_unregister(void);
-#else
-static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
-static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
-static inline void xenbus_backend_probe_and_watch(void) {}
-static inline int xenbus_backend_bus_register(void) { return 0; }
-static inline void xenbus_backend_bus_unregister(void) {}
-#endif
-
 struct xen_bus_type
 {
 	char *root;
 	unsigned int levels;
 	int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
-	int (*probe)(const char *type, const char *dir);
+	int (*probe)(struct xen_bus_type *bus, const char *type,
+		     const char *dir);
+	void (*otherend_changed)(struct xenbus_watch *watch, const char **vec,
+				 unsigned int len);
 	struct bus_type bus;
 };
 
@@ -73,4 +62,16 @@ extern int xenbus_probe_devices(struct xen_bus_type *bus);
 
 extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
 
+extern void xenbus_dev_shutdown(struct device *_dev);
+
+extern int xenbus_dev_suspend(struct device *dev, pm_message_t state);
+extern int xenbus_dev_resume(struct device *dev);
+
+extern void xenbus_otherend_changed(struct xenbus_watch *watch,
+				    const char **vec, unsigned int len,
+				    int ignore_on_shutdown);
+
+extern int xenbus_read_otherend_details(struct xenbus_device *xendev,
+					char *id_node, char *path_node);
+
 #endif
diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c
new file mode 100644
index 000000000000..6cf467bf63ec
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
@@ -0,0 +1,276 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have (backend half).
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005, 2006 XenSource Ltd
+ * Copyright (C) 2007 Solarflare Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define DPRINTK(fmt, args...)				\
+	pr_debug("xenbus_probe (%s:%d) " fmt ".\n",	\
+		 __func__, __LINE__, ##args)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/features.h>
+
+#include "xenbus_comms.h"
+#include "xenbus_probe.h"
+
+/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
+static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+{
+	int domid, err;
+	const char *devid, *type, *frontend;
+	unsigned int typelen;
+
+	type = strchr(nodename, '/');
+	if (!type)
+		return -EINVAL;
+	type++;
+	typelen = strcspn(type, "/");
+	if (!typelen || type[typelen] != '/')
+		return -EINVAL;
+
+	devid = strrchr(nodename, '/') + 1;
+
+	err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
+			    "frontend", NULL, &frontend,
+			    NULL);
+	if (err)
+		return err;
+	if (strlen(frontend) == 0)
+		err = -ERANGE;
+	if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
+		err = -ENOENT;
+	kfree(frontend);
+
+	if (err)
+		return err;
+
+	if (snprintf(bus_id, XEN_BUS_ID_SIZE, "%.*s-%i-%s",
+		     typelen, type, domid, devid) >= XEN_BUS_ID_SIZE)
+		return -ENOSPC;
+	return 0;
+}
+
+static int xenbus_uevent_backend(struct device *dev,
+				 struct kobj_uevent_env *env)
+{
+	struct xenbus_device *xdev;
+	struct xenbus_driver *drv;
+	struct xen_bus_type *bus;
+
+	DPRINTK("");
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	xdev = to_xenbus_device(dev);
+	bus = container_of(xdev->dev.bus, struct xen_bus_type, bus);
+	if (xdev == NULL)
+		return -ENODEV;
+
+	/* stuff we want to pass to /sbin/hotplug */
+	if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype))
+		return -ENOMEM;
+
+	if (add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename))
+		return -ENOMEM;
+
+	if (add_uevent_var(env, "XENBUS_BASE_PATH=%s", bus->root))
+		return -ENOMEM;
+
+	if (dev->driver) {
+		drv = to_xenbus_driver(dev->driver);
+		if (drv && drv->uevent)
+			return drv->uevent(xdev, env);
+	}
+
+	return 0;
+}
+
+/* backend/<typename>/<frontend-uuid>/<name> */
+static int xenbus_probe_backend_unit(struct xen_bus_type *bus,
+				     const char *dir,
+				     const char *type,
+				     const char *name)
+{
+	char *nodename;
+	int err;
+
+	nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
+	if (!nodename)
+		return -ENOMEM;
+
+	DPRINTK("%s\n", nodename);
+
+	err = xenbus_probe_node(bus, type, nodename);
+	kfree(nodename);
+	return err;
+}
+
+/* backend/<typename>/<frontend-domid> */
+static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type,
+				const char *domid)
+{
+	char *nodename;
+	int err = 0;
+	char **dir;
+	unsigned int i, dir_n = 0;
+
+	DPRINTK("");
+
+	nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, domid);
+	if (!nodename)
+		return -ENOMEM;
+
+	dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n);
+	if (IS_ERR(dir)) {
+		kfree(nodename);
+		return PTR_ERR(dir);
+	}
+
+	for (i = 0; i < dir_n; i++) {
+		err = xenbus_probe_backend_unit(bus, nodename, type, dir[i]);
+		if (err)
+			break;
+	}
+	kfree(dir);
+	kfree(nodename);
+	return err;
+}
+
+static void frontend_changed(struct xenbus_watch *watch,
+			    const char **vec, unsigned int len)
+{
+	xenbus_otherend_changed(watch, vec, len, 0);
+}
+
+static struct device_attribute xenbus_backend_dev_attrs[] = {
+	__ATTR_NULL
+};
+
+static struct xen_bus_type xenbus_backend = {
+	.root = "backend",
+	.levels = 3,		/* backend/type/<frontend>/<id> */
+	.get_bus_id = backend_bus_id,
+	.probe = xenbus_probe_backend,
+	.otherend_changed = frontend_changed,
+	.bus = {
+		.name		= "xen-backend",
+		.match		= xenbus_match,
+		.uevent		= xenbus_uevent_backend,
+		.probe		= xenbus_dev_probe,
+		.remove		= xenbus_dev_remove,
+		.shutdown	= xenbus_dev_shutdown,
+		.dev_attrs	= xenbus_backend_dev_attrs,
+	},
+};
+
+static void backend_changed(struct xenbus_watch *watch,
+			    const char **vec, unsigned int len)
+{
+	DPRINTK("");
+
+	xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
+}
+
+static struct xenbus_watch be_watch = {
+	.node = "backend",
+	.callback = backend_changed,
+};
+
+static int read_frontend_details(struct xenbus_device *xendev)
+{
+	return xenbus_read_otherend_details(xendev, "frontend-id", "frontend");
+}
+
+int xenbus_dev_is_online(struct xenbus_device *dev)
+{
+	int rc, val;
+
+	rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val);
+	if (rc != 1)
+		val = 0; /* no online node present */
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
+
+int __xenbus_register_backend(struct xenbus_driver *drv,
+			      struct module *owner, const char *mod_name)
+{
+	drv->read_otherend_details = read_frontend_details;
+
+	return xenbus_register_driver_common(drv, &xenbus_backend,
+					     owner, mod_name);
+}
+EXPORT_SYMBOL_GPL(__xenbus_register_backend);
+
+static int backend_probe_and_watch(struct notifier_block *notifier,
+				   unsigned long event,
+				   void *data)
+{
+	/* Enumerate devices in xenstore and watch for changes. */
+	xenbus_probe_devices(&xenbus_backend);
+	register_xenbus_watch(&be_watch);
+
+	return NOTIFY_DONE;
+}
+
+static int __init xenbus_probe_backend_init(void)
+{
+	static struct notifier_block xenstore_notifier = {
+		.notifier_call = backend_probe_and_watch
+	};
+	int err;
+
+	DPRINTK("");
+
+	/* Register ourselves with the kernel bus subsystem */
+	err = bus_register(&xenbus_backend.bus);
+	if (err)
+		return err;
+
+	register_xenstore_notifier(&xenstore_notifier);
+
+	return 0;
+}
+subsys_initcall(xenbus_probe_backend_init);
diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c
new file mode 100644
index 000000000000..5bcc2d6cf129
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
@@ -0,0 +1,294 @@
+#define DPRINTK(fmt, args...)				\
+	pr_debug("xenbus_probe (%s:%d) " fmt ".\n",	\
+		 __func__, __LINE__, ##args)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/page.h>
+
+#include <xen/platform_pci.h>
+
+#include "xenbus_comms.h"
+#include "xenbus_probe.h"
+
+
+/* device/<type>/<id> => <type>-<id> */
+static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+{
+	nodename = strchr(nodename, '/');
+	if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
+		printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
+		return -EINVAL;
+	}
+
+	strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
+	if (!strchr(bus_id, '/')) {
+		printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
+		return -EINVAL;
+	}
+	*strchr(bus_id, '/') = '-';
+	return 0;
+}
+
+/* device/<typename>/<name> */
+static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type,
+				 const char *name)
+{
+	char *nodename;
+	int err;
+
+	nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name);
+	if (!nodename)
+		return -ENOMEM;
+
+	DPRINTK("%s", nodename);
+
+	err = xenbus_probe_node(bus, type, nodename);
+	kfree(nodename);
+	return err;
+}
+
+static int xenbus_uevent_frontend(struct device *_dev,
+				  struct kobj_uevent_env *env)
+{
+	struct xenbus_device *dev = to_xenbus_device(_dev);
+
+	if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
+		return -ENOMEM;
+
+	return 0;
+}
+
+
+static void backend_changed(struct xenbus_watch *watch,
+			    const char **vec, unsigned int len)
+{
+	xenbus_otherend_changed(watch, vec, len, 1);
+}
+
+static struct device_attribute xenbus_frontend_dev_attrs[] = {
+	__ATTR_NULL
+};
+
+static struct xen_bus_type xenbus_frontend = {
+	.root = "device",
+	.levels = 2,		/* device/type/<id> */
+	.get_bus_id = frontend_bus_id,
+	.probe = xenbus_probe_frontend,
+	.otherend_changed = backend_changed,
+	.bus = {
+		.name		= "xen",
+		.match		= xenbus_match,
+		.uevent		= xenbus_uevent_frontend,
+		.probe		= xenbus_dev_probe,
+		.remove		= xenbus_dev_remove,
+		.shutdown	= xenbus_dev_shutdown,
+		.dev_attrs	= xenbus_frontend_dev_attrs,
+
+		.suspend	= xenbus_dev_suspend,
+		.resume		= xenbus_dev_resume,
+	},
+};
+
+static void frontend_changed(struct xenbus_watch *watch,
+			     const char **vec, unsigned int len)
+{
+	DPRINTK("");
+
+	xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
+}
+
+
+/* We watch for devices appearing and vanishing. */
+static struct xenbus_watch fe_watch = {
+	.node = "device",
+	.callback = frontend_changed,
+};
+
+static int read_backend_details(struct xenbus_device *xendev)
+{
+	return xenbus_read_otherend_details(xendev, "backend-id", "backend");
+}
+
+static int is_device_connecting(struct device *dev, void *data)
+{
+	struct xenbus_device *xendev = to_xenbus_device(dev);
+	struct device_driver *drv = data;
+	struct xenbus_driver *xendrv;
+
+	/*
+	 * A device with no driver will never connect. We care only about
+	 * devices which should currently be in the process of connecting.
+	 */
+	if (!dev->driver)
+		return 0;
+
+	/* Is this search limited to a particular driver? */
+	if (drv && (dev->driver != drv))
+		return 0;
+
+	xendrv = to_xenbus_driver(dev->driver);
+	return (xendev->state < XenbusStateConnected ||
+		(xendev->state == XenbusStateConnected &&
+		 xendrv->is_ready && !xendrv->is_ready(xendev)));
+}
+
+static int exists_connecting_device(struct device_driver *drv)
+{
+	return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+				is_device_connecting);
+}
+
+static int print_device_status(struct device *dev, void *data)
+{
+	struct xenbus_device *xendev = to_xenbus_device(dev);
+	struct device_driver *drv = data;
+
+	/* Is this operation limited to a particular driver? */
+	if (drv && (dev->driver != drv))
+		return 0;
+
+	if (!dev->driver) {
+		/* Information only: is this too noisy? */
+		printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
+		       xendev->nodename);
+	} else if (xendev->state < XenbusStateConnected) {
+		enum xenbus_state rstate = XenbusStateUnknown;
+		if (xendev->otherend)
+			rstate = xenbus_read_driver_state(xendev->otherend);
+		printk(KERN_WARNING "XENBUS: Timeout connecting "
+		       "to device: %s (local state %d, remote state %d)\n",
+		       xendev->nodename, xendev->state, rstate);
+	}
+
+	return 0;
+}
+
+/* We only wait for device setup after most initcalls have run. */
+static int ready_to_wait_for_devices;
+
+/*
+ * On a 5-minute timeout, wait for all devices currently configured.  We need
+ * to do this to guarantee that the filesystems and / or network devices
+ * needed for boot are available, before we can allow the boot to proceed.
+ *
+ * This needs to be on a late_initcall, to happen after the frontend device
+ * drivers have been initialised, but before the root fs is mounted.
+ *
+ * A possible improvement here would be to have the tools add a per-device
+ * flag to the store entry, indicating whether it is needed at boot time.
+ * This would allow people who knew what they were doing to accelerate their
+ * boot slightly, but of course needs tools or manual intervention to set up
+ * those flags correctly.
+ */
+static void wait_for_devices(struct xenbus_driver *xendrv)
+{
+	unsigned long start = jiffies;
+	struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
+	unsigned int seconds_waited = 0;
+
+	if (!ready_to_wait_for_devices || !xen_domain())
+		return;
+
+	while (exists_connecting_device(drv)) {
+		if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
+			if (!seconds_waited)
+				printk(KERN_WARNING "XENBUS: Waiting for "
+				       "devices to initialise: ");
+			seconds_waited += 5;
+			printk("%us...", 300 - seconds_waited);
+			if (seconds_waited == 300)
+				break;
+		}
+
+		schedule_timeout_interruptible(HZ/10);
+	}
+
+	if (seconds_waited)
+		printk("\n");
+
+	bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+			 print_device_status);
+}
+
+int __xenbus_register_frontend(struct xenbus_driver *drv,
+			       struct module *owner, const char *mod_name)
+{
+	int ret;
+
+	drv->read_otherend_details = read_backend_details;
+
+	ret = xenbus_register_driver_common(drv, &xenbus_frontend,
+					    owner, mod_name);
+	if (ret)
+		return ret;
+
+	/* If this driver is loaded as a module wait for devices to attach. */
+	wait_for_devices(drv);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
+
+static int frontend_probe_and_watch(struct notifier_block *notifier,
+				   unsigned long event,
+				   void *data)
+{
+	/* Enumerate devices in xenstore and watch for changes. */
+	xenbus_probe_devices(&xenbus_frontend);
+	register_xenbus_watch(&fe_watch);
+
+	return NOTIFY_DONE;
+}
+
+
+static int __init xenbus_probe_frontend_init(void)
+{
+	static struct notifier_block xenstore_notifier = {
+		.notifier_call = frontend_probe_and_watch
+	};
+	int err;
+
+	DPRINTK("");
+
+	/* Register ourselves with the kernel bus subsystem */
+	err = bus_register(&xenbus_frontend.bus);
+	if (err)
+		return err;
+
+	register_xenstore_notifier(&xenstore_notifier);
+
+	return 0;
+}
+subsys_initcall(xenbus_probe_frontend_init);
+
+#ifndef MODULE
+static int __init boot_wait_for_devices(void)
+{
+	if (xen_hvm_domain() && !xen_platform_pci_unplug)
+		return -ENODEV;
+
+	ready_to_wait_for_devices = 1;
+	wait_for_devices(NULL);
+	return 0;
+}
+
+late_initcall(boot_wait_for_devices);
+#endif
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index 6f91e8c8b932..5534690075af 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -534,7 +534,7 @@ int xenbus_printf(struct xenbus_transaction t,
 #define PRINTF_BUFFER_SIZE 4096
 	char *printf_buffer;
 
-	printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
+	printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_NOIO | __GFP_HIGH);
 	if (printf_buffer == NULL)
 		return -ENOMEM;
 
diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c
index a240b2c20b99..b91f8ff50d05 100644
--- a/drivers/xen/xencomm.c
+++ b/drivers/xen/xencomm.c
@@ -18,8 +18,8 @@
  * Authors: Hollis Blanchard <hollisb@us.ibm.com>
  */
 
-#include <linux/gfp.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <asm/page.h>
 #include <xen/xencomm.h>
 #include <xen/interface/xen.h>
diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
index 25275c3bbdff..4fde9440fe1f 100644
--- a/drivers/xen/xenfs/Makefile
+++ b/drivers/xen/xenfs/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_XENFS) += xenfs.o
 
-xenfs-objs = super.o xenbus.o
-\ No newline at end of file
+xenfs-y			  = super.o xenbus.o privcmd.o
+xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c
new file mode 100644
index 000000000000..dbd3b16fd131
--- /dev/null
+++ b/drivers/xen/xenfs/privcmd.c
@@ -0,0 +1,400 @@
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
+#endif
+
+static long privcmd_ioctl_hypercall(void __user *udata)
+{
+	struct privcmd_hypercall hypercall;
+	long ret;
+
+	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
+		return -EFAULT;
+
+	ret = privcmd_call(hypercall.op,
+			   hypercall.arg[0], hypercall.arg[1],
+			   hypercall.arg[2], hypercall.arg[3],
+			   hypercall.arg[4]);
+
+	return ret;
+}
+
+static void free_page_list(struct list_head *pages)
+{
+	struct page *p, *n;
+
+	list_for_each_entry_safe(p, n, pages, lru)
+		__free_page(p);
+
+	INIT_LIST_HEAD(pages);
+}
+
+/*
+ * Given an array of items in userspace, return a list of pages
+ * containing the data.  If copying fails, either because of memory
+ * allocation failure or a problem reading user memory, return an
+ * error code; its up to the caller to dispose of any partial list.
+ */
+static int gather_array(struct list_head *pagelist,
+			unsigned nelem, size_t size,
+			void __user *data)
+{
+	unsigned pageidx;
+	void *pagedata;
+	int ret;
+
+	if (size > PAGE_SIZE)
+		return 0;
+
+	pageidx = PAGE_SIZE;
+	pagedata = NULL;	/* quiet, gcc */
+	while (nelem--) {
+		if (pageidx > PAGE_SIZE-size) {
+			struct page *page = alloc_page(GFP_KERNEL);
+
+			ret = -ENOMEM;
+			if (page == NULL)
+				goto fail;
+
+			pagedata = page_address(page);
+
+			list_add_tail(&page->lru, pagelist);
+			pageidx = 0;
+		}
+
+		ret = -EFAULT;
+		if (copy_from_user(pagedata + pageidx, data, size))
+			goto fail;
+
+		data += size;
+		pageidx += size;
+	}
+
+	ret = 0;
+
+fail:
+	return ret;
+}
+
+/*
+ * Call function "fn" on each element of the array fragmented
+ * over a list of pages.
+ */
+static int traverse_pages(unsigned nelem, size_t size,
+			  struct list_head *pos,
+			  int (*fn)(void *data, void *state),
+			  void *state)
+{
+	void *pagedata;
+	unsigned pageidx;
+	int ret = 0;
+
+	BUG_ON(size > PAGE_SIZE);
+
+	pageidx = PAGE_SIZE;
+	pagedata = NULL;	/* hush, gcc */
+
+	while (nelem--) {
+		if (pageidx > PAGE_SIZE-size) {
+			struct page *page;
+			pos = pos->next;
+			page = list_entry(pos, struct page, lru);
+			pagedata = page_address(page);
+			pageidx = 0;
+		}
+
+		ret = (*fn)(pagedata + pageidx, state);
+		if (ret)
+			break;
+		pageidx += size;
+	}
+
+	return ret;
+}
+
+struct mmap_mfn_state {
+	unsigned long va;
+	struct vm_area_struct *vma;
+	domid_t domain;
+};
+
+static int mmap_mfn_range(void *data, void *state)
+{
+	struct privcmd_mmap_entry *msg = data;
+	struct mmap_mfn_state *st = state;
+	struct vm_area_struct *vma = st->vma;
+	int rc;
+
+	/* Do not allow range to wrap the address space. */
+	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
+	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
+		return -EINVAL;
+
+	/* Range chunks must be contiguous in va space. */
+	if ((msg->va != st->va) ||
+	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
+		return -EINVAL;
+
+	rc = xen_remap_domain_mfn_range(vma,
+					msg->va & PAGE_MASK,
+					msg->mfn, msg->npages,
+					vma->vm_page_prot,
+					st->domain);
+	if (rc < 0)
+		return rc;
+
+	st->va += msg->npages << PAGE_SHIFT;
+
+	return 0;
+}
+
+static long privcmd_ioctl_mmap(void __user *udata)
+{
+	struct privcmd_mmap mmapcmd;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int rc;
+	LIST_HEAD(pagelist);
+	struct mmap_mfn_state state;
+
+	if (!xen_initial_domain())
+		return -EPERM;
+
+	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+		return -EFAULT;
+
+	rc = gather_array(&pagelist,
+			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+			  mmapcmd.entry);
+
+	if (rc || list_empty(&pagelist))
+		goto out;
+
+	down_write(&mm->mmap_sem);
+
+	{
+		struct page *page = list_first_entry(&pagelist,
+						     struct page, lru);
+		struct privcmd_mmap_entry *msg = page_address(page);
+
+		vma = find_vma(mm, msg->va);
+		rc = -EINVAL;
+
+		if (!vma || (msg->va != vma->vm_start) ||
+		    !privcmd_enforce_singleshot_mapping(vma))
+			goto out_up;
+	}
+
+	state.va = vma->vm_start;
+	state.vma = vma;
+	state.domain = mmapcmd.dom;
+
+	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+			    &pagelist,
+			    mmap_mfn_range, &state);
+
+
+out_up:
+	up_write(&mm->mmap_sem);
+
+out:
+	free_page_list(&pagelist);
+
+	return rc;
+}
+
+struct mmap_batch_state {
+	domid_t domain;
+	unsigned long va;
+	struct vm_area_struct *vma;
+	int err;
+
+	xen_pfn_t __user *user;
+};
+
+static int mmap_batch_fn(void *data, void *state)
+{
+	xen_pfn_t *mfnp = data;
+	struct mmap_batch_state *st = state;
+
+	if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
+				       st->vma->vm_page_prot, st->domain) < 0) {
+		*mfnp |= 0xf0000000U;
+		st->err++;
+	}
+	st->va += PAGE_SIZE;
+
+	return 0;
+}
+
+static int mmap_return_errors(void *data, void *state)
+{
+	xen_pfn_t *mfnp = data;
+	struct mmap_batch_state *st = state;
+
+	return put_user(*mfnp, st->user++);
+}
+
+static struct vm_operations_struct privcmd_vm_ops;
+
+static long privcmd_ioctl_mmap_batch(void __user *udata)
+{
+	int ret;
+	struct privcmd_mmapbatch m;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long nr_pages;
+	LIST_HEAD(pagelist);
+	struct mmap_batch_state state;
+
+	if (!xen_initial_domain())
+		return -EPERM;
+
+	if (copy_from_user(&m, udata, sizeof(m)))
+		return -EFAULT;
+
+	nr_pages = m.num;
+	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
+		return -EINVAL;
+
+	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
+			   m.arr);
+
+	if (ret || list_empty(&pagelist))
+		goto out;
+
+	down_write(&mm->mmap_sem);
+
+	vma = find_vma(mm, m.addr);
+	ret = -EINVAL;
+	if (!vma ||
+	    vma->vm_ops != &privcmd_vm_ops ||
+	    (m.addr != vma->vm_start) ||
+	    ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
+	    !privcmd_enforce_singleshot_mapping(vma)) {
+		up_write(&mm->mmap_sem);
+		goto out;
+	}
+
+	state.domain = m.dom;
+	state.vma = vma;
+	state.va = m.addr;
+	state.err = 0;
+
+	ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+			     &pagelist, mmap_batch_fn, &state);
+
+	up_write(&mm->mmap_sem);
+
+	if (state.err > 0) {
+		state.user = m.arr;
+		ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+			       &pagelist,
+			       mmap_return_errors, &state);
+	}
+
+out:
+	free_page_list(&pagelist);
+
+	return ret;
+}
+
+static long privcmd_ioctl(struct file *file,
+			  unsigned int cmd, unsigned long data)
+{
+	int ret = -ENOSYS;
+	void __user *udata = (void __user *) data;
+
+	switch (cmd) {
+	case IOCTL_PRIVCMD_HYPERCALL:
+		ret = privcmd_ioctl_hypercall(udata);
+		break;
+
+	case IOCTL_PRIVCMD_MMAP:
+		ret = privcmd_ioctl_mmap(udata);
+		break;
+
+	case IOCTL_PRIVCMD_MMAPBATCH:
+		ret = privcmd_ioctl_mmap_batch(udata);
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+	       vma, vma->vm_start, vma->vm_end,
+	       vmf->pgoff, vmf->virtual_address);
+
+	return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct privcmd_vm_ops = {
+	.fault = privcmd_fault
+};
+
+static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* Unsupported for auto-translate guests. */
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return -ENOSYS;
+
+	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
+	 * how to recreate these mappings */
+	vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
+	vma->vm_ops = &privcmd_vm_ops;
+	vma->vm_private_data = NULL;
+
+	return 0;
+}
+
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
+{
+	return (xchg(&vma->vm_private_data, (void *)1) == NULL);
+}
+#endif
+
+const struct file_operations privcmd_file_ops = {
+	.unlocked_ioctl = privcmd_ioctl,
+	.mmap = privcmd_mmap,
+};
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index 6559e0c752ce..1aa389719846 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -13,6 +13,8 @@
 #include <linux/fs.h>
 #include <linux/magic.h>
 
+#include <xen/xen.h>
+
 #include "xenfs.h"
 
 #include <asm/xen/hypervisor.h>
@@ -20,6 +22,46 @@
 MODULE_DESCRIPTION("Xen filesystem");
 MODULE_LICENSE("GPL");
 
+static struct inode *xenfs_make_inode(struct super_block *sb, int mode)
+{
+	struct inode *ret = new_inode(sb);
+
+	if (ret) {
+		ret->i_mode = mode;
+		ret->i_uid = ret->i_gid = 0;
+		ret->i_blocks = 0;
+		ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
+	}
+	return ret;
+}
+
+static struct dentry *xenfs_create_file(struct super_block *sb,
+					struct dentry *parent,
+					const char *name,
+					const struct file_operations *fops,
+					void *data,
+					int mode)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+
+	dentry = d_alloc_name(parent, name);
+	if (!dentry)
+		return NULL;
+
+	inode = xenfs_make_inode(sb, S_IFREG | mode);
+	if (!inode) {
+		dput(dentry);
+		return NULL;
+	}
+
+	inode->i_fop = fops;
+	inode->i_private = data;
+
+	d_add(dentry, inode);
+	return dentry;
+}
+
 static ssize_t capabilities_read(struct file *file, char __user *buf,
 				 size_t size, loff_t *off)
 {
@@ -33,6 +75,7 @@ static ssize_t capabilities_read(struct file *file, char __user *buf,
 
 static const struct file_operations capabilities_file_ops = {
 	.read = capabilities_read,
+	.llseek = default_llseek,
 };
 
 static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
@@ -41,29 +84,42 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
 		[1] = {},
 		{ "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
 		{ "capabilities", &capabilities_file_ops, S_IRUGO },
+		{ "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR },
 		{""},
 	};
+	int rc;
+
+	rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
+	if (rc < 0)
+		return rc;
+
+	if (xen_initial_domain()) {
+		xenfs_create_file(sb, sb->s_root, "xsd_kva",
+				  &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR);
+		xenfs_create_file(sb, sb->s_root, "xsd_port",
+				  &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR);
+	}
 
-	return simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
+	return rc;
 }
 
-static int xenfs_get_sb(struct file_system_type *fs_type,
-			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
+static struct dentry *xenfs_mount(struct file_system_type *fs_type,
+				  int flags, const char *dev_name,
+				  void *data)
 {
-	return get_sb_single(fs_type, flags, data, xenfs_fill_super, mnt);
+	return mount_single(fs_type, flags, data, xenfs_fill_super);
 }
 
 static struct file_system_type xenfs_type = {
 	.owner =	THIS_MODULE,
 	.name =		"xenfs",
-	.get_sb =	xenfs_get_sb,
+	.mount =	xenfs_mount,
 	.kill_sb =	kill_litter_super,
 };
 
 static int __init xenfs_init(void)
 {
-	if (xen_pv_domain())
+	if (xen_domain())
 		return register_filesystem(&xenfs_type);
 
 	printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n");
@@ -72,7 +128,7 @@ static int __init xenfs_init(void)
 
 static void __exit xenfs_exit(void)
 {
-	if (xen_pv_domain())
+	if (xen_domain())
 		unregister_filesystem(&xenfs_type);
 }
 
diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c
index 8f6c7d4b3e4d..bbd000f88af7 100644
--- a/drivers/xen/xenfs/xenbus.c
+++ b/drivers/xen/xenfs/xenbus.c
@@ -43,6 +43,7 @@
 #include <linux/fs.h>
 #include <linux/poll.h>
 #include <linux/mutex.h>
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
@@ -50,6 +51,7 @@
 #include <linux/init.h>
 #include <linux/namei.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 
 #include "xenfs.h"
 #include "../xenbus/xenbus_comms.h"
@@ -587,4 +589,5 @@ const struct file_operations xenbus_file_ops = {
 	.open = xenbus_file_open,
 	.release = xenbus_file_release,
 	.poll = xenbus_file_poll,
+	.llseek = no_llseek,
 };
diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
index 51f08b2d0bf1..b68aa6200003 100644
--- a/drivers/xen/xenfs/xenfs.h
+++ b/drivers/xen/xenfs/xenfs.h
@@ -2,5 +2,8 @@
 #define _XENFS_XENBUS_H
 
 extern const struct file_operations xenbus_file_ops;
+extern const struct file_operations privcmd_file_ops;
+extern const struct file_operations xsd_kva_file_ops;
+extern const struct file_operations xsd_port_file_ops;
 
 #endif	/* _XENFS_XENBUS_H */
diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c
new file mode 100644
index 000000000000..fef20dbc6a5c
--- /dev/null
+++ b/drivers/xen/xenfs/xenstored.c
@@ -0,0 +1,68 @@
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+
+#include <xen/page.h>
+
+#include "xenfs.h"
+#include "../xenbus/xenbus_comms.h"
+
+static ssize_t xsd_read(struct file *file, char __user *buf,
+			    size_t size, loff_t *off)
+{
+	const char *str = (const char *)file->private_data;
+	return simple_read_from_buffer(buf, size, off, str, strlen(str));
+}
+
+static int xsd_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static int xsd_kva_open(struct inode *inode, struct file *file)
+{
+	file->private_data = (void *)kasprintf(GFP_KERNEL, "0x%p",
+					       xen_store_interface);
+	if (!file->private_data)
+		return -ENOMEM;
+	return 0;
+}
+
+static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t size = vma->vm_end - vma->vm_start;
+
+	if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
+		return -EINVAL;
+
+	if (remap_pfn_range(vma, vma->vm_start,
+			    virt_to_pfn(xen_store_interface),
+			    size, vma->vm_page_prot))
+		return -EAGAIN;
+
+	return 0;
+}
+
+const struct file_operations xsd_kva_file_ops = {
+	.open = xsd_kva_open,
+	.mmap = xsd_kva_mmap,
+	.read = xsd_read,
+	.release = xsd_release,
+};
+
+static int xsd_port_open(struct inode *inode, struct file *file)
+{
+	file->private_data = (void *)kasprintf(GFP_KERNEL, "%d",
+					       xen_store_evtchn);
+	if (!file->private_data)
+		return -ENOMEM;
+	return 0;
+}
+
+const struct file_operations xsd_port_file_ops = {
+	.open = xsd_port_open,
+	.read = xsd_read,
+	.release = xsd_release,
+};