Merge tag 'for-linus-4.3-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 8 Sep 2015 18:46:48 +0000 (11:46 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 8 Sep 2015 18:46:48 +0000 (11:46 -0700)
Pull xen updates from David Vrabel:
 "Xen features and fixes for 4.3:

   - Convert xen-blkfront to the multiqueue API
   - [arm] Support binding event channels to different VCPUs.
   - [x86] Support > 512 GiB in PV guests (off by default as such a
     guest cannot be migrated with the current toolstack).
   - [x86] PMU support for PV dom0 (limited support for using perf with
     Xen and other guests)"

* tag 'for-linus-4.3-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (33 commits)
  xen: switch extra memory accounting to use pfns
  xen: limit memory to architectural maximum
  xen: avoid another early crash of memory limited dom0
  xen: avoid early crash of memory limited dom0
  arm/xen: Remove helpers which are PV specific
  xen/x86: Don't try to set PCE bit in CR4
  xen/PMU: PMU emulation code
  xen/PMU: Intercept PMU-related MSR and APIC accesses
  xen/PMU: Describe vendor-specific PMU registers
  xen/PMU: Initialization code for Xen PMU
  xen/PMU: Sysfs interface for setting Xen PMU mode
  xen: xensyms support
  xen: remove no longer needed p2m.h
  xen: allow more than 512 GB of RAM for 64 bit pv-domains
  xen: move p2m list if conflicting with e820 map
  xen: add explicit memblock_reserve() calls for special pages
  mm: provide early_memremap_ro to establish read-only mapping
  xen: check for initrd conflicting with e820 map
  xen: check pre-allocated page tables for conflict with memory map
  xen: check for kernel memory conflicting with memory layout
  ...

Documentation/kernel-parameters.txt
arch/x86/xen/Kconfig
arch/x86/xen/enlighten.c
drivers/block/xen-blkfront.c
drivers/xen/events/events_base.c

diff --combined Documentation/kernel-parameters.txt
@@@ -910,8 -910,6 +910,8 @@@ bytes respectively. Such letter suffixe
                        Disable PIN 1 of APIC timer
                        Can be useful to work around chipset bugs.
  
 +      dis_ucode_ldr   [X86] Disable the microcode loader.
 +
        dma_debug=off   If the kernel is compiled with DMA_API_DEBUG support,
                        this option disables the debugging code at boot.
  
                             <bus_id>,<clkrate>
  
        i8042.debug     [HW] Toggle i8042 debug mode
 +      i8042.unmask_kbd_data
 +                      [HW] Enable printing of interrupt data from the KBD port
 +                           (disabled by default, and as a pre-condition
 +                           requires that i8042.debug=1 be enabled)
        i8042.direct    [HW] Put keyboard port into non-translated mode
        i8042.dumbkbd   [HW] Pretend that controller can only read data from
                             keyboard and cannot control its state
                        The default parameter value of '0' causes the kernel
                        not to attempt recovery of lost locks.
  
 +      nfs4.layoutstats_timer =
 +                      [NFSv4.2] Change the rate at which the kernel sends
 +                      layoutstats to the pNFS metadata server.
 +
 +                      Setting this value to 0 causes the kernel to use
 +                      whatever value is the default set by the layout
 +                      driver. A non-zero value sets the minimum interval
 +                      in seconds between layoutstats transmissions.
 +
        nfsd.nfs4_disable_idmapping=
                        [NFSv4] When set to the default of '1', the NFSv4
                        server will return only numeric uids and gids to
                        in a given burst of a callback-flood test.
  
        rcutorture.fqs_duration= [KNL]
 -                      Set duration of force_quiescent_state bursts.
 +                      Set duration of force_quiescent_state bursts
 +                      in microseconds.
  
        rcutorture.fqs_holdoff= [KNL]
 -                      Set holdoff time within force_quiescent_state bursts.
 +                      Set holdoff time within force_quiescent_state bursts
 +                      in microseconds.
  
        rcutorture.fqs_stutter= [KNL]
 -                      Set wait time between force_quiescent_state bursts.
 +                      Set wait time between force_quiescent_state bursts
 +                      in seconds.
 +
 +      rcutorture.gp_cond= [KNL]
 +                      Use conditional/asynchronous update-side
 +                      primitives, if available.
  
        rcutorture.gp_exp= [KNL]
 -                      Use expedited update-side primitives.
 +                      Use expedited update-side primitives, if available.
  
        rcutorture.gp_normal= [KNL]
 -                      Use normal (non-expedited) update-side primitives.
 -                      If both gp_exp and gp_normal are set, do both.
 -                      If neither gp_exp nor gp_normal are set, still
 -                      do both.
 +                      Use normal (non-expedited) asynchronous
 +                      update-side primitives, if available.
 +
 +      rcutorture.gp_sync= [KNL]
 +                      Use normal (non-expedited) synchronous
 +                      update-side primitives, if available.  If all
 +                      of rcutorture.gp_cond=, rcutorture.gp_exp=,
 +                      rcutorture.gp_normal=, and rcutorture.gp_sync=
 +                      are zero, rcutorture acts as if they are all
 +                      non-zero.
  
        rcutorture.n_barrier_cbs= [KNL]
                        Set callbacks/threads for rcu_barrier() testing.
                        Set time (s) between CPU-hotplug operations, or
                        zero to disable CPU-hotplug testing.
  
 -      rcutorture.torture_runnable= [BOOT]
 -                      Start rcutorture running at boot time.
 -
        rcutorture.shuffle_interval= [KNL]
                        Set task-shuffle interval (s).  Shuffling tasks
                        allows some CPUs to go into dyntick-idle mode
                        Test RCU's dyntick-idle handling.  See also the
                        rcutorture.shuffle_interval parameter.
  
 +      rcutorture.torture_runnable= [BOOT]
 +                      Start rcutorture running at boot time.
 +
        rcutorture.torture_type= [KNL]
                        Specify the RCU implementation to test.
  
                        plus one apbt timer for broadcast timer.
                        x86_intel_mid_timer=apbt_only | lapic_and_apbt
  
+       xen_512gb_limit         [KNL,X86-64,XEN]
+                       Restricts the kernel running paravirtualized under Xen
+                       to use only up to 512 GB of RAM. The reason to do so is
+                       that crash analysis tools and Xen tools for doing domain
+                       save/restore/migration may not be able to handle larger
+                       domains.
        xen_emul_unplug=                [HW,X86,XEN]
                        Unplug Xen emulated devices
                        Format: [unplug0,][unplug1]
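
As an aside on the "xen_512gb_limit" entry above: a boot flag of this kind is
typically wired up through an early_param() handler that overrides the Kconfig
default. The sketch below is illustrative only; the function and variable names
are placeholders, not the symbols actually introduced by this series.

    #include <linux/init.h>
    #include <linux/kernel.h>
    #include <linux/string.h>

    /* Default follows CONFIG_XEN_512GB unless overridden on the command line. */
    static bool example_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);

    static int __init example_parse_512gb(char *arg)
    {
            bool val = false;

            if (!arg)
                    val = true;             /* bare "xen_512gb_limit" enables the limit */
            else if (strtobool(arg, &val))
                    return -EINVAL;         /* only y/Y/1 and n/N/0 are accepted */

            example_512gb_limit = val;
            return 0;
    }
    early_param("xen_512gb_limit", example_parse_512gb);
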
diff --combined arch/x86/xen/Kconfig
@@@ -7,8 -7,9 +7,9 @@@ config XE
        depends on PARAVIRT
        select PARAVIRT_CLOCK
        select XEN_HAVE_PVMMU
+       select XEN_HAVE_VPMU
        depends on X86_64 || (X86_32 && X86_PAE)
 -      depends on X86_TSC
 +      depends on X86_LOCAL_APIC && X86_TSC
        help
          This is the Linux Xen port.  Enabling this will allow the
          kernel to boot in a paravirtualized environment under the
  config XEN_DOM0
        def_bool y
        depends on XEN && PCI_XEN && SWIOTLB_XEN
 -      depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
 +      depends on X86_IO_APIC && ACPI && PCI
  
  config XEN_PVHVM
        def_bool y
        depends on XEN && PCI && X86_LOCAL_APIC
  
- config XEN_MAX_DOMAIN_MEMORY
-        int
-        default 500 if X86_64
-        default 64 if X86_32
-        depends on XEN
-        help
-          This only affects the sizing of some bss arrays, the unused
-          portions of which are freed.
+ config XEN_512GB
+       bool "Limit Xen pv-domain memory to 512GB"
+       depends on XEN && X86_64
+       default y
+       help
+         Limit paravirtualized user domains to 512GB of RAM.
+         The Xen tools and crash dump analysis tools might not support
+         pv-domains with more than 512 GB of RAM. This option controls
+         whether the kernel defaults to using only up to 512 GB of RAM.
+         It is always possible to change the default via specifying the
+         boot parameter "xen_512gb_limit".
  
  config XEN_SAVE_RESTORE
         bool
diff --combined arch/x86/xen/enlighten.c
@@@ -84,6 -84,7 +84,7 @@@
  #include "mmu.h"
  #include "smp.h"
  #include "multicalls.h"
+ #include "pmu.h"
  
  EXPORT_SYMBOL_GPL(hypercall_page);
  
@@@ -1010,8 -1011,7 +1011,7 @@@ static void xen_write_cr0(unsigned lon
  
  static void xen_write_cr4(unsigned long cr4)
  {
-       cr4 &= ~X86_CR4_PGE;
-       cr4 &= ~X86_CR4_PSE;
+       cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
  
        native_write_cr4(cr4);
  }
@@@ -1030,6 -1030,9 +1030,9 @@@ static u64 xen_read_msr_safe(unsigned i
  {
        u64 val;
  
+       if (pmu_msr_read(msr, &val, err))
+               return val;
        val = native_read_msr_safe(msr, err);
        switch (msr) {
        case MSR_IA32_APICBASE:
@@@ -1076,7 -1079,8 +1079,8 @@@ static int xen_write_msr_safe(unsigned 
                   Xen console noise. */
  
        default:
-               ret = native_write_msr_safe(msr, low, high);
+               if (!pmu_msr_write(msr, low, high, &ret))
+                       ret = native_write_msr_safe(msr, low, high);
        }
  
        return ret;
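
The MSR hooks above share one contract: pmu_msr_read()/pmu_msr_write() return
whether the PMU layer handled the access, and only on a miss does the caller
fall back to the native accessor. A minimal sketch of a filter with that shape
follows; the helper names and the MSR range check are invented for illustration
and are not the real xen/pmu.c internals.

    #include <linux/types.h>

    /* Placeholder predicate: treat the architectural IA32_PMCx range as PMU MSRs. */
    static bool example_is_pmu_msr(unsigned int msr)
    {
            return msr >= 0xc1 && msr <= 0xc8;      /* illustrative range only */
    }

    /* Placeholder standing in for the real emulation backend. */
    static int example_emulate_pmu_write(unsigned int msr, u64 val)
    {
            return 0;
    }

    /*
     * Same calling convention as the write hook above: return true when the
     * access was handled here, false so the caller falls back to
     * native_write_msr_safe().
     */
    static bool example_pmu_msr_write(unsigned int msr, u32 low, u32 high, int *err)
    {
            if (!example_is_pmu_msr(msr))
                    return false;

            *err = example_emulate_pmu_write(msr, ((u64)high << 32) | low);
            return true;
    }
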
@@@ -1215,8 -1219,11 +1219,8 @@@ static const struct pv_cpu_ops xen_cpu_
        .read_msr = xen_read_msr_safe,
        .write_msr = xen_write_msr_safe,
  
-       .read_pmc = native_read_pmc,
 -      .read_tsc = native_read_tsc,
+       .read_pmc = xen_read_pmc,
  
 -      .read_tscp = native_read_tscp,
 -
        .iret = xen_iret,
  #ifdef CONFIG_X86_64
        .usergs_sysret32 = xen_sysret32,
@@@ -1264,6 -1271,10 +1268,10 @@@ static const struct pv_apic_ops xen_api
  static void xen_reboot(int reason)
  {
        struct sched_shutdown r = { .reason = reason };
+       int cpu;
+       for_each_online_cpu(cpu)
+               xen_pmu_finish(cpu);
  
        if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
                BUG();
@@@ -1607,7 -1618,9 +1615,9 @@@ asmlinkage __visible void __init xen_st
        early_boot_irqs_disabled = true;
  
        xen_raw_console_write("mapping kernel into physical memory\n");
-       xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
+       xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
+                                  xen_start_info->nr_pages);
+       xen_reserve_special_pages();
  
        /*
         * Modify the cache mode translation tables to match Xen's PAT
diff --combined drivers/block/xen-blkfront.c
@@@ -37,6 -37,7 +37,7 @@@
  
  #include <linux/interrupt.h>
  #include <linux/blkdev.h>
+ #include <linux/blk-mq.h>
  #include <linux/hdreg.h>
  #include <linux/cdrom.h>
  #include <linux/module.h>
@@@ -82,6 -83,7 +83,6 @@@ struct blk_shadow 
  struct split_bio {
        struct bio *bio;
        atomic_t pending;
 -      int err;
  };
  
  static DEFINE_MUTEX(blkfront_mutex);
@@@ -147,6 -149,7 +148,7 @@@ struct blkfront_inf
        unsigned int feature_persistent:1;
        unsigned int max_indirect_segments;
        int is_ready;
+       struct blk_mq_tag_set tag_set;
  };
  
  static unsigned int nr_minors;
@@@ -616,54 -619,41 +618,41 @@@ static inline bool blkif_request_flush_
                 !(info->feature_flush & REQ_FUA)));
  }
  
- /*
-  * do_blkif_request
-  *  read a block; request is in a request queue
-  */
- static void do_blkif_request(struct request_queue *rq)
+ static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
+                          const struct blk_mq_queue_data *qd)
  {
-       struct blkfront_info *info = NULL;
-       struct request *req;
-       int queued;
-       pr_debug("Entered do_blkif_request\n");
-       queued = 0;
+       struct blkfront_info *info = qd->rq->rq_disk->private_data;
  
-       while ((req = blk_peek_request(rq)) != NULL) {
-               info = req->rq_disk->private_data;
-               if (RING_FULL(&info->ring))
-                       goto wait;
+       blk_mq_start_request(qd->rq);
+       spin_lock_irq(&info->io_lock);
+       if (RING_FULL(&info->ring))
+               goto out_busy;
  
-               blk_start_request(req);
+       if (blkif_request_flush_invalid(qd->rq, info))
+               goto out_err;
  
-               if (blkif_request_flush_invalid(req, info)) {
-                       __blk_end_request_all(req, -EOPNOTSUPP);
-                       continue;
-               }
+       if (blkif_queue_request(qd->rq))
+               goto out_busy;
  
-               pr_debug("do_blk_req %p: cmd %p, sec %lx, "
-                        "(%u/%u) [%s]\n",
-                        req, req->cmd, (unsigned long)blk_rq_pos(req),
-                        blk_rq_cur_sectors(req), blk_rq_sectors(req),
-                        rq_data_dir(req) ? "write" : "read");
-               if (blkif_queue_request(req)) {
-                       blk_requeue_request(rq, req);
- wait:
-                       /* Avoid pointless unplugs. */
-                       blk_stop_queue(rq);
-                       break;
-               }
+       flush_requests(info);
+       spin_unlock_irq(&info->io_lock);
+       return BLK_MQ_RQ_QUEUE_OK;
  
-               queued++;
-       }
+ out_err:
+       spin_unlock_irq(&info->io_lock);
+       return BLK_MQ_RQ_QUEUE_ERROR;
  
-       if (queued != 0)
-               flush_requests(info);
+ out_busy:
+       spin_unlock_irq(&info->io_lock);
+       blk_mq_stop_hw_queue(hctx);
+       return BLK_MQ_RQ_QUEUE_BUSY;
  }
  
+ static struct blk_mq_ops blkfront_mq_ops = {
+       .queue_rq = blkif_queue_rq,
+       .map_queue = blk_mq_map_queue,
+ };
  static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
                                unsigned int physical_sector_size,
                                unsigned int segments)
        struct request_queue *rq;
        struct blkfront_info *info = gd->private_data;
  
-       rq = blk_init_queue(do_blkif_request, &info->io_lock);
-       if (rq == NULL)
+       memset(&info->tag_set, 0, sizeof(info->tag_set));
+       info->tag_set.ops = &blkfront_mq_ops;
+       info->tag_set.nr_hw_queues = 1;
+       info->tag_set.queue_depth =  BLK_RING_SIZE(info);
+       info->tag_set.numa_node = NUMA_NO_NODE;
+       info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+       info->tag_set.cmd_size = 0;
+       info->tag_set.driver_data = info;
+       if (blk_mq_alloc_tag_set(&info->tag_set))
                return -1;
+       rq = blk_mq_init_queue(&info->tag_set);
+       if (IS_ERR(rq)) {
+               blk_mq_free_tag_set(&info->tag_set);
+               return -1;
+       }
  
        queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
  
@@@ -901,19 -904,15 +903,15 @@@ static int xlvbd_alloc_gendisk(blkif_se
  static void xlvbd_release_gendisk(struct blkfront_info *info)
  {
        unsigned int minor, nr_minors;
-       unsigned long flags;
  
        if (info->rq == NULL)
                return;
  
-       spin_lock_irqsave(&info->io_lock, flags);
        /* No more blkif_request(). */
-       blk_stop_queue(info->rq);
+       blk_mq_stop_hw_queues(info->rq);
  
        /* No more gnttab callback work. */
        gnttab_cancel_free_callback(&info->callback);
-       spin_unlock_irqrestore(&info->io_lock, flags);
  
        /* Flush gnttab callback work. Must be done with no locks held. */
        flush_work(&info->work);
        xlbd_release_minors(minor, nr_minors);
  
        blk_cleanup_queue(info->rq);
+       blk_mq_free_tag_set(&info->tag_set);
        info->rq = NULL;
  
        put_disk(info->gd);
        info->gd = NULL;
  }
  
+ /* Must be called with io_lock held */
  static void kick_pending_request_queues(struct blkfront_info *info)
  {
-       if (!RING_FULL(&info->ring)) {
-               /* Re-enable calldowns. */
-               blk_start_queue(info->rq);
-               /* Kick things off immediately. */
-               do_blkif_request(info->rq);
-       }
+       if (!RING_FULL(&info->ring))
+               blk_mq_start_stopped_hw_queues(info->rq, true);
  }
  
  static void blkif_restart_queue(struct work_struct *work)
@@@ -963,7 -960,7 +959,7 @@@ static void blkif_free(struct blkfront_
                BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
        /* No more blkif_request(). */
        if (info->rq)
-               blk_stop_queue(info->rq);
+               blk_mq_stop_hw_queues(info->rq);
  
        /* Remove all persistent grants */
        if (!list_empty(&info->grants)) {
@@@ -1146,7 -1143,6 +1142,6 @@@ static irqreturn_t blkif_interrupt(int 
        RING_IDX i, rp;
        unsigned long flags;
        struct blkfront_info *info = (struct blkfront_info *)dev_id;
-       int error;
  
        spin_lock_irqsave(&info->io_lock, flags);
  
                        continue;
                }
  
-               error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
+               req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
                switch (bret->operation) {
                case BLKIF_OP_DISCARD:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                struct request_queue *rq = info->rq;
                                printk(KERN_WARNING "blkfront: %s: %s op failed\n",
                                           info->gd->disk_name, op_name(bret->operation));
-                               error = -EOPNOTSUPP;
+                               req->errors = -EOPNOTSUPP;
                                info->feature_discard = 0;
                                info->feature_secdiscard = 0;
                                queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
                                queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
                        }
-                       __blk_end_request_all(req, error);
+                       blk_mq_complete_request(req);
                        break;
                case BLKIF_OP_FLUSH_DISKCACHE:
                case BLKIF_OP_WRITE_BARRIER:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                printk(KERN_WARNING "blkfront: %s: %s op failed\n",
                                       info->gd->disk_name, op_name(bret->operation));
-                               error = -EOPNOTSUPP;
+                               req->errors = -EOPNOTSUPP;
                        }
                        if (unlikely(bret->status == BLKIF_RSP_ERROR &&
                                     info->shadow[id].req.u.rw.nr_segments == 0)) {
                                printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
                                       info->gd->disk_name, op_name(bret->operation));
-                               error = -EOPNOTSUPP;
+                               req->errors = -EOPNOTSUPP;
                        }
-                       if (unlikely(error)) {
-                               if (error == -EOPNOTSUPP)
-                                       error = 0;
+                       if (unlikely(req->errors)) {
+                               if (req->errors == -EOPNOTSUPP)
+                                       req->errors = 0;
                                info->feature_flush = 0;
                                xlvbd_flush(info);
                        }
                                dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
                                        "request: %x\n", bret->status);
  
-                       __blk_end_request_all(req, error);
+                       blk_mq_complete_request(req);
                        break;
                default:
                        BUG();
@@@ -1480,14 -1476,16 +1475,14 @@@ static int blkfront_probe(struct xenbus
        return 0;
  }
  
 -static void split_bio_end(struct bio *bio, int error)
 +static void split_bio_end(struct bio *bio)
  {
        struct split_bio *split_bio = bio->bi_private;
  
 -      if (error)
 -              split_bio->err = error;
 -
        if (atomic_dec_and_test(&split_bio->pending)) {
                split_bio->bio->bi_phys_segments = 0;
 -              bio_endio(split_bio->bio, split_bio->err);
 +              split_bio->bio->bi_error = bio->bi_error;
 +              bio_endio(split_bio->bio);
                kfree(split_bio);
        }
        bio_put(bio);
@@@ -1555,28 -1553,6 +1550,6 @@@ static int blkif_recover(struct blkfron
  
        kfree(copy);
  
-       /*
-        * Empty the queue, this is important because we might have
-        * requests in the queue with more segments than what we
-        * can handle now.
-        */
-       spin_lock_irq(&info->io_lock);
-       while ((req = blk_fetch_request(info->rq)) != NULL) {
-               if (req->cmd_flags &
-                   (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
-                       list_add(&req->queuelist, &requests);
-                       continue;
-               }
-               merge_bio.head = req->bio;
-               merge_bio.tail = req->biotail;
-               bio_list_merge(&bio_list, &merge_bio);
-               req->bio = NULL;
-               if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
-                       pr_alert("diskcache flush request found!\n");
-               __blk_end_request_all(req, 0);
-       }
-       spin_unlock_irq(&info->io_lock);
        xenbus_switch_state(info->xbdev, XenbusStateConnected);
  
        spin_lock_irq(&info->io_lock);
                /* Requeue pending requests (flush or discard) */
                list_del_init(&req->queuelist);
                BUG_ON(req->nr_phys_segments > segs);
-               blk_requeue_request(info->rq, req);
+               blk_mq_requeue_request(req);
        }
        spin_unlock_irq(&info->io_lock);
+       blk_mq_kick_requeue_list(info->rq);
  
        while ((bio = bio_list_pop(&bio_list)) != NULL) {
                /* Traverse the list of pending bios and re-queue them */
diff --combined drivers/xen/events/events_base.c
@@@ -336,7 -336,7 +336,7 @@@ static void bind_evtchn_to_cpu(unsigne
  
        BUG_ON(irq == -1);
  #ifdef CONFIG_SMP
 -      cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(cpu));
 +      cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu));
  #endif
        xen_evtchn_port_bind_to_cpu(info, cpu);
  
@@@ -373,7 -373,7 +373,7 @@@ static void xen_irq_init(unsigned irq
        struct irq_info *info;
  #ifdef CONFIG_SMP
        /* By default all event channels notify CPU#0. */
 -      cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(0));
 +      cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(0));
  #endif
  
        info = kzalloc(sizeof(*info), GFP_KERNEL);
@@@ -1301,11 -1301,7 +1301,7 @@@ static int rebind_irq_to_cpu(unsigned i
        if (!VALID_EVTCHN(evtchn))
                return -1;
  
-       /*
-        * Events delivered via platform PCI interrupts are always
-        * routed to vcpu 0 and hence cannot be rebound.
-        */
-       if (xen_hvm_domain() && !xen_have_vector_callback)
+       if (!xen_support_evtchn_rebind())
                return -1;
  
        /* Send future instances of this interrupt to other vcpu. */